import requests
from bs4 import BeautifulSoup
import time

# Douban Top250 scraper (TXT output): crawls all 10 list pages, extracts
# rank / title / director / year / region / genre / rating per movie, and
# writes the results to douban_top250.txt.

# Pretend to be a desktop browser so the site serves the normal HTML page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

all_movies = []

# 10 pages x 25 movies per page = 250 movies total.
for page in range(10):
    start = page * 25
    url = f"https://movie.douban.com/top250?start={start}&filter="

    print(f"正在爬第 {page+1} 页 …")
    try:
        # timeout stops a hung connection from blocking forever;
        # raise_for_status() turns HTTP errors (403/5xx) into exceptions
        # instead of silently parsing an error page. The sibling scripts
        # (text2/text3) already do both — this one previously did neither.
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"第 {page+1} 页爬取失败:{e}")
        continue

    soup = BeautifulSoup(resp.text, "html.parser")
    items = soup.find_all("div", class_="item")

    for item in items:
        # Rank and (primary) title.
        rank = item.find("em").text
        title = item.find("span", class_="title").text

        # Rating.
        score = item.find("span", class_="rating_num").text

        # Director, year, region and genre all live in the <p> of div.bd.
        info = item.find("div", class_="bd").p.text.strip()
        lines = [line.strip() for line in info.split("\n") if line.strip()]

        # First non-empty line: director. Guard against an empty info
        # block so a malformed item cannot raise IndexError.
        director_line = lines[0] if lines else ""
        if "导演:" in director_line:
            director = director_line.split("导演:")[1].split("主演:")[0].strip()
        else:
            director = "未知"

        # Second line: "year / region / genre". Each field may be missing.
        if len(lines) >= 2:
            year_area_genre = lines[1].split("/")
            year = year_area_genre[0].strip() if len(year_area_genre) > 0 else "未知"
            area = year_area_genre[1].strip() if len(year_area_genre) > 1 else "未知"
            genre = year_area_genre[2].strip() if len(year_area_genre) > 2 else "未知"
        else:
            year = area = genre = "未知"

        movie = {
            "排名": rank,
            "片名": title,
            "导演": director,
            "年份": year,
            "地区": area,
            "类型": genre,
            "评分": score
        }
        all_movies.append(movie)

    time.sleep(1)  # throttle between pages to avoid anti-scraping bans

# ------------------- write TXT -------------------
with open("douban_top250.txt", "w", encoding="utf-8") as f:
    for m in all_movies:
        f.write(f"排名:{m['排名']}\n")
        f.write(f"片名:{m['片名']}\n")
        f.write(f"导演:{m['导演']}\n")
        f.write(f"年份:{m['年份']}\n")
        f.write(f"地区:{m['地区']}\n")
        f.write(f"类型:{m['类型']}\n")
        f.write(f"评分:{m['评分']}\n")
        f.write("-" * 50 + "\n")

print("全部爬完!已保存到 douban_top250.txt")
year_area_genre = info[1].strip().split("/") + year = year_area_genre[0].strip() + area = year_area_genre[1].strip() + genre = year_area_genre[2].strip() + + # 添加到数据列表 + movies_data.append({ + "排名": rank, + "片名": title, + "导演": director, + "上映年份": year, + "地区": area, + "类型": genre, + "评分": score + }) + + print(f"第 {page+1} 页爬取完成") + time.sleep(1) # 延迟1秒,避免反爬 + except Exception as e: + print(f"第 {page+1} 页爬取失败:{str(e)}") + +# 保存为CSV文件 +with open("豆瓣电影Top250.csv", "w", encoding="utf-8-sig", newline="") as f: + # 表头 + fieldnames = ["排名", "片名", "导演", "上映年份", "地区", "类型", "评分"] + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(movies_data) + +print("✅ 数据已保存为「豆瓣电影Top250.csv」") \ No newline at end of file diff --git a/20260331/text3.py b/20260331/text3.py new file mode 100644 index 0000000..0c70a8d --- /dev/null +++ b/20260331/text3.py @@ -0,0 +1,64 @@ +import requests +from bs4 import BeautifulSoup +import json +import time + +# 请求头(模拟浏览器) +headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" +} + +# 存储所有电影数据 +movies_data = [] + +# 爬取10页(每页25条,共250条) +for page in range(10): + start = page * 25 + url = f"https://movie.douban.com/top250?start={start}&filter=" + + try: + # 发送请求获取页面 + response = requests.get(url, headers=headers, timeout=10) + response.raise_for_status() + soup = BeautifulSoup(response.text, "html.parser") + + # 提取当前页所有电影项 + items = soup.find_all("div", class_="item") + for item in items: + # 提取排名、片名、评分 + rank = item.find("em").text + title = item.find("span", class_="title").text + score = item.find("span", class_="rating_num").text + + # 提取导演、年份、地区、类型 + info = item.find("div", class_="bd").p.text.strip().split("\n") + director_line = info[0].strip() + director = director_line.split("导演:")[1].split("主演:")[0].strip() + + year_area_genre = info[1].strip().split("/") + year = year_area_genre[0].strip() + area = 
year_area_genre[1].strip() + genre = year_area_genre[2].strip() + + # 添加到数据列表 + movies_data.append({ + "排名": rank, + "片名": title, + "导演": director, + "上映年份": year, + "地区": area, + "类型": genre, + "评分": score + }) + + print(f"第 {page+1} 页爬取完成") + time.sleep(1) # 延迟1秒,避免反爬 + except Exception as e: + print(f"第 {page+1} 页爬取失败:{str(e)}") + +# 保存为JSON文件 +with open("豆瓣电影Top250.json", "w", encoding="utf-8") as f: + # ensure_ascii=False 保证中文正常显示,indent=2 格式化缩进 + json.dump(movies_data, f, ensure_ascii=False, indent=2) + +print("✅ 数据已保存为「豆瓣电影Top250.json」") \ No newline at end of file