"""Scrape the first pages of the Douban Top 250 chart into a JSON file.

For every movie the script records its chart rank, title, leading cast and
the one-line quote shown on the chart, then writes the list to
``movies.json``.
"""

import json
import re
from html import unescape

# NOTE(review): the HTML tag literals of these patterns were lost in the copy
# under review; they are reconstructed from the original inline comments
# ("span with class=title", "span under quote", text after "主演:" up to the
# line break). Confirm them against the live page markup before relying on
# the output.

# One movie entry: from the opening item <div> to the end of its list element.
_ITEM_RE = re.compile(r'<div\s+class="item"[^>]*>.*?</li>', re.DOTALL)
# First <span class="title"> inside an entry.
_TITLE_RE = re.compile(r'<span\s+class="title"[^>]*>([^<]+)</span>')
# Text after "主演:" up to the next line break, &nbsp; entity or paragraph end.
_ACTORS_RE = re.compile(r'主演:\s*(.*?)(?:<br\s*/?>|&nbsp;|</p>)', re.DOTALL)
# Content of the first <span> inside the quote paragraph.
_QUOTE_RE = re.compile(
    r'<p\s+class="quote"[^>]*>\s*<span[^>]*>(.*?)</span>', re.DOTALL
)

_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"

# Page offsets to crawl: two pages of 25 entries, i.e. the first 50 movies.
_PAGE_OFFSETS = (0, 25)


def fetch_page(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The page body as text.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status (an error page must not be parsed as an empty
            movie list).
    """
    # Imported here rather than at module level so the pure parsing helpers
    # stay importable on machines without the third-party package.
    import requests

    headers = {"User-Agent": _USER_AGENT}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    response.encoding = "utf-8"
    return response.text


def _first_group(pattern, text, default):
    """Return the cleaned first capture group of *pattern* in *text*.

    HTML entities are decoded and surrounding whitespace removed; *default*
    is returned when the pattern does not match.
    """
    match = pattern.search(text)
    if match is None:
        return default
    return unescape(match.group(1)).strip()


def extract_movie_info(html: str, start_rank: int = 1) -> list:
    """Extract the movies listed in one chart page.

    Args:
        html: Raw HTML of a chart page.
        start_rank: Rank assigned to the first movie found on this page.
            Pass ``offset + 1`` for later pages so ranks keep counting up
            instead of restarting at 1 on every page.

    Returns:
        A list of dicts with the keys ``rank``, ``title``, ``actors`` and
        ``quote``, in page order. Missing fields are filled with the
        placeholders "无标题", "无主演" and "无短评".
    """
    items = _ITEM_RE.findall(html)
    print(f"找到 {len(items)} 个电影 item 块")

    return [
        {
            "rank": rank,
            "title": _first_group(_TITLE_RE, item, "无标题"),
            "actors": _first_group(_ACTORS_RE, item, "无主演"),
            "quote": _first_group(_QUOTE_RE, item, "无短评"),
        }
        for rank, item in enumerate(items, start=start_rank)
    ]


def save_to_json(movies: list, filename: str) -> None:
    """Write *movies* to *filename* as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than as ``\\u`` escapes so
    the file stays human-readable.
    """
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(movies, f, ensure_ascii=False, indent=2)


def main() -> None:
    """Crawl the configured chart pages, save the result and print a preview."""
    all_movies = []
    for offset in _PAGE_OFFSETS:
        url = f"https://movie.douban.com/top250?start={offset}"
        print(f"\n正在获取: {url}")
        html = fetch_page(url)
        print(f"页面长度: {len(html)}")

        # The page starting at `offset` holds ranks offset+1 onwards.
        all_movies.extend(extract_movie_info(html, start_rank=offset + 1))

    print(f"\n总共提取到 {len(all_movies)} 部电影")
    save_to_json(all_movies, "movies.json")
    print("结果已保存到 movies.json")

    # Print the first three entries as a quick sanity check of the data.
    print("\n==== 前 3 部电影信息 ====")
    for m in all_movies[:3]:
        print(f"{m['rank']}. {m['title']}")
        print(f"主演: {m['actors']}")
        print(f"短评: {m['quote']}\n")


if __name__ == "__main__":
    main()