From 2190cdde25d7b879e669721152b229964911e21c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=BF=9E=E5=85=B4=E6=9D=B0?=
 <2509165004@student.example.com>
Date: Tue, 9 Jun 2026 11:20:49 +0800
Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E4=BB=B6=E8=87=B3?=
 =?UTF-8?q?=20/?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 爬虫top250.py | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 爬虫top250.py
diff --git a/爬虫top250.py b/爬虫top250.py
new file mode 100644
index 0000000..d270e43
--- /dev/null
+++ b/爬虫top250.py
@@ -0,0 +1,75 @@
+import re
+import json
+import requests
+
+
+def fetch_page(url, timeout=10):
+    """Return the UTF-8 text of ``url``; raise ``requests.RequestException`` on timeout or HTTP error."""
+    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"}
+    # ``timeout`` (seconds) bounds the wait so a stalled connection cannot hang the script.
+    response = requests.get(url, headers=headers, timeout=timeout)
+    response.raise_for_status()  # fail on 4xx/5xx instead of parsing an error page as a movie list
+    response.encoding = "utf-8"
+    return response.text
+
+
+def extract_movie_info(html: str) -> list:
+    """
+    Extract every movie entry from one Douban Top 250 list page.
+
+    Args:
+        html: Raw HTML of a single list page.
+
+    Returns:
+        A list of dicts with keys ``rank`` (1-based position on this page, not
+        the global rank), ``title``, ``actors`` and ``quote``; a field that is
+        not found falls back to its Chinese placeholder string.
+    """
+    # A non-greedy `.*?</div>` ends at the first nested </div> and cuts the entry short,
+    # so slice each entry from its opening tag to the next entry's opening tag instead.
+    starts = [m.start() for m in re.finditer(r'<div class="item">', html)]
+    items = [html[a:b] for a, b in zip(starts, starts[1:] + [len(html)])]
+    print(f"找到 {len(items)} 个电影 item 块")
+    movies = []
+    for rank, item in enumerate(items, start=1):
+        title_match = re.search(r'<span class="title"[^>]*>([^<]+)</span>', item)
+        actors_match = re.search(r'主演:\s*(.*?)(?:<br>|&nbsp;|</p>)', item, re.DOTALL)
+        # `<span[^>]*>` also accepts attributes (e.g. a class) on the quote span.
+        quote_match = re.search(r'<p class="quote".*?<span[^>]*>(.*?)</span>', item, re.DOTALL)
+        movies.append({
+            "rank": rank,
+            "title": title_match.group(1).strip() if title_match else "无标题",
+            "actors": actors_match.group(1).strip() if actors_match else "无主演",
+            "quote": quote_match.group(1).strip() if quote_match else "无短评",
+        })
+    return movies
+
+
+def save_to_json(movies: list, filename: str):
+    """Write ``movies`` to ``filename`` as 2-space-indented UTF-8 JSON, keeping non-ASCII text unescaped."""
+    with open(filename, "w", encoding="utf-8") as handle:
+        json.dump(movies, handle, indent=2, ensure_ascii=False)
+
+
+if __name__ == "__main__":
+    # Scrape the first 50 movies: two list pages of 25 entries each.
+    all_movies = []
+    for offset in [0, 25]:
+        url = f"https://movie.douban.com/top250?start={offset}"
+        print(f"\n正在获取: {url}")
+        html = fetch_page(url)
+        print(f"页面长度: {len(html)}")
+        page_movies = extract_movie_info(html)
+        # Ranks are page-relative (1..25); add the page offset so page two continues at 26.
+        for movie in page_movies:
+            movie["rank"] += offset
+        all_movies.extend(page_movies)
+    print(f"\n总共提取到 {len(all_movies)} 部电影")
+    save_to_json(all_movies, "movies.json")
+    print("结果已保存到 movies.json")
+    # Print the first three entries as a quick sanity check of the scraped data.
+    print("\n==== 前 3 部电影信息 ====")
+    for m in all_movies[:3]:
+        print(f"{m['rank']}. {m['title']}")
+        print(f"主演: {m['actors']}")
+        print(f"短评: {m['quote']}\n")
\ No newline at end of file