A Complete Guide to Python File Operations

Author: 2509165015
Date: 2026-03-31 11:29:51 +08:00
parent fda798f369
commit 1ce8948e89
6 changed files with 2155 additions and 0 deletions

0331+2509165015/Text.3.py (new file)

@@ -0,0 +1,73 @@
import requests
from bs4 import BeautifulSoup
import json
import time
import random
import re

# Browser-like request headers; Douban rejects requests without a real User-Agent
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Referer": "https://movie.douban.com/"
}
def parse_movie_info(item):
    """Parse a single movie entry from the list page."""
    try:
        rank = item.find("em").text.strip()
        title = item.find("span", class_="title").text.strip()
        other_span = item.find("span", class_="other")
        en_title = other_span.text.strip().replace("/ ", "") if other_span else ""
        rating = item.find("span", class_="rating_num").text.strip()
        quote_span = item.find("span", class_="inq")  # one-line tagline; missing for some movies
        quote = quote_span.text.strip() if quote_span else ""
        info_p = item.find("div", class_="bd").find("p").text
        year_match = re.search(r"(\d{4})", info_p)  # first 4-digit number in the info block is the release year
        year = year_match.group(1) if year_match else ""
        return {
            "rank": int(rank),
            "title": title,
            "en_title": en_title,
            "rating": float(rating),
            "quote": quote,
            "year": year
        }
    except Exception as e:
        print(f"Failed to parse movie entry: {e}")
        return None
def crawl_douban_top250():
    """Crawl all pages of the Douban Top 250 and save the results as JSON."""
    all_movies = []
    base_url = "https://movie.douban.com/top250"
    for page_num in range(10):  # 25 movies per page, 10 pages in total
        url = f"{base_url}?start={page_num * 25}&filter="
        try:
            time.sleep(random.uniform(1.5, 2.5))  # random delay to avoid hammering the server
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            movie_items = soup.find_all("div", class_="item")
            for item in movie_items:
                movie_info = parse_movie_info(item)
                if movie_info:
                    all_movies.append(movie_info)
            print(f"✅ Page {page_num + 1} done, {len(movie_items)} movies fetched on this page")
        except Exception as e:
            print(f"❌ Page {page_num + 1} failed: {str(e)[:50]}...")
            continue
    with open("movies.json", "w", encoding="utf-8") as f:
        json.dump(all_movies, f, ensure_ascii=False, indent=2)
    print(f"\n🎉 Crawl finished! Collected {len(all_movies)} movies")
    print("📄 Saved to: movies.json")
    return all_movies

if __name__ == "__main__":
    crawl_douban_top250()
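
A quick way to sanity-check the output is to load movies.json back with json.load. The following is a minimal sketch, not part of this commit; it assumes crawl_douban_top250() has already written movies.json next to the script:

import json

# Load the crawled data back; the encoding must match what was written
with open("movies.json", encoding="utf-8") as f:
    movies = json.load(f)

print(f"{len(movies)} movies loaded")
# Highest-rated entry (ties broken arbitrarily)
best = max(movies, key=lambda m: m["rating"])
print(best["title"], best["rating"])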