Files
2026-03-31 11:29:51 +08:00

73 lines
2.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
from bs4 import BeautifulSoup
import json
import time
import random
import re
# Shared request headers for every Douban page fetch: a desktop Chrome
# User-Agent plus Accept-Language and a Referer — presumably to make the
# requests look like a regular browser session (TODO confirm which of
# these Douban actually requires).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Referer": "https://movie.douban.com/"
}
def parse_movie_info(item):
    """Parse one movie entry from a Douban Top250 list item.

    Args:
        item: BeautifulSoup tag for a single ``div.item`` block; only its
            ``find(...)``/``.text`` interface is used.

    Returns:
        dict with keys ``rank`` (int), ``title`` (str), ``en_title`` (str),
        ``rating`` (float), ``quote`` (str, "" when absent) and ``year``
        (str, "" when no 4-digit number appears in the info paragraph),
        or ``None`` when a required sub-element is missing or malformed.
    """
    try:
        rank = item.find("em").text.strip()
        title = item.find("span", class_="title").text.strip()
        other_span = item.find("span", class_="other")
        # Alternate titles come as "/ Foo / Bar"; strip the slash separators.
        en_title = other_span.text.strip().replace("/ ", "") if other_span else ""
        rating = item.find("span", class_="rating_num").text.strip()
        quote_span = item.find("span", class_="inq")
        quote = quote_span.text.strip() if quote_span else ""
        info_p = item.find("div", class_="bd").find("p").text
        # Run the year regex once (the original searched twice: once for the
        # truthiness test and once more for .group(1)).
        year_match = re.search(r"(\d{4})", info_p)
        year = year_match.group(1) if year_match else ""
        return {
            "rank": int(rank),
            "title": title,
            "en_title": en_title,
            "rating": float(rating),
            "quote": quote,
            "year": year
        }
    except Exception as e:
        # Best-effort parsing: log the failure and let the caller skip it.
        print(f"解析电影信息失败:{e}")
        return None
def crawl_douban_top250():
    """Crawl all 10 pages of Douban Movie Top250 and save them as JSON.

    Fetches each page with a randomized delay, parses every movie item via
    ``parse_movie_info``, writes the collected records to ``movies.json``
    (UTF-8, human-readable), and returns them.

    Returns:
        list[dict]: successfully parsed movie records, in crawl order.
    """
    all_movies = []
    base_url = "https://movie.douban.com/top250"
    for page_num in range(10):
        # Each page lists 25 movies; ``start`` is the 0-based offset.
        url = f"{base_url}?start={page_num * 25}&filter="
        try:
            # Randomized delay between requests to avoid hammering the site.
            time.sleep(random.uniform(1.5, 2.5))
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            movie_items = soup.find_all("div", class_="item")
            # Count only entries that actually parsed; the original reported
            # len(movie_items), which overstated the total on parse failures.
            parsed = 0
            for item in movie_items:
                movie_info = parse_movie_info(item)
                if movie_info:
                    all_movies.append(movie_info)
                    parsed += 1
            print(f"✅ 第{page_num + 1}页爬取完成,已获取{parsed}部电影")
        except Exception as e:
            # Best effort: report the failed page and keep crawling the rest.
            print(f"❌ 第{page_num + 1}页爬取失败:{str(e)[:50]}...")
    with open("movies.json", "w", encoding="utf-8") as f:
        json.dump(all_movies, f, ensure_ascii=False, indent=2)
    print(f"\n🎉 爬取完成!共收录{len(all_movies)}部电影")
    print("📄 文件保存路径movies.json")
    return all_movies
# Entry point: run the full crawl when executed as a script.
if __name__ == "__main__":
    crawl_douban_top250()