import json
import re

import requests

# NOTE(review): the regular expressions of the original extract_movie_info were
# lost (HTML-like text was stripped out of this file). Only the quote handling
# and its "无短评" fallback survived. The patterns and the _UNKNOWN fallback below
# are a reconstruction that assumes the usual Douban Top 250 list markup
# (<div class="item"> ... </li> per movie) — TODO confirm against a live page.
_ITEM_RE = re.compile(r'<div class="item">(.*?)</li>', re.DOTALL)
_RANK_RE = re.compile(r'<em[^>]*>\s*(\d+)\s*</em>')
_TITLE_RE = re.compile(r'<span class="title">([^<]*)</span>')
_ACTORS_RE = re.compile(r'主演:\s*([^<\n]*)')
_QUOTE_RE = re.compile(r'<p class="quote">\s*<span[^>]*>(.*?)</span>', re.DOTALL)

# Placeholder for rank/title/actors when a field is missing (assumed value,
# see the note above); the quote fallback "无短评" is the original one.
_UNKNOWN = "未知"


def fetch_page(url, timeout=10):
    """Download a page and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The response body as text.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx status (so an error page is never parsed as movie data).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # Force UTF-8 instead of relying on requests' charset detection.
    response.encoding = "utf-8"
    return response.text


def _first_group(pattern, text, default):
    """Return the stripped first capture group of *pattern* in *text*, or *default*."""
    match = pattern.search(text)
    return match.group(1).strip() if match else default


def extract_movie_info(html: str) -> list:
    """Extract every movie found in a list page.

    Args:
        html: Raw HTML of one list page.

    Returns:
        A list of dicts, one per movie, with the string keys ``rank``,
        ``title``, ``actors`` and ``quote``. Missing fields get a placeholder
        rather than raising. An empty list is returned when nothing matches.
    """
    movies = []
    # Non-greedy match so each captured block stops at the end of one movie.
    for item in _ITEM_RE.findall(html):
        movies.append(
            {
                "rank": _first_group(_RANK_RE, item, _UNKNOWN),
                "title": _first_group(_TITLE_RE, item, _UNKNOWN),
                "actors": _first_group(_ACTORS_RE, item, _UNKNOWN),
                "quote": _first_group(_QUOTE_RE, item, "无短评"),
            }
        )
    return movies


def save_to_json(movies: list, filename: str):
    """Write *movies* to *filename* as indented UTF-8 JSON.

    Non-ASCII characters are written as-is (not ``\\uXXXX`` escaped) so the
    file stays human-readable.
    """
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(movies, f, ensure_ascii=False, indent=2)


def main():
    """Scrape the first two list pages (50 movies), save them, and print a sample."""
    all_movies = []
    for offset in (0, 25):
        url = f"https://movie.douban.com/top250?start={offset}"
        print(f"\n正在获取: {url}")
        try:
            html = fetch_page(url)
        except requests.RequestException as exc:
            # Report the failed page and keep going with the remaining ones.
            print(f"获取失败: {exc}")
            continue
        print(f"页面长度: {len(html)}")
        all_movies.extend(extract_movie_info(html))

    print(f"\n总共提取到 {len(all_movies)} 部电影")
    save_to_json(all_movies, "movies.json")
    print("结果已保存到 movies.json")

    # Print the first three entries as a quick sanity check of the data.
    print("\n==== 前 3 部电影信息 ====")
    for m in all_movies[:3]:
        print(f"{m['rank']}. {m['title']}")
        print(f"主演: {m['actors']}")
        print(f"短评: {m['quote']}\n")


if __name__ == "__main__":
    main()