"""Scrape the Douban Top 250 movie list and save it as CSV and JSON."""

import csv
import json
import re

# Browser-like User-Agent: Douban rejects requests with the default
# python-requests UA.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
    )
}

# NOTE(review): the pattern in the original file was garbled (the HTML tag
# text inside the raw strings had been stripped, leaving an unterminated
# string literal). Reconstructed from the Douban Top 250 list-page markup;
# confirm against a live page. re.S lets ".*?" cross newlines.
# Groups: (1) title, (2) rating, (3) rating count, (4) one-line quote.
# A movie entry with no <span class="inq"> quote will not be matched —
# same limitation as the original pattern.
MOVIE_PATTERN = re.compile(
    r'<div class="item">.*?'
    r'<span class="title">(.*?)</span>.*?'
    r'<span class="rating_num"[^>]*>(.*?)</span>.*?'  # 评分
    r'<span>(.*?)人评价</span>.*?'
    r'<span class="inq">(.*?)</span>',
    re.S,
)

# Output column order shared by the CSV writer.
FIELDNAMES = ["电影名称", "评分", "评价人数", "短评"]


def parse_movies(html):
    """Extract movie dicts from one Top-250 list page's HTML.

    Returns a list of dicts keyed by the Chinese column names in
    FIELDNAMES; all values are stripped strings.
    """
    return [
        {
            "电影名称": name.strip(),
            "评分": score.strip(),
            "评价人数": comment_num.strip(),
            "短评": comment.strip(),
        }
        for name, score, comment_num, comment in MOVIE_PATTERN.findall(html)
    ]


def crawl():
    """Fetch all 10 list pages (25 movies each) and return the parsed rows.

    A page that fails to download is reported and skipped; the crawl
    continues with the remaining pages (best-effort, as in the original).
    """
    # Imported lazily so the parsing/saving helpers work without the
    # third-party dependency installed.
    import requests

    movie_list = []
    for start in range(0, 250, 25):
        url = f"https://movie.douban.com/top250?start={start}"
        print(f"正在爬取:{url}")
        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            # Narrowed from a bare `except Exception`: only network/HTTP
            # errors are expected here; anything else should surface.
            print(f"爬取失败:{e}")
            continue
        movie_list.extend(parse_movies(response.text))
    return movie_list


def save_csv(movie_list, path="douban_top250.csv"):
    """Write the rows to CSV. utf-8-sig adds a BOM so Excel opens it correctly."""
    with open(path, "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(movie_list)
    print("✅ CSV 文件已保存:douban_top250.csv")


def save_json(movie_list, path="douban_top250.json"):
    """Write the rows to pretty-printed JSON, keeping CJK characters readable."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(movie_list, f, ensure_ascii=False, indent=4)
    print("✅ JSON 文件已保存:douban_top250.json")


def main():
    """Run the full crawl and persist the results in both formats."""
    movie_list = crawl()
    save_csv(movie_list)
    save_json(movie_list)
    print(f"\n🎉 爬取完成!共获取 {len(movie_list)} 部电影数据")


if __name__ == "__main__":
    main()