import requests
import re
import time
import random
import csv
from bs4 import BeautifulSoup


# Save results to CSV
def save_to_csv(data, filename="douban_top250_with_comments.csv"):
    """Write the scraped movie records to a CSV file.

    Args:
        data: list of dicts keyed by 排名/片名/评分/评分人数/热门评语
            (rank / title / rating / rating count / featured comment).
        filename: output path; defaults to douban_top250_with_comments.csv.
    """
    # utf-8-sig writes a BOM so Excel recognises the UTF-8 Chinese headers.
    with open(filename, "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.DictWriter(
            f, fieldnames=["排名", "片名", "评分", "评分人数", "热门评语"]
        )
        writer.writeheader()
        writer.writerows(data)


# Crawl the Douban Top 250 list
def crawl_douban_top250():
    """Scrape the Douban Top-250 movie list (10 pages x 25 movies).

    Returns:
        A list of dicts, one per movie, containing rank, title, rating,
        number of ratings and the featured one-line comment. Pages that
        fail to download or parse are logged and skipped (best effort).
    """
    base_url = "https://movie.douban.com/top250"
    headers = {
        # Realistic browser UA + referer: Douban rejects the default
        # python-requests user agent.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
        "Referer": "https://movie.douban.com/",
    }
    movie_list = []

    # 10 pages, 25 entries per page
    for start in range(0, 250, 25):
        url = f"{base_url}?start={start}"
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            # Fail fast on 4xx/5xx (e.g. rate-limit pages) instead of
            # silently parsing an error page into empty results.
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")
            items = soup.find_all("div", class_="item")
            for item in items:
                # Rank (排名)
                rank = item.find("em").text.strip()
                # Title (片名)
                title = item.find("span", class_="title").text.strip()
                # Rating (评分)
                rating = item.find("span", class_="rating_num").text.strip()
                # Rating count (评分人数); guard the regex match so a layout
                # change cannot crash with AttributeError on .group(1).
                eval_text = item.find("div", class_="star").find_all("span")[-1].text
                m = re.search(r"(\d+)人评价", eval_text)
                eval_num = m.group(1) if m else "0"
                # Featured short comment (热门短评) — not every movie has one
                comment_tag = item.find("span", class_="inq")
                comment = comment_tag.text.strip() if comment_tag else "无评语"

                movie_list.append({
                    "排名": rank,
                    "片名": title,
                    "评分": rating,
                    "评分人数": eval_num,
                    "热门评语": comment
                })
                print(f"{rank}. {title} | 评分:{rating} | 评价:{eval_num} | 评语:{comment[:20]}...")
            # Random per-page delay to avoid getting banned (防封)
            time.sleep(random.uniform(1, 2.5))
        except Exception as e:
            # Best effort: log this page's failure and continue with the next
            print(f"爬取失败:{e}")
            continue

    return movie_list


if __name__ == "__main__":
    data = crawl_douban_top250()
    save_to_csv(data)