"""Scrape the Douban Top 250 movie list and export it to a CSV file."""

import csv
import random
import re
import time

import requests
from bs4 import BeautifulSoup

# Column headers of the output CSV; they double as the keys of each movie record.
CSV_FIELDS = ["排名", "片名", "评分", "评分人数", "热门评语"]

# Pulls the vote count out of text such as "123456人评价". Compiled once
# because it is applied to every movie entry.
_EVAL_COUNT_RE = re.compile(r"(\d+)人评价")

# The list is requested 25 entries at a time via the ``start`` query parameter.
_PAGE_SIZE = 25
_TOTAL_MOVIES = 250


def save_to_csv(data, filename="douban_top250_with_comments.csv"):
    """Write movie records to a CSV file.

    Args:
        data: Iterable of dicts keyed by the names in ``CSV_FIELDS``.
        filename: Destination path; an existing file is overwritten.
    """
    # "utf-8-sig" writes a BOM at the start of the file; newline="" lets the
    # csv module control line endings itself.
    with open(filename, "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_FIELDS)
        writer.writeheader()
        writer.writerows(data)


def _required_text(item, name, **attrs):
    """Return the stripped text of the first matching tag or raise ValueError."""
    tag = item.find(name, **attrs)
    if tag is None:
        raise ValueError(f"缺少元素 <{name}> {attrs}")
    return tag.text.strip()


def _parse_item(item):
    """Convert one ``<div class="item">`` element into a movie record dict.

    Raises:
        ValueError: If a required element or the vote count is missing.
    """
    rank = _required_text(item, "em")
    title = _required_text(item, "span", class_="title")
    rating = _required_text(item, "span", class_="rating_num")

    # The vote count is read from the last <span> inside the "star" block.
    star = item.find("div", class_="star")
    spans = star.find_all("span") if star is not None else []
    if not spans:
        raise ValueError(f"{rank}: 缺少评分区域")
    match = _EVAL_COUNT_RE.search(spans[-1].text)
    if match is None:
        raise ValueError(f"{rank}: 未找到评分人数")

    # The one-line quote is optional; fall back to a placeholder.
    quote_tag = item.find("span", class_="inq")
    comment = quote_tag.text.strip() if quote_tag is not None else "无评语"

    return {
        "排名": rank,
        "片名": title,
        "评分": rating,
        "评分人数": match.group(1),
        "热门评语": comment,
    }


def crawl_douban_top250():
    """Fetch every page of the Douban Top 250 list and parse the movies.

    A page that cannot be fetched (network error or non-2xx status) is
    reported and skipped. An entry that cannot be parsed is reported and
    skipped without discarding the rest of its page.

    Returns:
        A list of dicts keyed by the names in ``CSV_FIELDS``, in page order.
    """
    base_url = "https://movie.douban.com/top250"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
        "Referer": "https://movie.douban.com/",
    }
    movie_list = []

    for start in range(0, _TOTAL_MOVIES, _PAGE_SIZE):
        url = f"{base_url}?start={start}"
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            # Without this, an error response would be parsed as if it were
            # a normal result page and silently contribute nothing.
            resp.raise_for_status()
        except requests.RequestException as e:
            print(f"爬取失败:{e}")
        else:
            soup = BeautifulSoup(resp.text, "html.parser")
            items = soup.find_all("div", class_="item")
            if not items:
                print(f"爬取失败:{url} 未解析到任何条目")

            for item in items:
                try:
                    movie = _parse_item(item)
                except ValueError as e:
                    # One malformed entry must not cost the rest of the page.
                    print(f"解析失败:{e}")
                    continue

                movie_list.append(movie)
                print(
                    f"{movie['排名']}. {movie['片名']} | 评分:{movie['评分']} | "
                    f"评价:{movie['评分人数']} | 评语:{movie['热门评语'][:20]}..."
                )

        # Throttle between requests, including after a failed one; there is
        # nothing left to throttle after the final page.
        if start + _PAGE_SIZE < _TOTAL_MOVIES:
            time.sleep(random.uniform(1, 2.5))

    return movie_list


if __name__ == "__main__":
    data = crawl_douban_top250()
    save_to_csv(data)