From b276802b0dfa0fddc14206b026532087e01f2140 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=BF=9E=E5=85=B4=E6=9D=B0?= <2509165004@student.example.com>
Date: Thu, 2 Apr 2026 15:56:38 +0800
Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E4=BB=B6=E8=87=B3?=
 =?UTF-8?q?=20/?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 260402-2509165004.py | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 260402-2509165004.py

diff --git a/260402-2509165004.py b/260402-2509165004.py
new file mode 100644
index 0000000..ab6eb8d
--- /dev/null
+++ b/260402-2509165004.py
@@ -0,0 +1,72 @@
+"""Scrape the Douban Top 250 movie list and export it as CSV and JSON."""
+import re
+import csv
+import json
+import requests
+
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
+}
+
+# NOTE(review): the HTML anchors in these patterns were reconstructed; confirm
+# them against the markup the live page actually serves.
+ITEM_MARKER = '<div class="item">'
+TITLE_RE = re.compile(r'<span class="title">(.*?)</span>', re.S)
+SCORE_RE = re.compile(r'<span class="rating_num"[^>]*>(.*?)</span>', re.S)  # rating
+VOTES_RE = re.compile(r'<span>([^<]*)人评价</span>')
+QUOTE_RE = re.compile(r'<span class="inq">(.*?)</span>', re.S)
+
+
+def parse_movies(html):
+    """Return one dict per movie entry found in a page of HTML.
+
+    Each entry is parsed on its own so that a missing field cannot make a
+    pattern run on into the next entry. Entries lacking a title, rating or
+    vote count are skipped; a missing quote is stored as an empty string.
+    """
+    movies = []
+    # Text before the first marker is not a movie entry, so drop it.
+    for entry in html.split(ITEM_MARKER)[1:]:
+        title = TITLE_RE.search(entry)
+        score = SCORE_RE.search(entry)
+        votes = VOTES_RE.search(entry)
+        if not (title and score and votes):
+            continue
+        quote = QUOTE_RE.search(entry)
+        movies.append({
+            "电影名称": title.group(1).strip(),
+            "评分": score.group(1).strip(),
+            "评价人数": votes.group(1).strip(),
+            "短评": quote.group(1).strip() if quote else "",
+        })
+    return movies
+
+
+movie_list = []
+for start in range(0, 250, 25):
+    url = f"https://movie.douban.com/top250?start={start}"
+    print(f"正在爬取:{url}")
+
+    try:
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+    except requests.RequestException as e:
+        # A failed page is reported and skipped; the remaining pages still run.
+        print(f"爬取失败:{e}")
+        continue
+
+    movie_list.extend(parse_movies(response.text))
+
+with open("douban_top250.csv", "w", encoding="utf-8-sig", newline="") as f:
+    fieldnames = ["电影名称", "评分", "评价人数", "短评"]
+    writer = csv.DictWriter(f, fieldnames=fieldnames)
+    writer.writeheader()
+    writer.writerows(movie_list)
+
+print("✅ CSV 文件已保存:douban_top250.csv")
+
+with open("douban_top250.json", "w", encoding="utf-8") as f:
+    json.dump(movie_list, f, ensure_ascii=False, indent=4)
+
+print("✅ JSON 文件已保存:douban_top250.json")
+print(f"\n🎉 爬取完成!共获取 {len(movie_list)} 部电影数据")