import requests
import re
import json
import csv
import time
import random

# Douban Top 250 list, paginated 25 movies per page via the `start` query param.
BASE_URL = "https://movie.douban.com/top250?start={}&filter="

HEADERS = {
    "User-Agent":"Mozilla/5.0(Windows NT 10.0;Win64;x64) AppleWebKit/537.36(KHTML,like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Accumulates one dict per movie across all pages; flushed to disk by save_data().
all_movies = []


def get_movie_data():
    """Fetch all 10 pages of the Douban Top 250 list and parse each one.

    Side effects: appends parsed movie dicts to the module-level
    ``all_movies`` list and prints progress to stdout.
    """
    print("正在开始爬取豆瓣Top 250 数据...")

    for i in range(0, 10):
        start_num = i * 25  # each page holds 25 entries
        url = BASE_URL.format(start_num)

        try:
            # FIX: added a timeout so a stalled connection cannot hang the
            # whole crawl indefinitely (the original call had none).
            response = requests.get(url, headers=HEADERS, timeout=10)

            if response.status_code == 200:
                parse_html(response.text)
                print(f"第{i+1}页爬取完成...")

                # Randomized delay between requests to avoid hammering the server.
                time.sleep(random.uniform(1, 2))
            else:
                print(f"第{i+1}页爬取失败,状态码:{response.status_code}")
        except Exception as e:
            # Best-effort crawl: log the failure and continue with the next
            # page instead of aborting the whole run.
            print(f"发生错误:{e}")


def parse_html(html):
    """Extract movie records from one Top-250 page into ``all_movies``.

    NOTE(review): the regex patterns below were reconstructed — the original
    file's HTML-tag text was stripped by an encoding/rendering step. They
    follow Douban's standard Top-250 markup; verify against a live page.

    :param html: raw HTML text of one list page.
    """
    # Split the page into per-movie <li> fragments. No capture group, so each
    # match is the full "<li>...</li>" string (hence the class check below).
    li_list = re.findall(r'<li>.*?</li>', html, re.S)

    for li in li_list:
        # Skip <li> elements that are not movie entries (nav, pagination, ...).
        if 'class="item"' not in li:
            continue

        title_match = re.search(r'<span class="title">(.*?)</span>', li, re.S)
        title = title_match.group(1) if title_match else "未知标题"

        rating_match = re.search(r'<span class="rating_num" property="v:average">(.*?)</span>', li, re.S)
        rating = rating_match.group(1) if rating_match else "0"

        people_match = re.search(r'<span>(\d+)人评价</span>', li, re.S)
        people = people_match.group(1) if people_match else "0"

        quote_match = re.search(r'<span class="inq">(.*?)</span>', li, re.S)
        quote = quote_match.group(1) if quote_match else "无引言"

        info_match = re.search(r'<p class="">(.*?)</p>', li, re.S)
        if info_match:
            # Collapse all whitespace: the raw block is heavily indented HTML text.
            info_clean = re.sub(r'\s+', '', info_match.group(1)).strip()
        else:
            info_clean = "未知信息"

        all_movies.append({
            "title": title,
            "rating": rating,
            "people": people,
            "info": info_clean,
            "quote": quote,
        })


def save_data():
    """Persist ``all_movies`` as plain text, CSV and JSON in the working dir."""
    print("正在保存数据...")

    with open("douban_top250.txt", "w", encoding="utf-8") as f:
        for movie in all_movies:
            line = f"电影名:{movie['title']} | 评分:{movie['rating']} | 评价人数:{movie['people']} | 引言:{movie['quote']}\n"
            f.write(line)
    print("已保存为douban_top250.txt")

    # utf-8-sig writes a BOM so Excel opens the CSV with correct Chinese text.
    with open("douban_top250.csv", "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(["电影名", "评分", "评价人数", "详细信息", "引言"])
        for movie in all_movies:
            writer.writerow([movie['title'], movie['rating'], movie['people'], movie['info'], movie['quote']])
    print("已保存为douban_top250.csv")

    with open("douban_top250.json", "w", encoding="utf-8") as f:
        json.dump(all_movies, f, ensure_ascii=False, indent=4)
    print("已保存为douban_top250.json")


if __name__ == "__main__":
    get_movie_data()
    save_data()
    print("全部任务完成!")