"""Scrape the Douban Top 250 movie list and export it as TXT, CSV and JSON."""

import csv
import json
import random
import re
import time

try:
    import requests
except ImportError:
    # Keep the module importable so parse_html/save_data stay usable offline;
    # get_movie_data reports the missing dependency when it is actually needed.
    requests = None

BASE_URL = "https://movie.douban.com/top250?start={}&filter="
HEADERS = {
    "User-Agent": "Mozilla/5.0(Windows NT 10.0;Win64;x64) AppleWebKit/537.36(KHTML,like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

PAGE_COUNT = 10  # number of list pages requested
PAGE_SIZE = 25  # step of the "start" query parameter between pages
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled connection hangs forever

# NOTE(review): the HTML tag anchors of these patterns were missing from the
# reviewed source (the tags had been stripped, leaving bare "(.*?)" groups
# that always match the empty string). They are reconstructed here from the
# Douban Top 250 list markup -- verify against a live page before relying on
# them.
_ITEM_RE = re.compile(r"<li>.*?</li>", re.S)
_TITLE_RE = re.compile(r'<span class="title">(.*?)</span>', re.S)
_RATING_RE = re.compile(r'<span class="rating_num"[^>]*>(.*?)</span>', re.S)
_PEOPLE_RE = re.compile(r"(\d+)人评价", re.S)
_QUOTE_RE = re.compile(r'<p class="quote">\s*<span[^>]*>(.*?)</span>', re.S)
_INFO_RE = re.compile(r'<div class="bd">\s*<p[^>]*>(.*?)</p>', re.S)
_WHITESPACE_RE = re.compile(r"\s+")

# Shared accumulator: parse_html appends to it, save_data reads from it.
all_movies = []


def _first_group(pattern, text, default):
    """Return group 1 of the first match of *pattern* in *text*, else *default*."""
    match = pattern.search(text)
    return match.group(1) if match else default


def get_movie_data():
    """Download every list page and feed its HTML to ``parse_html``.

    Pages that fail (network error or non-200 status) are reported on stdout
    and skipped; the remaining pages are still fetched. After each successful
    page the function sleeps 1-2 seconds before requesting the next one.

    Raises:
        ImportError: if the ``requests`` package is not installed.
    """
    if requests is None:
        raise ImportError("The 'requests' package is required to fetch pages.")

    print("正在开始爬取豆瓣Top 250 数据...")
    for i in range(PAGE_COUNT):
        url = BASE_URL.format(i * PAGE_SIZE)
        # Only the network call is guarded, so a parsing bug is not silently
        # reported as a download error.
        try:
            response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(f"发生错误:{e}")
            continue

        if response.status_code != 200:
            print(f"第{i+1}页爬取失败,状态码:{response.status_code}")
            continue

        parse_html(response.text)
        print(f"第{i+1}页爬取完成...")
        time.sleep(random.uniform(1, 2))


def parse_html(html):
    """Extract the movie entries from one list page into ``all_movies``.

    Every ``<li>`` element containing ``class="item"`` yields one dict with
    the keys ``title``, ``rating``, ``people``, ``info`` and ``quote``; all
    values are strings. A field that cannot be found gets a placeholder
    instead of aborting the entry.

    Args:
        html: HTML text of a single list page.
    """
    for li in _ITEM_RE.findall(html):
        # Skip list items that are not movie entries.
        if 'class="item"' not in li:
            continue

        info_match = _INFO_RE.search(li)
        if info_match:
            # All whitespace is removed, including spaces between words.
            info_clean = _WHITESPACE_RE.sub("", info_match.group(1))
        else:
            info_clean = "未知信息"

        all_movies.append(
            {
                "title": _first_group(_TITLE_RE, li, "未知标题"),
                "rating": _first_group(_RATING_RE, li, "0"),
                "people": _first_group(_PEOPLE_RE, li, "0"),
                "info": info_clean,
                "quote": _first_group(_QUOTE_RE, li, "无引言"),
            }
        )


def save_data():
    """Write ``all_movies`` to TXT, CSV and JSON files in the working directory.

    The TXT file has one line per movie and omits the ``info`` field; the CSV
    and JSON files contain every field.
    """
    print("正在保存数据...")

    with open("douban_top250.txt", "w", encoding="utf-8") as f:
        f.writelines(
            f"电影名:{movie['title']} | 评分:{movie['rating']} | 评价人数:{movie['people']} | 引言:{movie['quote']}\n"
            for movie in all_movies
        )
    print("已保存为douban_top250.txt")

    # newline="" is what the csv module expects; the utf-8-sig encoding
    # writes a BOM at the start of the file.
    with open("douban_top250.csv", "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(["电影名", "评分", "评价人数", "详细信息", "引言"])
        writer.writerows(
            [movie["title"], movie["rating"], movie["people"], movie["info"], movie["quote"]]
            for movie in all_movies
        )
    print("已保存为douban_top250.csv")

    with open("douban_top250.json", "w", encoding="utf-8") as f:
        json.dump(all_movies, f, ensure_ascii=False, indent=4)
    print("已保存为douban_top250.json")


if __name__ == "__main__":
    get_movie_data()
    save_data()
    print("全部任务完成!")