import requests
import re
import csv
import time
import random

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Referer": "https://movie.douban.com/",
}

movie_pattern = re.compile(
    # One <li> per movie: rank in <em class="">, Chinese and English titles in
    # two consecutive <span class="title"> tags (the second prefixed with
    # &nbsp;/&nbsp;), rating in <span class="rating_num">. The HTML tags here
    # are reconstructed to match the Douban Top 250 list markup.
    r'<li>.*?'
    r'<em class="">(\d+)</em>.*?'
    r'<span class="title">([^&]+?)</span>.*?'
    r'<span class="title">&nbsp;/&nbsp;([^<]+?)</span>.*?'
    r'<span class="rating_num" property="v:average">(\d+\.\d+)</span>',
    re.S,
)


def crawl_douban_top250():
    all_movies = []
    # The list is paginated 25 per page via the ?start= query parameter.
    for page in range(0, 250, 25):
        url = f"https://movie.douban.com/top250?start={page}&filter="
        try:
            # Randomized delay between requests to stay polite and avoid blocking.
            time.sleep(random.uniform(1, 2))
            response = requests.get(url=url, headers=headers, timeout=15)
            response.raise_for_status()
            page_source = response.text
            movies = movie_pattern.findall(page_source)
            all_movies.extend(movies)
            print(f"Page {page // 25 + 1} scraped successfully")
        except Exception as e:
            print(f"Page {page // 25 + 1} failed: {e}")
            continue

    with open("movies.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Rank", "Chinese Title", "English Title", "Rating"])
        writer.writerows(all_movies)
    print(f"All done! Scraped {len(all_movies)} movies; saved to movies.csv")


if __name__ == "__main__":
    crawl_douban_top250()
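

# Optional sanity check (a minimal sketch, not part of the original script):
# read movies.csv back and print the first few rows to confirm the regex
# captured rank/titles/rating as expected. `preview_csv` is an illustrative
# helper name, and it assumes the file written by crawl_douban_top250() above.
def preview_csv(path="movies.csv", n=5):
    with open(path, encoding="utf-8", newline="") as f:
        for i, row in enumerate(csv.reader(f)):
            print(row)
            if i >= n:  # header row plus the first n data rows
                break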