"""Scrape the Douban Top 250 movie list (10 pages x 25 items) and save to CSV."""

import csv
import random
import time

import requests
from lxml import etree

# Pretend to be a normal desktop browser; Douban rejects the default
# python-requests User-Agent.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

# CSV column order; keys must match the dicts built in parse_page().
FIELDNAMES = ["排名", "电影名", "评分", "引言", "主创信息"]


def parse_page(html):
    """Parse one Top-250 listing page and return a list of movie dicts.

    Args:
        html: Raw HTML text of a listing page.

    Returns:
        A list of dicts with keys 排名/电影名/评分/引言/主创信息.
        Items with missing nodes are skipped individually, so one malformed
        entry no longer discards the rest of the page.
    """
    tree = etree.HTML(html)
    if tree is None:
        # Empty or non-HTML response: nothing to parse.
        return []

    movies = []
    for item in tree.xpath('//div[@class="item"]'):
        try:
            rank = item.xpath('.//em/text()')[0]
            title = item.xpath('.//span[@class="title"][1]/text()')[0]
            score = item.xpath('.//span[@class="rating_num"]/text()')[0]
            quote = item.xpath('.//span[@class="inq"]/text()')
            # Not every movie has a one-line quote; use a placeholder.
            quote = quote[0] if quote else "无"
            info = item.xpath('.//div[@class="bd"]/p[1]/text()')[0].strip()
        except IndexError:
            # Malformed item block: skip it, keep the rest of the page.
            continue
        movies.append({
            "排名": rank,
            "电影名": title,
            "评分": score,
            "引言": quote,
            "主创信息": info,
        })
    return movies


def main():
    """Fetch all 10 pages, collect the movies, and write them to CSV."""
    movie_list = []
    for page in range(10):
        start = page * 25
        url = f'https://movie.douban.com/top250?start={start}&filter='
        print(f'正在爬取第 {page+1} 页...')
        # Random delay to avoid hammering the server / triggering rate limits.
        time.sleep(random.uniform(0.5, 1.5))
        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            # Fail fast on 403/5xx instead of silently parsing an error page.
            response.raise_for_status()
            movie_list.extend(parse_page(response.text))
        except Exception as e:
            # Best-effort: report the failed page and continue with the rest.
            print(f"第 {page+1} 页爬取失败:{e}")

    # utf-8-sig writes a BOM so Excel opens the Chinese text correctly.
    with open("douban_top250_xpath.csv", "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(movie_list)

    print(f"? 爬取完成!共 {len(movie_list)} 条数据,已保存到 douban_top250_xpath.csv")


if __name__ == "__main__":
    main()