"""Scrape the Douban Top 250 movie list and save the Chinese titles to a text file."""
import requests
import re
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

# Compiled once and hoisted out of the page loop.
# NOTE(review): the previous pattern r'([^<&]+)' had no HTML anchor, so it
# matched arbitrary runs of page text instead of titles. Restored the
# <span class="title"> anchor; [^<&]+ also rejects the "&nbsp;/&nbsp;English"
# alternate-title spans, making the '/' filter below a final safeguard —
# confirm against the live page markup.
pattern = re.compile(r'<span class="title">([^<&]+)</span>')

all_chinese_titles = []

# Crawl 10 pages (25 movies per page, 250 movies total).
for page in range(10):
    start = page * 25
    url = f'https://movie.douban.com/top250?start={start}&filter='
    print(f'正在爬取第 {page+1} 页...')
    try:
        # timeout added so a stalled connection cannot hang the script forever
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # raise on HTTP error status codes
        html = response.text
        # Extract all movie titles via the compiled regex.
        titles = pattern.findall(html)
        # Keep only Chinese titles (English alternates begin with '/').
        chinese_titles = [t for t in titles if not t.startswith('/')]
        all_chinese_titles.extend(chinese_titles)
        time.sleep(1)  # polite 1-second delay to avoid being rate-limited/banned
    except requests.RequestException as e:
        # Narrowed from bare Exception: only network/HTTP failures are expected here.
        print(f'第 {page+1} 页爬取失败: {e}')
        break

# Persist the collected titles, one numbered entry per line.
with open('douban_top250.txt', 'w', encoding='utf-8') as f:
    for i, title in enumerate(all_chinese_titles, 1):
        f.write(f'{i}. {title}\n')

print(f'已成功保存全部 {len(all_chinese_titles)} 部电影到 douban_top250.txt')

# Verify the output by reading the file back.
with open('douban_top250.txt', 'r', encoding='utf-8') as f:
    print(f.read())