"""Scrape the Chinese titles of the Douban Top 250 movies into a text file.

The list is spread over 10 pages of 25 movies each. Every page is fetched,
the movie titles are pulled out of the HTML with a regular expression, and
the numbered result is written to ``douban_top250.txt`` and echoed back.
"""

import re
import time

BASE_URL = 'https://movie.douban.com/top250'
PAGE_SIZE = 25          # movies listed per page
PAGE_COUNT = 10         # 10 pages * 25 movies = 250 movies
OUTPUT_PATH = 'douban_top250.txt'
REQUEST_TIMEOUT = 10    # seconds; without it a stalled connection hangs forever
REQUEST_DELAY = 1       # seconds between pages, to avoid being rate-limited

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

# Titles are expected inside <span class="title">...</span>. The capture group
# rejects '<' and '&', so alternate-language titles written as
# "&nbsp;/&nbsp;English Name" never match. Without the surrounding tag anchor
# the group would match every text fragment of the page, not just titles.
# NOTE(review): the span markup is assumed from the site's layout - confirm
# against a live page if the output looks wrong.
TITLE_PATTERN = re.compile(r'<span class="title">([^<&]+)</span>')


def page_url(page):
    """Return the listing URL for the zero-based page index *page*."""
    return f'{BASE_URL}?start={page * PAGE_SIZE}&filter='


def extract_chinese_titles(html):
    """Return the primary (Chinese) movie titles found in *html*.

    Titles are stripped of surrounding whitespace. Empty matches and any
    alternate-title fragment that begins with ``/`` are discarded.
    """
    stripped = (raw.strip() for raw in TITLE_PATTERN.findall(html))
    return [title for title in stripped if title and not title.startswith('/')]


def scrape_all_titles():
    """Fetch every listing page and collect the titles in ranking order.

    Returns:
        A ``(titles, complete)`` tuple. ``complete`` is ``False`` when a page
        request failed; ``titles`` then holds only what was gathered before
        the failure.
    """
    # Imported here so the pure parsing helpers above stay importable even
    # when the third-party ``requests`` package is not installed.
    import requests

    collected = []
    for page in range(PAGE_COUNT):
        print(f'正在爬取第 {page+1} 页...')
        try:
            response = requests.get(
                page_url(page), headers=HEADERS, timeout=REQUEST_TIMEOUT
            )
            response.raise_for_status()  # turn HTTP 4xx/5xx into an exception
        except requests.RequestException as e:
            print(f'第 {page+1} 页爬取失败: {e}')
            return collected, False

        collected.extend(extract_chinese_titles(response.text))

        # No need to wait after the final page.
        if page < PAGE_COUNT - 1:
            time.sleep(REQUEST_DELAY)
    return collected, True


def save_titles(titles, path=OUTPUT_PATH):
    """Write *titles* to *path* as a 1-based numbered list, one per line."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f'{i}. {title}\n' for i, title in enumerate(titles, 1))


def main():
    """Scrape the list, save it, report the outcome and echo the file."""
    titles, complete = scrape_all_titles()
    save_titles(titles)

    if complete:
        print(f'已成功保存全部 {len(titles)} 部电影到 {OUTPUT_PATH}')
    else:
        # Do not claim success when the crawl stopped early.
        print(f'爬取未完成,仅保存了 {len(titles)} 部电影到 {OUTPUT_PATH}')

    # Read the file back so the saved result can be checked by eye.
    with open(OUTPUT_PATH, 'r', encoding='utf-8') as f:
        print(f.read())


if __name__ == '__main__':
    main()