44 lines
1.3 KiB
Plaintext
44 lines
1.3 KiB
Plaintext
import requests
|
||
import re
|
||
import time
|
||
|
||
# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

# The list is crawled as 10 pages of 25 movies each (250 in total).
PAGE_COUNT = 10
PAGE_SIZE = 25

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# Pause between consecutive page requests to avoid hammering the site.
REQUEST_DELAY = 1

OUTPUT_PATH = 'douban_top250.txt'

# Captures the text of every <span class="title">...</span>. The character
# class rejects '<' and '&', so spans whose text contains an HTML entity
# (such as "&nbsp;") never match. Compiled once instead of once per page.
TITLE_PATTERN = re.compile(r'<span class="title">([^<&]+)</span>')


def extract_chinese_titles(html):
    """Return the primary movie titles found in one page of HTML.

    Args:
        html: Markup of a single list page.

    Returns:
        The captured title strings in document order, excluding any that
        start with '/' (the marker used for alternate-language titles).
    """
    return [
        title
        for title in TITLE_PATTERN.findall(html)
        if not title.startswith('/')
    ]


def fetch_page(start):
    """Download one list page and return its HTML text.

    Args:
        start: Zero-based offset of the first movie on the page.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    url = f'https://movie.douban.com/top250?start={start}&filter='
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # turn HTTP error statuses into exceptions
    return response.text


def main():
    """Crawl every page, write the numbered titles to disk, and echo them."""
    all_chinese_titles = []

    for page in range(PAGE_COUNT):
        # Sleep only between requests: never before the first page and
        # never after the last one.
        if page:
            time.sleep(REQUEST_DELAY)

        print(f'正在爬取第 {page+1} 页...')
        try:
            html = fetch_page(page * PAGE_SIZE)
        except requests.RequestException as e:
            # Stop at the first failed page; whatever was collected so far
            # is still written out below.
            print(f'第 {page+1} 页爬取失败: {e}')
            break

        all_chinese_titles.extend(extract_chinese_titles(html))

    # Save to a text file, one numbered title per line.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        f.writelines(
            f'{i}. {title}\n'
            for i, title in enumerate(all_chinese_titles, 1)
        )

    print(f'已成功保存全部 {len(all_chinese_titles)} 部电影到 douban_top250.txt')

    # Read the file back and print it to verify the output.
    with open(OUTPUT_PATH, 'r', encoding='utf-8') as f:
        print(f.read())


if __name__ == '__main__':
    main()