Files
task-2-3-File-Operations/wenben.txt
2026-03-31 11:33:06 +08:00

44 lines
1.3 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
import re
import time
# Browser-like User-Agent header sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

PAGE_COUNT = 10        # number of list pages requested (10 pages x 25 = 250 films)
PAGE_SIZE = 25         # step of the `start` query parameter between pages
REQUEST_TIMEOUT = 10   # seconds; without a timeout a stalled connection blocks forever
REQUEST_DELAY = 1      # seconds to pause between pages to avoid being blocked
OUTPUT_PATH = 'douban_top250.txt'

# Compiled once instead of on every page. The character class excludes '<' and
# '&', so a title span whose text contains an HTML entity never matches.
_TITLE_PATTERN = re.compile(r'<span class="title">([^<&]+)</span>')


def page_url(page):
    """Return the list-page URL for the zero-based page index *page*."""
    start = page * PAGE_SIZE
    return f'https://movie.douban.com/top250?start={start}&filter='


def fetch_page(page):
    """Download one list page and return its HTML text.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    response = requests.get(page_url(page), headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


def extract_chinese_titles(html):
    """Return the title-span texts found in *html*, in document order.

    Titles that start with '/' are dropped; the original script uses this to
    keep only the Chinese name of each film.
    """
    return [t for t in _TITLE_PATTERN.findall(html) if not t.startswith('/')]


def crawl_titles():
    """Crawl the list pages in order and return all collected titles.

    Stops at the first page whose request fails, returning what was gathered
    up to that point.
    """
    titles = []
    for page in range(PAGE_COUNT):
        print(f'正在爬取第 {page+1} 页...')
        try:
            html = fetch_page(page)
        except requests.RequestException as e:
            # Only network/HTTP failures are expected here; anything else is a
            # programming error and should surface instead of being hidden.
            print(f'第 {page+1} 页爬取失败: {e}')
            break
        titles.extend(extract_chinese_titles(html))
        # No pause is needed after the final page.
        if page < PAGE_COUNT - 1:
            time.sleep(REQUEST_DELAY)
    return titles


def save_titles(titles, path=OUTPUT_PATH):
    """Write *titles* to *path* as UTF-8, one numbered ``N. title`` line each."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f'{i}. {title}\n' for i, title in enumerate(titles, 1))


def main():
    """Crawl the titles, save them to the output file, then echo the file."""
    all_chinese_titles = crawl_titles()
    save_titles(all_chinese_titles, OUTPUT_PATH)
    print(f'已成功保存全部 {len(all_chinese_titles)} 部电影到 {OUTPUT_PATH}')
    # Read the file back and print it to verify what was written.
    with open(OUTPUT_PATH, 'r', encoding='utf-8') as f:
        print(f.read())


if __name__ == '__main__':
    main()