Files
task-2-1-data-collection/06 林伟泰.ini
2026-03-31 11:30:12 +08:00

45 lines
1.3 KiB
INI
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Condensed scraper: fetch the Douban Top 250 page and write the first ten
# matched titles to movies.txt, one numbered title per line.
# Imported here because this part of the script runs before the file's main
# import section.
import re

import requests

# Send a browser-like User-Agent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
url = 'https://movie.douban.com/top250'
# A timeout keeps the script from hanging indefinitely on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
html = response.text
# The character class rejects any span whose text contains '<' or '&', so
# spans carrying HTML entities are skipped (presumably the alternate-title
# spans -- confirm against the live page).
pattern = r'<span class="title">([^<&]+)</span>'
titles = re.findall(pattern, html)
# The pattern above already does the filtering, so slice its result directly.
top10 = titles[:10]
with open('movies.txt', 'w', encoding='utf-8') as f:
    # Numbering starts at 1 to match a ranking list.
    for i, title in enumerate(top10, 1):
        f.write(f'{i}. {title}\n')
# 准备:导入必要的库
import requests
import re
import csv
import json
import time
# Bound under a distinct name because this script uses `html` as the variable
# holding the page text.
from html import unescape as html_unescape

# Set the request headers to mimic a browser visit.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
url = 'https://movie.douban.com/top250'
print('开始爬取豆瓣电影Top250...')
# A timeout keeps the script from hanging indefinitely on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
print(f'状态码: {response.status_code}')
print(f'内容长度: {len(response.text)} 字符')
# Find every movie title on the page.
html = response.text
# Matches <span class="title">movie name</span>
title_pattern = r'<span class="title">([^<]+)</span>'
titles = re.findall(title_pattern, html)
# Filter out the alternate (English) names, which start with '/'.
# Entities are decoded and surrounding whitespace stripped before the check so
# that a span such as '&nbsp;/&nbsp;Name' is recognised as well as a bare '/'.
chinese_titles = [
    t for t in titles
    if not html_unescape(t).strip().startswith('/')
]
print('电影名称前10部')
# Print the first ten remaining titles, numbered from 1.
for i, title in enumerate(chinese_titles[:10], 1):
    print(f'{i}. {title}')