Files
task-2-1-data-collection/260324_2509165031.py
2026-03-24 11:26:32 +08:00

39 lines
1.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
from bs4 import BeautifulSoup
# Listing endpoint and paging layout: the script walks 10 pages of 25 entries.
BASE_URL = 'https://movie.douban.com/top250'
PAGE_SIZE = 25
PAGE_COUNT = 10

# Headers sent with every page request: a browser-like User-Agent and a
# Referer pointing at the site itself.
# NOTE(review): presumably needed so the site does not reject the client as a
# bot -- not verifiable from this file.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Referer': 'https://movie.douban.com/',
}

# Collected movie titles, in the order they were first seen.
all_movies = []


def page_url(page: int) -> str:
    """Return the listing URL for the zero-based page index *page*."""
    return f'{BASE_URL}?start={page * PAGE_SIZE}&filter='


def extract_titles(html: str) -> list:
    """Return the first title of every movie entry found in *html*.

    Each entry is a ``<div class="item">``; its first ``<span class="title">``
    is taken as the movie title. Entries without such a span are skipped.
    """
    soup = BeautifulSoup(html, 'html.parser')
    titles = []
    for item in soup.find_all('div', class_='item'):
        title_tag = item.find('span', class_='title')
        if title_tag:
            titles.append(title_tag.get_text(strip=True))
    return titles


def add_new_titles(titles, movies: list, seen: set) -> int:
    """Append titles not yet in *seen* to *movies*, preserving order.

    *seen* is updated in place and gives O(1) duplicate checks instead of
    scanning *movies* for every title. Returns the number of titles added.
    """
    added = 0
    for title in titles:
        if title not in seen:
            seen.add(title)
            movies.append(title)
            added += 1
    return added


def crawl_top250(movies: list) -> list:
    """Fetch every listing page and collect unique titles into *movies*.

    A page that cannot be fetched (network error or non-2xx status) or that
    yields no titles is reported and skipped so the remaining pages are still
    processed. Returns the 1-based numbers of the pages that failed.
    """
    seen = set(movies)
    failed_pages = []
    for page in range(PAGE_COUNT):
        page_no = page + 1
        # Keep the try body limited to the network call so that programming
        # errors elsewhere are not swallowed.
        try:
            response = requests.get(page_url(page), headers=headers, timeout=15)
            # Without this, an error/blocked response would be parsed as an
            # empty page and silently contribute nothing.
            response.raise_for_status()
        except requests.RequestException as e:
            print(f'第 {page_no} 页爬取异常:', e)
            failed_pages.append(page_no)
            continue

        response.encoding = 'utf-8'
        print(f'正在爬取第 {page_no}/{PAGE_COUNT} 页...')
        titles = extract_titles(response.text)
        if not titles:
            # A 2xx response without any entries is still a failed page.
            print(f'第 {page_no} 页未解析到任何电影条目')
            failed_pages.append(page_no)
            continue
        add_new_titles(titles, movies, seen)
    return failed_pages


def main() -> None:
    """Crawl the listing, then print a summary and the numbered titles."""
    failed_pages = crawl_top250(all_movies)

    # Only claim complete data when every page was actually collected.
    if failed_pages:
        print(f'\n⚠️ 仅爬取到 {len(all_movies)} 部电影,失败页码:{failed_pages}')
    else:
        print(f'\n✅ 成功爬取 {len(all_movies)} 部电影(TOP250 完整数据)')
    print('=' * 30)
    for idx, movie in enumerate(all_movies, 1):
        print(f'{idx}. {movie}')


if __name__ == '__main__':
    main()