Files
final-practice/数据采集.py
2026-06-09 11:31:02 +08:00

65 lines
2.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
from bs4 import BeautifulSoup
import json
import time
# Scrape rank, title, lead actors and one-line quote for the first 50 entries
# of the Douban Top 250 movie list and save them to movies.json.

# HTTP headers sent with every request: a desktop-browser User-Agent plus a
# Referer pointing at the Top 250 listing.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
    "Referer": "https://movie.douban.com/top250",
}

# Seconds to wait for the server before giving up. Without a timeout,
# requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# Accumulates one dict per movie, in rank order.
movies_data = []


def _tag_text(tag):
    """Return the stripped text of *tag*, or "" when the tag was not found."""
    return tag.text.strip() if tag else ""


def _extract_actors(item):
    """Return the lead-actor text of one movie entry, or "" when unavailable.

    Looks at the first <p> inside the entry's <div class="bd">, takes whatever
    follows the last "主演:" marker, and keeps only the first line of it.
    """
    bd_div = item.find("div", class_="bd")
    if bd_div is None:
        return ""
    p_tag = bd_div.find("p")
    if p_tag is None:
        return ""
    info_text = p_tag.text.strip()
    if "主演:" not in info_text:
        return ""
    actors_part = info_text.split("主演:")[-1].strip()
    # Anything after the first newline is a different line of the paragraph,
    # so only the first line is kept.
    return actors_part.split("\n")[0].strip()


# The first 50 entries span two pages: start=0 and start=25.
for start in range(0, 50, 25):
    url = f"https://movie.douban.com/top250?start={start}"
    print(f"正在爬取第 {start//25 + 1} 页...")

    # Fetch the page; raise_for_status() turns HTTP error codes into exceptions.
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Every movie on the page lives in its own <div class="item">.
    movie_items = soup.find_all("div", class_="item")
    for rank, item in enumerate(movie_items, start=start + 1):
        # 1. Title: the first <span class="title"> of the entry.
        title = _tag_text(item.find("span", class_="title"))
        # 2. Lead actors.
        actors = _extract_actors(item)
        # 3. One-line quote, taken from <span class="inq">; may be missing.
        quote = _tag_text(item.find("span", class_="inq"))

        movies_data.append(
            {
                "rank": rank,
                "title": title,
                "actors": actors,
                "quote": quote,
            }
        )
        print(f"已获取:{rank}. {title} | 短评:{quote}")

    # Pause 2 seconds after each page so requests are not sent too quickly.
    time.sleep(2)

# Save everything to movies.json (UTF-8, non-ASCII characters kept readable).
with open("movies.json", "w", encoding="utf-8") as f:
    json.dump(movies_data, f, ensure_ascii=False, indent=4)
print("\n✅ 爬取完成!数据已保存到 movies.json")