Files
final-practice/2.py
2026-06-09 11:24:34 +08:00

43 lines
1.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
from bs4 import BeautifulSoup
import json
import time
# Sent with every request so the site sees a desktop-browser User-Agent.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

TOP250_URL = "https://movie.douban.com/top250"
PAGE_SIZE = 25  # the Top250 listing shows 25 movies per page
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled connection hangs forever
ACTORS_MARKER = "主演:"
UNKNOWN = "未知"
NO_QUOTE = "无短评"
OUTPUT_PATH = "movies.json"


def extract_actors(info):
    """Return the cast text found on the first line of an info paragraph.

    Args:
        info: Text of a movie's info paragraph. Only its first line is
            inspected; later lines are ignored.

    Returns:
        The stripped text that follows the last "主演:" marker on the first
        line, or "未知" when the first line has no such marker.
    """
    first_line = info.split("\n")[0]
    # Test the same line we extract from; a marker that only appears on a
    # later line must not cause the director text to be returned as the cast.
    if ACTORS_MARKER not in first_line:
        return UNKNOWN
    return first_line.rpartition(ACTORS_MARKER)[2].strip()


def text_or_default(tag, default):
    """Return the stripped ``.text`` of *tag*, or *default* if *tag* is None."""
    return tag.text.strip() if tag is not None else default


def parse_item(item, rank):
    """Build one movie record from a ``div.item`` element.

    Args:
        item: The parsed ``div.item`` element of a single movie.
        rank: 1-based position of the movie in the overall listing.

    Returns:
        A dict with the keys "rank", "title", "actors" and "quote". Missing
        elements yield placeholder strings instead of raising.
    """
    title = text_or_default(item.find("span", class_="title"), UNKNOWN)

    # The info paragraph lives inside div.bd; either lookup may come back
    # empty, in which case the cast is reported as unknown.
    body = item.find("div", class_="bd")
    info_tag = body.find("p", class_="") if body is not None else None
    actors = extract_actors(text_or_default(info_tag, ""))

    quote = text_or_default(item.find("span", class_="inq"), NO_QUOTE)
    return {
        "rank": rank,
        "title": title,
        "actors": actors,
        "quote": quote
    }


def fetch_page(start, timeout=REQUEST_TIMEOUT):
    """Download and parse one listing page.

    Args:
        start: Value of the ``start`` query parameter (offset of the first
            movie on the page).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page parsed with BeautifulSoup's "html.parser".

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    url = f"{TOP250_URL}?start={start}"
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error page rather than silently parsing zero items.
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def scrape_top_movies(pages=2, delay=1):
    """Collect movie records from the first *pages* listing pages.

    Args:
        pages: Number of pages to fetch; the default of 2 covers the first
            50 movies (start=0 and start=25).
        delay: Seconds to pause between consecutive page requests.

    Returns:
        A list of movie dicts in listing order, ranked from 1.
    """
    movies = []
    for page in range(pages):
        if page:
            # Polite pause between requests; skipped before the first page so
            # no time is wasted sleeping after the final one.
            time.sleep(delay)
        offset = page * PAGE_SIZE
        soup = fetch_page(offset)
        items = soup.find_all("div", class_="item")
        movies.extend(
            parse_item(item, offset + idx + 1)
            for idx, item in enumerate(items)
        )
    return movies


def main():
    """Scrape the first 50 movies and save them to movies.json."""
    movies = scrape_top_movies()
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        json.dump(movies, f, ensure_ascii=False, indent=2)
    print("爬取完成,数据已保存到 movies.json")


if __name__ == "__main__":
    main()