Files
final-practice/python.py
2026-06-11 16:25:14 +08:00

49 lines
1.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
from bs4 import BeautifulSoup
import time
import json # 补上json库
# Request headers that imitate a desktop browser so the site is less likely
# to block the scraper.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}
# Collects one dict per scraped movie, in the order the pages list them.
movie_list = []
# Only the first two result pages are fetched: 25 entries each, 50 in total.
for page in range(0, 50, 25):
    url = f"https://movie.douban.com/top250?start={page}"
    # A timeout keeps a stalled connection from hanging the script forever.
    res = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an HTTP error instead of silently parsing an error page
    # and ending up with an empty result file.
    res.raise_for_status()
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, "html.parser")
    for item in soup.find_all("div", class_="item"):
        title_tag = item.find("span", class_="title")
        body_tag = item.find("div", class_="bd")
        info_tag = body_tag.p if body_tag is not None else None
        if title_tag is None or info_tag is None:
            # Entry lacks the expected markup; skip it rather than crash
            # with an AttributeError on None.
            continue
        # Movie title.
        title = title_tag.get_text(strip=True)
        # Director / cast line. The text pieces must be joined with "\n":
        # with the default empty separator the stripped pieces are glued
        # together, no newline survives, and split("\n")[0] would return the
        # whole paragraph instead of just its first line.
        info_text = info_tag.get_text("\n", strip=True)
        actor_info = info_text.split("\n")[0]
        # One-line quote; some movies have none, so fall back to a
        # placeholder.
        quote_tag = item.find("span", class_="inq")
        short_comment = quote_tag.get_text(strip=True) if quote_tag else "无短评"
        data = {
            "电影名": title,
            "主创主演": actor_info,
            "经典短评": short_comment,
        }
        movie_list.append(data)
        print(data)
    # Pause one second per page to keep the request rate low and reduce the
    # risk of an IP ban.
    time.sleep(1)
# Report how many movies were collected.
print(f"\n一共抓取{len(movie_list)}部电影")
# Persist the results as pretty-printed JSON, keeping non-ASCII characters
# readable instead of escaping them.
with open("movies.json", "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(movie_list, ensure_ascii=False, indent=2))
print("数据已保存到 movies.json 文件!")