上传文件至 20260331
This commit is contained in:
64
20260331/text3.py
Normal file
64
20260331/text3.py
Normal file
@@ -0,0 +1,64 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
import time
|
||||
|
||||
# Request headers: present the script as a desktop browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Accumulates one dict per successfully parsed movie, in page order.
movies_data = []

# The list is fetched as 10 pages of 25 entries each (250 in total).
PAGE_COUNT = 10
PAGE_SIZE = 25


def _parse_movie_item(item):
    """Build one movie record from a ``div.item`` element.

    Args:
        item: A BeautifulSoup tag for a single list entry.

    Returns:
        A dict with the keys 排名, 片名, 导演, 上映年份, 地区, 类型 and 评分,
        all holding strings.

    Raises:
        AttributeError: An expected tag is missing (``find`` returned None).
        IndexError: The info paragraph has fewer than two lines.
        ValueError: The year/region/genre line has fewer than three fields.
    """
    rank = item.find("em").text
    title = item.find("span", class_="title").text
    score = item.find("span", class_="rating_num").text

    # The info paragraph is read as two lines: a credits line followed by a
    # "year / region / genre" line.
    info_lines = item.find("div", class_="bd").p.text.strip().split("\n")

    # partition() never raises; a missing "导演:" label yields an empty
    # director instead of discarding the whole record.
    _, label, credits = info_lines[0].strip().partition("导演:")
    director = credits.split("主演:")[0].strip() if label else ""

    fields = [field.strip() for field in info_lines[1].strip().split("/")]
    if len(fields) < 3:
        raise ValueError(f"unexpected year/region/genre line: {info_lines[1]!r}")

    # Region and genre are taken from the END of the line. An entry may carry
    # extra leading fields (e.g. several release years — TODO confirm against
    # live pages); fixed indices 1 and 2 would then store a year as the
    # region/genre. The first field is kept as the release year.
    year = fields[0]
    area = fields[-2]
    genre = fields[-1]

    return {
        "排名": rank,
        "片名": title,
        "导演": director,
        "上映年份": year,
        "地区": area,
        "类型": genre,
        "评分": score,
    }


for page in range(PAGE_COUNT):
    start = page * PAGE_SIZE
    url = f"https://movie.douban.com/top250?start={start}&filter="

    try:
        # Fetch the page; raise_for_status() turns HTTP error codes into
        # exceptions so they are reported as a failed page below.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Every movie on the current page is a <div class="item">.
        for item in soup.find_all("div", class_="item"):
            try:
                movies_data.append(_parse_movie_item(item))
            except (AttributeError, IndexError, ValueError) as e:
                # Skip only the malformed entry; the rest of the page is kept.
                print(f"第 {page+1} 页有一条记录解析失败,已跳过:{e}")

        print(f"第 {page+1} 页爬取完成")
        time.sleep(1)  # Pause 1 second between pages to avoid anti-scraping blocks.
    except Exception as e:
        # Best-effort boundary: report the failed page and move on to the next.
        print(f"第 {page+1} 页爬取失败:{e}")

# Save everything collected as a JSON file.
with open("豆瓣电影Top250.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps Chinese text readable; indent=2 pretty-prints.
    json.dump(movies_data, f, ensure_ascii=False, indent=2)

print("✅ 数据已保存为「豆瓣电影Top250.json」")
|
||||
Reference in New Issue
Block a user