Files
final-practice/douban.py
2026-06-11 16:21:08 +08:00

68 lines
1.7 KiB
Python

import requests
from bs4 import BeautifulSoup as bs
import json
# Request headers sent with every page fetch.
headers = {
    # Python concatenates adjacent string literals verbatim, so each fragment
    # carries its own trailing space to keep the product tokens separated.
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/129.0.0.0 Safari/537.36"
    )
}
# First page of the Douban Top 250 movie list.
url = "https://movie.douban.com/top250?start=0&filter="


def extract_actors(details_text):
    """Return the cast that follows the "主演:" marker in *details_text*.

    Args:
        details_text: Plain text of a movie entry's ``div.bd`` element.

    Returns:
        The remainder of the line after the first "主演:" marker, stripped
        of surrounding whitespace, or ``""`` when the marker is absent.
    """
    parts = details_text.split("主演:")
    if len(parts) < 2:
        # Entry has no cast marker at all.
        return ""
    # The cast ends at the end of the marker's line.
    return parts[1].split("\n", 1)[0].strip()


def parse_item(item):
    """Build the record for one ``div.item`` movie entry.

    Args:
        item: A BeautifulSoup tag for a single movie entry.

    Returns:
        A dict with the keys ``"title"``, ``"actors"`` and ``"quote"``.
        A field whose element is missing from the entry is set to ``""``.
    """
    # Movie title
    title_tag = item.find("span", class_="title")
    title = title_tag.get_text().strip() if title_tag is not None else ""

    # Cast information and classic quote both live inside div.bd
    body = item.find("div", class_="bd")
    if body is None:
        return {"title": title, "actors": "", "quote": ""}

    actors = extract_actors(body.get_text().strip())

    # Not every entry carries a quote paragraph.
    quote_tag = body.find("p", class_="quote")
    quote = quote_tag.get_text().strip() if quote_tag is not None else ""

    return {
        "title": title,
        "actors": actors,
        "quote": quote
    }


def fetch_items(page_url):
    """Download *page_url* and return its ``div.item`` movie entries.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status, so that a blocked request is not silently
            turned into an empty result file.
    """
    resp = requests.get(page_url, headers=headers, timeout=10)
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    soup = bs(resp.text, "html.parser")
    return soup.find_all("div", class_="item")


def scrape(page_url=url):
    """Return one record per movie on *page_url*, in page order."""
    return [parse_item(item) for item in fetch_items(page_url)]


if __name__ == "__main__":
    # Each movie is collected exactly once.
    data = scrape(url)
    print(data)
    with open("movies.json", "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False)