上传文件至 /

This commit is contained in:
2026-06-23 11:12:36 +08:00
parent 16397e49cd
commit 3e821217d2
3 changed files with 70 additions and 0 deletions

44
crawl.py Normal file
View File

@@ -0,0 +1,44 @@
import requests
from bs4 import BeautifulSoup
import json
# Request headers: present a desktop-browser user agent to the server.
head = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/129.0.0.0 Safari/537.36"
}
web_url = "https://exam.detr.top/exam-b/movies"

# Seconds to wait on the server; without a timeout requests may block forever.
REQUEST_TIMEOUT = 10
HTML_OUTPUT = "movies.html"
JSON_OUTPUT = "movies.json"
# The info line must supply: director, year, duration, genre, actor count.
INFO_FIELD_COUNT = 5


def fetch_page(url, headers, timeout=REQUEST_TIMEOUT):
    """Download *url* and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to download.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait before the request is abandoned.

    Returns:
        The response body as a string.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx status (so an error page is never saved or parsed).
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    res.raise_for_status()
    res.encoding = "utf-8"
    return res.text


def parse_info_line(info_text):
    """Split a "/"-separated info line into its named fields.

    The first five fields are read, in order, as director, year, duration,
    genre and actor count; any additional fields are ignored. Each field is
    stripped of surrounding whitespace. Only the actor count is converted
    (to ``int``); the other fields stay strings.

    Args:
        info_text: The raw info line, e.g. ``"A / 1994 / 142 / Drama / 5"``.

    Returns:
        A dict with keys ``director``, ``year``, ``duration``, ``genre``
        and ``actors_count``.

    Raises:
        ValueError: If fewer than five fields are present, or the actor
            count is not an integer.
    """
    fields = [part.strip() for part in info_text.split("/")]
    if len(fields) < INFO_FIELD_COUNT:
        raise ValueError(
            f"expected at least {INFO_FIELD_COUNT} '/'-separated fields, "
            f"got {len(fields)}: {info_text!r}"
        )
    director, year, duration, genre, actors = fields[:INFO_FIELD_COUNT]
    return {
        "director": director,
        "year": year,
        "duration": duration,
        "genre": genre,
        "actors_count": int(actors),
    }


def parse_movie(movie):
    """Build one movie record from a ``<div class="item">`` element.

    Args:
        movie: A parsed element exposing ``get`` and ``find`` (a
            BeautifulSoup tag in this script).

    Returns:
        A dict with keys ``id``, ``title``, ``director``, ``year``,
        ``rating``, ``duration``, ``genre`` and ``actors_count``, in that
        order.

    Raises:
        ValueError: If the title, info block or rating element is missing,
            or their text cannot be parsed.
    """
    movie_id = movie.get("data-id")
    title_tag = movie.find("span", class_="title")
    info_block = movie.find("div", class_="bd")
    score_tag = movie.find("p", class_="quote")
    if title_tag is None or info_block is None or score_tag is None:
        raise ValueError(
            f"movie item {movie_id!r} is missing its title, info or rating element"
        )

    # Only the first line of the info block's text carries the "/" fields.
    info_text = info_block.get_text().strip().split("\n")[0]
    info = parse_info_line(info_text)
    return {
        "id": movie_id,
        "title": title_tag.get_text().strip(),
        "director": info["director"],
        "year": info["year"],
        "rating": float(score_tag.get_text().strip()),
        "duration": info["duration"],
        "genre": info["genre"],
        "actors_count": info["actors_count"],
    }


def main():
    """Fetch the movie page, save the raw HTML, and export parsed records as JSON."""
    source_html = fetch_page(web_url, head)

    # Save the raw page first so it is available even if parsing fails below.
    with open(HTML_OUTPUT, "w", encoding="utf-8") as html_file:
        html_file.write(source_html)

    html_parse = BeautifulSoup(source_html, "html.parser")
    movie_info = [
        parse_movie(movie)
        for movie in html_parse.find_all("div", class_="item")
    ]

    with open(JSON_OUTPUT, "w", encoding="utf-8") as json_file:
        json.dump(movie_info, json_file, ensure_ascii=False, indent=2)

    print("爬虫抓取完成已生成movies.html、movies.json两个文件")


if __name__ == "__main__":
    main()