删除 https://gitea.detr.top/2509165036/simulated-examination/src/branch/main/q2_1_crawler/q2_1.py
This commit is contained in:
@@ -1,50 +0,0 @@
|
|||||||
import requests
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
import json
|
|
||||||
|
|
||||||
# 1. Request configuration. Per the original author's note, the assignment
#    requires the request to carry the header that the server checks for.
url = "https://exam.detr.top/exam-b/movies"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/125.0.0.0 Safari/537.36"
}


def parse_rating(text):
    """Convert scraped rating text to a float.

    Returns 0.0 when the text is empty, None, or not a number. This is the
    same default the script uses when the rating element is missing, so one
    malformed entry no longer aborts the whole crawl.
    """
    try:
        return float(text)
    except (TypeError, ValueError):
        return 0.0


def field_text(item, selector):
    """Return the stripped text of the first node under *item* matching
    the CSS *selector*, or "" when nothing matches.

    *item* is any object exposing ``select_one`` (here: a BeautifulSoup tag).
    """
    node = item.select_one(selector)
    if node is None:
        return ""
    return node.get_text(strip=True)


def extract_movie(item):
    """Build the dict for one ``.movie-item`` element.

    All fields are strings except ``rating``, which is a float.
    """
    return {
        "id": item.get("data-id", ""),
        "title": field_text(item, ".title"),
        "director": field_text(item, ".director"),
        "year": field_text(item, ".year"),
        # An absent rating node yields "", which parse_rating maps to 0.0.
        "rating": parse_rating(field_text(item, ".rating")),
        "duration": field_text(item, ".duration"),
        "genre": field_text(item, ".genre"),
    }


def main():
    """Fetch the movie page once, save the raw HTML, and write the parsed JSON.

    Writes ``movies.html`` and ``movies.json`` to the current directory.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    # A single request retrieves the whole page (the assignment awards
    # points for getting all data in one fetch, per the original note).
    resp = requests.get(url, headers=headers, timeout=10)
    # Fail loudly instead of saving and parsing an error page.
    resp.raise_for_status()
    resp.encoding = "utf-8"
    html_text = resp.text

    # Save the raw page source as movies.html.
    with open("movies.html", "w", encoding="utf-8") as f:
        f.write(html_text)

    # Parse the page.
    soup = BeautifulSoup(html_text, "html.parser")

    # Read the value of the #actors_count span; kept as text, "0" if absent.
    count_tag = soup.find("span", id="actors_count")
    actors_count = count_tag.get_text(strip=True) if count_tag else "0"

    # Extract every movie entry (the page is expected to list 10).
    movie_list = [extract_movie(item) for item in soup.select(".movie-item")]

    # Assemble the JSON payload.
    result_data = {
        "actors_count": actors_count,
        "movies": movie_list,
    }

    # Write movies.json.
    with open("movies.json", "w", encoding="utf-8") as f:
        json.dump(result_data, f, ensure_ascii=False, indent=2)

    print(f"爬取完成,共抓取{len(movie_list)}部电影,已生成 movies.html、movies.json")


if __name__ == "__main__":
    main()
|
|
||||||
Reference in New Issue
Block a user