Files
simulated-examination/q2_1.py .py
2026-06-23 11:30:08 +08:00

74 lines
2.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
import json
from bs4 import BeautifulSoup
# The assignment mandates sending this browser-like User-Agent header.
url = "https://exam.detr.top/exam-b/movies"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
# Exactly one request is issued, as the assignment requires.
# - timeout: requests waits indefinitely by default, so without it an
#   unresponsive server would hang the script forever.
# - raise_for_status(): fail loudly on 4xx/5xx instead of saving an error page
#   as movies.html and later reporting zero movies.
resp = requests.get(url, headers=headers, timeout=10)
resp.raise_for_status()
# Force UTF-8 decoding rather than relying on requests' charset guess.
resp.encoding = "utf-8"
html_text = resp.text
# Save the raw response body as movies.html.
with open("movies.html", "w", encoding="utf-8") as f:
    f.write(html_text)
print("✅ 已保存网页源码 movies.html")
# Tolerant numeric converters for table cells. They are defined once here
# instead of being re-created for every table row.
def safe_int(txt):
    """Return ``txt`` parsed as an int, or 0 when it cannot be parsed."""
    try:
        return int(txt.strip())
    except (AttributeError, ValueError):
        # Only conversion failures are absorbed; KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare except.
        return 0


def safe_float(txt):
    """Return ``txt`` parsed as a float, or 0.0 when it cannot be parsed."""
    try:
        return float(txt.strip())
    except (AttributeError, ValueError):
        return 0.0


# First try to read the response as a JSON payload; fall back to parsing an
# HTML table with BeautifulSoup when it is not a JSON object.
movie_list = []
data_id = None
try:
    # Keep the try body minimal: only the decode step can raise here.
    api_data = json.loads(html_text)
except json.JSONDecodeError:
    api_data = None

if isinstance(api_data, dict):
    data_id = api_data.get("data_id")
    # `or []` keeps movie_list a list even when the key is present but null.
    movie_list = api_data.get("movies") or []
    print("✅ 识别为JSON接口直接读取数据")
else:
    # Either the body is not JSON at all, or it is valid JSON that is not an
    # object (e.g. a bare list/number), which has no "data_id"/"movies" keys.
    print("识别为HTML表格页面使用BeautifulSoup解析")
    soup = BeautifulSoup(html_text, "html.parser")
    # The page id is read from the data-id attribute of <body>, when present.
    if soup.body and "data-id" in soup.body.attrs:
        data_id = soup.body["data-id"]
    # The first <tr> is skipped (presumably the header row); rows with fewer
    # than 8 <td> cells are ignored.
    for tr in soup.find_all("tr")[1:]:
        td_list = tr.find_all("td")
        if len(td_list) < 8:
            continue
        movie_list.append(
            {
                "id": safe_int(td_list[0].get_text()),
                "title": td_list[1].get_text(strip=True),
                "director": td_list[2].get_text(strip=True),
                "year": safe_int(td_list[3].get_text()),
                "rating": safe_float(td_list[4].get_text()),
                "duration": safe_int(td_list[5].get_text()),
                "genre": td_list[6].get_text(strip=True),
                "actors_count": safe_int(td_list[7].get_text()),
            }
        )

print(f"页面data_id: {data_id}")
print(f"一共抓取到 {len(movie_list)} 部电影")
# Bundle the page identifier with the collected movie records and persist them
# to movies.json as indented UTF-8 JSON (non-ASCII text is written verbatim,
# not \u-escaped).
save_data = dict(data_id=data_id, movies=movie_list)
with open("movies.json", mode="w", encoding="utf-8") as out_file:
    serialized = json.dumps(save_data, ensure_ascii=False, indent=2)
    out_file.write(serialized)
print("✅ movies.json 写入完成")