Files
4-2-image-labeling/爬虫.py
2026-06-23 11:30:01 +08:00

92 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
import json
from bs4 import BeautifulSoup
# Download the movie listing page once and keep a local copy of the raw HTML.
url = "https://exam.detr.top/exam-b/movies"
# The assignment requires sending a browser-like User-Agent request header.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
# Exactly one request is made, as the assignment requires.
# The timeout stops the script from hanging forever on an unresponsive server;
# requests applies no timeout unless one is passed explicitly.
resp = requests.get(url, headers=headers, timeout=10)
# Fail loudly on a 4xx/5xx answer instead of saving and parsing an error page.
resp.raise_for_status()
# Force UTF-8 decoding before reading the body as text.
resp.encoding = "utf-8"
html_text = resp.text
# Save the raw page source as movies.html.
with open("movies.html", "w", encoding="utf-8") as f:
    f.write(html_text)
print("✅ 已保存网页源码 movies.html")
# Parse the downloaded page and turn its table rows into movie records.
soup = BeautifulSoup(html_text, "html.parser")
# Read the page-level data-id attribute from the <body> tag; it stays None
# when the page has no <body> or the attribute is missing.
body = soup.body
data_id = body.get("data-id") if body else None
print(f"页面data_id: {data_id}")
# Collect every <tr>; the first one is the table header and is skipped.
all_tr = soup.find_all("tr")
movie_list = []
for tr in all_tr[1:]:
    td_list = tr.find_all("td")
    # Column order: id, title, director, year, rating, duration, genre,
    # actors_count.  Rows with fewer than 8 cells are not movie rows.
    if len(td_list) < 8:
        continue
    cells = [td.get_text(strip=True) for td in td_list[:8]]
    try:
        movie = {
            "id": int(cells[0]),
            "title": cells[1],
            "director": cells[2],
            "year": int(cells[3]),
            "rating": float(cells[4]),
            "duration": int(cells[5]),
            "genre": cells[6],
            "actors_count": int(cells[7]),
        }
    except ValueError as exc:
        # A single malformed numeric cell must not abort the whole scrape;
        # report the row and carry on with the remaining ones.
        print(f"⚠️ 跳过无法解析的行: {cells} ({exc})")
        continue
    movie_list.append(movie)
print(f"一共抓取到 {len(movie_list)} 部电影")
# Bundle the page id with the scraped movies and persist them as movies.json.
save_data = dict(data_id=data_id, movies=movie_list)
# Serialize first, then write the finished text in a single call.
# ensure_ascii=False keeps non-ASCII characters readable in the file.
json_text = json.dumps(save_data, ensure_ascii=False, indent=2)
with open("movies.json", mode="w", encoding="utf-8") as out_file:
    out_file.write(json_text)
print("✅ movies.json 写入完成")
# ================= Part 2: data analysis (guarded against an empty list) =================
if movie_list:
    # (1) Lowest- and highest-rated movie: order a copy by rating and take
    # both ends, leaving movie_list itself in scrape order.
    by_rating = list(movie_list)
    by_rating.sort(key=lambda movie: movie["rating"])
    lowest_movie, highest_movie = by_rating[0], by_rating[-1]
    print("\n① 评分最高&最低电影:")
    print(f"评分最低:{lowest_movie['title']} {lowest_movie['rating']}")
    print(f"评分最高:{highest_movie['title']} {highest_movie['rating']}")

    # (2) Number of movies per genre, printed as a plain dict.
    genre_count = {}
    for movie in movie_list:
        genre_count.setdefault(movie["genre"], 0)
        genre_count[movie["genre"]] += 1
    print("\n② 各类型电影数量:", genre_count)

    # (3) Number of movies per director, printed as a plain dict.
    director_count = {}
    for movie in movie_list:
        director_count.setdefault(movie["director"], 0)
        director_count[movie["director"]] += 1
    print("\n③ 各导演电影数量:", director_count)

    # (4) How many movies were released in 2020 or later.
    count_2020 = sum(1 for movie in movie_list if movie["year"] >= 2020)
    print(f"\n④ 2020年(含)后上映电影总数:{count_2020}")
else:
    # Nothing was scraped, so none of the statistics can be computed.
    print("❌ 未抓取到任何电影数据,请检查表格解析逻辑!")