From 5a8416c1459569313c4028a3f9cebcb65890f67e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E5=A4=A9=E8=B5=90?= <2509165007@student.example.com>
Date: Tue, 23 Jun 2026 11:18:32 +0800
Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E4=BB=B6=E8=87=B3?=
 =?UTF-8?q?=20=20q2=5F1=5Fcrawler?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review note: the payload below was revised during review; the commit id and
the blob ids on the "index" lines are those of the original submission.

  q2_1_crawler/q2_1.py | 40 +++++++++++++++++++++++
  q2_1_crawler/q2_2.py | 75 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 115 insertions(+)
 create mode 100644  q2_1_crawler/q2_1.py
 create mode 100644  q2_1_crawler/q2_2.py

diff --git a/ q2_1_crawler/q2_1.py b/ q2_1_crawler/q2_1.py
new file mode 100644
index 0000000..df5ad89
--- /dev/null
+++ b/ q2_1_crawler/q2_1.py
@@ -0,0 +1,40 @@
+import requests
+import json
+
+# 1. Request headers (browser-like test headers, as the assignment requires).
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+    "Accept-Language": "zh-CN,zh;q=0.9",
+    "Referer": "https://exam.detr.top/"
+}
+
+url = "https://exam.detr.top/exam-b/movies"
+
+# Fields every movie record must carry to be kept.
+needed_keys = ["id", "title", "director", "year", "rating", "duration", "genre", "actors_count"]
+
+# 2. Fetch everything in a single request (the assignment asks for one fetch).
+# Without a timeout, requests waits indefinitely on an unresponsive server.
+resp = requests.get(url, headers=headers, timeout=10)
+resp.raise_for_status()  # raise requests.HTTPError on a 4xx/5xx response
+
+# Save the raw response body to movies.html.
+with open("movies.html", "w", encoding="utf-8") as f:
+    f.write(resp.text)
+
+# Parse the JSON payload returned by the endpoint.
+movie_data = resp.json()
+
+# Keep only records that contain every required field (10 movies expected).
+valid_movies = []
+for item in movie_data:
+    # Skip non-dict entries: `k in item` on a str would be a substring test.
+    if isinstance(item, dict) and all(k in item for k in needed_keys):
+        valid_movies.append(item)
+
+# Write the filtered movies to movies.json.
+with open("movies.json", "w", encoding="utf-8") as f:
+    json.dump(valid_movies, f, ensure_ascii=False, indent=2)
+
+print("抓取完成:已生成 movies.html 和 movies.json")
+print(f"共抓取到 {len(valid_movies)} 部电影")
diff --git a/ q2_1_crawler/q2_2.py b/ q2_1_crawler/q2_2.py
new file mode 100644
index 0000000..7129447
--- /dev/null
+++ b/ q2_1_crawler/q2_2.py
@@ -0,0 +1,75 @@
+import json
+from collections import defaultdict
+
+# Load the movie list that q2_1.py wrote to movies.json.
+with open("movies.json", "r", encoding="utf-8") as f:
+    movies = json.load(f)
+
+
+# (1) Highest- and lowest-rated movies.
+def get_rating_extreme():
+    """Print and return the highest- and lowest-rated movies.
+
+    Returns:
+        A ``(highest, lowest)`` tuple of movie dicts, or ``(None, None)``
+        when ``movies`` is empty.
+    """
+    if not movies:
+        # An empty list has no extremes; indexing it would raise IndexError.
+        print("=== ① 评分极值 ===")
+        print("没有电影数据")
+        return None, None
+    # Stable ascending sort: first element is the lowest, last the highest.
+    sorted_movies = sorted(movies, key=lambda x: x["rating"])
+    lowest = sorted_movies[0]
+    highest = sorted_movies[-1]
+    print("=== ① 评分极值 ===")
+    print(f"评分最低电影:{lowest['title']},评分:{lowest['rating']}")
+    print(f"评分最高电影:{highest['title']},评分:{highest['rating']}")
+    return highest, lowest
+
+
+# (2) Number of movies per genre ("genre" is a list, counted per entry).
+def count_genre():
+    """Print and return a dict mapping each genre to its movie count."""
+    genre_count = defaultdict(int)
+    for movie in movies:
+        genres = movie["genre"]
+        # A bare string would otherwise be counted character by character.
+        if isinstance(genres, str):
+            genres = [genres]
+        for g in genres:
+            genre_count[g] += 1
+    result = dict(genre_count)
+    print("\n=== ② 各类型电影数量(字典格式)===")
+    print(result)
+    return result
+
+
+# (3) Number of movies per director.
+def count_director():
+    """Print and return a dict mapping each director to their movie count."""
+    dir_count = defaultdict(int)
+    for movie in movies:
+        dir_count[movie["director"]] += 1
+    result = dict(dir_count)
+    print("\n=== ③ 各导演电影数量(字典格式)===")
+    print(result)
+    return result
+
+
+# (4) Number of movies released in 2020 or later.
+def count_after_2020():
+    """Print and return how many movies have ``year >= 2020``."""
+    cnt = sum(1 for movie in movies if movie["year"] >= 2020)
+    print("\n=== ④ 2020年(含)后上映电影数量 ===")
+    print(f"总数:{cnt}")
+    return cnt
+
+
+# Run every analysis step.
+if __name__ == "__main__":
+    get_rating_extreme()
+    count_genre()
+    count_director()
+    count_after_2020()