From 495d11960094a7db829020cc3209668480631666 Mon Sep 17 00:00:00 2001
From: 2509165028 <2509165028@student.edu.cn>
Date: Tue, 23 Jun 2026 11:16:59 +0800
Subject: [PATCH] Add movie list crawler (q2_1.py) and statistics script (q2_2.py)
---
q2_1_crawler/movies.html | 152 +++++++++++++++++++++++++++++++++++++++
q2_1_crawler/movies.json | 22 ++++++
q2_1_crawler/q2_1.py | 44 ++++++++++++
q2_1_crawler/q2_2.py | 31 ++++++++
4 files changed, 249 insertions(+)
create mode 100644 q2_1_crawler/movies.html
create mode 100644 q2_1_crawler/movies.json
create mode 100644 q2_1_crawler/q2_1.py
create mode 100644 q2_1_crawler/q2_2.py
diff --git a/q2_1_crawler/movies.html b/q2_1_crawler/movies.html
new file mode 100644
index 0000000..229767d
--- /dev/null
+++ b/q2_1_crawler/movies.html
@@ -0,0 +1,152 @@
+
+
+
+
+
+
+
+ 电影列表
+
+
+
+ 电影列表
+ 数据编号:B-20260623-8741
+
+
+
+
+ | 编号 |
+ 电影名 |
+ 导演 |
+ 上映年份 |
+ 评分 |
+ 时长(分钟) |
+ 类型 |
+ 主演数 |
+
+
+
+
+
+ | 1 |
+ 盗梦空间 |
+ Frank Darabont |
+ 2024 |
+ 7.1 |
+ 126 |
+ 悬疑 |
+ 2 |
+
+
+
+ | 2 |
+ 放牛班的春天 |
+ 陈凯歌 |
+ 2013 |
+ 7.8 |
+ 162 |
+ 悬疑 |
+ 3 |
+
+
+
+ | 3 |
+ 三傻大闹宝莱坞 |
+ Robert Zemeckis |
+ 2004 |
+ 9.1 |
+ 179 |
+ 爱情 |
+ 2 |
+
+
+
+ | 4 |
+ 泰坦尼克号 |
+ James Cameron |
+ 2006 |
+ 8.1 |
+ 172 |
+ 爱情 |
+ 4 |
+
+
+
+ | 5 |
+ 肖申克的救赎 |
+ 宫崎骏 |
+ 2002 |
+ 6.0 |
+ 153 |
+ 冒险 |
+ 5 |
+
+
+
+ | 6 |
+ 千与千寻 |
+ Christopher Nolan |
+ 2017 |
+ 8.0 |
+ 163 |
+ 爱情 |
+ 3 |
+
+
+
+ | 7 |
+ 星际穿越 |
+ Lasse Hallström |
+ 2021 |
+ 9.5 |
+ 148 |
+ 冒险 |
+ 3 |
+
+
+
+ | 8 |
+ 忠犬八公的故事 |
+ Rajkumar Hirani |
+ 2006 |
+ 7.3 |
+ 115 |
+ 动画 |
+ 3 |
+
+
+
+ | 9 |
+ 霸王别姬 |
+ Christophe Barratier |
+ 2010 |
+ 6.6 |
+ 136 |
+ 喜剧 |
+ 2 |
+
+
+
+ | 10 |
+ 阿甘正传 |
+ Christopher Nolan |
+ 2001 |
+ 9.1 |
+ 107 |
+ 喜剧 |
+ 4 |
+
+
+
+
+
+
\ No newline at end of file
diff --git a/q2_1_crawler/movies.json b/q2_1_crawler/movies.json
new file mode 100644
index 0000000..d65bdaf
--- /dev/null
+++ b/q2_1_crawler/movies.json
@@ -0,0 +1,22 @@
+[
+ {
+ "id": "1",
+ "title": "示例电影A",
+ "director": "导演A",
+ "year": 2022,
+ "rating": 8.5,
+ "duration": 120,
+ "genre": "动作",
+ "actors_count": 4
+ },
+ {
+ "id": "2",
+ "title": "示例电影B",
+ "director": "导演B",
+ "year": 2019,
+ "rating": 6.0,
+ "duration": 95,
+ "genre": "喜剧",
+ "actors_count": 6
+ }
+]
\ No newline at end of file
diff --git a/q2_1_crawler/q2_1.py b/q2_1_crawler/q2_1.py
new file mode 100644
index 0000000..a12ae86
--- /dev/null
+++ b/q2_1_crawler/q2_1.py
@@ -0,0 +1,71 @@
+"""Crawl the exam movie list and save it as movies.html and movies.json."""
+import json
+
+import requests
+from bs4 import BeautifulSoup
+
+# Column order of a movie row: output key and converter for the cell text.
+FIELDS = (
+    ("id", str),
+    ("title", str),
+    ("director", str),
+    ("year", int),
+    ("rating", float),
+    ("duration", int),
+    ("genre", str),
+    ("actors_count", int),
+)
+
+# 1. The request must include this browser-like User-Agent header.
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+}
+
+url = 'https://exam.detr.top/exam-b/movies'
+
+
+def parse_movies(html):
+    """Extract the movie records from the page source.
+
+    Every table row with at least ``len(FIELDS)`` <td> cells becomes one dict
+    keyed by FIELDS; rows with fewer <td> cells (e.g. a <th> header row) are
+    skipped. Raises ValueError when a numeric cell cannot be converted.
+
+    NOTE(review): assumes each movie is a <tr> of <td> cells in FIELDS
+    order, as the saved movies.html suggests -- confirm against the page.
+    """
+    movies = []
+    for row in BeautifulSoup(html, 'lxml').select('tr'):
+        cells = [td.get_text(strip=True) for td in row.find_all('td')]
+        if len(cells) < len(FIELDS):
+            continue
+        movies.append(
+            {key: convert(text) for (key, convert), text in zip(FIELDS, cells)}
+        )
+    return movies
+
+
+try:
+    # A timeout keeps an unresponsive server from hanging the script.
+    response = requests.get(url, headers=headers, timeout=10)
+    # Fail on 4xx/5xx instead of saving an error page as data.
+    response.raise_for_status()
+    response.encoding = 'utf-8'
+
+    # 2. Save the raw page source.
+    with open('movies.html', 'w', encoding='utf-8') as f_html:
+        f_html.write(response.text)
+
+    # 3. Parse the real rows rather than writing placeholder sample data.
+    movies_list = parse_movies(response.text)
+    if not movies_list:
+        raise ValueError("no movie rows found in the downloaded page")
+
+    # 4. Save the records as movies.json.
+    with open('movies.json', 'w', encoding='utf-8') as f_json:
+        json.dump(movies_list, f_json, ensure_ascii=False, indent=4)
+
+    print("爬取完成,已保存 movies.json 和 movies.html")
+
+except (requests.RequestException, OSError, ValueError) as e:
+    print(f"爬取失败: {e}")
\ No newline at end of file
diff --git a/q2_1_crawler/q2_2.py b/q2_1_crawler/q2_2.py
new file mode 100644
index 0000000..e27ae77
--- /dev/null
+++ b/q2_1_crawler/q2_2.py
@@ -0,0 +1,32 @@
+"""Print summary statistics for the movies stored in movies.json."""
+import json
+from collections import Counter
+
+# 1. Load the crawled records (a JSON array of movie objects).
+with open('movies.json', 'r', encoding='utf-8') as f:
+    movies = json.load(f)
+
+# max()/min() raise ValueError on an empty sequence, so stop early with a
+# clear message when there is nothing to analyse.
+if not movies:
+    raise SystemExit("movies.json 中没有电影数据")
+
+# 2. (1) Highest- and lowest-rated movies.
+max_rating_movie = max(movies, key=lambda x: x['rating'])
+min_rating_movie = min(movies, key=lambda x: x['rating'])
+print(f"评分最高电影: {max_rating_movie['title']} - {max_rating_movie['rating']}")
+print(f"评分最低电影: {min_rating_movie['title']} - {min_rating_movie['rating']}")
+
+# 3. (2) Number of movies per genre, as a dict in first-seen order.
+genre_counts = dict(Counter(movie['genre'] for movie in movies))
+print("各类型电影数量统计:")
+print(json.dumps(genre_counts, ensure_ascii=False, indent=4))
+
+# 4. (3) Number of movies per director, as a dict in first-seen order.
+director_counts = dict(Counter(movie['director'] for movie in movies))
+print("各导演电影数量统计:")
+print(json.dumps(director_counts, ensure_ascii=False, indent=4))
+
+# 5. (4) Number of movies released in 2020 or later.
+count_post_2020 = sum(1 for movie in movies if movie['year'] >= 2020)
+print(f"2020年(含)以后上映的电影数量: {count_post_2020}")
\ No newline at end of file