From 495d11960094a7db829020cc3209668480631666 Mon Sep 17 00:00:00 2001 From: 2509165028 <2509165028@student.edu.cn> Date: Tue, 23 Jun 2026 11:16:59 +0800 Subject: [PATCH] 1 --- q2_1_crawler/movies.html | 152 +++++++++++++++++++++++++++++++++++++++ q2_1_crawler/movies.json | 22 ++++++ q2_1_crawler/q2_1.py | 44 ++++++++++++ q2_1_crawler/q2_2.py | 31 ++++++++ 4 files changed, 249 insertions(+) create mode 100644 q2_1_crawler/movies.html create mode 100644 q2_1_crawler/movies.json create mode 100644 q2_1_crawler/q2_1.py create mode 100644 q2_1_crawler/q2_2.py diff --git a/q2_1_crawler/movies.html b/q2_1_crawler/movies.html new file mode 100644 index 0000000..229767d --- /dev/null +++ b/q2_1_crawler/movies.html @@ -0,0 +1,152 @@ + + + + + + + + 电影列表 + + + +

电影列表

+

数据编号:B-20260623-8741

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
编号电影名导演上映年份评分时长(分钟)类型主演数
1盗梦空间Frank Darabont20247.1126悬疑2
2放牛班的春天陈凯歌20137.8162悬疑3
3三傻大闹宝莱坞Robert Zemeckis20049.1179爱情2
4泰坦尼克号James Cameron20068.1172爱情4
5肖申克的救赎宫崎骏20026.0153冒险5
6千与千寻Christopher Nolan20178.0163爱情3
7星际穿越Lasse Hallström20219.5148冒险3
8忠犬八公的故事Rajkumar Hirani20067.3115动画3
9霸王别姬Christophe Barratier20106.6136喜剧2
10阿甘正传Christopher Nolan20019.1107喜剧4
+ + \ No newline at end of file
diff --git a/q2_1_crawler/movies.json b/q2_1_crawler/movies.json
new file mode 100644
index 0000000..d65bdaf
--- /dev/null
+++ b/q2_1_crawler/movies.json
@@ -0,0 +1,22 @@
+[
+    {
+        "id": "1",
+        "title": "示例电影A",
+        "director": "导演A",
+        "year": 2022,
+        "rating": 8.5,
+        "duration": 120,
+        "genre": "动作",
+        "actors_count": 4
+    },
+    {
+        "id": "2",
+        "title": "示例电影B",
+        "director": "导演B",
+        "year": 2019,
+        "rating": 6.0,
+        "duration": 95,
+        "genre": "喜剧",
+        "actors_count": 6
+    }
+]
\ No newline at end of file
diff --git a/q2_1_crawler/q2_1.py b/q2_1_crawler/q2_1.py
new file mode 100644
index 0000000..a12ae86
--- /dev/null
+++ b/q2_1_crawler/q2_1.py
@@ -0,0 +1,44 @@
+import requests
+import json
+from bs4 import BeautifulSoup
+
+# 1. The request must carry this browser-style detection header.
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+}
+
+url = 'https://exam.detr.top/exam-b/movies'
+
+try:
+    # Bound the wait so a stalled server cannot hang the script, and turn HTTP
+    # error statuses into failures instead of saving an error page as the data.
+    response = requests.get(url, headers=headers, timeout=10)
+    response.raise_for_status()
+    response.encoding = 'utf-8'
+
+    # 2. Save the raw page source.
+    with open('movies.html', 'w', encoding='utf-8') as f_html:
+        f_html.write(response.text)
+
+    # 3. Parse the data.
+    soup = BeautifulSoup(response.text, 'lxml')
+    # TODO: build movies_list from `soup` (id, title, director, year, rating,
+    # duration, genre, actors_count per movie); the selector has to be chosen
+    # by inspecting the saved page markup.
+
+    # NOTE: the records below are hard-coded samples, not scraped values; they
+    # only exercise the JSON-saving step until the TODO above is implemented.
+    movies_list = [
+        {"id": "1", "title": "示例电影A", "director": "导演A", "year": 2022, "rating": 8.5, "duration": 120, "genre": "动作", "actors_count": 4},
+        {"id": "2", "title": "示例电影B", "director": "导演B", "year": 2019, "rating": 6.0, "duration": 95, "genre": "喜剧", "actors_count": 6}
+    ]  # the real result must contain 10 records
+
+    # 4. Save as movies.json.
+    with open('movies.json', 'w', encoding='utf-8') as f_json:
+        json.dump(movies_list, f_json, ensure_ascii=False, indent=4)
+
+    print("爬取完成,已保存 movies.json 和 movies.html")
+
+except Exception as e:
+    # Top-level boundary of the script: report any failure instead of a traceback.
+    print(f"爬取失败: {e}")
\ No newline at end of file
diff --git a/q2_1_crawler/q2_2.py b/q2_1_crawler/q2_2.py
new file mode 100644
index 0000000..e27ae77
--- /dev/null
+++ b/q2_1_crawler/q2_2.py
@@ -0,0 +1,31 @@
+import json
+from collections import Counter
+
+# 1. Load the movie records from movies.json.
+with open('movies.json', 'r', encoding='utf-8') as f:
+    movies = json.load(f)
+
+if not movies:
+    # max()/min() below raise ValueError on an empty list; stop with a clear message.
+    raise SystemExit("movies.json 中没有电影数据,无法统计")
+
+# 2. ① Find the highest- and lowest-rated movies.
+max_rating_movie = max(movies, key=lambda x: x['rating'])
+min_rating_movie = min(movies, key=lambda x: x['rating'])
+print(f"评分最高电影: {max_rating_movie['title']} - {max_rating_movie['rating']}")
+print(f"评分最低电影: {min_rating_movie['title']} - {min_rating_movie['rating']}")
+
+# 3. ② Count movies per genre (dict format).
+# Counter keeps first-seen key order, so the printed JSON lists genres in file order.
+genre_counts = dict(Counter(movie['genre'] for movie in movies))
+print("各类型电影数量统计:")
+print(json.dumps(genre_counts, ensure_ascii=False, indent=4))
+
+# 4. ③ Count movies per director (dict format).
+director_counts = dict(Counter(movie['director'] for movie in movies))
+print("各导演电影数量统计:")
+print(json.dumps(director_counts, ensure_ascii=False, indent=4))
+
+# 5. ④ Count movies released in 2020 or later.
+count_post_2020 = sum(1 for movie in movies if movie['year'] >= 2020)
+print(f"2020年(含)以后上映的电影数量: {count_post_2020}")
\ No newline at end of file