From e7f5352d503a954ca155956b54965f578353d1a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E4=BD=B3=E8=B1=AA?= <2509165033@student.example.com> Date: Tue, 23 Jun 2026 11:16:17 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E4=BB=B6=E8=87=B3?= =?UTF-8?q?=20q2=5F1=5Fcrawler?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- q2_1_crawler/movies.html | 5 ++ q2_1_crawler/movies.json | 1 + q2_1_crawler/数据分析.py | 139 +++++++++++++++++++++++++++++++++++++++ q2_1_crawler/爬虫.py | 85 ++++++++++++++++++++++++ 4 files changed, 230 insertions(+) create mode 100644 q2_1_crawler/movies.html create mode 100644 q2_1_crawler/movies.json create mode 100644 q2_1_crawler/数据分析.py create mode 100644 q2_1_crawler/爬虫.py diff --git a/q2_1_crawler/movies.html b/q2_1_crawler/movies.html new file mode 100644 index 0000000..0d6b421 --- /dev/null +++ b/q2_1_crawler/movies.html @@ -0,0 +1,5 @@ + + +404 Not Found +

Not Found

+

The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again.

# diff --git a/q2_1_crawler/movies.json b/q2_1_crawler/movies.json
# new file mode 100644
# index 0000000..0637a08
# --- /dev/null
# +++ b/q2_1_crawler/movies.json
# @@ -0,0 +1 @@
# +[]
# \ No newline at end of file
#
# diff --git a/q2_1_crawler/数据分析.py b/q2_1_crawler/数据分析.py
# new file mode 100644
# index 0000000..df384d6
# --- /dev/null
# +++ b/q2_1_crawler/数据分析.py
# @@ -0,0 +1,139 @@
# (Patch headers above are kept verbatim for provenance; the hunk line
# counts describe the file as originally committed.)

# The data set is hard-coded here so the analysis does not depend on any
# external file.

# Key order of every movie record.
_MOVIE_FIELDS = (
    "id",
    "title",
    "director",
    "year",
    "rating",
    "duration",
    "genre",
    "actors_count",
)

# One tuple per movie, in the same order as _MOVIE_FIELDS.
_MOVIE_ROWS = (
    (1, "电影A", "导演A", 2021, 9.2, "120分钟", "剧情", 5),
    (2, "电影B", "导演B", 2020, 8.8, "110分钟", "动作", 6),
    (3, "电影C", "导演A", 2019, 7.5, "130分钟", "剧情", 4),
    (4, "电影D", "导演C", 2022, 9.5, "105分钟", "科幻", 7),
    (5, "电影E", "导演B", 2018, 8.0, "115分钟", "动作", 5),
    (6, "电影F", "导演D", 2023, 8.9, "125分钟", "喜剧", 3),
    (7, "电影G", "导演C", 2020, 7.8, "100分钟", "科幻", 6),
    (8, "电影H", "导演A", 2021, 9.0, "122分钟", "剧情", 4),
    (9, "电影I", "导演D", 2017, 6.5, "98分钟", "喜剧", 3),
    (10, "电影J", "导演B", 2019, 8.3, "118分钟", "悬疑", 5),
)

# List of plain dicts, one per movie, keyed by _MOVIE_FIELDS.
movies = [dict(zip(_MOVIE_FIELDS, row)) for row in _MOVIE_ROWS]

# 1.
# (Continuation of the "1." heading that ends the previous line.)
# Find the highest- and lowest-rated movies.
from collections import Counter

highest_rating = max(movies, key=lambda x: x["rating"])
lowest_rating = min(movies, key=lambda x: x["rating"])
print("1. 评分最高和最低的电影:")
print(f"最高:{highest_rating['title']},评分:{highest_rating['rating']}")
print(f"最低:{lowest_rating['title']},评分:{lowest_rating['rating']}\n")

# 2. Count the movies in each genre.
# Counter keeps first-seen key order, so converting it back to a plain dict
# prints exactly what a hand-written tally loop would.
genre_count = dict(Counter(movie["genre"] for movie in movies))
print("2. 各类型电影数量:")
print(genre_count, "\n")

# 3. Count the movies by each director.
director_count = dict(Counter(movie["director"] for movie in movies))
print("3. 各导演电影数量:")
print(director_count, "\n")

# 4. Count the movies released in 2020 or later.
count_2020 = sum(1 for movie in movies if movie["year"] >= 2020)
print(f"4. 2020年(含)以后上映的电影数量:{count_2020}")
# \ No newline at end of file
#
# diff --git a/q2_1_crawler/爬虫.py b/q2_1_crawler/爬虫.py
# new file mode 100644
# index 0000000..6db4511
# --- /dev/null
# +++ b/q2_1_crawler/爬虫.py
# @@ -0,0 +1,85 @@
# (Patch headers above are kept verbatim for provenance; the hunk line
# counts describe the file as originally committed.)
import json

import requests
from bs4 import BeautifulSoup as bs

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
}

# Accumulator for the parsed movie records and the running record number.
data = []
count = 0

# Only this part changes: use the URL given by the assignment and fetch it
# exactly once (no loop needed).
url = "https://exam.detr.top/examb/movies"
# requests has no default timeout; set one so a stalled server cannot hang
# the script forever.
resp = requests.get(url, headers=headers, timeout=10)
# Stop on HTTP errors instead of saving the error page as if it were data.
# The movies.html committed alongside this script is a "404 Not Found" page
# and movies.json is an empty list, which is what an unchecked response
# produces.
resp.raise_for_status()
resp.encoding = "utf-8"

# 1.
# (Continuation of the "1." heading that ends the previous line.)
# Save the page source to movies.html.
with open("movies.html", "w", encoding="utf-8") as f:
    f.write(resp.text)
print("✅ 网页源码已保存到 movies.html")

soup = bs(resp.text, "html.parser")

# Per the assignment, assume every movie entry on the page shares one generic
# structure. Note: the earlier version of this code targeted Douban's markup;
# the selectors here are generic placeholders so they can be adjusted to the
# real class names of the assignment page.
items = soup.find_all("div", class_="movie-item")
if not items:
    # Make an empty result visible instead of silently writing "[]".
    print("⚠️ 未找到任何 div.movie-item 条目,movies.json 将为空列表")


def _span_text(item, css_class):
    """Return the stripped text of the first <span class=css_class> in item.

    Returns "" when the entry has no such span.
    """
    node = item.find("span", class_=css_class)
    if node is None:
        return ""
    return node.get_text().strip()


def _span_number(item, css_class, convert, default):
    """Return convert(span text), or default if the text is missing or invalid.

    convert is int or float. A missing span yields "", which both converters
    reject with ValueError, so it also falls back to default.
    """
    try:
        return convert(_span_text(item, css_class))
    except ValueError:
        return default


# The assignment asks for all 10 movies, so at most the first 10 entries are
# read. `count` numbers the records from 1 and ends up holding how many
# entries were processed.
for count, item in enumerate(items[:10], start=1):
    # Extract the fields required by the assignment and store the record.
    data.append({
        "id": count,
        "title": _span_text(item, "title"),
        "director": _span_text(item, "director"),
        "year": _span_number(item, "year", int, 0),
        "rating": _span_number(item, "rating", float, 0.0),
        "duration": _span_text(item, "duration"),
        "genre": _span_text(item, "genre"),
        "actors_count": _span_number(item, "actors-count", int, 0),
    })

# Save the records as movies.json.
with open("movies.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print("✅ 爬取完成,数据已保存到 movies.json")
# \ No newline at end of file