diff --git a/q2_1_crawler/movies.html b/q2_1_crawler/movies.html new file mode 100644 index 0000000..ff3d403 --- /dev/null +++ b/q2_1_crawler/movies.html @@ -0,0 +1,152 @@ + + + + + + + + 电影列表 + + + +

电影列表

+

数据编号:B-20260623-4047

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
编号电影名导演上映年份评分时长(分钟)类型主演数
1盗梦空间Frank Darabont19966.2109爱情4
2阿甘正传陈凯歌19948.1152喜剧2
3千与千寻Robert Zemeckis20007.997科幻4
4泰坦尼克号James Cameron20007.1104喜剧2
5肖申克的救赎宫崎骏20156.6106爱情2
6放牛班的春天Christopher Nolan20007.1121喜剧3
7星际穿越Lasse Hallström19916.394剧情3
8霸王别姬Rajkumar Hirani20167.6128动画2
9忠犬八公的故事Christophe Barratier19958.4143科幻3
10三傻大闹宝莱坞Christopher Nolan20196.4137剧情2
# --- q2_1_crawler/q2_1.py: crawler ------------------------------------------
import json
import re
from collections import Counter

# Matches the "数据编号:B-20260623-4047" style line shown on the page.
_DATA_ID_RE = re.compile(r"数据编号\s*[::]\s*([A-Za-z0-9_-]+)")
# Separators that may join several genres inside a single table cell.
_GENRE_SEPARATORS = re.compile(r"[/,,、|]")
# Header row of the saved page: id, title, director, year, rating,
# duration (minutes), genre, actor count.
_TABLE_COLUMN_COUNT = 8


def _first_int(text):
    """Return the first run of digits in *text* as an int.

    Raises:
        ValueError: if *text* contains no digits.
    """
    match = re.search(r"\d+", text)
    if match is None:
        raise ValueError(f"no integer in {text!r}")
    return int(match.group())


def _split_genres(text):
    """Split a genre cell into a list of non-empty, stripped genre names."""
    return [part.strip() for part in _GENRE_SEPARATORS.split(text) if part.strip()]


def _movie_from_cells(cells):
    """Build a movie dict from the eight cell strings of one table row.

    Returns None when the row does not have the expected number of cells or
    when a numeric column cannot be converted (e.g. a header row).
    """
    if len(cells) != _TABLE_COLUMN_COUNT:
        return None
    movie_id, title, director, year, rating, duration, genre, actors = cells
    try:
        return {
            'id': movie_id,
            'title': title,
            'director': director,
            'year': int(year),
            'rating': float(rating),
            'duration': _first_int(duration),
            'genre': _split_genres(genre),
            'actors_count': int(actors),
        }
    except ValueError:
        return None


def _movie_from_card(elem):
    """Build a movie dict from a ``<div class="movie">`` card element.

    These are the selectors of the original template; they are kept so that a
    page using that card layout is still parsed the same way.
    """
    def text_of(tag, css_class):
        return elem.find(tag, class_=css_class).text.strip()

    return {
        'id': elem.get('data-id') or text_of('span', 'id'),
        'title': text_of('h3', 'title'),
        'director': text_of('span', 'director'),
        'year': int(text_of('span', 'year')),
        'rating': float(text_of('span', 'rating')),
        'duration': _first_int(text_of('span', 'duration')),
        'genre': [g.text.strip() for g in elem.find_all('span', class_='genre')],
        'actors_count': int(text_of('span', 'actors_count')),
    }


def _extract_data_id(soup):
    """Return the page's data id, or "unknown" when none can be found."""
    tag = soup.find('span', id='data-id')
    if tag is not None and tag.get_text(strip=True):
        return tag.get_text(strip=True)
    # Fallback: look for the "数据编号:<id>" text anywhere on the page.
    match = _DATA_ID_RE.search(soup.get_text(" ", strip=True))
    return match.group(1) if match else "unknown"


def _extract_movies(soup):
    """Return the list of movie dicts found on the page.

    Card elements are tried first; if there are none, table rows are parsed.
    NOTE(review): the table fallback assumes each movie is a <tr> holding
    eight <td> cells in header order - confirm against the saved movies.html.
    """
    cards = soup.find_all('div', class_='movie')
    if cards:
        return [_movie_from_card(card) for card in cards]

    movies = []
    for row in soup.find_all('tr'):
        cells = [cell.get_text(strip=True) for cell in row.find_all('td')]
        movie = _movie_from_cells(cells)
        if movie is not None:
            movies.append(movie)
    return movies


def crawl_movies(url="https://exam.detr.top/exam-b/movies", timeout=10):
    """Download the movie page, save it, and export the parsed data as JSON.

    Writes the raw page to ``movies.html`` and the parsed result to
    ``movies.json`` in the current working directory.

    Args:
        url: Page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict ``{"data_id": str, "movies": list[dict]}``.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status.
    """
    # Imported here so the analysis code in this listing can be used without
    # the crawler's third-party dependencies being installed.
    import requests
    from bs4 import BeautifulSoup

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    resp = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of saving and parsing an error page.
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    html = resp.text

    # Keep the raw page source next to the parsed output.
    with open("movies.html", "w", encoding="utf-8") as f:
        f.write(html)

    soup = BeautifulSoup(html, 'html.parser')
    result = {
        "data_id": _extract_data_id(soup),
        "movies": _extract_movies(soup),
    }
    if not result["movies"]:
        # An empty export almost always means the selectors no longer match.
        print("警告:未解析到任何电影数据,请检查网页结构")

    with open("movies.json", "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print("爬取完成,已生成 movies.html 和 movies.json")
    return result


if __name__ == "__main__":
    crawl_movies()


# --- q2_1_crawler/q2_2.py: analysis -----------------------------------------
def analyze_movies(json_path="movies.json"):
    """Print summary statistics for the movies stored in *json_path*.

    Reports the highest and lowest rated movie, the number of movies per
    genre, the number of movies per director, and how many movies were
    released in 2020 or later. Prints a notice and returns early when the
    file contains no movies.

    Args:
        json_path: Path to a JSON file shaped like
            ``{"data_id": ..., "movies": [...]}``.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    movies = data.get("movies") or []

    # max()/min() raise ValueError on an empty sequence, so stop early.
    if not movies:
        print("没有可分析的电影数据")
        return

    # 1) Highest and lowest rating.
    max_movie = max(movies, key=lambda x: x['rating'])
    min_movie = min(movies, key=lambda x: x['rating'])
    print(f"最高评分电影:{max_movie['title']},评分:{max_movie['rating']}")
    print(f"最低评分电影:{min_movie['title']},评分:{min_movie['rating']}")

    # 2) Movies per genre.
    genre_counter = Counter()
    for movie in movies:
        genres = movie['genre']
        # A bare string would otherwise be counted character by character.
        if isinstance(genres, str):
            genres = [genres]
        genre_counter.update(genres)
    print("各类型电影数量:", dict(genre_counter))

    # 3) Movies per director.
    director_counter = Counter(movie['director'] for movie in movies)
    print("各导演电影数量:", dict(director_counter))

    # 4) Movies released in 2020 or later.
    count_2020_plus = sum(1 for movie in movies if movie['year'] >= 2020)
    print(f"2020年(含)以后上映的电影数量:{count_2020_plus}")


if __name__ == "__main__":
    analyze_movies()