diff --git a/q2_1_crawler/movies.html b/q2_1_crawler/movies.html new file mode 100644 index 0000000..2e24df0 --- /dev/null +++ b/q2_1_crawler/movies.html @@ -0,0 +1,152 @@ + + + + + + + + 电影列表 + + + +

电影列表

+

数据编号:B-20260623-2074

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
编号电影名导演上映年份评分时长(分钟)类型主演数
1放牛班的春天Frank Darabont20198.2134爱情2
2忠犬八公的故事陈凯歌20106.7130爱情4
3星际穿越Robert Zemeckis20107.7109动画2
4阿甘正传James Cameron20059.298爱情4
5肖申克的救赎宫崎骏20228.9111冒险4
6霸王别姬Christopher Nolan20059.4169悬疑5
7千与千寻Lasse Hallström20009.2106冒险4
8盗梦空间Rajkumar Hirani20148.0101悬疑2
9泰坦尼克号Christophe Barratier19969.293喜剧5
10三傻大闹宝莱坞Christopher Nolan20037.9114冒险4
+ + \ No newline at end of file diff --git a/q2_1_crawler/movies.json b/q2_1_crawler/movies.json new file mode 100644 index 0000000..0241519 --- /dev/null +++ b/q2_1_crawler/movies.json @@ -0,0 +1,102 @@ +[ + { + "id": "1", + "title": "放牛班的春天", + "director": "Frank Darabont", + "year": 2019, + "rating": 8.2, + "duration": 134, + "genre": "爱情", + "actors_count": 2 + }, + { + "id": "2", + "title": "忠犬八公的故事", + "director": "陈凯歌", + "year": 2010, + "rating": 6.7, + "duration": 130, + "genre": "爱情", + "actors_count": 4 + }, + { + "id": "3", + "title": "星际穿越", + "director": "Robert Zemeckis", + "year": 2010, + "rating": 7.7, + "duration": 109, + "genre": "动画", + "actors_count": 2 + }, + { + "id": "4", + "title": "阿甘正传", + "director": "James Cameron", + "year": 2005, + "rating": 9.2, + "duration": 98, + "genre": "爱情", + "actors_count": 4 + }, + { + "id": "5", + "title": "肖申克的救赎", + "director": "宫崎骏", + "year": 2022, + "rating": 8.9, + "duration": 111, + "genre": "冒险", + "actors_count": 4 + }, + { + "id": "6", + "title": "霸王别姬", + "director": "Christopher Nolan", + "year": 2005, + "rating": 9.4, + "duration": 169, + "genre": "悬疑", + "actors_count": 5 + }, + { + "id": "7", + "title": "千与千寻", + "director": "Lasse Hallström", + "year": 2000, + "rating": 9.2, + "duration": 106, + "genre": "冒险", + "actors_count": 4 + }, + { + "id": "8", + "title": "盗梦空间", + "director": "Rajkumar Hirani", + "year": 2014, + "rating": 8.0, + "duration": 101, + "genre": "悬疑", + "actors_count": 2 + }, + { + "id": "9", + "title": "泰坦尼克号", + "director": "Christophe Barratier", + "year": 1996, + "rating": 9.2, + "duration": 93, + "genre": "喜剧", + "actors_count": 5 + }, + { + "id": "10", + "title": "三傻大闹宝莱坞", + "director": "Christopher Nolan", + "year": 2003, + "rating": 7.9, + "duration": 114, + "genre": "冒险", + "actors_count": 4 + } +] \ No newline at end of file diff --git a/q2_1_crawler/q2_1.py b/q2_1_crawler/q2_1.py new file mode 100644 index 0000000..caae0d4 --- /dev/null +++ 
b/q2_1_crawler/q2_1.py
@@ -0,0 +1,61 @@
+"""Crawl the exam movie list page and save it as movies.html and movies.json."""
+import json
+
+import requests
+from bs4 import BeautifulSoup
+
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+}
+
+url = "https://exam.detr.top/exam-b/movies"
+
+try:
+    # A timeout keeps the script from hanging forever on a stalled connection.
+    response = requests.get(url, headers=headers, timeout=10)
+    # Stop on 4xx/5xx responses instead of saving and parsing an error page.
+    response.raise_for_status()
+    response.encoding = 'utf-8'
+    html_content = response.text
+    # Keep a copy of the raw page alongside the extracted data.
+    with open('movies.html', 'w', encoding='utf-8') as f_html:
+        f_html.write(html_content)
+
+    soup = BeautifulSoup(html_content, 'html.parser')
+    table = soup.find('table')
+    if table is None:
+        raise ValueError("no <table> element found in the page")
+    # Fall back to the table itself when the markup has no explicit <tbody>.
+    tbody = table.find('tbody')
+    if tbody is None:
+        tbody = table
+    rows = tbody.find_all('tr')
+
+    movies_data = []
+
+    for row in rows:
+        tds = row.find_all('td')
+        # Rows without the 8 expected <td> cells are skipped.
+        if len(tds) >= 8:
+            movie = {
+                "id": tds[0].text.strip(),
+                "title": tds[1].text.strip(),
+                "director": tds[2].text.strip(),
+                "year": int(tds[3].text.strip()),
+                "rating": float(tds[4].text.strip()),
+                "duration": int(tds[5].text.strip()),
+                "genre": tds[6].text.strip(),
+                "actors_count": int(tds[7].text.strip())
+            }
+            movies_data.append(movie)
+
+    # ensure_ascii=False keeps the Chinese text readable in the output file.
+    with open('movies.json', 'w', encoding='utf-8') as f_json:
+        json.dump(movies_data, f_json, ensure_ascii=False, indent=4)
+
+    print(f"爬取成功!共获取 {len(movies_data)} 条电影数据。")
+    print("文件 movies.html 和 movies.json 已保存。")
+
+except Exception as e:
+    # Top-level boundary of the script: report any network, file or parse error.
+    print(f"爬取或解析失败,错误信息:{e}")
diff --git a/q2_1_crawler/q2_2.py b/q2_1_crawler/q2_2.py
new file mode 100644
index 0000000..f8f1ea8
--- /dev/null
+++ b/q2_1_crawler/q2_2.py
@@ -0,0 +1,37 @@
+"""Print summary statistics for the movies stored in movies.json."""
+import json
+from collections import Counter
+from pathlib import Path
+
+# Locate movies.json next to this script so the result does not depend on the
+# current working directory or on the platform's path separator.
+json_path = Path(__file__).resolve().with_name('movies.json')
+with open(json_path, 'r', encoding='utf-8') as f:
+    movies = json.load(f)
+
+if movies:
+    # sorted() is stable: on equal ratings the first entry is the earliest
+    # lowest-rated movie in the file and the last entry the latest highest-rated.
+    sorted_by_rating = sorted(movies, key=lambda x: x['rating'])
+    lowest_movie = sorted_by_rating[0]
+    highest_movie = sorted_by_rating[-1]
+
+    print("2.1 最高评分电影:")
+    print(f"电影名: {highest_movie['title']}, 评分: {highest_movie['rating']}")
+    print("\n最低评分电影:")
+    print(f"电影名: {lowest_movie['title']}, 评分: {lowest_movie['rating']}")
+
+# dict() keeps the plain-dict output format; keys stay in first-seen order.
+genre_counts = dict(Counter(movie['genre'] for movie in movies))
+
+print("\n2.2 各类型电影数量字典:")
+print(genre_counts)
+
+director_counts = dict(Counter(movie['director'] for movie in movies))
+
+print("\n2.3 各导演电影数量字典:")
+print(director_counts)
+
+count_2020_later = sum(1 for movie in movies if movie['year'] >= 2020)
+
+print(f"\n2.4 2020年(含)以后上映的电影数量: {count_2020_later} 部")
\ No newline at end of file