# ===== q2_1_crawler/260623.py (new file in this diff) =====
# Reads movies.json from the current directory and prints four statistics:
# the highest/lowest rated movie, the number of movies per genre, the number
# of movies per director, and how many movies were released in 2020 or later.
import json
from collections import Counter

with open("movies.json", "r", encoding="utf-8") as f:
    total_data = json.load(f)
movie_list = total_data["movies"]

# (1) Highest- and lowest-rated movie.
# max()/min() raise ValueError on an empty sequence, so guard that case.
# On equal ratings both built-ins return the first matching item in list order.
if movie_list:
    max_movie = max(movie_list, key=lambda x: x["rating"])
    min_movie = min(movie_list, key=lambda x: x["rating"])
    print("评分最高电影:", max_movie["title"], ",评分:", max_movie["rating"])
    print("评分最低电影:", min_movie["title"], ",评分:", min_movie["rating"])
else:
    print("电影列表为空,无法统计最高/最低评分")

# (2) Number of movies per genre.
# A "genre" field may hold several comma-separated genres; each one counts.
genre_counter = Counter()
for m in movie_list:
    for g in m["genre"].split(","):
        g = g.strip()
        if g:  # skip empty tokens produced by blank fields or trailing commas
            genre_counter[g] += 1
# Convert to a plain dict so the printed form stays "{...}".
genre_dict = dict(genre_counter)
print("\n各类型电影数量:", genre_dict)

# (3) Number of movies per director.
director_dict = dict(Counter(m["director"] for m in movie_list))
print("\n各导演电影数量:", director_dict)

# (4) Number of movies released in 2020 or later (inclusive).
cnt = sum(1 for m in movie_list if m["year"] >= 2020)
print("\n2020年(含)以后上映电影总数:", cnt)


# ===== q2_1_crawler/26062339.py (new file in this diff) =====
# Downloads the movie list page, saves the raw HTML to movies.html, parses the
# first <table> on the page into a list of dicts and writes it, together with
# the page's data id, to movies.json.
import json

import requests
from bs4 import BeautifulSoup

# Request headers (the original comment notes that sending them is a hard
# requirement of the assignment).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
url = "https://exam.detr.top/exam-b/movies"

# requests has no default timeout; without one a stalled server would block
# the script forever.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
response.encoding = "utf-8"

# Save the page source.
with open("movies.html", "w", encoding="utf-8") as f:
    f.write(response.text)

# Parse the table data.
soup = BeautifulSoup(response.text, "html.parser")
table = soup.find("table")
if table is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError(f"no <table> element found in the page at {url}")

# Number of columns read from each data row (id ... actors_count).
expected_columns = 8

movie_list = []
# The first <tr> is skipped as the header row.
for row in table.find_all("tr")[1:]:
    cells = row.find_all("td")
    if not cells:
        # Row without any data cell: nothing to record.
        continue
    if len(cells) < expected_columns:
        raise ValueError(
            f"expected {expected_columns} columns in table row, "
            f"got {len(cells)}: {row.get_text(' ', strip=True)!r}"
        )
    texts = [c.text.strip() for c in cells]
    movie_list.append(
        {
            "id": int(texts[0]),
            "title": texts[1],
            "director": texts[2],
            "year": int(texts[3]),
            "rating": float(texts[4]),
            "duration": int(texts[5]),
            "genre": texts[6],
            "actors_count": int(texts[7]),
        }
    )

# Extract the page's data id from the first <code> element.
code_tag = soup.find("code")
if code_tag is None:
    raise ValueError(f"no <code> element (data id) found in the page at {url}")
data_code = code_tag.get_text(strip=True)

result = {
    "data_id": data_code,
    "movies": movie_list,
}

# Write the JSON file; ensure_ascii=False keeps non-ASCII text readable.
with open("movies.json", "w", encoding="utf-8") as f:
    json.dump(result, f, ensure_ascii=False, indent=4)

print("✅ 爬取完成,两个文件已正常生成")


# ===== q2_1_crawler/movies.html (new file in this diff; its content follows) =====

电影列表

+

数据编号:B-20260623-3049

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
编号电影名导演上映年份评分时长(分钟)类型主演数
1千与千寻Frank Darabont20138.0126剧情3
2肖申克的救赎陈凯歌20186.8127悬疑2
3星际穿越Robert Zemeckis20249.0131冒险2
4阿甘正传James Cameron19998.2160喜剧5
5三傻大闹宝莱坞宫崎骏19969.495动画4
6泰坦尼克号Christopher Nolan20088.690科幻3
7忠犬八公的故事Lasse Hallström19966.8168喜剧3
8放牛班的春天Rajkumar Hirani20209.3112喜剧5
9盗梦空间Christophe Barratier20059.1154剧情4
10霸王别姬Christopher Nolan20158.7103剧情5
+ + \ No newline at end of file diff --git a/q2_1_crawler/movies.json b/q2_1_crawler/movies.json new file mode 100644 index 0000000..8e4e626 --- /dev/null +++ b/q2_1_crawler/movies.json @@ -0,0 +1,105 @@ +{ + "data_id": "B-20260623-3049", + "movies": [ + { + "id": 1, + "title": "千与千寻", + "director": "Frank Darabont", + "year": 2013, + "rating": 8.0, + "duration": 126, + "genre": "剧情", + "actors_count": 3 + }, + { + "id": 2, + "title": "肖申克的救赎", + "director": "陈凯歌", + "year": 2018, + "rating": 6.8, + "duration": 127, + "genre": "悬疑", + "actors_count": 2 + }, + { + "id": 3, + "title": "星际穿越", + "director": "Robert Zemeckis", + "year": 2024, + "rating": 9.0, + "duration": 131, + "genre": "冒险", + "actors_count": 2 + }, + { + "id": 4, + "title": "阿甘正传", + "director": "James Cameron", + "year": 1999, + "rating": 8.2, + "duration": 160, + "genre": "喜剧", + "actors_count": 5 + }, + { + "id": 5, + "title": "三傻大闹宝莱坞", + "director": "宫崎骏", + "year": 1996, + "rating": 9.4, + "duration": 95, + "genre": "动画", + "actors_count": 4 + }, + { + "id": 6, + "title": "泰坦尼克号", + "director": "Christopher Nolan", + "year": 2008, + "rating": 8.6, + "duration": 90, + "genre": "科幻", + "actors_count": 3 + }, + { + "id": 7, + "title": "忠犬八公的故事", + "director": "Lasse Hallström", + "year": 1996, + "rating": 6.8, + "duration": 168, + "genre": "喜剧", + "actors_count": 3 + }, + { + "id": 8, + "title": "放牛班的春天", + "director": "Rajkumar Hirani", + "year": 2020, + "rating": 9.3, + "duration": 112, + "genre": "喜剧", + "actors_count": 5 + }, + { + "id": 9, + "title": "盗梦空间", + "director": "Christophe Barratier", + "year": 2005, + "rating": 9.1, + "duration": 154, + "genre": "剧情", + "actors_count": 4 + }, + { + "id": 10, + "title": "霸王别姬", + "director": "Christopher Nolan", + "year": 2015, + "rating": 8.7, + "duration": 103, + "genre": "剧情", + "actors_count": 5 + } + ] +} \ No newline at end of file