diff --git a/q2_1_crawler/movie.json b/q2_1_crawler/movie.json new file mode 100644 index 0000000..fdce86a --- /dev/null +++ b/q2_1_crawler/movie.json @@ -0,0 +1,102 @@ +[ + { + "id": 1, + "title": "三傻大闹宝莱坞", + "director": "Frank Darabont", + "year": 2018, + "rating": 7.0, + "duration": 118, + "genre": "动画", + "actors_count": 5 + }, + { + "id": 2, + "title": "霸王别姬", + "director": "陈凯歌", + "year": 2012, + "rating": 7.1, + "duration": 119, + "genre": "爱情", + "actors_count": 4 + }, + { + "id": 3, + "title": "星际穿越", + "director": "Robert Zemeckis", + "year": 2015, + "rating": 8.8, + "duration": 171, + "genre": "冒险", + "actors_count": 3 + }, + { + "id": 4, + "title": "肖申克的救赎", + "director": "James Cameron", + "year": 2017, + "rating": 8.2, + "duration": 149, + "genre": "剧情", + "actors_count": 3 + }, + { + "id": 5, + "title": "阿甘正传", + "director": "宫崎骏", + "year": 2001, + "rating": 7.1, + "duration": 163, + "genre": "悬疑", + "actors_count": 3 + }, + { + "id": 6, + "title": "泰坦尼克号", + "director": "Christopher Nolan", + "year": 1996, + "rating": 8.6, + "duration": 171, + "genre": "冒险", + "actors_count": 5 + }, + { + "id": 7, + "title": "放牛班的春天", + "director": "Lasse Hallström", + "year": 2010, + "rating": 7.8, + "duration": 126, + "genre": "科幻", + "actors_count": 2 + }, + { + "id": 8, + "title": "千与千寻", + "director": "Rajkumar Hirani", + "year": 2002, + "rating": 8.6, + "duration": 160, + "genre": "悬疑", + "actors_count": 5 + }, + { + "id": 9, + "title": "忠犬八公的故事", + "director": "Christophe Barratier", + "year": 1997, + "rating": 7.9, + "duration": 138, + "genre": "冒险", + "actors_count": 5 + }, + { + "id": 10, + "title": "盗梦空间", + "director": "Christopher Nolan", + "year": 2008, + "rating": 7.3, + "duration": 158, + "genre": "爱情", + "actors_count": 5 + } +] \ No newline at end of file diff --git a/q2_1_crawler/movies.html b/q2_1_crawler/movies.html new file mode 100644 index 0000000..513eae1 --- /dev/null +++ b/q2_1_crawler/movies.html @@ -0,0 +1,152 @@ + + + + + + + + 电影列表 
+ + + +

电影列表

+

数据编号:B-20260623-1192

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
编号电影名导演上映年份评分时长(分钟)类型主演数
1三傻大闹宝莱坞Frank Darabont20187.0118动画5
2霸王别姬陈凯歌20127.1119爱情4
3星际穿越Robert Zemeckis20158.8171冒险3
4肖申克的救赎James Cameron20178.2149剧情3
5阿甘正传宫崎骏20017.1163悬疑3
6泰坦尼克号Christopher Nolan19968.6171冒险5
7放牛班的春天Lasse Hallström20107.8126科幻2
8千与千寻Rajkumar Hirani20028.6160悬疑5
9忠犬八公的故事Christophe Barratier19977.9138冒险5
10盗梦空间Christopher Nolan20087.3158爱情5
# ===== q2_1_crawler/q2_1.py: crawl the movie table and save it as JSON =====
import json

# Browser-like User-Agent sent with the request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/149.0.0.0 Safari/537.36 Edg/149.0.0.0',
}
MOVIES_URL = "https://exam.detr.top/exam-b/movies"
# Seconds to wait for the server; without a timeout requests may block forever.
REQUEST_TIMEOUT = 10

# (JSON field name, converter) for each table column, in column order.
MOVIE_COLUMNS = (
    ("id", int),
    ("title", str),
    ("director", str),
    ("year", int),
    ("rating", float),
    ("duration", int),
    ("genre", str),
    ("actors_count", int),
)


def fetch_html(url=MOVIES_URL, timeout=REQUEST_TIMEOUT):
    """Download *url* and return the response body decoded as UTF-8.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so
            an error page is never saved or parsed as the movie table.
        requests.Timeout: if the server does not answer within *timeout*
            seconds.
    """
    # Imported here so the pure helpers in this module stay importable on
    # machines without the crawler's third-party packages.
    import requests

    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    resp.encoding = "utf-8"
    return resp.text


def row_to_movie(cells):
    """Convert the text of one table row's cells into a movie dict.

    Args:
        cells: cell texts in column order (see ``MOVIE_COLUMNS``). Extra
            trailing cells are ignored.

    Returns:
        A dict keyed by the ``MOVIE_COLUMNS`` field names, or ``None`` when
        the row has too few cells (e.g. a header or spacer row).

    Raises:
        ValueError: if a numeric cell does not contain a valid number.
    """
    if len(cells) < len(MOVIE_COLUMNS):
        return None
    return {
        name: convert(text)
        for (name, convert), text in zip(MOVIE_COLUMNS, cells)
    }


def parse_movies(html):
    """Extract every movie row found inside the ``<tbody>`` elements of *html*.

    Returns:
        A list of movie dicts in document order; rows without enough
        ``<td>`` cells are skipped instead of raising ``IndexError``.
    """
    # Lazy import for the same reason as in fetch_html().
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "html.parser")
    movies = []
    for tbody in soup.find_all("tbody"):
        for tr in tbody.find_all("tr"):
            cells = [td.get_text(strip=True) for td in tr.find_all("td")]
            movie = row_to_movie(cells)
            if movie is not None:
                movies.append(movie)
    return movies


def crawl(url=MOVIES_URL, html_path="movies.html", json_path="movie.json"):
    """Fetch the movie page, save the raw HTML and the parsed rows as JSON.

    The parsed list is also printed, and returned for convenience.
    """
    html = fetch_html(url)
    with open(html_path, "w", encoding="utf-8") as f:
        f.write(html)

    movies = parse_movies(html)
    print(movies)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(movies, f, ensure_ascii=False, indent=4)
    return movies


if __name__ == "__main__":
    crawl()


# ===== q2_1_crawler/q2_2.py: statistics over the crawled movie.json =====
import json
from collections import Counter


def load_movies(path="movie.json"):
    """Read the list of movie dicts written by the crawler."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def rating_extremes(movies):
    """Return ``(highest, lowest)`` rated movies, or ``None`` if *movies* is empty.

    On ties the first movie in list order wins, for both ends.
    """
    if not movies:
        return None

    def by_rating(movie):
        return movie["rating"]

    return max(movies, key=by_rating), min(movies, key=by_rating)


def count_by(movies, field):
    """Count movies per distinct value of *field*.

    Returns a plain dict whose keys appear in first-seen order.
    """
    return dict(Counter(movie[field] for movie in movies))


def count_released_since(movies, year=2020):
    """Count the movies whose ``year`` is *year* or later."""
    return sum(1 for movie in movies if movie["year"] >= year)


def report(movies, since_year=2020):
    """Print the four statistics sections for *movies*."""
    # 1. Highest / lowest rated movie: title and rating.
    print("=====① 最高/最低评分电影=====")
    extremes = rating_extremes(movies)
    if extremes is None:
        # An empty list has no extremes; say so instead of crashing in max().
        print("(无电影数据)")
    else:
        best, worst = extremes
        print(f"最高分电影:{best['title']},评分:{best['rating']}")
        print(f"最低分电影:{worst['title']},评分:{worst['rating']}")

    # 2. Number of movies per genre.
    print("\n=====② 各类型电影数量=====")
    print(count_by(movies, "genre"))

    # 3. Number of movies per director.
    print("\n=====③ 各导演电影数量=====")
    print(count_by(movies, "director"))

    # 4. Number of movies released in since_year or later.
    print(f"\n=====④ {since_year}年及以后上映影片总数=====")
    print(f"数量:{count_released_since(movies, since_year)}")


if __name__ == "__main__":
    report(load_movies())