diff --git a/q2_1_crawler/260623.py b/q2_1_crawler/260623.py
new file mode 100644
index 0000000..d52d0a0
--- /dev/null
+++ b/q2_1_crawler/260623.py
@@ -0,0 +1,38 @@
+"""Summarise the movie records stored in movies.json.
+
+Reads the "movies" list from movies.json and prints the highest- and
+lowest-rated titles, the number of movies per genre and per director, and
+the number of movies released in 2020 or later.
+"""
+import json
+from collections import Counter
+
+with open("movies.json", "r", encoding="utf-8") as f:
+    total_data = json.load(f)
+movie_list = total_data["movies"]
+if not movie_list:
+    # max()/min() below would raise ValueError on an empty list.
+    raise SystemExit("movies.json contains no movie records")
+
+# (1) Highest- and lowest-rated movies.
+max_movie = max(movie_list, key=lambda x: x["rating"])
+min_movie = min(movie_list, key=lambda x: x["rating"])
+print("评分最高电影:", max_movie["title"], ",评分:", max_movie["rating"])
+print("评分最低电影:", min_movie["title"], ",评分:", min_movie["rating"])
+
+# (2) Number of movies per genre. A "genre" value may hold several
+# comma-separated names; every non-empty name is counted.
+genre_dict = Counter()
+for m in movie_list:
+    names = (g.strip() for g in m["genre"].split(","))
+    genre_dict.update(g for g in names if g)
+# dict() keeps the printed form a plain {...} mapping, not Counter({...}).
+print("\n各类型电影数量:", dict(genre_dict))
+
+# (3) Number of movies per director.
+director_dict = Counter(m["director"] for m in movie_list)
+print("\n各导演电影数量:", dict(director_dict))
+
+# (4) Number of movies released in 2020 or later.
+cnt = sum(1 for m in movie_list if m["year"] >= 2020)
+print("\n2020年(含)以后上映电影总数:", cnt)
\ No newline at end of file
diff --git a/q2_1_crawler/26062343.py b/q2_1_crawler/26062343.py
new file mode 100644
index 0000000..3ff21ab
--- /dev/null
+++ b/q2_1_crawler/26062343.py
@@ -0,0 +1,65 @@
+"""Fetch the exam movie table and save it as movies.html and movies.json."""
+import json
+
+import requests
+from bs4 import BeautifulSoup
+
+# Browser-like User-Agent request header (hard requirement of the task).
+headers = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+    ),
+}
+url = "https://exam.detr.top/exam-b/movies"
+
+# Without a timeout, requests waits indefinitely on an unresponsive server.
+response = requests.get(url, headers=headers, timeout=10)
+response.raise_for_status()
+response.encoding = "utf-8"
+
+# Save the raw page source.
+with open("movies.html", "w", encoding="utf-8") as f:
+    f.write(response.text)
+
+# Parse the table data.
+soup = BeautifulSoup(response.text, "html.parser")
+table = soup.find("table")
+if table is None:
+    raise SystemExit("no <table> element found in the downloaded page")
+# Skip the first <tr>, which is treated as the header row.
+tr_rows = table.find_all("tr")[1:]
+
+movie_list = []
+for row in tr_rows:
+    cell = row.find_all("td")
+    if not cell:
+        # Row without <td> cells (e.g. a header or spacer row): no data.
+        continue
+    info = {
+        "id": int(cell[0].text.strip()),
+        "title": cell[1].text.strip(),
+        "director": cell[2].text.strip(),
+        "year": int(cell[3].text.strip()),
+        "rating": float(cell[4].text.strip()),
+        "duration": int(cell[5].text.strip()),
+        "genre": cell[6].text.strip(),
+        "actors_count": int(cell[7].text.strip()),
+    }
+    movie_list.append(info)
+
+# Extract the data identifier from the page's first <code> element.
+code_tag = soup.find("code")
+if code_tag is None:
+    raise SystemExit("no <code> element found in the downloaded page")
+data_code = code_tag.get_text(strip=True)
+result = {
+    "data_id": data_code,
+    "movies": movie_list,
+}
+
+# Write the JSON file.
+with open("movies.json", "w", encoding="utf-8") as f:
+    json.dump(result, f, ensure_ascii=False, indent=4)
+
+print("✅ 爬取完成,两个文件已正常生成")
\ No newline at end of file
diff --git a/q2_1_crawler/movies.html b/q2_1_crawler/movies.html
new file mode 100644
index 0000000..647c8c0
--- /dev/null
+++ b/q2_1_crawler/movies.html
@@ -0,0 +1,152 @@
+
+
+
+
+
+
+ +| 编号 | +电影名 | +导演 | +上映年份 | +评分 | +时长(分钟) | +类型 | +主演数 | +
|---|---|---|---|---|---|---|---|
| 1 | +千与千寻 | +Frank Darabont | +2013 | +8.0 | +126 | +剧情 | +3 | +
| 2 | +肖申克的救赎 | +陈凯歌 | +2018 | +6.8 | +127 | +悬疑 | +2 | +
| 3 | +星际穿越 | +Robert Zemeckis | +2024 | +9.0 | +131 | +冒险 | +2 | +
| 4 | +阿甘正传 | +James Cameron | +1999 | +8.2 | +160 | +喜剧 | +5 | +
| 5 | +三傻大闹宝莱坞 | +宫崎骏 | +1996 | +9.4 | +95 | +动画 | +4 | +
| 6 | +泰坦尼克号 | +Christopher Nolan | +2008 | +8.6 | +90 | +科幻 | +3 | +
| 7 | +忠犬八公的故事 | +Lasse Hallström | +1996 | +6.8 | +168 | +喜剧 | +3 | +
| 8 | +放牛班的春天 | +Rajkumar Hirani | +2020 | +9.3 | +112 | +喜剧 | +5 | +
| 9 | +盗梦空间 | +Christophe Barratier | +2005 | +9.1 | +154 | +剧情 | +4 | +
| 10 | +霸王别姬 | +Christopher Nolan | +2015 | +8.7 | +103 | +剧情 | +5 | +