上传文件至 q2_1_crawler
This commit is contained in:
39
q2_1_crawler/q2_1.py
Normal file
39
q2_1_crawler/q2_1.py
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
import requests
|
||||||
|
import json
|
||||||
|
|
||||||
|
# 1. Request headers (browser-like test headers, as the exercise requires).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Referer": "https://exam.detr.top/",
}

url = "https://exam.detr.top/exam-b/movies"

# 2. Fetch all data with one request (the exercise asks for a single fetch).
# requests applies no timeout by default, so an unresponsive server would
# block the script forever; bound the wait explicitly.
resp = requests.get(url, headers=headers, timeout=10)
resp.raise_for_status()  # raise requests.HTTPError on a 4xx/5xx response
|
||||||
|
|
||||||
|
# Keep the raw response body on disk as movies.html.
with open("movies.html", mode="w", encoding="utf-8") as html_file:
    html_file.write(resp.text)

# Decode the JSON payload returned by the endpoint.
movie_data = resp.json()
|
||||||
|
|
||||||
|
# Select the movies (10 are expected) that carry every required field:
# id, title, director, year, rating, duration, genre, actors_count.
needed_keys = ["id", "title", "director", "year", "rating", "duration", "genre", "actors_count"]

# A record is kept only when none of the required keys is missing from it.
valid_movies = [
    record
    for record in movie_data
    if not any(field not in record for field in needed_keys)
]
|
||||||
|
|
||||||
|
# Persist the validated movies as pretty-printed, non-ASCII-escaped JSON.
with open("movies.json", mode="w", encoding="utf-8") as json_file:
    json.dump(valid_movies, json_file, indent=2, ensure_ascii=False)

# Report what was produced and how many movies were kept.
print("抓取完成:已生成 movies.html 和 movies.json")
print(f"共抓取到 {len(valid_movies)} 部电影")
|
||||||
|
|
||||||
|
|
||||||
55
q2_1_crawler/q2_2.py
Normal file
55
q2_1_crawler/q2_2.py
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
import json
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
# Load the movie list from movies.json into the module-level `movies`.
with open("movies.json", encoding="utf-8") as movie_file:
    movies = json.loads(movie_file.read())
|
||||||
|
|
||||||
|
# ① Find the highest- and lowest-rated movies.
def get_rating_extreme():
    """Print and return the highest- and lowest-rated movies.

    Reads the module-level ``movies`` list and compares entries by their
    ``"rating"`` value.

    Returns:
        A ``(highest, lowest)`` tuple of movie dicts, or ``(None, None)``
        when ``movies`` is empty.
    """
    if not movies:
        # Nothing to compare; indexing an empty list would raise IndexError.
        print("=== ① 评分极值 ===")
        print("暂无电影数据")
        return None, None

    def rating_of(movie):
        return movie["rating"]

    # min() keeps the first of tied minima; scanning in reverse makes max()
    # keep the last of tied maxima. This matches what a stable ascending sort
    # followed by [0] / [-1] selects, without sorting the whole list.
    lowest = min(movies, key=rating_of)
    highest = max(reversed(movies), key=rating_of)

    print("=== ① 评分极值 ===")
    print(f"评分最低电影:{lowest['title']},评分:{lowest['rating']}")
    print(f"评分最高电影:{highest['title']},评分:{highest['rating']}")
    return highest, lowest
|
||||||
|
|
||||||
|
# ② Count movies per genre (each movie's genre is a list; every entry counts).
def count_genre():
    """Print and return how often each genre appears across the movies.

    Reads the module-level ``movies`` list. Each movie's ``"genre"`` value is
    expected to be a list of genre names, and every listed entry is counted.
    A bare string is treated as one genre so that it is not split into
    individual characters.

    Returns:
        A plain dict mapping genre name to its count, in first-seen order.
    """
    genre_count = defaultdict(int)
    for movie in movies:
        genres = movie["genre"]
        if isinstance(genres, str):
            # Iterating a str would tally its characters, not the genre.
            genres = [genres]
        for g in genres:
            genre_count[g] += 1

    # Convert once so the printed and the returned mapping are the same dict.
    result = dict(genre_count)
    print("\n=== ② 各类型电影数量(字典格式)===")
    print(result)
    return result
|
||||||
|
|
||||||
|
# ③ Count movies per director.
def count_director():
    """Print and return the number of movies credited to each director.

    Reads the module-level ``movies`` list and tallies its ``"director"``
    values.

    Returns:
        A plain dict mapping director to movie count, in first-seen order.
    """
    tally = {}
    for entry in movies:
        name = entry["director"]
        tally[name] = tally.get(name, 0) + 1

    print("\n=== ③ 各导演电影数量(字典格式)===")
    print(dict(tally))
    return dict(tally)
|
||||||
|
|
||||||
|
# ④ Count movies released in 2020 or later.
def count_after_2020():
    """Print and return how many movies have a ``"year"`` of 2020 or later.

    Reads the module-level ``movies`` list; the year 2020 itself is included.

    Returns:
        The number of matching movies as an int.
    """
    cnt = sum(1 for entry in movies if entry["year"] >= 2020)

    print("\n=== ④ 2020年(含)后上映电影数量 ===")
    print(f"总数:{cnt}")
    return cnt
|
||||||
|
|
||||||
|
# Run every analysis step, in order, when executed as a script.
if __name__ == "__main__":
    for analysis in (
        get_rating_extreme,
        count_genre,
        count_director,
        count_after_2020,
    ):
        analysis()
|
||||||
Reference in New Issue
Block a user