This commit is contained in:
2509165008
2026-06-25 15:54:43 +08:00
parent b91c3c9f4f
commit 96d521be98
30 changed files with 428 additions and 737 deletions

102
q2_1_crawler/move.html Normal file
View File

@@ -0,0 +1,102 @@
[
{
"id": "1",
"title": "泰坦尼克号",
"director": "Frank Darabont",
"year": "2015",
"rating": "6.8",
"duration": "91",
"genre": "科幻",
"actors_count": "3"
},
{
"id": "2",
"title": "星际穿越",
"director": "陈凯歌",
"year": "2021",
"rating": "6.2",
"duration": "113",
"genre": "科幻",
"actors_count": "2"
},
{
"id": "3",
"title": "三傻大闹宝莱坞",
"director": "Robert Zemeckis",
"year": "2004",
"rating": "7.4",
"duration": "95",
"genre": "悬疑",
"actors_count": "4"
},
{
"id": "4",
"title": "阿甘正传",
"director": "James Cameron",
"year": "2013",
"rating": "6.9",
"duration": "93",
"genre": "爱情",
"actors_count": "4"
},
{
"id": "5",
"title": "放牛班的春天",
"director": "宫崎骏",
"year": "2005",
"rating": "7.1",
"duration": "127",
"genre": "悬疑",
"actors_count": "3"
},
{
"id": "6",
"title": "千与千寻",
"director": "Christopher Nolan",
"year": "2024",
"rating": "6.4",
"duration": "147",
"genre": "动画",
"actors_count": "3"
},
{
"id": "7",
"title": "忠犬八公的故事",
"director": "Lasse Hallström",
"year": "2002",
"rating": "6.2",
"duration": "166",
"genre": "剧情",
"actors_count": "4"
},
{
"id": "8",
"title": "霸王别姬",
"director": "Rajkumar Hirani",
"year": "2005",
"rating": "7.9",
"duration": "149",
"genre": "冒险",
"actors_count": "2"
},
{
"id": "9",
"title": "肖申克的救赎",
"director": "Christophe Barratier",
"year": "2008",
"rating": "9.3",
"duration": "91",
"genre": "冒险",
"actors_count": "2"
},
{
"id": "10",
"title": "盗梦空间",
"director": "Christopher Nolan",
"year": "2019",
"rating": "7.1",
"duration": "132",
"genre": "剧情",
"actors_count": "5"
}
]

102
q2_1_crawler/movie.json Normal file
View File

@@ -0,0 +1,102 @@
[
{
"id": "1",
"title": "泰坦尼克号",
"director": "Frank Darabont",
"year": "2015",
"rating": "6.8",
"duration": "91",
"genre": "科幻",
"actors_count": "3"
},
{
"id": "2",
"title": "星际穿越",
"director": "陈凯歌",
"year": "2021",
"rating": "6.2",
"duration": "113",
"genre": "科幻",
"actors_count": "2"
},
{
"id": "3",
"title": "三傻大闹宝莱坞",
"director": "Robert Zemeckis",
"year": "2004",
"rating": "7.4",
"duration": "95",
"genre": "悬疑",
"actors_count": "4"
},
{
"id": "4",
"title": "阿甘正传",
"director": "James Cameron",
"year": "2013",
"rating": "6.9",
"duration": "93",
"genre": "爱情",
"actors_count": "4"
},
{
"id": "5",
"title": "放牛班的春天",
"director": "宫崎骏",
"year": "2005",
"rating": "7.1",
"duration": "127",
"genre": "悬疑",
"actors_count": "3"
},
{
"id": "6",
"title": "千与千寻",
"director": "Christopher Nolan",
"year": "2024",
"rating": "6.4",
"duration": "147",
"genre": "动画",
"actors_count": "3"
},
{
"id": "7",
"title": "忠犬八公的故事",
"director": "Lasse Hallström",
"year": "2002",
"rating": "6.2",
"duration": "166",
"genre": "剧情",
"actors_count": "4"
},
{
"id": "8",
"title": "霸王别姬",
"director": "Rajkumar Hirani",
"year": "2005",
"rating": "7.9",
"duration": "149",
"genre": "冒险",
"actors_count": "2"
},
{
"id": "9",
"title": "肖申克的救赎",
"director": "Christophe Barratier",
"year": "2008",
"rating": "9.3",
"duration": "91",
"genre": "冒险",
"actors_count": "2"
},
{
"id": "10",
"title": "盗梦空间",
"director": "Christopher Nolan",
"year": "2019",
"rating": "7.1",
"duration": "132",
"genre": "剧情",
"actors_count": "5"
}
]

65
q2_1_crawler/q2_1.py Normal file
View File

@@ -0,0 +1,65 @@
import requests
from bs4 import BeautifulSoup as bs
import json
# Scrape the movie table from the exam site and save the rows as JSON.
url = 'https://exam.detr.top/exam-b/movies'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/149.0.0.0 Safari/537.36 Edg/149.0.0.0',
    'Referer': 'https://exam.detr.top/exam-b/movies',
}

# Column order of the scraped table; these are also the keys of every record.
FIELDS = (
    "id",
    "title",
    "director",
    "year",
    "rating",
    "duration",
    "genre",
    "actors_count",
)

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10


def build_movie(cell_texts):
    """Map the cell texts of one table row onto the FIELDS keys.

    Args:
        cell_texts: Sequence of already-stripped cell strings, in column order.

    Returns:
        A dict keyed by FIELDS, or None when the row has fewer cells than
        FIELDS. Cells beyond the last field are ignored.
    """
    if len(cell_texts) < len(FIELDS):
        return None
    return dict(zip(FIELDS, cell_texts))


def fetch_movies():
    """Download the movie page and return its table rows as a list of dicts.

    Raises:
        requests.RequestException: on timeout, connection failure, or a
            4xx/5xx response.
    """
    resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly rather than parsing an error page into an empty list and
    # overwriting previously saved data with it.
    resp.raise_for_status()
    resp.encoding = "utf-8"
    soup = bs(resp.text, "html.parser")

    movies = []
    for tr in soup.select("table tbody tr"):
        texts = [td.get_text(strip=True) for td in tr.find_all("td")]
        movie = build_movie(texts)
        if movie is not None:
            movies.append(movie)
    return movies


def save_json(movies, path):
    """Write *movies* to *path* as pretty-printed UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(movies, f, ensure_ascii=False, indent=2)


def main():
    """Scrape the movies, echo them, and write both output files."""
    movie_list = fetch_movies()
    print(movie_list)
    save_json(movie_list, 'movie.json')
    # NOTE(review): this file gets the same JSON as movie.json despite its
    # .html name; kept as-is to preserve existing output -- confirm whether
    # the raw page HTML was intended here.
    save_json(movie_list, "move.html")


if __name__ == "__main__":
    main()
# for i in range(len(items)):
# rank=i+1
# title=items[i].find("span",class_="title").get_text()
# actors=items[i].find("div",class_="bd").get_text().strip()
# try:
# actors=actors.split("主演:")[1].split("\n")[0]
# except:
# actors="无"
# quote=items[i].find("p",class_="quote").get_text().strip()
# data.append({
# "rank":rank,
# "title":title,
# "actors":actors,
# "quote":quote
# })

43
q2_1_crawler/q2_2.py Normal file
View File

@@ -0,0 +1,43 @@
# ① 找出评分最高和最低的电影,打印电影名 + 评分。
# ② 统计各类型的电影数量,用字典格式输出。
# ③ 统计各导演的电影数量,用字典格式输出。
# ④ 统计 2020 年(含)以后上映的电影数量。
import json
def rating_extremes(movies):
    """Return the (lowest, highest) rated movies.

    Ratings are stored as strings, so they are converted to float for the
    comparison; comparing the raw strings would rank "10.0" below "6.2".
    On ties the first movie wins for lowest and the last one for highest.

    Args:
        movies: Non-empty list of dicts that each have a "rating" key.

    Raises:
        ValueError: if *movies* is empty.
    """
    if not movies:
        raise ValueError("no movies to rank")
    ranked = sorted(movies, key=lambda movie: float(movie["rating"]))
    return ranked[0], ranked[-1]


def count_by(movies, field):
    """Return a dict mapping each distinct value of *field* to its count.

    Keys appear in order of first occurrence in *movies*.
    """
    counts = {}
    for movie in movies:
        value = movie[field]
        counts[value] = counts.get(value, 0) + 1
    return counts


def count_released_since(movies, year):
    """Return how many movies have a "year" greater than or equal to *year*."""
    return sum(1 for movie in movies if int(movie["year"]) >= year)


def main():
    """Load movie.json and print the four requested statistics."""
    with open('movie.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    if data:
        lowest, highest = rating_extremes(data)
        print("评分最低的电影", lowest["title"], lowest["rating"])
        print("评分最高的电影", highest["title"], highest["rating"])
    else:
        # An empty dataset has no extremes; still report the (empty) counts.
        print("没有电影数据")

    print("各类型的电影数量", count_by(data, "genre"))
    print("各导演的电影数量", count_by(data, "director"))
    print("2020 年(含)以后上映的电影数量", count_released_since(data, 2020))


if __name__ == "__main__":
    main()