上传文件至 /
This commit is contained in:
44
crawl.py
Normal file
44
crawl.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
# crawl.py -- download the movie listing page, keep a raw copy of the HTML,
# and export the fields parsed from every movie entry to a JSON file.
import json

import requests
from bs4 import BeautifulSoup

head = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/129.0.0.0 Safari/537.36"
}
web_url = "https://exam.detr.top/exam-b/movies"

# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops an HTTP error page from being saved and parsed
# as if it were the movie listing.
res = requests.get(web_url, headers=head, timeout=10)
res.raise_for_status()
res.encoding = "utf-8"
source_html = res.text

# Keep the raw page on disk; the context manager closes the file even if the
# write fails.
with open("movies.html", "w", encoding="utf-8") as html_file:
    html_file.write(source_html)

html_parse = BeautifulSoup(source_html, "html.parser")
all_movie = html_parse.find_all("div", class_="item")

movie_info = []
for movie in all_movie:
    movie_id = movie.get("data-id")
    m_title = movie.find("span", class_="title").get_text().strip()

    # Only the first text line of the "bd" block is used. It is split on "/"
    # and the pieces are read positionally as:
    #   director / year / duration / genre / actor count
    info_block = movie.find("div", class_="bd")
    info_text = info_block.get_text().strip().split("\n")[0]
    split_data = [text.strip() for text in info_text.split("/")]
    director = split_data[0]
    year = split_data[1]
    duration = split_data[2]
    genre = split_data[3]
    actor_num = int(split_data[4])

    # The rating is read from the text of the <p class="quote"> element.
    score = float(movie.find("p", class_="quote").get_text().strip())

    single_movie = {
        "id": movie_id,
        "title": m_title,
        "director": director,
        "year": year,
        "rating": score,
        "duration": duration,
        "genre": genre,
        "actors_count": actor_num,
    }
    movie_info.append(single_movie)

# ensure_ascii=False keeps non-ASCII titles readable in the output file.
with open("movies.json", "w", encoding="utf-8") as json_file:
    json.dump(movie_info, json_file, ensure_ascii=False, indent=2)

print("爬虫抓取完成,已生成movies.html、movies.json两个文件")
|
||||||
26
data_analysis.py
Normal file
26
data_analysis.py
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
# data_analysis.py -- print summary statistics for the movie records stored in
# movies.json: lowest/highest rated movie, movies per genre, movies per director.
import json
from collections import Counter

# The context manager closes the file even when json.load() raises.
with open("movies.json", "r", encoding="utf-8") as read_file:
    movie_data = json.load(read_file)

# Without this guard an empty list fails below with a bare IndexError.
if not movie_data:
    raise SystemExit("movies.json 中没有电影数据,无法进行统计")

# Sort ascending by rating: the first entry is the lowest rated, the last one
# the highest rated.
rank_movie = sorted(movie_data, key=lambda x: x["rating"])
min_score_movie = rank_movie[0]
max_score_movie = rank_movie[-1]
print("1、评分最低电影:", min_score_movie["title"], ",评分:", min_score_movie["rating"])
print(" 评分最高电影:", max_score_movie["title"], ",评分:", max_score_movie["rating"])

# Counter keeps first-seen order; converting back to a plain dict keeps the
# printed representation a regular {...} mapping.
genre_count_dict = dict(Counter(film["genre"] for film in movie_data))
print("\n2、各类型电影统计:", genre_count_dict)

director_count_dict = dict(Counter(film["director"] for film in movie_data))
print("\n3、各导演电影数量:", director_count_dict)

# TODO: this counter is initialised but never updated or printed in the
# visible script; the "after 2020" statistic still needs to be implemented.
count_after_2020 = 0
|
||||||
BIN
q3_1_image_labels.zip
Normal file
BIN
q3_1_image_labels.zip
Normal file
Binary file not shown.
Reference in New Issue
Block a user