上传文件至 /

2026-06-23 11:12:36 +08:00
parent 16397e49cd
commit 3e821217d2
3 changed files with 70 additions and 0 deletions
--- a/crawl.py
+++ b/crawl.py
@@ -0,0 +1,44 @@
+import requests
+from bs4 import BeautifulSoup
+import json
+# Fetch the movie list page, save the raw HTML, and export one JSON record per movie.
+head = {
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/129.0.0.0 Safari/537.36"
+}
+web_url = "https://exam.detr.top/exam-b/movies"
+# Bound the wait and stop on HTTP errors so an error page is never saved or parsed.
+res = requests.get(web_url, headers=head, timeout=10)
+res.raise_for_status()
+res.encoding = "utf-8"
+source_html = res.text
+# Keep an unmodified copy of the page next to the parsed output.
+with open("movies.html", "w", encoding="utf-8") as html_file:
+    html_file.write(source_html)
+# Every <div class="item"> element is treated as one movie entry.
+html_parse = BeautifulSoup(source_html, "html.parser")
+all_movie = html_parse.find_all("div", class_="item")
+movie_info = []
+for movie in all_movie:
+    m_title = movie.find("span", class_="title").get_text().strip()
+    # The first line of the "bd" block is split on "/" into five positional fields.
+    info_text = movie.find("div", class_="bd").get_text().strip().split("\n")[0]
+    split_data = [text.strip() for text in info_text.split("/")]
+    actor_num = int(split_data[4])
+    # The rating is read from the <p class="quote"> element.
+    score = float(movie.find("p", class_="quote").get_text().strip())
+    single_movie = {
+        "id": movie.get("data-id"),
+        "title": m_title,
+        "director": split_data[0],
+        "year": split_data[1],
+        "rating": score,
+        "duration": split_data[2],
+        "genre": split_data[3],
+        "actors_count": actor_num,
+    }
+    movie_info.append(single_movie)
+# ensure_ascii=False keeps non-ASCII titles readable in the JSON file.
+with open("movies.json", "w", encoding="utf-8") as json_file:
+    json.dump(movie_info, json_file, ensure_ascii=False, indent=2)
+
+print("爬虫抓取完成，已生成movies.html、movies.json两个文件")
--- a/data_analysis.py
+++ b/data_analysis.py
@@ -0,0 +1,26 @@
+import json
+# Load the scraped records and print rating extremes plus per-genre and per-director counts.
+# Input: movies.json, a list of dicts with "title", "rating", "genre" and "director" keys.
+with open("movies.json", "r", encoding="utf-8") as read_file:
+    movie_data = json.load(read_file)
+# Ascending sort: the first entry has the lowest rating, the last entry the highest.
+rank_movie = sorted(movie_data, key=lambda x: x["rating"])
+if rank_movie:  # an empty movies.json has no extremes to report
+    min_score_movie = rank_movie[0]
+    max_score_movie = rank_movie[-1]
+    print("1、评分最低电影：", min_score_movie["title"], "，评分：", min_score_movie["rating"])
+    print("   评分最高电影：", max_score_movie["title"], "，评分：", max_score_movie["rating"])
+# Tally how many movies carry each exact genre string.
+genre_count_dict = {}
+for film in movie_data:
+    film_type = film["genre"]
+    genre_count_dict[film_type] = genre_count_dict.get(film_type, 0) + 1
+print("\n2、各类型电影统计：", genre_count_dict)
+# Tally how many movies carry each exact director string.
+director_count_dict = {}
+for film in movie_data:
+    dir_name = film["director"]
+    director_count_dict[dir_name] = director_count_dict.get(dir_name, 0) + 1
+print("\n3、各导演电影数量：", director_count_dict)
+# NOTE(review): initialised but never used in the visible script; presumably a pending "after 2020" count.
+count_after_2020 = 0
--- a/q3_1_image_labels.zip
+++ b/q3_1_image_labels.zip