"""Movie listing crawler and simple statistics over the crawled records.

This module holds the contents of two scripts that were added together:

* ``crawl.py`` -- downloads the movie listing page, saves the raw HTML to
  ``movies.html`` and the parsed records to ``movies.json``.
* ``data_analysis.py`` -- reads ``movies.json`` and prints the lowest/highest
  rated movie plus per-genre and per-director counts.

The same change also added a binary archive, ``q3_1_image_labels.zip``; its
contents are not part of this module.
"""
import json
from collections import Counter

# ---------------------------------------------------------------------------
# crawl.py
# ---------------------------------------------------------------------------

head = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/129.0.0.0 Safari/537.36"
}
web_url = "https://exam.detr.top/exam-b/movies"

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script forever.
REQUEST_TIMEOUT = 10


def parse_info_line(info_text):
    """Split one movie's info line into its named fields.

    The line is expected to hold at least five ``/``-separated fields, in
    the order: director, year, duration, genre, actor count.

    Args:
        info_text: The raw info line taken from the page.

    Returns:
        A dict with the keys ``director``, ``year``, ``duration``, ``genre``
        (all stripped strings) and ``actors_count`` (an ``int``).

    Raises:
        ValueError: If fewer than five fields are present, or the actor
            count is not an integer.
    """
    fields = [part.strip() for part in info_text.split("/")]
    if len(fields) < 5:
        raise ValueError(
            f"expected at least 5 '/'-separated fields, got {len(fields)}: {info_text!r}"
        )
    director, year, duration, genre, actor_count = fields[:5]
    return {
        "director": director,
        "year": year,
        "duration": duration,
        "genre": genre,
        "actors_count": int(actor_count),
    }


def parse_movie(movie):
    """Build one movie record from a ``<div class="item">`` element.

    Args:
        movie: A BeautifulSoup tag for a single movie entry.

    Returns:
        A dict with the keys ``id``, ``title``, ``director``, ``year``,
        ``rating``, ``duration``, ``genre`` and ``actors_count``, in that
        order (the order is what ends up in ``movies.json``).
    """
    # Only the first line of the "bd" block carries the slash-separated info.
    info_line = movie.find("div", class_="bd").get_text().strip().split("\n")[0]
    info = parse_info_line(info_line)
    return {
        "id": movie.get("data-id"),
        "title": movie.find("span", class_="title").get_text().strip(),
        "director": info["director"],
        "year": info["year"],
        # The numeric score is read from the <p class="quote"> element.
        "rating": float(movie.find("p", class_="quote").get_text().strip()),
        "duration": info["duration"],
        "genre": info["genre"],
        "actors_count": info["actors_count"],
    }


def crawl():
    """Download the listing page and write ``movies.html`` and ``movies.json``.

    Returns:
        The list of parsed movie records that was written to ``movies.json``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If a movie entry's info line cannot be parsed.
    """
    # Imported here so that the analysis part of this module keeps working
    # with only the standard library installed.
    import requests
    from bs4 import BeautifulSoup

    res = requests.get(web_url, headers=head, timeout=REQUEST_TIMEOUT)
    # Fail loudly instead of saving and parsing an error page.
    res.raise_for_status()
    res.encoding = "utf-8"
    source_html = res.text

    with open("movies.html", "w", encoding="utf-8") as html_file:
        html_file.write(source_html)

    html_parse = BeautifulSoup(source_html, "html.parser")
    movie_info = [
        parse_movie(movie) for movie in html_parse.find_all("div", class_="item")
    ]

    with open("movies.json", "w", encoding="utf-8") as json_file:
        json.dump(movie_info, json_file, ensure_ascii=False, indent=2)

    print("爬虫抓取完成,已生成movies.html、movies.json两个文件")
    return movie_info


# ---------------------------------------------------------------------------
# data_analysis.py
# ---------------------------------------------------------------------------


def rating_extremes(movies):
    """Return the lowest- and highest-rated movie records.

    Args:
        movies: A list of movie dicts, each with a ``rating`` key.

    Returns:
        A ``(lowest, highest)`` tuple, or ``None`` when ``movies`` is empty.
        The sort is stable, so among equal ratings the lowest is the first
        such record and the highest is the last such record in input order.
    """
    if not movies:
        return None
    ranked = sorted(movies, key=lambda film: film["rating"])
    return ranked[0], ranked[-1]


def count_by(movies, field):
    """Count how many movies share each value of ``field``.

    Args:
        movies: A list of movie dicts.
        field: The key to group by, e.g. ``"genre"`` or ``"director"``.

    Returns:
        A plain dict mapping each value to its count, with keys in order of
        first appearance.
    """
    # Converted back to dict so that printing it shows "{...}" rather than
    # "Counter({...})".
    return dict(Counter(film[field] for film in movies))


def analyze():
    """Read ``movies.json`` and print rating, genre and director statistics."""
    with open("movies.json", "r", encoding="utf-8") as read_file:
        movie_data = json.load(read_file)

    extremes = rating_extremes(movie_data)
    if extremes is None:
        # An empty movies.json used to crash with IndexError here.
        print("1、没有电影数据,无法统计评分最低/最高电影")
    else:
        min_score_movie, max_score_movie = extremes
        print("1、评分最低电影:", min_score_movie["title"], ",评分:", min_score_movie["rating"])
        print(" 评分最高电影:", max_score_movie["title"], ",评分:", max_score_movie["rating"])

    print("\n2、各类型电影统计:", count_by(movie_data, "genre"))
    print("\n3、各导演电影数量:", count_by(movie_data, "director"))

    # NOTE(review): the visible source ends right after this initialisation;
    # the counter is never updated or printed. Presumably the start of a
    # "released after 2020" tally -- TODO confirm and complete.
    count_after_2020 = 0


def main():
    """Run the crawler, then the analysis on the file it produced."""
    crawl()
    analyze()


if __name__ == "__main__":
    main()