"""Download the movie listing page and export its entries as JSON.

Running this script fetches ``WEB_URL``, stores the raw page in
``movies.html`` and writes one JSON object per movie to ``movies.json``.
"""
import json

import requests
from bs4 import BeautifulSoup

HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/129.0.0.0 Safari/537.36"
}
WEB_URL = "https://exam.detr.top/exam-b/movies"
HTML_PATH = "movies.html"
JSON_PATH = "movies.json"
# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10
# The info line is read positionally:
# director / year / duration / genre / actor count.
INFO_FIELD_COUNT = 5


def fetch_html(url=WEB_URL):
    """Return the body of *url* decoded as UTF-8.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so
            an error page is never saved or parsed as if it were the listing.
        requests.RequestException: on connection failures or timeouts.
    """
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Force UTF-8 instead of relying on the encoding requests would guess.
    response.encoding = "utf-8"
    return response.text


def _parse_movie(item):
    """Build the record dict for a single ``<div class="item">`` element.

    Raises:
        ValueError: if the info line has fewer than ``INFO_FIELD_COUNT``
            "/"-separated fields, or a numeric field cannot be converted.
    """
    movie_id = item.get("data-id")
    title = item.find("span", class_="title").get_text().strip()

    # Only the first line of the "bd" block's text holds the "/"-separated
    # metadata; anything after the first newline is ignored.
    info_line = item.find("div", class_="bd").get_text().strip().split("\n")[0]
    fields = [part.strip() for part in info_line.split("/")]
    if len(fields) < INFO_FIELD_COUNT:
        raise ValueError(
            f"movie {movie_id!r}: expected at least {INFO_FIELD_COUNT} "
            f"'/'-separated fields, got {len(fields)}: {info_line!r}"
        )
    # Extra trailing fields, if any, are ignored.
    director, year, duration, genre, actor_count = fields[:INFO_FIELD_COUNT]

    actors_count = int(actor_count)
    # The rating is read from the text of the <p class="quote"> element.
    rating = float(item.find("p", class_="quote").get_text().strip())

    return {
        "id": movie_id,
        "title": title,
        "director": director,
        "year": year,
        "rating": rating,
        "duration": duration,
        "genre": genre,
        "actors_count": actors_count,
    }


def parse_movies(source_html):
    """Return a list of movie dicts, one per ``<div class="item">`` in the page."""
    soup = BeautifulSoup(source_html, "html.parser")
    return [_parse_movie(item) for item in soup.find_all("div", class_="item")]


def main():
    """Fetch the page, save the raw HTML, and dump the parsed movies as JSON."""
    source_html = fetch_html()

    # Context managers close the files even if writing or parsing fails.
    with open(HTML_PATH, "w", encoding="utf-8") as html_file:
        html_file.write(source_html)

    movies = parse_movies(source_html)

    with open(JSON_PATH, "w", encoding="utf-8") as json_file:
        json.dump(movies, json_file, ensure_ascii=False, indent=2)

    print("爬虫抓取完成,已生成movies.html、movies.json两个文件")


if __name__ == "__main__":
    main()