"""Crawl the exam movie page once and export it as movies.html / movies.json.

Source path of this script: q2_1_crawler/q2_1.py

The page is fetched with a single HTTP request. The raw HTML is written to
``movies.html`` and the parsed data (the ``actors_count`` value plus one
record per ``.movie-item`` element) is written to ``movies.json``.
"""
import json

import requests
from bs4 import BeautifulSoup

URL = "https://exam.detr.top/exam-b/movies"

# The assignment requires the request to carry this detection header.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/125.0.0.0 Safari/537.36"
}

# Output key -> CSS selector for the plain-text fields of one movie entry.
TEXT_FIELDS = {
    "title": ".title",
    "director": ".director",
    "year": ".year",
    "duration": ".duration",
    "genre": ".genre",
}


def fetch_html(url: str = URL, timeout: float = 10) -> str:
    """Download *url* with a single request and return the body decoded as UTF-8.

    Raises:
        requests.RequestException: on connection problems, timeouts, or a
            4xx/5xx response. Failing here keeps an error page from being
            saved and parsed as if it were the movie list.
    """
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    resp.encoding = "utf-8"
    return resp.text


def text_of(node, selector: str) -> str:
    """Return the stripped text of the first match of *selector* under *node*.

    Returns "" when nothing matches. The selector is evaluated once.
    """
    tag = node.select_one(selector)
    return tag.get_text(strip=True) if tag is not None else ""


def parse_rating(text: str) -> float:
    """Convert rating text to a float, falling back to 0.0 if it is not numeric.

    0.0 is the same default used when the rating element is missing, so one
    malformed entry (empty text, "N/A", ...) no longer aborts the whole crawl.
    """
    try:
        return float(text)
    except ValueError:
        return 0.0


def parse_movie(item) -> dict:
    """Build the record for a single ``.movie-item`` element."""
    movie = {"id": item.get("data-id", "")}
    for key, selector in TEXT_FIELDS.items():
        movie[key] = text_of(item, selector)
    movie["rating"] = parse_rating(text_of(item, ".rating"))
    # Emit the keys in the established order of the JSON output.
    ordered_keys = ("id", "title", "director", "year", "rating", "duration", "genre")
    return {key: movie[key] for key in ordered_keys}


def parse_page(html_text: str) -> dict:
    """Parse the page HTML into the structure written to ``movies.json``."""
    soup = BeautifulSoup(html_text, "html.parser")

    # Data number shown on the page; "0" when the element is absent.
    count_tag = soup.find("span", id="actors_count")
    actors_count = count_tag.get_text(strip=True) if count_tag is not None else "0"

    # Every movie entry on the page (the original comment expects 10 of them).
    movies = [parse_movie(item) for item in soup.select(".movie-item")]
    return {"actors_count": actors_count, "movies": movies}


def main() -> None:
    """Fetch the page once, then write ``movies.html`` and ``movies.json``."""
    # One request only: the whole data set comes from this single response
    # (the assignment scores "fetch all data in one crawl").
    html_text = fetch_html()

    # Keep the raw page source.
    with open("movies.html", "w", encoding="utf-8") as f:
        f.write(html_text)

    result_data = parse_page(html_text)
    with open("movies.json", "w", encoding="utf-8") as f:
        json.dump(result_data, f, ensure_ascii=False, indent=2)

    movie_list = result_data["movies"]
    print(f"爬取完成,共抓取{len(movie_list)}部电影,已生成 movies.html、movies.json")


if __name__ == "__main__":
    main()