From a44027c78b45b2dbb3f157659aaa751154f712f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=83=A1=E7=BA=A2=E8=BE=89?= <2509165036@student.example.com> Date: Tue, 23 Jun 2026 11:00:03 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=A0=E9=99=A4=20https:/gitea.detr.top/2509?= =?UTF-8?q?165036/simulated-examination/src/branch/main/q2=5F1=5Fcrawler/q?= =?UTF-8?q?2=5F1.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/branch/main/q2_1_crawler/q2_1.py | 50 ------------------- 1 file changed, 50 deletions(-) delete mode 100644 https:/gitea.detr.top/2509165036/simulated-examination/src/branch/main/q2_1_crawler/q2_1.py diff --git a/https:/gitea.detr.top/2509165036/simulated-examination/src/branch/main/q2_1_crawler/q2_1.py b/https:/gitea.detr.top/2509165036/simulated-examination/src/branch/main/q2_1_crawler/q2_1.py deleted file mode 100644 index 4e32424..0000000 --- a/https:/gitea.detr.top/2509165036/simulated-examination/src/branch/main/q2_1_crawler/q2_1.py +++ /dev/null @@ -1,50 +0,0 @@ -import requests -from bs4 import BeautifulSoup -import json - -# 1. 配置请求头(题目强制要求必须带检测头) -url = "https://exam.detr.top/exam-b/movies" -headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/125.0.0.0 Safari/537.36" -} - -# 仅一次请求,一次性拿到页面全部数据(符合“一次爬取获取所有数据”得分要求) -resp = requests.get(url, headers=headers, timeout=10) -resp.encoding = "utf-8" -html_text = resp.text - -# 保存原始网页源码 movies.html -with open("movies.html", "w", encoding="utf-8") as f: - f.write(html_text) - -# 解析页面 -soup = BeautifulSoup(html_text, "html.parser") -# 获取数据编号 actors_count -count_tag = soup.find("span", id="actors_count") -actors_count = count_tag.get_text(strip=True) if count_tag else "0" - -# 提取全部10条电影 -movie_items = soup.select(".movie-item") -movie_list = [] -for item in movie_items: - movie = { - "id": item.get("data-id", ""), - "title": item.select_one(".title").get_text(strip=True) if item.select_one(".title") else "", - "director": item.select_one(".director").get_text(strip=True) if item.select_one(".director") else "", - "year": item.select_one(".year").get_text(strip=True) if item.select_one(".year") else "", - "rating": float(item.select_one(".rating").get_text(strip=True)) if item.select_one(".rating") else 0.0, - "duration": item.select_one(".duration").get_text(strip=True) if item.select_one(".duration") else "", - "genre": item.select_one(".genre").get_text(strip=True) if item.select_one(".genre") else "" - } - movie_list.append(movie) - -# 组装json数据 -result_data = { - "actors_count": actors_count, - "movies": movie_list -} -# 写入movies.json -with open("movies.json", "w", encoding="utf-8") as f: - json.dump(result_data, f, ensure_ascii=False, indent=2) - -print(f"爬取完成,共抓取{len(movie_list)}部电影,已生成 movies.html、movies.json") \ No newline at end of file