"""Scrape the movie table from the exam site and save it as HTML and JSON.

Origin: patch 8e12bbc2817592dadfb6a2a31518184d70c97161, which creates
``q2_1_crawler/q2_1.py`` ("Upload files to q2_1_crawler" -- crawler
exercise, question 1).
"""
import json
import os

import requests
from bs4 import BeautifulSoup

# Number of <td> cells a row must contain to be turned into a movie record
# (the parser reads cells 0 through 7).
_EXPECTED_COLUMNS = 8


def _parse_movies(html_content):
    """Parse the first ``<table>`` of *html_content* into movie dicts.

    The first ``<tr>`` is treated as the header and skipped. Rows without
    any ``<td>`` cell are ignored; rows with some cells but fewer than
    ``_EXPECTED_COLUMNS`` are skipped with a warning instead of aborting
    the whole scrape.

    Args:
        html_content: HTML text of the movies page.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``director``,
        ``year``, ``rating``, ``duration``, ``genre`` and ``actors_count``.

    Raises:
        ValueError: If the page has no ``<table>``, or a numeric cell
            cannot be converted with ``int``/``float``.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table')
    if table is None:
        # find() returns None when nothing matches; fail with a clear
        # message instead of an AttributeError on None.
        raise ValueError("页面中未找到表格")

    movies_data = []
    for row in table.find_all('tr')[1:]:
        cols = row.find_all('td')
        if not cols:
            continue
        if len(cols) < _EXPECTED_COLUMNS:
            print(f"[警告]跳过列数不足的行:{len(cols)}列")
            continue

        cells = [col.get_text(strip=True) for col in cols]
        movies_data.append({
            "id": int(cells[0]),
            "title": cells[1],
            "director": cells[2],
            "year": int(cells[3]),
            "rating": float(cells[4]),
            "duration": int(cells[5]),
            "genre": cells[6],
            "actors_count": int(cells[7]),
        })
    return movies_data


def task_1_scrape():
    """Download the movies page and store it as movies.html and movies.json.

    The raw HTML is written to ``movies.html`` in the current working
    directory, then the table is parsed and the records are written to
    ``movies.json`` (UTF-8, non-ASCII kept as-is, 4-space indent).

    Progress and errors are reported with ``print``; the function returns
    ``None`` and does not propagate exceptions.
    """
    url = "https://exam.detr.top/exam-b/movies"

    # Adjacent literals are joined verbatim, so each piece must carry its
    # own trailing space to form a well-formed User-Agent value.
    headers = {
        "user-agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/129.0.0.0 Safari/537.36"
        ),
    }

    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'

        if response.status_code != 200:
            print(f"[错误]请求失败,状态码:{response.status_code}")
            return

        html_content = response.text

        with open("movies.html", "w", encoding="utf-8") as f:
            f.write(html_content)
        print("[成功]已保存movies.html")

        movies_data = _parse_movies(html_content)

        # Reported after parsing so the count reflects the scraped rows.
        print(f"DEBUG:我抓到了{len(movies_data)}个电影数据")

        with open("movies.json", "w", encoding="utf-8") as f:
            json.dump(movies_data, f, ensure_ascii=False, indent=4)
        print(f"[成功]已抓取{len(movies_data)}部电影并保存至movies.json")

    except Exception as e:
        # Top-level boundary of the script: network, file and parsing
        # errors are all reported here so the function never raises.
        print(f"[异常]发生错误:{e}")