上传文件至 q2_1_crawler

爬虫代码第一题
2026-06-23 11:11:41 +08:00
parent ee1b6ec298
commit 8e12bbc281
1 changed files with 57 additions and 0 deletions
--- a/q2_1_crawler/q2_1.py
+++ b/q2_1_crawler/q2_1.py
@@ -0,0 +1,57 @@
+import requests
+from bs4 import BeautifulSoup
+import json
+import os
+
+def task_1_scrape():
+    """Download the movies page and save it as HTML and JSON.
+
+    Writes the raw markup to ``movies.html``, parses the first ``<table>``
+    and dumps one dict per data row to ``movies.json``. Progress and
+    errors are printed rather than raised; nothing is returned.
+    """
+    url = "https://exam.detr.top/exam-b/movies"
+    # Fragments are concatenated as-is, so each needs a trailing space.
+    headers = {
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/129.0.0.0 Safari/537.36"
+    }
+    try:
+        # A timeout keeps an unresponsive server from hanging the script.
+        response = requests.get(url, headers=headers, timeout=10)
+        response.encoding = 'utf-8'
+        if response.status_code != 200:
+            print(f"[错误]请求失败,状态码：{response.status_code}")
+            return
+        html_content = response.text
+        with open("movies.html", "w", encoding="utf-8") as f:
+            f.write(html_content)
+        print("[成功]已保存movies.html")
+
+        table = BeautifulSoup(html_content, 'html.parser').find('table')
+        if table is None:  # find() returns None when the page has no <table>
+            print("[错误]页面中未找到table元素")
+            return
+        movies_data = []
+        for row in table.find_all('tr')[1:]:  # skip first row (presumably header)
+            cols = [td.get_text(strip=True) for td in row.find_all('td')]
+            if len(cols) < 8:  # skip rows that lack any of the eight fields
+                continue
+            movies_data.append({
+                "id": int(cols[0]),
+                "title": cols[1],
+                "director": cols[2],
+                "year": int(cols[3]),
+                "rating": float(cols[4]),
+                "duration": int(cols[5]),
+                "genre": cols[6],
+                "actors_count": int(cols[7]),
+            })
+        # Report the count only after parsing; before the loop it is always 0.
+        print(f"DEBUG:我抓到了{len(movies_data)}个电影数据")
+        with open("movies.json", "w", encoding="utf-8") as f:
+            json.dump(movies_data, f, ensure_ascii=False, indent=4)
+        print(f"[成功]已抓取{len(movies_data)}部电影并保存至movies.json")
+    except Exception as e:
+        print(f"[异常]发生错误：{e}")