finish

2026-06-23 11:22:41 +08:00
parent 1e00974a79
commit 74906f0596
4 changed files with 337 additions and 0 deletions
--- a/q2_1_crawler/q2_1.py
+++ b/q2_1_crawler/q2_1.py
@@ -0,0 +1,64 @@
+import requests
+import json
+from bs4 import BeautifulSoup
+
+# Scrape the movie table from the exam page: save the raw HTML to
+# movies.html and the parsed rows to movies.json.
+
+# Browser-like User-Agent sent with the request.
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+}
+
+url = "https://exam.detr.top/exam-b/movies"
+
+# Seconds to wait for the server; without a timeout requests.get() can
+# block forever on an unresponsive host.
+REQUEST_TIMEOUT = 10
+
+try:
+    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
+    # Raise on 4xx/5xx so an error page is never saved or parsed as data.
+    response.raise_for_status()
+    response.encoding = 'utf-8'
+    html_content = response.text
+    with open('movies.html', 'w', encoding='utf-8') as f_html:
+        f_html.write(html_content)
+
+    soup = BeautifulSoup(html_content, 'html.parser')
+    table = soup.find('table')
+    if table is None:
+        raise ValueError("页面中未找到 <table> 元素")
+    # html.parser does not insert a missing <tbody>, so read rows from the
+    # table itself when the markup has none.
+    tbody = table.find('tbody')
+    rows = (table if tbody is None else tbody).find_all('tr')
+
+    movies_data = []
+
+    for row in rows:
+        tds = row.find_all('td')
+        # Skip rows without the eight expected <td> cells (e.g. a <th> header row).
+        if len(tds) < 8:
+            continue
+        cells = [td.text.strip() for td in tds[:8]]
+        movies_data.append({
+            "id": cells[0],
+            "title": cells[1],
+            "director": cells[2],
+            "year": int(cells[3]),
+            "rating": float(cells[4]),
+            "duration": int(cells[5]),
+            "genre": cells[6],
+            "actors_count": int(cells[7]),
+        })
+
+    with open('movies.json', 'w', encoding='utf-8') as f_json:
+        json.dump(movies_data, f_json, ensure_ascii=False, indent=4)
+
+    print(f"爬取成功！共获取 {len(movies_data)} 条电影数据。")
+    print("文件 movies.html 和 movies.json 已保存。")
+
+except Exception as e:
+    # Top-level boundary of the script: report the failure instead of a traceback.
+    print(f"爬取或解析失败，错误信息：{e}")