删除 q2_1_crawler/q2_1.py

This commit is contained in:
2026-06-23 11:20:15 +08:00
parent 1d1a1b8979
commit e98c2ad05b

View File

@@ -1,57 +0,0 @@
import requests
from bs4 import BeautifulSoup
import json
import os
def _parse_movie_row(cells):
    """Convert the ``<td>`` cells of one table row into a movie dict.

    The first eight cells are read positionally as id, title, director,
    year, rating, duration, genre and actors_count.

    Raises:
        IndexError: if the row has fewer than eight cells.
        ValueError: if a numeric cell does not hold a valid number.
    """
    text = [cell.get_text(strip=True) for cell in cells]
    return {
        "id": int(text[0]),
        "title": text[1],
        "director": text[2],
        "year": int(text[3]),
        "rating": float(text[4]),
        "duration": int(text[5]),
        "genre": text[6],
        "actors_count": int(text[7]),
    }


def task_1_scrape():
    """Download the movies page and export its table as JSON.

    Fetches the page, saves the raw HTML to ``movies.html``, parses the
    first ``<table>`` (skipping its first row, treated as the header) and
    writes the resulting list of movie dicts to ``movies.json``.

    Progress and failures are reported with ``print``; the function never
    raises and always returns ``None``.
    """
    url = "https://exam.detr.top/exam-b/movies"
    headers = {
        # Adjacent literals are concatenated verbatim, so each fragment
        # needs its own trailing space to form a well-formed User-Agent.
        "user-agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/129.0.0.0 Safari/537.36"
        )
    }
    try:
        # Without a timeout, requests may wait on a stalled server forever.
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        if response.status_code != 200:
            print(f"[错误]请求失败,状态码:{response.status_code}")
            return

        html_content = response.text
        with open("movies.html", "w", encoding="utf-8") as f:
            f.write(html_content)
        print("[成功]已保存movies.html")

        soup = BeautifulSoup(html_content, 'html.parser')
        table = soup.find('table')
        if table is None:
            # find() returns None when nothing matches; report that
            # explicitly instead of surfacing an opaque AttributeError.
            print("[错误]页面中未找到表格")
            return

        movies_data = []
        for row in table.find_all('tr')[1:]:
            cols = row.find_all('td')
            if cols:  # rows without <td> cells carry no movie data
                movies_data.append(_parse_movie_row(cols))
        # Reported after parsing so the count reflects the collected rows.
        print(f"DEBUG:我抓到了{len(movies_data)}个电影数据")

        with open("movies.json", "w", encoding="utf-8") as f:
            json.dump(movies_data, f, ensure_ascii=False, indent=4)
        print(f"[成功]已抓取{len(movies_data)}部电影并保存至movies.json")
    except Exception as e:
        # Script-level boundary: report any failure instead of crashing.
        print(f"[异常]发生错误:{e}")