上传文件至 q2_1_crawler
爬虫代码第一题
This commit is contained in:
57
q2_1_crawler/q2_1.py
Normal file
57
q2_1_crawler/q2_1.py
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
def task_1_scrape():
    """Download the movie listing page and export its table as JSON.

    Fetches ``https://exam.detr.top/exam-b/movies``, saves the raw HTML to
    ``movies.html``, parses the first ``<table>`` found in the page and
    writes one record per data row to ``movies.json``. Both files are
    created relative to the current working directory.

    Each record has the keys ``id``, ``title``, ``director``, ``year``,
    ``rating``, ``duration``, ``genre`` and ``actors_count``, taken from the
    first eight ``<td>`` cells of a row, in that order.

    Returns:
        None. Progress and failures are reported with ``print``; no
        exception propagates to the caller.
    """
    url = "https://exam.detr.top/exam-b/movies"

    headers = {
        # Adjacent string literals are joined verbatim, so every fragment
        # except the last must end with an explicit space; otherwise the
        # product tokens of the User-Agent run together.
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/129.0.0.0 Safari/537.36"
    }

    # Number of <td> cells a row must have to be converted into a record.
    expected_columns = 8

    try:
        # Without a timeout, requests waits forever on an unresponsive server.
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'

        if response.status_code != 200:
            print(f"[错误]请求失败,状态码:{response.status_code}")
            return

        html_content = response.text

        with open("movies.html", "w", encoding="utf-8") as f:
            f.write(html_content)
        print("[成功]已保存movies.html")

        soup = BeautifulSoup(html_content, 'html.parser')
        table = soup.find('table')
        if table is None:
            # Report the missing table explicitly instead of letting an
            # AttributeError on None surface as an opaque message.
            print("[错误]页面中未找到表格")
            return

        movies_data = []

        # The first <tr> is treated as the header row and skipped.
        for row in table.find_all('tr')[1:]:
            cols = row.find_all('td')
            # Skip rows that cannot supply all fields (e.g. rows made of
            # <th> cells or spacer rows) rather than failing on an index.
            if len(cols) < expected_columns:
                continue

            cells = [col.get_text(strip=True) for col in cols]
            movies_data.append({
                "id": int(cells[0]),
                "title": cells[1],
                "director": cells[2],
                "year": int(cells[3]),
                "rating": float(cells[4]),
                "duration": int(cells[5]),
                "genre": cells[6],
                "actors_count": int(cells[7]),
            })

        # Printed after the loop so the count reflects the parsed rows.
        print(f"DEBUG:我抓到了{len(movies_data)}个电影数据")

        with open("movies.json", "w", encoding="utf-8") as f:
            json.dump(movies_data, f, ensure_ascii=False, indent=4)
        print(f"[成功]已抓取{len(movies_data)}部电影并保存至movies.json")

    except Exception as e:
        # Top-level boundary of the script: report any network, file or
        # conversion error instead of crashing, as the original did.
        print(f"[异常]发生错误:{e}")
|
||||||
Reference in New Issue
Block a user