finish
This commit is contained in:
46
q2_1_crawler/q2_1.py
Normal file
46
q2_1_crawler/q2_1.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import requests
|
||||
import json
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Page that contains the movie table to crawl.
url = "https://exam.detr.top/exam-b/movies"

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10

# Minimum number of <td> cells a row must have to be treated as a movie record.
MOVIE_COLUMN_COUNT = 8


def parse_movie_row(cells):
    """Convert the cell texts of one table row into a movie dict.

    Args:
        cells: Sequence of cell text strings in column order
            (id, title, director, year, rating, duration, genre,
            actors_count). Extra trailing cells are ignored.

    Returns:
        A dict with the eight movie fields, or ``None`` when the row has
        fewer than ``MOVIE_COLUMN_COUNT`` cells (e.g. a header or spacer row).

    Raises:
        ValueError: If a numeric column does not contain a valid number.
    """
    if len(cells) < MOVIE_COLUMN_COUNT:
        return None

    values = [cell.strip() for cell in cells[:MOVIE_COLUMN_COUNT]]
    return {
        "id": values[0],
        "title": values[1],
        "director": values[2],
        "year": int(values[3]),
        "rating": float(values[4]),
        "duration": int(values[5]),
        "genre": values[6],
        "actors_count": int(values[7]),
    }


def parse_movies(html_content):
    """Extract every movie record from the first <table> in the HTML.

    Args:
        html_content: The page markup as a string.

    Returns:
        A list of movie dicts, in document order.

    Raises:
        ValueError: If the page has no <table>, or a numeric cell is malformed.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table')
    if table is None:
        raise ValueError("页面中未找到电影数据表格 (<table>)")

    # html.parser does not insert an implicit <tbody>, so fall back to the
    # table itself when the markup omits it.
    body = table.find('tbody') or table

    movies = []
    for row in body.find_all('tr'):
        movie = parse_movie_row([td.text for td in row.find_all('td')])
        if movie is not None:
            movies.append(movie)
    return movies


def main():
    """Download the movie page, save it as movies.html and export movies.json."""
    # Top-level boundary of the script: report any failure instead of a traceback.
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        # Fail fast on 4xx/5xx instead of parsing an error page as data.
        response.raise_for_status()
        response.encoding = 'utf-8'
        html_content = response.text

        with open('movies.html', 'w', encoding='utf-8') as f_html:
            f_html.write(html_content)

        movies_data = parse_movies(html_content)

        with open('movies.json', 'w', encoding='utf-8') as f_json:
            json.dump(movies_data, f_json, ensure_ascii=False, indent=4)

        print(f"爬取成功!共获取 {len(movies_data)} 条电影数据。")
        print("文件 movies.html 和 movies.json 已保存。")

    except Exception as e:
        print(f"爬取或解析失败,错误信息:{e}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user