Files
simulated-examination/q2_1_crawler/爬虫.py

85 lines
2.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
from bs4 import BeautifulSoup as bs
import json
# Browser-like User-Agent header sent with the page request.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
}
data = []  # extracted movie records, one dict per movie
count = 0  # number of movies extracted so far; doubles as the record id

# Only this URL needs to be replaced with the one given in the task. The page
# is fetched exactly once, so no crawl loop is required.
url = "https://exam.detr.top/examb/movies"
# Without a timeout, requests waits indefinitely on an unresponsive server.
resp = requests.get(url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of silently saving and parsing an error page.
resp.raise_for_status()
# Force UTF-8 decoding of the body regardless of what the server declares.
resp.encoding = "utf-8"

# 1. Save the page source to movies.html
with open("movies.html", "w", encoding="utf-8") as f:
    f.write(resp.text)
print("✅ 网页源码已保存到 movies.html")

soup = bs(resp.text, "html.parser")
# Assumes each movie entry is a <div class="movie-item"> whose fields are held
# in <span> elements with the class names used below. This is a generic layout
# guess; adjust the class names to match the real page markup.
MAX_MOVIES = 10  # the task asks for all 10 movies on the page


def _extract_field(item, css_class, convert=str, default=""):
    """Return the converted text of the ``<span class=css_class>`` in ``item``.

    Args:
        item: Parsed element to search; must offer ``find(name, class_=...)``.
        css_class: Class name of the ``<span>`` holding the field.
        convert: Callable applied to the stripped text (e.g. ``int``, ``float``).
        default: Value returned when the span is missing or conversion fails.

    Returns:
        ``convert(text)`` on success, otherwise ``default``.
    """
    try:
        return convert(item.find("span", class_=css_class).get_text().strip())
    except (AttributeError, ValueError):
        # AttributeError: find() returned None because the span is absent.
        # ValueError: the text could not be parsed by int()/float().
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return default


items = soup.find_all("div", class_="movie-item")
# Ids are 1-based; `count` keeps the id of the last movie processed.
for count, item in enumerate(items[:MAX_MOVIES], start=1):
    # Store the fields required by the task.
    data.append({
        "id": count,
        "title": _extract_field(item, "title"),
        "director": _extract_field(item, "director"),
        "year": _extract_field(item, "year", int, 0),
        "rating": _extract_field(item, "rating", float, 0.0),
        "duration": _extract_field(item, "duration"),
        "genre": _extract_field(item, "genre"),
        "actors_count": _extract_field(item, "actors-count", int, 0),
    })
# Persist the collected records to movies.json as indented, non-ASCII-escaped
# JSON so the Chinese text stays human-readable.
with open("movies.json", "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(data, ensure_ascii=False, indent=4))
print("✅ 爬取完成,数据已保存到 movies.json")