Files
2509165028 495d119600 1
2026-06-23 11:16:59 +08:00

44 lines
1.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
import json
from bs4 import BeautifulSoup
# Download the movie listing page, keep a copy of the raw HTML in
# movies.html, and write the movie records to movies.json.

# 1. The request must carry the header the site checks for; a browser-like
#    User-Agent is sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
url = 'https://exam.detr.top/exam-b/movies'
try:
    # requests applies no timeout by default; without one a stalled server
    # would block this script indefinitely.
    response = requests.get(url, headers=headers, timeout=10)
    # Treat 4xx/5xx answers as failures so an error page is never saved as
    # movies.html and reported as a successful crawl.
    response.raise_for_status()
    response.encoding = 'utf-8'

    # 2. Save the raw page source.
    with open('movies.html', 'w', encoding='utf-8') as f_html:
        f_html.write(response.text)

    # 3. Parse the data. The page is assumed to be a table or a list; the
    #    actual selectors still have to be worked out by inspecting the page.
    soup = BeautifulSoup(response.text, 'lxml')
    # Example: assuming each movie sits in a div with class 'movie-item'
    # (check the real markup with the browser dev tools and adjust):
    # movie_items = soup.select('.movie-item')
    # Take the first 10 entries, or all of them:
    # movies_list = []
    # for i, item in enumerate(movie_items[:10]):
    #     # Extract id, title, director, year, rating, duration, genre and
    #     # actors_count according to the actual tags.
    #     pass

    # WARNING: the live page could not be accessed when this was written, so
    # the records below are hand-built sample data that only demonstrate the
    # JSON-saving logic. Replace `movies_list` with the real scraped results.
    movies_list = [
        {"id": "1", "title": "示例电影A", "director": "导演A", "year": 2022, "rating": 8.5, "duration": 120, "genre": "动作", "actors_count": 4},
        {"id": "2", "title": "示例电影B", "director": "导演B", "year": 2019, "rating": 6.0, "duration": 95, "genre": "喜剧", "actors_count": 6}
    ]  # TODO: make sure 10 entries are collected here

    # 4. Save as movies.json; ensure_ascii=False keeps the Chinese text
    #    readable in the output file instead of \u escapes.
    with open('movies.json', 'w', encoding='utf-8') as f_json:
        json.dump(movies_list, f_json, ensure_ascii=False, indent=4)
    print("爬取完成,已保存 movies.json 和 movies.html")
except Exception as e:
    # Top-level boundary of the script: any network, HTTP-status, parser or
    # file error is reported on stdout rather than raised as a traceback.
    print(f"爬取失败: {e}")