上传文件至 /
This commit is contained in:
75
爬虫top250.py
Normal file
75
爬虫top250.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import re
|
||||
import json
|
||||
import requests
|
||||
|
||||
|
||||
def fetch_page(url, timeout=10):
    """Download a page and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as a string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of handing an error page to the parser, which would
    # otherwise silently yield zero movies.
    response.raise_for_status()
    response.encoding = "utf-8"
    return response.text
|
||||
|
||||
|
||||
def extract_movie_info(html: str) -> list:
    """Extract every movie entry from a listing page.

    Args:
        html: Full HTML text of one listing page.

    Returns:
        A list of dicts, one per ``<div class="item">`` found, in document
        order. Each dict has the keys ``rank`` (1-based position on this
        page), ``title``, ``actors`` and ``quote``. Fields that cannot be
        found fall back to the placeholder strings used below.
    """
    # An item contains nested <div> elements, so a lazy ``.*?</div>`` match
    # would stop at the first inner closing tag and lose the title, cast and
    # quote markup. Splitting on the opening tag instead gives each item
    # everything up to the next item (or the end of the document).
    items = re.split(r'<div class="item">', html)[1:]
    print(f"找到 {len(items)} 个电影 item 块")

    movies = []
    for idx, item in enumerate(items):
        movie = {"rank": idx + 1}

        # 1. Title: the first span carrying class="title".
        title_match = re.search(r'<span class="title"[^>]*>([^<]+)</span>', item)
        movie["title"] = title_match.group(1).strip() if title_match else "无标题"

        # 2. Cast: text after "主演:" up to the line break, a non-breaking
        #    space (entity or literal U+00A0) or the end of the paragraph.
        #    A plain space must not terminate the match, because names
        #    themselves contain spaces.
        actors_match = re.search(
            r'主演:\s*(.*?)(?:<br\s*/?>|&nbsp;|\xa0|</p>)', item, re.DOTALL
        )
        movie["actors"] = actors_match.group(1).strip() if actors_match else "无主演"

        # 3. Quote: the span inside the quote paragraph, whether or not the
        #    span carries attributes.
        quote_match = re.search(
            r'<p class="quote".*?<span[^>]*>(.*?)</span>', item, re.DOTALL
        )
        movie["quote"] = quote_match.group(1).strip() if quote_match else "无短评"

        movies.append(movie)

    return movies
|
||||
|
||||
|
||||
def save_to_json(movies: list, filename: str):
    """Write the movie list to ``filename`` as indented UTF-8 JSON.

    Args:
        movies: JSON-serialisable list of movie records.
        filename: Path of the file to create or overwrite.
    """
    # Keep non-ASCII characters readable in the output rather than escaping
    # them as \uXXXX sequences.
    dump_options = {"ensure_ascii": False, "indent": 2}
    with open(filename, mode="w", encoding="utf-8") as out_file:
        json.dump(movies, out_file, **dump_options)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Scrape the first 50 movies (two pages of 25).
    all_movies = []
    for offset in [0, 25]:
        url = f"https://movie.douban.com/top250?start={offset}"
        print(f"\n正在获取: {url}")
        html = fetch_page(url)
        print(f"页面长度: {len(html)}")

        page_movies = extract_movie_info(html)
        all_movies.extend(page_movies)

    # extract_movie_info numbers the movies of each page starting from 1, so
    # renumber over the combined list to get one continuous ranking.
    for rank, movie in enumerate(all_movies, start=1):
        movie["rank"] = rank

    print(f"\n总共提取到 {len(all_movies)} 部电影")
    save_to_json(all_movies, "movies.json")
    print("结果已保存到 movies.json")

    # Print the first three entries as a quick sanity check of the data.
    print("\n==== 前 3 部电影信息 ====")
    for m in all_movies[:3]:
        print(f"{m['rank']}. {m['title']}")
        print(f"主演: {m['actors']}")
        print(f"短评: {m['quote']}\n")
|
||||
Reference in New Issue
Block a user