Files
final-practice/爬虫top250.py
2026-06-09 11:20:49 +08:00

75 lines
2.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import json
import requests
def fetch_page(url, timeout=10):
    """Download a web page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. It is
            passed straight to ``requests.get``; without it a stalled
            connection would block the script indefinitely.

    Returns:
        The response body, decoded as UTF-8.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` on connection failure or timeout.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 rather than relying on requests' charset detection.
    response.encoding = "utf-8"
    return response.text
def extract_movie_info(html: str) -> list:
    """Extract movie records from a Top 250 listing page.

    Args:
        html: Raw HTML of one listing page.

    Returns:
        A list of dicts, one per ``<div class="item">`` block, with keys:

        * ``rank``   - number from the item's ``<em>`` tag, or the 1-based
          position on the page when no such tag is present.
        * ``title``  - text of the first ``<span class="title">``, or
          ``"无标题"`` when missing.
        * ``actors`` - text following ``主演:`` up to the next ``<br>``,
          ``&nbsp;`` or ``</p>``, or ``"无主演"`` when missing.
        * ``quote``  - text of the span inside ``<p class="quote">``, or
          ``"无短评"`` when missing.
    """
    # An item contains nested <div>s, so it cannot be delimited by the first
    # "</div>" (that would cut the block off before the title). Instead each
    # item runs until the next item starts, the list closes, or input ends.
    item_pattern = re.compile(
        r'<div class="item">(.*?)(?=<div class="item">|</ol>|\Z)',
        re.DOTALL,
    )
    items = item_pattern.findall(html)
    print(f"找到 {len(items)} 个电影 item 块")

    movies = []
    for idx, item in enumerate(items):
        movie = {}

        # Prefer the rank printed on the page so numbering stays correct on
        # pages other than the first; fall back to the position in the page.
        rank_match = re.search(r'<em[^>]*>\s*(\d+)\s*</em>', item)
        movie["rank"] = int(rank_match.group(1)) if rank_match else idx + 1

        # 1. Title: the first span carrying class="title".
        title_match = re.search(r'<span class="title"[^>]*>([^<]+)</span>', item)
        movie["title"] = title_match.group(1).strip() if title_match else "无标题"

        # 2. Actors: everything after "主演:" up to the next tag/entity.
        #    Accept <br>, <br/> and <br /> as terminators.
        actors_match = re.search(
            r'主演:\s*(.*?)(?:<br\s*/?>|&nbsp;|</p>)', item, re.DOTALL
        )
        movie["actors"] = actors_match.group(1).strip() if actors_match else "无主演"

        # 3. Quote: the span inside <p class="quote">, with or without
        #    attributes on the span.
        quote_match = re.search(
            r'<p class="quote"[^>]*>.*?<span[^>]*>(.*?)</span>', item, re.DOTALL
        )
        movie["quote"] = quote_match.group(1).strip() if quote_match else "无短评"

        movies.append(movie)
    return movies
def save_to_json(movies: list, filename: str):
    """Write the movie list to ``filename`` as pretty-printed UTF-8 JSON.

    Args:
        movies: JSON-serializable list of movie records.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, "w", encoding="utf-8") as out:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        serialized = json.dumps(movies, ensure_ascii=False, indent=2)
        out.write(serialized)
if __name__ == "__main__":
    # Scrape the first 50 entries: two listing pages of 25 movies each.
    all_movies = []
    for offset in [0, 25]:
        url = f"https://movie.douban.com/top250?start={offset}"
        print(f"\n正在获取: {url}")
        html = fetch_page(url)
        print(f"页面长度: {len(html)}")
        page_movies = extract_movie_info(html)
        # Assign ranks from the page offset so numbering is continuous over
        # the whole result set (the page at offset 25 starts at rank 26)
        # instead of producing duplicate ranks for every page.
        for rank, movie in enumerate(page_movies, start=offset + 1):
            movie["rank"] = rank
        all_movies.extend(page_movies)

    print(f"\n总共提取到 {len(all_movies)} 部电影")
    save_to_json(all_movies, "movies.json")
    print("结果已保存到 movies.json")

    # Print the first three records as a quick sanity check of the data.
    print("\n==== 前 3 部电影信息 ====")
    for m in all_movies[:3]:
        print(f"{m['rank']}. {m['title']}")
        print(f"主演: {m['actors']}")
        print(f"短评: {m['quote']}\n")