import requests
import re
import csv
import json
import os
import time


def crawl_douban_top10():
    """Crawl the first 10 movies from Douban's Top 250 page.

    Returns:
        list[dict]: one dict per movie with keys
        'rank' (int), 'title' (str), 'rating' (str), 'quote' (str).

    Raises:
        requests.HTTPError: if the page request returns an error status.
    """
    print('开始爬取豆瓣电影Top10...')

    # 1. Fetch the page. Douban rejects requests without a browser-like UA.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html,application/xhtml+xml',
    }
    url = 'https://movie.douban.com/top250'
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly instead of silently regex-scraping an error/captcha page.
    response.raise_for_status()
    html = response.text

    # 2. Extract fields with regexes.
    # BUG FIX: the original patterns had lost their HTML tag anchors and
    # matched arbitrary runs of text; restore the <span> anchors the page uses.
    # Chinese titles. The [^<&] class excludes '&', so the alternate-title
    # spans ("&nbsp;/&nbsp;English Title") are not captured here.
    title_cn = re.findall(r'<span class="title">([^<&]+)</span>', html)
    # Ratings (e.g. "9.7").
    ratings = re.findall(r'<span class="rating_num"[^>]*>(\d+\.?\d*)</span>', html)
    # One-line classic quote; some entries have none.
    quotes = re.findall(r'<span class="inq">([^<]+)</span>', html)

    # 3. Assemble the records.
    movies = []
    cn_index = 0
    for i in range(10):
        # Defensive skip of alternate titles rendered as "/ Title"
        # (normally already filtered out by the regex above).
        while cn_index < len(title_cn) and title_cn[cn_index].startswith('/'):
            cn_index += 1
        movie = {
            'rank': i + 1,
            'title': title_cn[cn_index] if cn_index < len(title_cn) else '',
            'rating': ratings[i] if i < len(ratings) else '',
            # NOTE(review): quotes are index-aligned with rank; a quote-less
            # movie earlier in the list would shift these — acceptable for
            # the top 10, which all carry quotes.
            'quote': quotes[i] if i < len(quotes) else ''
        }
        movies.append(movie)
        cn_index += 1
    return movies


def save_to_csv(movies, filename):
    """Write the movie dicts to *filename* as UTF-8 CSV with a header row."""
    with open(filename, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['rank', 'title', 'rating', 'quote'])
        writer.writeheader()
        writer.writerows(movies)
    # BUG FIX: the f-string had lost its placeholder and printed a literal.
    print(f'CSV已保存: {filename}')


def save_to_json(movies, filename):
    """Write the movie dicts to *filename* as pretty-printed UTF-8 JSON."""
    with open(filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps the Chinese text human-readable.
        json.dump(movies, f, ensure_ascii=False, indent=2)
    # BUG FIX: the f-string had lost its placeholder and printed a literal.
    print(f'JSON已保存: {filename}')


def main():
    """Crawl, persist to CSV + JSON under douban_output/, and print a summary."""
    # Create the output directory (no-op if it already exists).
    os.makedirs('douban_output', exist_ok=True)

    # Crawl the data.
    movies = crawl_douban_top10()

    # Persist both formats.
    save_to_csv(movies, 'douban_output/movies.csv')
    save_to_json(movies, 'douban_output/movies.json')

    # Display the results.
    print('\n爬取结果:')
    print('-' * 50)
    for m in movies:
        quote_text = f'「{m["quote"]}」' if m['quote'] else ''
        print(f"{m['rank']}. {m['title']} - 评分: {m['rating']} {quote_text}")
    print('\n完成!')


if __name__ == '__main__':
    main()