"""Scrape the Douban Top 250 movie list and save the result as a JSON file."""

import json
import time

import requests
from bs4 import BeautifulSoup


def _parse_movie_item(movie):
    """Build the output record for one ``div.item`` element of a list page.

    Args:
        movie: A BeautifulSoup tag for a single ``<div class="item">``.

    Returns:
        dict: The ten fields written to the JSON output. Fields that cannot
        be found on the page are filled with the placeholder "暂无".
    """
    rank = movie.find('em').text
    title = movie.find('span', class_='title').text
    rating = movie.find('span', class_='rating_num').text

    # The vote count is taken from the last <span> inside the star block,
    # with the "人评价" suffix stripped.
    people_span = movie.find('div', class_='star').find_all('span')[-1]
    people = people_span.text.replace('人评价', '')

    # Not every entry carries a quote.
    quote_tag = movie.find('span', class_='inq')
    quote = quote_tag.text if quote_tag else "暂无"

    # The info paragraph is split into lines: the first is parsed for
    # director / cast, the second for "year / area / genre".
    info = movie.find('p', class_='').text.strip().split('\n')

    director_actor = info[0].strip()
    if '导演: ' in director_actor:
        director = director_actor.split('导演: ')[1].split('主演: ')[0].strip()
        if '主演: ' in director_actor:
            actor = director_actor.split('主演: ')[1].strip()
        else:
            actor = "暂无"
    else:
        director = "暂无"
        actor = "暂无"

    if len(info) > 1:
        year_area_type = info[1].strip().split('/')
        year = year_area_type[0].strip()
        area = year_area_type[1].strip() if len(year_area_type) > 1 else "暂无"
        genre = year_area_type[2].strip() if len(year_area_type) > 2 else "暂无"
    else:
        # A single-line info paragraph used to raise IndexError and abort the
        # whole run; fall back to placeholders instead.
        year = area = genre = "暂无"

    return {
        "排名": rank,
        "电影名称": title,
        "评分": rating,
        "评价人数": people,
        "经典台词": quote,
        "导演": director,
        "主演": actor,
        "上映年份": year,
        "国家/地区": area,
        "类型": genre,
    }


def scrape_douban_top250_to_json():
    """Fetch all ten pages of the Douban Top 250 and dump them to JSON.

    Pages are requested with ``start`` = 0, 25, ..., 225. A page whose
    request fails is reported and skipped. The collected records are written
    to ``douban_movies.json`` in the current working directory (UTF-8,
    non-ASCII characters kept as-is).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    movies_list = []

    for start in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={start}'

        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            movies = soup.find_all('div', class_='item')

            for movie in movies:
                movies_list.append(_parse_movie_item(movie))

            print(f"第 {start//25 + 1} 页爬取完成,当前URL: {url}")
            # Pause between successful page fetches.
            time.sleep(1)

        except requests.exceptions.RequestException as e:
            print(f"请求异常:{e}")
            continue

    with open('douban_movies.json', 'w', encoding='utf-8') as jsonfile:
        json.dump(movies_list, jsonfile, ensure_ascii=False, indent=4)

    print("✅ 所有数据已爬取完成,保存为 douban_movies.json")


if __name__ == '__main__':
    scrape_douban_top250_to_json()