import json
import time

import requests
from bs4 import BeautifulSoup

# Browser-like headers so Douban's basic bot detection does not reject us.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Accumulates one dict per movie across all pages.
movies_data = []


def _parse_item(item):
    """Extract one movie's fields from its ``<div class="item">`` soup node.

    Returns a dict with the Chinese keys used in the output JSON.
    Raises (AttributeError/IndexError) when the expected markup is missing;
    the caller catches this per item so one bad entry cannot lose the page.
    """
    rank = item.find("em").text
    title = item.find("span", class_="title").text
    score = item.find("span", class_="rating_num").text

    # The <p> under div.bd holds two lines:
    #   line 0: "导演: X ... 主演: Y"  (主演 may be absent; split is then a no-op)
    #   line 1: "year / area / genre"
    info = item.find("div", class_="bd").p.text.strip().split("\n")
    director = info[0].strip().split("导演:")[1].split("主演:")[0].strip()
    year_area_genre = info[1].strip().split("/")

    return {
        "排名": rank,
        "片名": title,
        "导演": director,
        "上映年份": year_area_genre[0].strip(),
        "地区": year_area_genre[1].strip(),
        "类型": year_area_genre[2].strip(),
        "评分": score,
    }


def main():
    """Scrape 10 pages (25 movies each, 250 total) and save them as JSON."""
    for page in range(10):
        url = f"https://movie.douban.com/top250?start={page * 25}&filter="

        # Keep the try minimal: only the network request can legitimately
        # fail at page granularity.
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
        except Exception as e:
            print(f"第 {page+1} 页爬取失败:{str(e)}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        for item in soup.find_all("div", class_="item"):
            # FIX: the original wrapped the whole page in one try, so a single
            # malformed item silently discarded the other 24 on that page.
            try:
                movies_data.append(_parse_item(item))
            except Exception as e:
                print(f"第 {page+1} 页有条目解析失败:{str(e)}")

        print(f"第 {page+1} 页爬取完成")
        time.sleep(1)  # throttle: 1s between pages to stay under anti-scraping limits

    with open("豆瓣电影Top250.json", "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps Chinese readable; indent=2 pretty-prints.
        json.dump(movies_data, f, ensure_ascii=False, indent=2)
    print("✅ 数据已保存为「豆瓣电影Top250.json」")


if __name__ == "__main__":
    main()