"""Scrape the first two pages of the Douban Top 250 list and save them as JSON.

For every movie entry the script records the title, the first line of the
info paragraph, and the one-line quote (when present), prints each record,
and finally writes all records to ``movies.json``.
"""

import json
import time

import requests
from bs4 import BeautifulSoup

# Request headers that mimic a desktop browser so the request is less likely
# to be rejected.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Accumulates one dict per scraped movie.
movie_list = []


def _parse_item(item):
    """Build the record dict for one ``div.item`` tag.

    Args:
        item: A BeautifulSoup tag for a single movie entry.

    Returns:
        A dict with the keys "电影名", "主创主演" and "经典短评", or ``None``
        when the entry lacks the title span or the info paragraph.
    """
    title_tag = item.find("span", class_="title")
    bd_tag = item.find("div", class_="bd")
    info_tag = bd_tag.p if bd_tag is not None else None
    if title_tag is None or info_tag is None:
        return None

    # Join the text nodes with "\n" so the first line can actually be split
    # off. With the default empty separator, strip=True removes every newline
    # and the whole paragraph would come back as a single line.
    info_text = info_tag.get_text("\n", strip=True)
    actor_info = info_text.split("\n", 1)[0]

    # Some entries have no quote; fall back to a placeholder.
    quote_tag = item.find("span", class_="inq")
    short_comment = quote_tag.get_text(strip=True) if quote_tag else "无短评"

    return {
        "电影名": title_tag.get_text(strip=True),
        "主创主演": actor_info,
        "经典短评": short_comment,
    }


# Only the first two pages are fetched: 25 entries per page, 50 in total.
for page in range(0, 50, 25):
    url = f"https://movie.douban.com/top250?start={page}"
    try:
        # A timeout keeps a stalled connection from hanging the script, and
        # raise_for_status surfaces HTTP errors instead of parsing an error page.
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
    except requests.RequestException as exc:
        print(f"请求失败,跳过该页: {url} ({exc})")
    else:
        res.encoding = "utf-8"
        soup = BeautifulSoup(res.text, "html.parser")
        for item in soup.find_all("div", class_="item"):
            data = _parse_item(item)
            if data is None:
                continue
            movie_list.append(data)
            print(data)
    # Pause one second per page to keep the request rate low.
    time.sleep(1)

# Report the total number of records collected.
print(f"\n一共抓取{len(movie_list)}部电影")

with open("movies.json", "w", encoding="utf-8") as f:
    json.dump(movie_list, f, ensure_ascii=False, indent=2)

print("数据已保存到 movies.json 文件!")