import requests
from bs4 import BeautifulSoup
import time


def scrape_douban_top250_to_txt():
    """Scrape the Douban Top-250 movie list and save it to 'douban_movies.txt'.

    Fetches all 10 pages (25 movies each), parses rank, title, rating,
    vote count, quote, director, actors, year, region and genre for every
    movie, and appends a formatted record per movie to the output file.
    Network failures on a page are reported and that page is skipped.
    """
    # Browser-like User-Agent so Douban does not reject the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    with open('douban_movies.txt', 'w', encoding='utf-8') as txtfile:
        # Top-250 is paginated 25 per page via the ?start= query parameter.
        for start in range(0, 250, 25):
            url = f'https://movie.douban.com/top250?start={start}'
            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
                # FIX: original source was missing the closing parenthesis here.
                movies = soup.find_all('div', class_='item')
                for movie in movies:
                    rank = movie.find('em').text
                    title = movie.find('span', class_='title').text
                    rating = movie.find('span', class_='rating_num').text
                    # The last <span> inside div.star holds the vote count,
                    # e.g. "3056797人评价" -> keep only the number.
                    people_span = movie.find('div', class_='star').find_all('span')[-1]
                    people = people_span.text.replace('人评价', '')
                    # The one-line quote is optional for some movies.
                    quote_tag = movie.find('span', class_='inq')
                    quote = quote_tag.text if quote_tag else "暂无"
                    # The unclassed <p> holds two lines:
                    #   line 0: "导演: X   主演: Y"
                    #   line 1: "year / region / genre"
                    info = movie.find('p', class_='').text.strip().split('\n')
                    director_actor = info[0].strip()
                    if '导演: ' in director_actor:
                        director = director_actor.split('导演: ')[1].split('主演: ')[0].strip()
                        actor = director_actor.split('主演: ')[1].strip() if '主演: ' in director_actor else "暂无"
                    else:
                        director = "暂无"
                        actor = "暂无"
                    # FIX: guard against items that lack the second info line
                    # (original indexed info[1] unconditionally).
                    year_area_type = info[1].strip().split('/') if len(info) > 1 else []
                    year = year_area_type[0].strip() if year_area_type else "暂无"
                    area = year_area_type[1].strip() if len(year_area_type) > 1 else "暂无"
                    # FIX: original string literal "暂无 was unterminated.
                    genre = year_area_type[2].strip() if len(year_area_type) > 2 else "暂无"
                    txtfile.write(f"排名:{rank}\n")
                    txtfile.write(f"电影名称:{title}\n")
                    txtfile.write(f"评分:{rating}\n")
                    txtfile.write(f"评价人数:{people}\n")
                    txtfile.write(f"经典台词:{quote}\n")
                    txtfile.write(f"导演:{director}\n")
                    txtfile.write(f"主演:{actor}\n")
                    txtfile.write(f"上映年份:{year}\n")
                    txtfile.write(f"国家/地区:{area}\n")
                    txtfile.write(f"类型:{genre}\n")
                    txtfile.write("-" * 50 + "\n")  # record separator
                print(f"第 {start//25 + 1} 页爬取完成,当前URL: {url}")
                time.sleep(1)  # throttle: 1 s between pages to avoid an IP ban
            except requests.exceptions.RequestException as e:
                print(f"请求异常:{e}")
                continue
    # FIX: original final print statement was split across a raw newline
    # inside the string literal (unterminated).
    print("所有数据已爬取完成,保存为 douban_movies.txt")


if __name__ == '__main__':
    scrape_douban_top250_to_txt()