import time

import requests
from bs4 import BeautifulSoup


def crawl_douban_movies(url, max_pages=5):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    }
    movies = []
    page = 1
    while page <= max_pages:
        # Doulist pages hold 25 items each; paginate via the start offset.
        page_url = f"{url}?start={(page - 1) * 25}"
        print(f"Crawling page {page}...")
        try:
            response = requests.get(page_url, headers=headers, timeout=10)
            response.raise_for_status()
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')

            # Collect all movie links (every item links to a /subject/ page),
            # deduplicating by href since an item may carry several links.
            links = soup.find_all('a', href=lambda x: x and '/subject/' in x)
            page_movies = []
            seen = set()
            for link in links:
                title = link.get_text(strip=True)
                href = link.get('href')
                # Skip empty titles (e.g. poster-image links) and duplicates.
                if title and href and href not in seen:
                    seen.add(href)
                    page_movies.append(title)
                    print(f"  - {title}")

            if not page_movies:
                print("  No more movies.")
                break

            movies.extend(page_movies)
            page += 1
            # Be polite: pause between pages to avoid getting rate-limited.
            time.sleep(1)
        except Exception as e:
            print(f"Error: {e}")
            break
    return movies


if __name__ == "__main__":
    url = "https://www.douban.com/doulist/3936288/"
    movies = crawl_douban_movies(url, max_pages=10)

    print(f"\nCrawled {len(movies)} movies in total:")
    for i, movie in enumerate(movies, 1):
        print(f"{i}. {movie}")

    with open('movies.txt', 'w', encoding='utf-8') as f:
        for movie in movies:
            f.write(f"{movie}\n")
    print("\nSaved to movies.txt")
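

# Note: Douban throttles clients that fetch pages in quick succession, so the
# plain requests.get call above may start failing after a few pages. Below is
# a minimal retry-with-backoff sketch; the fetch_with_retries name and its
# parameters are illustrative additions, not part of the original script.
# Inside crawl_douban_movies, requests.get(page_url, headers=headers, ...)
# could be swapped for fetch_with_retries(page_url, headers) with no other
# changes, since the helper returns the same Response object.
import random


def fetch_with_retries(url, headers, retries=3, timeout=10):
    """Fetch a URL, retrying with exponential backoff plus jitter."""
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            if attempt == retries:
                raise
            # Back off with jitter so repeated retries don't hit the server
            # at the same moment.
            delay = 2 ** attempt + random.uniform(0, 1)
            print(f"Request failed ({e}); retrying in {delay:.1f}s...")
            time.sleep(delay)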