diff --git a/爬豆瓣.py b/爬豆瓣.py
new file mode 100644
index 0000000..c767c06
--- /dev/null
+++ b/爬豆瓣.py
@@ -0,0 +1,62 @@
+import requests
+from bs4 import BeautifulSoup
+
+def crawl_douban_movies(url, max_pages=5):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+    }
+
+    movies = []
+    page = 1
+
+    while page <= max_pages:
+        # Doulist pages are paginated 25 items at a time via the start parameter
+        page_url = f"{url}?start={(page - 1) * 25}"
+        print(f"Crawling page {page}...")
+
+        try:
+            response = requests.get(page_url, headers=headers, timeout=10)
+            response.raise_for_status()  # fail fast on 403/429 instead of parsing an error page
+            response.encoding = 'utf-8'
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Collect all movie links, deduplicated by href
+            links = soup.find_all('a', href=lambda x: x and '/subject/' in x)
+
+            page_movies = []
+            seen = set()
+            for link in links:
+                title = link.get_text(strip=True)
+                href = link.get('href')
+                # Skip empty titles (e.g. image-only anchors) and duplicate links
+                if title and href and href not in seen:
+                    seen.add(href)
+                    page_movies.append(title)
+                    print(f"  - {title}")
+
+            if not page_movies:
+                print("  No more movies")
+                break
+
+            movies.extend(page_movies)
+            page += 1
+
+        except Exception as e:
+            print(f"Error: {e}")
+            break
+
+    return movies
+
+
+if __name__ == "__main__":
+    url = "https://www.douban.com/doulist/3936288/"
+    movies = crawl_douban_movies(url, max_pages=10)
+
+    print(f"\nCrawled {len(movies)} movies in total:")
+    for i, movie in enumerate(movies, 1):
+        print(f"{i}. {movie}")
+
+    with open('movies.txt', 'w', encoding='utf-8') as f:
+        for movie in movies:
+            f.write(f"{movie}\n")
+    print("\nSaved to movies.txt")
\ No newline at end of file
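
Because the script breaks out of the loop on the first exception, a single transient network failure or rate-limit response aborts the whole crawl. Below is a minimal retry-with-backoff sketch that could stand in for the bare requests.get call inside the loop; the helper name fetch_with_retry, the retry count, and the backoff values are illustrative assumptions, not part of this diff.

import time
import requests

def fetch_with_retry(url, headers, retries=3, backoff=2.0, timeout=10):
    # Illustrative helper (not part of the diff): fetch a URL, retrying on
    # network errors and HTTP error responses with a growing delay.
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # raises HTTPError on 403/429/5xx
            return response
        except requests.RequestException as e:  # covers HTTPError and connection errors
            if attempt == retries:
                raise
            wait = backoff * attempt  # linear backoff: 2s, 4s, ...
            print(f"Request failed ({e}); retrying in {wait:.0f}s...")
            time.sleep(wait)

Inside crawl_douban_movies, response = fetch_with_retry(page_url, headers) would then replace the direct requests.get call, and a short time.sleep between pages would keep the crawler polite toward Douban's rate limiting.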