This commit is contained in:
2509165045
2026-03-31 11:26:50 +08:00
parent 8b8b6cbb71
commit 73faf4cd53

65
爬豆瓣电影top250.py Normal file
View File

@@ -0,0 +1,65 @@
import requests
import re
import csv
import json
import os
import time
from bs4 import BeautifulSoup
def crawl_douban_movies(url, max_pages=5, delay=1.0):
    """Crawl movie titles from a paginated Douban list page.

    Args:
        url: Base URL of the Douban list; pagination is done via a
            0-based ``?start=N`` query parameter, 25 items per page.
        max_pages: Maximum number of pages to fetch before stopping.
        delay: Seconds to sleep between page requests (politeness delay;
            0 disables it). Backward-compatible addition.

    Returns:
        A list of movie title strings. Duplicate anchors pointing at the
        same ``/subject/`` URL are collapsed within each page.
    """
    headers = {
        # Browser-like UA: Douban rejects the default requests User-Agent.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    }
    movies = []
    page = 1
    while page <= max_pages:
        # Douban paginates with a 0-based offset, 25 entries per page.
        page_url = f"{url}?start={(page-1)*25}"
        print(f"正在爬取第 {page} 页...")
        try:
            response = requests.get(page_url, headers=headers, timeout=10)
            # Fail fast on HTTP errors (403/404/5xx); previously an error
            # page was silently parsed as "no more movies".
            response.raise_for_status()
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')
            # Every movie entry links to /subject/<id>/; collect those anchors.
            links = soup.find_all('a', href=lambda x: x and '/subject/' in x)
            page_movies = []
            seen = set()
            for link in links:
                title = link.get_text(strip=True)
                href = link.get('href')
                # Skip text-less anchors (e.g. poster images) and duplicate
                # anchors pointing at the same subject URL.
                if title and href and href not in seen:
                    seen.add(href)
                    page_movies.append(title)
                    print(f" - {title}")
            if not page_movies:
                # An empty page means we ran past the end of the list.
                print(" 没有更多电影了")
                break
            movies.extend(page_movies)
            page += 1
            if page <= max_pages and delay > 0:
                time.sleep(delay)  # be polite to the server between pages
        # Broad catch is deliberate: this is the crawl-loop boundary and any
        # failure (network, HTTP, parse) should end the crawl, not crash.
        except Exception as e:
            print(f"异常: {e}")
            break
    return movies
if __name__ == "__main__":
url = "https://www.douban.com/doulist/3936288/"
movies = crawl_douban_movies(url, max_pages=10)
print(f"\n共爬取到 {len(movies)} 部电影:")
for i, movie in enumerate(movies, 1):
print(f"{i}. {movie}")
with open('movies.txt', 'w', encoding='utf-8') as f:
for movie in movies:
f.write(f"{movie}\n")
print("\n已保存到 movies.txt")