完成作业4
This commit is contained in:
60
爬豆瓣.py
Normal file
60
爬豆瓣.py
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
def crawl_douban_movies(url, max_pages=5):
    """Crawl movie titles from a Douban doulist, page by page.

    Args:
        url: Base doulist URL; pagination is appended as ``?start=N``.
        max_pages: Maximum number of pages to fetch (25 entries per page).

    Returns:
        list[str]: De-duplicated movie titles in encounter order. Crawling
        stops early at the first empty page or on any request/parse error
        (best-effort: the error is printed, never raised to the caller).
    """
    headers = {
        # Browser-like UA — Douban rejects the default `requests` UA.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    }

    movies = []
    page = 1

    while page <= max_pages:
        # Doulist pagination: 25 entries per page, offset via ?start=N.
        page_url = f"{url}?start={(page-1)*25}"
        print(f"正在爬取第 {page} 页...")

        try:
            response = requests.get(page_url, headers=headers, timeout=10)
            # Fail fast on HTTP errors (403/404/...) instead of silently
            # parsing an error page; handled by the except clause below.
            response.raise_for_status()
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')

            # Every link to a movie subject page; de-duplicated by href
            # because each entry typically links twice (poster + title).
            links = soup.find_all('a', href=lambda x: x and '/subject/' in x)

            page_movies = []
            seen = set()
            for link in links:
                title = link.get_text(strip=True)
                href = link.get('href')
                # Skip anchors with no text and repeated hrefs.
                if title and href and href not in seen:
                    seen.add(href)
                    page_movies.append(title)
                    print(f" - {title}")

            # An empty page means we ran past the end of the list.
            if not page_movies:
                print(" 没有更多电影了")
                break

            movies.extend(page_movies)
            page += 1

        except Exception as e:
            # Best-effort boundary: report and stop rather than crash.
            print(f"异常: {e}")
            break

    return movies
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Entry point: crawl the doulist, report results, persist to disk.
    doulist_url = "https://www.douban.com/doulist/3936288/"
    results = crawl_douban_movies(doulist_url, max_pages=10)

    # Numbered summary on stdout.
    print(f"\n共爬取到 {len(results)} 部电影:")
    for rank, name in enumerate(results, 1):
        print(f"{rank}. {name}")

    # One title per line in the output file.
    with open('movies.txt', 'w', encoding='utf-8') as f:
        f.writelines(f"{name}\n" for name in results)
    print("\n已保存到 movies.txt")
|
||||||
Reference in New Issue
Block a user