57 lines
2.1 KiB
CSV
57 lines
2.1 KiB
CSV
import requests
|
|
from bs4 import BeautifulSoup
|
|
import csv
|
|
import time
|
|
|
|
def _parse_movie_item(item):
    """Build one CSV row from a ``div.item`` tag of the Top 250 listing.

    Args:
        item: A BeautifulSoup tag for a single movie entry.

    Returns:
        A dict keyed by the CSV column names, or ``None`` when any of the
        required tags (rank, title, rating, vote count) is missing, so the
        caller can skip the entry instead of crashing.
    """
    rank_tag = item.find('em')
    title_tag = item.find('span', class_='title')
    rating_tag = item.find('span', class_='rating_num')
    star_div = item.find('div', class_='star')
    star_spans = star_div.find_all('span') if star_div is not None else []

    required = (rank_tag, title_tag, rating_tag)
    if any(tag is None for tag in required) or not star_spans:
        return None

    # The vote count is the last <span> inside the star block; drop the
    # trailing "人评价" suffix so only the number remains.
    people = star_spans[-1].text.replace('人评价', '')

    # The one-line quote is optional on the page; fall back to a placeholder.
    quote_tag = item.find('span', class_='inq')
    quote = quote_tag.text if quote_tag is not None else "暂无"

    return {
        '排名': rank_tag.text,
        '电影名称': title_tag.text,
        '评分': rating_tag.text,
        '评价人数': people,
        '经典台词': quote,
    }


def scrape_douban_top250():
    """Scrape the Douban Top 250 movie list into ``douban_movies.csv``.

    Fetches the ten listing pages (25 movies each), extracts rank, title,
    rating, vote count and quote for every entry, and writes them as rows of
    a UTF-8 CSV file in the current working directory. Progress and problems
    are reported on stdout.

    A page whose request fails, or which contains no movie entries, is
    skipped and reported in the final summary. A single entry with
    unexpected markup is skipped without aborting the run.

    Returns:
        None.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    fieldnames = ['排名', '电影名称', '评分', '评价人数', '经典台词']
    failed_pages = []

    with open('douban_movies.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for start in range(0, 250, 25):
            url = f'https://movie.douban.com/top250?start={start}'
            page_no = start // 25 + 1

            # Throttle between requests, including after a failed one, but
            # do not wait before the first page or after the last.
            if start:
                time.sleep(1)

            # Only the network call can raise RequestException, so keep the
            # try body limited to it.
            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"请求异常:{e}")
                failed_pages.append(page_no)
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            movies = soup.find_all('div', class_='item')

            # An HTTP 200 page without any entries (e.g. a verification
            # page) must not be reported as a successful scrape.
            if not movies:
                print(f"第 {page_no} 页未解析到任何电影条目,当前 URL: {url}")
                failed_pages.append(page_no)
                continue

            for movie in movies:
                row = _parse_movie_item(movie)
                if row is None:
                    print(f"第 {page_no} 页存在结构异常的条目,已跳过")
                    continue
                writer.writerow(row)

            print(f"第 {page_no} 页爬取完成,当前 URL: {url}")

    if failed_pages:
        print(f"⚠️ 以下页面爬取失败:{failed_pages},其余数据已保存为 douban_movies.csv")
    else:
        print("✅ 所有数据已爬取完成,已保存为 douban_movies.csv")
|
|
|
|
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    scrape_douban_top250()