# (file-viewer header residue: 67 lines, 3.0 KiB, plaintext)
import requests
|
||
from bs4 import BeautifulSoup
|
||
import time
|
||
|
||
def _tag_text(tag, default="暂无"):
    """Return ``tag.text``, or *default* when the lookup found nothing (``None``)."""
    return tag.text if tag is not None else default


def _split_credits(credits_line: str) -> tuple:
    """Split the combined '导演: ... 主演: ...' line into ``(director, actor)``.

    Either part falls back to "暂无" when its marker is absent from the line.
    """
    if '导演: ' not in credits_line:
        return "暂无", "暂无"

    director = credits_line.split('导演: ')[1].split('主演: ')[0].strip()
    if '主演: ' in credits_line:
        actor = credits_line.split('主演: ')[1].strip()
    else:
        actor = "暂无"
    return director, actor


def _split_release_info(release_line: str) -> tuple:
    """Split a 'year / area / genre' line into ``(year, area, genre)``.

    Fields are taken positionally; missing trailing fields become "暂无".
    """
    parts = [part.strip() for part in release_line.split('/')]
    # Pad so the positional unpacking below never raises IndexError.
    parts += ["暂无"] * (3 - len(parts))
    return parts[0], parts[1], parts[2]


def _format_movie(movie) -> str:
    """Render one ``div.item`` element as the text record written to the file.

    Every lookup is None-safe: a missing element yields "暂无" instead of
    raising, so one malformed entry cannot abort the whole scrape.
    """
    rank = _tag_text(movie.find('em'))
    title = _tag_text(movie.find('span', class_='title'))
    rating = _tag_text(movie.find('span', class_='rating_num'))

    # The vote count is the last <span> inside the star block.
    star = movie.find('div', class_='star')
    star_spans = star.find_all('span') if star is not None else []
    people = star_spans[-1].text.replace('人评价', '') if star_spans else "暂无"

    quote = _tag_text(movie.find('span', class_='inq'))

    # First text line holds director/cast, second holds year/area/genre.
    info_tag = movie.find('p', class_='')
    info = info_tag.text.strip().split('\n') if info_tag is not None else []

    if info:
        director, actor = _split_credits(info[0].strip())
    else:
        director, actor = "暂无", "暂无"

    if len(info) > 1:
        year, area, genre = _split_release_info(info[1])
    else:
        year, area, genre = "暂无", "暂无", "暂无"

    fields = (
        ("排名", rank),
        ("电影名称", title),
        ("评分", rating),
        ("评价人数", people),
        ("经典台词", quote),
        ("导演", director),
        ("主演", actor),
        ("上映年份", year),
        ("国家/地区", area),
        ("类型", genre),
    )
    lines = [f"{label}:{value}\n" for label, value in fields]
    lines.append("-" * 50 + "\n")  # separator line between records
    return "".join(lines)


def scrape_douban_top250_to_txt() -> None:
    """Scrape the Douban Top 250 listing and save it to ``douban_movies.txt``.

    Requests the ten listing pages (``start`` = 0, 25, ..., 225), extracts
    each ``div.item`` entry and writes one labelled text record per movie to
    ``douban_movies.txt`` (UTF-8, overwritten) in the current directory.
    A page whose request fails is reported on stdout and skipped; progress
    is printed after every successful page.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    with open('douban_movies.txt', 'w', encoding='utf-8') as txtfile:
        for start in range(0, 250, 25):
            url = f'https://movie.douban.com/top250?start={start}'

            # Only the network call can raise RequestException, so keep the
            # try body limited to it.
            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"请求异常:{e}")
                time.sleep(1)  # pause after a failure too before the next request
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            for movie in soup.find_all('div', class_='item'):
                # One write per record instead of eleven small writes.
                txtfile.write(_format_movie(movie))

            print(f"第 {start//25 + 1} 页爬取完成,当前URL: {url}")
            time.sleep(1)  # wait 1 second between pages to avoid an IP ban

    print("? 所有数据已爬取完成,保存为 douban_movies.txt")
|
||
|
||
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    scrape_douban_top250_to_txt()