Files
task-2-4-regular-expression/爬虫.txt.txt
2026-04-02 15:55:18 +08:00

67 lines
3.0 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
from bs4 import BeautifulSoup
import time
_PLACEHOLDER = "暂无"  # written whenever a field cannot be extracted from the page


def _tag_text(tag, default=_PLACEHOLDER):
    """Return the text of a BeautifulSoup tag, or *default* when the tag is None."""
    return tag.text if tag is not None else default


def _split_credits(line):
    """Split a '导演: ... 主演: ...' line into a (director, actor) pair."""
    if '导演: ' not in line:
        return _PLACEHOLDER, _PLACEHOLDER
    director = line.split('导演: ')[1].split('主演: ')[0].strip()
    actor = line.split('主演: ')[1].strip() if '主演: ' in line else _PLACEHOLDER
    return director, actor


def _split_year_area_genre(line):
    """Split a 'year / area / genre' line into a (year, area, genre) triple."""
    parts = line.strip().split('/')
    year = parts[0].strip()
    area = parts[1].strip() if len(parts) > 1 else _PLACEHOLDER
    genre = parts[2].strip() if len(parts) > 2 else _PLACEHOLDER
    return year, area, genre


def _parse_movie(movie):
    """Extract the labelled fields of one movie ``div.item`` as (label, value) pairs.

    Any element that is missing from the markup yields the placeholder value
    instead of raising, so a single malformed entry cannot abort the scrape.
    """
    rank = _tag_text(movie.find('em'))
    title = _tag_text(movie.find('span', class_='title'))
    rating = _tag_text(movie.find('span', class_='rating_num'))

    # The vote count is the last <span> inside the star block.
    star = movie.find('div', class_='star')
    star_spans = star.find_all('span') if star is not None else []
    if star_spans:
        people = star_spans[-1].text.replace('人评价', '')
    else:
        people = _PLACEHOLDER

    quote = _tag_text(movie.find('span', class_='inq'))

    # The info paragraph is read as two lines: credits first, then
    # year / area / genre. Either line may be absent.
    info_tag = movie.find('p', class_='')
    info = info_tag.text.strip().split('\n') if info_tag is not None else []
    if info:
        director, actor = _split_credits(info[0].strip())
    else:
        director, actor = _PLACEHOLDER, _PLACEHOLDER
    if len(info) > 1:
        year, area, genre = _split_year_area_genre(info[1])
    else:
        year, area, genre = _PLACEHOLDER, _PLACEHOLDER, _PLACEHOLDER

    return [
        ("排名", rank),
        ("电影名称", title),
        ("评分", rating),
        ("评价人数", people),
        ("经典台词", quote),
        ("导演", director),
        ("主演", actor),
        ("上映年份", year),
        ("国家/地区", area),
        ("类型", genre),
    ]


def _format_movie(fields):
    """Render (label, value) pairs as one text record ending with a separator line."""
    lines = [f"{label}:{value}\n" for label, value in fields]
    lines.append("-" * 50 + "\n")  # separator line
    return "".join(lines)


def scrape_douban_top250_to_txt():
    """Scrape the Douban Top 250 movie list and save it to ``douban_movies.txt``.

    Requests the ten list pages (25 movies each) one after another, parses
    every ``div.item`` entry and appends one labelled text record per movie
    to the output file, which is overwritten on each run.

    A page whose request fails is reported on stdout and skipped; the
    remaining pages are still processed. Nothing is returned.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    with open('douban_movies.txt', 'w', encoding='utf-8') as txtfile:
        for start in range(0, 250, 25):
            url = f'https://movie.douban.com/top250?start={start}'
            # Only the network call is guarded: parsing problems are handled
            # per field inside _parse_movie.
            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"请求异常:{e}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            for movie in soup.find_all('div', class_='item'):
                txtfile.write(_format_movie(_parse_movie(movie)))

            print(f"第 {start//25 + 1} 页爬取完成当前URL: {url}")
            time.sleep(1)  # wait 1 second between pages to avoid an IP ban
    print("? 所有数据已爬取完成,保存为 douban_movies.txt")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    scrape_douban_top250_to_txt()