# task-2-4-regular-expression/爬虫.txt.txt

import requests
from bs4 import BeautifulSoup
import time

# Placeholder written whenever a field cannot be extracted from the page.
_MISSING = "暂无"


def _tag_text(tag, default=_MISSING):
    """Return ``tag.text``, or *default* when the lookup found no tag."""
    return tag.text if tag is not None else default


def _split_director_actor(line):
    """Split a '导演: X   主演: Y' line into a ``(director, actor)`` pair.

    Either element falls back to the placeholder when its label is absent.
    """
    if '导演: ' not in line:
        return _MISSING, _MISSING
    after_director = line.partition('导演: ')[2]
    director, actor_label, actor = after_director.partition('主演: ')
    director = director.strip() or _MISSING
    actor = (actor.strip() or _MISSING) if actor_label else _MISSING
    return director, actor


def _split_year_area_genre(line):
    """Split a 'year / area / genre' line into a ``(year, area, genre)`` tuple.

    With three or more '/'-separated fields, the last two are taken as area
    and genre and everything before them is kept as the year, so a line that
    carries more than one year does not shift a year into the area column.
    """
    parts = [part.strip() for part in line.split('/')]
    if len(parts) >= 3:
        return ' / '.join(parts[:-2]), parts[-2], parts[-1]
    year = parts[0] or _MISSING
    area = parts[1] if len(parts) > 1 else _MISSING
    return year, area, _MISSING


def _format_movie(movie):
    """Build the text record for one ``div.item`` element.

    Missing tags or a truncated info paragraph yield the placeholder for the
    affected fields instead of raising, so one odd entry cannot abort the run.
    """
    rank = _tag_text(movie.find('em'))
    title = _tag_text(movie.find('span', class_='title'))
    rating = _tag_text(movie.find('span', class_='rating_num'))
    quote = _tag_text(movie.find('span', class_='inq'))

    # The vote count is the last <span> inside the star block.
    star = movie.find('div', class_='star')
    star_spans = star.find_all('span') if star is not None else []
    people = star_spans[-1].text.replace('人评价', '') if star_spans else _MISSING

    # First line of the paragraph: director/cast; second line: year/area/genre.
    info_tag = movie.find('p', class_='')
    info = info_tag.text.strip().split('\n') if info_tag is not None else []
    director, actor = _split_director_actor(info[0].strip() if info else '')
    year, area, genre = _split_year_area_genre(
        info[1].strip() if len(info) > 1 else ''
    )

    return ''.join([
        f"排名：{rank}\n",
        f"电影名称：{title}\n",
        f"评分：{rating}\n",
        f"评价人数：{people}\n",
        f"经典台词：{quote}\n",
        f"导演：{director}\n",
        f"主演：{actor}\n",
        f"上映年份：{year}\n",
        f"国家/地区：{area}\n",
        f"类型：{genre}\n",
        "-" * 50 + "\n",  # record separator
    ])


def scrape_douban_top250_to_txt():
    """Fetch the Douban Top 250 list pages and dump them to douban_movies.txt.

    Ten pages (``start`` = 0, 25, ..., 225) are requested in order. Every
    ``div.item`` on a page is written as a block of labelled lines followed
    by a dashed separator. A page whose request fails is reported on stdout
    and skipped; the remaining pages are still fetched.

    Returns:
        None. Output goes to ``douban_movies.txt`` (UTF-8, overwritten) and
        progress messages go to stdout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    with open('douban_movies.txt', 'w', encoding='utf-8') as txtfile:
        for start in range(0, 250, 25):
            url = f'https://movie.douban.com/top250?start={start}'

            # Only the HTTP call is guarded; parsing is made safe by the
            # helpers rather than by a wide try block.
            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"请求异常：{e}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            for movie in soup.find_all('div', class_='item'):
                txtfile.write(_format_movie(movie))

            print(f"第 {start//25 + 1} 页爬取完成，当前URL: {url}")
            time.sleep(1)  # wait one second between pages to avoid an IP ban

    print("? 所有数据已爬取完成，保存为 douban_movies.txt")

# Script entry point: run the scraper only when executed directly, not on import.
if __name__ == "__main__":
    scrape_douban_top250_to_txt()