# Source file: task-2-1-data-collection/爬虫/爬虫.py (2).txt

import requests
from bs4 import BeautifulSoup
import time

# Scrape the titles of the Douban Top 250 movie list, 25 entries per page,
# collecting them in ``all_movies`` and printing each one as it is found.

# A browser-like User-Agent is sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Seconds to wait for the server before giving up on a page; without a
# timeout ``requests.get`` can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# Pause between consecutive page requests, in seconds.
REQUEST_DELAY = 1

all_movies = []

# ``page`` is the ``start`` offset of the listing: 0, 25, ..., 225.
for page in range(0, 250, 25):
    url = f"https://movie.douban.com/top250?start={page}&filter="
    print(f"正在爬取第 {page//25 + 1} 页：{url}")

    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        # Treat 4xx/5xx answers as failures instead of silently parsing an
        # error page that contains no movie entries.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Skip only the failing page so the remaining pages are still tried
        # and the final summary is still printed.
        print(f"第 {page//25 + 1} 页请求失败，已跳过：{exc}")
        time.sleep(REQUEST_DELAY)
        continue

    # Decode the body as UTF-8 regardless of what the response headers claim.
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")

    # Each movie entry is rendered as <div class="item">.
    items = soup.find_all("div", class_="item")
    for item in items:
        # ``find`` returns the first matching <span class="title">, or None
        # when the entry has no such element.
        title_tag = item.find("span", class_="title")
        if title_tag is None:
            continue
        title = title_tag.get_text(strip=True)
        all_movies.append(title)
        print(title)

    time.sleep(REQUEST_DELAY)


print(f"\n一共爬到电影：{len(all_movies)} 部")