import requests
from bs4 import BeautifulSoup
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# NOTE(review): every non-ASCII literal in this script had been corrupted
# (GBK bytes decoded as UTF-8, invalid bytes dropped), which collapsed the
# record keys into duplicates and garbled the labels.  The Chinese text below
# is reconstructed from the surviving bytes; the exact wording of the two
# progress messages in main() is a best guess -- confirm against the original.

BASE_URL = "https://movie.douban.com/top250"
PAGE_SIZE = 25
PAGE_COUNT = 10
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled request hangs forever
OUTPUT_PATH = "douban_top250.txt"

# Placeholder stored for any field that cannot be extracted.
UNKNOWN = "未知"

# Record keys in output order: rank, title, director, year, area, genre, score.
# Each key doubles as the label written in front of its value.
FIELDS = ("排名", "片名", "导演", "年份", "地区", "类型", "评分")

# Markers that delimit the director's name on the first info line.
DIRECTOR_LABEL = "导演:"
CAST_LABEL = "主演:"

# Accumulates one dict per movie across all pages (filled in by main()).
all_movies = []


def tag_text(tag) -> str:
    """Return the stripped text of a parsed tag, or UNKNOWN if it is missing."""
    if tag is None:
        return UNKNOWN
    return tag.text.strip() or UNKNOWN


def parse_info(info: str) -> tuple:
    """Extract ``(director, year, area, genre)`` from an item's info paragraph.

    The first non-blank line is expected to hold the director label followed
    by the cast label; the second holds "/"-separated year, area and genre.
    Any field that cannot be found is returned as UNKNOWN.
    """
    lines = [line.strip() for line in info.split("\n") if line.strip()]

    director = UNKNOWN
    if lines and DIRECTOR_LABEL in lines[0]:
        after_label = lines[0].split(DIRECTOR_LABEL, 1)[1]
        # Cut at the cast label so only the director's name remains.
        director = after_label.split(CAST_LABEL, 1)[0].strip() or UNKNOWN

    year = area = genre = UNKNOWN
    if len(lines) >= 2:
        parts = [part.strip() for part in lines[1].split("/")]
        if len(parts) >= 3:
            # Presumably some entries list several release years before the
            # area; anchoring area/genre at the end keeps them aligned and is
            # identical to positional indexing for the usual 3-part line.
            year = " / ".join(parts[:-2]) or UNKNOWN
            area = parts[-2] or UNKNOWN
            genre = parts[-1] or UNKNOWN
        else:
            year = parts[0] or UNKNOWN
            if len(parts) > 1:
                area = parts[1] or UNKNOWN

    return director, year, area, genre


def parse_item(item) -> dict:
    """Build one movie record (keyed by FIELDS) from a ``div.item`` tag."""
    body = item.find("div", class_="bd")
    paragraph = body.p if body is not None else None
    info = paragraph.text if paragraph is not None else ""
    director, year, area, genre = parse_info(info)

    values = (
        tag_text(item.find("em")),
        tag_text(item.find("span", class_="title")),
        director,
        year,
        area,
        genre,
        tag_text(item.find("span", class_="rating_num")),
    )
    return dict(zip(FIELDS, values))


def format_movie(movie: dict) -> str:
    """Render a movie record as labelled lines followed by a 50-dash rule."""
    rows = [f"{field}：{movie[field]}\n" for field in FIELDS]
    rows.append("-" * 50 + "\n")
    return "".join(rows)


def fetch_page(page: int) -> list:
    """Download one zero-based result page and return its movie records.

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-success HTTP status.
    """
    start = page * PAGE_SIZE
    url = f"{BASE_URL}?start={start}&filter="
    resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly instead of silently parsing an error page into zero items.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    return [parse_item(item) for item in soup.find_all("div", class_="item")]


def main() -> None:
    """Scrape every page into ``all_movies`` and write them to OUTPUT_PATH."""
    for page in range(PAGE_COUNT):
        print(f"正在爬第 {page + 1} 页 ...")
        try:
            all_movies.extend(fetch_page(page))
        except requests.RequestException as exc:
            # Report and move on so one bad page does not discard the rest.
            print(f"第 {page + 1} 页请求失败：{exc}")
        time.sleep(1)  # pause between requests

    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        f.writelines(format_movie(m) for m in all_movies)

    print(f"全部爬完！已保存到 {OUTPUT_PATH}")


if __name__ == "__main__":
    main()