"""Scrape the Douban Top 250 movie list and save it as a plain-text report.

Provenance: recovered from a git patch (commit 4398d1ee, subject
"Complete assignment douban250") that added this script as
``douban.txt/douban.txt`` alongside two empty files, ``250json.py`` and
``douban3.31 .py``.

Running the script fetches ten list pages (25 movies each), extracts rank,
title, director, year, area, genre and score for every entry, and writes
one labelled record per movie to ``douban_top250.txt``.
"""

import time

BASE_URL = "https://movie.douban.com/top250"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
PAGE_COUNT = 10
PAGE_SIZE = 25
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can block forever
REQUEST_DELAY = 1  # seconds to pause after each page request
OUTPUT_PATH = "douban_top250.txt"
UNKNOWN = "未知"

# Record keys, in output order.  Each key doubles as the label that is
# written in front of its value in the report.
FIELDS = ("排名", "片名", "导演", "年份", "地区", "类型", "评分")


def parse_info(info: str) -> tuple[str, str, str, str]:
    """Split a movie's info paragraph into its individual fields.

    The first non-empty line is searched for the director (the text between
    "导演:" and "主演:"); the second non-empty line is split on "/" and
    its first three parts are taken as year, area and genre.

    Args:
        info: Raw text of the info paragraph.

    Returns:
        ``(director, year, area, genre)``.  Any field that cannot be found
        is reported as ``UNKNOWN``.
    """
    lines = [line.strip() for line in info.split("\n") if line.strip()]

    director = UNKNOWN
    # Guard against an empty paragraph before touching lines[0].
    if lines and "导演:" in lines[0]:
        director = lines[0].split("导演:")[1].split("主演:")[0].strip()

    year = area = genre = UNKNOWN
    if len(lines) >= 2:
        parts = [part.strip() for part in lines[1].split("/")]
        # Pad so that a short line still unpacks into three fields.
        year, area, genre = (parts + [UNKNOWN] * 3)[:3]

    return director, year, area, genre


def _node_text(node, default: str = UNKNOWN) -> str:
    """Return ``node.text``, or ``default`` when the lookup found nothing."""
    return node.text if node is not None else default


def parse_item(item) -> dict[str, str]:
    """Build one movie record from a ``<div class="item">`` element.

    Args:
        item: An element exposing BeautifulSoup's ``find`` API.

    Returns:
        A dict keyed by the names in ``FIELDS``.
    """
    body = item.find("div", class_="bd")
    paragraph = body.p if body is not None else None
    director, year, area, genre = parse_info(_node_text(paragraph, default=""))

    return {
        "排名": _node_text(item.find("em")),
        # Use ``.text``: attribute access with any other name (such as
        # ``.t``) is a child-tag lookup in BeautifulSoup and yields None.
        "片名": _node_text(item.find("span", class_="title")),
        "导演": director,
        "年份": year,
        "地区": area,
        "类型": genre,
        "评分": _node_text(item.find("span", class_="rating_num")),
    }


def fetch_page(start: int, timeout: float = REQUEST_TIMEOUT) -> list[dict[str, str]]:
    """Download one list page and parse every movie on it.

    Args:
        start: Value for the ``start`` query parameter (offset of the first
            movie on the page).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        One record per ``<div class="item">`` found on the page.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status.
    """
    # Third-party imports stay local to the network code so the pure
    # parsing/formatting helpers can be imported without them installed.
    import requests
    from bs4 import BeautifulSoup

    url = f"{BASE_URL}?start={start}&filter="
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty result.
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")
    return [parse_item(item) for item in soup.find_all("div", class_="item")]


def scrape_top250(pages: int = PAGE_COUNT, delay: float = REQUEST_DELAY) -> list[dict[str, str]]:
    """Fetch ``pages`` consecutive list pages and return all movie records.

    Args:
        pages: Number of list pages to request.
        delay: Seconds to sleep after each page.
    """
    movies = []
    for page in range(pages):
        print(f"正在爬第 {page+1} 页 …")
        movies.extend(fetch_page(page * PAGE_SIZE))
        time.sleep(delay)
    return movies


def format_movie(movie: dict[str, str]) -> str:
    """Render one record as labelled lines followed by a 50-dash separator."""
    lines = [f"{field}:{movie[field]}\n" for field in FIELDS]
    lines.append("-" * 50 + "\n")
    return "".join(lines)


def save_movies(movies: list[dict[str, str]], path: str = OUTPUT_PATH) -> None:
    """Write all records to ``path`` as UTF-8 text, replacing any old file."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(format_movie(movie) for movie in movies)


def main() -> None:
    """Scrape the full list and save it to ``OUTPUT_PATH``."""
    save_movies(scrape_top250())
    print("全部爬完!已保存到 douban_top250.txt")


if __name__ == "__main__":
    main()