From da29ff0039faaabe54c7df4b842e7eaa11983848 Mon Sep 17 00:00:00 2001
From: 2509165015 <2509165015@student.edu.cn>
Date: Thu, 2 Apr 2026 15:53:53 +0800
Subject: [PATCH] =?UTF-8?q?=E6=AD=A3=E5=88=99=E8=A1=A8=E8=BE=BE=E5=BC=8F?=
 =?UTF-8?q?=EF=BC=9A=E7=88=AC=E8=99=AB=E5=86=85=E5=AE=B9=E6=8F=90=E5=8F=96?=
 =?UTF-8?q?=E5=88=A9=E5=99=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 0402+2509165015.CSV  | 45 +++++++++++++++++++++++++++++++++++++++
 0402+2509165015.JSON | 46 ++++++++++++++++++++++++++++++++++++++++
 0402+2509165015.txt  | 50 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 141 insertions(+)
 create mode 100644 0402+2509165015.CSV
 create mode 100644 0402+2509165015.JSON
 create mode 100644 0402+2509165015.txt

diff --git a/0402+2509165015.CSV b/0402+2509165015.CSV
new file mode 100644
index 0000000..d14d4b6
--- /dev/null
+++ b/0402+2509165015.CSV
@@ -0,0 +1,45 @@
+import requests
+from bs4 import BeautifulSoup
+import csv
+import time
+
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+}
+
+movies = []
+
+for start in range(0, 250, 25):
+    url = f"https://movie.douban.com/top250?start={start}"
+    res = requests.get(url, headers=headers)
+    soup = BeautifulSoup(res.text, "html.parser")
+    items = soup.find_all("div", class_="item")
+
+    for item in items:
+        rank = item.find("em").text
+        title = item.find("span", class_="title").text
+        rating = item.find("span", class_="rating_num").text
+        people = item.find("div", class_="star").find_all("span")[-1].text.replace("人评价", "")
+        quote = item.find("span", class_="inq").text if item.find("span", class_="inq") else "无"
+        info = item.find("p", class_="").text.strip().split("\n")
+        line1 = info[0].strip()
+        line2 = info[1].strip() if len(info) > 1 else ""
+
+        director = line1.split("导演: ")[1].split("主演: ")[0].strip() if "导演: " in line1 else "未知"
+        actor = line1.split("主演: ")[1].strip() if "主演: " in line1 else "未知"
+        parts = line2.split("/") if line2 else []
+        year = parts[0].strip() if len(parts) >= 1 else "未知"
+        area = parts[1].strip() if len(parts) >= 2 else "未知"
+        genre = parts[2].strip() if len(parts) >= 3 else "未知"
+
+        movies.append({
+            "排名": rank, "电影名": title, "评分": rating, "评价人数": people, "经典台词": quote,
+            "导演": director, "主演": actor, "年份": year, "地区": area, "类型": genre
+        })
+    time.sleep(1)
+    print(f"已爬取 {start + 25} 条")
+with open("douban_top250.csv", "w", encoding="utf-8", newline="") as f:
+    writer = csv.DictWriter(f, fieldnames=movies[0].keys())
+    writer.writeheader()
+    writer.writerows(movies)
+print("✅ CSV 导出完成")
\ No newline at end of file
diff --git a/0402+2509165015.JSON b/0402+2509165015.JSON
new file mode 100644
index 0000000..d419a03
--- /dev/null
+++ b/0402+2509165015.JSON
@@ -0,0 +1,46 @@
+import requests
+from bs4 import BeautifulSoup
+import json
+import time
+
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+}
+
+movies = []
+
+for start in range(0, 250, 25):
+    url = f"https://movie.douban.com/top250?start={start}"
+    res = requests.get(url, headers=headers)
+    soup = BeautifulSoup(res.text, "html.parser")
+    items = soup.find_all("div", class_="item")
+
+    for item in items:
+        rank = item.find("em").text
+        title = item.find("span", class_="title").text
+        rating = item.find("span", class_="rating_num").text
+        people = item.find("div", class_="star").find_all("span")[-1].text.replace("人评价", "")
+        quote = item.find("span", class_="inq").text if item.find("span", class_="inq") else "无"
+        info = item.find("p", class_="").text.strip().split("\n")
+        line1 = info[0].strip()
+        line2 = info[1].strip() if len(info) > 1 else ""
+
+        director = line1.split("导演: ")[1].split("主演: ")[0].strip() if "导演: " in line1 else "未知"
+        actor = line1.split("主演: ")[1].strip() if "主演: " in line1 else "未知"
+        parts = line2.split("/") if line2 else []
+        year = parts[0].strip() if len(parts) >= 1 else "未知"
+        area = parts[1].strip() if len(parts) >= 2 else "未知"
+        genre = parts[2].strip() if len(parts) >= 3 else "未知"
+
+        movies.append({
+            "排名": rank, "电影名": title, "评分": rating, "评价人数": people, "经典台词": quote,
+            "导演": director, "主演": actor, "年份": year, "地区": area, "类型": genre
+        })
+
+    time.sleep(1)
+    print(f"已爬取 {start + 25} 条")
+
+with open("douban_top250.json", "w", encoding="utf-8") as f:
+    json.dump(movies, f, ensure_ascii=False, indent=2)
+
+print("✅ JSON 导出完成")
\ No newline at end of file
diff --git a/0402+2509165015.txt b/0402+2509165015.txt
new file mode 100644
index 0000000..7ca51bd
--- /dev/null
+++ b/0402+2509165015.txt
@@ -0,0 +1,50 @@
+import requests
+from bs4 import BeautifulSoup
+import time
+
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+}
+
+movies = []
+
+for start in range(0, 250, 25):
+    url = f"https://movie.douban.com/top250?start={start}"
+    res = requests.get(url, headers=headers)
+    soup = BeautifulSoup(res.text, "html.parser")
+    items = soup.find_all("div", class_="item")
+
+    for item in items:
+        rank = item.find("em").text
+        title = item.find("span", class_="title").text
+        rating = item.find("span", class_="rating_num").text
+        people = item.find("div", class_="star").find_all("span")[-1].text.replace("人评价", "")
+        quote = item.find("span", class_="inq").text if item.find("span", class_="inq") else "无"
+        info = item.find("p", class_="").text.strip().split("\n")
+        line1 = info[0].strip()
+        line2 = info[1].strip() if len(info) > 1 else ""
+
+        director = line1.split("导演: ")[1].split("主演: ")[0].strip() if "导演: " in line1 else "未知"
+        actor = line1.split("主演: ")[1].strip() if "主演: " in line1 else "未知"
+        parts = line2.split("/") if line2 else []
+        year = parts[0].strip() if len(parts) >= 1 else "未知"
+        area = parts[1].strip() if len(parts) >= 2 else "未知"
+        genre = parts[2].strip() if len(parts) >= 3 else "未知"
+
+        movies.append({
+            "排名": rank, "电影名": title, "评分": rating, "评价人数": people, "经典台词": quote,
+            "导演": director, "主演": actor, "年份": year, "地区": area, "类型": genre
+        })
+
+    time.sleep(1)
+    print(f"已爬取 {start + 25} 条")
+
+with open("douban_top250.txt", "w", encoding="utf-8") as f:
+    for m in movies:
+        f.write(f"第{m['排名']}名：{m['电影名']}\n")
+        f.write(f"评分：{m['评分']} 评价人数：{m['评价人数']}\n")
+        f.write(f"导演：{m['导演']} 主演：{m['主演']}\n")
+        f.write(f"年份：{m['年份']} 地区：{m['地区']} 类型：{m['类型']}\n")
+        f.write(f"经典台词：{m['经典台词']}\n\n")
+
+print("✅ TXT 导出完成")
\ No newline at end of file