上传文件至 20260331

This commit is contained in:
2026-03-31 11:36:26 +08:00
parent 7911a50c75
commit 798e2e46d3
3 changed files with 207 additions and 0 deletions

76
20260331/text1.py Normal file
View File

@@ -0,0 +1,76 @@
import requests
from bs4 import BeautifulSoup
import time
# Browser-style headers: Douban rejects requests that use the default UA.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}


def parse_bd_info(info):
    """Parse the text of an item's ``div.bd > p`` element.

    The first non-empty line holds "导演: ... 主演: ..."; the second holds
    "year / area / genre(s)".

    Args:
        info: stripped text content of the info paragraph.

    Returns:
        A ``(director, year, area, genre)`` tuple; any field that cannot
        be extracted falls back to "未知".
    """
    lines = [line.strip() for line in info.split("\n") if line.strip()]
    # Line 1: the director sits between the "导演:" and "主演:" labels.
    director = "未知"
    if lines and "导演:" in lines[0]:
        director = lines[0].split("导演:")[1].split("主演:")[0].strip()
    # Line 2: "year / area / genre" — guard every index; some entries do
    # not carry all three parts.
    year = area = genre = "未知"
    if len(lines) >= 2:
        parts = lines[1].split("/")
        year = parts[0].strip()
        if len(parts) > 1:
            area = parts[1].strip()
        if len(parts) > 2:
            genre = parts[2].strip()
    return director, year, area, genre


def scrape_top250():
    """Scrape all 10 pages (25 movies each) of Douban's Top 250.

    Returns:
        A list of per-movie dicts keyed by the Chinese field names used
        in the TXT output.
    """
    all_movies = []
    for page in range(10):
        start = page * 25  # Douban paginates via the `start` offset
        url = f"https://movie.douban.com/top250?start={start}&filter="
        print(f"正在爬第 {page+1} 页 …")
        # timeout so one stalled connection cannot hang the whole run
        resp = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(resp.text, "html.parser")
        for item in soup.find_all("div", class_="item"):
            rank = item.find("em").text
            title = item.find("span", class_="title").text
            score = item.find("span", class_="rating_num").text
            info = item.find("div", class_="bd").p.text.strip()
            director, year, area, genre = parse_bd_info(info)
            all_movies.append({
                "排名": rank,
                "片名": title,
                "导演": director,
                "年份": year,
                "地区": area,
                "类型": genre,
                "评分": score,
            })
        time.sleep(1)  # politeness delay to avoid the anti-scraping ban
    return all_movies


def save_as_txt(movies, path="douban_top250.txt"):
    """Write the scraped movies to a human-readable UTF-8 text file."""
    with open(path, "w", encoding="utf-8") as f:
        for m in movies:
            f.write(f"排名:{m['排名']}\n")
            f.write(f"片名:{m['片名']}\n")
            f.write(f"导演:{m['导演']}\n")
            f.write(f"年份:{m['年份']}\n")
            f.write(f"地区:{m['地区']}\n")
            f.write(f"类型:{m['类型']}\n")
            f.write(f"评分:{m['评分']}\n")
            f.write("-" * 50 + "\n")


if __name__ == "__main__":
    save_as_txt(scrape_top250())
    print("全部爬完!已保存到 douban_top250.txt")

67
20260331/text2.py Normal file
View File

@@ -0,0 +1,67 @@
import requests
from bs4 import BeautifulSoup
import csv
import time
# Request headers that mimic a desktop Chrome browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Column order for the CSV header row.
FIELDNAMES = ["排名", "片名", "导演", "上映年份", "地区", "类型", "评分"]


def parse_bd_info(info_text):
    """Split an item's info paragraph into (director, year, area, genre).

    Any field missing from the markup falls back to "未知" instead of
    raising IndexError — unguarded indexing would otherwise let one odd
    entry abort (and discard) the whole page inside the per-page try.
    """
    lines = [ln.strip() for ln in info_text.split("\n") if ln.strip()]
    director = year = area = genre = "未知"
    # First line: director between the "导演:" and "主演:" labels.
    if lines and "导演:" in lines[0]:
        director = lines[0].split("导演:")[1].split("主演:")[0].strip()
    # Second line: "year / area / genre", not always all three parts.
    if len(lines) >= 2:
        parts = lines[1].split("/")
        year = parts[0].strip()
        if len(parts) > 1:
            area = parts[1].strip()
        if len(parts) > 2:
            genre = parts[2].strip()
    return director, year, area, genre


def scrape_top250():
    """Fetch 10 pages x 25 movies and return a list of record dicts.

    A page that fails (network error or unexpected markup) is logged and
    skipped so the remaining pages are still collected.
    """
    movies_data = []
    for page in range(10):
        start = page * 25
        url = f"https://movie.douban.com/top250?start={start}&filter="
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # turn HTTP errors into exceptions
            soup = BeautifulSoup(response.text, "html.parser")
            for item in soup.find_all("div", class_="item"):
                info = item.find("div", class_="bd").p.text.strip()
                director, year, area, genre = parse_bd_info(info)
                movies_data.append({
                    "排名": item.find("em").text,
                    "片名": item.find("span", class_="title").text,
                    "导演": director,
                    "上映年份": year,
                    "地区": area,
                    "类型": genre,
                    "评分": item.find("span", class_="rating_num").text,
                })
            print(f"第 {page+1} 页爬取完成")
            time.sleep(1)  # 1-second delay between pages to avoid bans
        except Exception as e:  # per-page boundary: report and move on
            print(f"第 {page+1} 页爬取失败:{str(e)}")
    return movies_data


def save_as_csv(movies, path="豆瓣电影Top250.csv"):
    """Write records to CSV; utf-8-sig adds a BOM so Excel opens it right."""
    with open(path, "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(movies)


if __name__ == "__main__":
    save_as_csv(scrape_top250())
    print("✅ 数据已保存为「豆瓣电影Top250.csv」")

64
20260331/text3.py Normal file
View File

@@ -0,0 +1,64 @@
import requests
from bs4 import BeautifulSoup
import json
import time
# Request headers that mimic a desktop Chrome browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}


def parse_bd_info(info_text):
    """Split an item's info paragraph into (director, year, area, genre).

    Missing fields fall back to "未知" instead of raising IndexError —
    unguarded indexing would otherwise let one malformed entry abort
    (and discard) the whole page inside the per-page try.
    """
    lines = [ln.strip() for ln in info_text.split("\n") if ln.strip()]
    director = year = area = genre = "未知"
    # First line: director between the "导演:" and "主演:" labels.
    if lines and "导演:" in lines[0]:
        director = lines[0].split("导演:")[1].split("主演:")[0].strip()
    # Second line: "year / area / genre", not always all three parts.
    if len(lines) >= 2:
        parts = lines[1].split("/")
        year = parts[0].strip()
        if len(parts) > 1:
            area = parts[1].strip()
        if len(parts) > 2:
            genre = parts[2].strip()
    return director, year, area, genre


def scrape_top250():
    """Fetch 10 pages x 25 movies and return a list of record dicts.

    Failed pages are reported and skipped so one bad response does not
    abort the whole run.
    """
    movies_data = []
    for page in range(10):
        start = page * 25
        url = f"https://movie.douban.com/top250?start={start}&filter="
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # HTTP 4xx/5xx become exceptions
            soup = BeautifulSoup(response.text, "html.parser")
            for item in soup.find_all("div", class_="item"):
                info = item.find("div", class_="bd").p.text.strip()
                director, year, area, genre = parse_bd_info(info)
                movies_data.append({
                    "排名": item.find("em").text,
                    "片名": item.find("span", class_="title").text,
                    "导演": director,
                    "上映年份": year,
                    "地区": area,
                    "类型": genre,
                    "评分": item.find("span", class_="rating_num").text,
                })
            print(f"第 {page+1} 页爬取完成")
            time.sleep(1)  # 1-second delay between pages to avoid bans
        except Exception as e:  # per-page boundary: report and move on
            print(f"第 {page+1} 页爬取失败:{str(e)}")
    return movies_data


def save_as_json(movies, path="豆瓣电影Top250.json"):
    """Dump records as pretty-printed JSON.

    ensure_ascii=False keeps the Chinese text readable; indent=2 formats
    the output for humans.
    """
    with open(path, "w", encoding="utf-8") as f:
        json.dump(movies, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    save_as_json(scrape_top250())
    print("✅ 数据已保存为「豆瓣电影Top250.json」")