上传文件至 20260331
This commit is contained in:
76
20260331/text1.py
Normal file
76
20260331/text1.py
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
"""Scrape Douban Movie Top 250 (10 pages x 25 entries) and save to a TXT file."""

import requests
from bs4 import BeautifulSoup
import time

# Pretend to be a regular browser so the site does not reject the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Accumulates one dict per movie.
all_movies = []

# Crawl 10 pages, 25 movies per page = 250 movies.
for page in range(10):
    start = page * 25
    url = f"https://movie.douban.com/top250?start={start}&filter="

    print(f"正在爬第 {page+1} 页 …")
    # timeout prevents the script from hanging forever on a stalled connection;
    # raise_for_status surfaces HTTP errors (403/5xx) instead of silently
    # parsing an error page.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    items = soup.find_all("div", class_="item")

    for item in items:
        # Rank & title.
        rank = item.find("em").text
        title = item.find("span", class_="title").text

        # Rating.
        score = item.find("span", class_="rating_num").text

        # Director, year, region, genre all live in one <p> text blob.
        info = item.find("div", class_="bd").p.text.strip()
        lines = [line.strip() for line in info.split("\n") if line.strip()]

        # First non-empty line: director (may be missing for odd entries).
        director_line = lines[0]
        if "导演:" in director_line:
            director = director_line.split("导演:")[1].split("主演:")[0].strip()
        else:
            director = "未知"

        # Second line: year / region / genre — guard each index since the
        # slash-separated fields are not guaranteed to all be present.
        if len(lines) >= 2:
            year_area_genre = lines[1].split("/")
            year = year_area_genre[0].strip() if len(year_area_genre) > 0 else "未知"
            area = year_area_genre[1].strip() if len(year_area_genre) > 1 else "未知"
            genre = year_area_genre[2].strip() if len(year_area_genre) > 2 else "未知"
        else:
            year = area = genre = "未知"

        movie = {
            "排名": rank,
            "片名": title,
            "导演": director,
            "年份": year,
            "地区": area,
            "类型": genre,
            "评分": score
        }
        all_movies.append(movie)

    time.sleep(1)  # throttle one second per page to avoid anti-scraping bans

# ------------------- Write results to TXT -------------------
with open("douban_top250.txt", "w", encoding="utf-8") as f:
    for m in all_movies:
        f.write(f"排名:{m['排名']}\n")
        f.write(f"片名:{m['片名']}\n")
        f.write(f"导演:{m['导演']}\n")
        f.write(f"年份:{m['年份']}\n")
        f.write(f"地区:{m['地区']}\n")
        f.write(f"类型:{m['类型']}\n")
        f.write(f"评分:{m['评分']}\n")
        f.write("-" * 50 + "\n")

print("全部爬完!已保存到 douban_top250.txt")
|
||||||
67
20260331/text2.py
Normal file
67
20260331/text2.py
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
"""Scrape Douban Movie Top 250 (10 pages x 25 entries) and save to a CSV file."""

import requests
from bs4 import BeautifulSoup
import csv
import time

# Request headers: pretend to be a regular browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Accumulates one dict per movie.
movies_data = []

# Crawl 10 pages (25 entries per page, 250 in total).
for page in range(10):
    start = page * 25
    url = f"https://movie.douban.com/top250?start={start}&filter="

    try:
        # Fetch the page; timeout avoids hanging forever, raise_for_status
        # converts HTTP errors into exceptions caught below.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # All movie entries on the current page.
        items = soup.find_all("div", class_="item")
        for item in items:
            # Rank, title and rating.
            rank = item.find("em").text
            title = item.find("span", class_="title").text
            score = item.find("span", class_="rating_num").text

            # Director, year, region, genre all live in one <p> text blob.
            info = item.find("div", class_="bd").p.text.strip().split("\n")
            director_line = info[0].strip()
            # Guard the split: an unguarded [1] raises IndexError on an entry
            # without the "导演:" prefix, which would abort the whole page.
            if "导演:" in director_line:
                director = director_line.split("导演:")[1].split("主演:")[0].strip()
            else:
                director = "未知"

            # Second line: year / region / genre — the slash-separated fields
            # are not guaranteed to all be present, so guard each index.
            year_area_genre = info[1].strip().split("/") if len(info) > 1 else []
            year = year_area_genre[0].strip() if len(year_area_genre) > 0 else "未知"
            area = year_area_genre[1].strip() if len(year_area_genre) > 1 else "未知"
            genre = year_area_genre[2].strip() if len(year_area_genre) > 2 else "未知"

            # Collect the record.
            movies_data.append({
                "排名": rank,
                "片名": title,
                "导演": director,
                "上映年份": year,
                "地区": area,
                "类型": genre,
                "评分": score
            })

        print(f"第 {page+1} 页爬取完成")
    except Exception as e:
        print(f"第 {page+1} 页爬取失败:{str(e)}")
    finally:
        # Throttle one second per page — also after a failed page, so a burst
        # of errors does not hammer the site and trigger anti-scraping bans.
        time.sleep(1)

# Save to CSV; utf-8-sig adds a BOM so Excel opens the Chinese text correctly.
with open("豆瓣电影Top250.csv", "w", encoding="utf-8-sig", newline="") as f:
    # Header row.
    fieldnames = ["排名", "片名", "导演", "上映年份", "地区", "类型", "评分"]
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(movies_data)

print("✅ 数据已保存为「豆瓣电影Top250.csv」")
|
||||||
64
20260331/text3.py
Normal file
64
20260331/text3.py
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
"""Scrape Douban Movie Top 250 (10 pages x 25 entries) and save to a JSON file."""

import requests
from bs4 import BeautifulSoup
import json
import time

# Request headers: pretend to be a regular browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Accumulates one dict per movie.
movies_data = []

# Crawl 10 pages (25 entries per page, 250 in total).
for page in range(10):
    start = page * 25
    url = f"https://movie.douban.com/top250?start={start}&filter="

    try:
        # Fetch the page; timeout avoids hanging forever, raise_for_status
        # converts HTTP errors into exceptions caught below.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # All movie entries on the current page.
        items = soup.find_all("div", class_="item")
        for item in items:
            # Rank, title and rating.
            rank = item.find("em").text
            title = item.find("span", class_="title").text
            score = item.find("span", class_="rating_num").text

            # Director, year, region, genre all live in one <p> text blob.
            info = item.find("div", class_="bd").p.text.strip().split("\n")
            director_line = info[0].strip()
            # Guard the split: an unguarded [1] raises IndexError on an entry
            # without the "导演:" prefix, which would abort the whole page.
            if "导演:" in director_line:
                director = director_line.split("导演:")[1].split("主演:")[0].strip()
            else:
                director = "未知"

            # Second line: year / region / genre — the slash-separated fields
            # are not guaranteed to all be present, so guard each index.
            year_area_genre = info[1].strip().split("/") if len(info) > 1 else []
            year = year_area_genre[0].strip() if len(year_area_genre) > 0 else "未知"
            area = year_area_genre[1].strip() if len(year_area_genre) > 1 else "未知"
            genre = year_area_genre[2].strip() if len(year_area_genre) > 2 else "未知"

            # Collect the record.
            movies_data.append({
                "排名": rank,
                "片名": title,
                "导演": director,
                "上映年份": year,
                "地区": area,
                "类型": genre,
                "评分": score
            })

        print(f"第 {page+1} 页爬取完成")
    except Exception as e:
        print(f"第 {page+1} 页爬取失败:{str(e)}")
    finally:
        # Throttle one second per page — also after a failed page, so a burst
        # of errors does not hammer the site and trigger anti-scraping bans.
        time.sleep(1)

# Save to JSON.
with open("豆瓣电影Top250.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps Chinese characters readable; indent=2 pretty-prints.
    json.dump(movies_data, f, ensure_ascii=False, indent=2)

print("✅ 数据已保存为「豆瓣电影Top250.json」")
|
||||||
Reference in New Issue
Block a user