上传文件至 /

This commit is contained in:
2026-06-09 11:18:50 +08:00
parent 8d04e62a3e
commit 6eccc27f2e
5 changed files with 450 additions and 0 deletions

32
crawl.py Normal file
View File

@@ -0,0 +1,32 @@
import requests
from bs4 import BeautifulSoup
import json
# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}


def extract_actors(info_text):
    """Return the cast portion of a movie's info paragraph.

    Args:
        info_text: Text of the info paragraph, with its visual lines
            separated by "\n".

    Returns:
        Whatever follows the last "主演:" marker up to the end of that line,
        stripped of surrounding whitespace, or "" when the marker is absent.
    """
    if "主演:" not in info_text:
        return ""
    return info_text.split("主演:")[-1].split("\n")[0].strip()


def parse_movies(html, start_offset):
    """Parse one list page into movie records.

    Args:
        html: Page markup as a string.
        start_offset: The ``start`` query value the page was fetched with;
            ranks are numbered from ``start_offset + 1``.

    Returns:
        A list of dicts with the keys "rank", "title", "actors" and "quote".
    """
    soup = BeautifulSoup(html, "html.parser")
    movies = []
    for index, item in enumerate(soup.find_all("div", class_="item")):
        title = item.find("span", class_="title").get_text(strip=True)
        # Join text nodes with "\n": with strip=True alone they are joined
        # with no separator at all, so the line following the cast line was
        # glued onto it and ended up inside "actors".
        info_str = item.find("div", class_="bd").p.get_text("\n", strip=True)
        quote_tag = item.find("span", class_="inq")
        movies.append({
            "rank": start_offset + index + 1,
            "title": title,
            "actors": extract_actors(info_str),
            "quote": quote_tag.get_text(strip=True) if quote_tag else "",
        })
    return movies


def main():
    """Fetch the pages at offsets 0 and 25 and write movies.json."""
    movie_data = []
    for start_offset in [0, 25]:
        url = f"https://movie.douban.com/top250?start={start_offset}"
        # A timeout keeps the script from hanging forever on a stalled
        # connection; raise_for_status stops an error page from being
        # parsed into an empty result set.
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
        movie_data.extend(parse_movies(resp.text, start_offset))
    with open("movies.json", "w", encoding="utf-8") as f:
        json.dump(movie_data, f, ensure_ascii=False, indent=2)
    print(f"爬取完成,共{len(movie_data)}部电影,已生成 movies.json")


if __name__ == "__main__":
    main()

26
data_clean.py Normal file
View File

@@ -0,0 +1,26 @@
import json
# Load the raw records written by the crawler.
with open("movies.json", "r", encoding="utf-8") as src:
    all_records = json.load(src)

# Keep only the records whose quote is non-empty after stripping whitespace.
kept_records = []
for record in all_records:
    if record["quote"].strip():
        kept_records.append(record)

# 1. Write the LabelStudio import file quotes_processed.txt:
#    one {"text": <quote>} JSON object per line.
with open("quotes_processed.txt", "w", encoding="utf-8") as sink:
    sink.writelines(
        json.dumps({"text": record["quote"]}, ensure_ascii=False) + "\n"
        for record in kept_records
    )

# 2. Write the summary counts to process_stats.json.
total_count = len(all_records)
kept_count = len(kept_records)
summary = {
    "原始总样本": total_count,
    "过滤后有效样本": kept_count,
    "过滤掉空短评样本": total_count - kept_count,
}
with open("process_stats.json", "w", encoding="utf-8") as sink:
    json.dump(summary, sink, ensure_ascii=False, indent=2)

print("数据清洗完成,已生成 quotes_processed.txt、process_stats.json")
print(f"统计信息:{summary}")

302
movies.json Normal file
View File

@@ -0,0 +1,302 @@
[
{
"rank": 1,
"title": "肖申克的救赎",
"actors": "蒂姆·罗宾斯 Tim Robbins /...1994 / 美国 / 犯罪 剧情",
"quote": ""
},
{
"rank": 2,
"title": "霸王别姬",
"actors": "张国荣 Leslie Cheung / 张丰毅 Fengyi Zha...1993 / 中国大陆 中国香港 / 剧情 爱情 同性",
"quote": ""
},
{
"rank": 3,
"title": "泰坦尼克号",
"actors": "莱昂纳多·迪卡普里奥 Leonardo...1997 / 美国 / 剧情 爱情 灾难",
"quote": ""
},
{
"rank": 4,
"title": "阿甘正传",
"actors": "汤姆·汉克斯 Tom Hanks / ...1994 / 美国 / 剧情 爱情",
"quote": ""
},
{
"rank": 5,
"title": "千与千寻",
"actors": "柊瑠美 Rumi Hîragi / 入野自由 Miy...2001 / 日本 / 剧情 动画 奇幻",
"quote": ""
},
{
"rank": 6,
"title": "美丽人生",
"actors": "罗伯托·贝尼尼 Roberto Beni...1997 / 意大利 / 剧情 喜剧 爱情 战争",
"quote": ""
},
{
"rank": 7,
"title": "星际穿越",
"actors": "马修·麦康纳 Matthew Mc...2014 / 美国 英国 加拿大 / 剧情 科幻 冒险",
"quote": ""
},
{
"rank": 8,
"title": "这个杀手不太冷",
"actors": "让·雷诺 Jean Reno / 娜塔莉·波特曼 ...1994 / 法国 美国 / 剧情 动作 犯罪",
"quote": ""
},
{
"rank": 9,
"title": "盗梦空间",
"actors": "莱昂纳多·迪卡普里奥 Le...2010 / 美国 英国 / 剧情 科幻 悬疑 冒险",
"quote": ""
},
{
"rank": 10,
"title": "楚门的世界",
"actors": "金·凯瑞 Jim Carrey / 劳拉·琳妮 Lau...1998 / 美国 / 剧情 科幻",
"quote": ""
},
{
"rank": 11,
"title": "辛德勒的名单",
"actors": "连姆·尼森 Liam Neeson...1993 / 美国 / 剧情 历史 战争",
"quote": ""
},
{
"rank": 12,
"title": "忠犬八公的故事",
"actors": "理查·基尔 Richard Ger...2009 / 美国 英国 / 剧情",
"quote": ""
},
{
"rank": 13,
"title": "海上钢琴师",
"actors": "蒂姆·罗斯 Tim Roth / ...1998 / 意大利 / 剧情 音乐",
"quote": ""
},
{
"rank": 14,
"title": "疯狂动物城",
"actors": "金妮弗·...2016 / 美国 / 喜剧 动画 冒险",
"quote": ""
},
{
"rank": 15,
"title": "三傻大闹宝莱坞",
"actors": "阿米尔·汗 Aamir Khan / 卡...2009 / 印度 / 剧情 喜剧 爱情 歌舞",
"quote": ""
},
{
"rank": 16,
"title": "机器人总动员",
"actors": "本·贝尔特 Ben Burtt / 艾丽...2008 / 美国 / 科幻 动画 冒险",
"quote": ""
},
{
"rank": 17,
"title": "放牛班的春天",
"actors": "让-巴蒂斯特·莫尼...2004 / 法国 瑞士 德国 / 剧情 音乐",
"quote": ""
},
{
"rank": 18,
"title": "无间道",
"actors": "刘德华 Andy Lau / 梁朝伟 Tony Leung Chiu W...2002 / 中国香港 / 剧情 犯罪 惊悚",
"quote": ""
},
{
"rank": 19,
"title": "控方证人",
"actors": "泰隆·鲍华 Tyrone Power / 玛琳·...1957 / 美国 / 剧情 犯罪 悬疑 惊悚",
"quote": ""
},
{
"rank": 20,
"title": "寻梦环游记",
"actors": "...2017 / 美国 / 喜剧 动画 奇幻 音乐",
"quote": ""
},
{
"rank": 21,
"title": "大话西游之大圣娶亲",
"actors": "周星驰 Stephen Chow / 吴孟达 Man Tat Ng...1995 / 中国香港 中国大陆 / 喜剧 爱情 奇幻 古装",
"quote": ""
},
{
"rank": 22,
"title": "熔炉",
"actors": "孔侑 Yoo Gong / 郑有美 Yu-mi Jung /...2011 / 韩国 / 剧情",
"quote": ""
},
{
"rank": 23,
"title": "触不可及",
"actors": "无",
"quote": ""
},
{
"rank": 24,
"title": "教父",
"actors": "马龙·白兰度 M...1972 / 美国 / 剧情 犯罪",
"quote": ""
},
{
"rank": 25,
"title": "末代皇帝",
"actors": "尊龙 John Lone / 陈...1987 / 英国 意大利 中国大陆 法国 / 剧情 传记 历史",
"quote": ""
},
{
"rank": 26,
"title": "哈利·波特与魔法石",
"actors": "Daniel Radcliffe / Emma Watson / Rupert Grint2001 / 美国 英国 / 奇幻 冒险",
"quote": ""
},
{
"rank": 27,
"title": "当幸福来敲门",
"actors": "威尔·史密斯 Will Smith ...2006 / 美国 / 剧情 传记 家庭",
"quote": ""
},
{
"rank": 28,
"title": "龙猫",
"actors": "日高法子 Noriko Hidaka / 坂本千夏 Ch...1988 / 日本 / 动画 奇幻 冒险",
"quote": ""
},
{
"rank": 29,
"title": "活着",
"actors": "葛优 You Ge / 巩俐 Li Gong / 姜武 Wu Jiang1994 / 中国大陆 中国香港 / 剧情 历史 家庭",
"quote": ""
},
{
"rank": 30,
"title": "怦然心动",
"actors": "玛德琳·卡罗尔 Madeline Carroll / 卡...2010 / 美国 / 剧情 喜剧 爱情",
"quote": ""
},
{
"rank": 31,
"title": "蝙蝠侠:黑暗骑士",
"actors": "克里斯蒂安·贝尔 Christ...2008 / 美国 英国 / 剧情 动作 科幻 犯罪 惊悚",
"quote": ""
},
{
"rank": 32,
"title": "指环王3王者无敌",
"actors": "伊利亚·伍德 Elijah Wood / 西恩...2003 / 美国 新西兰 / 剧情 动作 奇幻 冒险",
"quote": ""
},
{
"rank": 33,
"title": "我不是药神",
"actors": "徐峥 Zheng Xu / 王传君 Chuanjun Wang / 周...2018 / 中国大陆 / 剧情 喜剧",
"quote": ""
},
{
"rank": 34,
"title": "乱世佳人",
"actors": "费...1939 / 美国 / 剧情 历史 爱情 战争",
"quote": ""
},
{
"rank": 35,
"title": "让子弹飞",
"actors": "姜文 Wen Jiang / 葛优 You Ge / 周润发 Yun-F...2010 / 中国大陆 中国香港 / 剧情 喜剧 动作 西部",
"quote": ""
},
{
"rank": 36,
"title": "飞屋环游记",
"actors": "爱德...2009 / 美国 / 剧情 喜剧 动画 冒险",
"quote": ""
},
{
"rank": 37,
"title": "哈尔的移动城堡",
"actors": "倍赏千惠子 Chieko Baishô / 木村拓...2004 / 日本 / 爱情 动画 奇幻 冒险",
"quote": ""
},
{
"rank": 38,
"title": "十二怒汉",
"actors": "亨利·方达 Henry Fonda / 马丁...1957 / 美国 / 剧情",
"quote": ""
},
{
"rank": 39,
"title": "海蒂和爷爷",
"actors": "阿努克·斯特芬 Anuk Steffen /...2015 / 德国 瑞士 / 剧情 冒险 家庭",
"quote": ""
},
{
"rank": 40,
"title": "素媛",
"actors": "薛景求 Kyung-gu Sol / 严志媛 Ji-won Uhm ...2013 / 韩国 / 剧情",
"quote": ""
},
{
"rank": 41,
"title": "猫鼠游戏",
"actors": "莱昂纳多·迪卡普里奥 L...2002 / 美国 加拿大 / 传记 犯罪 剧情",
"quote": ""
},
{
"rank": 42,
"title": "天空之城",
"actors": "田中真弓 Mayumi Tanaka / 横泽启子 Ke...1986 / 日本 / 动画 奇幻 冒险",
"quote": ""
},
{
"rank": 43,
"title": "鬼子来了",
"actors": "姜文 Wen Jiang / 香川照之 Teruyuki Kagawa /...2000 / 中国大陆 / 剧情 喜剧",
"quote": ""
},
{
"rank": 44,
"title": "摔跤吧!爸爸",
"actors": "阿米尔·汗 Aamir Khan / 法缇玛...2016 / 印度 / 剧情 传记 运动 家庭",
"quote": ""
},
{
"rank": 45,
"title": "少年派的奇幻漂流",
"actors": "苏拉·沙玛 Suraj Sharma / 伊尔凡·可汗 Irrfan...2012 / 美国 中国台湾 英国 加拿大 / 剧情 奇幻 冒险",
"quote": ""
},
{
"rank": 46,
"title": "钢琴家",
"actors": "艾德里安·布洛迪 Adrien Brod...2002 / 英国 法国 波兰 德国 美国 / 剧情 传记 战争 音乐",
"quote": ""
},
{
"rank": 47,
"title": "死亡诗社",
"actors": "罗宾·威廉姆斯 Robin Williams / 罗伯...1989 / 美国 / 剧情",
"quote": ""
},
{
"rank": 48,
"title": "指环王2双塔奇兵",
"actors": "伊利亚·伍德 Elijah Wood / 西恩...2002 / 美国 新西兰 / 剧情 动作 奇幻 冒险",
"quote": ""
},
{
"rank": 49,
"title": "大话西游之月光宝盒",
"actors": "周星驰 Stephen Chow / 吴孟达 Man Tat Ng...1995 / 中国香港 中国大陆 / 喜剧 爱情 奇幻 古装",
"quote": ""
},
{
"rank": 50,
"title": "绿皮书",
"actors": "维果·莫腾森 Viggo Mortensen /...2018 / 美国 中国大陆 / 剧情 喜剧 传记 音乐",
"quote": ""
}
]

48
train_mlp.py Normal file
View File

@@ -0,0 +1,48 @@
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score
import csv
# 1. Load the labelled data; the CSV must provide "text" and "label" columns.
df = pd.read_csv("my_labels.csv")
texts = df["text"].tolist()
labels = df["label"].tolist()
y = np.array(labels)

# 2. Split into training and validation sets BEFORE extracting features, so
#    that the TF-IDF vocabulary and IDF weights are learned from the training
#    texts only and nothing about the validation texts leaks into training.
texts_train, texts_val, y_train, y_val = train_test_split(
    texts, y, test_size=0.2, random_state=42
)

# 3. TF-IDF features: fit on the training texts, reuse the fitted vectorizer
#    for the validation texts.
# NOTE(review): the default analyzer splits on word boundaries and does not
# segment unspaced text; if "text" holds Chinese sentences, consider
# analyzer="char" -- confirm against the real data.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(texts_train)
X_val = tfidf.transform(texts_val)

# 4. Train the MLP one partial_fit call per epoch, recording the training loss
#    and the validation precision after each call.
mlp = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=100, random_state=42)
# partial_fit needs the full label set, including labels that may only occur
# in the validation split.
all_classes = np.unique(y)
train_loss_list = []
val_precision_list = []
for epoch in range(1, mlp.max_iter + 1):
    mlp.partial_fit(X_train, y_train, classes=all_classes)
    # Training loss after this epoch.
    train_loss_list.append({"epoch": epoch, "loss": mlp.loss_})
    # Macro-averaged precision on the validation set; zero_division=0 scores
    # classes that were never predicted as 0 instead of emitting a warning.
    y_pred_val = mlp.predict(X_val)
    val_prec = precision_score(y_val, y_pred_val, average="macro", zero_division=0)
    val_precision_list.append({"epoch": epoch, "precision": val_prec})

# Save the per-epoch training loss to loss.csv.
with open("loss.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["epoch", "loss"])
    writer.writeheader()
    writer.writerows(train_loss_list)

# Save the per-epoch validation precision to predictions.csv.
with open("predictions.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["epoch", "precision"])
    writer.writeheader()
    writer.writerows(val_precision_list)

print("模型训练完成,已输出 loss.csv、predictions.csv")

42
visual_plot.py Normal file
View File

@@ -0,0 +1,42 @@
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# Use a CJK-capable font for the Chinese labels and keep the minus sign
# rendering correctly with that font.
plt.rcParams["font.sans-serif"] = ["SimHei"]
plt.rcParams["axes.unicode_minus"] = False

# Both figures are saved under images/; exist_ok makes reruns safe without a
# separate existence check.
os.makedirs("images", exist_ok=True)

# ---------------------- Figure 1: training loss curve ----------------------
loss_df = pd.read_csv("loss.csv")
plt.figure(figsize=(10, 4))
plt.plot(loss_df["epoch"], loss_df["loss"], color="#e74c3c", linewidth=2, label="训练Loss")
plt.xlabel("Epoch 训练轮次")
plt.ylabel("Loss 损失值")
plt.title("MLP训练Loss变化曲线")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig("images/loss_curve.png", dpi=300)
plt.close()

# ------------- Figure 2: sample count per category (bar chart) -------------
label_df = pd.read_csv("my_labels.csv")
cate_names = ["剧情", "喜剧", "科幻", "悬疑", "动作", "爱情", "动画", "犯罪", "奇幻", "记录"]
cate_count = label_df["label"].value_counts().sort_index()
# plt.bar needs exactly one height per x position. value_counts() only lists
# the labels that actually occur, so a category with no samples would leave
# fewer heights than ticks and raise a shape-mismatch error.
# NOTE(review): assumes integer labels are category ids 0..9 in the order of
# cate_names -- confirm against my_labels.csv. Non-integer labels are left
# exactly as before.
if pd.api.types.is_integer_dtype(cate_count.index):
    cate_count = cate_count.reindex(range(len(cate_names)), fill_value=0)
plt.figure(figsize=(10, 4))
plt.bar([str(i) for i in range(len(cate_names))], cate_count.values, color="#3498db")
plt.xlabel("类别编号")
plt.ylabel("样本数量")
plt.title("10个电影类别样本分布柱状图")
plt.xticks(range(len(cate_names)), cate_names, rotation=30)
plt.grid(axis="y", alpha=0.3)
plt.tight_layout()
plt.savefig("images/category_bar.png", dpi=300)
plt.close()

print("可视化绘图完成,图片保存在 images/ 文件夹")