diff --git a/crawl.py b/crawl.py
new file mode 100644
index 0000000..2a71cb1
--- /dev/null
+++ b/crawl.py
@@ -0,0 +1,48 @@
+"""Scrape ranks 1-50 of the Douban Top 250 list into movies.json."""
+import json
+
+import requests
+from bs4 import BeautifulSoup
+
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+}
+movie_data = []
+# Offsets 0 and 25 are the first two result pages.
+for start_offset in [0, 25]:
+    url = f"https://movie.douban.com/top250?start={start_offset}"
+    # Bound the wait, and fail loudly on an HTTP error status rather than
+    # parsing an error page into zero movies.
+    resp = requests.get(url, headers=headers, timeout=10)
+    resp.raise_for_status()
+    soup = BeautifulSoup(resp.text, "html.parser")
+    item_list = soup.find_all("div", class_="item")
+    for index, item in enumerate(item_list):
+        rank = start_offset + index + 1
+        title = item.find("span", class_="title").get_text(strip=True)
+        # Join text fragments with "\n": with the default empty separator
+        # the credits line and the year/country/genre line were glued
+        # together, so split("\n") never split and "actors" ended up as
+        # e.g. "... Tim Robbins /...1994 / 美国 / 犯罪 剧情".
+        info_str = item.find("div", class_="bd").p.get_text("\n", strip=True)
+        credits_line = info_str.split("\n")[0]
+        if "主演:" in credits_line:
+            actors = credits_line.split("主演:")[-1].strip()
+        else:
+            actors = "无"
+        # NOTE(review): every row of the committed movies.json has an empty
+        # quote, so span.inq evidently matches nothing on the live page.
+        # Falling back to p.quote assumes the quote is still wrapped in that
+        # paragraph - confirm against the current markup.
+        quote_tag = item.select_one("span.inq, p.quote")
+        quote = quote_tag.get_text(strip=True) if quote_tag else ""
+        movie_data.append({
+            "rank": rank,
+            "title": title,
+            "actors": actors,
+            "quote": quote
+        })
+with open("movies.json", "w", encoding="utf-8") as f:
+    json.dump(movie_data, f, ensure_ascii=False, indent=2)
+
+print(f"爬取完成,共{len(movie_data)}部电影,已生成 movies.json")
\ No newline at end of file
diff --git a/data_clean.py b/data_clean.py
new file mode 100644
index 0000000..ed84e8a
--- /dev/null
+++ b/data_clean.py
@@ -0,0 +1,26 @@
+import json
+
+# Load the raw crawler output.
+with open("movies.json", "r", encoding="utf-8") as f:
+    raw_movies = json.load(f)
+
+# Drop samples whose quote is empty or whitespace-only.
+valid_movies = [movie for movie in raw_movies if movie["quote"].strip()]
+
+# 1. Write the LabelStudio import file quotes_processed.txt (one JSON object per line).
+with open("quotes_processed.txt", "w", encoding="utf-8") as out_f:
+    for item in valid_movies:
+        line = json.dumps({"text": item["quote"]}, ensure_ascii=False)
+        out_f.write(line + "\n")
+
+# 2. Write the statistics file process_stats.json.
+stats = {
+    "原始总样本": len(raw_movies),
+    "过滤后有效样本": len(valid_movies),
+    "过滤掉空短评样本": len(raw_movies) - len(valid_movies)
+}
+with open("process_stats.json", "w", encoding="utf-8") as f:
+    json.dump(stats, f, ensure_ascii=False, indent=2)
+
+print("数据清洗完成,已生成 quotes_processed.txt、process_stats.json")
+print(f"统计信息:{stats}")
\ No newline at end of file
diff --git a/movies.json b/movies.json
new file mode 100644
index 0000000..c1c015b
--- /dev/null
+++ b/movies.json
@@ -0,0 +1,302 @@
+[
+  {
+    "rank": 1,
+    "title": "肖申克的救赎",
+    "actors": "蒂姆·罗宾斯 Tim Robbins /...1994 / 美国 / 犯罪 剧情",
+    "quote": ""
+  },
+  {
+    "rank": 2,
+    "title": "霸王别姬",
+    "actors": "张国荣 Leslie Cheung / 张丰毅 Fengyi Zha...1993 / 中国大陆 中国香港 / 剧情 爱情 同性",
+    "quote": ""
+  },
+  {
+    "rank": 3,
+    "title": "泰坦尼克号",
+    "actors": "莱昂纳多·迪卡普里奥 Leonardo...1997 / 美国 / 剧情 爱情 灾难",
+    "quote": ""
+  },
+  {
+    "rank": 4,
+    "title": "阿甘正传",
+    "actors": "汤姆·汉克斯 Tom Hanks / ...1994 / 美国 / 剧情 爱情",
+    "quote": ""
+  },
+  {
+    "rank": 5,
+    "title": "千与千寻",
+    "actors": "柊瑠美 Rumi Hîragi / 入野自由 Miy...2001 / 日本 / 剧情 动画 奇幻",
+    "quote": ""
+  },
+  {
+    "rank": 6,
+    "title": "美丽人生",
+    "actors": "罗伯托·贝尼尼 Roberto Beni...1997 / 意大利 / 剧情 喜剧 爱情 战争",
+    "quote": ""
+  },
+  {
+    "rank": 7,
+    "title": "星际穿越",
+    "actors": "马修·麦康纳 Matthew Mc...2014 / 美国 英国 加拿大 / 剧情 科幻 冒险",
+    "quote": ""
+  },
+  {
+    "rank": 8,
+    "title": "这个杀手不太冷",
+    "actors": "让·雷诺 Jean Reno / 娜塔莉·波特曼 ...1994 / 法国 美国 / 剧情 动作 犯罪",
+    "quote": ""
+  },
+  {
+    "rank": 9,
+    "title": "盗梦空间",
+    "actors": "莱昂纳多·迪卡普里奥 Le...2010 / 美国 英国 / 剧情 科幻 悬疑 冒险",
+    "quote": ""
+  },
+  {
+    "rank": 10,
+    "title": "楚门的世界",
+    "actors": "金·凯瑞 Jim Carrey / 劳拉·琳妮 Lau...1998 / 美国 / 剧情 科幻",
+    "quote": ""
+  },
+  {
+    "rank": 11,
+    "title": "辛德勒的名单",
+    "actors": "连姆·尼森 Liam Neeson...1993 / 美国 / 剧情 历史 战争",
+    "quote": ""
+  },
+  {
+    "rank": 12,
+    "title": "忠犬八公的故事",
+    "actors": "理查·基尔 Richard Ger...2009 / 美国 英国 / 剧情",
+    "quote": ""
+  },
+  {
+    "rank": 13,
+    "title": "海上钢琴师",
+    "actors": "蒂姆·罗斯 Tim Roth / 
...1998 / 意大利 / 剧情 音乐", + "quote": "" + }, + { + "rank": 14, + "title": "疯狂动物城", + "actors": "金妮弗·...2016 / 美国 / 喜剧 动画 冒险", + "quote": "" + }, + { + "rank": 15, + "title": "三傻大闹宝莱坞", + "actors": "阿米尔·汗 Aamir Khan / 卡...2009 / 印度 / 剧情 喜剧 爱情 歌舞", + "quote": "" + }, + { + "rank": 16, + "title": "机器人总动员", + "actors": "本·贝尔特 Ben Burtt / 艾丽...2008 / 美国 / 科幻 动画 冒险", + "quote": "" + }, + { + "rank": 17, + "title": "放牛班的春天", + "actors": "让-巴蒂斯特·莫尼...2004 / 法国 瑞士 德国 / 剧情 音乐", + "quote": "" + }, + { + "rank": 18, + "title": "无间道", + "actors": "刘德华 Andy Lau / 梁朝伟 Tony Leung Chiu W...2002 / 中国香港 / 剧情 犯罪 惊悚", + "quote": "" + }, + { + "rank": 19, + "title": "控方证人", + "actors": "泰隆·鲍华 Tyrone Power / 玛琳·...1957 / 美国 / 剧情 犯罪 悬疑 惊悚", + "quote": "" + }, + { + "rank": 20, + "title": "寻梦环游记", + "actors": "...2017 / 美国 / 喜剧 动画 奇幻 音乐", + "quote": "" + }, + { + "rank": 21, + "title": "大话西游之大圣娶亲", + "actors": "周星驰 Stephen Chow / 吴孟达 Man Tat Ng...1995 / 中国香港 中国大陆 / 喜剧 爱情 奇幻 古装", + "quote": "" + }, + { + "rank": 22, + "title": "熔炉", + "actors": "孔侑 Yoo Gong / 郑有美 Yu-mi Jung /...2011 / 韩国 / 剧情", + "quote": "" + }, + { + "rank": 23, + "title": "触不可及", + "actors": "无", + "quote": "" + }, + { + "rank": 24, + "title": "教父", + "actors": "马龙·白兰度 M...1972 / 美国 / 剧情 犯罪", + "quote": "" + }, + { + "rank": 25, + "title": "末代皇帝", + "actors": "尊龙 John Lone / 陈...1987 / 英国 意大利 中国大陆 法国 / 剧情 传记 历史", + "quote": "" + }, + { + "rank": 26, + "title": "哈利·波特与魔法石", + "actors": "Daniel Radcliffe / Emma Watson / Rupert Grint2001 / 美国 英国 / 奇幻 冒险", + "quote": "" + }, + { + "rank": 27, + "title": "当幸福来敲门", + "actors": "威尔·史密斯 Will Smith ...2006 / 美国 / 剧情 传记 家庭", + "quote": "" + }, + { + "rank": 28, + "title": "龙猫", + "actors": "日高法子 Noriko Hidaka / 坂本千夏 Ch...1988 / 日本 / 动画 奇幻 冒险", + "quote": "" + }, + { + "rank": 29, + "title": "活着", + "actors": "葛优 You Ge / 巩俐 Li Gong / 姜武 Wu Jiang1994 / 中国大陆 中国香港 / 剧情 历史 家庭", + "quote": "" + }, + { + "rank": 30, + "title": "怦然心动", + "actors": "玛德琳·卡罗尔 Madeline Carroll / 卡...2010 / 美国 
/ 剧情 喜剧 爱情", + "quote": "" + }, + { + "rank": 31, + "title": "蝙蝠侠:黑暗骑士", + "actors": "克里斯蒂安·贝尔 Christ...2008 / 美国 英国 / 剧情 动作 科幻 犯罪 惊悚", + "quote": "" + }, + { + "rank": 32, + "title": "指环王3:王者无敌", + "actors": "伊利亚·伍德 Elijah Wood / 西恩...2003 / 美国 新西兰 / 剧情 动作 奇幻 冒险", + "quote": "" + }, + { + "rank": 33, + "title": "我不是药神", + "actors": "徐峥 Zheng Xu / 王传君 Chuanjun Wang / 周...2018 / 中国大陆 / 剧情 喜剧", + "quote": "" + }, + { + "rank": 34, + "title": "乱世佳人", + "actors": "费...1939 / 美国 / 剧情 历史 爱情 战争", + "quote": "" + }, + { + "rank": 35, + "title": "让子弹飞", + "actors": "姜文 Wen Jiang / 葛优 You Ge / 周润发 Yun-F...2010 / 中国大陆 中国香港 / 剧情 喜剧 动作 西部", + "quote": "" + }, + { + "rank": 36, + "title": "飞屋环游记", + "actors": "爱德...2009 / 美国 / 剧情 喜剧 动画 冒险", + "quote": "" + }, + { + "rank": 37, + "title": "哈尔的移动城堡", + "actors": "倍赏千惠子 Chieko Baishô / 木村拓...2004 / 日本 / 爱情 动画 奇幻 冒险", + "quote": "" + }, + { + "rank": 38, + "title": "十二怒汉", + "actors": "亨利·方达 Henry Fonda / 马丁...1957 / 美国 / 剧情", + "quote": "" + }, + { + "rank": 39, + "title": "海蒂和爷爷", + "actors": "阿努克·斯特芬 Anuk Steffen /...2015 / 德国 瑞士 / 剧情 冒险 家庭", + "quote": "" + }, + { + "rank": 40, + "title": "素媛", + "actors": "薛景求 Kyung-gu Sol / 严志媛 Ji-won Uhm ...2013 / 韩国 / 剧情", + "quote": "" + }, + { + "rank": 41, + "title": "猫鼠游戏", + "actors": "莱昂纳多·迪卡普里奥 L...2002 / 美国 加拿大 / 传记 犯罪 剧情", + "quote": "" + }, + { + "rank": 42, + "title": "天空之城", + "actors": "田中真弓 Mayumi Tanaka / 横泽启子 Ke...1986 / 日本 / 动画 奇幻 冒险", + "quote": "" + }, + { + "rank": 43, + "title": "鬼子来了", + "actors": "姜文 Wen Jiang / 香川照之 Teruyuki Kagawa /...2000 / 中国大陆 / 剧情 喜剧", + "quote": "" + }, + { + "rank": 44, + "title": "摔跤吧!爸爸", + "actors": "阿米尔·汗 Aamir Khan / 法缇玛...2016 / 印度 / 剧情 传记 运动 家庭", + "quote": "" + }, + { + "rank": 45, + "title": "少年派的奇幻漂流", + "actors": "苏拉·沙玛 Suraj Sharma / 伊尔凡·可汗 Irrfan...2012 / 美国 中国台湾 英国 加拿大 / 剧情 奇幻 冒险", + "quote": "" + }, + { + "rank": 46, + "title": "钢琴家", + "actors": "艾德里安·布洛迪 Adrien Brod...2002 / 英国 法国 波兰 德国 美国 / 剧情 传记 战争 音乐", + "quote": "" + }, + { 
+    "rank": 47,
+    "title": "死亡诗社",
+    "actors": "罗宾·威廉姆斯 Robin Williams / 罗伯...1989 / 美国 / 剧情",
+    "quote": ""
+  },
+  {
+    "rank": 48,
+    "title": "指环王2:双塔奇兵",
+    "actors": "伊利亚·伍德 Elijah Wood / 西恩...2002 / 美国 新西兰 / 剧情 动作 奇幻 冒险",
+    "quote": ""
+  },
+  {
+    "rank": 49,
+    "title": "大话西游之月光宝盒",
+    "actors": "周星驰 Stephen Chow / 吴孟达 Man Tat Ng...1995 / 中国香港 中国大陆 / 喜剧 爱情 奇幻 古装",
+    "quote": ""
+  },
+  {
+    "rank": 50,
+    "title": "绿皮书",
+    "actors": "维果·莫腾森 Viggo Mortensen /...2018 / 美国 中国大陆 / 剧情 喜剧 传记 音乐",
+    "quote": ""
+  }
+]
\ No newline at end of file
diff --git a/train_mlp.py b/train_mlp.py
new file mode 100644
index 0000000..507815c
--- /dev/null
+++ b/train_mlp.py
@@ -0,0 +1,60 @@
+"""Train an MLP on TF-IDF features of my_labels.csv and log per-epoch metrics."""
+import csv
+
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics import precision_score
+from sklearn.model_selection import train_test_split
+from sklearn.neural_network import MLPClassifier
+
+# 1. Load the labelled data ("text" and "label" columns).
+df = pd.read_csv("my_labels.csv")
+texts = df["text"].tolist()
+labels = df["label"].tolist()
+y = np.array(labels)
+
+# 2. Split the raw texts first so the vectorizer is fitted on training rows
+#    only; fitting on everything leaks validation vocabulary/IDF statistics.
+#    Same test_size/random_state as before, so the row split is unchanged.
+texts_train, texts_val, y_train, y_val = train_test_split(
+    texts, y, test_size=0.2, random_state=42
+)
+
+# 3. TF-IDF features.
+# NOTE(review): assumes the texts are unsegmented Chinese sentences
+# (presumably the Douban quotes); the default word analyzer splits only on
+# whitespace/punctuation and would turn each sentence into one unique token,
+# so character 1-2 grams are used instead - confirm against my_labels.csv.
+tfidf = TfidfVectorizer(analyzer="char", ngram_range=(1, 2))
+X_train = tfidf.fit_transform(texts_train)
+X_val = tfidf.transform(texts_val)
+
+# 4. Train the MLP one partial_fit call per epoch, recording loss each time.
+mlp = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=100, random_state=42)
+classes = np.unique(y)
+train_loss_list = []
+val_precision_list = []
+
+for epoch in range(1, mlp.max_iter + 1):
+    mlp.partial_fit(X_train, y_train, classes=classes)
+    # Training loss after this pass.
+    train_loss_list.append({"epoch": epoch, "loss": mlp.loss_})
+    # Macro precision on the validation split.
+    y_pred_val = mlp.predict(X_val)
+    val_prec = precision_score(y_val, y_pred_val, average="macro", zero_division=0)
+    val_precision_list.append({"epoch": epoch, "precision": val_prec})
+
+# Save loss.csv
+with open("loss.csv", "w", newline="", encoding="utf-8") as f:
+    writer = csv.DictWriter(f, fieldnames=["epoch", "loss"])
+    writer.writeheader()
+    writer.writerows(train_loss_list)
+
+# Save predictions.csv (it holds per-epoch validation precision, not predictions).
+with open("predictions.csv", "w", newline="", encoding="utf-8") as f:
+    writer = csv.DictWriter(f, fieldnames=["epoch", "precision"])
+    writer.writeheader()
+    writer.writerows(val_precision_list)
+
+print("模型训练完成,已输出 loss.csv、predictions.csv")
\ No newline at end of file
diff --git a/visual_plot.py b/visual_plot.py
new file mode 100644
index 0000000..7161f3f
--- /dev/null
+++ b/visual_plot.py
@@ -0,0 +1,48 @@
+"""Plot the training-loss curve and the per-category sample counts."""
+import os
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+
+plt.rcParams["font.sans-serif"] = ["SimHei"]
+plt.rcParams["axes.unicode_minus"] = False
+
+# ---------------------- Figure 1: training loss curve ----------------------
+loss_df = pd.read_csv("loss.csv")
+plt.figure(figsize=(10, 4))
+plt.plot(loss_df["epoch"], loss_df["loss"], color="#e74c3c", linewidth=2, label="训练Loss")
+plt.xlabel("Epoch 训练轮次")
+plt.ylabel("Loss 损失值")
+plt.title("MLP训练Loss变化曲线")
+plt.legend()
+plt.grid(alpha=0.3)
+plt.tight_layout()
+
+# Create the images folder; exist_ok replaces the exists()/mkdir() pair.
+os.makedirs("images", exist_ok=True)
+plt.savefig("images/loss_curve.png", dpi=300)
+plt.close()
+
+# ---------------------- Figure 2: sample count per category ----------------------
+# NOTE(review): "记录" may be a typo for "纪录" (documentary) - left unchanged.
+cate_names = ["剧情", "喜剧", "科幻", "悬疑", "动作", "爱情", "动画", "犯罪", "奇幻", "记录"]
+label_df = pd.read_csv("my_labels.csv")
+# Reindex onto every category id so an unused category shows as 0 instead of
+# shortening the array (which made plt.bar fail on mismatched lengths and
+# could pair counts with the wrong names).
+# NOTE(review): assumes labels are the integer ids 0-9 - confirm.
+cate_ids = range(len(cate_names))
+cate_count = label_df["label"].value_counts().reindex(cate_ids, fill_value=0)
+
+plt.figure(figsize=(10, 4))
+plt.bar(cate_ids, cate_count.values, color="#3498db")
+plt.xlabel("类别编号")
+plt.ylabel("样本数量")
+plt.title("10个电影类别样本分布柱状图")
+plt.xticks(cate_ids, cate_names, rotation=30)
+plt.grid(axis="y", alpha=0.3)
+plt.tight_layout()
+plt.savefig("images/category_bar.png", dpi=300)
+plt.close()
+
+print("可视化绘图完成,图片保存在 images/ 文件夹")
\ No newline at end of file