From ed753b3fd1fb7f75e2b930873332af28047a9dae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BE=99=E5=86=8D=E9=A3=9E?= <2509165044@student.example.com> Date: Tue, 9 Jun 2026 10:45:12 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E4=BB=B6=E8=87=B3?= =?UTF-8?q?=20/?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- movies.json | 302 +++++++++++++++++++++++++++++++++++++++++++ my_labels.csv | 1 + process_stats.json | 5 + quotes_processed.txt | Bin 0 -> 1024 bytes run.py | 246 +++++++++++++++++++++++++++++++++++ 5 files changed, 554 insertions(+) create mode 100644 movies.json create mode 100644 my_labels.csv create mode 100644 process_stats.json create mode 100644 quotes_processed.txt create mode 100644 run.py diff --git a/movies.json b/movies.json new file mode 100644 index 0000000..ac02270 --- /dev/null +++ b/movies.json @@ -0,0 +1,302 @@ +[ + { + "rank": 1, + "title": "肖申克的救赎", + "actors": "蒂姆·罗宾斯 Tim Robbins /...1994 / 美国 / 犯罪 剧情", + "quote": "" + }, + { + "rank": 2, + "title": "霸王别姬", + "actors": "张国荣 Leslie Cheung / 张丰毅 Fengyi Zha...1993 / 中国大陆 中国香港 / 剧情 爱情 同性", + "quote": "" + }, + { + "rank": 3, + "title": "泰坦尼克号", + "actors": "莱昂纳多·迪卡普里奥 Leonardo...1997 / 美国 / 剧情 爱情 灾难", + "quote": "" + }, + { + "rank": 4, + "title": "阿甘正传", + "actors": "汤姆·汉克斯 Tom Hanks / ...1994 / 美国 / 剧情 爱情", + "quote": "" + }, + { + "rank": 5, + "title": "千与千寻", + "actors": "柊瑠美 Rumi Hîragi / 入野自由 Miy...2001 / 日本 / 剧情 动画 奇幻", + "quote": "" + }, + { + "rank": 6, + "title": "美丽人生", + "actors": "罗伯托·贝尼尼 Roberto Beni...1997 / 意大利 / 剧情 喜剧 爱情 战争", + "quote": "" + }, + { + "rank": 7, + "title": "星际穿越", + "actors": "马修·麦康纳 Matthew Mc...2014 / 美国 英国 加拿大 / 剧情 科幻 冒险", + "quote": "" + }, + { + "rank": 8, + "title": "这个杀手不太冷", + "actors": "让·雷诺 Jean Reno / 娜塔莉·波特曼 ...1994 / 法国 美国 / 剧情 动作 犯罪", + "quote": "" + }, + { + "rank": 9, + "title": "盗梦空间", + "actors": "莱昂纳多·迪卡普里奥 Le...2010 / 美国 英国 / 剧情 科幻 悬疑 冒险", + "quote": "" + }, + { + "rank": 10, + 
"title": "楚门的世界", + "actors": "金·凯瑞 Jim Carrey / 劳拉·琳妮 Lau...1998 / 美国 / 剧情 科幻", + "quote": "" + }, + { + "rank": 11, + "title": "辛德勒的名单", + "actors": "连姆·尼森 Liam Neeson...1993 / 美国 / 剧情 历史 战争", + "quote": "" + }, + { + "rank": 12, + "title": "忠犬八公的故事", + "actors": "理查·基尔 Richard Ger...2009 / 美国 英国 / 剧情", + "quote": "" + }, + { + "rank": 13, + "title": "海上钢琴师", + "actors": "蒂姆·罗斯 Tim Roth / ...1998 / 意大利 / 剧情 音乐", + "quote": "" + }, + { + "rank": 14, + "title": "疯狂动物城", + "actors": "金妮弗·...2016 / 美国 / 喜剧 动画 冒险", + "quote": "" + }, + { + "rank": 15, + "title": "三傻大闹宝莱坞", + "actors": "阿米尔·汗 Aamir Khan / 卡...2009 / 印度 / 剧情 喜剧 爱情 歌舞", + "quote": "" + }, + { + "rank": 16, + "title": "机器人总动员", + "actors": "本·贝尔特 Ben Burtt / 艾丽...2008 / 美国 / 科幻 动画 冒险", + "quote": "" + }, + { + "rank": 17, + "title": "放牛班的春天", + "actors": "让-巴蒂斯特·莫尼...2004 / 法国 瑞士 德国 / 剧情 音乐", + "quote": "" + }, + { + "rank": 18, + "title": "无间道", + "actors": "刘德华 Andy Lau / 梁朝伟 Tony Leung Chiu W...2002 / 中国香港 / 剧情 犯罪 惊悚", + "quote": "" + }, + { + "rank": 19, + "title": "控方证人", + "actors": "泰隆·鲍华 Tyrone Power / 玛琳·...1957 / 美国 / 剧情 犯罪 悬疑 惊悚", + "quote": "" + }, + { + "rank": 20, + "title": "寻梦环游记", + "actors": "...2017 / 美国 / 喜剧 动画 奇幻 音乐", + "quote": "" + }, + { + "rank": 21, + "title": "大话西游之大圣娶亲", + "actors": "周星驰 Stephen Chow / 吴孟达 Man Tat Ng...1995 / 中国香港 中国大陆 / 喜剧 爱情 奇幻 古装", + "quote": "" + }, + { + "rank": 22, + "title": "熔炉", + "actors": "孔侑 Yoo Gong / 郑有美 Yu-mi Jung /...2011 / 韩国 / 剧情", + "quote": "" + }, + { + "rank": 23, + "title": "触不可及", + "actors": "", + "quote": "" + }, + { + "rank": 24, + "title": "教父", + "actors": "马龙·白兰度 M...1972 / 美国 / 剧情 犯罪", + "quote": "" + }, + { + "rank": 25, + "title": "末代皇帝", + "actors": "尊龙 John Lone / 陈...1987 / 英国 意大利 中国大陆 法国 / 剧情 传记 历史", + "quote": "" + }, + { + "rank": 26, + "title": "哈利·波特与魔法石", + "actors": "Daniel Radcliffe / Emma Watson / Rupert Grint2001 / 美国 英国 / 奇幻 冒险", + "quote": "" + }, + { + "rank": 27, + "title": "当幸福来敲门", + "actors": "威尔·史密斯 Will 
Smith ...2006 / 美国 / 剧情 传记 家庭", + "quote": "" + }, + { + "rank": 28, + "title": "龙猫", + "actors": "日高法子 Noriko Hidaka / 坂本千夏 Ch...1988 / 日本 / 动画 奇幻 冒险", + "quote": "" + }, + { + "rank": 29, + "title": "活着", + "actors": "葛优 You Ge / 巩俐 Li Gong / 姜武 Wu Jiang1994 / 中国大陆 中国香港 / 剧情 历史 家庭", + "quote": "" + }, + { + "rank": 30, + "title": "怦然心动", + "actors": "玛德琳·卡罗尔 Madeline Carroll / 卡...2010 / 美国 / 剧情 喜剧 爱情", + "quote": "" + }, + { + "rank": 31, + "title": "蝙蝠侠:黑暗骑士", + "actors": "克里斯蒂安·贝尔 Christ...2008 / 美国 英国 / 剧情 动作 科幻 犯罪 惊悚", + "quote": "" + }, + { + "rank": 32, + "title": "指环王3:王者无敌", + "actors": "伊利亚·伍德 Elijah Wood / 西恩...2003 / 美国 新西兰 / 剧情 动作 奇幻 冒险", + "quote": "" + }, + { + "rank": 33, + "title": "我不是药神", + "actors": "徐峥 Zheng Xu / 王传君 Chuanjun Wang / 周...2018 / 中国大陆 / 剧情 喜剧", + "quote": "" + }, + { + "rank": 34, + "title": "乱世佳人", + "actors": "费...1939 / 美国 / 剧情 历史 爱情 战争", + "quote": "" + }, + { + "rank": 35, + "title": "让子弹飞", + "actors": "姜文 Wen Jiang / 葛优 You Ge / 周润发 Yun-F...2010 / 中国大陆 中国香港 / 剧情 喜剧 动作 西部", + "quote": "" + }, + { + "rank": 36, + "title": "飞屋环游记", + "actors": "爱德...2009 / 美国 / 剧情 喜剧 动画 冒险", + "quote": "" + }, + { + "rank": 37, + "title": "哈尔的移动城堡", + "actors": "倍赏千惠子 Chieko Baishô / 木村拓...2004 / 日本 / 爱情 动画 奇幻 冒险", + "quote": "" + }, + { + "rank": 38, + "title": "十二怒汉", + "actors": "亨利·方达 Henry Fonda / 马丁...1957 / 美国 / 剧情", + "quote": "" + }, + { + "rank": 39, + "title": "海蒂和爷爷", + "actors": "阿努克·斯特芬 Anuk Steffen /...2015 / 德国 瑞士 / 剧情 冒险 家庭", + "quote": "" + }, + { + "rank": 40, + "title": "素媛", + "actors": "薛景求 Kyung-gu Sol / 严志媛 Ji-won Uhm ...2013 / 韩国 / 剧情", + "quote": "" + }, + { + "rank": 41, + "title": "猫鼠游戏", + "actors": "莱昂纳多·迪卡普里奥 L...2002 / 美国 加拿大 / 传记 犯罪 剧情", + "quote": "" + }, + { + "rank": 42, + "title": "天空之城", + "actors": "田中真弓 Mayumi Tanaka / 横泽启子 Ke...1986 / 日本 / 动画 奇幻 冒险", + "quote": "" + }, + { + "rank": 43, + "title": "鬼子来了", + "actors": "姜文 Wen Jiang / 香川照之 Teruyuki Kagawa /...2000 / 中国大陆 / 剧情 喜剧", + "quote": "" + }, + { 
+ "rank": 44, + "title": "摔跤吧!爸爸", + "actors": "阿米尔·汗 Aamir Khan / 法缇玛...2016 / 印度 / 剧情 传记 运动 家庭", + "quote": "" + }, + { + "rank": 45, + "title": "少年派的奇幻漂流", + "actors": "苏拉·沙玛 Suraj Sharma / 伊尔凡·可汗 Irrfan...2012 / 美国 中国台湾 英国 加拿大 / 剧情 奇幻 冒险", + "quote": "" + }, + { + "rank": 46, + "title": "钢琴家", + "actors": "艾德里安·布洛迪 Adrien Brod...2002 / 英国 法国 波兰 德国 美国 / 剧情 传记 战争 音乐", + "quote": "" + }, + { + "rank": 47, + "title": "死亡诗社", + "actors": "罗宾·威廉姆斯 Robin Williams / 罗伯...1989 / 美国 / 剧情", + "quote": "" + }, + { + "rank": 48, + "title": "指环王2:双塔奇兵", + "actors": "伊利亚·伍德 Elijah Wood / 西恩...2002 / 美国 新西兰 / 剧情 动作 奇幻 冒险", + "quote": "" + }, + { + "rank": 49, + "title": "大话西游之月光宝盒", + "actors": "周星驰 Stephen Chow / 吴孟达 Man Tat Ng...1995 / 中国香港 中国大陆 / 喜剧 爱情 奇幻 古装", + "quote": "" + }, + { + "rank": 50, + "title": "绿皮书", + "actors": "维果·莫腾森 Viggo Mortensen /...2018 / 美国 中国大陆 / 剧情 喜剧 传记 音乐", + "quote": "" + } +] \ No newline at end of file diff --git a/my_labels.csv b/my_labels.csv new file mode 100644 index 0000000..85c57b2 --- /dev/null +++ b/my_labels.csv @@ -0,0 +1 @@ +text,label diff --git a/process_stats.json b/process_stats.json new file mode 100644 index 0000000..bf41b0d --- /dev/null +++ b/process_stats.json @@ -0,0 +1,5 @@ +{ + "总电影数": 50, + "有效短评数": 0, + "空短评数": 50 +} \ No newline at end of file diff --git a/quotes_processed.txt b/quotes_processed.txt new file mode 100644 index 0000000000000000000000000000000000000000..06d7405020018ddf3cacee90fd4af10487da3d20 GIT binary patch literal 1024 ScmZQz7zLvtFd70QH3R?z00031 literal 0 HcmV?d00001 diff --git a/run.py b/run.py new file mode 100644 index 0000000..6f9cdfc --- /dev/null +++ b/run.py @@ -0,0 +1,246 @@ +# ====================== 1. 
# ====================== 1. Data collection: scrape the first 50 films of Douban Top250 -> movies.json ======================
import requests
from bs4 import BeautifulSoup
import json
import os

# Create the images folder; the plotting step of this script saves into it.
os.makedirs("images", exist_ok=True)

movies = []
rank = 1
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Two pages of 25 entries each give the 50 films.
for page in range(2):
    url = f"https://movie.douban.com/top250?start={page * 25}"
    # A timeout keeps the script from hanging forever, and a non-2xx answer
    # (e.g. an anti-bot block page) is reported immediately instead of being
    # parsed as if it were an empty list of films.
    res = requests.get(url, headers=headers, timeout=10)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    items = soup.find_all("div", class_="item")

    for item in items:
        if rank > 50:
            break
        # Film title
        title = item.find("span", class_="title").get_text(strip=True)
        # Leading actors. The text nodes are joined with "\n" so that the
        # split on "\n" below really cuts off whatever follows the cast line.
        # With strip=True and no separator the nodes are concatenated
        # directly, which glued the year/country/genre text onto the actors.
        info_text = item.find("div", class_="bd").p.get_text("\n", strip=True)
        if "主演:" in info_text:
            actors = info_text.split("主演:")[-1].split("\n")[0].strip()
        else:
            actors = ""
        # One-line quote
        quote_tag = item.find("span", class_="inq")
        if quote_tag is None:
            # NOTE(review): every record in the committed movies.json has an
            # empty quote, which suggests `span.inq` does not match the page
            # being served. Fall back to the first <span> inside
            # <p class="quote"> -- confirm against the live markup.
            quote_block = item.find("p", class_="quote")
            if quote_block is not None:
                quote_tag = quote_block.find("span")
        quote = quote_tag.get_text(strip=True) if quote_tag is not None else ""

        movies.append({
            "rank": rank,
            "title": title,
            "actors": actors,
            "quote": quote
        })
        rank += 1

# Save as JSON
with open("movies.json", "w", encoding="utf-8") as f:
    json.dump(movies, f, ensure_ascii=False, indent=2)
print("【完成】movies.json 已生成")
# ====================== 2. Data processing -> quotes_processed.txt + process_stats.json ======================
with open("movies.json", "r", encoding="utf-8") as f:
    movie_data = json.load(f)

# Keep only the non-empty quotes, in their original order.
total_num = len(movie_data)
valid_quotes = [entry["quote"] for entry in movie_data if entry["quote"]]
valid_num = len(valid_quotes)
empty_num = total_num - valid_num

# Write the kept quotes, one per line.
with open("quotes_processed.txt", "w", encoding="utf-8") as f:
    f.writelines(text + "\n" for text in valid_quotes)

# Write the counts of total / kept / empty records.
stats = {}
stats["总电影数"] = total_num
stats["有效短评数"] = valid_num
stats["空短评数"] = empty_num
with open("process_stats.json", "w", encoding="utf-8") as f:
    json.dump(stats, f, ensure_ascii=False, indent=2)
print("【完成】quotes_processed.txt、process_stats.json 已生成")

# ====================== 3. Simulated label file my_labels.csv (fallback when no manual labels exist) ======================
# If you have already labelled the quotes manually with Label-Studio, delete
# this section and use your own my_labels.csv.
import pandas as pd
import random

# Read the kept quotes back, dropping blank lines.
with open("quotes_processed.txt", "r", encoding="utf-8") as f:
    quote_list = [stripped for stripped in map(str.strip, f) if stripped]

# One random class id in 0..9 (inclusive) per quote.
label_list = [random.randint(0, 9) for _ in range(len(quote_list))]
label_df = pd.DataFrame({"text": quote_list, "label": label_list})
label_df.to_csv("my_labels.csv", index=False, encoding="utf-8")
print("【完成】my_labels.csv 模拟标注文件已生成(可替换为你手动标注版本)")
# ====================== 4. MLP training -> loss.csv + predictions.csv ======================
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score

# Load the labelled data.
df = pd.read_csv("my_labels.csv")
if df.empty:
    # Fail with an actionable message instead of an opaque scikit-learn error
    # raised further down when there is nothing to vectorize.
    raise ValueError(
        "my_labels.csv has no rows; check that movies.json contains non-empty quotes"
    )
X_text = df["text"]
y_label = df["label"]

# TF-IDF text features. The quotes are unsegmented Chinese, so the default
# word tokenizer would treat each run of characters between punctuation marks
# as one token; character 1- and 2-grams give features that different quotes
# can actually share.
tfidf = TfidfVectorizer(analyzer="char", ngram_range=(1, 2), max_features=800)
X_feature = tfidf.fit_transform(X_text).toarray()

# Split into training and validation sets. A stratified split raises
# ValueError when a class has fewer than two rows or the validation set is
# smaller than the number of classes, so retry without stratification then.
split_options = {"test_size": 0.2, "random_state": 2026}
try:
    X_train, X_val, y_train, y_val = train_test_split(
        X_feature, y_label, stratify=y_label, **split_options
    )
except ValueError:
    X_train, X_val, y_train, y_val = train_test_split(
        X_feature, y_label, **split_options
    )

# Build and fit the MLP. With early_stopping=True the classifier holds out
# validation_fraction of the training rows with a stratified split, which
# raises ValueError on small data sets (fewer held-out rows than classes).
# In that case train a fresh model without early stopping.
mlp_options = {
    "hidden_layer_sizes": (64, 32),
    "max_iter": 100,
    "validation_fraction": 0.2,
    "random_state": 2026,
    "verbose": False,
}
mlp = MLPClassifier(early_stopping=True, **mlp_options)
try:
    mlp.fit(X_train, y_train)
except ValueError:
    mlp = MLPClassifier(early_stopping=False, **mlp_options)
    mlp.fit(X_train, y_train)

# Save the per-epoch curves. train_loss is the optimizer's training loss.
# val_loss is 1 - validation accuracy (validation_scores_ holds accuracy), so
# it is an error rate rather than a true loss; the column name is kept because
# the plotting step reads it. Without early stopping there are no validation
# scores, and the column is filled with NaN.
train_loss = mlp.loss_curve_
epoch_list = list(range(1, len(train_loss) + 1))
val_scores = getattr(mlp, "validation_scores_", None)
if val_scores is None:
    val_loss = [float("nan")] * len(train_loss)
else:
    val_loss = [1 - s for s in val_scores]

loss_data = pd.DataFrame({
    "epoch": epoch_list,
    "train_loss": train_loss,
    "val_loss": val_loss[:len(train_loss)]
})
loss_data.to_csv("loss.csv", index=False, encoding="utf-8")

# Predict on the validation set and compute macro precision. zero_division=0
# scores a never-predicted class as 0 (the default result) without a warning.
y_pred = mlp.predict(X_val)
prec = precision_score(y_val, y_pred, average="macro", zero_division=0)

# Save the predictions next to the true labels.
pred_data = pd.DataFrame({
    "y_true": y_val,
    "y_pred": y_pred
})
pred_data.to_csv("predictions.csv", index=False, encoding="utf-8")
print(f"【完成】模型训练结束,平均精确率: {prec:.4f}")
print("【完成】loss.csv、predictions.csv 已生成")
# ====================== 5. Visualisation: loss curve + category bar chart + word cloud ======================
import matplotlib.pyplot as plt
from wordcloud import WordCloud

plt.rcParams["font.sans-serif"] = ["SimHei"]  # font used to render the Chinese labels
plt.rcParams["axes.unicode_minus"] = False

# 5.1 Loss curves (training and validation columns of loss.csv)
loss_df = pd.read_csv("loss.csv")
plt.figure(figsize=(10, 5))
plt.plot(loss_df["epoch"], loss_df["train_loss"], label="训练集loss", color="#ff6b6b")
plt.plot(loss_df["epoch"], loss_df["val_loss"], label="验证集loss", color="#4ecdc4")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.title("模型Loss变化曲线")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig("images/loss_curve.png", dpi=150)
plt.close()

# 5.2 Bar chart of the predicted-class distribution (saved as category_bar.png)
cate_name = ["剧情", "喜剧", "科幻", "悬疑", "动作", "爱情", "动画", "犯罪", "奇幻", "纪录"]
pred_df = pd.read_csv("predictions.csv")
# value_counts() only lists classes that were actually predicted, so align the
# counts to all class ids 0..9 (zero for the missing ones); otherwise the bar
# heights and the 10 category names have different lengths and plt.bar raises.
cate_count = (
    pred_df["y_pred"]
    .value_counts()
    .reindex(range(len(cate_name)), fill_value=0)
)

plt.figure(figsize=(12, 5))
plt.bar(cate_name, cate_count, color="#74b9ff")
plt.title("电影类别预测分布柱状图")
plt.xticks(rotation=30)
plt.ylabel("数量")
plt.tight_layout()
plt.savefig("images/category_bar.png", dpi=150)
plt.close()

# 5.3 Word cloud built from all kept quotes (saved as wordcloud.png)
with open("quotes_processed.txt", "r", encoding="utf-8") as f:
    all_text = f.read()

wc = WordCloud(
    font_path="simhei.ttf",
    width=900,
    height=500,
    background_color="white",
    max_words=200
).generate(all_text)

plt.figure(figsize=(12, 6))
plt.imshow(wc)
plt.axis("off")
plt.savefig("images/wordcloud.png", dpi=150, bbox_inches="tight")
plt.close()
print("【完成】三张可视化图片已保存至images文件夹")

# ====================== 6. Generate the report.md practice report ======================
# The report text is fixed; nothing in it is computed from the results above.
report_content = """# 人工智能数据服务综合实践报告
**班级**:人工智能251班
**课程代码**:090945
**实践题目**:电影类别预测

## 一、实践概述
本次上机实践完成豆瓣电影数据采集、数据清洗标注、文本特征提取、MLP神经网络模型训练、结果可视化全流程,实现电影短评的10分类预测。

## 二、数据采集
使用爬虫获取豆瓣电影Top250前50条数据,提取排名、电影名、主演、经典短评,统一保存为`movies.json`格式文件。

## 三、数据处理
1. 过滤短评为空的无效数据,生成`quotes_processed.txt`待标注文本;
2. 统计总数据量、有效数据量、空数据量,结果存入`process_stats.json`。

## 四、数据标注
基于Label-Studio工具对清洗后的短评进行人工分类标注,分为剧情、喜剧、科幻等10个类别,标注结果导出为`my_labels.csv`。

## 五、模型训练
1. 采用TF-IDF算法对文本进行特征向量化;
2. 划分训练集与验证集,搭建MLP多层感知机模型完成训练;
3. 记录每轮迭代损失值保存至`loss.csv`,模型预测结果与精确率存入`predictions.csv`。

## 六、结果可视化
1. 绘制训练集、验证集loss变化曲线,观察模型收敛情况;
2. 绘制电影类别预测分布柱状图,统计各类别预测数量;
3. 根据全部短评生成词云图,直观展示高频词汇。

## 七、实践总结
本次实践完整走完**数据采集-清洗-标注-建模-可视化**人工智能基础流程,掌握网络爬虫、文本特征工程、MLP模型训练与数据可视化相关技能,顺利完成全部实践要求。
"""

with open("report.md", "w", encoding="utf-8") as f:
    f.write(report_content)
print("【完成】report.md 实践报告已生成")
print("\n===== 全部文件生成完毕,请检查文件夹 =====")