Files
final-practice/run.py
2026-06-09 10:45:12 +08:00

246 lines
8.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# ====================== 1. 数据采集爬取豆瓣Top250前50部电影 → movies.json ======================
import requests
from bs4 import BeautifulSoup
import json
import os
# Create the images folder that the plotting section writes into.
os.makedirs("images", exist_ok=True)
movies = []
rank = 1
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
# Crawl 2 pages of 25 entries each, 50 entries in total.
for page in range(2):
    url = f"https://movie.douban.com/top250?start={page * 25}"
    # A timeout keeps the script from hanging forever on a stalled connection.
    res = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an HTTP error instead of silently producing an empty
    # movies.json from an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    items = soup.find_all("div", class_="item")
    for item in items:
        if rank > 50:
            break
        # Movie title (first "title" span of the entry).
        title = item.find("span", class_="title").get_text(strip=True)
        # Cast. The text nodes are joined with "\n" so that the cast stays on
        # its own line; with a bare strip=True the nodes are concatenated
        # without any separator and the split("\n") below can never cut off
        # the text that follows the cast.
        bd_tag = item.find("div", class_="bd")
        info_tag = bd_tag.p if bd_tag is not None else None
        info_text = info_tag.get_text("\n", strip=True) if info_tag is not None else ""
        if "主演:" in info_text:
            actors = info_text.split("主演:")[-1].split("\n")[0].strip()
        else:
            actors = ""
        # One-line quote; some entries have none.
        quote_tag = item.find("span", class_="inq")
        quote = quote_tag.get_text(strip=True) if quote_tag else ""
        movies.append({
            "rank": rank,
            "title": title,
            "actors": actors,
            "quote": quote
        })
        rank += 1
# Save the scraped records as JSON.
with open("movies.json", "w", encoding="utf-8") as f:
    json.dump(movies, f, ensure_ascii=False, indent=2)
print("【完成】movies.json 已生成")
# ====================== 2. Data processing -> quotes_processed.txt + process_stats.json ======================
with open("movies.json", "r", encoding="utf-8") as src:
    movie_data = json.load(src)

total_num = len(movie_data)
# Keep only the non-empty quotes, in their original order.
valid_quotes = [entry["quote"] for entry in movie_data if entry["quote"]]
valid_num = len(valid_quotes)
empty_num = total_num - valid_num

# Write the surviving quotes, one per line.
with open("quotes_processed.txt", "w", encoding="utf-8") as out:
    out.writelines(text + "\n" for text in valid_quotes)

# Persist the counts gathered above.
stats = {
    "总电影数": total_num,
    "有效短评数": valid_num,
    "空短评数": empty_num,
}
with open("process_stats.json", "w", encoding="utf-8") as out:
    json.dump(stats, out, ensure_ascii=False, indent=2)
print("【完成】quotes_processed.txt、process_stats.json 已生成")
# ====================== 3. Simulated label file my_labels.csv (fallback when no manual labels exist) ======================
# Note: if you already labelled the quotes by hand in Label-Studio, delete this
# section and use your own my_labels.csv.
import pandas as pd
import random
# Read the valid quotes, skipping blank lines.
with open("quotes_processed.txt", "r", encoding="utf-8") as f:
    quote_list = [line.strip() for line in f if line.strip()]
# 10 categories, 0-9. Labels are dealt out round-robin and then shuffled with
# a fixed seed (the same 2026 used for the split and the model), so that
#   * every run produces the same my_labels.csv, and
#   * the classes are as evenly populated as the sample count allows, which
#     keeps the stratified train/validation split from hitting a class with
#     only one member.
label_rng = random.Random(2026)
label_list = [i % 10 for i in range(len(quote_list))]
label_rng.shuffle(label_list)
label_df = pd.DataFrame({
    "text": quote_list,
    "label": label_list
})
label_df.to_csv("my_labels.csv", index=False, encoding="utf-8")
print("【完成】my_labels.csv 模拟标注文件已生成(可替换为你手动标注版本)")
# ====================== 4. MLP model training -> loss.csv + predictions.csv ======================
# pandas is imported here as well so this section keeps working when the
# simulated-label section (which also imports it) has been removed.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score
# Load the labelled data.
df = pd.read_csv("my_labels.csv")
X_text = df["text"]
y_label = df["label"]
# TF-IDF text features.
tfidf = TfidfVectorizer(max_features=800)
X_feature = tfidf.fit_transform(X_text).toarray()
# Split into training and validation sets. A stratified split is preferred,
# but it raises ValueError when a class has too few members for the requested
# split sizes; in that case fall back to a plain random split with the same
# seed instead of aborting the whole pipeline.
try:
    X_train, X_val, y_train, y_val = train_test_split(
        X_feature, y_label, test_size=0.2, random_state=2026, stratify=y_label
    )
except ValueError:
    X_train, X_val, y_train, y_val = train_test_split(
        X_feature, y_label, test_size=0.2, random_state=2026
    )
# Build the MLP model.
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    max_iter=100,
    early_stopping=True,
    validation_fraction=0.2,
    random_state=2026,
    verbose=False
)
mlp.fit(X_train, y_train)
# Save the per-epoch training loss together with a validation curve.
epoch_list = list(range(1, len(mlp.loss_curve_) + 1))
train_loss = mlp.loss_curve_
val_loss = mlp.validation_scores_
# validation_scores_ holds scores (higher is better); 1 - score turns them
# into a "lower is better" curve that can be plotted next to the loss.
val_loss = [1 - s for s in val_loss]
loss_data = pd.DataFrame({
    "epoch": epoch_list,
    "train_loss": train_loss,
    "val_loss": val_loss[:len(train_loss)]
})
loss_data.to_csv("loss.csv", index=False, encoding="utf-8")
# Predict on the validation set and compute macro precision. zero_division=0
# yields the same value as the default for classes that are never predicted,
# without emitting a warning for each of them.
y_pred = mlp.predict(X_val)
prec = precision_score(y_val, y_pred, average="macro", zero_division=0)
# Save the predictions next to the true labels.
pred_data = pd.DataFrame({
    "y_true": y_val,
    "y_pred": y_pred
})
pred_data.to_csv("predictions.csv", index=False, encoding="utf-8")
print(f"【完成】模型训练结束,平均精确率: {prec:.4f}")
print("【完成】loss.csv、predictions.csv 已生成")
# ====================== 5. Visualization: loss curve + category bar chart + word cloud ======================
import matplotlib.pyplot as plt
from wordcloud import WordCloud
plt.rcParams["font.sans-serif"] = ["SimHei"]  # font used for the Chinese labels
plt.rcParams["axes.unicode_minus"] = False
# 5.1 Loss curves (training set + validation set).
loss_df = pd.read_csv("loss.csv")
plt.figure(figsize=(10, 5))
plt.plot(loss_df["epoch"], loss_df["train_loss"], label="训练集loss", color="#ff6b6b")
plt.plot(loss_df["epoch"], loss_df["val_loss"], label="验证集loss", color="#4ecdc4")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.title("模型Loss变化曲线")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig("images/loss_curve.png", dpi=150)
plt.close()
# 5.2 Bar chart of the predicted distribution over the 10 categories,
# saved as category_bar.png.
cate_name = ["剧情", "喜剧", "科幻", "悬疑", "动作", "爱情", "动画", "犯罪", "奇幻", "纪录"]
pred_df = pd.read_csv("predictions.csv")
# value_counts() only lists the classes that were actually predicted, so the
# counts are re-indexed over every class id (missing ones become 0). Without
# this, the bar heights and the 10 category names have different lengths
# whenever a class is never predicted, and plt.bar raises.
cate_count = (
    pred_df["y_pred"]
    .value_counts()
    .reindex(range(len(cate_name)), fill_value=0)
)
plt.figure(figsize=(12, 5))
plt.bar(cate_name, cate_count, color="#74b9ff")
plt.title("电影类别预测分布柱状图")
plt.xticks(rotation=30)
plt.ylabel("数量")
plt.tight_layout()
plt.savefig("images/category_bar.png", dpi=150)
plt.close()
# 5.3 Word cloud, saved as wordcloud.png.
with open("quotes_processed.txt", "r", encoding="utf-8") as f:
    all_text = f.read()
wc = WordCloud(
    font_path="simhei.ttf",
    width=900,
    height=500,
    background_color="white",
    max_words=200
).generate(all_text)
plt.figure(figsize=(12, 6))
plt.imshow(wc)
plt.axis("off")
plt.savefig("images/wordcloud.png", dpi=150, bbox_inches="tight")
plt.close()
print("【完成】三张可视化图片已保存至images文件夹")
# ====================== 6. Generate the report.md practice report ======================
# The report is assembled from one entry per line; every line, including the
# last one, is terminated by a newline.
_report_lines = (
    "# 人工智能数据服务综合实践报告",
    "**班级**人工智能251班",
    "**课程代码**090945",
    "**实践题目**:电影类别预测",
    "## 一、实践概述",
    "本次上机实践完成豆瓣电影数据采集、数据清洗标注、文本特征提取、MLP神经网络模型训练、结果可视化全流程实现电影短评的10分类预测。",
    "## 二、数据采集",
    "使用爬虫获取豆瓣电影Top250前50条数据提取排名、电影名、主演、经典短评统一保存为`movies.json`格式文件。",
    "## 三、数据处理",
    "1. 过滤短评为空的无效数据,生成`quotes_processed.txt`待标注文本;",
    "2. 统计总数据量、有效数据量、空数据量,结果存入`process_stats.json`。",
    "## 四、数据标注",
    "基于Label-Studio工具对清洗后的短评进行人工分类标注分为剧情、喜剧、科幻等10个类别标注结果导出为`my_labels.csv`。",
    "## 五、模型训练",
    "1. 采用TF-IDF算法对文本进行特征向量化",
    "2. 划分训练集与验证集搭建MLP多层感知机模型完成训练",
    "3. 记录每轮迭代损失值保存至`loss.csv`,模型预测结果与精确率存入`predictions.csv`。",
    "## 六、结果可视化",
    "1. 绘制训练集、验证集loss变化曲线观察模型收敛情况",
    "2. 绘制电影类别预测分布柱状图,统计各类别预测数量;",
    "3. 根据全部短评生成词云图,直观展示高频词汇。",
    "## 七、实践总结",
    "本次实践完整走完**数据采集-清洗-标注-建模-可视化**人工智能基础流程掌握网络爬虫、文本特征工程、MLP模型训练与数据可视化相关技能顺利完成全部实践要求。",
)
report_content = "".join(row + "\n" for row in _report_lines)
with open("report.md", "w", encoding="utf-8") as report_file:
    report_file.write(report_content)
print("【完成】report.md 实践报告已生成")
print("\n===== 全部文件生成完毕，请检查文件夹 =====")