Files
final-practice/hhh.py
2026-06-09 10:45:08 +08:00

196 lines
6.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
import requests
import json
import csv
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score
import matplotlib.pyplot as plt
import os
# ----------------------
# 配置
# ----------------------
plt.rcParams["font.sans-serif"] = ["SimHei"]
plt.rcParams["axes.unicode_minus"] = False
# 类别映射题目要求的10类
LABEL_MAP = {
"剧情": 0, "喜剧": 1, "科幻": 2, "悬疑": 3, "动作": 4,
"爱情": 5, "动画": 6, "犯罪": 7, "奇幻": 8, "纪录": 9
}
REVERSE_LABEL_MAP = {v: k for k, v in LABEL_MAP.items()}
# ----------------------
# 1. 数据采集爬取豆瓣Top250前50部电影
# ----------------------
print("=" * 50)
print("步骤1爬取豆瓣Top250前50部电影数据")
print("=" * 50)
base_url = "https://movie.douban.com/top250?start={}&filter="
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
movies = []
for page in range(2): # 前2页共50条
url = base_url.format(page * 25)
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
for idx, item in enumerate(soup.select(".grid_view li")):
rank = page * 25 + idx + 1
title = item.select_one(".title").text.strip() if item.select_one(".title") else ""
actors_tag = item.select_one(".bd p")
actors = actors_tag.text.strip().split("\n")[0] if actors_tag else ""
quote_tag = item.select_one(".quote")
quote = quote_tag.text.strip() if quote_tag else ""
movies.append({
"rank": rank,
"title": title,
"actors": actors,
"quote": quote
})
# 保存movies.json
with open("movies.json", "w", encoding="utf-8") as f:
json.dump(movies, f, ensure_ascii=False, indent=2)
print("✅ movies.json 已生成")
# ----------------------
# 2. 数据处理:过滤空短评,生成待标注文本
# ----------------------
print("\n步骤2过滤有效短评生成待标注文件")
valid_quotes = [m["quote"] for m in movies if m.get("quote") and m["quote"].strip()]
print(f"✅ 过滤出 {len(valid_quotes)} 条有效短评")
with open("quotes_processed.txt", "w", encoding="utf-8") as f:
for q in valid_quotes:
f.write(q + "\n")
print("✅ quotes_processed.txt 已生成可导入Label-Studio标注")
# ----------------------
# 3. 标注兼容处理如果没有my_labels.csv自动生成模拟数据测试流程
# ----------------------
print("\n步骤3读取标注数据若不存在则生成模拟数据")
if not os.path.exists("my_labels.csv"):
print("⚠️ 未找到my_labels.csv生成模拟标注数据用于测试提交作业请替换为真实标注数据")
df = pd.DataFrame({
"text": valid_quotes,
"label": np.random.choice(list(LABEL_MAP.keys()), size=len(valid_quotes))
})
df.to_csv("my_labels.csv", index=False, encoding="utf-8")
else:
print("✅ 读取已有的my_labels.csv标注数据")
df = pd.read_csv("my_labels.csv")
df["label_id"] = df["label"].map(LABEL_MAP)
# ----------------------
# 4. 模型训练TF-IDF + MLP
# ----------------------
print("\n步骤4模型训练TF-IDF + MLP")
# 文本特征提取
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["text"]).toarray()
y = df["label_id"].values
# 划分训练/验证集
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# 训练MLP并记录loss
max_epochs = 50
train_losses = []
val_losses = []
model = MLPClassifier(
hidden_layer_sizes=(64, 32),
max_iter=1, warm_start=True,
random_state=42
)
for epoch in range(max_epochs):
model.fit(X_train, y_train)
train_loss = model.loss_
val_acc = model.score(X_val, y_val)
val_loss = 1 - val_acc
train_losses.append(train_loss)
val_losses.append(val_loss)
# 保存loss.csv
with open("loss.csv", "w", newline="", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(["epoch", "train_loss", "val_loss"])
for i in range(max_epochs):
writer.writerow([i + 1, round(train_losses[i], 4), round(val_losses[i], 4)])
print("✅ loss.csv 已生成")
# 预测并保存predictions.csv
y_pred = model.predict(X_val)
precision = precision_score(y_val, y_pred, average="macro")
print(f"✅ 模型训练完成Macro Precision: {precision:.4f}")
with open("predictions.csv", "w", newline="", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(["true_label", "pred_label", "label_name"])
for true, pred in zip(y_val, y_pred):
writer.writerow([true, pred, REVERSE_LABEL_MAP[pred]])
print("✅ predictions.csv 已生成")
# ----------------------
# 5. 可视化loss曲线 + 类别分布柱状图
# ----------------------
print("\n步骤5生成可视化图片")
if not os.path.exists("images"):
os.makedirs("images")
# 图1loss曲线
loss_df = pd.read_csv("loss.csv")
plt.figure(figsize=(10, 5))
plt.plot(loss_df["epoch"], loss_df["train_loss"], label="训练集Loss", color="#1f77b4")
plt.plot(loss_df["epoch"], loss_df["val_loss"], label="验证集Loss", color="#ff7f0e")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("训练集与验证集Loss曲线")
plt.legend()
plt.grid(alpha=0.3)
plt.savefig("images/loss_curve.png", dpi=300, bbox_inches="tight")
plt.close()
# 图2类别预测分布柱状图
pred_df = pd.read_csv("predictions.csv")
label_counts = pred_df["label_name"].value_counts()
all_labels = list(LABEL_MAP.keys())
counts = [label_counts.get(label, 0) for label in all_labels]
plt.figure(figsize=(12, 6))
plt.bar(all_labels, counts, color="#2ca02c")
plt.xlabel("电影类别")
plt.ylabel("预测数量")
plt.title("验证集电影类别预测分布")
plt.xticks(rotation=45)
plt.grid(axis="y", alpha=0.3)
plt.tight_layout()
plt.savefig("images/category_bar.png", dpi=300, bbox_inches="tight")
plt.close()
print("✅ 可视化图片已保存到images/文件夹")
print("\n" + "=" * 50)
print("所有步骤完成!已生成以下文件:")
print("- movies.json")
print("- quotes_processed.txt")
print("- my_labels.csv若之前没有则为模拟数据")
print("- loss.csv")
print("- predictions.csv")
print("- images/loss_curve.png")
print("- images/category_bar.png")
print("=" * 50)