Upload files to /

This commit is contained in:
2026-04-28 11:05:27 +08:00
parent 42ff061f01
commit 016df0c747
5 changed files with 966 additions and 966 deletions

config.py
View File

@@ -1,40 +1,40 @@
# -*- coding: utf-8 -*-
"""
Configuration file - all hyperparameters managed in one place
Design idea:
    Group hyperparameters by category so students can change one group without affecting the others
"""
# ==================== Data ====================
DATA_DIR = 'data/ChnSentiCorp'      # dataset path
MAX_FEATURES = 3000                 # maximum vocabulary size
MAX_SEQ_LEN = 100                   # maximum sentence length (in words)
VECTORIZER_TYPE = 'tfidf'           # 'tfidf' or 'bow' (vectorization method)
# ==================== Model ====================
MODEL_TYPE = 'mlp'                  # 'mlp' or 'lr' (model type)
HIDDEN_SIZE = 64                    # MLP hidden layer size (ignored by LR)
NUM_CLASSES = 2                     # number of classes (positive/negative binary classification)
KEEP_PROB = 1.0                     # dropout keep probability (ignored by LR; 1.0 disables dropout)
# ==================== Training ====================
LEARNING_RATE = 0.05                # learning rate
NUM_EPOCHS = 100                    # number of training epochs
BATCH_SIZE = 64                     # batch size
# ==================== Class weights (to handle class imbalance) ====================
USE_CLASS_WEIGHT = True             # True = enable class weights, False = disable (for comparison)
# Weight formula: n_samples / (n_classes * n_class_i)
# Positive reviews are plentiful, so their weight is small; negative reviews are scarce, so theirs is large
CLASS_WEIGHT_POS = 0.73             # positive-class weight (computed automatically)
CLASS_WEIGHT_NEG = 1.58             # negative-class weight (computed automatically)
# ==================== Experiments ====================
RUN_COMPARISON = False              # True = run comparison experiments, False = run a single model
COMPARE_MODELS = ['lr', 'mlp']      # models to compare
COMPARE_VECTORS = ['bow', 'tfidf']  # vectorization methods to compare
# ==================== Misc ====================
RANDOM_SEED = 42                    # random seed (for reproducibility)
VERBOSE = True                      # print detailed logs
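
The two weight constants follow directly from the formula above. A minimal worked sketch, assuming the commonly cited ChnSentiCorp_htl_all class counts of roughly 5322 positive and 2444 negative reviews (the authoritative numbers are whatever the data loader prints after filtering empty rows):

n_pos, n_neg = 5322, 2444                  # hypothetical counts, for illustration only
n_samples = n_pos + n_neg                  # 7766
n_classes = 2
w_pos = n_samples / (n_classes * n_pos)    # ~0.73
w_neg = n_samples / (n_classes * n_neg)    # ~1.59 (rounded to 1.58 in this config)
print(f"pos={w_pos:.2f}, neg={w_neg:.2f}")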

dataset.py
View File

@@ -1,286 +1,286 @@
# -*- coding: utf-8 -*-
"""
Data loading and vectorization module
Two vectorization methods are supported:
    1. BoW (Bag of Words) - term-frequency vectors
    2. TF-IDF - term frequency-inverse document frequency vectors
Advantages of TF-IDF:
    - Down-weights very common words
    - Boosts rare but informative words
    - Usually outperforms plain BoW
"""
import os
import re
import csv
import math
import jieba
import numpy as np
from collections import Counter
try:
    import urllib.request
    import ssl
    DOWNLOAD_AVAILABLE = True
except ImportError:
    DOWNLOAD_AVAILABLE = False
DATASET_URL = "https://raw.githubusercontent.com/SophonPlus/ChineseNlpCorpus/master/datasets/ChnSentiCorp_htl_all/ChnSentiCorp_htl_all.csv"
def download_dataset(data_dir):
    """Download the dataset if it is not already present"""
    csv_path = os.path.join(data_dir, 'ChnSentiCorp_htl_all.csv')
    if os.path.exists(csv_path):
        print(f"Data already exists: {csv_path}")
        return True
    if not DOWNLOAD_AVAILABLE:
        return False
    print("Downloading dataset...")
    ssl_context = ssl.create_default_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE
    try:
        request = urllib.request.Request(DATASET_URL, headers={'User-Agent': 'Mozilla/5.0'})
        response = urllib.request.urlopen(request, timeout=120, context=ssl_context)
        os.makedirs(data_dir, exist_ok=True)
        with open(csv_path, 'wb') as f:
            f.write(response.read())
        print(f"Download complete: {csv_path}")
        return True
    except Exception as e:
        print(f"Download failed: {e}")
        return False
def load_raw_data(data_dir):
    """Load the raw data"""
    csv_path = os.path.join(data_dir, 'ChnSentiCorp_htl_all.csv')
    texts, labels = [], []
    with open(csv_path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        for row in reader:
            if len(row) < 2:
                continue
            try:
                label = int(row[0])
                review = row[1].strip()
                if review:
                    texts.append(review)
                    labels.append(label)
            except (ValueError, IndexError):
                continue
    return texts, np.array(labels)
def tokenize(text):
    """Chinese word segmentation (keeps CJK and Latin characters, drops single-character tokens)"""
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z]', ' ', text)
    words = jieba.lcut(text)
    return [w for w in words if len(w) > 1]
# ==================== Vectorizers ====================
class BaseVectorizer:
    """Vectorizer base class"""
    def fit(self, texts): pass
    def transform(self, texts): pass
    def fit_transform(self, texts): pass
class BoWVectorizer(BaseVectorizer):
    """
    Bag-of-words model (Bag of Words)
    Note: this is a simplified, position-indexed variant.
    Vector dimension = max_seq_len (not the vocabulary size);
    position i is 1 if the i-th word of the text is in the vocabulary, else 0
    """
    def __init__(self, max_features, max_seq_len):
        self.max_features = max_features
        self.max_seq_len = max_seq_len
        self.vocab = {}
        self.doc_freq = {}  # document frequencies
        self.num_docs = 0
    def fit(self, texts):
        """Build the vocabulary (by term frequency)"""
        counter = Counter()
        doc_counter = Counter()  # number of documents containing each word
        for text in texts:
            words = tokenize(text)
            unique_words = set(words)
            counter.update(words)
            for w in unique_words:
                doc_counter[w] += 1
        self.num_docs = len(texts)
        # Keep the most frequent words
        most_common = counter.most_common(self.max_features)
        self.vocab = {word: idx for idx, (word, _) in enumerate(most_common)}
        # Record document frequencies (used by TF-IDF)
        self.doc_freq = {w: doc_counter[w] for w in self.vocab}
        print(f"  BoW vocabulary size: {len(self.vocab)}")
        return self
    def transform(self, texts):
        """Convert texts to binary presence vectors"""
        vectors = []
        for text in texts:
            words = tokenize(text)
            freq = [0] * self.max_seq_len
            for i, word in enumerate(words[:self.max_seq_len]):
                if word in self.vocab:
                    freq[i] = 1  # binary (present = 1, absent = 0)
            vectors.append(freq)
        return np.array(vectors, dtype=np.float32)
    def fit_transform(self, texts):
        self.fit(texts)
        return self.transform(texts)
class TFIDFVectorizer(BaseVectorizer):
    """
    TF-IDF vectorizer
    Principle:
    - TF (term frequency) = how often a word occurs in this text
    - IDF (inverse document frequency) = log(total documents / documents containing the word)
    - TF-IDF = TF × IDF
    Advantages:
    - Down-weights common, uninformative words
    - Boosts rare but informative words
    (like BoWVectorizer, vectors here are position-indexed with dimension max_seq_len)
    """
    def __init__(self, max_features, max_seq_len):
        self.max_features = max_features
        self.max_seq_len = max_seq_len
        self.vocab = {}
        self.idf = {}  # IDF value per word
        self.num_docs = 0
    def fit(self, texts):
        """Build the vocabulary and compute IDF"""
        counter = Counter()
        doc_counter = Counter()
        for text in texts:
            words = tokenize(text)
            unique_words = set(words)
            counter.update(words)
            for w in unique_words:
                doc_counter[w] += 1
        self.num_docs = len(texts)
        # Compute each word's IDF
        # IDF = log(total documents / documents containing the word)
        idf_values = {}
        for word, df in doc_counter.items():
            idf_values[word] = math.log(self.num_docs / (df + 1)) + 1  # +1 smoothing avoids zeros
        # Keep the words with the highest IDF (the most informative ones)
        sorted_words = sorted(idf_values.items(), key=lambda x: x[1], reverse=True)
        self.vocab = {word: idx for idx, (word, _) in enumerate(sorted_words[:self.max_features])}
        # Store the IDF values
        self.idf = {word: idf_values[word] for word in self.vocab}
        print(f"  TF-IDF vocabulary size: {len(self.vocab)}")
        print(f"  Mean IDF: {np.mean(list(self.idf.values())):.3f}")
        return self
    def transform(self, texts):
        """Convert texts to TF-IDF vectors"""
        vectors = []
        for text in texts:
            words = tokenize(text)
            # Compute TF
            tf = Counter(words)
            tf_sum = len(words) if words else 1
            # Build the vector
            vec = [0.0] * self.max_seq_len
            for i, word in enumerate(words[:self.max_seq_len]):
                if word in self.vocab:
                    # TF × IDF
                    vec[i] = (tf[word] / tf_sum) * self.idf.get(word, 0)
            vectors.append(vec)
        return np.array(vectors, dtype=np.float32)
    def fit_transform(self, texts):
        self.fit(texts)
        return self.transform(texts)
def load_data(data_dir, max_features, max_seq_len, vectorizer_type='tfidf'):
    """
    Load and vectorize the data
    Parameters:
    - vectorizer_type: 'tfidf' or 'bow'
    """
    if not download_dataset(data_dir):
        raise RuntimeError("Data loading failed; check your network or download the dataset manually")
    print("Loading data...")
    texts, labels = load_raw_data(data_dir)
    print(f"Total reviews: {len(texts)}, positive: {sum(labels)}, negative: {len(labels) - sum(labels)}")
    # Choose the vectorizer
    if vectorizer_type == 'tfidf':
        vectorizer = TFIDFVectorizer(max_features, max_seq_len)
        vec_name = "TF-IDF"
    else:
        vectorizer = BoWVectorizer(max_features, max_seq_len)
        vec_name = "BoW"
    print(f"Vectorizing with {vec_name}...")
    X = vectorizer.fit_transform(texts)
    y = labels
    # Shuffle and split
    np.random.seed(42)
    indices = np.random.permutation(len(X))
    X = X[indices]
    y = y[indices]
    split_idx = int(len(X) * 0.8)
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]
    print(f"Training set: {len(X_train)}, test set: {len(X_test)}")
    return X_train, y_train, X_test, y_test, vectorizer
if __name__ == '__main__':
    # Quick test
    print("=" * 60)
    print("Testing TF-IDF vectorization")
    print("=" * 60)
    X_train, y_train, X_test, y_test, vec = load_data(
        'data/ChnSentiCorp', max_features=3000, max_seq_len=100,
        vectorizer_type='tfidf'
    )
    print(f"\nX_train shape: {X_train.shape}")
    print(f"X_train sample (first 5 features): {X_train[0][:5]}")
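
For intuition about the smoothed IDF used above, log(N / (df + 1)) + 1, here is a minimal self-contained sketch on a toy three-document corpus (not part of the commit; the tokens are invented for illustration):

import math
docs = [["room", "clean"], ["room", "noisy"], ["service", "good"]]  # toy tokenized corpus
num_docs = len(docs)
df = {}                                      # document frequency of each word
for d in docs:
    for w in set(d):
        df[w] = df.get(w, 0) + 1
idf = {w: math.log(num_docs / (n + 1)) + 1 for w, n in df.items()}
print(idf["room"])     # 1.0    -> appears in 2 of 3 documents, weighted down
print(idf["service"])  # ~1.405 -> appears in only 1 document, weighted up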

main.py
View File

@@ -1,34 +1,34 @@
# -*- coding: utf-8 -*-
"""
Main entry point
Usage:
1. Run a single model (default):
   python main.py
   Edit MODEL_TYPE and VECTORIZER_TYPE in config.py to switch configurations
2. Run the comparison experiments:
   Set RUN_COMPARISON = True in config.py
   This runs, in order:
   - Experiment 1: BoW vs TF-IDF (LR model fixed)
   - Experiment 2: LR vs MLP (TF-IDF fixed)
   - Experiment 3: different learning rates
   - Experiment 4: different hidden layer sizes
   and prints a summary report at the end
"""
from train import main
if __name__ == '__main__':
    print("\n" + "=" * 70)
    print("Text classification experiments - pure NumPy implementation")
    print("Dataset: ChnSentiCorp (Chinese hotel reviews)")
    print("Models: Logistic Regression / MLP")
    print("Vectorization: BoW / TF-IDF")
    print("=" * 70 + "\n")
    main()

model_numpy.py
View File

@@ -1,342 +1,342 @@
# -*- coding: utf-8 -*-
"""
Model module - pure NumPy implementation
Two models are supported:
    1. Logistic Regression - a linear model
    2. MLP (multilayer perceptron) - a two-layer fully connected network
Design ideas:
    - Both models share the same interface for easy comparison
    - The code stays compact, with comments on every step
    - Backpropagation is implemented by hand, so the mechanics are transparent
"""
import numpy as np
class BaseModel:
    """Model base class"""
    def fit(self, X, y, X_val=None, y_val=None, epochs=100, batch_size=32, verbose=True): pass
    def predict(self, X): pass
    def predict_proba(self, X): pass
    def accuracy(self, X, y): pass
class LogisticRegression(BaseModel):
    """
    Logistic regression (linear classifier)
    Structure: input → linear transform → Softmax → output
    Principle:
    - Linear transform: z = X @ W + b
    - Softmax: turns the linear outputs into a probability distribution
    Parameter count: input_size × num_classes + num_classes
    """
    def __init__(self, input_size, num_classes=2, learning_rate=0.1,
                 class_weight=None, seed=42):
        np.random.seed(seed)
        # Weight initialization (He-style: scaled by sqrt(2 / input_size))
        self.W = np.random.randn(input_size, num_classes) * np.sqrt(2.0 / input_size)
        self.b = np.zeros(num_classes)
        self.lr = learning_rate
        self.input_size = input_size
        self.num_classes = num_classes
        self.class_weight = class_weight  # class weights
        total_params = input_size * num_classes + num_classes
        print(f"LogisticRegression: {input_size} -> {num_classes}, parameters: {total_params}")
    def softmax(self, x):
        """Softmax function"""
        x_shifted = x - np.max(x, axis=1, keepdims=True)
        exp_x = np.exp(x_shifted)
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)
    def forward(self, X):
        """Forward pass"""
        # Linear transform
        z = X @ self.W + self.b
        # Softmax probabilities
        return self.softmax(z)
    def backward(self, X, y):
        """Backward pass (gradient descent)"""
        batch_size = X.shape[0]
        probs = self.forward(X)
        # Gradient of softmax + cross-entropy
        d_z = probs.copy()
        # Apply class weights: subtract the weight value instead of 1 at the true class
        # (a simplification of the fully weighted gradient w_y * (p - onehot(y)))
        if self.class_weight is not None:
            for i in range(batch_size):
                d_z[i, y[i]] -= self.class_weight[y[i]]
        else:
            d_z[np.arange(batch_size), y] -= 1
        # Gradients
        d_W = X.T @ d_z
        d_b = np.sum(d_z, axis=0)
        # Update
        self.W -= self.lr * d_W / batch_size
        self.b -= self.lr * d_b / batch_size
    def fit(self, X, y, X_val=None, y_val=None, epochs=100, batch_size=32, verbose=True):
        """Train"""
        num_samples = len(X)
        num_batches = (num_samples + batch_size - 1) // batch_size
        for epoch in range(epochs):
            # Shuffle
            indices = np.random.permutation(num_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]
            epoch_loss = 0
            for batch_idx in range(num_batches):
                start = batch_idx * batch_size
                end = min(start + batch_size, num_samples)
                X_batch = X_shuffled[start:end]
                y_batch = y_shuffled[start:end]
                # Forward + backward
                probs = self.forward(X_batch)
                self.backward(X_batch, y_batch)
                # Loss
                loss = -np.mean(np.log(np.clip(probs[np.arange(len(y_batch)), y_batch], 1e-10, 1)))
                epoch_loss += loss
            # Evaluate
            if verbose and (epoch + 1) % 20 == 0:
                train_acc = self.accuracy(X, y)
                msg = f"Epoch {epoch+1:3d}/{epochs} | Loss: {epoch_loss/num_batches:.4f} | train acc: {train_acc:.4f}"
                if X_val is not None:
                    val_acc = self.accuracy(X_val, y_val)
                    msg += f" | test acc: {val_acc:.4f}"
                print(msg)
        return self
    def predict(self, X):
        return np.argmax(self.forward(X), axis=1)
    def predict_proba(self, X):
        return self.forward(X)
    def accuracy(self, X, y):
        return np.mean(self.predict(X) == y)
    def save(self, filepath):
        """Save model weights"""
        np.save(filepath + '_W.npy', self.W)
        np.save(filepath + '_b.npy', self.b)
        print(f"Model saved: {filepath}")
    @staticmethod
    def load(filepath, input_size, num_classes=2, learning_rate=0.1):
        """Load model weights"""
        model = LogisticRegression(input_size, num_classes, learning_rate)
        model.W = np.load(filepath + '_W.npy')
        model.b = np.load(filepath + '_b.npy')
        print(f"Model loaded: {filepath}")
        return model
class MLP(BaseModel):
    """
    Multilayer perceptron (neural network)
    Structure: input → linear transform → ReLU → linear transform → Softmax → output
    Differences from LogisticRegression:
    - One extra hidden layer plus a nonlinear activation
    - Can learn nonlinear relationships
    - Has more parameters
    Parameter count:
    - W1: input_size × hidden_size
    - b1: hidden_size
    - W2: hidden_size × num_classes
    - b2: num_classes
    """
    def __init__(self, input_size, hidden_size=64, num_classes=2,
                 learning_rate=0.1, keep_prob=1.0, class_weight=None, seed=42):
        np.random.seed(seed)
        # First-layer weights
        self.W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
        self.b1 = np.zeros(hidden_size)
        # Second-layer weights
        self.W2 = np.random.randn(hidden_size, num_classes) * np.sqrt(2.0 / hidden_size)
        self.b2 = np.zeros(num_classes)
        self.lr = learning_rate
        self.keep_prob = keep_prob
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.num_classes = num_classes
        self.class_weight = class_weight  # class weights
        total_params = (input_size * hidden_size + hidden_size +
                        hidden_size * num_classes + num_classes)
        print(f"MLP: {input_size} -> {hidden_size} -> {num_classes}, parameters: {total_params}")
    def relu(self, x):
        """ReLU activation"""
        return np.maximum(0, x)
    def relu_derivative(self, x):
        """Derivative of ReLU"""
        return (x > 0).astype(float)
    def softmax(self, x):
        """Softmax function"""
        x_shifted = x - np.max(x, axis=1, keepdims=True)
        exp_x = np.exp(x_shifted)
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)
    def forward(self, X):
        """Forward pass"""
        # First layer
        self.z1 = X @ self.W1 + self.b1
        self.a1 = self.relu(self.z1)
        # Dropout (training only; getattr keeps it off at inference,
        # where the original hasattr check would have left it on after fit)
        if self.keep_prob < 1.0 and getattr(self, 'training', False):
            self.d1 = (np.random.rand(*self.a1.shape) < self.keep_prob).astype(float)
            self.a1 *= self.d1
            self.a1 /= self.keep_prob
        # Second layer
        self.z2 = self.a1 @ self.W2 + self.b2
        self.probs = self.softmax(self.z2)
        return self.probs
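    # Note on the inverted-dropout scaling above: dividing the kept activations
    # by keep_prob preserves their expected value, so no rescaling is needed at
    # inference time. E.g. with keep_prob = 0.8, a surviving unit contributes
    # a / 0.8, and E[output] = 0.8 * (a / 0.8) = a, matching the no-dropout case.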
    def backward(self, X, y):
        """Backward pass"""
        batch_size = X.shape[0]
        # Output-layer gradient
        d_z2 = self.probs.copy()
        # Apply class weights (same simplification as in LogisticRegression)
        if self.class_weight is not None:
            for i in range(batch_size):
                d_z2[i, y[i]] -= self.class_weight[y[i]]
        else:
            d_z2[np.arange(batch_size), y] -= 1
        # Second-layer gradients
        d_W2 = self.a1.T @ d_z2
        d_b2 = np.sum(d_z2, axis=0)
        # Hidden-layer gradient
        d_a1 = d_z2 @ self.W2.T
        d_z1 = d_a1 * self.relu_derivative(self.z1)
        # Dropout gradient
        if self.keep_prob < 1.0 and hasattr(self, 'd1'):
            d_z1 *= self.d1
            d_z1 /= self.keep_prob
        # First-layer gradients
        d_W1 = X.T @ d_z1
        d_b1 = np.sum(d_z1, axis=0)
        # Update
        self.W1 -= self.lr * d_W1 / batch_size
        self.b1 -= self.lr * d_b1 / batch_size
        self.W2 -= self.lr * d_W2 / batch_size
        self.b2 -= self.lr * d_b2 / batch_size
    def fit(self, X, y, X_val=None, y_val=None, epochs=100, batch_size=32, verbose=True):
        """Train"""
        num_samples = len(X)
        num_batches = (num_samples + batch_size - 1) // batch_size
        for epoch in range(epochs):
            # Shuffle
            indices = np.random.permutation(num_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]
            epoch_loss = 0
            self.training = True  # enable dropout
            for batch_idx in range(num_batches):
                start = batch_idx * batch_size
                end = min(start + batch_size, num_samples)
                X_batch = X_shuffled[start:end]
                y_batch = y_shuffled[start:end]
                # Forward + backward
                probs = self.forward(X_batch)
                self.backward(X_batch, y_batch)
                # Loss
                loss = -np.mean(np.log(np.clip(probs[np.arange(len(y_batch)), y_batch], 1e-10, 1)))
                epoch_loss += loss
            self.training = False  # disable dropout
            # Evaluate
            if verbose and (epoch + 1) % 20 == 0:
                train_acc = self.accuracy(X, y)
                msg = f"Epoch {epoch+1:3d}/{epochs} | Loss: {epoch_loss/num_batches:.4f} | train acc: {train_acc:.4f}"
                if X_val is not None:
                    val_acc = self.accuracy(X_val, y_val)
                    msg += f" | test acc: {val_acc:.4f}"
                print(msg)
        return self
    def predict(self, X):
        return np.argmax(self.forward(X), axis=1)
    def predict_proba(self, X):
        return self.forward(X)
    def accuracy(self, X, y):
        return np.mean(self.predict(X) == y)
    def save(self, filepath):
        """Save model weights"""
        np.save(filepath + '_W1.npy', self.W1)
        np.save(filepath + '_b1.npy', self.b1)
        np.save(filepath + '_W2.npy', self.W2)
        np.save(filepath + '_b2.npy', self.b2)
        print(f"Model saved: {filepath}")
    @staticmethod
    def load(filepath, input_size, hidden_size=64, num_classes=2, learning_rate=0.1, keep_prob=1.0):
        """Load model weights"""
        model = MLP(input_size, hidden_size, num_classes, learning_rate, keep_prob)
        model.W1 = np.load(filepath + '_W1.npy')
        model.b1 = np.load(filepath + '_b1.npy')
        model.W2 = np.load(filepath + '_W2.npy')
        model.b2 = np.load(filepath + '_b2.npy')
        print(f"Model loaded: {filepath}")
        return model
def create_model(model_type, input_size, hidden_size=64, num_classes=2,
                 learning_rate=0.1, keep_prob=1.0, class_weight=None):
    """Factory function: create a model"""
    if model_type == 'lr':
        return LogisticRegression(input_size, num_classes, learning_rate, class_weight)
    elif model_type == 'mlp':
        return MLP(input_size, hidden_size, num_classes, learning_rate, keep_prob, class_weight)
    else:
        raise ValueError(f"Unknown model type: {model_type}")
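
A quick way to sanity-check the softmax-plus-cross-entropy gradient that backward() builds on is to compare the analytic form against a numerical derivative. A minimal standalone sketch (not part of the commit; the toy logits and the weighted loss L = -w_y · log p_y are assumptions for illustration):

import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

rng = np.random.default_rng(0)
z = rng.normal(size=(1, 2))              # logits for a single sample
y, w = 1, np.array([0.73, 1.58])         # true class and class weights

def loss(logits):
    return -w[y] * np.log(softmax(logits)[0, y])

# Fully weighted analytic gradient: w_y * (p - onehot(y))
p = softmax(z)[0]
analytic = w[y] * (p - np.eye(2)[y])

# Central-difference numerical gradient
eps, numeric = 1e-6, np.zeros(2)
for j in range(2):
    d = np.zeros((1, 2)); d[0, j] = eps
    numeric[j] = (loss(z + d) - loss(z - d)) / (2 * eps)
print(np.allclose(analytic, numeric, atol=1e-6))  # True

# backward() in this commit instead uses p - w_y * onehot(y), a simpler
# variant that coincides with the exact gradient only when w_y == 1.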

predict.py
View File

@@ -1,264 +1,264 @@
# -*- coding: utf-8 -*-
"""
Prediction script - load a trained model and test custom texts
Usage:
    python predict.py
The program will:
    1. List the saved models
    2. Let the student choose a model
    3. Load the model and the vectorizer
    4. Predict the sentiment of texts the student types in, interactively
"""
import os
import re
import sys
import jieba
import numpy as np
import math
import csv
from collections import Counter
# Add this file's directory to the path so imports work
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from config import DATA_DIR, MAX_FEATURES, MAX_SEQ_LEN, HIDDEN_SIZE
def find_saved_models():
    """Find saved models"""
    models = {}
    for f in os.listdir('.'):
        if not f.startswith('model_') or not f.endswith('.npy'):
            continue
        # MLP: model_mlp_tfidf_W1.npy -> model_mlp_tfidf
        # LR:  model_lr_bow_W.npy     -> model_lr_bow
        # Match suffixes exactly so names like 'tfidf' are not truncated
        for suffix in ['_W1.npy', '_b1.npy', '_W2.npy', '_b2.npy', '_W.npy', '_b.npy']:
            if f.endswith(suffix):
                name = f[:-len(suffix)]
                models[name] = True
                break
    return list(models.keys())
def tokenize(text):
    """Chinese word segmentation"""
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z]', ' ', text)
    words = jieba.lcut(text)
    return [w for w in words if len(w) > 1]
class BoWVectorizer:
    def __init__(self, max_seq_len, max_features=3000):
        self.max_seq_len = max_seq_len
        self.max_features = max_features
        self.vocab = {}
    def fit(self, texts):
        counter = Counter()
        for text in texts:
            words = tokenize(text)
            counter.update(words)
        most_common = counter.most_common(self.max_features)
        self.vocab = {word: idx for idx, (word, _) in enumerate(most_common)}
        return self
    def transform(self, text):
        words = tokenize(text)
        vec = [0] * self.max_seq_len
        for i, word in enumerate(words[:self.max_seq_len]):
            if word in self.vocab:
                vec[i] = 1
        return np.array([vec], dtype=np.float32)
class TFIDFVectorizer:
    def __init__(self, max_seq_len, max_features=3000):
        self.max_seq_len = max_seq_len
        self.max_features = max_features
        self.vocab = {}
        self.idf = {}
        self.num_docs = 0
    def fit(self, texts):
        doc_counter = Counter()
        for text in texts:
            for w in set(tokenize(text)):
                doc_counter[w] += 1
        self.num_docs = len(texts)
        # Compute smoothed IDF for every word, then keep the highest-IDF words,
        # matching the vocabulary built by dataset.py at training time
        # (selecting by raw frequency here would produce a different vocabulary
        # and misaligned features at prediction time)
        idf_values = {w: math.log(self.num_docs / (df + 1)) + 1
                      for w, df in doc_counter.items()}
        sorted_words = sorted(idf_values.items(), key=lambda x: x[1], reverse=True)
        self.vocab = {word: idx for idx, (word, _) in enumerate(sorted_words[:self.max_features])}
        self.idf = {word: idf_values[word] for word in self.vocab}
        return self
    def transform(self, text):
        words = tokenize(text)
        tf = Counter(words)
        tf_sum = len(words) if words else 1
        vec = [0.0] * self.max_seq_len
        for i, word in enumerate(words[:self.max_seq_len]):
            if word in self.vocab:
                vec[i] = (tf[word] / tf_sum) * self.idf.get(word, 0)
        return np.array([vec], dtype=np.float32)
def load_vectorizer(vectorizer_type):
    """Load (rebuild) the vectorizer"""
    csv_path = os.path.join(DATA_DIR, 'ChnSentiCorp_htl_all.csv')
    if not os.path.exists(csv_path):
        print("Data file not found; run main.py first to train a model")
        return None
    print("Loading data to rebuild the vocabulary...")
    texts, labels = [], []
    with open(csv_path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        for row in reader:
            if len(row) < 2:
                continue
            try:
                labels.append(int(row[0]))
                texts.append(row[1].strip())
            except (ValueError, IndexError):
                continue
    # Determine the vectorizer type
    is_tfidf = 'tfidf' in vectorizer_type.lower()
    if is_tfidf:
        vectorizer = TFIDFVectorizer(MAX_SEQ_LEN)
        vectorizer.fit(texts)
        print(f"  TF-IDF vocabulary size: {len(vectorizer.vocab)}")
    else:
        vectorizer = BoWVectorizer(MAX_SEQ_LEN)
        vectorizer.fit(texts)
        print(f"  BoW vocabulary size: {len(vectorizer.vocab)}")
    return vectorizer
def load_model(model_name, model_type, input_size, hidden_size):
    """Load a model"""
    from model_numpy import MLP, LogisticRegression
    if model_type == 'lr':
        model = LogisticRegression(input_size, 2, learning_rate=0.05)
        model.W = np.load(model_name + '_W.npy')
        model.b = np.load(model_name + '_b.npy')
    else:  # mlp
        model = MLP(input_size, hidden_size, 2, learning_rate=0.05, keep_prob=1.0)
        model.W1 = np.load(model_name + '_W1.npy')
        model.b1 = np.load(model_name + '_b1.npy')
        model.W2 = np.load(model_name + '_W2.npy')
        model.b2 = np.load(model_name + '_b2.npy')
    return model
def predict_text(model, vectorizer, text):
    """Predict a single text"""
    vec = vectorizer.transform(text)
    prob = model.forward(vec)[0]
    pred = np.argmax(prob)
    label = "Positive" if pred == 1 else "Negative"
    confidence = prob[pred] * 100
    return label, confidence, prob
def main():
    print("\n" + "=" * 60)
    print("Sentiment prediction - load a trained model")
    print("=" * 60)
    # Find saved models
    models = find_saved_models()
    if not models:
        print("\nNo saved models found!")
        print("Run python main.py first to train a model")
        return
    # Let the user choose a model
    print(f"\nFound {len(models)} model(s):")
    for i, name in enumerate(models, 1):
        print(f"  {i}. {name}")
    print(f"\nChoose a model number (1-{len(models)}): ", end="", flush=True)
    try:
        choice = int(sys.stdin.readline().strip())
        if choice < 1 or choice > len(models):
            print("Invalid choice")
            return
        model_name = models[choice - 1]
    except ValueError:
        print("Invalid input")
        return
    # Parse the model name to recover its type
    parts = model_name.split('_')
    model_type = parts[1]       # 'lr' or 'mlp'
    vectorizer_type = parts[2]  # 'bow' or 'tfidf'
    print(f"\nSelected model: {model_name}")
    print(f"Model type: {model_type.upper()}")
    print(f"Vectorization: {vectorizer_type.upper()}")
    # Load the vectorizer
    print("\nLoading vectorizer...")
    vectorizer = load_vectorizer(vectorizer_type)
    if vectorizer is None:
        return
    # Load the model
    print("Loading model...")
    model = load_model(model_name, model_type, MAX_SEQ_LEN, HIDDEN_SIZE)
    print("Model loaded!")
    # Prediction loop
    print("\n" + "=" * 60)
    print("Ready: type a review and press Enter (q to quit)")
    print("=" * 60)
    while True:
        try:
            print("\nEnter a review: ", end="", flush=True)
            text = sys.stdin.readline().strip()
            if text.lower() == 'q':
                print("Goodbye!")
                break
            if not text:
                continue
            label, confidence, prob = predict_text(model, vectorizer, text)
            print(f"\nPrediction: {label}")
            print(f"Confidence: {confidence:.1f}%")
            print(f"Details: positive={prob[1]*100:.1f}%, negative={prob[0]*100:.1f}%")
        except KeyboardInterrupt:
            print("\n\nGoodbye!")
            break
        except Exception as e:
            print(f"Prediction error: {e}")
if __name__ == '__main__':
    main()
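
For scripted, non-interactive use, the same helpers can be driven directly. A minimal sketch, assuming this file is saved as predict.py and a trained MLP/TF-IDF model was saved under the (hypothetical) name model_mlp_tfidf:

# Hypothetical batch usage of the helpers defined above
from predict import load_vectorizer, load_model, predict_text
from config import MAX_SEQ_LEN, HIDDEN_SIZE

vectorizer = load_vectorizer('tfidf')   # rebuilds the vocabulary from the CSV
model = load_model('model_mlp_tfidf', 'mlp', MAX_SEQ_LEN, HIDDEN_SIZE)
for review in ["房间很干净,服务也不错", "设施陈旧,隔音太差"]:
    label, confidence, _ = predict_text(model, vectorizer, review)
    print(f"{review} -> {label} ({confidence:.1f}%)")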