Upload files to /

This commit is contained in:
2026-04-30 15:57:10 +08:00
parent 5f68c5649e
commit 43845ba362
5 changed files with 360 additions and 360 deletions

View File

@@ -1,40 +1,40 @@
# -*- coding: utf-8 -*-
"""
Configuration file - all hyperparameters managed in one place
Design idea:
    Hyperparameters are grouped by category, so students can change one
    group without affecting the others.
"""
# ==================== Data ====================
DATA_DIR = 'data/ChnSentiCorp'      # dataset path
MAX_FEATURES = 3000                 # maximum vocabulary size
MAX_SEQ_LEN = 100                   # maximum sentence length (in words)
VECTORIZER_TYPE = 'tfidf'           # 'tfidf' or 'bow' (vectorization method)
# ==================== Model ====================
MODEL_TYPE = 'mlp'                  # 'mlp' or 'lr' (model type)
-HIDDEN_SIZE = 64                   # MLP hidden layer size (ignored by LR)
+HIDDEN_SIZE = 66                   # MLP hidden layer size (ignored by LR)
-NUM_CLASSES = 2                    # number of classes (binary: positive/negative)
+NUM_CLASSES = 3                    # number of classes (binary: positive/negative)
KEEP_PROB = 1.0                     # dropout keep probability (ignored by LR; set to 1)
# ==================== Training ====================
-LEARNING_RATE = 0.05               # learning rate
+LEARNING_RATE = 0.1                # learning rate
-NUM_EPOCHS = 100                   # number of training epochs
+NUM_EPOCHS = 50                    # number of training epochs
-BATCH_SIZE = 64                    # batch size
+BATCH_SIZE = 55                    # batch size
# ==================== Class weights (to handle class imbalance) ====================
USE_CLASS_WEIGHT = True             # True = use class weights, False = don't (for comparison)
# Weight formula: n_samples / (n_classes * n_class_i)
# Positive reviews are the majority, so their weight is small; negative reviews are rarer, so theirs is large
-CLASS_WEIGHT_POS = 0.73            # positive-class weight (auto-computed)
+CLASS_WEIGHT_POS = 0.1             # positive-class weight (auto-computed)
-CLASS_WEIGHT_NEG = 1.58            # negative-class weight (auto-computed)
+CLASS_WEIGHT_NEG = 1.66            # negative-class weight (auto-computed)
# ==================== Experiments ====================
RUN_COMPARISON = False              # True = run comparison experiments, False = run a single model
COMPARE_MODELS = ['lr', 'mlp']      # models to compare
COMPARE_VECTORS = ['bow', 'tfidf']  # vectorization methods to compare
# ==================== Misc ====================
RANDOM_SEED = 42                    # random seed (for reproducibility)
VERBOSE = True                      # print verbose logs
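
A note on the class-weight lines above: the formula quoted in the comment, n_samples / (n_classes * n_class_i), reproduces the previous defaults (0.73 / 1.58) if one assumes the commonly cited ChnSentiCorp_htl_all split of 5322 positive and 2444 negative reviews. A minimal sketch with those counts as assumptions (they are not values taken from this commit):

# Sketch: deriving class weights via the formula quoted in config.py.
# n_pos and n_neg are assumed counts, not values from this commit.
n_pos, n_neg = 5322, 2444                      # assumed positive/negative review counts
n_samples = n_pos + n_neg                      # 7766
n_classes = 2

weight_pos = n_samples / (n_classes * n_pos)   # ~0.73
weight_neg = n_samples / (n_classes * n_neg)   # ~1.59 (the old 1.58 looks truncated)

print(f"CLASS_WEIGHT_POS = {weight_pos:.2f}")
print(f"CLASS_WEIGHT_NEG = {weight_neg:.2f}")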

View File

@@ -1,286 +1,286 @@
# -*- coding: utf-8 -*-
"""
Data loading and vectorization module
Supports two vectorization methods:
    1. BoW (Bag of Words) - term-frequency vectors
    2. TF-IDF - term frequency-inverse document frequency vectors
Advantages of TF-IDF:
    - Down-weights very common (stop)words
    - Boosts rare but informative words
    - Usually outperforms plain BoW
"""
import os
import re
import csv
import math
import jieba
import numpy as np
from collections import Counter

try:
    import urllib.request
    import ssl
    DOWNLOAD_AVAILABLE = True
except ImportError:
    DOWNLOAD_AVAILABLE = False

DATASET_URL = "https://raw.githubusercontent.com/SophonPlus/ChineseNlpCorpus/master/datasets/ChnSentiCorp_htl_all/ChnSentiCorp_htl_all.csv"

def download_dataset(data_dir):
    """Download the dataset if it is not already present."""
    csv_path = os.path.join(data_dir, 'ChnSentiCorp_htl_all.csv')
    if os.path.exists(csv_path):
        print(f"Data already exists: {csv_path}")
        return True
    if not DOWNLOAD_AVAILABLE:
        return False
    print("Downloading dataset...")
    ssl_context = ssl.create_default_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE
    try:
        request = urllib.request.Request(DATASET_URL, headers={'User-Agent': 'Mozilla/5.0'})
        response = urllib.request.urlopen(request, timeout=120, context=ssl_context)
        os.makedirs(data_dir, exist_ok=True)
        with open(csv_path, 'wb') as f:
            f.write(response.read())
        print(f"Download complete: {csv_path}")
        return True
    except Exception as e:
        print(f"Download failed: {e}")
        return False

def load_raw_data(data_dir):
    """Load the raw CSV data."""
    csv_path = os.path.join(data_dir, 'ChnSentiCorp_htl_all.csv')
    texts, labels = [], []
    with open(csv_path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        for row in reader:
            if len(row) < 2:
                continue
            try:
                label = int(row[0])
                review = row[1].strip()
                if review:
                    texts.append(review)
                    labels.append(label)
            except (ValueError, IndexError):
                continue
    return texts, np.array(labels)

def tokenize(text):
    """Chinese word segmentation."""
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z]', ' ', text)  # keep only CJK characters and letters
    words = jieba.lcut(text)
    return [w for w in words if len(w) > 1]  # drop single-character tokens

# ==================== Vectorizers ====================
class BaseVectorizer:
    """Vectorizer base class."""
    def fit(self, texts): pass
    def transform(self, texts): pass
    def fit_transform(self, texts): pass

class BoWVectorizer(BaseVectorizer):
    """
    Bag-of-Words model
    Principle: count how often each word occurs in a text
        Vector dimension = vocabulary size
        Each dimension = the word's count in this text
    """
    def __init__(self, max_features, max_seq_len):
        self.max_features = max_features
        self.max_seq_len = max_seq_len
        self.vocab = {}
        self.doc_freq = {}  # document frequency
        self.num_docs = 0

    def fit(self, texts):
        """Build the vocabulary (by term frequency)."""
        counter = Counter()
        doc_counter = Counter()  # number of documents containing each word
        for text in texts:
            words = tokenize(text)
            unique_words = set(words)
            counter.update(words)
            for w in unique_words:
                doc_counter[w] += 1
        self.num_docs = len(texts)
        # Keep the highest-frequency words
        most_common = counter.most_common(self.max_features)
        self.vocab = {word: idx for idx, (word, _) in enumerate(most_common)}
        # Store document frequencies for TF-IDF
        self.doc_freq = {w: doc_counter[w] for w in self.vocab}
        print(f"  BoW vocabulary size: {len(self.vocab)}")
        return self

    def transform(self, texts):
        """Convert texts to vectors (binary presence by position)."""
        vectors = []
        for text in texts:
            words = tokenize(text)
            freq = [0] * self.max_seq_len
            for i, word in enumerate(words[:self.max_seq_len]):
                if word in self.vocab:
                    freq[i] = 1  # binary (present = 1, absent = 0)
            vectors.append(freq)
        return np.array(vectors, dtype=np.float32)

    def fit_transform(self, texts):
        self.fit(texts)
        return self.transform(texts)

class TFIDFVectorizer(BaseVectorizer):
    """
    TF-IDF vectorizer
    Principle:
        - TF (term frequency) = the word's count in this text
        - IDF (inverse document frequency) = log(total docs / docs containing the word)
        - TF-IDF = TF × IDF
    Advantages:
        - Down-weights common, uninformative (stop)words
        - Boosts rare but informative words
    """
    def __init__(self, max_features, max_seq_len):
        self.max_features = max_features
        self.max_seq_len = max_seq_len
        self.vocab = {}
        self.idf = {}  # each word's IDF value
        self.num_docs = 0

    def fit(self, texts):
        """Build the vocabulary and compute IDF values."""
        counter = Counter()
        doc_counter = Counter()
        for text in texts:
            words = tokenize(text)
            unique_words = set(words)
            counter.update(words)
            for w in unique_words:
                doc_counter[w] += 1
        self.num_docs = len(texts)
        # Compute each word's IDF:
        # IDF = log(total docs / docs containing the word)
        idf_values = {}
        for word, df in doc_counter.items():
            idf_values[word] = math.log(self.num_docs / (df + 1)) + 1  # +1 to avoid zero
        # Keep the words with the highest IDF (the most informative ones)
        sorted_words = sorted(idf_values.items(), key=lambda x: x[1], reverse=True)
        self.vocab = {word: idx for idx, (word, _) in enumerate(sorted_words[:self.max_features])}
        # Store the IDF values
        self.idf = {word: idf_values[word] for word in self.vocab}
        print(f"  TF-IDF vocabulary size: {len(self.vocab)}")
        print(f"  Mean IDF: {np.mean(list(self.idf.values())):.3f}")
        return self

    def transform(self, texts):
        """Convert texts to TF-IDF vectors."""
        vectors = []
        for text in texts:
            words = tokenize(text)
            # Compute TF
            tf = Counter(words)
            tf_sum = len(words) if words else 1
            # Build the vector
            vec = [0.0] * self.max_seq_len
            for i, word in enumerate(words[:self.max_seq_len]):
                if word in self.vocab:
                    # TF × IDF
                    vec[i] = (tf[word] / tf_sum) * self.idf.get(word, 0)
            vectors.append(vec)
        return np.array(vectors, dtype=np.float32)

    def fit_transform(self, texts):
        self.fit(texts)
        return self.transform(texts)

def load_data(data_dir, max_features, max_seq_len, vectorizer_type='tfidf'):
    """
    Load and vectorize the data.
    Parameters:
        - vectorizer_type: 'tfidf' or 'bow'
    """
    if not download_dataset(data_dir):
        raise RuntimeError("Failed to load data; check the network or download the dataset manually")
    print("Loading data...")
    texts, labels = load_raw_data(data_dir)
    print(f"Total reviews: {len(texts)}, positive: {sum(labels)}, negative: {len(labels) - sum(labels)}")
    # Choose the vectorizer
    if vectorizer_type == 'tfidf':
        vectorizer = TFIDFVectorizer(max_features, max_seq_len)
        vec_name = "TF-IDF"
    else:
        vectorizer = BoWVectorizer(max_features, max_seq_len)
        vec_name = "BoW"
    print(f"Vectorizing with {vec_name}...")
    X = vectorizer.fit_transform(texts)
    y = labels
    # Shuffle and split
    np.random.seed(42)
    indices = np.random.permutation(len(X))
    X = X[indices]
    y = y[indices]
    split_idx = int(len(X) * 0.8)
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]
    print(f"Training set: {len(X_train)}, test set: {len(X_test)}")
    return X_train, y_train, X_test, y_test, vectorizer

if __name__ == '__main__':
    # Quick test
    print("=" * 60)
    print("Testing TF-IDF vectorization")
    print("=" * 60)
    X_train, y_train, X_test, y_test, vec = load_data(
        'data/ChnSentiCorp', max_features=3000, max_seq_len=100,
        vectorizer_type='tfidf'
    )
    print(f"\nX_train shape: {X_train.shape}")
    print(f"X_train sample (first 5 features): {X_train[0][:5]}")

main.py
View File

@@ -1,34 +1,34 @@
# -*- coding: utf-8 -*-
"""
Main entry point
Usage:
    1. Run a single model (default):
        python main.py
        Edit MODEL_TYPE and VECTORIZER_TYPE in config.py to switch configurations.
    2. Run the comparison experiments:
        Set RUN_COMPARISON = True in config.py.
        This runs, in order:
        - Experiment 1: BoW vs TF-IDF (LR model fixed)
        - Experiment 2: LR vs MLP (TF-IDF fixed)
        - Experiment 3: different learning rates
        - Experiment 4: different hidden layer sizes
        and prints a summary report at the end.
"""
from train import main

if __name__ == '__main__':
    print("\n" + "=" * 70)
    print("Text classification experiments - pure NumPy implementation")
    print("Dataset: ChnSentiCorp (Chinese hotel reviews)")
    print("Models: Logistic Regression / MLP")
    print("Vectorization: BoW / TF-IDF")
    print("=" * 70 + "\n")
    main()
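
train.py is not included in this commit, so the comparison driver that main() dispatches to is not shown. As a rough sketch, the loop implied by COMPARE_MODELS and COMPARE_VECTORS presumably looks something like the following; run_experiment is a hypothetical helper, not a function from this repository:

import config  # the config.py from this commit

def run_all_comparisons(run_experiment):
    """Run every (model, vectorizer) combination listed in config.py."""
    results = {}
    for model in config.COMPARE_MODELS:        # ['lr', 'mlp']
        for vec in config.COMPARE_VECTORS:     # ['bow', 'tfidf']
            results[(model, vec)] = run_experiment(model_type=model,
                                                   vectorizer_type=vec)
    return results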

Binary file not shown.

Binary file not shown.