Upload files to /

2026-04-30 15:53:27 +08:00
parent 2fc549d284
commit 58da0d6314
5 changed files with 7934 additions and 360 deletions

ChnSentiCorp_htl_all.csv — new file (+7767 lines)

File diff suppressed because one or more lines are too long

config.py

@@ -1,40 +1,19 @@
-# -*- coding: utf-8 -*-
-"""
-Config file - all hyperparameters managed in one place.
-Design idea:
-group hyperparameters by category, so students can change one group without affecting the others.
-"""
-# ==================== Data ====================
-DATA_DIR = 'data/ChnSentiCorp'      # dataset path
-MAX_FEATURES = 3000                 # maximum vocabulary size
-MAX_SEQ_LEN = 100                   # maximum sentence length (in words)
-VECTORIZER_TYPE = 'tfidf'           # 'tfidf' or 'bow' (vectorization method)
-# ==================== Model ====================
-MODEL_TYPE = 'mlp'                  # 'mlp' or 'lr' (model type)
-HIDDEN_SIZE = 64                    # MLP hidden-layer size (ignored by LR)
-NUM_CLASSES = 2                     # number of classes (positive/negative binary)
-KEEP_PROB = 1.0                     # dropout keep probability (ignored by LR; set to 1.0)
-# ==================== Training ====================
-LEARNING_RATE = 0.05                # learning rate
-NUM_EPOCHS = 100                    # number of training epochs
-BATCH_SIZE = 64                     # batch size
-# ==================== Class weights (for class imbalance) ====================
-USE_CLASS_WEIGHT = True             # True = use class weights, False = off (for comparison)
-# Weight formula: n_samples / (n_classes * n_class_i)
-# Positive reviews are plentiful, so their weight is small; negative reviews are scarce, so theirs is large.
-CLASS_WEIGHT_POS = 0.73             # positive-class weight (computed automatically)
-CLASS_WEIGHT_NEG = 1.58             # negative-class weight (computed automatically)
-# ==================== Experiments ====================
-RUN_COMPARISON = False              # True = run comparison experiments, False = run a single model
-COMPARE_MODELS = ['lr', 'mlp']      # models to compare
-COMPARE_VECTORS = ['bow', 'tfidf']  # vectorization methods to compare
-# ==================== Misc ====================
-RANDOM_SEED = 42                    # random seed (for reproducibility)
-VERBOSE = True                      # print verbose logs
+# Model config
+MODEL_TYPE = 'mlp'         # 'lr' logistic regression / 'mlp' multilayer perceptron
+VECTORIZER_TYPE = 'tfidf'  # 'bow' / 'tfidf'
+# Training params
+NUM_EPOCHS = 100
+LEARNING_RATE = 0.05
+BATCH_SIZE = 32
+HIDDEN_SIZE = 64
+# Text params
+MAX_FEATURES = 3000        # vocabulary size
+MAX_SEQ_LEN = 100          # maximum text length
+# Class imbalance
+USE_CLASS_WEIGHT = True    # class weights
+# Experiment comparison
+RUN_COMPARISON = True
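The old config derives its two hard-coded class weights from the formula n_samples / (n_classes * n_class_i). A quick recomputation as a sketch, assuming the commonly cited ChnSentiCorp_htl_all split of 5322 positive and 2444 negative reviews (consistent with the 7767-line CSV above, header included):

# Recomputing the auto-generated weights from the old config (assumed counts).
n_pos, n_neg = 5322, 2444            # positive / negative review counts
n_samples = n_pos + n_neg            # 7766
w_pos = n_samples / (2 * n_pos)      # 0.7296 -> the 0.73 above
w_neg = n_samples / (2 * n_neg)      # 1.5887 -> rounded to 1.58 above
print(round(w_pos, 2), round(w_neg, 2))

With these counts w_neg comes out at 1.59, so the 1.58 in the old config was presumably computed from a slightly different split or rounded down.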

dataset.py

@@ -1,286 +1,88 @@
-# -*- coding: utf-8 -*-
-"""
-Data loading and vectorization module.
-Supports two vectorization methods:
-1. BoW (Bag of Words) - term-frequency vectors
-2. TF-IDF - term frequency-inverse document frequency vectors
-Advantages of TF-IDF:
-- down-weights very common words
-- boosts rare but informative words
-- usually outperforms plain BoW
-"""
-import os
-import re
-import csv
-import math
-import jieba
-import numpy as np
-from collections import Counter
-try:
-    import urllib.request
-    import ssl
-    DOWNLOAD_AVAILABLE = True
-except ImportError:
-    DOWNLOAD_AVAILABLE = False
-
-DATASET_URL = "https://raw.githubusercontent.com/SophonPlus/ChineseNlpCorpus/master/datasets/ChnSentiCorp_htl_all/ChnSentiCorp_htl_all.csv"
-
-def download_dataset(data_dir):
-    """Download the dataset if it is not already present."""
-    csv_path = os.path.join(data_dir, 'ChnSentiCorp_htl_all.csv')
-    if os.path.exists(csv_path):
-        print(f"Data already exists: {csv_path}")
-        return True
-    if not DOWNLOAD_AVAILABLE:
-        return False
-    print("Downloading dataset...")
-    ssl_context = ssl.create_default_context()
-    ssl_context.check_hostname = False
-    ssl_context.verify_mode = ssl.CERT_NONE
-    try:
-        request = urllib.request.Request(DATASET_URL, headers={'User-Agent': 'Mozilla/5.0'})
-        response = urllib.request.urlopen(request, timeout=120, context=ssl_context)
-        os.makedirs(data_dir, exist_ok=True)
-        with open(csv_path, 'wb') as f:
-            f.write(response.read())
-        print(f"Download finished: {csv_path}")
-        return True
-    except Exception as e:
-        print(f"Download failed: {e}")
-        return False
-
-def load_raw_data(data_dir):
-    """Load the raw data."""
-    csv_path = os.path.join(data_dir, 'ChnSentiCorp_htl_all.csv')
-    texts, labels = [], []
-    with open(csv_path, 'r', encoding='utf-8') as f:
-        reader = csv.reader(f)
-        for row in reader:
-            if len(row) < 2:
-                continue
-            try:
-                label = int(row[0])
-                review = row[1].strip()
-                if review:
-                    texts.append(review)
-                    labels.append(label)
-            except (ValueError, IndexError):
-                continue
-    return texts, np.array(labels)
-
-def tokenize(text):
-    """Chinese word segmentation."""
-    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z]', ' ', text)
-    words = jieba.lcut(text)
-    return [w for w in words if len(w) > 1]
-
-# ==================== Vectorizers ====================
-
-class BaseVectorizer:
-    """Vectorizer base class."""
-    def fit(self, texts): pass
-    def transform(self, texts): pass
-    def fit_transform(self, texts): pass
-
-class BoWVectorizer(BaseVectorizer):
-    """
-    Bag-of-words model.
-    Idea: count how often each word occurs in a text.
-    Vector dimension = vocabulary size;
-    each dimension = that word's count in this text.
-    """
-    def __init__(self, max_features, max_seq_len):
-        self.max_features = max_features
-        self.max_seq_len = max_seq_len
-        self.vocab = {}
-        self.doc_freq = {}  # document frequency
-        self.num_docs = 0
-    def fit(self, texts):
-        """Build the vocabulary (by term frequency)."""
-        counter = Counter()
-        doc_counter = Counter()  # number of documents containing each word
-        for text in texts:
-            words = tokenize(text)
-            unique_words = set(words)
-            counter.update(words)
-            for w in unique_words:
-                doc_counter[w] += 1
-        self.num_docs = len(texts)
-        # keep the most frequent words
-        most_common = counter.most_common(self.max_features)
-        self.vocab = {word: idx for idx, (word, _) in enumerate(most_common)}
-        # record document frequencies for TF-IDF
-        self.doc_freq = {w: doc_counter[w] for w in self.vocab}
-        print(f"  BoW vocabulary size: {len(self.vocab)}")
-        return self
-    def transform(self, texts):
-        """Convert texts to term-frequency vectors."""
-        vectors = []
-        for text in texts:
-            words = tokenize(text)
-            freq = [0] * self.max_seq_len
-            for i, word in enumerate(words[:self.max_seq_len]):
-                if word in self.vocab:
-                    freq[i] = 1  # binary (present=1, absent=0)
-            vectors.append(freq)
-        return np.array(vectors, dtype=np.float32)
-    def fit_transform(self, texts):
-        self.fit(texts)
-        return self.transform(texts)
-
-class TFIDFVectorizer(BaseVectorizer):
-    """
-    TF-IDF vectorizer.
-    Idea:
-    - TF (term frequency) = how often the word occurs in this text
-    - IDF (inverse document frequency) = log(total documents / documents containing the word)
-    - TF-IDF = TF x IDF
-    Advantages:
-    - down-weights common, uninformative words
-    - boosts rare but informative words
-    """
-    def __init__(self, max_features, max_seq_len):
-        self.max_features = max_features
-        self.max_seq_len = max_seq_len
-        self.vocab = {}
-        self.idf = {}  # IDF value per word
-        self.num_docs = 0
-    def fit(self, texts):
-        """Build the vocabulary and compute IDF."""
-        counter = Counter()
-        doc_counter = Counter()
-        for text in texts:
-            words = tokenize(text)
-            unique_words = set(words)
-            counter.update(words)
-            for w in unique_words:
-                doc_counter[w] += 1
-        self.num_docs = len(texts)
-        # IDF = log(total documents / documents containing the word)
-        idf_values = {}
-        for word, df in doc_counter.items():
-            idf_values[word] = math.log(self.num_docs / (df + 1)) + 1  # +1 to avoid zero
-        # keep the words with the highest IDF (the most informative ones)
-        sorted_words = sorted(idf_values.items(), key=lambda x: x[1], reverse=True)
-        self.vocab = {word: idx for idx, (word, _) in enumerate(sorted_words[:self.max_features])}
-        # store the IDF values
-        self.idf = {word: idf_values[word] for word in self.vocab}
-        print(f"  TF-IDF vocabulary size: {len(self.vocab)}")
-        print(f"  Mean IDF: {np.mean(list(self.idf.values())):.3f}")
-        return self
-    def transform(self, texts):
-        """Convert texts to TF-IDF vectors."""
-        vectors = []
-        for text in texts:
-            words = tokenize(text)
-            # compute TF
-            tf = Counter(words)
-            tf_sum = len(words) if words else 1
-            # build the vector
-            vec = [0.0] * self.max_seq_len
-            for i, word in enumerate(words[:self.max_seq_len]):
-                if word in self.vocab:
-                    vec[i] = (tf[word] / tf_sum) * self.idf.get(word, 0)  # TF x IDF
-            vectors.append(vec)
-        return np.array(vectors, dtype=np.float32)
-    def fit_transform(self, texts):
-        self.fit(texts)
-        return self.transform(texts)
-
-def load_data(data_dir, max_features, max_seq_len, vectorizer_type='tfidf'):
-    """
-    Load and vectorize the data.
-    Parameters:
-    - vectorizer_type: 'tfidf' or 'bow'
-    """
-    if not download_dataset(data_dir):
-        raise RuntimeError("Data loading failed; check the network or download the dataset manually")
-    print("Loading data...")
-    texts, labels = load_raw_data(data_dir)
-    print(f"Total reviews: {len(texts)}, positive: {sum(labels)}, negative: {len(labels) - sum(labels)}")
-    # choose the vectorizer
-    if vectorizer_type == 'tfidf':
-        vectorizer = TFIDFVectorizer(max_features, max_seq_len)
-        vec_name = "TF-IDF"
-    else:
-        vectorizer = BoWVectorizer(max_features, max_seq_len)
-        vec_name = "BoW"
-    print(f"Vectorizing with {vec_name}...")
-    X = vectorizer.fit_transform(texts)
-    y = labels
-    # shuffle and split
-    np.random.seed(42)
-    indices = np.random.permutation(len(X))
-    X = X[indices]
-    y = y[indices]
-    split_idx = int(len(X) * 0.8)
-    X_train, X_test = X[:split_idx], X[split_idx:]
-    y_train, y_test = y[:split_idx], y[split_idx:]
-    print(f"Training set: {len(X_train)}, test set: {len(X_test)}")
-    return X_train, y_train, X_test, y_test, vectorizer
-
-if __name__ == '__main__':
-    # quick test
-    print("=" * 60)
-    print("Testing TF-IDF vectorization")
-    print("=" * 60)
-    X_train, y_train, X_test, y_test, vec = load_data(
-        'data/ChnSentiCorp', max_features=3000, max_seq_len=100,
-        vectorizer_type='tfidf'
-    )
-    print(f"\nX_train shape: {X_train.shape}")
-    print(f"X_train sample (first 5 features): {X_train[0][:5]}")
+import numpy as np
+import jieba
+from collections import Counter
+import os
+import requests
+
+# Download the dataset
+def download_data():
+    url = "https://github.com/SophonPlus/ChineseNlpCorpus/raw/master/datasets/ChnSentiCorp_htl_all/ChnSentiCorp_htl_all.csv"
+    path = "ChnSentiCorp_htl_all.csv"
+    if not os.path.exists(path):
+        print("Downloading dataset...")
+        r = requests.get(url)
+        with open(path, "wb") as f:
+            f.write(r.content)
+    return path
+
+# Tokenize (filtering out single-character tokens)
+def tokenize(text):
+    words = jieba.lcut(str(text).strip())
+    return [w for w in words if len(w) > 1]
+
+# Load the data
+def load_data():
+    path = download_data()
+    texts, labels = [], []
+    with open(path, "r", encoding="utf-8") as f:
+        next(f)
+        for line in f:
+            parts = line.strip().split(",", 1)
+            if len(parts) != 2:
+                continue
+            label, text = parts
+            texts.append(text)
+            labels.append(int(label))
+    return texts, labels
+
+# BoW vectorizer
+class BoWVectorizer:
+    def __init__(self, max_features=3000):
+        self.max_features = max_features
+        self.vocab = {}
+    def fit(self, texts):
+        counter = Counter()
+        for t in texts:
+            counter.update(tokenize(t))
+        words = [w for w, _ in counter.most_common(self.max_features)]
+        self.vocab = {w: i for i, w in enumerate(words)}
+    def transform(self, text):
+        words = tokenize(text)
+        vec = np.zeros(self.max_features)
+        for w in words:
+            if w in self.vocab:
+                vec[self.vocab[w]] += 1
+        return vec
+
+# TF-IDF vectorizer
+class TFIDFVectorizer:
+    def __init__(self, max_features=3000):
+        self.max_features = max_features
+        self.vocab = {}
+        self.idf = {}
+    def fit(self, texts):
+        counter = Counter()
+        doc_freq = Counter()
+        for t in texts:
+            ws = set(tokenize(t))
+            counter.update(tokenize(t))
+            for w in ws:
+                doc_freq[w] += 1
+        words = [w for w, _ in counter.most_common(self.max_features)]
+        self.vocab = {w: i for i, w in enumerate(words)}
+        N = len(texts)
+        for w in self.vocab:
+            self.idf[w] = np.log(N / (doc_freq.get(w, 0) + 1))
+    def transform(self, text):
+        words = tokenize(text)
+        vec = np.zeros(self.max_features)
+        tf = Counter(words)
+        for w, cnt in tf.items():
+            if w in self.vocab:
+                vec[self.vocab[w]] = cnt * self.idf[w]
+        return vec

main.py

@@ -1,34 +1,60 @@
-# -*- coding: utf-8 -*-
-"""
-Main entry point.
-
-Usage:
-
-1. Run a single model (default):
-   python main.py
-   Edit MODEL_TYPE and VECTORIZER_TYPE in config.py to switch configurations.
-
-2. Run the comparison experiments:
-   Set RUN_COMPARISON = True in config.py.
-   This runs, in order:
-   - Experiment 1: BoW vs TF-IDF (LR model fixed)
-   - Experiment 2: LR vs MLP (TF-IDF fixed)
-   - Experiment 3: different learning rates
-   - Experiment 4: different hidden-layer sizes
-   and prints a summary report at the end.
-"""
-
-from train import main
-
-if __name__ == '__main__':
-    print("\n" + "=" * 70)
-    print("Text classification experiments - pure NumPy implementation")
-    print("Dataset: ChnSentiCorp (Chinese hotel reviews)")
-    print("Models: Logistic Regression / MLP")
-    print("Vectorization: BoW / TF-IDF")
-    print("=" * 70 + "\n")
-    main()
+import numpy as np
+from dataset import load_data, BoWVectorizer, TFIDFVectorizer
+from train import train
+import config as cfg
+import pickle
+import time
+
+# Load the data
+texts, labels = load_data()
+labels = np.array(labels)
+
+# Split into training / test sets
+np.random.seed(42)
+indices = np.random.permutation(len(texts))
+split = int(0.8 * len(texts))
+train_idx, test_idx = indices[:split], indices[split:]
+train_texts = [texts[i] for i in train_idx]
+test_texts = [texts[i] for i in test_idx]
+y_train, y_test = labels[train_idx], labels[test_idx]
+
+# Vectorize
+if cfg.VECTORIZER_TYPE == "bow":
+    vec = BoWVectorizer(cfg.MAX_FEATURES)
+else:
+    vec = TFIDFVectorizer(cfg.MAX_FEATURES)
+vec.fit(train_texts)
+X_train = np.array([vec.transform(t) for t in train_texts])
+X_test = np.array([vec.transform(t) for t in test_texts])
+
+# Train
+print("=" * 50)
+print(f"Training config:\n  model: {cfg.MODEL_TYPE}\n  vectorizer: {cfg.VECTORIZER_TYPE}\n  learning rate: {cfg.LEARNING_RATE}")
+print("=" * 50)
+
+model, t = train(
+    X_train, y_train, X_test, y_test,
+    model_type=cfg.MODEL_TYPE,
+    lr=cfg.LEARNING_RATE,
+    epochs=cfg.NUM_EPOCHS,
+    use_weight=cfg.USE_CLASS_WEIGHT
+)
+
+# Save
+ts = time.strftime("%m%d_%H%M%S")
+name = f"model_{cfg.MODEL_TYPE}_{cfg.VECTORIZER_TYPE}_{'weighted' if cfg.USE_CLASS_WEIGHT else 'raw'}_{ts}"
+if cfg.MODEL_TYPE == "lr":
+    np.save(f"{name}_W.npy", model.W)
+    np.save(f"{name}_b.npy", model.b)
+else:
+    np.save(f"{name}_W1.npy", model.W1)
+    np.save(f"{name}_b1.npy", model.b1)
+    np.save(f"{name}_W2.npy", model.W2)
+    np.save(f"{name}_b2.npy", model.b2)
+with open(f"{name}_vec.pkl", "wb") as f:
+    pickle.dump(vec, f)
+print(f"\nModel saved: {name}_*.npy / *.pkl")

Binary file not shown.