Upload train.py

This commit is contained in:
2026-04-27 21:46:11 +08:00
parent eb0209afee
commit 2366641352

265
train.py Normal file
View File

@@ -0,0 +1,265 @@
# -*- coding: utf-8 -*-
"""
训练与对比实验模块
功能:
1. 单模型训练与评估
2. 模型对比实验(LR vs MLP)
3. 向量化对比实验(BoW vs TF-IDF)
4. 超参数对比实验
"""
import numpy as np
import time
from datetime import datetime
from model_numpy import create_model
from dataset import load_data
from config import *
def train_single_model(X_train, y_train, X_test, y_test,
                       model_type, vectorizer_type, epochs, batch_size, lr, hidden_size):
    """Train one model configuration, evaluate it, and save its weights.

    Parameters
    ----------
    X_train, y_train, X_test, y_test:
        Pre-vectorized data splits. If any of them is None the data is
        (re)loaded via load_data() with the requested `vectorizer_type`
        (all current callers pass None for all four). NOTE: previously
        these four arguments were accepted but silently ignored.
    model_type: 'lr' or 'mlp' (forwarded to create_model).
    vectorizer_type: 'bow' or 'tfidf' (forwarded to load_data).
    epochs, batch_size, lr, hidden_size: training hyperparameters.

    Returns
    -------
    dict with keys 'model_type', 'vectorizer_type', 'train_acc',
    'test_acc', 'train_time', 'lr', 'hidden_size'.
    """
    print(f"\n{'='*60}")
    print(f"训练配置:")
    print(f" 模型: {model_type.upper()}")
    print(f" 向量: {vectorizer_type.upper()}")
    print(f" 学习率: {lr}")
    if model_type == 'mlp':
        print(f" 隐藏层大小: {hidden_size}")
    print(f" 训练轮数: {epochs}")
    print(f"{'='*60}")

    # Fix: honor caller-supplied splits when fully provided; only reload
    # (which guarantees the requested vectorization) when something is missing.
    if all(v is not None for v in (X_train, y_train, X_test, y_test)):
        X_tr, y_tr, X_te, y_te = X_train, y_train, X_test, y_test
    else:
        X_tr, y_tr, X_te, y_te, _ = load_data(
            DATA_DIR, MAX_FEATURES, MAX_SEQ_LEN, vectorizer_type
        )

    # Class weights to counter label imbalance; same heuristic as
    # sklearn's class_weight='balanced': n_samples / (n_classes * n_class_i).
    # Assumes binary 0/1 labels so sum(y_tr) counts the positives.
    pos_count = sum(y_tr)
    neg_count = len(y_tr) - pos_count
    n_samples = len(y_tr)
    n_classes = 2
    # Fix: also require both classes to be present, otherwise the weight
    # formula previously raised ZeroDivisionError on a degenerate split.
    if USE_CLASS_WEIGHT and pos_count > 0 and neg_count > 0:
        weight_pos = n_samples / (n_classes * pos_count)
        weight_neg = n_samples / (n_classes * neg_count)
        class_weight = {0: weight_neg, 1: weight_pos}
        print(f" 类别权重: 正面={weight_pos:.2f}, 负面={weight_neg:.2f}")
    else:
        class_weight = None
        print(f" 类别权重: 不使用")

    model = create_model(
        model_type=model_type,
        input_size=X_tr.shape[1],
        hidden_size=hidden_size,
        num_classes=NUM_CLASSES,
        learning_rate=lr,
        keep_prob=KEEP_PROB,
        class_weight=class_weight
    )

    # Train and time the fit (wall clock, seconds).
    start_time = time.time()
    model.fit(X_tr, y_tr, X_te, y_te, epochs=epochs, batch_size=batch_size, verbose=True)
    train_time = time.time() - start_time

    # Final evaluation on both splits.
    train_acc = model.accuracy(X_tr, y_tr)
    test_acc = model.accuracy(X_te, y_te)
    print(f"\n最终结果:")
    print(f" 训练准确率: {train_acc:.4f}")
    print(f" 测试准确率: {test_acc:.4f}")
    print(f" 训练时间: {train_time:.2f}")

    # Weight-file name encodes the configuration plus a timestamp so runs
    # don't clobber each other; model.save() appends per-array *.npy suffixes.
    weight_suffix = "weighted" if USE_CLASS_WEIGHT else "noweight"
    timestamp = datetime.now().strftime("%m%d_%H%M%S")
    model_path = f"model_{model_type}_{vectorizer_type}_{weight_suffix}_{timestamp}"
    model.save(model_path)
    print(f" 权重文件: {model_path}_*.npy")

    return {
        'model_type': model_type,
        'vectorizer_type': vectorizer_type,
        'train_acc': train_acc,
        'test_acc': test_acc,
        'train_time': train_time,
        'lr': lr,
        'hidden_size': hidden_size
    }
def run_comparison_experiments():
    """Run the full comparison suite and print a summary report.

    Experiment design:
    1. Vectorization comparison (BoW vs TF-IDF), model fixed to LR.
    2. Model comparison (LR vs MLP), vectorization fixed to TF-IDF.
    3. Learning-rate comparison for the MLP.
    4. Hidden-layer-size comparison for the MLP.

    Returns the list of per-run result dicts from train_single_model.
    """
    print("\n" + "=" * 70)
    print("文本分类对比实验")
    print("=" * 70)

    all_results = []

    def _banner(title):
        # Shared experiment header. NOTE(review): the separator literal is an
        # empty string in the original (prints blank lines) — possibly a
        # character lost in transcription; reproduced as-is.
        print("\n\n" + "" * 20)
        print(title)
        print("" * 20)

    # ===== Experiment 1: vectorization methods (model fixed to LR) =====
    _banner("实验1: 向量化方法对比 (模型=LR, 学习率=0.05)")
    for vec_type in COMPARE_VECTORS:
        all_results.append(train_single_model(
            None, None, None, None,
            model_type='lr',
            vectorizer_type=vec_type,
            epochs=50,
            batch_size=BATCH_SIZE,
            lr=0.05,
            hidden_size=HIDDEN_SIZE
        ))

    # ===== Experiment 2: model complexity (vectorization fixed to TF-IDF) =====
    _banner("实验2: 模型复杂度对比 (向量=TF-IDF)")
    for model_type in COMPARE_MODELS:
        all_results.append(train_single_model(
            None, None, None, None,
            model_type=model_type,
            vectorizer_type='tfidf',
            epochs=50,
            batch_size=BATCH_SIZE,
            lr=LEARNING_RATE,
            hidden_size=HIDDEN_SIZE
        ))

    # ===== Experiment 3: learning rate (MLP) =====
    _banner("实验3: 学习率对比 (模型=MLP, 向量=TF-IDF)")
    for lr in [0.01, 0.1]:
        all_results.append(train_single_model(
            None, None, None, None,
            model_type='mlp',
            vectorizer_type='tfidf',
            epochs=50,
            batch_size=BATCH_SIZE,
            lr=lr,
            hidden_size=64
        ))

    # ===== Experiment 4: hidden layer size (MLP) =====
    _banner("实验4: 隐藏层大小对比 (模型=MLP, 向量=TF-IDF)")
    for hidden_size in [32, 128]:
        all_results.append(train_single_model(
            None, None, None, None,
            model_type='mlp',
            vectorizer_type='tfidf',
            epochs=50,
            batch_size=BATCH_SIZE,
            lr=LEARNING_RATE,
            hidden_size=hidden_size
        ))

    # ===== Summary table =====
    print("\n\n" + "=" * 70)
    print("实验结果汇总")
    print("=" * 70)
    print(f"\n{'配置':<40} {'训练准确率':<12} {'测试准确率':<12} {'训练时间(秒)':<10}")
    print("-" * 76)
    for r in all_results:
        # Only annotate hyperparameters that deviate from the configured defaults.
        label = f"{r['model_type'].upper()} + {r['vectorizer_type'].upper()}"
        if r['lr'] != LEARNING_RATE:
            label += f", lr={r['lr']}"
        if r['hidden_size'] != HIDDEN_SIZE:
            label += f", h={r['hidden_size']}"
        print(f"{label:<40} {r['train_acc']:<12.4f} {r['test_acc']:<12.4f} {r['train_time']:<10.2f}")

    # ===== Analysis =====
    print("\n" + "=" * 70)
    print("结果分析")
    print("=" * 70)

    best = max(all_results, key=lambda r: r['test_acc'])
    print(f"\n最佳测试准确率: {best['test_acc']:.4f}")
    print(f"最佳配置: {best['model_type'].upper()} + {best['vectorizer_type'].upper()}")

    def _first(model_type, vec_type):
        # First recorded run matching (model, vectorizer), or None.
        return next((r for r in all_results
                     if r['model_type'] == model_type
                     and r['vectorizer_type'] == vec_type), None)

    # BoW vs TF-IDF (both with LR).
    bow_r = _first('lr', 'bow')
    tfidf_r = _first('lr', 'tfidf')
    if bow_r and tfidf_r:
        bow_acc = bow_r['test_acc']
        tfidf_acc = tfidf_r['test_acc']
        print(f"\n向量化方法影响:")
        print(f" BoW测试准确率: {bow_acc:.4f}")
        print(f" TF-IDF测试准确率: {tfidf_acc:.4f}")
        print(f" 差异: {abs(tfidf_acc - bow_acc):.4f} ({'TF-IDF更优' if tfidf_acc > bow_acc else 'BoW更优'})")

    # LR vs MLP (both with TF-IDF).
    lr_r = _first('lr', 'tfidf')
    mlp_r = _first('mlp', 'tfidf')
    if lr_r and mlp_r:
        lr_acc = lr_r['test_acc']
        mlp_acc = mlp_r['test_acc']
        print(f"\n模型复杂度影响:")
        print(f" LR测试准确率: {lr_acc:.4f}")
        print(f" MLP测试准确率: {mlp_acc:.4f}")
        print(f" 差异: {abs(mlp_acc - lr_acc):.4f} ({'MLP更优' if mlp_acc > lr_acc else 'LR更优'})")

    return all_results
def main():
    """Entry point: run the comparison suite or a single configured model.

    RUN_COMPARISON (from config) selects between the full experiment grid
    and one training run driven by the MODEL_TYPE/VECTORIZER_TYPE settings.
    """
    rule = "=" * 70
    print(rule)
    print("文本分类实验 - 纯NumPy实现")
    print("数据集: ChnSentiCorp (中文酒店评论)")
    print(rule)

    if RUN_COMPARISON:
        run_comparison_experiments()
    else:
        # Single run; data splits are loaded inside train_single_model.
        train_single_model(
            None, None, None, None,
            model_type=MODEL_TYPE,
            vectorizer_type=VECTORIZER_TYPE,
            epochs=NUM_EPOCHS,
            batch_size=BATCH_SIZE,
            lr=LEARNING_RATE,
            hidden_size=HIDDEN_SIZE
        )

    print("\n" + rule)
    print("实验完成!")
    print(rule)
# Run the experiment driver only when executed as a script.
if __name__ == '__main__':
    main()