"""Hotel-review sentiment classifier: a NumPy one-hidden-layer MLP on TF-IDF features."""

import numpy as np


# -------------------------- Multi-layer perceptron (MLP) --------------------------
class MLP:
    """One-hidden-layer perceptron (ReLU hidden layer, softmax output).

    Trained with mini-batch SGD on the softmax cross-entropy loss.
    Inverted dropout is applied to the hidden layer when ``keep_prob < 1.0``.
    """

    def __init__(self, input_size, hidden_size, num_classes, learning_rate=0.1, keep_prob=1.0):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.learning_rate = learning_rate
        self.keep_prob = keep_prob

        # 1/sqrt(fan_in)-scaled Gaussian init keeps activation variance roughly constant.
        self.W1 = np.random.randn(input_size, hidden_size) / np.sqrt(input_size)
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, num_classes) / np.sqrt(hidden_size)
        self.b2 = np.zeros((1, num_classes))

    def relu(self, x):
        """Element-wise ReLU."""
        return np.maximum(0, x)

    def softmax(self, x):
        """Row-wise softmax, shifted by each row's max for numerical stability."""
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward(self, X, training=True):
        """Forward pass; caches z1/a1/z2/a2 for ``backward``.

        Inverted dropout (mask scaled by 1/keep_prob) is applied to the
        hidden activations only when ``training`` and ``keep_prob < 1.0``.
        """
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = self.relu(self.z1)
        if training and self.keep_prob < 1.0:
            self.dropout_mask = np.random.rand(*self.a1.shape) < self.keep_prob
            self.a1 = self.a1 * self.dropout_mask / self.keep_prob
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.a2 = self.softmax(self.z2)
        return self.a2

    def backward(self, X, y, output):
        """One SGD parameter update from the softmax/cross-entropy gradient.

        Must be called right after ``forward`` on the same batch: it reuses
        the cached activations (and the dropout mask, if dropout is active).
        """
        m = X.shape[0]
        # d(loss)/d(logits) for softmax + cross-entropy is (p - one_hot(y)) / m.
        delta2 = output.copy()
        delta2[range(m), y] -= 1
        delta2 /= m
        dW2 = np.dot(self.a1.T, delta2)
        db2 = np.sum(delta2, axis=0, keepdims=True)
        delta1 = np.dot(delta2, self.W2.T)
        delta1[self.z1 <= 0] = 0  # ReLU derivative: zero where pre-activation <= 0
        if self.keep_prob < 1.0:
            # Route gradients through the same (scaled) dropout mask as forward().
            delta1 = delta1 * self.dropout_mask / self.keep_prob
        dW1 = np.dot(X.T, delta1)
        db1 = np.sum(delta1, axis=0, keepdims=True)
        self.W1 -= self.learning_rate * dW1
        self.b1 -= self.learning_rate * db1
        self.W2 -= self.learning_rate * dW2
        self.b2 -= self.learning_rate * db2

    def train(self, X, y, epochs=30, batch_size=2, verbose=True):
        """Mini-batch SGD training loop; returns ``self``.

        Fix vs. original: the loop now steps through the whole shuffled set
        in strides of ``batch_size``, so a trailing partial batch is trained
        on instead of being silently dropped when ``m % batch_size != 0``.
        """
        m = X.shape[0]
        for epoch in range(epochs):
            permutation = np.random.permutation(m)
            X_shuffled = X[permutation]
            y_shuffled = y[permutation]
            epoch_loss = 0.0
            num_batches = 0
            for start in range(0, m, batch_size):
                batch_X = X_shuffled[start:start + batch_size]
                batch_y = y_shuffled[start:start + batch_size]
                output = self.forward(batch_X, training=True)
                # 1e-8 guards log(0) when a true-class probability underflows.
                batch_loss = -np.mean(np.log(output[range(len(batch_y)), batch_y] + 1e-8))
                epoch_loss += batch_loss
                num_batches += 1
                self.backward(batch_X, batch_y, output)
            if verbose and (epoch + 1) % 10 == 0:
                train_acc = self.accuracy(X, y)
                print(f"Epoch {epoch+1:3d}/{epochs} | Loss: {epoch_loss/num_batches:.4f} | 训练准确率: {train_acc:.4f}")
        return self

    def predict(self, X):
        """Return the predicted class index for each row of ``X``."""
        return np.argmax(self.forward(X, training=False), axis=1)

    def predict_proba(self, X):
        """Return softmax class probabilities, shape ``(n_samples, num_classes)``."""
        return self.forward(X, training=False)

    def accuracy(self, X, y):
        """Fraction of rows in ``X`` whose prediction equals ``y``."""
        return np.mean(self.predict(X) == y)

    def save(self, filepath):
        """Persist parameters as ``<filepath>_{W1,b1,W2,b2}.npy``."""
        np.save(filepath + '_W1.npy', self.W1)
        np.save(filepath + '_b1.npy', self.b1)
        np.save(filepath + '_W2.npy', self.W2)
        np.save(filepath + '_b2.npy', self.b2)

    @staticmethod
    def load(filepath, input_size=None, hidden_size=8, num_classes=2, learning_rate=0.1, keep_prob=1.0):
        """Rebuild a model from files written by ``save``.

        The layer sizes are inferred from the stored weight matrices, so
        ``input_size`` / ``hidden_size`` / ``num_classes`` no longer need to
        match; they are kept (``input_size`` now optional) for backward
        compatibility with existing callers.
        """
        W1 = np.load(filepath + '_W1.npy')
        W2 = np.load(filepath + '_W2.npy')
        model = MLP(W1.shape[0], W1.shape[1], W2.shape[1], learning_rate, keep_prob)
        model.W1 = W1
        model.b1 = np.load(filepath + '_b1.npy')
        model.W2 = W2
        model.b2 = np.load(filepath + '_b2.npy')
        return model


# -------------------------- Main program (train + predict in one run) --------------------------
if __name__ == "__main__":
    # Imported here, not at module top, so the MLP class above stays
    # importable in environments without scikit-learn.
    from sklearn.feature_extraction.text import TfidfVectorizer

    # 1. Built-in training data
    texts = [
        "房间干净整洁,前台服务态度特别好,住着很舒服",
        "隔音特别差,卫生不干净,设施老旧,住得非常不满意",
        "环境很好,服务周到,下次还会再来",
        "空调噪音大,洗澡水忽冷忽热,体验很差",
        "床很软,枕头舒服,睡得很香",
        "灯光昏暗,床品不干净,体验极差"
    ]
    labels = [1, 0, 1, 0, 1, 0]

    # 2. Fit the vectorizer, train the model, and persist both
    vectorizer = TfidfVectorizer(max_features=100)
    X = vectorizer.fit_transform(texts).toarray()
    y = np.array(labels)
    # NOTE: saving a dict pickles it; reading it back requires
    # np.load("tfidf_vocab.npy", allow_pickle=True).item().
    np.save("tfidf_vocab.npy", vectorizer.vocabulary_)

    mlp = MLP(input_size=X.shape[1], hidden_size=8, num_classes=2)
    mlp.train(X, y, epochs=30, batch_size=2, verbose=True)
    mlp.save("model_mlp")
    print("\n✅ 模型训练完成!")

    # 3. Predict with the in-memory vectorizer (no reload needed)
    print("\n=== 酒店评论情感分类预测 ===")
    text = input("请输入酒店评论文本:")
    X_new = vectorizer.transform([text]).toarray()
    pred = mlp.predict(X_new)[0]
    prob = mlp.predict_proba(X_new)[0]

    label_map = {0: "负面", 1: "正面"}
    print(f"\n预测结果:{label_map[pred]}")
    print(f"置信度:正面概率={prob[1]*100:.1f}%,负面概率={prob[0]*100:.1f}%")