import numpy as np
import jieba
from collections import Counter
import os
import requests


# Download the dataset
def download_data():
    url = "https://github.com/SophonPlus/ChineseNlpCorpus/raw/master/datasets/ChnSentiCorp_htl_all/ChnSentiCorp_htl_all.csv"
    path = "ChnSentiCorp_htl_all.csv"
    if not os.path.exists(path):
        print("Downloading dataset...")
        r = requests.get(url)
        with open(path, "wb") as f:
            f.write(r.content)
    return path


# Tokenize (drop single-character tokens)
def tokenize(text):
    words = jieba.lcut(str(text).strip())
    return [w for w in words if len(w) > 1]


# Load the data: each CSV row is "label,review"
def load_data():
    path = download_data()
    texts, labels = [], []
    with open(path, "r", encoding="utf-8") as f:
        next(f)  # skip the header row
        for line in f:
            parts = line.strip().split(",", 1)
            if len(parts) != 2:
                continue
            label, text = parts
            texts.append(text)
            labels.append(int(label))
    return texts, labels


# Bag-of-words vectorizer
class BoWVectorizer:
    def __init__(self, max_features=3000):
        self.max_features = max_features
        self.vocab = {}

    def fit(self, texts):
        # Keep the max_features most frequent tokens as the vocabulary.
        counter = Counter()
        for t in texts:
            counter.update(tokenize(t))
        words = [w for w, _ in counter.most_common(self.max_features)]
        self.vocab = {w: i for i, w in enumerate(words)}

    def transform(self, text):
        # Raw term counts over the fitted vocabulary.
        vec = np.zeros(self.max_features)
        for w in tokenize(text):
            if w in self.vocab:
                vec[self.vocab[w]] += 1
        return vec


# TF-IDF vectorizer
class TFIDFVectorizer:
    def __init__(self, max_features=3000):
        self.max_features = max_features
        self.vocab = {}
        self.idf = {}

    def fit(self, texts):
        counter = Counter()
        doc_freq = Counter()
        for t in texts:
            ws = tokenize(t)  # tokenize once per document
            counter.update(ws)
            doc_freq.update(set(ws))  # each word counts once per document
        words = [w for w, _ in counter.most_common(self.max_features)]
        self.vocab = {w: i for i, w in enumerate(words)}
        # Smoothed inverse document frequency: log(N / (df + 1)).
        N = len(texts)
        for w in self.vocab:
            self.idf[w] = np.log(N / (doc_freq.get(w, 0) + 1))

    def transform(self, text):
        vec = np.zeros(self.max_features)
        tf = Counter(tokenize(text))
        for w, cnt in tf.items():
            if w in self.vocab:
                vec[self.vocab[w]] = cnt * self.idf[w]
        return vec
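

# --- Usage sketch (added for illustration; not part of the original script) ---
# A minimal end-to-end run, assuming jieba and requests are installed and the
# download succeeds. Fitting both vectorizers on the full corpus and comparing
# the two encodings of a single review are demonstration choices, not taken
# from the original.
if __name__ == "__main__":
    texts, labels = load_data()
    print(f"{len(texts)} reviews, {sum(labels)} positive")

    bow = BoWVectorizer(max_features=3000)
    bow.fit(texts)
    tfidf = TFIDFVectorizer(max_features=3000)
    tfidf.fit(texts)

    # Encode the same review both ways: BoW stores raw counts, while TF-IDF
    # reweights each count by how rare the word is across the corpus.
    v_bow = bow.transform(texts[0])
    v_tfidf = tfidf.transform(texts[0])
    print("BoW nonzero dims:   ", int(np.count_nonzero(v_bow)))
    print("TF-IDF nonzero dims:", int(np.count_nonzero(v_tfidf)))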