task-3-2-2-text-classification/dataset.py

import numpy as np
import jieba
from collections import Counter
import os
import requests
# Download the dataset (ChnSentiCorp hotel-review sentiment corpus) if it is not already present
def download_data():
    url = "https://github.com/SophonPlus/ChineseNlpCorpus/raw/master/datasets/ChnSentiCorp_htl_all/ChnSentiCorp_htl_all.csv"
    path = "ChnSentiCorp_htl_all.csv"
    if not os.path.exists(path):
        print("Downloading dataset...")
        r = requests.get(url)
        r.raise_for_status()  # fail early on a bad HTTP response
        with open(path, "wb") as f:
            f.write(r.content)
    return path
# Tokenize with jieba, dropping single-character tokens (mostly stopwords and punctuation)
def tokenize(text):
    words = jieba.lcut(str(text).strip())
    return [w for w in words if len(w) > 1]
# Load the data: each CSV row is "label,review", preceded by a header line
def load_data():
    path = download_data()
    texts, labels = [], []
    with open(path, "r", encoding="utf-8") as f:
        next(f)  # skip the header row
        for line in f:
            parts = line.strip().split(",", 1)
            if len(parts) != 2:
                continue  # skip malformed rows
            label, text = parts
            texts.append(text)
            labels.append(int(label))
    return texts, labels
# Bag-of-words vectorizer: raw term counts over a fixed vocabulary
class BoWVectorizer:
    def __init__(self, max_features=3000):
        self.max_features = max_features
        self.vocab = {}

    def fit(self, texts):
        # Build the vocabulary from the most frequent tokens in the corpus
        counter = Counter()
        for t in texts:
            counter.update(tokenize(t))
        words = [w for w, _ in counter.most_common(self.max_features)]
        self.vocab = {w: i for i, w in enumerate(words)}

    def transform(self, text):
        words = tokenize(text)
        vec = np.zeros(len(self.vocab))  # vocab may be smaller than max_features
        for w in words:
            if w in self.vocab:
                vec[self.vocab[w]] += 1
        return vec
# TF-IDF vectorizer: term frequency weighted by inverse document frequency
class TFIDFVectorizer:
    def __init__(self, max_features=3000):
        self.max_features = max_features
        self.vocab = {}
        self.idf = {}

    def fit(self, texts):
        counter = Counter()
        doc_freq = Counter()
        for t in texts:
            toks = tokenize(t)  # tokenize once, reuse for both counts
            counter.update(toks)
            doc_freq.update(set(toks))  # each document counts a word at most once
        words = [w for w, _ in counter.most_common(self.max_features)]
        self.vocab = {w: i for i, w in enumerate(words)}
        N = len(texts)
        for w in self.vocab:
            # Smoothed IDF: the +1 in the denominator avoids division by zero
            self.idf[w] = np.log(N / (doc_freq.get(w, 0) + 1))

    def transform(self, text):
        words = tokenize(text)
        vec = np.zeros(len(self.vocab))
        tf = Counter(words)  # raw term counts within this document
        for w, cnt in tf.items():
            if w in self.vocab:
                vec[self.vocab[w]] = cnt * self.idf[w]
        return vec
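
# Minimal usage sketch (an assumption about how this module is driven elsewhere in the
# task; only the loading and vectorization steps are shown, no classifier or split):
if __name__ == "__main__":
    texts, labels = load_data()
    print(f"{len(texts)} reviews loaded, {sum(labels)} labeled positive")

    # Fit both vectorizers on the full corpus and vectorize one example review
    bow = BoWVectorizer(max_features=3000)
    bow.fit(texts)
    tfidf = TFIDFVectorizer(max_features=3000)
    tfidf.fit(texts)

    demo = texts[0]
    print("BoW nonzero features:", int(np.count_nonzero(bow.transform(demo))))
    print("TF-IDF nonzero features:", int(np.count_nonzero(tfidf.transform(demo))))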