import os
from collections import Counter

import jieba
import numpy as np
import requests

# Download the dataset (ChnSentiCorp hotel reviews)
def download_data():
    url = "https://github.com/SophonPlus/ChineseNlpCorpus/raw/master/datasets/ChnSentiCorp_htl_all/ChnSentiCorp_htl_all.csv"
    path = "ChnSentiCorp_htl_all.csv"
    if not os.path.exists(path):
        print("Downloading dataset...")
        r = requests.get(url)
        r.raise_for_status()  # fail loudly on a bad download
        with open(path, "wb") as f:
            f.write(r.content)
    return path

# Tokenize with jieba, keeping only tokens longer than one character
def tokenize(text):
    words = jieba.lcut(str(text).strip())
    return [w for w in words if len(w) > 1]

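# For example (the exact segmentation can vary with the jieba version/dictionary):
#   tokenize("房间很干净，服务也不错") -> ['房间', '干净', '服务', '不错']
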
# Load the data; each row of the CSV is "label,review"
def load_data():
    path = download_data()
    texts, labels = [], []
    with open(path, "r", encoding="utf-8") as f:
        next(f)  # skip the CSV header
        for line in f:
            parts = line.strip().split(",", 1)  # split on the first comma only
            if len(parts) != 2:
                continue
            label, text = parts
            texts.append(text)
            labels.append(int(label))
    return texts, labels

# Bag-of-words vectorizer
class BoWVectorizer:
    def __init__(self, max_features=3000):
        self.max_features = max_features
        self.vocab = {}

    def fit(self, texts):
        # Keep the max_features most frequent tokens as the vocabulary
        counter = Counter()
        for t in texts:
            counter.update(tokenize(t))
        words = [w for w, _ in counter.most_common(self.max_features)]
        self.vocab = {w: i for i, w in enumerate(words)}

    def transform(self, text):
        # Raw term counts; vector length equals the fitted vocabulary size
        words = tokenize(text)
        vec = np.zeros(len(self.vocab))
        for w in words:
            if w in self.vocab:
                vec[self.vocab[w]] += 1
        return vec

# TF-IDF vectorizer
class TFIDFVectorizer:
    def __init__(self, max_features=3000):
        self.max_features = max_features
        self.vocab = {}
        self.idf = {}

    def fit(self, texts):
        counter = Counter()
        doc_freq = Counter()
        for t in texts:
            ws = tokenize(t)  # tokenize once and reuse for both counts
            counter.update(ws)
            doc_freq.update(set(ws))  # each document counts a word at most once

        words = [w for w, _ in counter.most_common(self.max_features)]
        self.vocab = {w: i for i, w in enumerate(words)}
        N = len(texts)
        for w in self.vocab:
            # smoothed inverse document frequency: log(N / (df + 1))
            self.idf[w] = np.log(N / (doc_freq.get(w, 0) + 1))

    def transform(self, text):
        words = tokenize(text)
        vec = np.zeros(len(self.vocab))
        tf = Counter(words)
        for w, cnt in tf.items():
            if w in self.vocab:
                vec[self.vocab[w]] = cnt * self.idf[w]
        return vec
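
# A minimal smoke test of the pieces above: fit both vectorizers on the full
# corpus and inspect one vector. A real experiment would add a train/test
# split and a classifier on top; this only uses the functions defined here.
if __name__ == "__main__":
    texts, labels = load_data()
    print(f"{len(texts)} reviews, {sum(labels)} labeled positive")

    bow = BoWVectorizer(max_features=3000)
    bow.fit(texts)
    print("BoW vector shape:", bow.transform(texts[0]).shape)

    tfidf = TFIDFVectorizer(max_features=3000)
    tfidf.fit(texts)
    print("TF-IDF vector shape:", tfidf.transform(texts[0]).shape)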