task-3-2-2-text-classification/dataset.py

import numpy as np
import jieba
from collections import Counter
import os
import requests
# Download the dataset (ChnSentiCorp hotel-review sentiment corpus) if it is not already present
def download_data():
    url = "https://github.com/SophonPlus/ChineseNlpCorpus/raw/master/datasets/ChnSentiCorp_htl_all/ChnSentiCorp_htl_all.csv"
    path = "ChnSentiCorp_htl_all.csv"
    if not os.path.exists(path):
        print("Downloading dataset...")
        r = requests.get(url)
        r.raise_for_status()  # fail early on a bad HTTP response
        with open(path, "wb") as f:
            f.write(r.content)
    return path
# Tokenize with jieba, dropping single-character tokens (mostly stopwords and punctuation)
def tokenize(text):
    words = jieba.lcut(str(text).strip())
    return [w for w in words if len(w) > 1]
# Load the data: each CSV row is "label,review", preceded by a header line
def load_data():
    path = download_data()
    texts, labels = [], []
    with open(path, "r", encoding="utf-8") as f:
        next(f)  # skip the header row
        for line in f:
            parts = line.strip().split(",", 1)
            if len(parts) != 2:
                continue  # skip malformed rows
            label, text = parts
            texts.append(text)
            labels.append(int(label))
    return texts, labels
# Bag-of-words vectorizer: raw term counts over a fixed vocabulary
class BoWVectorizer:
    def __init__(self, max_features=3000):
        self.max_features = max_features
        self.vocab = {}

    def fit(self, texts):
        # Build the vocabulary from the most frequent tokens in the corpus
        counter = Counter()
        for t in texts:
            counter.update(tokenize(t))
        words = [w for w, _ in counter.most_common(self.max_features)]
        self.vocab = {w: i for i, w in enumerate(words)}

    def transform(self, text):
        words = tokenize(text)
        vec = np.zeros(len(self.vocab))  # vocab may be smaller than max_features
        for w in words:
            if w in self.vocab:
                vec[self.vocab[w]] += 1
        return vec
# TF-IDF vectorizer: term frequency weighted by inverse document frequency
class TFIDFVectorizer:
    def __init__(self, max_features=3000):
        self.max_features = max_features
        self.vocab = {}
        self.idf = {}

    def fit(self, texts):
        counter = Counter()
        doc_freq = Counter()
        for t in texts:
            toks = tokenize(t)  # tokenize once, reuse for both counts
            counter.update(toks)
            doc_freq.update(set(toks))  # each document counts a word at most once
        words = [w for w, _ in counter.most_common(self.max_features)]
        self.vocab = {w: i for i, w in enumerate(words)}
        N = len(texts)
        for w in self.vocab:
            # Smoothed IDF: the +1 in the denominator avoids division by zero
            self.idf[w] = np.log(N / (doc_freq.get(w, 0) + 1))

    def transform(self, text):
        words = tokenize(text)
        vec = np.zeros(len(self.vocab))
        tf = Counter(words)  # raw term counts within this document
        for w, cnt in tf.items():
            if w in self.vocab:
                vec[self.vocab[w]] = cnt * self.idf[w]
        return vec
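
# Minimal usage sketch (an assumption about how this module is driven elsewhere in the
# task; only the loading and vectorization steps are shown, no classifier or split):
if __name__ == "__main__":
    texts, labels = load_data()
    print(f"{len(texts)} reviews loaded, {sum(labels)} labeled positive")

    # Fit both vectorizers on the full corpus and vectorize one example review
    bow = BoWVectorizer(max_features=3000)
    bow.fit(texts)
    tfidf = TFIDFVectorizer(max_features=3000)
    tfidf.fit(texts)

    demo = texts[0]
    print("BoW nonzero features:", int(np.count_nonzero(bow.transform(demo))))
    print("TF-IDF nonzero features:", int(np.count_nonzero(tfidf.transform(demo))))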