# task-3-2-1-Text-Processing-…/2509165016-7.py

# TF-IDF演示（纯Python实现）
import math

# Banner for the first demo: a 50-character rule above and below the title.
print("=" * 50, "TF-IDF词频-逆文档频率演示", "=" * 50, sep="\n")

def simple_tfidf(docs):
    """Compute a TF-IDF matrix for a list of pre-tokenized documents.

    TF is the raw count of a term inside a document. IDF is computed as
    ``log(n_docs / (df + 1)) + 1``, where ``df`` is the number of documents
    that contain the term.

    Args:
        docs: List of documents; each document is a list of tokens.

    Returns:
        A tuple ``(vocab, tfidf, idf)``:
            vocab: Sorted list of the distinct tokens (the matrix columns).
            tfidf: One row per document; ``tfidf[i][j]`` is the count of
                ``vocab[j]`` in ``docs[i]`` multiplied by ``idf[j]``.
            idf: IDF value for each entry of ``vocab``.
    """
    # 1. Vocabulary: sorted so the column order is deterministic.
    vocab = sorted({word for doc in docs for word in doc})
    # Resolve each term's column once; avoids a linear list scan per token.
    column = {word: j for j, word in enumerate(vocab)}

    # 2. Bag-of-words count matrix (one row per document).
    bow = []
    for doc in docs:
        vec = [0] * len(vocab)
        for word in doc:
            vec[column[word]] += 1
        bow.append(vec)

    n_docs = len(docs)

    # 3. IDF per term; df counts the documents in which the term occurs.
    idf = []
    for j in range(len(vocab)):
        df = sum(1 for vec in bow if vec[j] > 0)
        idf.append(math.log(n_docs / (df + 1)) + 1)

    # 4. TF-IDF: scale every count by the IDF of its column.
    tfidf = [[tf * weight for tf, weight in zip(vec, idf)] for vec in bow]

    return vocab, tfidf, idf

# Sample corpus for the first demo; every document is already tokenized.
docs = [
    ["Python", "编程", "语言"],
    ["Python", "Python", "Python"],  # "Python" occurs three times
    ["Java", "编程", "语言"],
]

vocab, tfidf_matrix, idf = simple_tfidf(docs)

# Echo the corpus.
print("文档集合：")
for doc_no, tokens in enumerate(docs, start=1):
    print(f"  Doc{doc_no}: {' '.join(tokens)}")
print()

# Vocabulary and per-term IDF (rounded for display only).
print(f"词表: {vocab}")
print()
rounded_idf = [round(value, 4) for value in idf]
print(f"IDF值: {rounded_idf}")
print()

# Full TF-IDF matrix, one row per document.
print("TF-IDF矩阵：")
for doc_no, row in enumerate(tfidf_matrix, start=1):
    print(f"  Doc{doc_no}: {[round(value, 4) for value in row]}")
print()

# Per-document breakdown: list only the terms with a positive score.
print("详细分析：")
for doc_no, (tokens, row) in enumerate(zip(docs, tfidf_matrix), start=1):
    print(f"\nDoc{doc_no}: {' '.join(tokens)}")
    for term, score in zip(vocab, row):
        if score > 0:
            print(f"  '{term}': TF-IDF = {score:.4f}")
# TF-IDF vs. BoW comparison
import math

# Banner for the second demo: a 50-character rule above and below the title.
print("=" * 50, "TF-IDF vs BoW 对比", "=" * 50, sep="\n")

def simple_bow(docs):
    """Build a bag-of-words count matrix for pre-tokenized documents.

    Args:
        docs: List of documents; each document is a list of tokens.

    Returns:
        A tuple ``(vocab, bow_matrix)``:
            vocab: Sorted list of the distinct tokens (the matrix columns).
            bow_matrix: One row per document; ``bow_matrix[i][j]`` is the
                number of times ``vocab[j]`` occurs in ``docs[i]``.
    """
    # Sorted vocabulary keeps the column order deterministic.
    vocab = sorted({word for doc in docs for word in doc})
    # Resolve each term's column once; avoids a linear list scan per token.
    column = {word: j for j, word in enumerate(vocab)}

    bow_matrix = []
    for doc in docs:
        vec = [0] * len(vocab)
        for word in doc:
            vec[column[word]] += 1
        bow_matrix.append(vec)
    return vocab, bow_matrix

def simple_tfidf(docs):
    """Compute a TF-IDF matrix for a list of pre-tokenized documents.

    TF is the raw count of a term inside a document. IDF is computed as
    ``log(n_docs / (df + 1)) + 1``, where ``df`` is the number of documents
    that contain the term.

    Args:
        docs: List of documents; each document is a list of tokens.

    Returns:
        A tuple ``(vocab, tfidf, idf)``:
            vocab: Sorted list of the distinct tokens (the matrix columns).
            tfidf: One row per document; ``tfidf[i][j]`` is the count of
                ``vocab[j]`` in ``docs[i]`` multiplied by ``idf[j]``.
            idf: IDF value for each entry of ``vocab``.
    """
    # Sorted vocabulary keeps the column order deterministic.
    vocab = sorted({word for doc in docs for word in doc})
    # Resolve each term's column once; avoids a linear list scan per token.
    column = {word: j for j, word in enumerate(vocab)}

    # Bag-of-words count matrix (one row per document).
    bow = []
    for doc in docs:
        vec = [0] * len(vocab)
        for word in doc:
            vec[column[word]] += 1
        bow.append(vec)

    n_docs = len(docs)

    # IDF per term; df counts the documents in which the term occurs.
    idf = []
    for j in range(len(vocab)):
        df = sum(1 for vec in bow if vec[j] > 0)
        idf.append(math.log(n_docs / (df + 1)) + 1)

    # TF-IDF: scale every count by the IDF of its column.
    tfidf = [[tf * weight for tf, weight in zip(vec, idf)] for vec in bow]

    return vocab, tfidf, idf

# Corpus for the BoW vs. TF-IDF comparison; every document is pre-tokenized.
docs = [
    ["Python", "编程"],
    ["Java", "编程"],
    ["Python", "Python", "Python"],  # "Python" occurs three times
]

vocab_bow, bow_matrix = simple_bow(docs)
vocab_tfidf, tfidf_matrix, idf = simple_tfidf(docs)

# Echo the corpus.
print("文档：")
for i, doc in enumerate(docs):
    print(f"  Doc{i+1}: {' '.join(doc)}")
print()

# Raw term counts.
print("BoW矩阵：")
for i, vec in enumerate(bow_matrix):
    print(f"  Doc{i+1}: {vec}")
print()

# TF-IDF weights (rounded for display only).
print("TF-IDF矩阵：")
for i, vec in enumerate(tfidf_matrix):
    print(f"  Doc{i+1}: {[round(x, 4) for x in vec]}")
print()

# Focus on Doc3.
# The vocabulary is sorted, so column 0 is "Java", not "Python". Look the
# column up by term instead of hard-coding an index; the hard-coded 0
# reported Java's score (0.0000) under the "Python" label.
python_col = vocab_tfidf.index("Python")
print("重点分析：")
print("Doc3 'Python Python Python':")
print("  BoW:   Python出现3次")
print(f"  TF-IDF: Python的TF-IDF = {tfidf_matrix[2][python_col]:.4f}")
print()
# NOTE(review): with this corpus Doc3's "Python" score is the largest entry in
# the matrix, so the premise of the question below should be re-checked by the
# author; the wording is left unchanged here.
print("为什么Doc3的TF-IDF不是最高的？")
# "Python" also occurs in Doc1 only (Doc2 is "Java 编程"), so the explanation
# names Doc1 rather than Doc1 and Doc2.
print("因为Python在Doc1也出现了，IDF值被稀释")