Files
task-3-2-1-Text-Processing-…/2509165016-7.py
2026-04-23 16:00:52 +08:00

163 lines
3.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# TF-IDF演示纯Python实现
import math
# Section banner for the TF-IDF demo: a 50-character rule above and below the title.
print("=" * 50, "TF-IDF词频-逆文档频率演示", "=" * 50, sep="\n")
def simple_tfidf(docs):
    """Compute a simple TF-IDF matrix for pre-tokenized documents.

    Args:
        docs: List of documents; each document is already a list of tokens.

    Returns:
        A tuple ``(vocab, tfidf_matrix, idf)``:
        vocab: Sorted list of the distinct tokens across all documents.
        tfidf_matrix: One row per document; entry ``j`` is the raw count of
            ``vocab[j]`` in that document multiplied by ``idf[j]``.
        idf: One value per vocab entry, ``log(n_docs / (df + 1)) + 1``,
            where ``df`` is the number of documents containing the token.
    """
    # 1. Build the vocabulary, plus a token -> column map so each lookup is
    #    O(1) instead of a linear scan of the vocab list per token.
    vocab = sorted({word for doc in docs for word in doc})
    column = {word: j for j, word in enumerate(vocab)}

    # 2. Build the bag-of-words count matrix (one row per document).
    bow = []
    for doc in docs:
        vec = [0] * len(vocab)
        for word in doc:
            vec[column[word]] += 1
        bow.append(vec)

    n_docs = len(docs)

    # 3. Compute the IDF of every vocab entry from its document frequency.
    idf = []
    for j in range(len(vocab)):
        df = sum(1 for vec in bow if vec[j] > 0)
        idf.append(math.log(n_docs / (df + 1)) + 1)

    # 4. Scale each raw count by the IDF of its column.
    tfidf = [[tf * weight for tf, weight in zip(vec, idf)] for vec in bow]
    return vocab, tfidf, idf
# Demo corpus: three pre-tokenized documents.
docs = [
    ["Python", "编程", "语言"],
    ["Python", "Python", "Python"],  # "Python" occurs three times
    ["Java", "编程", "语言"],
]
vocab, tfidf_matrix, idf = simple_tfidf(docs)

# Echo the corpus, one document per line, followed by a blank line.
print("文档集合:")
for i, doc in enumerate(docs):
    print(f" Doc{i+1}: {' '.join(doc)}")
print()

# Vocabulary and rounded IDF values, each followed by a blank line.
print(f"词表: {vocab}", end="\n\n")
print(f"IDF值: {[round(x, 4) for x in idf]}", end="\n\n")

# The TF-IDF matrix, one rounded row per document.
print("TF-IDF矩阵")
for i, vec in enumerate(tfidf_matrix):
    print(f" Doc{i+1}: {[round(x, 4) for x in vec]}")
print()

# Per-document breakdown listing only the tokens with a positive score.
print("详细分析:")
for i, doc in enumerate(docs):
    print(f"\nDoc{i+1}: {' '.join(doc)}")
    for j, score in enumerate(tfidf_matrix[i]):
        if not score > 0:
            continue
        print(f" '{vocab[j]}': TF-IDF = {score:.4f}")
# TF-IDF vs BoW 对比
import math
# Section banner for the TF-IDF vs BoW comparison: a 50-character rule above and below the title.
print("=" * 50, "TF-IDF vs BoW 对比", "=" * 50, sep="\n")
def simple_bow(docs):
    """Build a bag-of-words count matrix for pre-tokenized documents.

    Args:
        docs: List of documents; each document is already a list of tokens.

    Returns:
        A tuple ``(vocab, bow_matrix)`` where ``vocab`` is the sorted list of
        distinct tokens and ``bow_matrix`` has one row per document holding
        the count of each vocab token in that document.
    """
    vocab = sorted({word for doc in docs for word in doc})
    # Token -> column map: O(1) lookups instead of scanning the vocab list
    # once per token.
    column = {word: j for j, word in enumerate(vocab)}

    bow_matrix = []
    for doc in docs:
        vec = [0] * len(vocab)
        for word in doc:
            vec[column[word]] += 1
        bow_matrix.append(vec)
    return vocab, bow_matrix
def simple_tfidf(docs):
    """Compute a simple TF-IDF matrix for pre-tokenized documents.

    Args:
        docs: List of documents; each document is already a list of tokens.

    Returns:
        A tuple ``(vocab, tfidf_matrix, idf)``: the sorted list of distinct
        tokens, one row per document of ``count * idf`` values, and the
        per-token IDF computed as ``log(n_docs / (df + 1)) + 1`` where
        ``df`` is the number of documents containing the token.
    """
    vocab = sorted({word for doc in docs for word in doc})
    # Token -> column map: O(1) lookups instead of scanning the vocab list
    # once per token.
    column = {word: j for j, word in enumerate(vocab)}

    # Raw term counts, one row per document.
    bow = []
    for doc in docs:
        vec = [0] * len(vocab)
        for word in doc:
            vec[column[word]] += 1
        bow.append(vec)

    n_docs = len(docs)

    # Document frequency -> IDF for every vocab column.
    idf = []
    for j in range(len(vocab)):
        df = sum(1 for vec in bow if vec[j] > 0)
        idf.append(math.log(n_docs / (df + 1)) + 1)

    # Weight every count by the IDF of its column.
    tfidf = [[tf * weight for tf, weight in zip(vec, idf)] for vec in bow]
    return vocab, tfidf, idf
# Demo corpus for the BoW vs TF-IDF comparison: three pre-tokenized documents.
docs = [
    ["Python", "编程"],
    ["Java", "编程"],
    ["Python", "Python", "Python"],  # "Python" occurs three times
]
vocab_bow, bow_matrix = simple_bow(docs)
vocab_tfidf, tfidf_matrix, idf = simple_tfidf(docs)

# Echo the corpus, one document per line.
print("文档:")
for i, doc in enumerate(docs):
    print(f" Doc{i+1}: {' '.join(doc)}")
print()

# Raw count matrix.
print("BoW矩阵")
for i, vec in enumerate(bow_matrix):
    print(f" Doc{i+1}: {vec}")
print()

# TF-IDF matrix, rounded for display.
print("TF-IDF矩阵")
for i, vec in enumerate(tfidf_matrix):
    print(f" Doc{i+1}: {[round(x, 4) for x in vec]}")
print()

# Focus on Doc3 (index 2).
print("重点分析:")
print("Doc3 'Python Python Python':")
print(" BoW: Python出现3次")
# Look the column up by word: the vocab is sorted, so column 0 holds "Java"
# here, and indexing [2][0] would report Java's score (0) as Python's.
python_col = vocab_tfidf.index("Python")
print(f" TF-IDF: Python的TF-IDF = {tfidf_matrix[2][python_col]:.4f}")
print()
# NOTE(review): in this corpus "Python" occurs in Doc1 and Doc3 (not Doc2);
# the explanatory text below is kept verbatim and may need revisiting.
print("为什么Doc3的TF-IDF不是最高的")
print("因为Python在Doc1和Doc2也出现了IDF值被稀释")