163 lines
3.9 KiB
Python
163 lines
3.9 KiB
Python
# TF-IDF演示(纯Python实现)
|
||
import math
|
||
|
||
# Print the demo title framed by two 50-character rules.
print("=" * 50, "TF-IDF词频-逆文档频率演示", "=" * 50, sep="\n")
|
||
|
||
def simple_tfidf(docs):
    """Compute a TF-IDF matrix for a tokenized corpus in pure Python.

    Term frequency is the raw count of a word in a document. The inverse
    document frequency of a word is ``log(n_docs / (df + 1)) + 1``, where
    ``df`` is the number of documents that contain the word.

    Args:
        docs: Collection of documents, each already tokenized into a list
            of words. It is iterated twice and passed to ``len()``.

    Returns:
        A ``(vocab, tfidf_matrix, idf)`` tuple:
            vocab: Sorted list of the distinct words in the corpus.
            tfidf_matrix: One row per document; ``row[j]`` is the count of
                ``vocab[j]`` in that document multiplied by ``idf[j]``.
            idf: IDF value of each word, aligned with ``vocab``.
    """
    # 1. Build the sorted vocabulary and a word -> column lookup, so each
    #    token is placed in O(1) instead of scanning the list with .index().
    vocab = sorted({word for doc in docs for word in doc})
    column_of = {word: col for col, word in enumerate(vocab)}

    # 2. Build the bag-of-words count matrix (one row per document).
    bow = []
    for doc in docs:
        counts = [0] * len(vocab)
        for word in doc:
            counts[column_of[word]] += 1
        bow.append(counts)

    n_docs = len(docs)

    # 3. Compute the IDF of every vocabulary word.
    idf = []
    for col in range(len(vocab)):
        # Document frequency: number of documents containing the word.
        df = sum(1 for counts in bow if counts[col] > 0)
        idf.append(math.log(n_docs / (df + 1)) + 1)

    # 4. Weight every raw count by the IDF of its column.
    tfidf = [
        [tf * weight for tf, weight in zip(counts, idf)]
        for counts in bow
    ]

    return vocab, tfidf, idf
|
||
|
||
# Sample corpus: every document is already tokenized into a list of words.
docs = [
    ["Python", "编程", "语言"],
    ["Python", "Python", "Python"],  # "Python" occurs three times
    ["Java", "编程", "语言"],
]

vocab, tfidf_matrix, idf = simple_tfidf(docs)

# Echo the corpus, one document per line.
print("文档集合:")
for doc_no, tokens in enumerate(docs, start=1):
    print(f" Doc{doc_no}: {' '.join(tokens)}")
print()

# Show the vocabulary and the IDF values rounded to 4 decimals.
print(f"词表: {vocab}")
print()
rounded_idf = [round(value, 4) for value in idf]
print(f"IDF值: {rounded_idf}")
print()

# Show the TF-IDF matrix, one rounded row per document.
print("TF-IDF矩阵:")
for doc_no, row in enumerate(tfidf_matrix, start=1):
    rounded_row = [round(value, 4) for value in row]
    print(f" Doc{doc_no}: {rounded_row}")
print()

# Per-document breakdown listing only the words with a positive score.
print("详细分析:")
for doc_no, (tokens, row) in enumerate(zip(docs, tfidf_matrix), start=1):
    print(f"\nDoc{doc_no}: {' '.join(tokens)}")
    for term, score in zip(vocab, row):
        if score > 0:
            print(f" '{term}': TF-IDF = {score:.4f}")
|
||
# TF-IDF vs BoW 对比
|
||
import math
|
||
|
||
# Print the comparison title framed by two 50-character rules.
print("=" * 50, "TF-IDF vs BoW 对比", "=" * 50, sep="\n")
|
||
|
||
def simple_bow(docs):
    """Build a bag-of-words count matrix for a tokenized corpus.

    Args:
        docs: Collection of documents, each already tokenized into a list
            of words. It is iterated twice.

    Returns:
        A ``(vocab, bow_matrix)`` tuple:
            vocab: Sorted list of the distinct words in the corpus.
            bow_matrix: One row per document; ``row[j]`` is the number of
                times ``vocab[j]`` occurs in that document.
    """
    vocab = sorted({word for doc in docs for word in doc})
    # Word -> column lookup so each token is placed in O(1) instead of
    # scanning the vocabulary list with .index().
    column_of = {word: col for col, word in enumerate(vocab)}

    bow_matrix = []
    for doc in docs:
        counts = [0] * len(vocab)
        for word in doc:
            counts[column_of[word]] += 1
        bow_matrix.append(counts)

    return vocab, bow_matrix
|
||
|
||
def simple_tfidf(docs):
    """Compute vocabulary, TF-IDF matrix and IDF values for a tokenized corpus.

    Term frequency is the raw count of a word in a document; the IDF of a
    word is ``log(n_docs / (df + 1)) + 1`` with ``df`` the number of
    documents that contain the word.

    Args:
        docs: Collection of documents, each already tokenized into a list
            of words. It is iterated twice and passed to ``len()``.

    Returns:
        A ``(vocab, tfidf, idf)`` tuple: the sorted distinct words, one row
        of ``count * idf`` values per document, and the per-word IDF values
        aligned with ``vocab``.
    """
    vocab = sorted({word for doc in docs for word in doc})
    # Word -> column lookup so each token is placed in O(1) instead of
    # scanning the vocabulary list with .index().
    column_of = {word: col for col, word in enumerate(vocab)}

    # Raw per-document word counts.
    bow = []
    for doc in docs:
        counts = [0] * len(vocab)
        for word in doc:
            counts[column_of[word]] += 1
        bow.append(counts)

    n_docs = len(docs)
    idf = []
    for col in range(len(vocab)):
        # Document frequency: number of documents containing the word.
        df = sum(1 for counts in bow if counts[col] > 0)
        idf.append(math.log(n_docs / (df + 1)) + 1)

    # Weight every raw count by the IDF of its column.
    tfidf = [
        [tf * weight for tf, weight in zip(counts, idf)]
        for counts in bow
    ]

    return vocab, tfidf, idf
|
||
|
||
# Comparison corpus: Doc3 consists of "Python" repeated three times.
docs = [
    ["Python", "编程"],
    ["Java", "编程"],
    ["Python", "Python", "Python"],  # "Python" occurs three times
]

vocab_bow, bow_matrix = simple_bow(docs)
vocab_tfidf, tfidf_matrix, idf = simple_tfidf(docs)

# Echo the corpus, one document per line.
print("文档:")
for i, doc in enumerate(docs):
    print(f" Doc{i+1}: {' '.join(doc)}")
print()

# Raw counts per document.
print("BoW矩阵:")
for i, vec in enumerate(bow_matrix):
    print(f" Doc{i+1}: {vec}")
print()

# TF-IDF weights per document, rounded to 4 decimals.
print("TF-IDF矩阵:")
for i, vec in enumerate(tfidf_matrix):
    print(f" Doc{i+1}: {[round(x, 4) for x in vec]}")
print()

# Focus on Doc3.
# The vocabulary is sorted, so "Python" is not in column 0 ("Java" sorts
# first). Look the column up instead of hard-coding it; otherwise the line
# below reports Java's score (0.0) as Python's.
python_col = vocab_tfidf.index("Python")
print("重点分析:")
print("Doc3 'Python Python Python':")
print(" BoW: Python出现3次")
print(f" TF-IDF: Python的TF-IDF = {tfidf_matrix[2][python_col]:.4f}")
print()
# NOTE(review): with this corpus the value printed above is the largest entry
# in the matrix, so the question below may need rewording.
print("为什么Doc3的TF-IDF不是最高的?")
# Besides Doc3, "Python" occurs only in Doc1 (Doc2 is "Java 编程"), so the
# explanation names Doc1 alone.
print("因为Python在Doc1也出现了,IDF值被稀释")