完成作业3-2-1
This commit is contained in:
163
2509165016-7.py
Normal file
163
2509165016-7.py
Normal file
@@ -0,0 +1,163 @@
|
||||
# TF-IDF演示(纯Python实现)
|
||||
import math
|
||||
|
||||
# Banner for the TF-IDF demo: a title framed by two 50-character rules.
print("=" * 50, "TF-IDF词频-逆文档频率演示", "=" * 50, sep="\n")
|
||||
|
||||
def simple_tfidf(docs):
    """Compute a simple TF-IDF matrix for pre-tokenized documents.

    Term frequency is the raw occurrence count of a word in a document.
    Inverse document frequency is ``log(n_docs / (df + 1)) + 1``, where
    ``df`` is the number of documents containing the word.

    Args:
        docs: List of documents; each document is a list of tokens.

    Returns:
        A ``(vocab, tfidf_matrix, idf)`` tuple where ``vocab`` is the sorted
        list of distinct tokens, ``tfidf_matrix`` holds one row per document
        with columns aligned to ``vocab``, and ``idf`` holds one IDF value
        per vocabulary entry.
    """
    # 1. Build the sorted vocabulary and a word -> column lookup so that
    #    counting does not rescan the vocabulary list for every token.
    vocab = sorted({word for doc in docs for word in doc})
    column_of = {word: j for j, word in enumerate(vocab)}

    # 2. Build the bag-of-words count matrix.
    bow = []
    for doc in docs:
        vec = [0] * len(vocab)
        for word in doc:
            vec[column_of[word]] += 1
        bow.append(vec)

    n_docs = len(docs)

    # 3. Compute the IDF of every vocabulary column. The "+ 1" in the
    #    denominator keeps the ratio finite; the trailing "+ 1" keeps the
    #    weight positive even for words that occur in every document.
    idf = []
    for j in range(len(vocab)):
        df = sum(1 for vec in bow if vec[j] > 0)
        idf.append(math.log(n_docs / (df + 1)) + 1)

    # 4. Scale each raw count by the IDF of its column.
    tfidf = [[tf * weight for tf, weight in zip(vec, idf)] for vec in bow]

    return vocab, tfidf, idf
|
||||
|
||||
# Demo corpus: three documents that are already split into tokens.
docs = [
    ["Python", "编程", "语言"],
    ["Python", "Python", "Python"],  # "Python" occurs three times
    ["Java", "编程", "语言"],
]

vocab, tfidf_matrix, idf = simple_tfidf(docs)

# Echo the input documents, numbered from 1.
print("文档集合:")
for doc_no, tokens in enumerate(docs, start=1):
    print(f" Doc{doc_no}: {' '.join(tokens)}")
print()

# Vocabulary and per-word IDF values (rounded for display only).
print(f"词表: {vocab}")
print()
print(f"IDF值: {[round(value, 4) for value in idf]}")
print()

# One TF-IDF row per document, rounded for display only.
print("TF-IDF矩阵:")
for doc_no, row in enumerate(tfidf_matrix, start=1):
    print(f" Doc{doc_no}: {[round(value, 4) for value in row]}")
print()

# Per-document breakdown listing only the words with a positive score.
print("详细分析:")
for doc_no, (tokens, row) in enumerate(zip(docs, tfidf_matrix), start=1):
    print(f"\nDoc{doc_no}: {' '.join(tokens)}")
    for term, score in zip(vocab, row):
        if score > 0:
            print(f" '{term}': TF-IDF = {score:.4f}")
|
||||
# TF-IDF vs BoW 对比
|
||||
import math
|
||||
|
||||
# Banner for the TF-IDF vs BoW comparison: a title framed by two rules.
print("=" * 50, "TF-IDF vs BoW 对比", "=" * 50, sep="\n")
|
||||
|
||||
def simple_bow(docs):
    """Build a bag-of-words count matrix for pre-tokenized documents.

    Args:
        docs: List of documents; each document is a list of tokens.

    Returns:
        A ``(vocab, bow_matrix)`` tuple where ``vocab`` is the sorted list of
        distinct tokens and ``bow_matrix`` holds one row per document with
        the occurrence count of each vocabulary word, in ``vocab`` order.
    """
    vocab = sorted({word for doc in docs for word in doc})
    # Word -> column lookup, so counting does not rescan the vocabulary
    # list for every token.
    column_of = {word: j for j, word in enumerate(vocab)}

    bow_matrix = []
    for doc in docs:
        vec = [0] * len(vocab)
        for word in doc:
            vec[column_of[word]] += 1
        bow_matrix.append(vec)
    return vocab, bow_matrix
|
||||
|
||||
def simple_tfidf(docs):
    """Compute a simple TF-IDF matrix for pre-tokenized documents.

    Term frequency is the raw occurrence count of a word in a document.
    Inverse document frequency is ``log(n_docs / (df + 1)) + 1``, where
    ``df`` is the number of documents containing the word.

    Args:
        docs: List of documents; each document is a list of tokens.

    Returns:
        A ``(vocab, tfidf_matrix, idf)`` tuple where ``vocab`` is the sorted
        list of distinct tokens, ``tfidf_matrix`` holds one row per document
        with columns aligned to ``vocab``, and ``idf`` holds one IDF value
        per vocabulary entry.
    """
    vocab = sorted({word for doc in docs for word in doc})
    # Word -> column lookup, so counting does not rescan the vocabulary
    # list for every token.
    column_of = {word: j for j, word in enumerate(vocab)}

    # Raw count matrix, one row per document.
    bow = []
    for doc in docs:
        vec = [0] * len(vocab)
        for word in doc:
            vec[column_of[word]] += 1
        bow.append(vec)

    n_docs = len(docs)

    # IDF per vocabulary column; df counts the documents containing the word.
    idf = []
    for j in range(len(vocab)):
        df = sum(1 for vec in bow if vec[j] > 0)
        idf.append(math.log(n_docs / (df + 1)) + 1)

    # Scale each raw count by the IDF of its column.
    tfidf = [[tf * weight for tf, weight in zip(vec, idf)] for vec in bow]

    return vocab, tfidf, idf
|
||||
|
||||
# Comparison corpus: three documents that are already split into tokens.
docs = [
    ["Python", "编程"],
    ["Java", "编程"],
    ["Python", "Python", "Python"],  # "Python" occurs three times
]

vocab_bow, bow_matrix = simple_bow(docs)
vocab_tfidf, tfidf_matrix, idf = simple_tfidf(docs)

# Echo the input documents, numbered from 1.
print("文档:")
for i, doc in enumerate(docs):
    print(f" Doc{i+1}: {' '.join(doc)}")
print()

# Raw counts per document.
print("BoW矩阵:")
for i, vec in enumerate(bow_matrix):
    print(f" Doc{i+1}: {vec}")
print()

# TF-IDF weights per document, rounded for display only.
print("TF-IDF矩阵:")
for i, vec in enumerate(tfidf_matrix):
    print(f" Doc{i+1}: {[round(x, 4) for x in vec]}")
print()

# Focus on Doc3.
# The vocabulary is sorted, so "Java" occupies column 0 and "Python" does
# not; look the column up by name instead of hard-coding an index.
python_col = vocab_tfidf.index("Python")
print("重点分析:")
print("Doc3 'Python Python Python':")
print(" BoW: Python出现3次")
print(f" TF-IDF: Python的TF-IDF = {tfidf_matrix[2][python_col]:.4f}")
print()
# NOTE(review): with the corrected column the value printed above is the
# largest entry of this matrix, so the question below may need rewording;
# it is left unchanged here for the author to decide.
print("为什么Doc3的TF-IDF不是最高的?")
# "Python" also occurs in Doc1 (not Doc2, which is "Java 编程").
print("因为Python在Doc1中也出现了,IDF值被稀释")
|
||||
Reference in New Issue
Block a user