完成作业3-2-1

This commit is contained in:
2509165016
2026-04-23 16:00:52 +08:00
parent b936c0ca34
commit 8460c913c8
4 changed files with 486 additions and 0 deletions

163
2509165016-7.py Normal file
View File

@@ -0,0 +1,163 @@
# Pure-Python TF-IDF demo
import math
# Title framed above and below by a 50-character rule, one item per line.
print("=" * 50, "TF-IDF词频-逆文档频率演示", "=" * 50, sep="\n")
def simple_tfidf(docs):
    """Compute a TF-IDF matrix for a collection of pre-tokenized documents.

    Term frequency is the raw count of a token in a document. The inverse
    document frequency of a token is ``log(n_docs / (df + 1)) + 1``, where
    ``df`` is the number of documents that contain the token.

    Args:
        docs: List of documents; each document is already a list of tokens.

    Returns:
        A ``(vocab, tfidf_matrix, idf)`` tuple where ``vocab`` is the sorted
        list of distinct tokens, ``tfidf_matrix`` holds one row per document
        and one column per vocabulary entry, and ``idf`` holds the IDF value
        of each vocabulary entry in vocabulary order.
    """
    # 1. Build the sorted vocabulary and a token -> column lookup so counting
    #    below does not have to scan the vocabulary list for every token.
    vocab = sorted({word for doc in docs for word in doc})
    column_of = {word: col for col, word in enumerate(vocab)}

    # 2. Build the bag-of-words count matrix (one row per document).
    bow = []
    for doc in docs:
        vec = [0] * len(vocab)
        for word in doc:
            vec[column_of[word]] += 1
        bow.append(vec)

    n_docs = len(docs)

    # 3. IDF per vocabulary column; df counts the documents with a non-zero
    #    count in that column.
    idf = []
    for col in range(len(vocab)):
        df = sum(1 for vec in bow if vec[col] > 0)
        idf.append(math.log(n_docs / (df + 1)) + 1)

    # 4. Scale every raw count by the IDF of its column.
    tfidf = [[tf * weight for tf, weight in zip(vec, idf)] for vec in bow]
    return vocab, tfidf, idf
# Demo corpus: three pre-tokenized documents.
docs = [
    ["Python", "编程", "语言"],
    ["Python", "Python", "Python"],  # "Python" occurs three times
    ["Java", "编程", "语言"],
]
vocab, tfidf_matrix, idf = simple_tfidf(docs)

# Echo the corpus, numbering documents from 1.
print("文档集合:")
for doc_no, tokens in enumerate(docs, start=1):
    print(f" Doc{doc_no}: {' '.join(tokens)}")
print()

print(f"词表: {vocab}")
print()

# IDF values rounded to four decimals for display only.
rounded_idf = [round(value, 4) for value in idf]
print(f"IDF值: {rounded_idf}")
print()

print("TF-IDF矩阵")
for doc_no, row in enumerate(tfidf_matrix, start=1):
    rounded_row = [round(value, 4) for value in row]
    print(f" Doc{doc_no}: {rounded_row}")
print()

# Per-document breakdown listing only the terms with a positive score.
print("详细分析:")
for doc_no, (tokens, row) in enumerate(zip(docs, tfidf_matrix), start=1):
    print(f"\nDoc{doc_no}: {' '.join(tokens)}")
    for term, score in zip(vocab, row):
        if score <= 0:
            continue
        print(f" '{term}': TF-IDF = {score:.4f}")
# TF-IDF vs BoW comparison
import math
# Title framed above and below by a 50-character rule, one item per line.
print("=" * 50, "TF-IDF vs BoW 对比", "=" * 50, sep="\n")
def simple_bow(docs):
    """Build a bag-of-words count matrix for pre-tokenized documents.

    Args:
        docs: List of documents; each document is already a list of tokens.

    Returns:
        A ``(vocab, bow_matrix)`` tuple where ``vocab`` is the sorted list of
        distinct tokens and ``bow_matrix`` holds one row per document with
        the raw count of every vocabulary entry, in vocabulary order.
    """
    vocab = sorted({word for doc in docs for word in doc})
    # Token -> column lookup; avoids scanning the vocabulary list per token.
    column_of = {word: col for col, word in enumerate(vocab)}

    bow_matrix = []
    for doc in docs:
        vec = [0] * len(vocab)
        for word in doc:
            vec[column_of[word]] += 1
        bow_matrix.append(vec)
    return vocab, bow_matrix
def simple_tfidf(docs):
    """Compute a TF-IDF matrix for a collection of pre-tokenized documents.

    NOTE: a function with this same name is defined earlier in this file;
    this later definition is the one in effect from this point on.

    Term frequency is the raw count of a token in a document. The inverse
    document frequency of a token is ``log(n_docs / (df + 1)) + 1``, where
    ``df`` is the number of documents that contain the token.

    Args:
        docs: List of documents; each document is already a list of tokens.

    Returns:
        A ``(vocab, tfidf_matrix, idf)`` tuple where ``vocab`` is the sorted
        list of distinct tokens, ``tfidf_matrix`` holds one row per document
        and one column per vocabulary entry, and ``idf`` holds the IDF value
        of each vocabulary entry in vocabulary order.
    """
    vocab = sorted({word for doc in docs for word in doc})
    # Token -> column lookup; avoids scanning the vocabulary list per token.
    column_of = {word: col for col, word in enumerate(vocab)}

    # Raw term counts, one row per document.
    bow = []
    for doc in docs:
        vec = [0] * len(vocab)
        for word in doc:
            vec[column_of[word]] += 1
        bow.append(vec)

    n_docs = len(docs)

    # Document frequency = number of rows with a non-zero count in a column.
    idf = []
    for col in range(len(vocab)):
        df = sum(1 for vec in bow if vec[col] > 0)
        idf.append(math.log(n_docs / (df + 1)) + 1)

    # Scale every raw count by the IDF of its column.
    tfidf = [[tf * weight for tf, weight in zip(vec, idf)] for vec in bow]
    return vocab, tfidf, idf
# Comparison corpus: three pre-tokenized documents.
docs = [
    ["Python", "编程"],
    ["Java", "编程"],
    ["Python", "Python", "Python"],  # "Python" occurs three times
]
vocab_bow, bow_matrix = simple_bow(docs)
vocab_tfidf, tfidf_matrix, idf = simple_tfidf(docs)

print("文档:")
for i, doc in enumerate(docs):
    print(f" Doc{i+1}: {' '.join(doc)}")
print()

print("BoW矩阵")
for i, vec in enumerate(bow_matrix):
    print(f" Doc{i+1}: {vec}")
print()

print("TF-IDF矩阵")
for i, vec in enumerate(tfidf_matrix):
    print(f" Doc{i+1}: {[round(x, 4) for x in vec]}")
print()

# Focus on Doc3.
# The vocabulary is sorted, so for this corpus column 0 is "Java", not
# "Python"; look the column up by name instead of hard-coding an index.
python_col = vocab_tfidf.index("Python")
print("重点分析:")
print("Doc3 'Python Python Python':")
print(" BoW: Python出现3次")
print(f" TF-IDF: Python的TF-IDF = {tfidf_matrix[2][python_col]:.4f}")
print()
# NOTE(review): the explanation printed below says "Python" also occurs in
# Doc1 and Doc2, but in this corpus it occurs in Doc1 and Doc3 -- confirm the
# intended wording before relying on this text.
print("为什么Doc3的TF-IDF不是最高的")
print("因为Python在Doc1和Doc2也出现了IDF值被稀释")