Files
2026-04-23 16:03:38 +08:00

110 lines
2.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# 实战jieba分词 + TF-IDF完整流程
import jieba
import math
# Print a framed banner announcing the demo.
_rule = "=" * 50
print(_rule)
print("实战jieba分词 + TF-IDF完整流程")
print(_rule)
def simple_tfidf_tokenized(docs, stopwords=None):
    """Tokenize Chinese documents with jieba and compute a TF-IDF matrix.

    Parameters:
        docs: list[str] — raw documents.
        stopwords: optional set[str] — words to drop after tokenization.
            (Single-character tokens are always dropped regardless.)

    Returns:
        (vocab, tfidf, tokenized):
            vocab: sorted list of vocabulary words,
            tfidf: list of per-document TF-IDF vectors aligned with vocab,
            tokenized: list of per-document token lists (after filtering).

    Notes:
        IDF uses the smoothed form log(n_docs / (df + 1)) + 1, matching the
        original implementation (sklearn-style "+1" smoothing variant).
    """
    # 1. Tokenize each document; drop stopwords and single-character tokens.
    tokenized = []
    for doc in docs:
        words = jieba.cut(doc)
        if stopwords:
            words = [w for w in words if w not in stopwords and len(w) > 1]
        else:
            words = [w for w in words if len(w) > 1]
        tokenized.append(words)

    # 2. Build the vocabulary and a word -> column-index map.
    #    (The original looked up vocab.index(word) per token, which is
    #    O(tokens * |vocab|); the dict makes each lookup O(1).)
    vocab = sorted({w for doc in tokenized for w in doc})
    word_index = {w: i for i, w in enumerate(vocab)}

    # 3. Term-frequency matrix: one count vector per document.
    #    Every token is in vocab by construction, so no membership test needed.
    tf_matrix = []
    for doc in tokenized:
        vec = [0] * len(vocab)
        for word in doc:
            vec[word_index[word]] += 1
        tf_matrix.append(vec)

    # 4. Document frequency: count each distinct word once per document.
    df_dict = {w: 0 for w in vocab}
    for doc in tokenized:
        for w in set(doc):
            df_dict[w] += 1

    # 5. Smoothed IDF per vocabulary word (same formula as the original).
    n_docs = len(tokenized)
    idf = [math.log(n_docs / (df_dict[w] + 1)) + 1 for w in vocab]

    # 6. TF-IDF = elementwise TF * IDF.
    tfidf = [[count * weight for count, weight in zip(vec, idf)] for vec in tf_matrix]
    return vocab, tfidf, tokenized
# Sample document collection (Chinese sentences about programming/AI).
docs = [
    "Python是一门很棒的编程语言",
    "人工智能是未来的发展方向",
    "深度学习是机器学习的一个分支",
    "Python和Java都是很流行的编程语言"
]
# Stopword set.
# NOTE(review): several entries are empty strings — this looks like mojibake
# (single-character Chinese stopwords lost in transfer); confirm the intended
# words. Harmless at runtime: the tokenizer already drops tokens of length <= 1,
# and duplicate ""s collapse in the set, so only "一个" has any effect here.
stopwords = set(["", "", "一个", "", "", "", ""])
# Run the full pipeline: tokenize, build vocab, compute TF-IDF.
vocab, tfidf_matrix, tokenized = simple_tfidf_tokenized(docs, stopwords)
# Report: original documents, tokenization, vocabulary, and the TF-IDF matrix.
print("文档集合:")
for doc_no, text in enumerate(docs, start=1):
    print(f" Doc{doc_no}: {text}")
print()

print("分词结果:")
for doc_no, words in enumerate(tokenized, start=1):
    print(f" Doc{doc_no}: {' / '.join(words)}")
print()

print(f"词表(共{len(vocab)}个词):")
print(f" {vocab}")
print()

print("TF-IDF矩阵")
for doc_no, row in enumerate(tfidf_matrix, start=1):
    # Show only the non-zero entries, rounded for readability.
    nonzero = [(word, round(score, 4)) for word, score in zip(vocab, row) if score > 0]
    print(f" Doc{doc_no}: {nonzero}")
print()
# Find the most important (highest TF-IDF) word of each document.
# NOTE(review): the heading string below looks garbled — it probably lost
# bracket characters in transfer (e.g. "每个文档最重要的词(TF-IDF值最高)");
# kept byte-for-byte pending confirmation.
print("每个文档最重要的词TF-IDF值最高")
for i, vec in enumerate(tfidf_matrix):
    # Guard: with an empty vocabulary vec is empty and max() over an empty
    # range would raise ValueError.
    if not vec:
        continue
    max_idx = max(range(len(vec)), key=lambda j: vec[j])
    max_score = vec[max_idx]
    if max_score > 0:
        print(f" Doc{i+1}: '{vocab[max_idx]}' (TF-IDF={max_score:.4f})")