Files
task-3-2-1-Text-Processing-…/4.23 25/25 2.py
2026-04-23 15:54:30 +08:00

99 lines
2.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import jieba
import math
# Title banner: a 50-character rule above and below the demo heading.
print("=" * 50, "实战jieba分词 + TF-IDF完整流程", "=" * 50, sep="\n")
def simple_tfidf_tokenized(docs, stopwords=None):
    """Tokenize documents with jieba and compute a TF-IDF matrix.

    Each document is segmented with ``jieba.cut``. Tokens whose length is
    not greater than 1 are always dropped; tokens contained in
    ``stopwords`` are additionally dropped when a non-empty stopword
    collection is supplied.

    Term frequency is the raw count of a term within a document. Inverse
    document frequency is ``log(n_docs / (df + 1)) + 1``, where ``df`` is
    the number of documents that contain the term.

    Args:
        docs: Iterable of raw document strings.
        stopwords: Optional collection of tokens to exclude. ``None`` or
            an empty collection disables stopword filtering.

    Returns:
        A tuple ``(vocab, tfidf, tokenized)``:
            vocab: Sorted list of the distinct tokens kept across all
                documents.
            tfidf: One list per document, aligned with ``vocab``, holding
                ``count * idf`` for every vocabulary term.
            tokenized: The filtered token list of each document.
    """
    tokenized = []
    for doc in docs:
        tokens = jieba.cut(doc)
        if stopwords:
            kept = [w for w in tokens if w not in stopwords and len(w) > 1]
        else:
            kept = [w for w in tokens if len(w) > 1]
        tokenized.append(kept)

    vocab = sorted({w for doc_tokens in tokenized for w in doc_tokens})
    # Build the term -> column map once. Looking terms up with list.index()
    # inside the counting loop costs O(len(vocab)) per token.
    column_of = {word: j for j, word in enumerate(vocab)}
    n_docs = len(tokenized)

    tf_matrix = []
    doc_freq = [0] * len(vocab)
    for doc_tokens in tokenized:
        counts = [0] * len(vocab)
        for word in doc_tokens:
            counts[column_of[word]] += 1
        tf_matrix.append(counts)
        # A term contributes at most 1 to its document frequency per document.
        for j, count in enumerate(counts):
            if count > 0:
                doc_freq[j] += 1

    # With an empty corpus the vocabulary is empty too, so no division or
    # logarithm is evaluated here.
    idf = [math.log(n_docs / (df + 1)) + 1 for df in doc_freq]

    tfidf = [
        [count * weight for count, weight in zip(counts, idf)]
        for counts in tf_matrix
    ]
    return vocab, tfidf, tokenized
# Demo corpus: four short Chinese sentences.
docs = [
    "Python是一门很棒的编程语言",
    "人工智能是未来的发展方向",
    "深度学习是机器学习的一个分支",
    "Python和Java都是很流行的编程语言"
]
# NOTE(review): the empty-string entries look like placeholders for
# characters that were lost somewhere upstream -- confirm against the
# intended stopword list.
stopwords = set(["", "", "一个", "", "", "", ""])
vocab, tfidf_matrix, tokenized = simple_tfidf_tokenized(docs, stopwords)

# Echo the raw documents.
print("文档集合:")
for doc_no, text in enumerate(docs, start=1):
    print(f" Doc{doc_no}: {text}")
print()

# Show the tokens kept for each document.
print("分词结果:")
for doc_no, tokens in enumerate(tokenized, start=1):
    joined = " / ".join(tokens)
    print(f" Doc{doc_no}: {joined}")
print()

# Show the vocabulary and its size.
print(f"词表(共{len(vocab)}个词):")
print(f" {vocab}")
print()

# Show only the strictly positive TF-IDF entries of each document row.
print("TF-IDF矩阵")
for doc_no, row in enumerate(tfidf_matrix, start=1):
    nonzero = [
        (term, round(score, 4))
        for term, score in zip(vocab, row)
        if score > 0
    ]
    print(f" Doc{doc_no}: {nonzero}")
print()

# Report the highest-scoring term per document (first one wins on ties).
print("每个文档最重要的词TF-IDF值最高")
for doc_no, row in enumerate(tfidf_matrix, start=1):
    max_idx = max(range(len(row)), key=row.__getitem__)
    max_score = row[max_idx]
    if max_score <= 0:
        continue
    print(f" Doc{doc_no}: '{vocab[max_idx]}' (TF-IDF={max_score:.4f})")