# Hands-on example: jieba word segmentation + a complete TF-IDF pipeline
import jieba
import math

print("=" * 50)
print("实战:jieba分词 + TF-IDF完整流程")
print("=" * 50)


def simple_tfidf_tokenized(docs, stopwords=None):
    """Tokenize documents with jieba and compute their TF-IDF matrix.

    Args:
        docs: Iterable of raw document strings.
        stopwords: Optional collection of words to discard after
            segmentation. ``None`` or an empty collection disables stop-word
            filtering. Tokens of length 1 are always discarded.

    Returns:
        A tuple ``(vocab, tfidf, tokenized)`` where
        ``vocab`` is the sorted list of distinct kept tokens,
        ``tfidf`` is a list with one row per document and one column per
        ``vocab`` entry (raw term count multiplied by the term's IDF), and
        ``tokenized`` is the list of kept tokens for each document.
    """
    # 1. Tokenize. An empty/None stop-word collection filters nothing, which
    #    lets a single comprehension cover both cases.
    stop = stopwords if stopwords else ()
    tokenized = [
        [w for w in jieba.cut(doc) if len(w) > 1 and w not in stop]
        for doc in docs
    ]

    # 2. Build the vocabulary and a word -> column lookup so that counting
    #    does not rescan the vocabulary list for every token.
    vocab = sorted({word for doc in tokenized for word in doc})
    column_of = {word: col for col, word in enumerate(vocab)}

    # 3. Term-frequency rows and per-term document frequency.
    n_docs = len(tokenized)
    n_terms = len(vocab)
    tf_matrix = []
    doc_freq = [0] * n_terms
    for doc in tokenized:
        row = [0] * n_terms
        for word in doc:
            row[column_of[word]] += 1
        tf_matrix.append(row)
        # Each document contributes at most once to a term's DF.
        for word in set(doc):
            doc_freq[column_of[word]] += 1

    # 4. IDF = log(N / (df + 1)) + 1. The log term becomes slightly negative
    #    for a term present in every document; the trailing +1 keeps the
    #    resulting weight positive.
    idf = [math.log(n_docs / (df + 1)) + 1 for df in doc_freq]

    # 5. TF-IDF = raw count * IDF, column by column.
    tfidf = [
        [count * weight for count, weight in zip(row, idf)]
        for row in tf_matrix
    ]

    return vocab, tfidf, tokenized


# Sample document collection
docs = [
    "Python是一门很棒的编程语言",
    "人工智能是未来的发展方向",
    "深度学习是机器学习的一个分支",
    "Python和Java都是很流行的编程语言"
]

# Stop words
stopwords = set(["的", "是", "一个", "很", "和", "在", "了"])

vocab, tfidf_matrix, tokenized = simple_tfidf_tokenized(docs, stopwords)

print("文档集合:")
for i, doc in enumerate(docs):
    print(f" Doc{i+1}: {doc}")
print()

print("分词结果:")
for i, words in enumerate(tokenized):
    print(f" Doc{i+1}: {' / '.join(words)}")
print()

print(f"词表(共{len(vocab)}个词):")
print(f" {vocab}")
print()

print("TF-IDF矩阵:")
for i, vec in enumerate(tfidf_matrix):
    # Show only the non-zero entries
    nonzero = [(word, round(score, 4)) for word, score in zip(vocab, vec) if score > 0]
    print(f" Doc{i+1}: {nonzero}")
print()

# Find the most important word of each document
print("每个文档最重要的词(TF-IDF值最高):")
for i, vec in enumerate(tfidf_matrix):
    # default=None avoids a ValueError when the vocabulary is empty;
    # on ties max() keeps the first (lowest-index) column.
    max_idx = max(range(len(vec)), key=vec.__getitem__, default=None)
    if max_idx is None:
        continue
    max_score = vec[max_idx]
    if max_score > 0:
        print(f" Doc{i+1}: '{vocab[max_idx]}' (TF-IDF={max_score:.4f})")