# TF-IDF demo (pure-Python implementation), followed by a TF-IDF vs. BoW
# comparison. Running the file prints both demos to stdout.
import math
from collections import Counter


def simple_bow(docs):
    """Build a bag-of-words count matrix.

    Args:
        docs: List of documents; each document is an already-tokenised list
            of hashable tokens.

    Returns:
        A ``(vocab, bow_matrix)`` tuple. ``vocab`` is the sorted list of
        distinct tokens across all documents, and ``bow_matrix[i][j]`` is the
        number of times ``vocab[j]`` occurs in ``docs[i]``. Empty input yields
        ``([], [])``.
    """
    vocab = sorted({word for doc in docs for word in doc})
    bow_matrix = []
    for doc in docs:
        # Counter gives O(1) per-token counting and returns 0 for tokens the
        # document does not contain, so no list.index() scan is needed.
        counts = Counter(doc)
        bow_matrix.append([counts[word] for word in vocab])
    return vocab, bow_matrix


def simple_tfidf(docs):
    """Compute a simple TF-IDF matrix.

    TF is the raw token count in a document. IDF for a token is
    ``log(n_docs / (df + 1)) + 1``, where ``df`` is the number of documents
    containing the token.

    Args:
        docs: List of documents; each document is an already-tokenised list
            of hashable tokens.

    Returns:
        A ``(vocab, tfidf_matrix, idf)`` tuple. ``vocab`` is the sorted
        vocabulary, ``tfidf_matrix[i][j]`` is the TF-IDF score of ``vocab[j]``
        in ``docs[i]``, and ``idf[j]`` is the IDF value of ``vocab[j]``.
        Empty input yields ``([], [], [])``.
    """
    vocab, bow = simple_bow(docs)
    n_docs = len(docs)

    # Document frequency: count each token once per document it appears in.
    doc_freq = Counter(word for doc in docs for word in set(doc))
    idf = [math.log(n_docs / (doc_freq[word] + 1)) + 1 for word in vocab]

    tfidf = [[tf * weight for tf, weight in zip(vec, idf)] for vec in bow]
    return vocab, tfidf, idf


# ---- Demo 1: TF-IDF on a small corpus ----
print("=" * 50)
print("TF-IDF词频-逆文档频率演示")
print("=" * 50)

docs = [
    ["Python", "编程", "语言"],
    ["Python", "Python", "Python"],  # "Python" occurs three times
    ["Java", "编程", "语言"],
]

vocab, tfidf_matrix, idf = simple_tfidf(docs)

print("文档集合:")
for i, doc in enumerate(docs):
    print(f" Doc{i+1}: {' '.join(doc)}")
print()

print(f"词表: {vocab}")
print()

print(f"IDF值: {[round(x, 4) for x in idf]}")
print()

print("TF-IDF矩阵:")
for i, vec in enumerate(tfidf_matrix):
    print(f" Doc{i+1}: {[round(x, 4) for x in vec]}")
print()

print("详细分析:")
for i, doc in enumerate(docs):
    print(f"\nDoc{i+1}: {' '.join(doc)}")
    for j, score in enumerate(tfidf_matrix[i]):
        if score > 0:
            print(f" '{vocab[j]}': TF-IDF = {score:.4f}")

# ---- Demo 2: TF-IDF vs. BoW comparison ----
print("=" * 50)
print("TF-IDF vs BoW 对比")
print("=" * 50)

docs = [
    ["Python", "编程"],
    ["Java", "编程"],
    ["Python", "Python", "Python"],  # "Python" occurs three times
]

vocab_bow, bow_matrix = simple_bow(docs)
vocab_tfidf, tfidf_matrix, idf = simple_tfidf(docs)

print("文档:")
for i, doc in enumerate(docs):
    print(f" Doc{i+1}: {' '.join(doc)}")
print()

print("BoW矩阵:")
for i, vec in enumerate(bow_matrix):
    print(f" Doc{i+1}: {vec}")
print()

print("TF-IDF矩阵:")
for i, vec in enumerate(tfidf_matrix):
    print(f" Doc{i+1}: {[round(x, 4) for x in vec]}")
print()

# Focus on Doc3. The vocabulary is sorted, so "Python" is NOT column 0
# ("Java" sorts first); look the column up instead of hard-coding it.
python_col = vocab_tfidf.index("Python")
print("重点分析:")
print("Doc3 'Python Python Python':")
print(" BoW: Python出现3次")
print(f" TF-IDF: Python的TF-IDF = {tfidf_matrix[2][python_col]:.4f}")
print()
print("为什么Doc3的TF-IDF不是最高的?")
# Besides Doc3, "Python" occurs only in Doc1 (Doc2 is Java/编程).
print("因为Python在Doc1也出现了,IDF值被稀释")