# Question 5: Vocabulary: [Python, 是, 编程, 语言, Java]
# Doc1 vector: [1, 1, 1, 1, 0]
# Doc2 vector: [0, 1, 1, 1, 1]
# Doc3 vector: [3, 0, 0, 0, 0]

# Question 6: limitations of Bag-of-Words (BoW)
# 1. BoW cannot capture word order or contextual semantics:
#    it only counts term frequencies, ignoring word order and syntactic
#    structure (see the sketch just below).
# 2. BoW loses semantic relations between words and cannot handle polysemy:
#    different senses of the same word, and near-synonyms, are indistinguishable.
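# To make point 1 concrete, here is a minimal sketch (the sentences and their
# tokenization are made up for this demo): two sentences with opposite meanings
# but the same multiset of words get identical BoW vectors, so word order and
# negation scope are lost entirely.
from collections import Counter

sent_a = ["我", "喜欢", "猫", "不", "喜欢", "狗"]  # "I like cats, not dogs"
sent_b = ["我", "喜欢", "狗", "不", "喜欢", "猫"]  # "I like dogs, not cats"

vocab_demo = sorted(set(sent_a) | set(sent_b))
vec_a = [Counter(sent_a)[w] for w in vocab_demo]
vec_b = [Counter(sent_b)[w] for w in vocab_demo]

print("Vocabulary:", vocab_demo)
print("Sentence A vector:", vec_a)
print("Sentence B vector:", vec_b)
print("Identical vectors despite opposite meanings:", vec_a == vec_b)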
import subprocess
import sys

# Install jieba quietly; sys.executable ensures pip targets the running interpreter
subprocess.run([sys.executable, '-m', 'pip', 'install', 'jieba', '-q'])
print("jieba installed!")

import jieba

print("=" * 50)
print("jieba segmentation demo")
print("=" * 50)

text = "我喜欢深度学习和人工智能"
print(f"Original: {text}")
print()

# Precise mode (the default): the most accurate segmentation, suited to text analysis
words_precise = list(jieba.cut(text, cut_all=False))
print(f"Precise mode: {' / '.join(words_precise)}")

# Full mode: lists every word the sentence could possibly contain; fast but ambiguous
words_full = list(jieba.cut(text, cut_all=True))
print(f"Full mode: {' / '.join(words_full)}")

# Search-engine mode: re-splits long words, suitable for building search indexes
words_search = list(jieba.cut_for_search(text))
print(f"Search-engine mode: {' / '.join(words_search)}")

# More segmentation examples
print("=" * 50)
print("More segmentation examples")
print("=" * 50)

examples = [
    "今天天气真不错",
    "人工智能是未来的发展方向",
    "Python是一门非常流行的编程语言",
    "小明毕业于清华大学计算机系",
    "我今天在京东买了一部iPhone手机"
]

for i, text in enumerate(examples):
    words = list(jieba.cut(text))
    print(f"{i+1}. {text}")
    print(f"   → {' / '.join(words)}")
    print()

import jieba.posseg as pseg

print("=" * 50)
print("jieba POS tagging demo")
print("=" * 50)

text = "我喜欢深度学习和人工智能"
print(f"Original: {text}")
print()

words = pseg.cut(text)
print("Segmentation + POS tags:")
for word, flag in words:
    print(f"  {word}: {flag}")

print("=" * 50)
print("Stopword filtering demo")
print("=" * 50)

# A small list of common Chinese stopwords
stopwords = set(['的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都',
                 '一', '一个', '上', '也', '很', '到', '说', '要', '去', '你',
                 '会', '着', '没有', '看', '好', '自己', '这'])

text = "人工智能是未来的发展方向,也是当前科技领域的热门话题"
print(f"Original: {text}")
print()

# Without stopword filtering
words_all = list(jieba.cut(text))
print(f"Without stopword filtering: {' / '.join(words_all)}")

# With stopword filtering
words_filtered = [w for w in words_all if w not in stopwords]
print(f"With stopword filtering: {' / '.join(words_filtered)}")
print()

# More complete stopword lists can be downloaded online
print("Tip: in real projects, stopword lists can be obtained from:")
print("  - the HIT (Harbin Institute of Technology) stopword list")
print("  - the Baidu stopword list")
print("  - the Sichuan University Machine Learning Lab stopword list")

# Hands-on: a complete text preprocessing pipeline
print("=" * 50)
print("Complete text preprocessing pipeline")
print("=" * 50)

# Sample document collection
docs = [
    "今天天气真不错!适合出去玩。",
    "Python是一门很棒的编程语言。",
    "人工智能和机器学习是未来的发展方向。",
    "今天在咖啡馆喝了一杯很好喝的拿铁。"
]

# Stopword list (punctuation included so it gets stripped as well)
stopwords = set(['的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都',
                 '一', '一个', '上', '也', '很', '到', '说', '要', '去', '你',
                 '会', '着', '没有', '看', '好', '自己', '这', '!', '。', ','])

def preprocess_text(text):
    """Complete text preprocessing pipeline: segment, then filter."""
    # 1. Segment
    words = jieba.cut(text)
    # 2. Remove stopwords and empty tokens
    words = [w for w in words if w not in stopwords and len(w) > 0]
    # 3. Remove whitespace-only tokens
    words = [w for w in words if w.strip()]
    return words

print("Preprocessing results:")
for i, doc in enumerate(docs):
    words = preprocess_text(doc)
    print(f"\nDoc{i+1}: {doc}")
    print(f"  → {' / '.join(words)}")

# Hands-on: jieba segmentation + a complete TF-IDF pipeline
import math

print("=" * 50)
print("Hands-on: jieba segmentation + complete TF-IDF pipeline")
print("=" * 50)

def simple_tfidf_tokenized(docs, stopwords=None):
    """
    TF-IDF on top of jieba segmentation.

    Parameters:
        docs: list of raw documents
        stopwords: optional set of stopwords

    Returns:
        vocab, tfidf_matrix, tokenized
    """
    # 1. Segment each document (len(w) > 1 also drops single-character tokens)
    tokenized = []
    for doc in docs:
        words = jieba.cut(doc)
        if stopwords:
            words = [w for w in words if w not in stopwords and len(w) > 1]
        else:
            words = [w for w in words if len(w) > 1]
        tokenized.append(words)

    # 2. Build the vocabulary
    vocab_set = set()
    for doc in tokenized:
        vocab_set.update(doc)
    vocab = sorted(vocab_set)
    word2idx = {word: j for j, word in enumerate(vocab)}  # O(1) index lookups

    # 3. Build the TF matrix
    n_docs = len(tokenized)
    tf_matrix = []
    for doc in tokenized:
        vec = [0] * len(vocab)
        for word in doc:
            vec[word2idx[word]] += 1
        tf_matrix.append(vec)

    # 4. Document frequency: in how many documents each word appears
    df_dict = {word: 0 for word in vocab}
    for vec in tf_matrix:
        for j, count in enumerate(vec):
            if count > 0:
                df_dict[vocab[j]] += 1

    # 5. Smoothed IDF: log(N / (df + 1)) + 1 avoids division by zero
    #    and keeps every weight positive
    idf = []
    for word in vocab:
        df = df_dict[word]
        idf.append(math.log(n_docs / (df + 1)) + 1)

    # 6. TF-IDF = TF * IDF
    tfidf = []
    for vec in tf_matrix:
        tfidf.append([vec[j] * idf[j] for j in range(len(vec))])

    return vocab, tfidf, tokenized

# Sample document collection
docs = [
    "Python是一门很棒的编程语言",
    "人工智能是未来的发展方向",
    "深度学习是机器学习的一个分支",
    "Python和Java都是很流行的编程语言"
]

# Stopwords
stopwords = set(["的", "是", "一个", "很", "和", "在", "了"])

vocab, tfidf_matrix, tokenized = simple_tfidf_tokenized(docs, stopwords)

print("Document collection:")
for i, doc in enumerate(docs):
    print(f"  Doc{i+1}: {doc}")
print()

print("Segmentation results:")
for i, words in enumerate(tokenized):
    print(f"  Doc{i+1}: {' / '.join(words)}")
print()

print(f"Vocabulary ({len(vocab)} words):")
print(f"  {vocab}")
print()

print("TF-IDF matrix:")
for i, vec in enumerate(tfidf_matrix):
    # Show only the nonzero entries
    nonzero = [(vocab[j], round(vec[j], 4)) for j in range(len(vec)) if vec[j] > 0]
    print(f"  Doc{i+1}: {nonzero}")
print()

# The most important word in each document
print("Most important word per document (highest TF-IDF):")
for i, vec in enumerate(tfidf_matrix):
    max_idx = max(range(len(vec)), key=lambda j: vec[j])
    max_score = vec[max_idx]
    if max_score > 0:
        print(f"  Doc{i+1}: '{vocab[max_idx]}' (TF-IDF={max_score:.4f})")
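# Optional cross-check — a sketch, assuming a recent scikit-learn (>= 1.0) is
# installed; it is not used elsewhere in this lesson. TfidfVectorizer applies a
# slightly different smoothing, idf = ln((1 + N) / (1 + df)) + 1, and
# L2-normalizes each row by default, so the absolute scores differ from
# simple_tfidf_tokenized, but the per-document ranking of the most important
# words should broadly agree.
from sklearn.feature_extraction.text import TfidfVectorizer

def jieba_tokenizer(text):
    # Reuse the same segmentation + filtering rules as simple_tfidf_tokenized
    return [w for w in jieba.cut(text) if w not in stopwords and len(w) > 1]

vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer, token_pattern=None,
                             lowercase=False)
matrix = vectorizer.fit_transform(docs)
feature_names = vectorizer.get_feature_names_out()

print("sklearn TfidfVectorizer, most important word per document:")
for i in range(matrix.shape[0]):
    row = matrix[i].toarray().ravel()
    top = row.argmax()
    print(f"  Doc{i+1}: '{feature_names[top]}' (TF-IDF={row[top]:.4f})")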