from collections import Counter
from typing import Iterable, List

# 1. Define the three sample documents (tokens are already space-separated).
docs = [
    "Python 是 编程 语言",
    "Java 是 编程 语言",
    "Python Python Python",
]

# 2. Tokenize: split every document on whitespace.
tokenized_docs = [doc.split() for doc in docs]

# 3. Build the vocabulary: de-duplicate all tokens, then sort them so that
#    every vector position maps to the same word on every run.
vocab = sorted({word for doc in tokenized_docs for word in doc})
print("✅ 词表:", vocab)


# 4. Build the bag-of-words (BoW) vectors.
def bow_vector(doc_tokens: Iterable[str], vocab_list: Iterable[str]) -> List[int]:
    """Return the bag-of-words count vector of one tokenized document.

    Args:
        doc_tokens: Tokens of a single document.
        vocab_list: Vocabulary words; the order of this sequence defines the
            order of the returned vector.

    Returns:
        A list with one entry per word in ``vocab_list`` holding how many
        times that word occurs in ``doc_tokens`` (0 when it does not occur).
    """
    # Count every token once up front instead of rescanning the document for
    # each vocabulary word.
    counts = Counter(doc_tokens)
    # Counter yields 0 for missing keys, so absent words need no special case.
    return [counts[word] for word in vocab_list]


# Compute the vector of each document.
vec1 = bow_vector(tokenized_docs[0], vocab)
vec2 = bow_vector(tokenized_docs[1], vocab)
vec3 = bow_vector(tokenized_docs[2], vocab)

print("\n📌 各文档BoW向量:")
print(f"Doc1: {vec1}")
print(f"Doc2: {vec2}")
print(f"Doc3: {vec3}")