27 lines
735 B
Python
27 lines
735 B
Python
# 1. Define three sample documents (tokens are already separated by spaces).
docs = [
    "Python 是 编程 语言",
    "Java 是 编程 语言",
    "Python Python Python",
]

# 2. Tokenize: split each document on whitespace.
tokenized_docs = [doc.split() for doc in docs]

# 3. Build the vocabulary: de-duplicate all tokens, then sort them so that
#    every vector position maps to the same word on every run.
vocab = sorted({word for doc in tokenized_docs for word in doc})
print("✅ 词表:", vocab)
|
||
|
||
# 4. Build bag-of-words (BoW) vectors.
def bow_vector(doc_tokens, vocab_list):
    """Return the bag-of-words count vector of one tokenized document.

    Args:
        doc_tokens: Iterable of (hashable) tokens making up the document.
        vocab_list: Ordered vocabulary; it fixes the length and the
            position-to-word mapping of the returned vector.

    Returns:
        A list of ints, one per entry of ``vocab_list``, giving how many
        times that word occurs in ``doc_tokens``. Words absent from the
        document get 0; tokens absent from the vocabulary are ignored.
    """
    # Local import keeps this block self-contained.
    from collections import Counter

    # Count every token once up front instead of rescanning the document
    # for each vocabulary word.
    counts = Counter(doc_tokens)
    # Counter yields 0 for missing keys (without inserting them).
    return [counts[word] for word in vocab_list]
|
||
|
||
# Compute the BoW vector of each of the three documents.
vec1 = bow_vector(tokenized_docs[0], vocab)
vec2 = bow_vector(tokenized_docs[1], vocab)
vec3 = bow_vector(tokenized_docs[2], vocab)

# Report the vectors; positions follow the order of `vocab`.
print("\n📌 各文档BoW向量:")
print(f"Doc1: {vec1}")
print(f"Doc2: {vec2}")
print(f"Doc3: {vec3}")