完成作业
This commit is contained in:
27
5.py
Normal file
27
5.py
Normal file
@@ -0,0 +1,27 @@
# 1. Define the three sample documents. Tokens are already separated by
#    spaces, so no real word segmentation is needed.
docs = [
    "Python 是 编程 语言",
    "Java 是 编程 语言",
    "Python Python Python",
]

# 2. Tokenize: split every document on whitespace.
tokenized_docs = [doc.split() for doc in docs]

# 3. Build the vocabulary: de-duplicate all tokens, then sort them so the
#    column order of every vector is the same and reproducible.
vocab = sorted({word for doc in tokenized_docs for word in doc})
print("✅ 词表:", vocab)

# 4. Build bag-of-words (BoW) vectors.
def bow_vector(doc_tokens, vocab_list):
    """Return the bag-of-words count vector for one tokenized document.

    Args:
        doc_tokens: The tokens of a single document (hashable items,
            e.g. a list of strings).
        vocab_list: The vocabulary; its order defines the order of the
            entries in the returned vector.

    Returns:
        A list with one integer per entry of ``vocab_list``: the number of
        times that word occurs in ``doc_tokens``. Vocabulary words missing
        from the document yield 0; document tokens that are not in the
        vocabulary are ignored.
    """
    # Local import keeps this block self-contained without touching the
    # file's top-level imports.
    from collections import Counter

    # Count every token once instead of rescanning the document for each
    # vocabulary word.
    counts = Counter(doc_tokens)
    return [counts[word] for word in vocab_list]


# Compute the BoW vector of each document against the shared vocabulary.
vec1 = bow_vector(tokenized_docs[0], vocab)
vec2 = bow_vector(tokenized_docs[1], vocab)
vec3 = bow_vector(tokenized_docs[2], vocab)

# Print the resulting vectors, one document per line.
print("\n📌 各文档BoW向量:")
print(f"Doc1: {vec1}")
print(f"Doc2: {vec2}")
print(f"Doc3: {vec3}")
Reference in New Issue
Block a user