# Source listing metadata: Python, 23 lines, 622 B
# Bag-of-words (BoW) demo: build a sorted vocabulary from a small corpus and
# represent every document as a vector of per-token counts over that
# vocabulary.
from collections import Counter

documents = [
    "Python是编程语言",
    "Java是编程语言",
    "Python Python Python",
]

# Unique tokens in first-seen order. dict.fromkeys de-duplicates in O(n)
# while preserving insertion order, replacing the quadratic
# "if word not in list" scan.
# NOTE: str.split() only splits on whitespace, so a Chinese sentence written
# without spaces stays a single token here.
word_list = list(
    dict.fromkeys(word for doc in documents for word in doc.split())
)

# Sorting fixes the meaning of each vector position independently of the
# order in which tokens were encountered.
vocab = sorted(word_list)
print("词表:", vocab)

# One count vector per document, aligned with `vocab`. Counter returns 0 for
# tokens that do not occur in the document, so no membership check is needed.
bow_vectors = []
for doc in documents:
    counts = Counter(doc.split())
    bow_vectors.append([counts[word] for word in vocab])

for i, vec in enumerate(bow_vectors, start=1):
    print(f"Doc{i}的向量表示:{vec}")

# Two drawbacks of the bag-of-words model:
#   1. It ignores word order and semantic relationships between words.
#   2. The vocabulary dimension explodes and the vectors become sparse.