47 lines
1.1 KiB
Python
47 lines
1.1 KiB
Python
import numpy as np
|
||
|
||
# Banner for the demo: a 50-character rule above and below the title.
print("=" * 50, "BoW忽略词序的演示", "=" * 50, sep="\n")
|
||
|
||
def simple_bow(docs):
    """Build a bag-of-words count matrix for already-tokenized documents.

    Args:
        docs: Iterable of documents, each an iterable of hashable tokens
            that can be sorted against one another (e.g. strings).

    Returns:
        A tuple ``(vocab, bow_matrix)``. ``vocab`` is the sorted list of
        distinct tokens seen across all documents. ``bow_matrix`` is a list
        with one row per document, where ``row[j]`` is the number of times
        ``vocab[j]`` occurs in that document.
    """
    # Materialize first: the input is walked twice (vocabulary pass, then
    # counting pass), which would silently drop everything from a one-shot
    # iterator or generator.
    docs = [list(doc) for doc in docs]

    vocab = sorted({word for doc in docs for word in doc})
    # Token -> column lookup, so counting does not rescan the vocab list
    # for every single token.
    column_of = {word: col for col, word in enumerate(vocab)}

    bow_matrix = []
    for doc in docs:
        vec = [0] * len(vocab)
        for word in doc:
            vec[column_of[word]] += 1
        bow_matrix.append(vec)
    return vocab, bow_matrix
|
||
|
||
# Three documents made of the same three tokens in different orders.
docs = [
    ["我", "爱", "你"],
    ["你", "爱", "我"],
    # Must be three separate tokens: as the single token "爱你我" it added a
    # fourth vocabulary entry and produced a different vector, contradicting
    # the conclusion printed below.
    ["爱", "你", "我"],
]

vocab, bow_matrix = simple_bow(docs)

# Show each document as its tokens joined back into a sentence.
print("文档:")
for i, doc in enumerate(docs, start=1):
    print(f" Doc{i}: {''.join(doc)}")
print()

# One count vector per document; columns follow the order of `vocab`.
print("BoW矩阵:")
for i, vec in enumerate(bow_matrix, start=1):
    print(f" Doc{i}: {vec}")
print()

print(f"词表: {vocab}")
print()

# Takeaway: identical vectors despite different word order and meaning.
print("问题:这三个完全不同的句子,BoW向量完全相同!")
print("Doc1: 我爱你(表达爱意)")
print("Doc2: 你爱我(对方爱我)")
print("Doc3: 爱你我(意义不明)")
print()
print("结论:BoW模型丢失了词序信息!")