4.23
This commit is contained in:
47
4.23.py
Normal file
47
4.23.py
Normal file
@@ -0,0 +1,47 @@
|
||||
import numpy as np
|
||||
|
||||
# Title banner for the demo: a 50-character rule above and below the heading.
print("=" * 50, "BoW忽略词序的演示", "=" * 50, sep="\n")
|
||||
|
||||
def simple_bow(docs):
    """Build a bag-of-words count matrix for already-tokenized documents.

    Args:
        docs: A re-iterable collection (it is traversed twice) of documents,
            each an iterable of hashable, mutually comparable tokens.

    Returns:
        A ``(vocab, bow_matrix)`` tuple. ``vocab`` is the sorted list of
        distinct tokens across all documents. ``bow_matrix`` holds one list
        per document, where entry ``j`` is the number of times ``vocab[j]``
        occurs in that document.
    """
    vocab = sorted({word for doc in docs for word in doc})

    # Map each token to its column once, instead of scanning the vocabulary
    # list (membership test + ``list.index``) for every token occurrence.
    column_of = {word: col for col, word in enumerate(vocab)}

    bow_matrix = []
    for doc in docs:
        vec = [0] * len(vocab)
        for word in doc:
            # Every token is in the vocabulary by construction, so no
            # membership guard is needed.
            vec[column_of[word]] += 1
        bow_matrix.append(vec)
    return vocab, bow_matrix
|
||||
|
||||
# Demo corpus: three sentences built from the same three tokens in different
# orders, so that their bag-of-words vectors coincide.
docs = [
    ["我", "爱", "你"],
    ["你", "爱", "我"],
    # Tokenized per character like the other two documents. As a single
    # "爱你我" token it would get its own vocabulary entry and a different
    # vector, contradicting the conclusion printed below.
    ["爱", "你", "我"],
]

vocab, bow_matrix = simple_bow(docs)

# Show each document as its original sentence.
print("文档:")
for i, doc in enumerate(docs):
    print(f" Doc{i+1}: {''.join(doc)}")
print()

# Show the count vector for each document; all three rows are identical.
print("BoW矩阵:")
for i, vec in enumerate(bow_matrix):
    print(f" Doc{i+1}: {vec}")
print()

print(f"词表: {vocab}")
print()

# Summarize the takeaway: word order is not recoverable from the vectors.
print("问题:这三个完全不同的句子,BoW向量完全相同!")
print("Doc1: 我爱你(表达爱意)")
print("Doc2: 你爱我(对方爱我)")
print("Doc3: 爱你我(意义不明)")
print()
print("结论:BoW模型丢失了词序信息!")
|
||||
Reference in New Issue
Block a user