import numpy as np

print("=" * 50)
print("BoW忽略词序的演示")
print("=" * 50)


def simple_bow(docs):
    """Build a bag-of-words count matrix for pre-tokenized documents.

    Args:
        docs: A re-iterable sequence of documents, each document being a
            sequence of hashable, mutually comparable tokens (e.g. a list
            of strings). It is iterated twice, so a one-shot generator is
            not suitable.

    Returns:
        A ``(vocab, bow_matrix)`` tuple. ``vocab`` is the sorted list of
        distinct tokens across all documents. ``bow_matrix`` holds one row
        per document; ``bow_matrix[i][j]`` is the number of times
        ``vocab[j]`` occurs in ``docs[i]``.
    """
    vocab = sorted({word for doc in docs for word in doc})
    # Map each token to its column once, so counting is a dict lookup
    # instead of a linear list scan (`in` + `.index`) per token.
    column_of = {word: col for col, word in enumerate(vocab)}

    bow_matrix = []
    for doc in docs:
        vec = [0] * len(vocab)
        for word in doc:
            vec[column_of[word]] += 1
        bow_matrix.append(vec)
    return vocab, bow_matrix


# Three sentences made of the same three tokens in different orders.
# Each document must be segmented into individual tokens: if the third
# one were the single token "爱你我", it would get its own vocabulary
# entry and a different vector, defeating the demonstration below.
docs = [
    ["我", "爱", "你"],
    ["你", "爱", "我"],
    ["爱", "你", "我"],
]

vocab, bow_matrix = simple_bow(docs)

print("文档:")
for i, doc in enumerate(docs):
    print(f" Doc{i+1}: {''.join(doc)}")
print()

print("BoW矩阵:")
for i, vec in enumerate(bow_matrix):
    print(f" Doc{i+1}: {vec}")
print()

print(f"词表: {vocab}")
print()

print("问题:这三个完全不同的句子,BoW向量完全相同!")
print("Doc1: 我爱你(表达爱意)")
print("Doc2: 你爱我(对方爱我)")
print("Doc3: 爱你我(意义不明)")
print()
print("结论:BoW模型丢失了词序信息!")