# Files
# task-3-2-1-Text-Processing-…/4.23.py
# 2509165003 ad1cffb0e1 4.23
# 2026-04-23 15:59:07 +08:00
#
# 47 lines
# 1.1 KiB
# Python
# Raw Blame History
#
# This file contains ambiguous Unicode characters
#
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import numpy as np
# Title banner: a 50-character rule above and below the demo heading.
print("=" * 50, "BoW忽略词序的演示", "=" * 50, sep="\n")
def simple_bow(docs):
    """Build a bag-of-words count matrix for tokenized documents.

    Args:
        docs: Sequence of documents, each an iterable of tokens. Tokens
            must be hashable and mutually sortable (strings in this demo).
            ``docs`` is traversed twice, so it must be re-iterable.

    Returns:
        A ``(vocab, bow_matrix)`` tuple. ``vocab`` is the sorted list of
        distinct tokens across all documents; ``bow_matrix`` has one list
        per document whose j-th entry counts occurrences of ``vocab[j]``.
    """
    vocab = sorted({word for doc in docs for word in doc})
    # Map each token to its column once, instead of a linear
    # ``vocab.index`` scan for every token occurrence.
    column_of = {word: col for col, word in enumerate(vocab)}

    bow_matrix = []
    for doc in docs:
        vec = [0] * len(vocab)
        for word in doc:
            # Every token is in the vocabulary by construction, so no
            # membership check is needed before the lookup.
            vec[column_of[word]] += 1
        bow_matrix.append(vec)
    return vocab, bow_matrix
# Three orderings of the same three tokens. Each document is tokenized
# per character so that all three share one vocabulary; this matches the
# sentences narrated in the printed summary below (我爱你 / 你爱我 / 爱你我).
docs = [
    ["我", "爱", "你"],
    ["你", "爱", "我"],
    ["爱", "你", "我"],
]
vocab, bow_matrix = simple_bow(docs)

# Show each document as the sentence formed by joining its tokens.
print("文档:")
for i, doc in enumerate(docs, start=1):
    print(f" Doc{i}: {''.join(doc)}")
print()

# One count vector per document, with columns ordered as in `vocab`.
print("BoW矩阵")
for i, vec in enumerate(bow_matrix, start=1):
    print(f" Doc{i}: {vec}")
print()

print(f"词表: {vocab}")
print()

# Narrated takeaway: the vectors are identical although the word order
# (and hence the meaning) of the three sentences differs.
print("问题这三个完全不同的句子BoW向量完全相同")
print("Doc1: 我爱你(表达爱意)")
print("Doc2: 你爱我(对方爱我)")
print("Doc3: 爱你我(意义不明)")
print()
print("结论BoW模型丢失了词序信息")