This commit is contained in:
2509165003
2026-04-23 15:59:07 +08:00
parent df690257aa
commit ad1cffb0e1
2 changed files with 48 additions and 1 deletions

View File

@@ -1,4 +1,4 @@
# Print the ASCII/Unicode code point of each punctuation character.
# (An earlier `text = "hello"` assignment was overwritten before use and
# has been dropped.)
text = ".,+*"
for char in text:
    # ord() returns the integer code point of a one-character string.
    print(f"'{char}'的ASCII码是:{ord(char)}")

47
4.23.py Normal file
View File

@@ -0,0 +1,47 @@
import numpy as np
# Demo banner: the title framed above and below by a 50-character rule.
print("=" * 50, "BoW忽略词序的演示", "=" * 50, sep="\n")
def simple_bow(docs):
    """Build a bag-of-words count matrix for already-tokenized documents.

    Args:
        docs: A sequence of documents, each an iterable of hashable,
            mutually orderable tokens (e.g. strings). It is traversed
            twice: once to collect the vocabulary, once to count.

    Returns:
        A ``(vocab, bow_matrix)`` tuple. ``vocab`` is the sorted list of
        distinct tokens across all documents. ``bow_matrix`` holds one list
        per document, where entry ``j`` is the number of times ``vocab[j]``
        occurs in that document.
    """
    vocab = sorted({word for doc in docs for word in doc})
    # Resolve each token's column once, instead of scanning the vocabulary
    # list (`in` + `.index`) for every token of every document.
    column_of = {word: col for col, word in enumerate(vocab)}

    bow_matrix = []
    for doc in docs:
        vec = [0] * len(vocab)
        for word in doc:
            # Every token is in the vocabulary by construction, so no
            # membership check is needed here.
            vec[column_of[word]] += 1
        bow_matrix.append(vec)
    return vocab, bow_matrix
# Three sentences made of the same three single-character tokens in
# different orders. The previous data (empty-string tokens and one untokenized
# "爱你我") produced differing vectors, contradicting the claim printed below
# that all three bag-of-words vectors are identical.
docs = [
    ["我", "爱", "你"],
    ["你", "爱", "我"],
    ["爱", "你", "我"],
]
vocab, bow_matrix = simple_bow(docs)

# Show each document as its tokens joined back into a sentence.
print("文档:")
for number, doc in enumerate(docs, start=1):
    print(f" Doc{number}: {''.join(doc)}")
print()

# Show one count vector per document; columns follow the order of `vocab`.
print("BoW矩阵")
for number, vec in enumerate(bow_matrix, start=1):
    print(f" Doc{number}: {vec}")
print()

print(f"词表: {vocab}")
print()

# Summary: identical vectors despite different word order and meaning.
print("问题这三个完全不同的句子BoW向量完全相同")
print("Doc1: 我爱你(表达爱意)")
print("Doc2: 你爱我(对方爱我)")
print("Doc3: 爱你我(意义不明)")
print()
print("结论BoW模型丢失了词序信息")