4.23

2026-04-23 15:59:07 +08:00
parent df690257aa
commit ad1cffb0e1
2 changed files with 48 additions and 1 deletions
--- a/4.21.py
+++ b/4.21.py
@@ -1,4 +1,4 @@
-text = "hello"
+text = ".,+*"
 for char in text:
    print(f"'{char}'的ASCII码是:{ord(char)}")

--- a/4.23.py
+++ b/4.23.py
@@ -0,0 +1,47 @@
+import numpy as np
+
+# Demo banner: the title framed above and below by a 50-character rule,
+# emitted with one print call (sep="\n" puts each part on its own line).
+print("=" * 50, "BoW忽略词序的演示", "=" * 50, sep="\n")
+
+def simple_bow(docs):
+    """Return (sorted vocab, per-document count vectors) for tokenized docs."""
+    # docs and each doc must be re-iterable: scanned once for the vocab, once to count.
+    vocab = sorted({word for doc in docs for word in doc})
+    # Map token -> column once so counting avoids repeated linear list.index scans.
+    column = {word: j for j, word in enumerate(vocab)}
+    bow_matrix = []
+    for doc in docs:
+        vec = [0] * len(vocab)
+        for word in doc:
+            vec[column[word]] += 1
+        bow_matrix.append(vec)
+    return vocab, bow_matrix
+
+docs = [
+    ["我", "爱", "你"],  # Doc1
+    ["你", "爱", "我"],  # Doc2: same tokens as Doc1, different order
+    ["爱", "你", "我"],  # Doc3: split into tokens too; one "爱你我" string would be its own vocab entry
+]
+
+# Vectorize the sample documents, then print them, their count vectors and the vocabulary.
+vocab, bow_matrix = simple_bow(docs)
+
+print("文档：")
+for number, tokens in enumerate(docs, start=1):  # 1-based labels: Doc1, Doc2, ...
+    print(f"  Doc{number}: {''.join(tokens)}")
+print()
+print("BoW矩阵：")
+for number, counts in enumerate(bow_matrix, start=1):
+    print(f"  Doc{number}: {counts}")
+print()
+print(f"词表: {vocab}", end="\n\n")  # end="\n\n" leaves one blank line after the vocabulary
+print(
+    "问题：这三个完全不同的句子，BoW向量完全相同！",
+    "Doc1: 我爱你（表达爱意）",
+    "Doc2: 你爱我（对方爱我）",
+    "Doc3: 爱你我（意义不明）",
+    "",  # blank line before the conclusion
+    "结论：BoW模型丢失了词序信息！",
+    sep="\n",
+)