"""Two small teaching scripts reconstructed from a whitespace-collapsed patch.

Section 1 (4.21.py) prints the code point of each character of a string.
Section 2 (4.23.py) demonstrates that a bag-of-words (BoW) representation
discards word order.
"""
import numpy as np  # imported by 4.23.py; not used by the code visible here


# ---------------------------------------------------------------------------
# 4.21.py: print the code point of every character in ``text``.
# ---------------------------------------------------------------------------
text = ".,+*"

for char in text:
    print(f"'{char}'的ASCII码是:{ord(char)}")


# ---------------------------------------------------------------------------
# 4.23.py: bag-of-words ignores word order.
# ---------------------------------------------------------------------------
print("=" * 50)
print("BoW忽略词序的演示")
print("=" * 50)


def simple_bow(docs):
    """Build a bag-of-words count matrix for tokenised documents.

    Args:
        docs: An iterable of documents, each an iterable of hashable,
            mutually orderable tokens (e.g. a list of strings).

    Returns:
        A ``(vocab, bow_matrix)`` tuple. ``vocab`` is the sorted list of
        distinct tokens; ``bow_matrix`` holds one list of counts per
        document, where column ``j`` counts occurrences of ``vocab[j]``.
    """
    vocab = sorted({word for doc in docs for word in doc})
    # Precompute token -> column so each lookup is O(1) rather than a
    # linear ``list.index`` scan over the vocabulary.
    column_of = {word: col for col, word in enumerate(vocab)}

    bow_matrix = []
    for doc in docs:
        vec = [0] * len(vocab)
        for word in doc:
            # Every token is in the vocabulary by construction.
            vec[column_of[word]] += 1
        bow_matrix.append(vec)
    return vocab, bow_matrix


docs = [
    ["我", "爱", "你"],
    ["你", "爱", "我"],
    # Tokenised per character like the other documents. A single "爱你我"
    # token would get its own vocabulary column and a different vector,
    # contradicting the conclusion printed below.
    ["爱", "你", "我"],
]

vocab, bow_matrix = simple_bow(docs)

print("文档:")
for i, doc in enumerate(docs):
    print(f"  Doc{i+1}: {''.join(doc)}")
print()

print("BoW矩阵:")
for i, vec in enumerate(bow_matrix):
    print(f"  Doc{i+1}: {vec}")
print()

print(f"词表: {vocab}")
print()

print("问题:这三个完全不同的句子,BoW向量完全相同!")
print("Doc1: 我爱你(表达爱意)")
print("Doc2: 你爱我(对方爱我)")
print("Doc3: 爱你我(意义不明)")
print()
print("结论:BoW模型丢失了词序信息!")