# Exercise answers: character codes, vector arithmetic, cosine similarity,
# bag-of-words vectors and a pure-Python TF-IDF.
import math


def _dot(u, v):
    """Return the dot product of two equal-length numeric sequences."""
    return sum(a * b for a, b in zip(u, v))


def _norm(v):
    """Return the Euclidean length (L2 norm) of a numeric sequence."""
    return sum(x ** 2 for x in v) ** 0.5


def _cosine(u, v):
    """Return the cosine similarity of u and v.

    Returns 0.0 when either vector has zero length, instead of dividing
    by zero.
    """
    denom = _norm(u) * _norm(v)
    if denom == 0:
        return 0.0
    return _dot(u, v) / denom


def _bow_vector(doc, vocab):
    """Return the count of every vocab word in a whitespace-tokenised doc."""
    tokens = doc.split()
    return [tokens.count(word) for word in vocab]


def tf(word, doc):
    """Return the term frequency of word in doc.

    doc is tokenised on whitespace; the result is the number of occurrences
    of word divided by the number of tokens. An empty document yields 0.0
    rather than raising ZeroDivisionError.
    """
    tokens = doc.split()
    if not tokens:
        return 0.0
    return tokens.count(word) / len(tokens)


def idf(word, docs):
    """Return the inverse document frequency log(n / (1 + df)) of word.

    n is len(docs) and df is the number of documents whose whitespace tokens
    contain word. The document count is taken from the docs argument itself,
    so the function gives correct results for any corpus passed in.

    With this formula a word found in n - 1 documents scores 0 and a word
    found in every document scores below 0.

    Raises:
        ValueError: if docs is empty.
    """
    n_docs = len(docs)
    if n_docs == 0:
        raise ValueError("docs must contain at least one document")
    df = sum(1 for doc in docs if word in doc.split())
    return math.log(n_docs / (1 + df))


# Exercise 1: character <-> code point conversion.
print("题目1:")
for c in "Hello":
    print(ord(c))
print(chr(65))

# Exercise 2: answer to the discussion question.
print("\n题目2:")
print(
    "1. 数据的表示形式: "
    "图像是结构化的数值矩阵,每个像素都是数字,计算模型可以直接处理; "
    "文本是非结构化的符号序列,每个字符/词的编码本身不包含直接可计算的数值关系。 "
    "2. 语义理解: "
    "图像的数值矩阵直接对应像素信息,模型只需处理数值运算; "
    "文本承载抽象语义、歧义、上下文依赖,无法直接用简单数值运算捕捉深层含义。"
)

# Exercise 3: vector addition, scalar multiplication and length.
print("\n题目3:")
A = [3, 4]
B = [1, 2]
print("A + B =", [a + b for a, b in zip(A, B)])
print("2 × A =", [2 * x for x in A])
length_A = _norm(A)
print("A的长度(模) =", length_A)

# Exercise 4: dot product and cosine similarity.
print("\n题目4:")
A = [1, 2, 3]
B = [4, 5, 6]
dot_product = _dot(A, B)
mod_A = _norm(A)
mod_B = _norm(B)
cos_sim = _cosine(A, B)
print("A · B =", dot_product)
print("余弦相似度 =", cos_sim)

# Orthogonal pair: the similarity is expected to be 0.
A2 = [1, 0]
B2 = [0, 1]
dot2 = _dot(A2, B2)
mod_A2 = _norm(A2)
mod_B2 = _norm(B2)
cos_sim2 = _cosine(A2, B2)
print("A = [1,0], B = [0,1] 的余弦相似度 =", cos_sim2)
print("解释:两个向量正交,夹角为90度,余弦值为0,表示两者完全不相关。")

# Exercise 5: bag-of-words vectors over a sorted vocabulary.
print("\n题目5:")
docs = ["Python 是 编程 语言", "Java 是 编程 语言", "Python Python Python"]
vocab = sorted(set(" ".join(docs).split()))
vec1 = _bow_vector(docs[0], vocab)
vec2 = _bow_vector(docs[1], vocab)
vec3 = _bow_vector(docs[2], vocab)
print("词表:", vocab)
print("Doc1向量:", vec1)
print("Doc2向量:", vec2)
print("Doc3向量:", vec3)

# Exercise 6: answer to the discussion question.
# The line break inside the text is kept exactly where the source has it.
print("\n题目6:")
print(
    "BoW模型的缺点: "
    "1. 忽略词序:只统计词频,不考虑词的顺序,无法区分语义,"
    "如“我打你”和“你打我”向量完全相同,会造成语义混淆。 "
    "2. \n"
    "不区分词的重要性:高频停用词(如“是”)和低频关键词权重一样,"
    "无法突出关键信息,降低文本区分度。"
)

# Bonus exercise (Exercise 7): TF-IDF core logic in pure Python, without the
# sklearn dependency.
print("\n题目7(纯Python实现TF-IDF):")
docs = ["Python 编程", "Java 编程", "Python Python"]
vocab = sorted(set(" ".join(docs).split()))
N = len(docs)  # kept as a module-level name; idf() no longer depends on it

# One row per document, one column per vocabulary word.
tfidf_matrix = [
    [tf(word, doc) * idf(word, docs) for word in vocab]
    for doc in docs
]

print("词表:", vocab)
print("TF-IDF矩阵:")
for row in tfidf_matrix:
    print(row)