"""Worked answers to seven text-vectorisation exercises.

Covers character codes, basic vector arithmetic, cosine similarity,
bag-of-words vectors and a pure-Python TF-IDF. Running the file prints every
answer in order.
"""

import math
from collections import Counter


def _dot(left, right):
    """Return the dot product of two equal-length numeric sequences."""
    return sum(a * b for a, b in zip(left, right))


def _norm(vector):
    """Return the Euclidean length of *vector*."""
    return sum(x ** 2 for x in vector) ** 0.5


def _cosine_similarity(left, right):
    """Return the cosine of the angle between *left* and *right*.

    Raises:
        ValueError: if either vector has zero length, because the cosine is
            undefined in that case.
    """
    scale = _norm(left) * _norm(right)
    if scale == 0:
        raise ValueError("cosine similarity is undefined for a zero vector")
    return _dot(left, right) / scale


def _bow_vector(doc, vocab):
    """Return the bag-of-words count vector of *doc* over *vocab*.

    *doc* is split on whitespace; words missing from *doc* count as 0.
    """
    counts = Counter(doc.split())
    return [counts[word] for word in vocab]


def tf(word, doc):
    """Return the term frequency of *word* in the whitespace-tokenised *doc*.

    The frequency is the number of occurrences of *word* divided by the
    number of tokens in *doc*. An empty (or whitespace-only) document has no
    tokens, so 0.0 is returned instead of dividing by zero.
    """
    tokens = doc.split()
    if not tokens:
        return 0.0
    return tokens.count(word) / len(tokens)


def idf(word, docs):
    """Return the inverse document frequency of *word* over *docs*.

    Computed as ``log(len(docs) / (1 + df))`` where ``df`` is the number of
    documents containing *word* as a whitespace-separated token. The corpus
    size comes from the *docs* argument itself, so the function works for any
    corpus, not only the module-level one.

    With this formula the result is 0 when ``df == len(docs) - 1`` and
    negative when the word occurs in every document. An empty corpus
    returns 0.0.
    """
    total = len(docs)
    if total == 0:
        return 0.0
    df = sum(1 for doc in docs if word in doc.split())
    return math.log(total / (1 + df))


# Exercise 1: character <-> code point conversion.
print("题目1:")
for c in "Hello":
    print(ord(c))
print(chr(65))

# Exercise 2: written answer comparing image data and text data.
print("\n题目2:")
print("""1. 数据的表示形式:
图像是结构化的数值矩阵,每个像素都是数字,计算模型可以直接处理;
文本是非结构化的符号序列,每个字符/词的编码本身不包含直接可计算的数值关系。
2. 语义理解:
图像的数值矩阵直接对应像素信息,模型只需处理数值运算;
文本承载抽象语义、歧义、上下文依赖,无法直接用简单数值运算捕捉深层含义。""")

# Exercise 3: vector addition, scalar multiplication and length.
print("\n题目3:")
A = [3, 4]
B = [1, 2]
print("A + B =", [a + b for a, b in zip(A, B)])
print("2 × A =", [2 * x for x in A])
length_A = _norm(A)
print("A的长度(模) =", length_A)

# Exercise 4: dot product and cosine similarity.
print("\n题目4:")
A = [1, 2, 3]
B = [4, 5, 6]
dot_product = _dot(A, B)
cos_sim = _cosine_similarity(A, B)
print("A · B =", dot_product)
print("余弦相似度 =", cos_sim)

# Orthogonal unit vectors: the similarity is exactly 0.
A2 = [1, 0]
B2 = [0, 1]
cos_sim2 = _cosine_similarity(A2, B2)
print("A = [1,0], B = [0,1] 的余弦相似度 =", cos_sim2)
print("解释:两个向量正交,夹角为90度,余弦值为0,表示两者完全不相关。")

# Exercise 5: bag-of-words vectors over a sorted vocabulary.
print("\n题目5:")
docs = ["Python 是 编程 语言", "Java 是 编程 语言", "Python Python Python"]
vocab = sorted(set(" ".join(docs).split()))
vec1 = _bow_vector(docs[0], vocab)
vec2 = _bow_vector(docs[1], vocab)
vec3 = _bow_vector(docs[2], vocab)
print("词表:", vocab)
print("Doc1向量:", vec1)
print("Doc2向量:", vec2)
print("Doc3向量:", vec3)

# Exercise 6: written answer on the drawbacks of the bag-of-words model.
print("\n题目6:")
print("""BoW模型的缺点:
1. 忽略词序:只统计词频,不考虑词的顺序,无法区分语义,如“我打你”和“你打我”向量完全相同,会造成语义混淆。
2. 不区分词的重要性:高频停用词(如“是”)和低频关键词权重一样,无法突出关键信息,降低文本区分度。""")

# Bonus (exercise 7): TF-IDF in pure Python, without the sklearn dependency.
print("\n题目7(纯Python实现TF-IDF):")
docs = ["Python 编程", "Java 编程", "Python Python"]
vocab = sorted(set(" ".join(docs).split()))

# IDF depends only on the word and the corpus, so compute it once per word
# instead of once per (document, word) cell.
idf_by_word = {word: idf(word, docs) for word in vocab}
tfidf_matrix = [
    [tf(word, doc) * idf_by_word[word] for word in vocab]
    for doc in docs
]

print("词表:", vocab)
print("TF-IDF矩阵:")
for row in tfidf_matrix:
    print(row)