上传文件至 /

This commit is contained in:
2026-04-23 16:01:30 +08:00
parent 590fc71e59
commit 0b17b74b3e

87
40吴承恩.py Normal file
View File

@@ -0,0 +1,87 @@
# Problem 1: print the Unicode code point of every character in "Hello",
# then the character whose code point is 65.
print("题目1")
for code_point in map(ord, "Hello"):
    print(code_point)
print(chr(65))
# Problem 2: written answer to the discussion question (images vs. text as
# model input). The answer text is printed verbatim.
print("\n题目2")
answer_2 = """1. 数据的表示形式:
图像是结构化的数值矩阵,每个像素都是数字,计算模型可以直接处理;
文本是非结构化的符号序列,每个字符/词的编码本身不包含直接可计算的数值关系。
2. 语义理解:
图像的数值矩阵直接对应像素信息,模型只需处理数值运算;
文本承载抽象语义、歧义、上下文依赖,无法直接用简单数值运算捕捉深层含义。"""
print(answer_2)
# Problem 3: element-wise sum, scalar multiple and Euclidean length of
# small 2-D vectors.
print("\n题目3")
A = [3, 4]
B = [1, 2]
elementwise_sum = [left + right for left, right in zip(A, B)]
doubled_A = [component * 2 for component in A]
print("A + B =", elementwise_sum)
print("2 × A =", doubled_A)
# Length = square root of the sum of squared components.
length_A = sum(component ** 2 for component in A) ** 0.5
print("A的长度 =", length_A)
# Problem 4: dot product and cosine similarity for two pairs of vectors.
print("\n题目4")


def _dot(u, v):
    """Return the dot product of two vectors (paired element by element)."""
    return sum(p * q for p, q in zip(u, v))


def _norm(v):
    """Return the Euclidean length of a vector."""
    return sum(p ** 2 for p in v) ** 0.5


# First pair: two 3-D vectors.
A = [1, 2, 3]
B = [4, 5, 6]
dot_product = _dot(A, B)
mod_A = _norm(A)
mod_B = _norm(B)
cos_sim = dot_product / (mod_A * mod_B)
print("A · B =", dot_product)
print("余弦相似度 =", cos_sim)

# Second pair: the two 2-D unit axis vectors, whose dot product is 0.
A2 = [1, 0]
B2 = [0, 1]
dot2 = _dot(A2, B2)
mod_A2 = _norm(A2)
mod_B2 = _norm(B2)
cos_sim2 = dot2 / (mod_A2 * mod_B2)
print("A = [1,0], B = [0,1] 的余弦相似度 =", cos_sim2)
print("解释两个向量正交夹角为90度余弦值为0表示两者完全不相关。")
# Problem 5: bag-of-words count vectors for three documents over a shared,
# sorted vocabulary.
print("\n题目5")
docs = ["Python 是 编程 语言", "Java 是 编程 语言", "Python Python Python"]
# Tokenise each document once (whitespace-separated words).
token_lists = [text.split() for text in docs]
# Vocabulary: every distinct token across all documents, in sorted order.
vocab = sorted({token for tokens in token_lists for token in tokens})
# One count vector per document: how often each vocabulary word occurs in it.
vec1, vec2, vec3 = (
    [tokens.count(term) for term in vocab] for tokens in token_lists
)
print("词表:", vocab)
for label, vector in (("Doc1向量", vec1), ("Doc2向量", vec2), ("Doc3向量", vec3)):
    print(label, vector)
# Problem 6: written answer to the discussion question (drawbacks of the
# bag-of-words model). The answer text is printed verbatim.
print("\n题目6")
answer_6 = """BoW模型的缺点
1. 忽略词序:只统计词频,不考虑词的顺序,无法区分语义,如“我打你”和“你打我”向量完全相同,会造成语义混淆。
2. 不区分词的重要性:高频停用词(如“是”)和低频关键词权重一样,无法突出关键信息,降低文本区分度。"""
print(answer_6)
# Bonus problem 7: drop the sklearn dependency and implement the core
# TF-IDF logic in pure Python.
print("\n题目7纯Python实现TF-IDF")
docs = ["Python 编程", "Java 编程", "Python Python"]
# Vocabulary: every distinct whitespace-separated token, in sorted order.
vocab = sorted({token for text in docs for token in text.split()})
# Number of documents in the corpus.
N = len(docs)
def tf(word, doc):
    """Return the term frequency of ``word`` in ``doc``.

    Args:
        word: The token to count (str).
        doc: A document given as whitespace-separated tokens (str).

    Returns:
        Occurrences of ``word`` divided by the total number of tokens in
        ``doc``, as a float. Returns 0.0 when ``doc`` contains no tokens,
        instead of dividing by zero.
    """
    tokens = doc.split()  # tokenise once; used for both count and length
    if not tokens:
        return 0.0
    return tokens.count(word) / len(tokens)
def idf(word, docs):
    """Return the smoothed inverse document frequency of ``word``.

    Computes ``log(len(docs) / (1 + df))`` where ``df`` is the number of
    documents whose whitespace-separated tokens include ``word``. The
    corpus size comes from the ``docs`` argument rather than a
    module-level variable, so any corpus can be passed in.

    With this formula the result is 0 when the word occurs in all but one
    document and negative when it occurs in every document.

    Args:
        word: The token to look up (str).
        docs: The corpus, a sequence of whitespace-tokenised strings.

    Returns:
        The IDF weight as a float.

    Raises:
        ValueError: If ``docs`` is empty (logarithm of zero).
    """
    import math

    doc_freq = sum(word in text.split() for text in docs)
    return math.log(len(docs) / (1 + doc_freq))
# Build the TF-IDF matrix: one row per document, one column per vocabulary
# word, each cell being tf(word, document) * idf(word, corpus).
tfidf_matrix = [
    [tf(term, text) * idf(term, docs) for term in vocab]
    for text in docs
]
print("词表:", vocab)
print("TF-IDF矩阵:")
for matrix_row in tfidf_matrix:
    print(matrix_row)