# Homework solutions: character encoding, vector math, cosine similarity,
# bag-of-words, and a pure-Python TF-IDF implementation.
# Exercise 1: character <-> code-point conversion with ord() and chr().
print("题目1:")
# Print the Unicode code point of each character in "Hello".
for ch in "Hello":
    print(ord(ch))
# Code point 65 maps back to the letter "A".
print(chr(65))
# Exercise 2: discussion answer (why images are easier to process than text).
print("\n题目2:")
_answer2 = """1. 数据的表示形式:
图像是结构化的数值矩阵,每个像素都是数字,计算模型可以直接处理;
文本是非结构化的符号序列,每个字符/词的编码本身不包含直接可计算的数值关系。
2. 语义理解:
图像的数值矩阵直接对应像素信息,模型只需处理数值运算;
文本承载抽象语义、歧义、上下文依赖,无法直接用简单数值运算捕捉深层含义。"""
print(_answer2)
# Exercise 3: elementwise vector arithmetic and Euclidean length.
print("\n题目3:")
A = [3, 4]
B = [1, 2]
# Vector addition and scalar multiplication, component by component.
print("A + B =", [u + v for u, v in zip(A, B)])
print("2 × A =", [x * 2 for x in A])
# |A| = sqrt(sum of squared components) — the Euclidean norm.
length_A = sum(x * x for x in A) ** 0.5
print("A的长度(模) =", length_A)
# Exercise 4: dot product and cosine similarity of two 3-D vectors.
print("\n题目4:")
A = [1, 2, 3]
B = [4, 5, 6]


def _dot(u, v):
    # Inner product of two equal-length numeric vectors.
    return sum(a * b for a, b in zip(u, v))


dot_product = _dot(A, B)
# A vector's magnitude is the square root of its dot product with itself.
mod_A = _dot(A, A) ** 0.5
mod_B = _dot(B, B) ** 0.5
# cos(theta) = (A·B) / (|A| |B|)
cos_sim = dot_product / (mod_A * mod_B)
print("A · B =", dot_product)
print("余弦相似度 =", cos_sim)
# Exercise 4 (cont.): orthogonal vectors have cosine similarity 0.
A2 = [1, 0]
B2 = [0, 1]
dot2 = sum(p * q for p, q in zip(A2, B2))
mod_A2 = sum(p * p for p in A2) ** 0.5
mod_B2 = sum(q * q for q in B2) ** 0.5
cos_sim2 = dot2 / (mod_A2 * mod_B2)
print("A = [1,0], B = [0,1] 的余弦相似度 =", cos_sim2)
print("解释:两个向量正交,夹角为90度,余弦值为0,表示两者完全不相关。")
# Exercise 5: bag-of-words vectors over a shared, sorted vocabulary.
print("\n题目5:")
docs = ["Python 是 编程 语言", "Java 是 编程 语言", "Python Python Python"]
# Vocabulary: every distinct whitespace-separated token, in sorted order.
vocab = sorted(set(" ".join(docs).split()))


def _bow(doc):
    # Raw term-frequency vector of `doc` over the shared vocabulary.
    tokens = doc.split()
    return [tokens.count(word) for word in vocab]


vec1, vec2, vec3 = (_bow(d) for d in docs)
print("词表:", vocab)
print("Doc1向量:", vec1)
print("Doc2向量:", vec2)
print("Doc3向量:", vec3)
# Exercise 6: discussion answer (limitations of the bag-of-words model).
print("\n题目6:")
_answer6 = """BoW模型的缺点:
1. 忽略词序:只统计词频,不考虑词的顺序,无法区分语义,如“我打你”和“你打我”向量完全相同,会造成语义混淆。
2. 不区分词的重要性:高频停用词(如“是”)和低频关键词权重一样,无法突出关键信息,降低文本区分度。"""
print(_answer6)
# Bonus (Exercise 7): pure-Python TF-IDF (no sklearn dependency).
# Hoisted here so it is not re-executed on every idf() call.
import math

print("\n题目7(纯Python实现TF-IDF):")
docs = ["Python 编程", "Java 编程", "Python Python"]
vocab = sorted(set(" ".join(docs).split()))
N = len(docs)  # kept for backward compatibility with code reading the global


def tf(word, doc):
    """Term frequency: fraction of whitespace-split tokens in `doc` equal to `word`.

    Returns 0.0 for an empty document instead of raising ZeroDivisionError.
    """
    tokens = doc.split()
    if not tokens:
        return 0.0
    return tokens.count(word) / len(tokens)


def idf(word, docs):
    """Smoothed inverse document frequency of `word` across `docs`.

    Computes log(len(docs) / (1 + df)); the +1 keeps the denominator
    nonzero for words that appear in no document.

    Fix: previously this divided by the module-level global `N`, so calling
    it with any corpus other than the global `docs` silently used the wrong
    document count. Using len(docs) matches the parameter actually passed
    (identical result for this script, where N == len(docs)).
    """
    df = sum(1 for doc in docs if word in doc.split())
    return math.log(len(docs) / (1 + df))


# Document-term TF-IDF matrix: one row per document, one column per vocab word.
tfidf_matrix = [[tf(word, doc) * idf(word, docs) for word in vocab] for doc in docs]

print("词表:", vocab)
print("TF-IDF矩阵:")
for row in tfidf_matrix:
    print(row)