Files
task-3-2-1-Text-Processing-…/40吴承恩.py
2026-04-23 16:01:30 +08:00

87 lines
3.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# 题目1: character <-> code point conversion with ord() and chr().
print("题目1")
# Print the Unicode code point of each character in "Hello".
# (The loop body had lost its indentation in the original paste; restored.)
for c in "Hello":
    print(ord(c))
print(chr(65))  # code point 65 -> "A"
# 题目2: discussion answer — why images are easier for models than text.
print("\n题目2")
# Bind the prepared answer to a name, then emit it unchanged.
answer_text = """1. 数据的表示形式:
图像是结构化的数值矩阵,每个像素都是数字,计算模型可以直接处理;
文本是非结构化的符号序列,每个字符/词的编码本身不包含直接可计算的数值关系。
2. 语义理解:
图像的数值矩阵直接对应像素信息,模型只需处理数值运算;
文本承载抽象语义、歧义、上下文依赖,无法直接用简单数值运算捕捉深层含义。"""
print(answer_text)
# 题目3: elementwise vector addition, scalar multiplication, Euclidean length.
print("\n题目3")
A = [3, 4]
B = [1, 2]
# Elementwise sum via lockstep iteration.
vector_sum = [u + v for u, v in zip(A, B)]
print("A + B =", vector_sum)
# Scale every component by 2.
scaled = [x * 2 for x in A]
print("2 × A =", scaled)
# Euclidean (L2) norm: sqrt of the sum of squared components.
length_A = sum(x ** 2 for x in A) ** 0.5
print("A的长度 =", length_A)
# 题目4
print("\n题目4")
A = [1, 2, 3]
B = [4, 5, 6]
dot_product = sum(a * b for a, b in zip(A, B))
mod_A = (sum(x**2 for x in A)) ** 0.5
mod_B = (sum(x**2 for x in B)) ** 0.5
cos_sim = dot_product / (mod_A * mod_B)
print("A · B =", dot_product)
print("余弦相似度 =", cos_sim)
A2 = [1, 0]
B2 = [0, 1]
dot2 = sum(a * b for a, b in zip(A2, B2))
mod_A2 = (sum(x**2 for x in A2)) ** 0.5
mod_B2 = (sum(x**2 for x in B2)) ** 0.5
cos_sim2 = dot2 / (mod_A2 * mod_B2)
print("A = [1,0], B = [0,1] 的余弦相似度 =", cos_sim2)
print("解释两个向量正交夹角为90度余弦值为0表示两者完全不相关。")
# 题目5: bag-of-words vectors for three tiny documents.
print("\n题目5")
from collections import Counter

docs = ["Python 是 编程 语言", "Java 是 编程 语言", "Python Python Python"]
# Vocabulary: every distinct token across all documents, sorted.
# (sorted() accepts the set directly; no intermediate list needed.)
vocab = sorted(set(" ".join(docs).split()))
# Count each document's tokens once with Counter instead of rescanning
# the document with .count() for every vocabulary word.
counts = [Counter(doc.split()) for doc in docs]
vec1, vec2, vec3 = ([c[word] for word in vocab] for c in counts)
print("词表:", vocab)
print("Doc1向量", vec1)
print("Doc2向量", vec2)
print("Doc3向量", vec3)
# 题目6: discussion answer — weaknesses of the bag-of-words model.
print("\n题目6")
# Bind the prepared answer to a name, then emit it unchanged.
bow_drawbacks = """BoW模型的缺点
1. 忽略词序:只统计词频,不考虑词的顺序,无法区分语义,如“我打你”和“你打我”向量完全相同,会造成语义混淆。
2. 不区分词的重要性:高频停用词(如“是”)和低频关键词权重一样,无法突出关键信息,降低文本区分度。"""
print(bow_drawbacks)
# 附加题 题目7: pure-Python TF-IDF core logic (no sklearn dependency).
print("\n题目7纯Python实现TF-IDF")
import math

docs = ["Python 编程", "Java 编程", "Python Python"]
vocab = sorted(set(" ".join(docs).split()))

def tf(word, doc):
    """Term frequency: occurrences of `word` over the token count of `doc`."""
    tokens = doc.split()  # split once instead of twice
    return tokens.count(word) / len(tokens)

def idf(word, docs):
    """Smoothed inverse document frequency: log(N / (1 + df)).

    N is derived from the `docs` argument (the original read a module-level
    global N, which silently broke the function for any other corpus), and
    `math` is imported at module level instead of on every call.
    """
    df = sum(1 for doc in docs if word in doc.split())
    return math.log(len(docs) / (1 + df))

# TF-IDF matrix: one row per document, one column per vocabulary word.
tfidf_matrix = [[tf(word, doc) * idf(word, docs) for word in vocab]
                for doc in docs]
print("词表:", vocab)
print("TF-IDF矩阵:")
for row in tfidf_matrix:
    print(row)