完成作业

This commit is contained in:
2509165004
2026-04-23 15:48:56 +08:00
parent be0a8b60c9
commit e93e873ef8
3 changed files with 266 additions and 0 deletions

27
5.py Normal file
View File

@@ -0,0 +1,27 @@
# 1. Define the three sample documents
docs = [
    "Python 是 编程 语言",
    "Java 是 编程 语言",
    "Python Python Python",
]
# 2. Tokenize: str.split() with no argument splits on runs of whitespace
tokenized_docs = [doc.split() for doc in docs]
# 3. Build the vocabulary: de-duplicate with a set comprehension, then sort.
# sorted() accepts any iterable and always returns a new list, so wrapping
# the set in list() first is unnecessary.
vocab = sorted({word for doc in tokenized_docs for word in doc})
print("✅ 词表:", vocab)
# 4. Build the bag-of-words (BoW) vector
def bow_vector(doc_tokens, vocab_list):
    """Return the bag-of-words count vector of one document.

    Args:
        doc_tokens: Iterable of hashable tokens (e.g. ``str``) making up
            the document. Any iterable works, including generators.
        vocab_list: Vocabulary words; its order and length define the
            order and length of the returned vector.

    Returns:
        A list with one integer per entry of ``vocab_list``: the number of
        times that word occurs in ``doc_tokens`` (0 when it is absent).
    """
    # Function-scope import: the script has no import section to extend.
    from collections import Counter

    # Tally the tokens once instead of rescanning the whole document for
    # every vocabulary word.
    counts = Counter(doc_tokens)
    # Counter returns 0 for missing keys, so absent words need no special case.
    return [counts[word] for word in vocab_list]
# Compute the BoW vector of each of the first three documents
vec1, vec2, vec3 = (bow_vector(tokenized_docs[idx], vocab) for idx in range(3))
# Print a header line, then one labelled vector per document
print("\n📌 各文档BoW向量")
for label, vector in (("Doc1", vec1), ("Doc2", vec2), ("Doc3", vec3)):
    print(f"{label}: {vector}")