Files
task-3-2-1-Text-Processing-…/5.py
2026-04-23 15:48:56 +08:00

27 lines
735 B
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# 1. Define the three sample documents (tokens are pre-separated by spaces).
docs = [
    "Python 是 编程 语言",
    "Java 是 编程 语言",
    "Python Python Python",
]
# 2. Tokenize: split every document on whitespace.
tokenized_docs = [doc.split() for doc in docs]
# 3. Build the vocabulary: a set comprehension removes duplicates, and sorting
#    gives every BoW vector the same reproducible column order.
vocab = sorted({word for doc in tokenized_docs for word in doc})
print("✅ 词表:", vocab)
# 4. Build a Bag-of-Words (BoW) vector.
def bow_vector(doc_tokens, vocab_list):
    """Return the Bag-of-Words count vector of one tokenized document.

    Args:
        doc_tokens: Iterable of hashable tokens, e.g. the word list of a
            single document.
        vocab_list: Ordered vocabulary; its order fixes the position of
            each count in the result.

    Returns:
        A list with one int per entry of ``vocab_list``: the number of times
        that entry occurs in ``doc_tokens`` (0 when it does not occur).
    """
    # Imported locally so this function does not depend on a file-level import.
    from collections import Counter

    # Tally the document once instead of rescanning it for every vocabulary
    # word; Counter returns 0 for tokens it has never seen.
    counts = Counter(doc_tokens)
    return [counts[word] for word in vocab_list]
# Compute the BoW vector of each of the first three documents, in order.
vec1, vec2, vec3 = (
    bow_vector(tokenized_docs[index], vocab) for index in range(3)
)
print("\n📌 各文档BoW向量")
# Print one labelled line per document: "Doc<N>: <vector>".
for label, vector in (("Doc1", vec1), ("Doc2", vec2), ("Doc3", vec3)):
    print(f"{label}: {vector}")