# Python · 28 lines · 1.0 KiB
# Bag-of-words (BoW) demo: build a vocabulary and per-document count vectors
# by hand, then produce the same kind of output with scikit-learn's
# CountVectorizer for comparison.

from collections import Counter

# Sample corpus; tokens are already separated by single spaces.
docs = [
    "Python 是 编程 语言",
    "Java 是 编程 语言",
    "Python Python Python",
]


def build_vocab(documents):
    """Return the sorted list of distinct whitespace-separated tokens.

    Args:
        documents: iterable of strings; each is tokenized with ``str.split()``.

    Returns:
        A list of unique tokens in ascending (code point) order. Empty input
        yields an empty list.
    """
    return sorted({token for doc in documents for token in doc.split()})


def vectorize(documents, vocabulary):
    """Return one term-count vector per document.

    Args:
        documents: iterable of strings; each is tokenized with ``str.split()``.
        vocabulary: sequence of tokens that fixes the column order.

    Returns:
        A list of lists of ints; entry ``[i][j]`` is how many times
        ``vocabulary[j]`` occurs in ``documents[i]``.
    """
    vectors = []
    for doc in documents:
        # Count each document once instead of rescanning it per vocabulary word.
        counts = Counter(doc.split())
        vectors.append([counts[token] for token in vocabulary])
    return vectors


def main():
    """Print the manual and the scikit-learn BoW results for ``docs``."""
    vocab = build_vocab(docs)
    print("词表(手动实现):", vocab)

    bow_vectors = vectorize(docs, vocab)
    print("\n每个文档的BoW向量(手动实现):")
    for i, vec in enumerate(bow_vectors):
        print(f"Doc{i+1}: {vec}")

    # Imported here so the manual implementation above stays usable when
    # scikit-learn is not installed.
    from sklearn.feature_extraction.text import CountVectorizer

    # NOTE: with default settings CountVectorizer lowercases the text and only
    # keeps tokens of two or more word characters, so single-character tokens
    # such as "是" are dropped and this vocabulary differs from the manual one.
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(docs)
    print("\n词表(sklearn实现):", vectorizer.get_feature_names_out())
    print("\n每个文档的BoW向量(sklearn实现):")
    for i, vec in enumerate(X.toarray()):
        print(f"Doc{i+1}: {vec}")

    # Part 6 (marked "#6#" in the original): limitations of the BoW model.
    print("1忽略词序信息:无法区分语序不同但词频相同的文本,会丢失语义逻辑。2不理解词语语义关联:将词视为独立符号,无法捕捉同义词、近义词的关系。")


if __name__ == "__main__":
    main()