上传文件至 /
This commit is contained in:
28
XWL2.py
Normal file
28
XWL2.py
Normal file
@@ -0,0 +1,28 @@
|
||||
from collections import Counter

# Toy corpus: each document is a string whose tokens are separated by spaces,
# so a plain str.split() is enough to tokenise it.
docs = [
    "Python 是 编程 语言",
    "Java 是 编程 语言",
    "Python Python Python",
]

# Every token from every document, duplicates included.
all_words = [word for doc in docs for word in doc.split()]

# The vocabulary is the set of distinct tokens; sorting fixes the column
# order of the vectors built below.
vocab = sorted(set(all_words))
print("词表(手动实现):", vocab)

# One bag-of-words vector per document: entry j is how many times vocab[j]
# occurs in that document (0 when it does not occur at all).
bow_vectors = []
for doc in docs:
    # Count each document once instead of rescanning it for every vocab word.
    counts = Counter(doc.split())
    bow_vectors.append([counts[word] for word in vocab])

print("\n每个文档的BoW向量(手动实现):")
for i, vec in enumerate(bow_vectors, start=1):
    print(f"Doc{i}: {vec}")
|
||||
from sklearn.feature_extraction.text import CountVectorizer

# Same bag-of-words computation, this time delegated to scikit-learn.
# NOTE: CountVectorizer is used with its default settings. Per the sklearn
# documentation the defaults lowercase the text and only keep tokens of two
# or more word characters, so this vocabulary can differ from one built by
# splitting on spaces (single-character tokens are dropped).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)
print("\n词表(sklearn实现):", vectorizer.get_feature_names_out())

# X is converted to a dense array so each document's row can be printed.
print("\n每个文档的BoW向量(sklearn实现):")
for i, vec in enumerate(X.toarray()):
    print(f"Doc{i+1}: {vec}")
|
||||
# Item 6 (presumably the exercise/question number - confirm): print two
# limitations of the bag-of-words model - it ignores word order, and it
# treats words as independent symbols with no semantic relations.
print("1忽略词序信息:无法区分语序不同但词频相同的文本,会丢失语义逻辑。2不理解词语语义关联:将词视为独立符号,无法捕捉同义词、近义词的关系。")
|
||||
Reference in New Issue
Block a user