上传文件至 /

This commit is contained in:
2026-04-23 16:04:06 +08:00
parent 5354d6387b
commit 16cfde4384
2 changed files with 263 additions and 33 deletions

28
XWL2.py Normal file
View File

@@ -0,0 +1,28 @@
from collections import Counter

# Toy corpus: every document is a string of tokens separated by whitespace.
docs = [
    "Python 是 编程 语言",
    "Java 是 编程 语言",
    "Python Python Python",
]

# Tokenize each document exactly once; both the vocabulary and the
# per-document counts below are derived from this single pass.
tokenized_docs = [doc.split() for doc in docs]

# Flat list of every token occurrence in the corpus (duplicates included).
all_words = [word for words in tokenized_docs for word in words]

# Vocabulary: the distinct tokens in sorted order, so that column j of every
# BoW vector always refers to vocab[j].
vocab = sorted(set(all_words))
print("词表(手动实现):", vocab)

# One count vector per document. A Counter is built once per document and
# returns 0 for tokens the document does not contain, which avoids rescanning
# the token list for every vocabulary entry.
bow_vectors = []
for words in tokenized_docs:
    counts = Counter(words)
    bow_vectors.append([counts[word] for word in vocab])

print("\n每个文档的BoW向量(手动实现):")
for i, vec in enumerate(bow_vectors, start=1):
    print(f"Doc{i}: {vec}")
from sklearn.feature_extraction.text import CountVectorizer

# Cross-check the bag-of-words counts with scikit-learn.
# NOTE: with its default settings CountVectorizer lowercases the input and
# only keeps tokens made of two or more word characters, so the vocabulary it
# learns is not expected to be identical to a plain whitespace split (for
# example, single-character tokens such as "是" are dropped and "Python"
# becomes "python").
vectorizer = CountVectorizer()

# fit_transform learns the vocabulary from `docs` and returns the
# document-term count matrix in sparse form.
X = vectorizer.fit_transform(docs)
print("\n词表(sklearn实现):", vectorizer.get_feature_names_out())

print("\n每个文档的BoW向量(sklearn实现):")
# Convert to a dense array so each document's row can be printed directly.
for i, vec in enumerate(X.toarray()):
    print(f"Doc{i+1}: {vec}")
# Item 6 (presumably an exercise number): print two limitations of the
# bag-of-words model -- it ignores word order, and it does not capture
# semantic relations between words. The message is split into two adjacent
# literals purely for line length; the printed text is one string.
print(
    "1忽略词序信息:无法区分语序不同但词频相同的文本,会丢失语义逻辑。"
    "2不理解词语语义关联:将词视为独立符号,无法捕捉同义词、近义词的关系。"
)