Files
2026-04-23 15:58:58 +08:00

10 lines
343 B
Python

from sklearn.feature_extraction.text import CountVectorizer
# Sample corpus: one string per document.
# Each entry must be followed by a comma; adjacent string literals with no
# comma between them are concatenated by Python into a single string, which
# would collapse the whole corpus into one document.
# NOTE(review): "Pyhton" looks like a misspelling of "Python", but it may be a
# deliberate distinct token for this demo. Left unchanged; confirm intent.
docs = [
    "Pyhton是编程语言",
    "Java是编程语言",
    "Pyhton Python Pyhton",
]
# Bag-of-words vectorizer. The token pattern accepts any run of one or more
# word characters delimited by word boundaries, so single-character tokens
# are kept as well.
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")

# Fit on the corpus and transform it in a single step.
X = vectorizer.fit_transform(docs)

# Pull out the results first, then report them.
feature_names = vectorizer.get_feature_names_out()
dense_counts = X.toarray()

# Labels are printed verbatim: "词表" = vocabulary, "文档向量" = document vectors.
print("词表:", feature_names)
print("文档向量:\n", dense_counts)