10 lines
343 B
Python
10 lines
343 B
Python
from sklearn.feature_extraction.text import CountVectorizer
|
|
# Tiny bag-of-words demo: build a vocabulary from a few short documents
# and print the per-document token counts.

# Each entry is one document.  The commas matter: adjacent string literals
# without a comma are implicitly concatenated by Python, which would collapse
# the whole corpus into a single document (and a single matrix row).
# NOTE(review): "Pyhton" looks like a misspelling of "Python"; it is kept
# as-is because it changes the resulting vocabulary -- confirm whether the
# typo is intentional for the demo.
docs = [
    "Pyhton是编程语言",
    "Java是编程语言",
    "Pyhton Python Pyhton",
]

# The custom token pattern accepts any run of one or more word characters
# between word boundaries, so single-character tokens are kept as well
# (scikit-learn's default pattern only keeps tokens of two or more characters).
# CJK characters count as word characters, so an unspaced run such as
# "Pyhton是编程语言" is treated as ONE token, not split into words.
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")

# Learn the vocabulary and build the document-term count matrix in one step;
# one row per document, one column per vocabulary entry.
X = vectorizer.fit_transform(docs)

# Vocabulary, in the same order as the matrix columns.
print("词表:", vectorizer.get_feature_names_out())
# Densify the matrix only for display.
print("文档向量:\n", X.toarray())