# Snippet viewer metadata: 17 lines, 466 B (language label shown as "INI"; the content is Python).
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus for a bag-of-words demo. Tokens are already separated by spaces,
# so a plain whitespace split is enough to tokenize the mixed text.
docs = [
    "Python 是 编程 语言",
    "Java 是 编程 语言",
    "Python Python Python",
]

# A whitespace tokenizer is supplied explicitly; token_pattern=None tells
# CountVectorizer the regex pattern is intentionally unused, which avoids the
# "token_pattern will not be used" UserWarning raised when both are set.
vectorizer = CountVectorizer(tokenizer=str.split, token_pattern=None)

# Learn the vocabulary and build the sparse document-term count matrix.
X = vectorizer.fit_transform(docs)

print("词表(Vocabulary):", vectorizer.get_feature_names_out())

# Densify once instead of once per printed row.
dense_counts = X.toarray()
for doc_number, row in enumerate(dense_counts, start=1):
    print(f"Doc{doc_number} 向量:", row)