"""Bag-of-words demo: build a vocabulary and per-document term counts.

Fits a ``CountVectorizer`` on a tiny mixed Chinese/English corpus, then
prints the learned vocabulary and the dense document-term count matrix.
"""
from sklearn.feature_extraction.text import CountVectorizer

# Three separate documents. The commas matter: adjacent string literals
# without a comma are silently concatenated into a single string.
docs = [
    "Python是编程语言",
    "Java是编程语言",
    "Python Python Python",
]

# Override the default token pattern (which requires at least two word
# characters) so that single-character tokens are kept as well.
# NOTE: ``\w`` also matches CJK characters, so an unbroken run of letters and
# Chinese characters with no whitespace is extracted as one token; real
# Chinese text needs a word segmenter before vectorizing.
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")

# Learn the vocabulary and produce the sparse document-term matrix in one pass.
X = vectorizer.fit_transform(docs)

print("词表:", vectorizer.get_feature_names_out())
# Convert to a dense array only for display; the corpus is tiny.
print("文档向量:\n", X.toarray())