# Bag-of-words (BoW) demo: turn two pre-tokenized documents into a
# document-term count matrix with scikit-learn's CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

# Document collection. Tokens are already separated by spaces.
docs = [
    "Python 是 编程 语言",
    "Java 是 编程 语言",
]

# BoW vectorization.
# - The default token_pattern r"(?u)\b\w\w+\b" only keeps tokens of two or
#   more characters, which would silently drop the single-character word
#   "是"; r"(?u)\b\w+\b" keeps tokens of any length.
# - lowercase=False keeps "Python"/"Java" as written instead of folding
#   them to "python"/"java".
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b", lowercase=False)
bow_matrix = vectorizer.fit_transform(docs)

# The vocabulary is sorted by term (code-point order), not by first
# appearance, so "Java" comes before "Python".
print("词表:", vectorizer.get_feature_names_out())
# Output: ['Java' 'Python' '是' '编程' '语言']

# fit_transform returns a sparse matrix; toarray() densifies it for display.
print("BoW矩阵:")
print(bow_matrix.toarray())
# Output:
# [[0 1 1 1 1]    # Python doc: Java=0, Python=1, 是=1, 编程=1, 语言=1
#  [1 0 1 1 1]]   # Java doc:   Java=1, Python=0, 是=1, 编程=1, 语言=1