"""Bag-of-words (BoW) demo: a hand-written implementation next to scikit-learn's.

Both implementations tokenize on whitespace and keep the original casing, so
the two printed vocabularies and count vectors can be compared directly.
"""

from collections import Counter

# Pre-tokenized corpus: tokens inside each document are separated by spaces.
docs = [
    "Python 是 编程 语言",
    "Java 是 编程 语言",
    "Python Python Python",
]


def build_vocab(documents):
    """Return the sorted list of distinct whitespace-separated tokens.

    Args:
        documents: Iterable of strings whose tokens are separated by whitespace.

    Returns:
        A list of unique tokens in ascending (code point) order; empty when
        ``documents`` is empty.
    """
    return sorted({token for doc in documents for token in doc.split()})


def count_vectors(documents, vocab):
    """Return one term-count vector per document, aligned with ``vocab``.

    Args:
        documents: Iterable of whitespace-tokenized strings.
        vocab: Sequence of tokens that defines the column order.

    Returns:
        A list of lists; entry ``[i][j]`` is how often ``vocab[j]`` occurs in
        document ``i`` (0 when it does not occur).
    """
    vectors = []
    for doc in documents:
        # Count every token once instead of rescanning the document for each
        # vocabulary word; a Counter yields 0 for tokens it has not seen.
        counts = Counter(doc.split())
        vectors.append([counts[word] for word in vocab])
    return vectors


def sklearn_bow(documents):
    """Compute the vocabulary and count matrix with scikit-learn.

    CountVectorizer's defaults lowercase the text and only keep tokens of two
    or more word characters, which would turn "Python" into "python" and drop
    "是". The manual tokenizer (``str.split``) is passed in and lowercasing is
    disabled so the result matches ``build_vocab`` / ``count_vectors``.

    Args:
        documents: Iterable of whitespace-tokenized strings.

    Returns:
        A ``(feature_names, matrix)`` pair: the array from
        ``get_feature_names_out()`` and the dense document-term count array.
    """
    # Imported here so the manual implementation above stays usable when
    # scikit-learn is not installed.
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer(
        tokenizer=str.split,
        token_pattern=None,  # unused once a tokenizer is supplied; None avoids the warning
        lowercase=False,
    )
    matrix = vectorizer.fit_transform(documents)
    return vectorizer.get_feature_names_out(), matrix.toarray()


def main():
    """Print the vocabulary and BoW vectors from both implementations."""
    vocab = build_vocab(docs)
    print("词表(手动实现):", vocab)

    print("\n每个文档的BoW向量(手动实现):")
    for i, vec in enumerate(count_vectors(docs, vocab), start=1):
        print(f"Doc{i}: {vec}")

    feature_names, matrix = sklearn_bow(docs)
    print("\n词表(sklearn实现):", feature_names)

    print("\n每个文档的BoW向量(sklearn实现):")
    for i, vec in enumerate(matrix, start=1):
        print(f"Doc{i}: {vec}")

    # "#6#" marker from the original script (presumably exercise item 6).
    # The message lists two limitations of BoW: it ignores word order, and it
    # treats words as independent symbols with no semantic relation.
    print("1忽略词序信息:无法区分语序不同但词频相同的文本,会丢失语义逻辑。2不理解词语语义关联:将词视为独立符号,无法捕捉同义词、近义词的关系。")


if __name__ == "__main__":
    main()