import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Integer class id (as stored in the "label" column) -> human-readable genre.
genre_map = {
    0: "剧情",
    1: "喜剧",
    2: "科幻",
    3: "悬疑",
    4: "动作",
    5: "爱情",
    6: "动画",
    7: "犯罪",
    8: "奇幻",
    9: "纪录",
}

# Load the corpus: one synopsis per row in "text", its class id in "label".
df = pd.read_csv("movie_data.csv")
X = df["text"]
y = df["label"]

# Hold out 20% for evaluation; stratify so the split keeps the class balance,
# and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# NOTE(review): the default word analyzer tokenizes on word boundaries, so
# unsegmented Chinese text would become one token per punctuation-delimited
# run. Confirm the "text" column is already space-segmented; otherwise
# consider a custom tokenizer or analyzer="char".
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
# Fit the vocabulary on the training split only, then reuse it for the test
# split so no test information leaks into the features.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)
print(f"准确率: {accuracy_score(y_test, y_pred):.4f}")
# Pass `labels` explicitly so every name in `target_names` is paired with its
# own class id. Without it, scikit-learn infers the classes from the data and
# raises ValueError whenever their count differs from len(target_names).
print(
    classification_report(
        y_test,
        y_pred,
        labels=list(genre_map),
        target_names=list(genre_map.values()),
    )
)


def predict_genre(text: str) -> str:
    """Return the predicted genre name for a single movie synopsis.

    The synopsis is vectorized with the module-level fitted ``tfidf`` and
    classified with the module-level trained ``model``.

    Args:
        text: The movie synopsis to classify.

    Returns:
        The genre name that ``genre_map`` associates with the predicted label.

    Raises:
        KeyError: If the predicted label is not a key of ``genre_map``.
    """
    # transform() expects an iterable of documents, hence the one-item list.
    text_tfidf = tfidf.transform([text])
    pred_label = model.predict(text_tfidf)[0]
    return genre_map[pred_label]


new_movie = "一群年轻人在宇宙飞船上探索外星文明,遭遇未知危险"
print(f"电影简介:{new_movie}")
print(f"预测类别:{predict_genre(new_movie)}")