"""Train and evaluate a TF-IDF + LinearSVC movie-genre text classifier.

Run as a script, the module loads ``movie_data.csv``, vectorises the text,
trains a linear SVM on an 80/20 stratified split, prints the evaluation
metrics and finally predicts the genre of one sample sentence.
"""

import warnings

# The filters are installed before the imports below, so they are already
# active while those packages are being imported.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# Integer class id -> human-readable genre name (shown in reports/predictions).
genre_dict = {
    0: "剧情",
    1: "喜剧",
    2: "科幻",
    3: "悬疑",
    4: "动作",
    5: "爱情",
    6: "动画",
    7: "犯罪",
    8: "奇幻",
    9: "纪录",
}
num_classes = len(genre_dict)


def load_data(file_path="movie_data.csv"):
    """Read the labelled corpus from a CSV file.

    The file must provide a ``text`` column and a ``label`` column. Rows in
    which either value is missing are dropped: otherwise a missing text would
    be turned into the literal string ``"nan"`` and a missing label would make
    the integer cast fail.

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        A ``(texts, labels)`` tuple: a list of ``str`` and a list of ``int``
        of equal length.

    Raises:
        KeyError: If the ``text`` or ``label`` column is absent.
    """
    df = pd.read_csv(file_path)

    missing = [col for col in ("text", "label") if col not in df.columns]
    if missing:
        raise KeyError(f"{file_path} is missing required column(s): {missing}")

    df = df.dropna(subset=["text", "label"])
    texts = df["text"].astype(str).tolist()
    labels = df["label"].astype(int).tolist()
    return texts, labels


def text_feature_extraction(texts, analyzer="word"):
    """Fit a TF-IDF vectorizer on *texts* and return the feature matrix.

    Args:
        texts: Iterable of raw documents.
        analyzer: Forwarded to ``TfidfVectorizer``. The default ``"word"``
            keeps the original behaviour (word uni-/bi-grams with English
            stop words removed). ``"char"`` or ``"char_wb"`` builds character
            n-grams instead.
            NOTE(review): the genre names and the sample sentence in this
            file are Chinese. With ``"word"`` the default token pattern
            treats every run of characters between punctuation/whitespace as
            one token, so unsegmented Chinese text needs either
            pre-segmentation or a character analyzer -- confirm against the
            actual contents of the data file.

    Returns:
        A ``(features, vectorizer)`` tuple: the TF-IDF matrix for *texts* and
        the fitted vectorizer (needed later to transform new documents).
    """
    vectorizer = TfidfVectorizer(
        max_features=10000,
        # Stop words are only applied by the word analyzer; passing them with
        # a character analyzer merely triggers an "unused parameter" warning.
        stop_words="english" if analyzer == "word" else None,
        ngram_range=(1, 2),
        analyzer=analyzer,
    )
    features = vectorizer.fit_transform(texts)
    return features, vectorizer


def train_and_evaluate(features, labels):
    """Train a ``LinearSVC`` on an 80/20 stratified split and print metrics.

    Prints the test-set accuracy followed by a per-genre classification
    report.

    Args:
        features: Feature matrix, one row per sample.
        labels: Integer class ids aligned with the rows of *features*.

    Returns:
        The fitted ``LinearSVC`` model.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels,
    )

    model = LinearSVC(random_state=42, max_iter=10000)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"测试集准确率: {acc:.4f}")
    print("\n分类报告:")

    # Pass the label ids explicitly. Without `labels`, the report infers the
    # classes from y_test/y_pred and raises ValueError as soon as any genre is
    # absent there, because the count no longer matches `target_names`.
    label_ids = list(genre_dict)
    print(
        classification_report(
            y_test,
            y_pred,
            labels=label_ids,
            target_names=[genre_dict[i] for i in label_ids],
        )
    )
    return model


def predict_genre(model, vectorizer, new_text):
    """Return the genre name predicted for a single piece of text.

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        new_text: The raw document to classify.

    Returns:
        The genre name looked up in ``genre_dict``.

    Raises:
        KeyError: If the predicted label is not a key of ``genre_dict``.
    """
    new_feature = vectorizer.transform([new_text])
    pred_label = model.predict(new_feature)[0]
    return genre_dict[pred_label]


def main():
    """Run the full pipeline: load, vectorise, train/evaluate, demo predict."""
    texts, labels = load_data()
    # NOTE(review): the vectorizer is fitted on the whole corpus before the
    # train/test split, so IDF statistics include the test documents. Fixing
    # that would require changing the function interfaces above.
    features, vectorizer = text_feature_extraction(texts)
    model = train_and_evaluate(features, labels)

    sample_text = "一个孤独的科学家发明了时间机器,却在穿梭时空的过程中陷入了悖论..."
    print(f"\n示例文本: {sample_text}")
    print(f"预测类型: {predict_genre(model, vectorizer, sample_text)}")


if __name__ == "__main__":
    main()