删除 1.py
This commit is contained in:
65
1.py
65
1.py
@@ -1,65 +0,0 @@
|
|||||||
import warnings
|
|
||||||
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
|
||||||
warnings.filterwarnings("ignore", category=UserWarning)
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
from sklearn.model_selection import train_test_split
|
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
||||||
from sklearn.svm import LinearSVC
|
|
||||||
from sklearn.metrics import accuracy_score, classification_report
|
|
||||||
|
|
||||||
# Maps the integer class label used in the dataset to its genre name.
# The genre names are runtime strings (printed in the classification report
# and returned by predict_genre), so they are kept exactly as written.
genre_dict = {
    0: "剧情",
    1: "喜剧",
    2: "科幻",
    3: "悬疑",
    4: "动作",
    5: "爱情",
    6: "动画",
    7: "犯罪",
    8: "奇幻",
    9: "纪录",
}
# Number of genre classes, derived from the mapping so the two stay in sync.
num_classes = len(genre_dict)
|
|
||||||
|
|
||||||
def load_data(file_path="movie_data.csv"):
    """Load the labelled text corpus from a CSV file.

    The CSV must provide a ``text`` column and a ``label`` column. Rows in
    which either value is missing are dropped, so that a missing text is not
    turned into the literal string ``"nan"`` and a missing label does not
    make the integer conversion fail.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``(texts, labels)`` tuple: a list of strings and a parallel list
        of integer labels.

    Raises:
        KeyError: If the ``text`` or ``label`` column is absent.
    """
    df = pd.read_csv(file_path)
    # Discard incomplete rows before converting types (see docstring).
    df = df.dropna(subset=["text", "label"])
    texts = df["text"].astype(str).tolist()
    labels = df["label"].astype(int).tolist()
    return texts, labels
|
|
||||||
|
|
||||||
def text_feature_extraction(
    texts,
    max_features=10000,
    stop_words="english",
    ngram_range=(1, 2),
):
    """Fit a TF-IDF vectorizer on ``texts`` and return the feature matrix.

    The defaults reproduce the previously hard-coded configuration; they are
    exposed as keyword arguments so callers can adapt the vectorizer to
    their corpus.

    NOTE(review): the sample text in this script's entry point is Chinese,
    while the default stop-word list is English. Confirm against the actual
    dataset whether these defaults (and the vectorizer's default
    tokenization) are appropriate.

    NOTE(review): the vectorizer is fitted on every text passed in; the
    caller in this file splits train/test only afterwards, so test
    documents contribute to the fitted vocabulary and IDF weights.

    Args:
        texts: Iterable of raw documents (strings).
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
        stop_words: Forwarded to ``TfidfVectorizer(stop_words=...)``.
        ngram_range: Forwarded to ``TfidfVectorizer(ngram_range=...)``.

    Returns:
        A ``(features, vectorizer)`` tuple: the result of
        ``fit_transform`` and the fitted vectorizer, which is needed later
        to transform new texts.
    """
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        stop_words=stop_words,
        ngram_range=ngram_range,
    )
    features = vectorizer.fit_transform(texts)
    return features, vectorizer
|
|
||||||
|
|
||||||
def train_and_evaluate(features, labels):
    """Train a LinearSVC on a stratified split and print test metrics.

    20% of the samples are held out as the test set (stratified by label,
    fixed random seed). The test accuracy and a per-class classification
    report are printed to stdout.

    Args:
        features: Feature matrix, one row per sample.
        labels: Integer class labels aligned with the rows of ``features``.

    Returns:
        The fitted ``LinearSVC`` model.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42, stratify=labels
    )

    model = LinearSVC(random_state=42, max_iter=10000)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"测试集准确率: {acc:.4f}")
    print("\n分类报告:")
    # Pass the label ids explicitly: without ``labels`` the report only
    # covers classes that occur in y_test/y_pred, and raises when that count
    # differs from the number of target names. ``zero_division=0`` keeps
    # classes with no samples from producing undefined-metric noise.
    print(
        classification_report(
            y_test,
            y_pred,
            labels=list(genre_dict.keys()),
            target_names=list(genre_dict.values()),
            zero_division=0,
        )
    )
    return model
|
|
||||||
|
|
||||||
def predict_genre(model, vectorizer, new_text):
    """Predict the genre name for a single piece of text.

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``; it should be
            the one used to build the training features.
        new_text: The raw text to classify.

    Returns:
        The genre name looked up in ``genre_dict`` for the predicted label.

    Raises:
        KeyError: If the predicted label is not a key of ``genre_dict``.
    """
    # transform() expects a collection of documents, hence the 1-item list.
    new_feature = vectorizer.transform([new_text])
    pred_label = model.predict(new_feature)[0]
    # Normalize the predicted scalar to a builtin int before the dict lookup.
    return genre_dict[int(pred_label)]
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # End-to-end demo: load the CSV, build TF-IDF features, train and
    # evaluate the classifier, then classify one sample synopsis.
    texts, labels = load_data()
    features, vectorizer = text_feature_extraction(texts)
    model = train_and_evaluate(features, labels)

    sample_text = "一个孤独的科学家发明了时间机器,却在穿梭时空的过程中陷入了悖论..."
    print(f"\n示例文本: {sample_text}")
    print(f"预测类型: {predict_genre(model, vectorizer, sample_text)}")
|
|
||||||
Reference in New Issue
Block a user