上传文件至 /

This commit is contained in:
2026-06-09 11:23:33 +08:00
parent 5f5028144c
commit 18bb15f2ea
5 changed files with 263 additions and 0 deletions

50
20260609.1.py Normal file
View File

@@ -0,0 +1,50 @@
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
# Numeric class id -> genre name used in the report and in predictions.
genre_map = {
    0: "剧情",
    1: "喜剧",
    2: "科幻",
    3: "悬疑",
    4: "动作",
    5: "爱情",
    6: "动画",
    7: "犯罪",
    8: "奇幻",
    9: "纪录",
}

# Load the dataset; the "text" column is the feature, "label" the target.
# NOTE(review): assumes "label" holds the integer ids used as genre_map keys.
df = pd.read_csv("movie_data.csv")
X = df["text"]
y = df["label"]

# Stratified 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# The vectorizer is fitted on the training split only and reused for the
# test split, so no test-set vocabulary leaks into training.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

model = MultinomialNB()
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

print(f"准确率: {accuracy_score(y_test, y_pred):.4f}")
# Pin labels explicitly: without `labels`, the report infers the classes
# from the data and raises if their count differs from len(target_names),
# which happens as soon as one genre is absent from y_test and y_pred.
print(
    classification_report(
        y_test,
        y_pred,
        labels=list(genre_map),
        target_names=list(genre_map.values()),
        zero_division=0,
    )
)
def predict_genre(text):
    """Return the genre name predicted for a single piece of text.

    The text is vectorized with the module-level ``tfidf`` vectorizer,
    classified with the module-level ``model``, and the predicted label is
    looked up in ``genre_map``.
    """
    features = tfidf.transform([text])
    predicted = model.predict(features)
    return genre_map[predicted[0]]
# Demo: classify one sample synopsis and print the input and the result.
new_movie = "一群年轻人在宇宙飞船上探索外星文明,遭遇未知危险"
print(f"电影简介:{new_movie}")
predicted_genre = predict_genre(new_movie)
print(f"预测类别:{predicted_genre}")

44
20260609.2.py Normal file
View File

@@ -0,0 +1,44 @@
import requests
import json
from bs4 import BeautifulSoup
# Douban Top 250 list pages to scrape.
urls = [
    "https://movie.douban.com/top250?start=0",
    "https://movie.douban.com/top250?start=25"
]
# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
movies = []
rank = 1  # running rank across all pages, in scrape order
for url in urls:
    # Bound the wait and fail loudly on an HTTP error; otherwise an error
    # page would be parsed into zero items and still reported as success.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all("div", class_="item")
    for item in items:
        title = item.find("span", class_="title").text.strip()
        # Only the first line of the info paragraph is inspected for the cast.
        actors_info = item.find("div", class_="bd").p.text.strip().split("\n")[0]
        actors = actors_info.split("主演:")[-1].strip() if "主演:" in actors_info else ""
        # Entries without a quote tag get an empty quote.
        quote_tag = item.find("span", class_="inq")
        quote = quote_tag.text.strip() if quote_tag else ""
        movies.append({
            "rank": rank,
            "title": title,
            "actors": actors,
            "quote": quote
        })
        rank += 1
# ensure_ascii=False keeps the Chinese text readable in the JSON file.
with open("movies.json", "w", encoding="utf-8") as f:
    json.dump(movies, f, ensure_ascii=False, indent=2)
print(f"成功爬取{len(movies)}部电影已保存为movies.json")

35
20260609.3.py Normal file
View File

@@ -0,0 +1,35 @@
import requests
import json
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}
movies = []
# Douban Top 250 shows 25 entries per page, so the first 50 need two pages.
for start in [0, 25]:
    url = f"https://movie.douban.com/top250?start={start}"
    # Bound the wait and fail loudly on an HTTP error; otherwise an error
    # page would be parsed into zero items and an empty file written.
    res = requests.get(url, headers=headers, timeout=10)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    for item in soup.select(".item"):
        rank = item.select_one(".pic em").text
        title = item.select_one(".title").text
        # Cast: take the first line of the info paragraph and keep only the
        # first name listed after the "主演:" marker.
        info = item.select_one(".bd p").text.strip().split("\n")[0]
        actors = info.split("主演:")[-1].split(" / ")[0].strip() if "主演:" in info else ""
        # Quote: entries without a quote tag get an empty string.
        quote_tag = item.select_one(".quote .inq")
        quote = quote_tag.text if quote_tag else ""
        movies.append({
            "rank": int(rank),
            "title": title,
            "actors": actors,
            "quote": quote
        })
# Save as JSON; ensure_ascii=False keeps the Chinese text readable.
with open("movies.json", "w", encoding="utf-8") as f:
    json.dump(movies, f, ensure_ascii=False, indent=2)

76
20260609.4.py Normal file
View File

@@ -0,0 +1,76 @@
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss

# 1. Genre name -> numeric class id, plus the inverse for reporting.
genre_map = {
    "剧情": 0,
    "喜剧": 1,
    "科幻": 2,
    "悬疑": 3,
    "动作": 4,
    "爱情": 5,
    "动画": 6,
    "犯罪": 7,
    "奇幻": 8,
    "纪录": 9
}
reverse_genre_map = {v: k for k, v in genre_map.items()}

# 2. Load the labelled data from my_labels.csv (columns used: quote, label;
#    label holds the genre name as text).
df = pd.read_csv("my_labels.csv")
df["label_id"] = df["label"].map(genre_map)
# Labels missing from genre_map map to NaN; report them here instead of
# failing later with an unrelated error inside the split or the model.
unknown_labels = df.loc[df["label_id"].isna(), "label"].unique()
if len(unknown_labels) > 0:
    raise ValueError(f"Unknown labels in my_labels.csv: {list(unknown_labels)}")
df["label_id"] = df["label_id"].astype(int)

# 3. Train / validation / test split, roughly 8:1:1: 10% is held out for
#    test, then 0.11 (about 1/9) of the remainder becomes the validation set.
X = df["quote"]
y = df["label_id"]
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.11, random_state=42, stratify=y_train_val
)

# 4. TF-IDF text features; the vectorizer is fitted on the training split only.
tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)

# 5. Train the MLP one epoch at a time so that a real validation loss can be
#    recorded next to the training loss. MLPClassifier's built-in early
#    stopping only exposes validation accuracy (validation_scores_) and is
#    not supported by partial_fit, so the stopping rule is implemented here
#    on the explicit validation split.
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    random_state=42,
    verbose=True
)
max_epochs = 100
patience = 10     # epochs without improvement before stopping
min_delta = 1e-4  # smallest decrease in validation loss that counts
classes = np.unique(y_train)
train_losses = []
val_losses = []
best_val_loss = float("inf")
epochs_without_improvement = 0
for _ in range(max_epochs):
    # Each partial_fit call performs a single pass over the training data.
    mlp.partial_fit(X_train_tfidf, y_train, classes=classes)
    val_loss = log_loss(
        y_val, mlp.predict_proba(X_val_tfidf), labels=mlp.classes_
    )
    train_losses.append(mlp.loss_)
    val_losses.append(val_loss)
    if val_loss < best_val_loss - min_delta:
        best_val_loss = val_loss
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1
        # NOTE: the weights of the last epoch are kept, not the best epoch.
        if epochs_without_improvement >= patience:
            break

# Save the per-epoch training and validation loss.
loss_data = pd.DataFrame({
    "epoch": range(1, len(train_losses) + 1),
    "train_loss": train_losses,
    "val_loss": val_losses
})
loss_data.to_csv("loss.csv", index=False)

# 6. Predict on the test set and compute macro precision. zero_division=0
#    scores a never-predicted class as 0 without emitting a warning.
y_pred = mlp.predict(X_test_tfidf)
precision = precision_score(y_test, y_pred, average="macro", zero_division=0)

# Save predictions.csv with the genre names instead of the numeric ids.
predictions_data = pd.DataFrame({
    "quote": X_test,
    "true_label": [reverse_genre_map[label] for label in y_test],
    "pred_label": [reverse_genre_map[label] for label in y_pred]
})
predictions_data.to_csv("predictions.csv", index=False, encoding="utf-8")
print(f"测试集macro precision: {precision:.4f}")

58
20260609.5.py Normal file
View File

@@ -0,0 +1,58 @@
import pandas as pd
import matplotlib.pyplot as plt
import os

# SimHei is a Chinese font, selected so the Chinese labels render; with
# unicode_minus disabled, matplotlib draws minus signs with an ASCII hyphen.
plt.rcParams["font.sans-serif"] = ["SimHei"]
plt.rcParams["axes.unicode_minus"] = False

# Plot the per-epoch curves stored in loss.csv (epoch, train_loss, val_loss).
# NOTE(review): confirm that the val_loss column really holds a loss value.
loss_df = pd.read_csv("loss.csv")
plt.figure(figsize=(10, 5))
plt.plot(loss_df["epoch"], loss_df["train_loss"], label="训练集loss", color="#2980b9")
plt.plot(loss_df["epoch"], loss_df["val_loss"], label="验证集loss", color="#e74c3c")
plt.title("MLP模型训练Loss曲线", fontsize=14)
plt.xlabel("Epoch")
plt.ylabel("Loss值")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
# savefig does not create missing directories, so make sure images/ exists.
os.makedirs("images", exist_ok=True)
plt.savefig("images/loss_curve.png", dpi=300)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import os

# SimHei is a Chinese font, selected so the Chinese labels render; with
# unicode_minus disabled, matplotlib draws minus signs with an ASCII hyphen.
plt.rcParams["font.sans-serif"] = ["SimHei"]
plt.rcParams["axes.unicode_minus"] = False

# Count how often each genre was predicted in predictions.csv.
pred_df = pd.read_csv("predictions.csv")
genre_counts = pred_df["pred_label"].value_counts()
# Arrange the bars in the fixed genre order; genres that were never
# predicted are shown with a count of 0.
genre_order = ["剧情", "喜剧", "科幻", "悬疑", "动作", "爱情", "动画", "犯罪", "奇幻", "纪录"]
genre_counts = genre_counts.reindex(genre_order, fill_value=0)

plt.figure(figsize=(12, 6))
genre_counts.plot(kind="bar", color="#3498db")
plt.title("测试集10个类别的预测分布", fontsize=14)
plt.xlabel("电影类别")
plt.ylabel("预测数量")
plt.xticks(rotation=45)
plt.tight_layout()
# savefig does not create missing directories, so make sure images/ exists.
os.makedirs("images", exist_ok=True)
plt.savefig("images/category_bar.png", dpi=300)
plt.show()
from wordcloud import WordCloud
import pandas as pd
import os

# Build one text from every quote in my_labels.csv; astype(str) guards
# against non-string cells such as missing values.
df = pd.read_csv("my_labels.csv")
all_quotes = " ".join(df["quote"].astype(str))
wordcloud = WordCloud(
    font_path="msyh.ttc",  # path to a font that can render Chinese
    width=800, height=400, background_color="white"
).generate(all_quotes)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.tight_layout()
# savefig does not create missing directories, so make sure images/ exists.
os.makedirs("images", exist_ok=True)
plt.savefig("images/wordcloud.png", dpi=300)
plt.show()