上传文件至 /
This commit is contained in:
50
20260609.1.py
Normal file
50
20260609.1.py
Normal file
@@ -0,0 +1,50 @@
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
from sklearn.metrics import classification_report, accuracy_score
|
||||
|
||||
|
||||
# Integer class id -> genre name. predict_genre() indexes this mapping with
# the label predicted by the model, so the ids must match the "label" column.
genre_map = {
    0: "剧情",
    1: "喜剧",
    2: "科幻",
    3: "悬疑",
    4: "动作",
    5: "爱情",
    6: "动画",
    7: "犯罪",
    8: "奇幻",
    9: "纪录",
}

# Load the labelled corpus: "text" is the model input, "label" the target.
df = pd.read_csv("movie_data.csv")
X = df["text"]
y = df["label"]

# Stratified 80/20 split so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the vocabulary on the training split only, then reuse it for the test
# split so no test information leaks into the features.
# NOTE(review): the default word analyzer only splits on whitespace and
# punctuation; presumably "text" is already word-segmented -- confirm.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)
print(f"准确率: {accuracy_score(y_test, y_pred):.4f}")

# Pass the label ids explicitly: without `labels`, the report pairs
# target_names with whichever classes happen to occur in y_test / y_pred,
# which fails or mislabels rows as soon as one genre is absent.
print(
    classification_report(
        y_test,
        y_pred,
        labels=list(genre_map),
        target_names=list(genre_map.values()),
        zero_division=0,
    )
)
|
||||
|
||||
|
||||
def predict_genre(text):
    """Return the genre name predicted for one synopsis string.

    The text is vectorized with the module-level ``tfidf`` vectorizer,
    classified by the module-level ``model``, and the predicted label is
    looked up in ``genre_map``.
    """
    features = tfidf.transform([text])
    predictions = model.predict(features)
    return genre_map[predictions[0]]


# Quick manual check with one hand-written synopsis.
new_movie = "一群年轻人在宇宙飞船上探索外星文明,遭遇未知危险"
predicted_genre = predict_genre(new_movie)
print(f"电影简介:{new_movie}")
print(f"预测类别:{predicted_genre}")
|
||||
44
20260609.2.py
Normal file
44
20260609.2.py
Normal file
@@ -0,0 +1,44 @@
|
||||
import requests
|
||||
import json
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# The two list pages to scrape (start offsets 0 and 25).
urls = [
    "https://movie.douban.com/top250?start=0",
    "https://movie.douban.com/top250?start=25",
]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Seconds to wait for a page; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10


def _extract_actors(info_line):
    """Return the text following the last "主演" marker, or "" if absent.

    Both the full-width and the ASCII colon are accepted.
    NOTE(review): the page appears to use the ASCII form ("主演: "), in which
    case matching only the full-width colon always yields "" -- confirm
    against a live response.
    """
    for marker in ("主演:", "主演:"):
        _, found, tail = info_line.rpartition(marker)
        if found:
            return tail.strip()
    return ""


movies = []
rank = 1  # running position across all pages, starting at 1

for url in urls:
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an error status instead of parsing an error page and
    # silently writing an empty result file.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all("div", class_="item")

    for item in items:
        title = item.find("span", class_="title").text.strip()

        # The first line of the info paragraph carries the cast information.
        actors_info = item.find("div", class_="bd").p.text.strip().split("\n")[0]
        actors = _extract_actors(actors_info)

        # Not every entry has a one-line quote.
        quote_tag = item.find("span", class_="inq")
        quote = quote_tag.text.strip() if quote_tag else ""

        movies.append({
            "rank": rank,
            "title": title,
            "actors": actors,
            "quote": quote,
        })
        rank += 1

with open("movies.json", "w", encoding="utf-8") as f:
    json.dump(movies, f, ensure_ascii=False, indent=2)

print(f"成功爬取{len(movies)}部电影,已保存为movies.json")
|
||||
35
20260609.3.py
Normal file
35
20260609.3.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import requests
|
||||
import json
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}

# Seconds to wait for a page; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10


def _first_actor(info):
    """Return the first " / "-separated name after the "主演" marker, or "".

    Both the full-width and the ASCII colon are accepted.
    NOTE(review): the page appears to use the ASCII form ("主演: "), in which
    case matching only the full-width colon always yields "" -- confirm
    against a live response.
    """
    for marker in ("主演:", "主演:"):
        _, found, tail = info.rpartition(marker)
        if found:
            return tail.split(" / ")[0].strip()
    return ""


movies = []
# The first 50 entries are fetched as two pages, at start offsets 0 and 25.
for start in [0, 25]:
    url = f"https://movie.douban.com/top250?start={start}"
    res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an error status instead of parsing an error page and
    # silently writing an empty result file.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    for item in soup.select(".item"):
        rank = item.select_one(".pic em").text
        title = item.select_one(".title").text

        # Cast information sits on the first line of the info paragraph.
        info = item.select_one(".bd p").text.strip().split("\n")[0]
        actors = _first_actor(info)

        # Not every entry has a one-line quote.
        quote_tag = item.select_one(".quote .inq")
        quote = quote_tag.text if quote_tag else ""

        movies.append({
            "rank": int(rank),
            "title": title,
            "actors": actors,
            "quote": quote,
        })

# Save the collected entries as JSON, keeping non-ASCII text readable.
with open("movies.json", "w", encoding="utf-8") as f:
    json.dump(movies, f, ensure_ascii=False, indent=2)
|
||||
76
20260609.4.py
Normal file
76
20260609.4.py
Normal file
@@ -0,0 +1,76 @@
|
||||
import json
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.neural_network import MLPClassifier
|
||||
from sklearn.metrics import precision_score
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
import copy

from sklearn.metrics import log_loss

# 1. Genre name -> integer class id, plus the inverse mapping for reporting.
genre_map = {
    "剧情": 0,
    "喜剧": 1,
    "科幻": 2,
    "悬疑": 3,
    "动作": 4,
    "爱情": 5,
    "动画": 6,
    "犯罪": 7,
    "奇幻": 8,
    "纪录": 9,
}
reverse_genre_map = {v: k for k, v in genre_map.items()}

# 2. Load the labelled data. Expected columns: quote,label (label is the
#    genre name used as a key of genre_map).
df = pd.read_csv("my_labels.csv")
# Empty quotes are read back as NaN, which TfidfVectorizer rejects.
df = df.dropna(subset=["quote"])
df["label_id"] = df["label"].map(genre_map)
# Series.map yields NaN for names missing from genre_map; stop here with a
# clear message instead of failing later inside the split or the reporting.
unknown_labels = df.loc[df["label_id"].isna(), "label"].unique()
if len(unknown_labels):
    raise ValueError(
        f"my_labels.csv contains labels not in genre_map: {list(unknown_labels)}"
    )
df["label_id"] = df["label_id"].astype(int)

# 3. Split into train / validation / test, roughly 8:1:1
#    (10% test, then 11% of the remaining 90% for validation).
X = df["quote"]
y = df["label_id"]
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.11, random_state=42, stratify=y_train_val
)

# 4. TF-IDF features; the vocabulary is fitted on the training split only.
# NOTE(review): the default word analyzer only splits on whitespace and
# punctuation; presumably the quotes are already word-segmented -- confirm.
tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)

# 5. Train the MLP one epoch at a time so that a real loss can be recorded
#    on the held-out validation split. MLPClassifier's built-in
#    early_stopping only exposes validation *accuracy* (validation_scores_)
#    on an internal split, which is not a loss and ignores X_val.
MAX_EPOCHS = 100  # upper bound on training epochs
PATIENCE = 10     # epochs without validation improvement before stopping
MIN_DELTA = 1e-4  # smallest decrease in validation loss that counts

mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    random_state=42,
    verbose=True,
)
classes = np.unique(y_train)

train_losses = []
val_losses = []
best_val_loss = np.inf
best_mlp = None
epochs_without_improvement = 0

for _ in range(MAX_EPOCHS):
    # Each partial_fit call performs one pass over the training data.
    mlp.partial_fit(X_train_tfidf, y_train, classes=classes)
    train_losses.append(mlp.loss_)

    # predict_proba columns follow mlp.classes_; passing them as `labels`
    # keeps log_loss well-defined even if y_val lacks some class.
    val_loss = log_loss(
        y_val, mlp.predict_proba(X_val_tfidf), labels=mlp.classes_
    )
    val_losses.append(val_loss)

    if val_loss < best_val_loss - MIN_DELTA:
        best_val_loss = val_loss
        best_mlp = copy.deepcopy(mlp)  # snapshot of the best epoch so far
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= PATIENCE:
            break

# Evaluate with the weights from the best validation epoch.
if best_mlp is not None:
    mlp = best_mlp

# Save the per-epoch losses (training and validation).
loss_data = pd.DataFrame({
    "epoch": range(1, len(train_losses) + 1),
    "train_loss": train_losses,
    "val_loss": val_losses,
})
loss_data.to_csv("loss.csv", index=False)

# 6. Predict the test split and compute macro-averaged precision.
y_pred = mlp.predict(X_test_tfidf)
precision = precision_score(y_test, y_pred, average="macro")

# Save predictions.csv with genre names instead of class ids.
predictions_data = pd.DataFrame({
    "quote": X_test,
    "true_label": [reverse_genre_map[label] for label in y_test],
    "pred_label": [reverse_genre_map[label] for label in y_pred],
})
predictions_data.to_csv("predictions.csv", index=False, encoding="utf-8")

print(f"测试集macro precision: {precision:.4f}")
|
||||
58
20260609.5.py
Normal file
58
20260609.5.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
import os

# Use a font with Chinese glyphs for the labels, and keep the minus sign
# renderable with that font.
plt.rcParams["font.sans-serif"] = ["SimHei"]
plt.rcParams["axes.unicode_minus"] = False

# Plot the train_loss and val_loss columns of loss.csv against epoch.
loss_df = pd.read_csv("loss.csv")
plt.figure(figsize=(10, 5))
plt.plot(loss_df["epoch"], loss_df["train_loss"], label="训练集loss", color="#2980b9")
plt.plot(loss_df["epoch"], loss_df["val_loss"], label="验证集loss", color="#e74c3c")
plt.title("MLP模型训练Loss曲线", fontsize=14)
plt.xlabel("Epoch")
plt.ylabel("Loss值")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()

# savefig does not create missing directories, so make sure images/ exists.
os.makedirs("images", exist_ok=True)
plt.savefig("images/loss_curve.png", dpi=300)
plt.show()
|
||||
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
import os

# Use a font with Chinese glyphs for the labels, and keep the minus sign
# renderable with that font.
plt.rcParams["font.sans-serif"] = ["SimHei"]
plt.rcParams["axes.unicode_minus"] = False

# Count how often each genre was predicted.
pred_df = pd.read_csv("predictions.csv")
genre_counts = pred_df["pred_label"].value_counts()

# Fixed genre order; genres that were never predicted are shown with 0.
genre_order = ["剧情", "喜剧", "科幻", "悬疑", "动作", "爱情", "动画", "犯罪", "奇幻", "纪录"]
genre_counts = genre_counts.reindex(genre_order, fill_value=0)

plt.figure(figsize=(12, 6))
genre_counts.plot(kind="bar", color="#3498db")
plt.title("测试集10个类别的预测分布", fontsize=14)
plt.xlabel("电影类别")
plt.ylabel("预测数量")
plt.xticks(rotation=45)
plt.tight_layout()

# savefig does not create missing directories, so make sure images/ exists.
os.makedirs("images", exist_ok=True)
plt.savefig("images/category_bar.png", dpi=300)
plt.show()
|
||||
|
||||
from wordcloud import WordCloud
|
||||
import pandas as pd
|
||||
|
||||
import os

# Join every quote into one string; astype(str) guards against non-string
# cells such as NaN.
df = pd.read_csv("my_labels.csv")
all_quotes = " ".join(df["quote"].astype(str))

# NOTE(review): the text is passed to WordCloud unsegmented; presumably the
# quotes are short enough for that to be acceptable -- confirm the output.
wordcloud = WordCloud(
    font_path="msyh.ttc",  # path to a font that contains Chinese glyphs
    width=800, height=400, background_color="white"
).generate(all_quotes)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.tight_layout()

# savefig does not create missing directories, so make sure images/ exists.
os.makedirs("images", exist_ok=True)
plt.savefig("images/wordcloud.png", dpi=300)
plt.show()
|
||||
Reference in New Issue
Block a user