From 18bb15f2eaae0db2c84dbbd2ce57191863ae3a2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=AD=99=E4=BC=A0=E6=95=8F?= <2509165031@student.example.com>
Date: Tue, 9 Jun 2026 11:23:33 +0800
Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E4=BB=B6=E8=87=B3?=
 =?UTF-8?q?=20/?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Reviewed revision: the scripts below were updated during review, so the blob
ids on the "index" lines still refer to the originally submitted files.

 20260609.1.py | 68 ++++++++++++++++++++++++++++++++++++
 20260609.2.py | 56 ++++++++++++++++++++++++++++++
 20260609.3.py | 47 +++++++++++++++++++++++++
 20260609.4.py | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++
 20260609.5.py | 64 ++++++++++++++++++++++++++++++++++
 5 files changed, 330 insertions(+)
 create mode 100644 20260609.1.py
 create mode 100644 20260609.2.py
 create mode 100644 20260609.3.py
 create mode 100644 20260609.4.py
 create mode 100644 20260609.5.py

diff --git a/20260609.1.py b/20260609.1.py
new file mode 100644
index 0000000..4d1052e
--- /dev/null
+++ b/20260609.1.py
@@ -0,0 +1,68 @@
+"""Train a TF-IDF + multinomial Naive Bayes movie-genre classifier.
+
+Reads ``movie_data.csv`` (columns ``text`` and ``label``), prints test-set
+metrics and classifies one sample synopsis.
+"""
+
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics import accuracy_score, classification_report
+from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import MultinomialNB
+
+# Integer label id -> genre name.
+genre_map = {
+    0: "剧情",
+    1: "喜剧",
+    2: "科幻",
+    3: "悬疑",
+    4: "动作",
+    5: "爱情",
+    6: "动画",
+    7: "犯罪",
+    8: "奇幻",
+    9: "纪录",
+}
+
+df = pd.read_csv("movie_data.csv")
+X = df["text"]
+y = df["label"]
+# Stratify so every genre keeps the same share in both splits.
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=42, stratify=y
+)
+
+# Character n-grams: the default word analyzer splits only on whitespace and
+# punctuation, so unsegmented Chinese text would become one token per clause.
+tfidf = TfidfVectorizer(analyzer="char", max_features=5000, ngram_range=(1, 2))
+X_train_tfidf = tfidf.fit_transform(X_train)
+X_test_tfidf = tfidf.transform(X_test)
+
+model = MultinomialNB()
+model.fit(X_train_tfidf, y_train)
+
+y_pred = model.predict(X_test_tfidf)
+print(f"准确率: {accuracy_score(y_test, y_pred):.4f}")
+# Pass labels explicitly so the report still lines up with target_names when
+# some genre is missing from the test split.
+print(
+    classification_report(
+        y_test,
+        y_pred,
+        labels=list(genre_map),
+        target_names=list(genre_map.values()),
+        zero_division=0,
+    )
+)
+
+
+def predict_genre(text):
+    """Return the genre name predicted for one synopsis string."""
+    text_tfidf = tfidf.transform([text])
+    pred_label = model.predict(text_tfidf)[0]
+    return genre_map[pred_label]
+
+
+new_movie = "一群年轻人在宇宙飞船上探索外星文明,遭遇未知危险"
+print(f"电影简介:{new_movie}")
+print(f"预测类别:{predict_genre(new_movie)}")
diff --git a/20260609.2.py b/20260609.2.py
new file mode 100644
index 0000000..497526b
--- /dev/null
+++ b/20260609.2.py
@@ -0,0 +1,56 @@
+"""Scrape the first two pages of Douban Top 250 into movies.json."""
+
+import json
+import re
+
+import requests
+from bs4 import BeautifulSoup
+
+urls = [
+    "https://movie.douban.com/top250?start=0",
+    "https://movie.douban.com/top250?start=25",
+]
+
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+}
+
+# Label that precedes the cast list; accept a full-width or an ASCII colon
+# because the page markup may use either.
+ACTORS_LABEL = re.compile(r"主演[::]\s*")
+
+movies = []
+rank = 1
+
+for url in urls:
+    # A timeout keeps a stalled connection from hanging the script, and
+    # raise_for_status stops us from parsing an error page as if it were data.
+    response = requests.get(url, headers=headers, timeout=10)
+    response.raise_for_status()
+    soup = BeautifulSoup(response.text, "html.parser")
+    items = soup.find_all("div", class_="item")
+
+    for item in items:
+        title = item.find("span", class_="title").text.strip()
+
+        # Only the first line of the info paragraph is searched for the cast.
+        actors_info = item.find("div", class_="bd").p.text.strip().split("\n")[0]
+        parts = ACTORS_LABEL.split(actors_info)
+        actors = parts[-1].strip() if len(parts) > 1 else ""
+
+        # The quote element may be absent, so fall back to an empty string.
+        quote_tag = item.find("span", class_="inq")
+        quote = quote_tag.text.strip() if quote_tag else ""
+
+        movies.append({
+            "rank": rank,
+            "title": title,
+            "actors": actors,
+            "quote": quote,
+        })
+        rank += 1
+
+with open("movies.json", "w", encoding="utf-8") as f:
+    json.dump(movies, f, ensure_ascii=False, indent=2)
+
+print(f"成功爬取{len(movies)}部电影,已保存为movies.json")
diff --git a/20260609.3.py b/20260609.3.py
new file mode 100644
index 0000000..74d6620
--- /dev/null
+++ b/20260609.3.py
@@ -0,0 +1,47 @@
+"""Scrape the top 50 Douban movies (rank, title, first listed actor, quote)."""
+
+import json
+import re
+
+import requests
+from bs4 import BeautifulSoup
+
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+}
+
+# Label that precedes the cast list; accept a full-width or an ASCII colon
+# because the page markup may use either.
+ACTORS_LABEL = re.compile(r"主演[::]\s*")
+
+movies = []
+# Douban Top 250 lists 25 movies per page, so the first 50 need two pages.
+for start in [0, 25]:
+    url = f"https://movie.douban.com/top250?start={start}"
+    # Fail fast on a stalled connection or an HTTP error page.
+    res = requests.get(url, headers=headers, timeout=10)
+    res.raise_for_status()
+    soup = BeautifulSoup(res.text, "html.parser")
+
+    for item in soup.select(".item"):
+        rank = item.select_one(".pic em").text
+        title = item.select_one(".title").text
+        # Cast: search the first line of the info paragraph and keep only
+        # the first name of the " / "-separated list.
+        info = item.select_one(".bd p").text.strip().split("\n")[0]
+        parts = ACTORS_LABEL.split(info)
+        actors = parts[-1].split(" / ")[0].strip() if len(parts) > 1 else ""
+        # The quote element may be absent, so fall back to an empty string.
+        quote_tag = item.select_one(".quote .inq")
+        quote = quote_tag.text if quote_tag else ""
+
+        movies.append({
+            "rank": int(rank),
+            "title": title,
+            "actors": actors,
+            "quote": quote,
+        })
+
+# Save as a JSON file.
+with open("movies.json", "w", encoding="utf-8") as f:
+    json.dump(movies, f, ensure_ascii=False, indent=2)
diff --git a/20260609.4.py b/20260609.4.py
new file mode 100644
index 0000000..47a0945
--- /dev/null
+++ b/20260609.4.py
@@ -0,0 +1,95 @@
+"""Train an MLP genre classifier on labelled quotes.
+
+Reads ``my_labels.csv`` and writes ``loss.csv`` (per-epoch training and
+validation loss) and ``predictions.csv`` (test-set predictions).
+"""
+
+import json
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.neural_network import MLPClassifier
+from sklearn.metrics import log_loss, precision_score
+import matplotlib.pyplot as plt
+
+# 1. Genre name -> integer class id.
+genre_map = {
+    "剧情": 0,
+    "喜剧": 1,
+    "科幻": 2,
+    "悬疑": 3,
+    "动作": 4,
+    "爱情": 5,
+    "动画": 6,
+    "犯罪": 7,
+    "奇幻": 8,
+    "纪录": 9,
+}
+reverse_genre_map = {v: k for k, v in genre_map.items()}
+
+# 2. Load the labelled data. Expected columns: quote, label (genre name).
+df = pd.read_csv("my_labels.csv")
+df["label_id"] = df["label"].map(genre_map)
+# map() leaves NaN for any label missing from genre_map; report those rows
+# here instead of failing later with an unrelated error.
+unknown = df.loc[df["label_id"].isna(), "label"].unique()
+if unknown.size:
+    raise ValueError(f"my_labels.csv contains unknown labels: {list(unknown)}")
+df["label_id"] = df["label_id"].astype(int)
+
+# 3. Split into train / validation / test, roughly 8:1:1.
+X = df["quote"]
+y = df["label_id"]
+X_train_val, X_test, y_train_val, y_test = train_test_split(
+    X, y, test_size=0.1, random_state=42, stratify=y
+)
+# 0.11 of the remaining 90% is about 10% of the full data set.
+X_train, X_val, y_train, y_val = train_test_split(
+    X_train_val, y_train_val, test_size=0.11, random_state=42, stratify=y_train_val
+)
+
+# 4. TF-IDF text features. Character n-grams are used because the default
+# word analyzer would turn each unsegmented Chinese clause into one token.
+tfidf = TfidfVectorizer(analyzer="char", max_features=1000, ngram_range=(1, 2))
+X_train_tfidf = tfidf.fit_transform(X_train)
+X_val_tfidf = tfidf.transform(X_val)
+X_test_tfidf = tfidf.transform(X_test)
+
+# 5. Train the MLP one epoch at a time so that a real loss can be recorded on
+# both the training and the validation split after every epoch.
+N_EPOCHS = 100
+classes = np.unique(y)
+mlp = MLPClassifier(hidden_layer_sizes=(64, 32), random_state=42)
+
+loss_rows = []
+for epoch in range(1, N_EPOCHS + 1):
+    mlp.partial_fit(X_train_tfidf, y_train, classes=classes)
+    train_loss = log_loss(
+        y_train, mlp.predict_proba(X_train_tfidf), labels=mlp.classes_
+    )
+    val_loss = log_loss(
+        y_val, mlp.predict_proba(X_val_tfidf), labels=mlp.classes_
+    )
+    loss_rows.append(
+        {"epoch": epoch, "train_loss": train_loss, "val_loss": val_loss}
+    )
+    print(f"epoch {epoch}: train_loss={train_loss:.4f} val_loss={val_loss:.4f}")
+
+# Save the loss history (training + validation).
+loss_data = pd.DataFrame(loss_rows)
+loss_data.to_csv("loss.csv", index=False)
+
+# 6. Predict the test set and compute macro precision.
+y_pred = mlp.predict(X_test_tfidf)
+precision = precision_score(y_test, y_pred, average="macro", zero_division=0)
+
+# Save predictions.csv.
+predictions_data = pd.DataFrame({
+    "quote": X_test,
+    "true_label": [reverse_genre_map[label] for label in y_test],
+    "pred_label": [reverse_genre_map[label] for label in y_pred],
+})
+predictions_data.to_csv("predictions.csv", index=False, encoding="utf-8")
+
+print(f"测试集macro precision: {precision:.4f}")
diff --git a/20260609.5.py b/20260609.5.py
new file mode 100644
index 0000000..6c252a4
--- /dev/null
+++ b/20260609.5.py
@@ -0,0 +1,64 @@
+"""Plot the loss curve, the predicted-genre bar chart and a quote word cloud.
+
+Reads ``loss.csv``, ``predictions.csv`` and ``my_labels.csv`` and saves the
+figures under ``images/``.
+"""
+
+import os
+
+import matplotlib.pyplot as plt
+import pandas as pd
+from wordcloud import WordCloud
+
+plt.rcParams["font.sans-serif"] = ["SimHei"]
+plt.rcParams["axes.unicode_minus"] = False
+
+# savefig does not create missing directories.
+os.makedirs("images", exist_ok=True)
+
+# --- Loss curve ---
+loss_df = pd.read_csv("loss.csv")
+plt.figure(figsize=(10, 5))
+plt.plot(loss_df["epoch"], loss_df["train_loss"], label="训练集loss", color="#2980b9")
+plt.plot(loss_df["epoch"], loss_df["val_loss"], label="验证集loss", color="#e74c3c")
+plt.title("MLP模型训练Loss曲线", fontsize=14)
+plt.xlabel("Epoch")
+plt.ylabel("Loss值")
+plt.legend()
+plt.grid(alpha=0.3)
+plt.tight_layout()
+plt.savefig("images/loss_curve.png", dpi=300)
+plt.show()
+
+# --- Predicted-genre distribution ---
+pred_df = pd.read_csv("predictions.csv")
+genre_counts = pred_df["pred_label"].value_counts()
+# Fixed genre order; genres that were never predicted get a count of 0.
+genre_order = ["剧情", "喜剧", "科幻", "悬疑", "动作", "爱情", "动画", "犯罪", "奇幻", "纪录"]
+genre_counts = genre_counts.reindex(genre_order, fill_value=0)
+
+plt.figure(figsize=(12, 6))
+genre_counts.plot(kind="bar", color="#3498db")
+plt.title("测试集10个类别的预测分布", fontsize=14)
+plt.xlabel("电影类别")
+plt.ylabel("预测数量")
+plt.xticks(rotation=45)
+plt.tight_layout()
+plt.savefig("images/category_bar.png", dpi=300)
+plt.show()
+
+# --- Word cloud of all labelled quotes ---
+df = pd.read_csv("my_labels.csv")
+all_quotes = " ".join(df["quote"].astype(str))
+
+wordcloud = WordCloud(
+    font_path="msyh.ttc",  # path to a Chinese font
+    width=800, height=400, background_color="white"
+).generate(all_quotes)
+
+plt.figure(figsize=(10, 5))
+plt.imshow(wordcloud, interpolation="bilinear")
+plt.axis("off")
+plt.tight_layout()
+plt.savefig("images/wordcloud.png", dpi=300)
+plt.show()