From 5c02d05021198da00ad053f0e516ca281cad0731 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=BD=97=E8=89=BA=E6=AC=A3?= <2509165020@student.example.com>
Date: Tue, 9 Jun 2026 11:24:34 +0800
Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E4=BB=B6=E8=87=B3?=
 =?UTF-8?q?=20/?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 1.py       | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2.py       |  77 +++++++++++++++++++++++++++++++++++
 3.py       |  43 ++++++++++++++++++++
 movies.csv |  26 ++++++++++++
 4 files changed, 262 insertions(+)
 create mode 100644 1.py
 create mode 100644 2.py
 create mode 100644 3.py
 create mode 100644 movies.csv

diff --git a/1.py b/1.py
new file mode 100644
--- /dev/null
+++ b/1.py
@@ -0,0 +1,116 @@
+"""Train a TF-IDF + LinearSVC movie-genre classifier and print its metrics."""
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+warnings.filterwarnings("ignore", category=UserWarning)
+
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.svm import LinearSVC
+from sklearn.metrics import accuracy_score, classification_report
+
+# Integer class label -> genre name used in reports and predictions.
+genre_dict = {
+    0: "剧情",
+    1: "喜剧",
+    2: "科幻",
+    3: "悬疑",
+    4: "动作",
+    5: "爱情",
+    6: "动画",
+    7: "犯罪",
+    8: "奇幻",
+    9: "纪录"
+}
+num_classes = len(genre_dict)
+
+
+def load_data(file_path="movie_data.csv"):
+    """Read the training CSV and return ``(texts, labels)`` as two lists.
+
+    The file must provide a ``text`` column and an integer ``label`` column.
+
+    Raises:
+        ValueError: if either required column is missing.
+    """
+    df = pd.read_csv(file_path)
+    missing = {"text", "label"} - set(df.columns)
+    if missing:
+        raise ValueError(f"{file_path} is missing column(s): {sorted(missing)}")
+    # Drop incomplete rows: astype(str) would turn a missing text into the
+    # literal string "nan", and astype(int) would raise on a missing label.
+    df = df.dropna(subset=["text", "label"])
+    texts = df["text"].astype(str).tolist()
+    labels = df["label"].astype(int).tolist()
+    return texts, labels
+
+
+def text_feature_extraction(texts, analyzer="char"):
+    """Fit a TF-IDF vectorizer on ``texts``.
+
+    Args:
+        texts: iterable of raw documents.
+        analyzer: ``"char"`` (default) builds character 1-2 grams, which
+            needs no word segmentation; ``"word"`` restores word-level
+            tokens with the English stop-word list.
+
+    Returns:
+        ``(features, vectorizer)``: the TF-IDF matrix and the fitted
+        vectorizer needed to transform new text later.
+    """
+    # The sample input in __main__ is Chinese, which has no spaces between
+    # words: the word analyzer keeps a whole clause as one token, so unseen
+    # sentences share no features with the training vocabulary.
+    vectorizer = TfidfVectorizer(
+        max_features=10000,
+        analyzer=analyzer,
+        stop_words="english" if analyzer == "word" else None,
+        ngram_range=(1, 2)
+    )
+    features = vectorizer.fit_transform(texts)
+    return features, vectorizer
+
+
+def train_and_evaluate(features, labels):
+    """Train a LinearSVC on an 80/20 stratified split and print test metrics.
+
+    Returns:
+        The fitted ``LinearSVC`` model.
+    """
+    X_train, X_test, y_train, y_test = train_test_split(
+        features, labels, test_size=0.2, random_state=42, stratify=labels
+    )
+    model = LinearSVC(random_state=42, max_iter=10000)
+    model.fit(X_train, y_train)
+    y_pred = model.predict(X_test)
+    acc = accuracy_score(y_test, y_pred)
+    print(f"测试集准确率: {acc:.4f}")
+    print("\n分类报告:")
+    # Pass labels explicitly: otherwise classification_report raises when the
+    # classes present in the split do not match the number of target names.
+    print(classification_report(
+        y_test,
+        y_pred,
+        labels=list(genre_dict),
+        target_names=list(genre_dict.values()),
+        zero_division=0,
+    ))
+    return model
+
+
+def predict_genre(model, vectorizer, new_text):
+    """Return the genre name predicted for a single piece of text."""
+    new_feature = vectorizer.transform([new_text])
+    # predict() yields a NumPy integer; convert it to a plain int dict key.
+    pred_label = int(model.predict(new_feature)[0])
+    return genre_dict[pred_label]
+
+
+if __name__ == "__main__":
+    texts, labels = load_data()
+    features, vectorizer = text_feature_extraction(texts)
+    model = train_and_evaluate(features, labels)
+    sample_text = "一个孤独的科学家发明了时间机器,却在穿梭时空的过程中陷入了悖论..."
+    print(f"\n示例文本: {sample_text}")
+    print(f"预测类型: {predict_genre(model, vectorizer, sample_text)}")
diff --git a/2.py b/2.py
new file mode 100644
--- /dev/null
+++ b/2.py
@@ -0,0 +1,77 @@
+"""Scrape the first 50 entries of Douban Top 250 and save them to movies.json."""
+import requests
+from bs4 import BeautifulSoup
+import json
+import time
+
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+}
+
+PAGE_SIZE = 25  # entries listed per Top 250 page
+PAGE_COUNT = 2  # 2 pages x 25 entries = first 50 movies (start=0 and start=25)
+
+
+def parse_item(item, rank):
+    """Build a movie dict from one ``div.item`` node (``None`` if untitled)."""
+    title_tag = item.find("span", class_="title")
+    if title_tag is None:
+        return None
+
+    # Cast line; fall back to "未知" when the block or the marker is absent.
+    actors = "未知"
+    bd = item.find("div", class_="bd")
+    info_tag = bd.find("p", class_="") if bd else None
+    if info_tag is not None:
+        first_line = info_tag.text.strip().split("\n")[0]
+        if "主演:" in first_line:
+            actors = first_line.split("主演:")[-1].strip()
+
+    # One-line quote; a placeholder is stored when the entry has none.
+    quote_tag = item.find("span", class_="inq")
+    quote = quote_tag.text.strip() if quote_tag else "无短评"
+
+    return {
+        "rank": rank,
+        "title": title_tag.text.strip(),
+        "actors": actors,
+        "quote": quote
+    }
+
+
+def fetch_page(start):
+    """Download one listing page and return its ``div.item`` nodes.
+
+    Raises:
+        requests.RequestException: on network failure, timeout or HTTP error.
+    """
+    url = f"https://movie.douban.com/top250?start={start}"
+    # A timeout keeps the script from hanging; raise_for_status stops an
+    # error response from being parsed as an empty result.
+    response = requests.get(url, headers=headers, timeout=10)
+    response.raise_for_status()
+    soup = BeautifulSoup(response.text, "html.parser")
+    return soup.find_all("div", class_="item")
+
+
+def main():
+    """Scrape every page, then write the collected movies to movies.json."""
+    movies = []
+    for page in range(PAGE_COUNT):
+        items = fetch_page(page * PAGE_SIZE)
+        for idx, item in enumerate(items):
+            movie = parse_item(item, page * PAGE_SIZE + idx + 1)
+            if movie is not None:
+                movies.append(movie)
+
+        # Pause between pages to stay polite and avoid anti-scraping blocks.
+        time.sleep(1)
+
+    with open("movies.json", "w", encoding="utf-8") as f:
+        json.dump(movies, f, ensure_ascii=False, indent=2)
+
+    print("爬取完成,数据已保存到 movies.json")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/3.py b/3.py
new file mode 100644
--- /dev/null
+++ b/3.py
@@ -0,0 +1,43 @@
+"""Scrape the first page of Douban Top 250 and save it to movies.csv."""
+import requests
+from bs4 import BeautifulSoup
+import csv
+import time
+
+# 1. Send the request
+url = 'https://movie.douban.com/top250'
+# A complete browser User-Agent; the previous 'Mozilla/5.0...' value was a
+# truncated placeholder rather than a real header.
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+}
+# Time out instead of hanging, and fail loudly on an HTTP error so an error
+# page is never parsed into an empty CSV.
+response = requests.get(url, headers=headers, timeout=10)
+response.raise_for_status()
+
+# 2. Parse the data
+soup = BeautifulSoup(response.text, 'lxml')
+movies = []
+
+for item in soup.select('.item'):
+    title_tag = item.select_one('.title')
+    rating_tag = item.select_one('.rating_num')
+    if title_tag is None or rating_tag is None:
+        # Not a complete movie entry; skip it rather than crash on None.
+        continue
+    quote_tag = item.select_one('.inq')
+
+    movies.append({
+        'title': title_tag.get_text().strip(),
+        'rating': rating_tag.get_text().strip(),
+        'quote': quote_tag.get_text().strip() if quote_tag else ''
+    })
+
+# 3. Save as CSV
+with open('movies.csv', 'w', newline='', encoding='utf-8') as f:
+    writer = csv.DictWriter(f, fieldnames=['title', 'rating', 'quote'])
+    writer.writeheader()
+    writer.writerows(movies)
+
+print(f'已保存 {len(movies)} 部电影到 movies.csv')
diff --git a/movies.csv b/movies.csv
new file mode 100644
index 0000000..2b46e76
--- /dev/null
+++ b/movies.csv
@@ -0,0 +1,26 @@
+title,rating,quote
+肖申克的救赎,9.7,
+霸王别姬,9.6,
+泰坦尼克号,9.5,
+阿甘正传,9.5,
+千与千寻,9.4,
+美丽人生,9.5,
+星际穿越,9.4,
+这个杀手不太冷,9.4,
+盗梦空间,9.4,
+楚门的世界,9.4,
+辛德勒的名单,9.5,
+忠犬八公的故事,9.4,
+海上钢琴师,9.3,
+疯狂动物城,9.3,
+三傻大闹宝莱坞,9.2,
+机器人总动员,9.3,
+放牛班的春天,9.3,
+无间道,9.3,
+控方证人,9.6,
+寻梦环游记,9.1,
+大话西游之大圣娶亲,9.2,
+熔炉,9.3,
+触不可及,9.3,
+教父,9.3,
+末代皇帝,9.3,