上传文件至 /

This commit is contained in:
2026-06-09 11:24:34 +08:00
parent b1698ecc3e
commit 5c02d05021
4 changed files with 166 additions and 0 deletions

65
1.py Normal file
View File

@@ -0,0 +1,65 @@
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
# Maps each integer class label to its display name; a name's position in the
# list below is its label id.
genre_dict = dict(enumerate([
    "剧情",
    "喜剧",
    "科幻",
    "悬疑",
    "动作",
    "爱情",
    "动画",
    "犯罪",
    "奇幻",
    "纪录",
]))
# Number of genre classes known to this script.
num_classes = len(genre_dict)
def load_data(file_path="movie_data.csv"):
    """Load texts and integer labels from a CSV file.

    The file must provide a ``text`` column and a ``label`` column. Rows in
    which either value is missing are skipped: without that, ``astype(str)``
    would turn a missing text into the literal string "nan" and
    ``astype(int)`` would raise on a missing label.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists; texts are
        ``str`` and labels are ``int``.

    Raises:
        KeyError: If the ``text`` or ``label`` column is absent.
    """
    df = pd.read_csv(file_path)
    # Drop incomplete rows before the type conversions below.
    df = df.dropna(subset=["text", "label"])
    texts = df["text"].astype(str).tolist()
    labels = df["label"].astype(int).tolist()
    return texts, labels
def text_feature_extraction(texts):
    """Fit a TF-IDF vectorizer on ``texts`` and encode them.

    Args:
        texts: The raw documents to vectorize.

    Returns:
        A ``(features, vectorizer)`` tuple: the matrix returned by
        ``fit_transform`` and the fitted vectorizer, so new text can be
        encoded the same way later.
    """
    # NOTE(review): stop_words="english" is configured here, but the sample
    # text at the bottom of this script is Chinese -- confirm the language of
    # the dataset and whether it is pre-tokenized.
    tfidf_options = {
        "max_features": 10000,
        "stop_words": "english",
        "ngram_range": (1, 2),  # unigrams and bigrams
    }
    vectorizer = TfidfVectorizer(**tfidf_options)
    return vectorizer.fit_transform(texts), vectorizer
def train_and_evaluate(features, labels):
    """Train a LinearSVC on a stratified 80/20 split and print test metrics.

    Args:
        features: Feature matrix, one row per sample.
        labels: Class label for each row of ``features``.

    Returns:
        The fitted ``LinearSVC`` model.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42, stratify=labels
    )
    model = LinearSVC(random_state=42, max_iter=10000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    print(f"测试集准确率: {acc:.4f}")
    print("\n分类报告:")
    # Pass the label ids explicitly so each row of the report is paired with
    # the right genre name. With target_names alone, classification_report
    # raises ValueError whenever the split contains fewer distinct classes
    # than there are names.
    class_ids = sorted(genre_dict)
    class_names = [genre_dict[class_id] for class_id in class_ids]
    print(
        classification_report(
            y_test, y_pred, labels=class_ids, target_names=class_names
        )
    )
    return model
def predict_genre(model, vectorizer, new_text):
    """Return the genre name predicted for a single piece of text.

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        new_text: The text to classify.

    Returns:
        The ``genre_dict`` entry for the predicted label.
    """
    # The vectorizer works on a batch, so wrap the text in a one-item list.
    encoded = vectorizer.transform([new_text])
    predictions = model.predict(encoded)
    return genre_dict[predictions[0]]
if __name__ == "__main__":
    # Pipeline: load the dataset, vectorize it, then train and evaluate.
    texts, labels = load_data()
    features, vectorizer = text_feature_extraction(texts)
    model = train_and_evaluate(features, labels)

    # Demo: classify one hand-written synopsis with the trained model.
    sample_text = "一个孤独的科学家发明了时间机器,却在穿梭时空的过程中陷入了悖论..."
    print(f"\n示例文本: {sample_text}")
    predicted_genre = predict_genre(model, vectorizer, sample_text)
    print(f"预测类型: {predicted_genre}")

43
2.py Normal file
View File

@@ -0,0 +1,43 @@
import requests
from bs4 import BeautifulSoup
import json
import time
# Scrape rank, title, cast and one-line quote for the first 50 entries of the
# Douban Top250 list and save them to movies.json.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
movies = []
# Each Top250 page lists 25 films, so the first 50 take two pages
# (start=0 and start=25).
for page in range(2):
    url = f"https://movie.douban.com/top250?start={page*25}"
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page into zero
    # items and writing an empty result file.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all("div", class_="item")
    for idx, item in enumerate(items):
        rank = page * 25 + idx + 1
        # Film title
        title = item.find("span", class_="title").text.strip()
        # Cast: taken from the first line of the info paragraph.
        info = item.find("div", class_="bd").find("p", class_="").text.strip()
        first_line = info.split("\n")[0]
        # Test the marker on the same line that gets split; if it is absent
        # there, the whole line would otherwise be stored as the cast.
        if "主演:" in first_line:
            actors = first_line.split("主演:")[-1].strip()
        else:
            actors = "未知"
        # One-line quote (not every entry has one)
        quote_tag = item.find("span", class_="inq")
        quote = quote_tag.text.strip() if quote_tag else "无短评"
        movies.append({
            "rank": rank,
            "title": title,
            "actors": actors,
            "quote": quote,
        })
    # Pause between pages to be polite and avoid tripping anti-scraping limits.
    time.sleep(1)
# Save the results to movies.json
with open("movies.json", "w", encoding="utf-8") as f:
    json.dump(movies, f, ensure_ascii=False, indent=2)
print("爬取完成,数据已保存到 movies.json")

32
3.py Normal file
View File

@@ -0,0 +1,32 @@
import requests
from bs4 import BeautifulSoup
import csv
import time
# Scrape title, rating and one-line quote from the first page of the Douban
# Top250 list and save them to movies.csv.

# 1. Send the request
url = 'https://movie.douban.com/top250'
headers = {'User-Agent': 'Mozilla/5.0...'}
# A timeout keeps a stalled connection from hanging the script forever.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of writing a header-only CSV.
response.raise_for_status()

# 2. Parse the data
soup = BeautifulSoup(response.text, 'lxml')
movies = []
for item in soup.select('.item'):
    title = item.select_one('.title').get_text()
    rating = item.select_one('.rating_num').get_text()
    # Look the quote element up once; not every entry has one.
    # NOTE(review): the movies.csv committed next to this script has an empty
    # quote on every row -- verify that '.inq' still matches the page markup.
    quote_tag = item.select_one('.inq')
    quote = quote_tag.get_text() if quote_tag is not None else ''
    movies.append({
        'title': title.strip(),
        'rating': rating,
        'quote': quote,
    })

# 3. Save as CSV
with open('movies.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['title', 'rating', 'quote'])
    writer.writeheader()
    writer.writerows(movies)
print(f'已保存 {len(movies)} 部电影到 movies.csv')

26
movies.csv Normal file
View File

@@ -0,0 +1,26 @@
title,rating,quote
肖申克的救赎,9.7,
霸王别姬,9.6,
泰坦尼克号,9.5,
阿甘正传,9.5,
千与千寻,9.4,
美丽人生,9.5,
星际穿越,9.4,
这个杀手不太冷,9.4,
盗梦空间,9.4,
楚门的世界,9.4,
辛德勒的名单,9.5,
忠犬八公的故事,9.4,
海上钢琴师,9.3,
疯狂动物城,9.3,
三傻大闹宝莱坞,9.2,
机器人总动员,9.3,
放牛班的春天,9.3,
无间道,9.3,
控方证人,9.6,
寻梦环游记,9.1,
大话西游之大圣娶亲,9.2,
熔炉,9.3,
触不可及,9.3,
教父,9.3,
末代皇帝,9.3,
1 title rating quote
2 肖申克的救赎 9.7
3 霸王别姬 9.6
4 泰坦尼克号 9.5
5 阿甘正传 9.5
6 千与千寻 9.4
7 美丽人生 9.5
8 星际穿越 9.4
9 这个杀手不太冷 9.4
10 盗梦空间 9.4
11 楚门的世界 9.4
12 辛德勒的名单 9.5
13 忠犬八公的故事 9.4
14 海上钢琴师 9.3
15 疯狂动物城 9.3
16 三傻大闹宝莱坞 9.2
17 机器人总动员 9.3
18 放牛班的春天 9.3
19 无间道 9.3
20 控方证人 9.6
21 寻梦环游记 9.1
22 大话西游之大圣娶亲 9.2
23 熔炉 9.3
24 触不可及 9.3
25 教父 9.3
26 末代皇帝 9.3