上传文件至 /
This commit is contained in:
65
1.py
Normal file
65
1.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import warnings
|
||||
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||
warnings.filterwarnings("ignore", category=UserWarning)
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.svm import LinearSVC
|
||||
from sklearn.metrics import accuracy_score, classification_report
|
||||
|
||||
# Maps the integer class id used in the "label" column to its genre name.
# Ids are assigned by position, starting at 0.
genre_dict = dict(
    enumerate(
        (
            "剧情",
            "喜剧",
            "科幻",
            "悬疑",
            "动作",
            "爱情",
            "动画",
            "犯罪",
            "奇幻",
            "纪录",
        )
    )
)
# Number of genre classes known to the classifier.
num_classes = len(genre_dict)
|
||||
|
||||
def load_data(file_path="movie_data.csv"):
    """Load the training corpus from a CSV file.

    The file must provide a ``text`` column (the document) and a ``label``
    column (the integer class id).

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists: ``texts``
        holds ``str`` values and ``labels`` holds ``int`` values.

    Raises:
        KeyError: If the ``text`` or ``label`` column is missing.
    """
    df = pd.read_csv(file_path)

    missing = [col for col in ("text", "label") if col not in df.columns]
    if missing:
        raise KeyError(f"{file_path} is missing required column(s): {missing}")

    # Rows without a label cannot be cast to int, and a missing text would
    # otherwise be turned into the literal string "nan" by astype(str).
    df = df.dropna(subset=["text", "label"])

    texts = df["text"].astype(str).tolist()
    labels = df["label"].astype(int).tolist()
    return texts, labels
|
||||
|
||||
def text_feature_extraction(texts):
    """Fit a TF-IDF vectorizer on ``texts`` and vectorize them.

    Args:
        texts: Iterable of raw documents.

    Returns:
        A ``(features, vectorizer)`` tuple: the TF-IDF matrix produced for
        ``texts`` and the fitted vectorizer, which can be reused to
        transform new documents.
    """
    # NOTE(review): word-level tokens and English stop words are configured
    # here, while the sample text in this file is Chinese -- confirm that
    # this matches the language of the training data.
    tfidf_options = {
        "max_features": 10000,
        "stop_words": "english",
        "ngram_range": (1, 2),
    }
    vectorizer = TfidfVectorizer(**tfidf_options)
    return vectorizer.fit_transform(texts), vectorizer
|
||||
|
||||
def train_and_evaluate(features, labels):
    """Train a linear SVM on a stratified 80/20 split and print its metrics.

    Args:
        features: Feature matrix with one row per sample.
        labels: Class ids aligned with the rows of ``features``.

    Returns:
        The fitted ``LinearSVC`` model.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42, stratify=labels
    )

    model = LinearSVC(random_state=42, max_iter=10000)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"测试集准确率: {acc:.4f}")
    print("\n分类报告:")
    # Pass the label ids explicitly so the report rows always line up with
    # the genre names; without `labels`, a genre missing from the test split
    # makes the number of classes differ from len(target_names) and raises.
    # zero_division=0 keeps genres with no predictions from emitting warnings.
    print(
        classification_report(
            y_test,
            y_pred,
            labels=list(genre_dict.keys()),
            target_names=list(genre_dict.values()),
            zero_division=0,
        )
    )
    return model
|
||||
|
||||
def predict_genre(model, vectorizer, new_text):
    """Return the genre name predicted for a single piece of text.

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        new_text: The text to classify.

    Returns:
        The ``genre_dict`` entry for the predicted class id.
    """
    # The vectorizer works on a collection of documents, so wrap the text.
    predictions = model.predict(vectorizer.transform([new_text]))
    return genre_dict[predictions[0]]
|
||||
|
||||
if __name__ == "__main__":
    # End-to-end demo: load the corpus, vectorize it, train and evaluate the
    # classifier, then classify one sample synopsis.
    sample_text = "一个孤独的科学家发明了时间机器,却在穿梭时空的过程中陷入了悖论..."

    texts, labels = load_data()
    features, vectorizer = text_feature_extraction(texts)
    model = train_and_evaluate(features, labels)

    print(f"\n示例文本: {sample_text}")
    predicted = predict_genre(model, vectorizer, sample_text)
    print(f"预测类型: {predicted}")
|
||||
43
2.py
Normal file
43
2.py
Normal file
@@ -0,0 +1,43 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
import time
|
||||
|
||||
# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

movies = []
# Douban Top250 shows 25 entries per page, so the first 50 entries need two
# pages (start=0 and start=25).
for page in range(2):
    url = f"https://movie.douban.com/top250?start={page*25}"
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page and
    # silently saving an empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    items = soup.find_all("div", class_="item")
    # Ranks continue across pages: page 0 -> 1..25, page 1 -> 26..50.
    for rank, item in enumerate(items, start=page * 25 + 1):
        # Movie title
        title = item.find("span", class_="title").text.strip()

        # Cast information: taken from the first line of the info paragraph.
        # A missing paragraph falls back to "未知" like a missing cast marker.
        bd_tag = item.find("div", class_="bd")
        info_tag = bd_tag.find("p", class_="") if bd_tag is not None else None
        info = info_tag.text.strip() if info_tag is not None else ""
        first_line = info.split("\n")[0]
        if "主演:" in first_line:
            actors = first_line.split("主演:")[-1].strip()
        else:
            actors = "未知"

        # One-line quote (optional on the page)
        quote_tag = item.find("span", class_="inq")
        quote = quote_tag.text.strip() if quote_tag else "无短评"

        movies.append({
            "rank": rank,
            "title": title,
            "actors": actors,
            "quote": quote
        })

    # Polite delay between page requests to avoid anti-scraping measures.
    time.sleep(1)

# Save the result as movies.json
with open("movies.json", "w", encoding="utf-8") as f:
    json.dump(movies, f, ensure_ascii=False, indent=2)

print("爬取完成,数据已保存到 movies.json")
|
||||
32
3.py
Normal file
32
3.py
Normal file
@@ -0,0 +1,32 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import csv
|
||||
import time
|
||||
|
||||
# 1. Send the request
url = 'https://movie.douban.com/top250'
# NOTE(review): the User-Agent value looks abbreviated -- confirm the server
# accepts it, or use a complete browser User-Agent string.
headers = {'User-Agent': 'Mozilla/5.0...'}
# A timeout keeps a stalled connection from hanging the script forever.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of writing a CSV built from an error page.
response.raise_for_status()

# 2. Parse the data
soup = BeautifulSoup(response.text, 'lxml')
movies = []

for item in soup.select('.item'):
    title = item.select_one('.title').get_text()
    rating = item.select_one('.rating_num').get_text()
    # The quote is optional; look it up once and fall back to an empty string.
    quote_tag = item.select_one('.inq')
    quote = quote_tag.get_text() if quote_tag is not None else ''

    movies.append({
        'title': title.strip(),
        'rating': rating,
        'quote': quote
    })

# 3. Save as CSV
# newline='' lets the csv module control line endings itself.
with open('movies.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['title', 'rating', 'quote'])
    writer.writeheader()
    writer.writerows(movies)

print(f'已保存 {len(movies)} 部电影到 movies.csv')
|
||||
26
movies.csv
Normal file
26
movies.csv
Normal file
@@ -0,0 +1,26 @@
|
||||
title,rating,quote
|
||||
肖申克的救赎,9.7,
|
||||
霸王别姬,9.6,
|
||||
泰坦尼克号,9.5,
|
||||
阿甘正传,9.5,
|
||||
千与千寻,9.4,
|
||||
美丽人生,9.5,
|
||||
星际穿越,9.4,
|
||||
这个杀手不太冷,9.4,
|
||||
盗梦空间,9.4,
|
||||
楚门的世界,9.4,
|
||||
辛德勒的名单,9.5,
|
||||
忠犬八公的故事,9.4,
|
||||
海上钢琴师,9.3,
|
||||
疯狂动物城,9.3,
|
||||
三傻大闹宝莱坞,9.2,
|
||||
机器人总动员,9.3,
|
||||
放牛班的春天,9.3,
|
||||
无间道,9.3,
|
||||
控方证人,9.6,
|
||||
寻梦环游记,9.1,
|
||||
大话西游之大圣娶亲,9.2,
|
||||
熔炉,9.3,
|
||||
触不可及,9.3,
|
||||
教父,9.3,
|
||||
末代皇帝,9.3,
|
||||
|
Reference in New Issue
Block a user