上传文件至 /
This commit is contained in:
65
1.py
Normal file
65
1.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
import warnings
|
||||||
|
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||||
|
warnings.filterwarnings("ignore", category=UserWarning)
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.svm import LinearSVC
|
||||||
|
from sklearn.metrics import accuracy_score, classification_report
|
||||||
|
|
||||||
|
# Maps an integer class label to its genre name. The names are listed in
# label order, so enumerate() assigns them the labels 0 through 9.
genre_dict = dict(
    enumerate(
        [
            "剧情",
            "喜剧",
            "科幻",
            "悬疑",
            "动作",
            "爱情",
            "动画",
            "犯罪",
            "奇幻",
            "纪录",
        ]
    )
)

# Number of genre classes defined above.
num_classes = len(genre_dict)
|
||||||
|
|
||||||
|
def load_data(file_path="movie_data.csv"):
    """Read the dataset CSV and return its texts and labels as lists.

    Args:
        file_path: Path of the CSV file to read. It must contain a "text"
            column and a "label" column.

    Returns:
        A ``(texts, labels)`` tuple: the "text" column converted to ``str``
        and the "label" column converted to ``int``, each as a plain list.
    """
    frame = pd.read_csv(file_path)
    text_column = frame["text"].astype(str)
    label_column = frame["label"].astype(int)
    return text_column.tolist(), label_column.tolist()
|
||||||
|
|
||||||
|
def text_feature_extraction(texts):
    """Fit a TF-IDF vectorizer on ``texts`` and return the features with it.

    The vectorizer keeps at most 10000 features, uses scikit-learn's built-in
    English stop-word list, and extracts unigrams and bigrams.

    Args:
        texts: Iterable of raw text documents.

    Returns:
        A ``(features, vectorizer)`` tuple: the result of ``fit_transform``
        on ``texts`` and the fitted ``TfidfVectorizer``.
    """
    # NOTE(review): the stop-word list is English while the sample text used
    # by this script is Chinese -- confirm the dataset language and whether
    # the default tokenization is appropriate for it.
    tfidf_options = {
        "max_features": 10000,
        "stop_words": "english",
        "ngram_range": (1, 2),
    }
    vectorizer = TfidfVectorizer(**tfidf_options)
    return vectorizer.fit_transform(texts), vectorizer
|
||||||
|
|
||||||
|
def train_and_evaluate(features, labels):
    """Train a LinearSVC on a stratified 80/20 split and print test metrics.

    Prints the test-set accuracy followed by a per-genre classification
    report.

    Args:
        features: Feature matrix with one row per sample.
        labels: Class labels aligned with the rows of ``features``; also used
            to stratify the split.

    Returns:
        The fitted ``LinearSVC`` model.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42, stratify=labels
    )

    model = LinearSVC(random_state=42, max_iter=10000)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"测试集准确率: {acc:.4f}")
    print("\n分类报告:")
    # Pass the label ids together with their names. Without ``labels``,
    # classification_report infers the classes from the data and raises
    # ValueError when their count differs from the number of target names,
    # which happens whenever a genre is absent from the test split.
    print(
        classification_report(
            y_test,
            y_pred,
            labels=list(genre_dict.keys()),
            target_names=list(genre_dict.values()),
        )
    )
    return model
|
||||||
|
|
||||||
|
def predict_genre(model, vectorizer, new_text):
    """Return the genre name predicted for a single piece of text.

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        new_text: The text to classify.

    Returns:
        The ``genre_dict`` entry for the predicted label.
    """
    # The vectorizer expects a collection of documents, so wrap the text.
    predictions = model.predict(vectorizer.transform([new_text]))
    return genre_dict[predictions[0]]
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Load the dataset, vectorize it, then train and report on the classifier.
    texts, labels = load_data()
    features, vectorizer = text_feature_extraction(texts)
    model = train_and_evaluate(features, labels)

    # Classify one hard-coded sample text and show the predicted genre.
    sample_text = "一个孤独的科学家发明了时间机器,却在穿梭时空的过程中陷入了悖论..."
    print(f"\n示例文本: {sample_text}")
    predicted_genre = predict_genre(model, vectorizer, sample_text)
    print(f"预测类型: {predicted_genre}")
|
||||||
43
2.py
Normal file
43
2.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
|
||||||
|
# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Number of movies on each Top250 listing page.
PAGE_SIZE = 25
# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10


def parse_actors(info):
    """Extract the cast from a movie's info text.

    Only the first line of ``info`` is inspected. The "主演" marker is
    accepted with either an ASCII or a full-width colon.

    Args:
        info: Text of the info paragraph (director / cast line first).

    Returns:
        The text after the last "主演" marker on the first line, stripped,
        or "未知" when there is no marker or nothing follows it.
    """
    # Normalize the full-width colon so one marker covers both spellings.
    first_line = info.split("\n")[0].replace("\uff1a", ":")
    _, marker, cast = first_line.rpartition("主演:")
    cast = cast.strip()
    return cast if marker and cast else "未知"


def parse_item(item, rank):
    """Build the record for one movie item tag.

    Args:
        item: The ``div.item`` tag of one movie.
        rank: 1-based position of the movie in the list.

    Returns:
        A dict with "rank", "title", "actors" and "quote", or ``None`` when
        the item has no title element.
    """
    title_tag = item.find("span", class_="title")
    if title_tag is None:
        return None

    # Cast information lives in the class-less paragraph of the "bd" block.
    body = item.find("div", class_="bd")
    info_tag = body.find("p", class_="") if body is not None else None
    info = info_tag.text.strip() if info_tag is not None else ""

    # Not every movie has a one-line quote.
    quote_tag = item.find("span", class_="inq")
    quote = quote_tag.text.strip() if quote_tag is not None else "无短评"

    return {
        "rank": rank,
        "title": title_tag.text.strip(),
        "actors": parse_actors(info),
        "quote": quote,
    }


def fetch_page(page):
    """Download one listing page and return its movie item tags.

    Args:
        page: Zero-based page index.

    Returns:
        The list of ``div.item`` tags found on the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = f"https://movie.douban.com/top250?start={page * PAGE_SIZE}"
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an error status (for example when the request is
    # blocked) rather than silently saving an empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return soup.find_all("div", class_="item")


def main():
    """Scrape the first 50 movies and save them to movies.json."""
    movies = []
    # 25 movies per page, so the first 50 span two pages (start=0 and 25).
    for page in range(2):
        for idx, item in enumerate(fetch_page(page)):
            record = parse_item(item, page * PAGE_SIZE + idx + 1)
            if record is not None:
                movies.append(record)

        # Polite pause between requests to avoid anti-scraping measures.
        time.sleep(1)

    # Save as movies.json.
    with open("movies.json", "w", encoding="utf-8") as f:
        json.dump(movies, f, ensure_ascii=False, indent=2)

    print("爬取完成,数据已保存到 movies.json")


if __name__ == "__main__":
    main()
|
||||||
32
3.py
Normal file
32
3.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import csv
|
||||||
|
import time
|
||||||
|
|
||||||
|
# 1. Send the request.
url = 'https://movie.douban.com/top250'
headers = {'User-Agent': 'Mozilla/5.0...'}
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
# Stop on an HTTP error status instead of writing an empty CSV.
response.raise_for_status()

# 2. Parse the data.
soup = BeautifulSoup(response.text, 'lxml')
movies = []

for item in soup.select('.item'):
    title_tag = item.select_one('.title')
    rating_tag = item.select_one('.rating_num')
    if title_tag is None or rating_tag is None:
        # Skip entries missing a title or rating rather than crashing.
        continue

    # Look the quote element up once; it is optional.
    # NOTE(review): every row of the committed movies.csv has an empty quote,
    # which suggests '.inq' matched nothing -- confirm the selector against
    # the live page markup.
    quote_tag = item.select_one('.inq')

    movies.append({
        'title': title_tag.get_text().strip(),
        'rating': rating_tag.get_text(),
        'quote': quote_tag.get_text() if quote_tag is not None else '',
    })

# 3. Save as CSV.
with open('movies.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['title', 'rating', 'quote'])
    writer.writeheader()
    writer.writerows(movies)

print(f'已保存 {len(movies)} 部电影到 movies.csv')
|
||||||
26
movies.csv
Normal file
26
movies.csv
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
title,rating,quote
|
||||||
|
肖申克的救赎,9.7,
|
||||||
|
霸王别姬,9.6,
|
||||||
|
泰坦尼克号,9.5,
|
||||||
|
阿甘正传,9.5,
|
||||||
|
千与千寻,9.4,
|
||||||
|
美丽人生,9.5,
|
||||||
|
星际穿越,9.4,
|
||||||
|
这个杀手不太冷,9.4,
|
||||||
|
盗梦空间,9.4,
|
||||||
|
楚门的世界,9.4,
|
||||||
|
辛德勒的名单,9.5,
|
||||||
|
忠犬八公的故事,9.4,
|
||||||
|
海上钢琴师,9.3,
|
||||||
|
疯狂动物城,9.3,
|
||||||
|
三傻大闹宝莱坞,9.2,
|
||||||
|
机器人总动员,9.3,
|
||||||
|
放牛班的春天,9.3,
|
||||||
|
无间道,9.3,
|
||||||
|
控方证人,9.6,
|
||||||
|
寻梦环游记,9.1,
|
||||||
|
大话西游之大圣娶亲,9.2,
|
||||||
|
熔炉,9.3,
|
||||||
|
触不可及,9.3,
|
||||||
|
教父,9.3,
|
||||||
|
末代皇帝,9.3,
|
||||||
|
Reference in New Issue
Block a user