Files
task-2-4-regular-expression/2604022509165013.py
2026-04-02 16:02:17 +08:00

135 lines
4.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
from bs4 import BeautifulSoup
import json
import csv
import os
# HTTP headers passed to every requests.get() call in this script.
headers = {
    # Must stay a well-formed browser User-Agent: a space after each ';',
    # a space before '(KHTML', and a Chrome build number that really exists.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}
# Scrape the Douban Top 250 listing page by page (25 entries per request) and
# collect one dict per movie into `movies`, stopping as soon as MAX_MOVIES
# entries have been gathered.
MAX_MOVIES = 10
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled connection blocks forever

movies = []
for start in range(0, 250, 25):
    url = f'https://movie.douban.com/top250?start={start}'
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of parsing an error page into an
    # empty result that only blows up later in the statistics code.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    for item in soup.find_all('div', class_='item'):
        # Rank: int() on a missing or non-numeric value would raise, so fall
        # back to the 1-based position in the collected list.
        # NOTE(review): the fallback assumes items arrive in rank order — confirm.
        rank = len(movies) + 1
        rank_tag = item.find('em')
        if rank_tag is not None:
            try:
                rank = int(rank_tag.text)
            except ValueError:
                pass

        # Primary title: the first <span class="title"> inside the item.
        title_tag = item.find('span', class_='title')
        if title_tag is not None:
            title = title_tag.text
        else:
            title = "未找到标题"
            print(f"在这个item里没找到标题:{item}")

        # Alternative title: drop one leading '/' separator if present.
        other_tag = item.find('span', class_='other')
        other_title = other_tag.text.strip() if other_tag is not None else ""
        if other_title.startswith('/'):
            other_title = other_title[1:].strip()

        rating_tag = item.find('span', class_='rating_num')
        rating = rating_tag.text if rating_tag is not None else "未知评分"

        inq_tag = item.find('span', class_='inq')
        inq = inq_tag.text.strip() if inq_tag is not None else ""

        # Year: when a "playable" marker exists, look among its preceding
        # siblings; otherwise search the whole item.
        playable_tag = item.find('span', class_='playable')
        if playable_tag is not None:
            year_tag = playable_tag.find_previous_sibling('span', class_='year')
        else:
            year_tag = item.find('span', class_='year')
        year = year_tag.text.strip('()') if year_tag is not None else "未知年份"

        # .get() avoids a KeyError on an <img> that has no src attribute.
        img_tag = item.find('img')
        poster_url = img_tag.get('src', '') if img_tag is not None else ""

        movies.append({
            "rank": rank,
            "title": title,
            "en_title": other_title,
            "rating": rating,
            "quote": inq,
            "year": year,
            "poster_url": poster_url,
        })
        if len(movies) >= MAX_MOVIES:
            break

    # Propagate the inner break so no further pages are requested.
    if len(movies) >= MAX_MOVIES:
        break
# Write the collected movie titles to movies.txt, one title per line.
with open("movies.txt", "w", encoding="utf-8") as txt_file:
    txt_file.writelines(entry["title"] + "\n" for entry in movies)
print(" 已保存movies.txt")
# Export the collected movies to movies.csv (utf-8-sig, i.e. BOM-prefixed
# UTF-8): a header row followed by one row per movie. The poster URL is not
# part of the CSV.
csv_columns = ("rank", "title", "en_title", "rating", "quote", "year")
with open("movies.csv", "w", encoding="utf-8-sig", newline="") as csv_file:
    sheet = csv.writer(csv_file)
    sheet.writerow(["排名", "中文名", "英文名", "评分", "简介", "年份"])
    sheet.writerows([entry[column] for column in csv_columns] for entry in movies)
print(" 已保存movies.csv")
# Serialise the complete movie dicts to movies.json; non-ASCII text is written
# verbatim (not \u-escaped) and the output is indented by four spaces.
with open("movies.json", "w", encoding="utf-8") as json_file:
    json.dump(obj=movies, fp=json_file, indent=4, ensure_ascii=False)
print(" 已保存movies.json")
# Re-read movies.csv, keep the rows whose rating is strictly above 9.5, print
# them, and save them to high_rating.csv using the same columns as the input.
high_rating_movies = []
with open("movies.csv", "r", encoding="utf-8-sig") as f:
    reader = csv.DictReader(f)
    # Take the header from the reader rather than from the first matching row,
    # so the output file can still be written when no row qualifies.
    csv_fieldnames = reader.fieldnames or []
    for row in reader:
        try:
            score = float(row["评分"])
        except (TypeError, ValueError):
            # Blank or placeholder rating (e.g. "未知评分") cannot be compared.
            continue
        if score > 9.5:
            high_rating_movies.append(row)

print("\n=== 评分高于9.5的电影 ===")
for m in high_rating_movies:
    print(f"{m['排名']} {m['中文名']} {m['评分']}")

with open("high_rating.csv", "w", encoding="utf-8-sig", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=csv_fieldnames)
    writer.writeheader()
    writer.writerows(high_rating_movies)
print(" 已保存high_rating.csv")
# Load movies.json and report the average rating, the highest rating, and
# every movie that shares the highest rating.
with open("movies.json", "r", encoding="utf-8") as f:
    movie_data = json.load(f)

# Pair each movie with its parsed rating once; entries whose rating is not a
# number (e.g. the "未知评分" placeholder) are left out of the statistics.
rated_movies = []
for m in movie_data:
    try:
        rated_movies.append((m, float(m["rating"])))
    except (TypeError, ValueError):
        continue
ratings = [score for _, score in rated_movies]

print("\n=== 统计信息 ===")
if ratings:
    avg_rating = sum(ratings) / len(ratings)
    max_rating = max(ratings)
    top_movies = [m for m, score in rated_movies if score == max_rating]
    print(f"前10部电影平均分{avg_rating:.2f}")
    print(f"最高评分:{max_rating}")
    for m in top_movies:
        print(f"评分最高的电影:{m['title']}")
else:
    # Nothing to average: avoid ZeroDivisionError / max() on an empty list.
    avg_rating = None
    max_rating = None
    top_movies = []
    print("没有可用于统计的评分数据")
# Download each movie's poster into movie_posters/ as "<rank>_<title>.jpg".
# A failure for one poster is reported and does not stop the remaining ones.
poster_folder = "movie_posters"
os.makedirs(poster_folder, exist_ok=True)

# Path separators and characters that Windows rejects in file names; a title
# containing any of them would otherwise yield an invalid or misplaced path.
unsafe_filename_chars = '\\/:*?"<>|'

for movie in movies:
    if not movie["poster_url"]:
        continue
    try:
        img_resp = requests.get(movie["poster_url"], headers=headers, timeout=10)
        # Do not save an HTTP error body under a .jpg name.
        img_resp.raise_for_status()
        safe_title = "".join(
            "_" if ch in unsafe_filename_chars else ch for ch in movie["title"]
        )
        filename = os.path.join(poster_folder, f"{movie['rank']}_{safe_title}.jpg")
        with open(filename, "wb") as f:
            f.write(img_resp.content)
        print(f" 已下载海报:{movie['title']}")
    except (requests.RequestException, OSError) as e:
        # Network/HTTP failures and file-system errors are reported per poster.
        print(f" 海报下载失败:{movie['title']},错误:{e}")

print("\n 所有任务全部完成!")