"""Download a fixed list of Douban movie posters and index what was saved.

Restored from the patch hunk that added ``26331,09test5.py``. Running the
file as a script downloads every poster listed in ``poster_urls`` into
``posters/`` and writes ``posters/info.json`` describing the saved files.
"""
import json
import os

poster_urls = [
    {'rank': 1, 'title': '肖申克的救赎', 'url': 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p480747492.jpg'},
    {'rank': 2, 'title': '霸王别姬', 'url': 'https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2911205318.jpg'},
    {'rank': 3, 'title': '泰坦尼克号', 'url': 'https://img9.doubanio.com/view/photo/s_ratio_poster/public/p457760035.jpg'},
]

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}


def fetch_image(url, timeout=10):
    """Return the raw bytes served at *url*.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On connection problems or when the server
            answers with an HTTP error status.
    """
    # Imported here so the rest of the module can be imported (and tested)
    # without the third-party dependency being installed.
    import requests

    response = requests.get(url, headers=headers, timeout=timeout)
    # Without this check an error page (403/404/418 ...) would be written to
    # disk as a ".jpg" and reported as a successful download.
    response.raise_for_status()
    return response.content


def download_posters(posters, out_dir='posters', fetch=fetch_image):
    """Download each poster into *out_dir* and describe the files written.

    Args:
        posters: Iterable of dicts with the keys ``rank``, ``title`` and
            ``url``.
        out_dir: Directory that receives the images; created when missing.
        fetch: Callable taking a URL and returning the image bytes. It is
            expected to raise ``OSError`` (or a subclass) on failure.

    Returns:
        A list with one dict per poster saved successfully, holding ``rank``,
        ``title``, ``filename`` and ``size`` (in bytes). Posters that failed
        are reported on stdout and left out of the list.
    """
    os.makedirs(out_dir, exist_ok=True)

    saved = []
    for info in posters:
        filename = f"{out_dir}/{info['rank']}_{info['title']}.jpg"
        try:
            image_data = fetch(info['url'])
            with open(filename, 'wb') as f:
                f.write(image_data)
        except OSError as e:
            # requests' exceptions derive from OSError, so this covers both
            # download and file-system failures while letting programming
            # errors surface. One bad poster must not abort the others.
            print(f'下载失败 {info["title"]}: {e}')
            continue

        saved.append({
            'rank': info['rank'],
            'title': info['title'],
            'filename': filename,
            'size': len(image_data),
        })
        print(f'已保存: {filename} ({len(image_data)} bytes)')

    return saved


def main(out_dir='posters'):
    """Download all configured posters and write ``info.json`` next to them."""
    saved_info = download_posters(poster_urls, out_dir=out_dir)

    with open(f'{out_dir}/info.json', 'w', encoding='utf-8') as f:
        json.dump(saved_info, f, ensure_ascii=False, indent=2)

    print(f'\n图片信息已保存到 {out_dir}/info.json')


if __name__ == '__main__':
    main()
# Restored from the patch hunk that added ``26331,09test6.py``: scrape the
# first ten entries of the Douban Top 250 list and export them as CSV/JSON.
import csv
import json
import os
import re
import time

DOUBAN_TOP250_URL = 'https://movie.douban.com/top250'

# NOTE(review): the patterns in the received patch had lost their HTML tags
# (e.g. the rating pattern was reduced to ``]*>(\d+\.?\d*)``). The tag and
# class names below are reconstructed from Douban's list markup -- confirm
# them against the live page.
_ITEM_MARKER = '<div class="item">'
# "[^<&]+" rejects the alternate-title spans, which start with "&nbsp;/&nbsp;".
_TITLE_RE = re.compile(r'<span class="title">([^<&]+)</span>')
_RATING_RE = re.compile(r'<span class="rating_num"[^>]*>(\d+\.?\d*)</span>')
_QUOTE_RE = re.compile(
    r'(?:<span class="inq"[^>]*>|<p class="quote"[^>]*>\s*<span>)([^<]+)</span>'
)


def _fetch_top250_html():
    """Download the first page of the Top 250 list and return its HTML."""
    # Imported here so the parsing/export helpers remain importable without
    # the third-party dependency.
    import requests

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html,application/xhtml+xml',
    }
    response = requests.get(DOUBAN_TOP250_URL, headers=headers, timeout=10)
    # Fail loudly on an error response instead of parsing an error page.
    response.raise_for_status()
    return response.text


def _parse_movies(html, limit=10):
    """Extract up to *limit* movies from list-page HTML, one per item block."""
    movies = []
    # Each movie is parsed from its own chunk so that a movie without a quote
    # (or rating) cannot shift the values of the movies that follow it.
    for chunk in html.split(_ITEM_MARKER)[1:]:
        titles = (t.strip() for t in _TITLE_RE.findall(chunk))
        title = next((t for t in titles if t and not t.startswith('/')), '')
        rating = _RATING_RE.search(chunk)
        quote = _QUOTE_RE.search(chunk)
        movies.append({
            'rank': len(movies) + 1,
            'title': title,
            'rating': rating.group(1) if rating else '',
            'quote': quote.group(1).strip() if quote else '',
        })
        if len(movies) == limit:
            break
    return movies


def crawl_douban_top10(html=None):
    """Return the first ten movies of the Douban Top 250 list.

    Args:
        html: Optional HTML of the list page. When omitted the page is
            downloaded; passing it allows parsing without network access.

    Returns:
        A list of at most ten dicts with the keys ``rank`` (int, 1-based),
        ``title``, ``rating`` and ``quote`` (strings; ``''`` when the page
        has no value for that movie).

    Raises:
        requests.RequestException: If the page has to be downloaded and the
            request fails or returns an HTTP error status.
    """
    print('开始爬取豆瓣电影Top10...')

    if html is None:
        html = _fetch_top250_html()
    return _parse_movies(html, limit=10)


def save_to_csv(movies, filename):
    """Write *movies* to *filename* as UTF-8 CSV with a header row."""
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['rank', 'title', 'rating', 'quote'])
        writer.writeheader()
        writer.writerows(movies)
    print(f'CSV已保存: {filename}')


def save_to_json(movies, filename):
    """Write *movies* to *filename* as indented UTF-8 JSON."""
    with open(filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps the Chinese text readable in the file.
        json.dump(movies, f, ensure_ascii=False, indent=2)
    print(f'JSON已保存: {filename}')


def main():
    """Crawl the list, export it to ``douban_output/`` and print a summary."""
    os.makedirs('douban_output', exist_ok=True)

    movies = crawl_douban_top10()

    save_to_csv(movies, 'douban_output/movies.csv')
    save_to_json(movies, 'douban_output/movies.json')

    print('\n爬取结果:')
    print('-' * 50)
    for m in movies:
        quote_text = f'「{m["quote"]}」' if m['quote'] else ''
        print(f"{m['rank']}. {m['title']} - 评分: {m['rating']} {quote_text}")

    print('\n完成!')


if __name__ == '__main__':
    main()