diff --git a/26.03.31 43test4.py b/26.03.31 43test4.py
new file mode 100644
index 0000000..9f30a56
--- /dev/null
+++ b/26.03.31 43test4.py
@@ -0,0 +1,51 @@
+import requests
+import os
+import json
+
+# Sample input: poster URLs hard-coded for the demo (a real crawler would
+# extract them from the page HTML).
+poster_urls = [
+    {'rank': 1, 'title': '肖申克的救赎', 'url': 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p480747492.jpg'},
+    {'rank': 2, 'title': '霸王别姬', 'url': 'https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2911205318.jpg'},
+    {'rank': 3, 'title': '泰坦尼克号', 'url': 'https://img9.doubanio.com/view/photo/s_ratio_poster/public/p457760035.jpg'},
+]
+
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+}
+
+# Create the output directory; exist_ok avoids an error on re-runs.
+os.makedirs('posters', exist_ok=True)
+
+# Download each poster and record what was written.
+saved_info = []
+for info in poster_urls:
+    try:
+        # Fetch the image bytes.
+        response = requests.get(info['url'], headers=headers, timeout=10)
+        # Raise on 4xx/5xx so an error page is never saved as a .jpg.
+        response.raise_for_status()
+        image_data = response.content
+
+        # Write the image to disk.
+        filename = f"posters/{info['rank']}_{info['title']}.jpg"
+        with open(filename, 'wb') as f:
+            f.write(image_data)
+
+        saved_info.append({
+            'rank': info['rank'],
+            'title': info['title'],
+            'filename': filename,
+            'size': len(image_data)
+        })
+        print(f'已保存: {filename} ({len(image_data)} bytes)')
+
+    except (requests.RequestException, OSError) as e:
+        # Best effort: report the failure and move on to the next poster.
+        print(f'下载失败 {info["title"]}: {e}')
+
+# Save the metadata of the successfully downloaded posters as JSON.
+with open('posters/info.json', 'w', encoding='utf-8') as f:
+    json.dump(saved_info, f, ensure_ascii=False, indent=2)
+
+print('\n图片信息已保存到 posters/info.json')
\ No newline at end of file
diff --git a/26.03.31 43test5.py b/26.03.31 43test5.py
new file mode 100644
index 0000000..31ad411
--- /dev/null
+++ b/26.03.31 43test5.py
@@ -0,0 +1,98 @@
+import requests
+import re
+import csv
+import json
+import os
+import time
+
+def crawl_douban_top10():
+    """Fetch the first page of the Douban Top 250 list and extract ten movies.
+
+    Returns:
+        A list of ten dicts with the keys 'rank' (1-based int), 'title',
+        'rating' and 'quote' (strings). A string field is '' when the page
+        yielded fewer matches than needed.
+
+    Raises:
+        requests.RequestException: If the request fails or the server
+            responds with an HTTP error status.
+    """
+
+    print('开始爬取豆瓣电影Top10...')
+
+    # 1. Download the page.
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+        'Accept': 'text/html,application/xhtml+xml',
+    }
+    url = 'https://movie.douban.com/top250'
+
+    response = requests.get(url, headers=headers, timeout=10)
+    # Raise on 4xx/5xx instead of scraping an error page into blank rows.
+    response.raise_for_status()
+    html = response.text
+
+    # 2. Extract the raw fields.
+    # NOTE(review): the tag parts of these three patterns were missing from
+    # the patch text (only the capture groups were left). They are restored
+    # here on the assumption that the page uses Douban's usual
+    # <span class="title|rating_num|inq"> markup - confirm against the page.
+    # Chinese titles ('&' is excluded from the match).
+    title_cn = re.findall(r'<span class="title">([^<&]+)</span>', html)
+    # Ratings.
+    ratings = re.findall(r'<span class="rating_num"[^>]*>(\d+\.?\d*)</span>', html)
+    # One-line quotes.
+    quotes = re.findall(r'<span class="inq">([^<]+)</span>', html)
+
+    # 3. Assemble the records.
+    # Drop entries starting with '/' (English names) before pairing by position.
+    titles = [t for t in title_cn if not t.startswith('/')]
+    # Fields are paired purely by position in their match lists.
+    movies = []
+    for i in range(10):
+        movies.append({
+            'rank': i + 1,
+            'title': titles[i] if i < len(titles) else '',
+            'rating': ratings[i] if i < len(ratings) else '',
+            'quote': quotes[i] if i < len(quotes) else '',
+        })
+
+    return movies
+
+def save_to_csv(movies, filename):
+    """Write the movie dicts to ``filename`` as UTF-8 CSV with a header row."""
+    with open(filename, 'w', encoding='utf-8', newline='') as f:
+        writer = csv.DictWriter(f, fieldnames=['rank', 'title', 'rating', 'quote'])
+        writer.writeheader()
+        writer.writerows(movies)
+    print(f'CSV已保存: {filename}')
+
+def save_to_json(movies, filename):
+    """Write the movie dicts to ``filename`` as indented UTF-8 JSON."""
+    with open(filename, 'w', encoding='utf-8') as f:
+        json.dump(movies, f, ensure_ascii=False, indent=2)
+    print(f'JSON已保存: {filename}')
+
+def main():
+    """Crawl the list, save it as CSV and JSON, and print a summary."""
+    # Create the output directory.
+    os.makedirs('douban_output', exist_ok=True)
+
+    # Crawl the data.
+    movies = crawl_douban_top10()
+
+    # Save the files.
+    save_to_csv(movies, 'douban_output/movies.csv')
+    save_to_json(movies, 'douban_output/movies.json')
+
+    # Show the result.
+    print('\n爬取结果:')
+    print('-' * 50)
+    for m in movies:
+        quote_text = f'「{m["quote"]}」' if m['quote'] else ''
+        print(f"{m['rank']}. {m['title']} - 评分: {m['rating']} {quote_text}")
+
+    print('\n完成!')
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/movies.csv b/movies.csv
new file mode 100644
index 0000000..fee06ef
--- /dev/null
+++ b/movies.csv
@@ -0,0 +1,11 @@
+rank,title,en_title,rating
+1,肖申克的救赎,,9.7
+2,霸王别姬,,9.6
+3,泰坦尼克号,,9.5
+4,阿甘正传,,9.5
+5,千与千寻,,9.4
+6,美丽人生,,9.5
+7,星际穿越,,9.4
+8,这个杀手不太冷,,9.4
+9,盗梦空间,,9.4
+10,楚门的世界,,9.4
diff --git a/movies.json b/movies.json
new file mode 100644
index 0000000..9d82bea
--- /dev/null
+++ b/movies.json
@@ -0,0 +1,72 @@
+[
+  {
+    "rank": 1,
+    "title": "肖申克的救赎",
+    "en_title": "",
+    "rating": "9.7",
+    "quote": ""
+  },
+  {
+    "rank": 2,
+    "title": "霸王别姬",
+    "en_title": "",
+    "rating": "9.6",
+    "quote": ""
+  },
+  {
+    "rank": 3,
+    "title": "泰坦尼克号",
+    "en_title": "",
+    "rating": "9.5",
+    "quote": ""
+  },
+  {
+    "rank": 4,
+    "title": "阿甘正传",
+    "en_title": "",
+    "rating": "9.5",
+    "quote": ""
+  },
+  {
+    "rank": 5,
+    "title": "千与千寻",
+    "en_title": "",
+    "rating": "9.4",
+    "quote": ""
+  },
+  {
+    "rank": 6,
+    "title": "美丽人生",
+    "en_title": "",
+    "rating": "9.5",
+    "quote": ""
+  },
+  {
+    "rank": 7,
+    "title": "星际穿越",
+    "en_title": "",
+    "rating": "9.4",
+    "quote": ""
+  },
+  {
+    "rank": 8,
+    "title": "这个杀手不太冷",
+    "en_title": "",
+    "rating": "9.4",
+    "quote": ""
+  },
+  {
+    "rank": 9,
+    "title": "盗梦空间",
+    "en_title": "",
+    "rating": "9.4",
+    "quote": ""
+  },
+  {
+    "rank": 10,
+    "title": "楚门的世界",
+    "en_title": "",
+    "rating": "9.4",
+    "quote": ""
+  }
+]
\ No newline at end of file