"""Scrape the Douban Top 250 movie list and export it as TXT, CSV and JSON.

Running the module fetches every list page, extracts one record per movie
into the module-level ``all_movies`` list, and writes three output files
(``douban_top250.txt``, ``douban_top250.csv``, ``douban_top250.json``) into
the current working directory.
"""

import csv
import json
import random
import re
import time

import requests

BASE_URL = "https://movie.douban.com/top250?start={}&filter="

HEADERS = {
    "User-Agent": "Mozilla/5.0(Windows NT 10.0;Win64;x64) AppleWebKit/537.36(KHTML,like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# The list is paginated through the ``start`` query parameter.
PAGE_COUNT = 10
PAGE_SIZE = 25

# Seconds to wait for a response before giving up on a page.
REQUEST_TIMEOUT = 10

# NOTE(review): the original extraction patterns were lost from the source as
# received (the HTML-like text inside the regex literals was stripped). Only
# the "(.*?)" capture, the re.S flag and the whitespace clean-up of the info
# block survived. The patterns below are reconstructed from the commonly seen
# Douban Top 250 list markup -- confirm against a live page before relying on
# them.
_ITEM_RE = re.compile(r'<li>(.*?)</li>', re.S)
_TITLE_RE = re.compile(r'<span class="title">(.*?)</span>')
_RATING_RE = re.compile(r'<span class="rating_num"[^>]*>(.*?)</span>')
_PEOPLE_RE = re.compile(r'<span>(\d+)人评价</span>')
_QUOTE_RE = re.compile(r'<p class="quote">\s*<span[^>]*>(.*?)</span>', re.S)
_INFO_RE = re.compile(r'<p(?: class="")?>(.*?)</p>', re.S)
_WHITESPACE_RE = re.compile(r'\s+')

# Accumulates one dict per movie across all parsed pages; read by save_data().
all_movies = []


def get_movie_data() -> None:
    """Download every list page and feed its HTML to ``parse_html``.

    Network failures and non-200 responses are reported on stdout and the
    page is skipped, so one bad page does not abort the whole crawl.
    """
    print("正在开始爬取豆瓣Top 250 数据...")

    for i in range(PAGE_COUNT):
        url = BASE_URL.format(i * PAGE_SIZE)

        # Only the request itself is guarded: a timeout keeps a stalled
        # connection from hanging the crawl, and catching RequestException
        # (rather than Exception) keeps parser bugs visible.
        try:
            response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(f"发生错误:{e}")
            continue

        if response.status_code != 200:
            print(f"第{i+1}页爬取失败,状态码:{response.status_code}")
            continue

        parse_html(response.text)
        print(f"第{i+1}页爬取完成...")

        # Random pause between successful requests to avoid hammering the site.
        time.sleep(random.uniform(1, 2))


def _first_group(pattern, text, default=""):
    """Return group 1 of the first match of *pattern* in *text*, else *default*."""
    match = pattern.search(text)
    return match.group(1) if match else default


def parse_html(html: str) -> list:
    """Extract movie records from one list page and append them to ``all_movies``.

    Args:
        html: Raw HTML text of a single Top 250 list page.

    Returns:
        The list of movie dicts parsed from this page (the same dicts that
        were appended to ``all_movies``). Each dict has the keys ``title``,
        ``rating``, ``people``, ``info`` and ``quote``; all values are strings.
    """
    parsed = []

    for li in _ITEM_RE.findall(html):
        title_match = _TITLE_RE.search(li)
        if title_match is None:
            # Not a movie entry (e.g. a navigation <li>); nothing to record.
            continue

        info_match = _INFO_RE.search(li)
        if info_match:
            # Collapse the multi-line director/cast/year block by removing
            # all whitespace, as the original code did.
            info_clean = _WHITESPACE_RE.sub('', info_match.group(1))
        else:
            info_clean = "未知信息"

        movie = {
            "title": title_match.group(1),
            # NOTE(review): the original fallbacks for these three fields are
            # not visible in the source; an empty string is used when the
            # field is absent (some entries are known to have no quote).
            "rating": _first_group(_RATING_RE, li),
            "people": _first_group(_PEOPLE_RE, li),
            "info": info_clean,
            "quote": _first_group(_QUOTE_RE, li),
        }
        parsed.append(movie)

    all_movies.extend(parsed)
    return parsed


def save_data() -> None:
    """Write the contents of ``all_movies`` to TXT, CSV and JSON files.

    Files are created (or overwritten) in the current working directory.
    """
    print("正在保存数据...")

    with open("douban_top250.txt", "w", encoding="utf-8") as f:
        f.writelines(
            f"电影名:{movie['title']} | 评分:{movie['rating']} | 评价人数:{movie['people']} | 引言:{movie['quote']}\n"
            for movie in all_movies
        )
    print("已保存为douban_top250.txt")

    # utf-8-sig writes a BOM so spreadsheet tools detect the encoding.
    with open("douban_top250.csv", "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(["电影名", "评分", "评价人数", "详细信息", "引言"])
        writer.writerows(
            [movie['title'], movie['rating'], movie['people'], movie['info'], movie['quote']]
            for movie in all_movies
        )
    print("已保存为douban_top250.csv")

    with open("douban_top250.json", "w", encoding="utf-8") as f:
        json.dump(all_movies, f, ensure_ascii=False, indent=4)
    print("已保存为douban_top250.json")


if __name__ == "__main__":
    get_movie_data()
    save_data()
    print("全部任务完成!")