"""Scrape the Douban Top 250 movie list and export it as TSV, CSV and JSON.

Running the script downloads the ten list pages (25 entries each), extracts
the title, alternate title, year, director and cast of every entry, and
writes the result to ``movies.tst`` (tab-separated), ``movies.csv`` and
``movies.json`` in the current working directory.
"""

import csv
import json
import re

import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://movie.douban.com/top250'
PAGE_SIZE = 25
TOTAL_MOVIES = 250
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled connection hangs forever

HEADERS = {'User-Agent':'Mozilla/5.0(Windows NT 10.0;Win64; x64) AppleWebKit/537.36(KHTML,like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

# Column order shared by all three output files.
FIELDNAMES = ['中文名', '英文名', '年份', '导演', '主演']

UNKNOWN_YEAR = "未知年份"
MISSING_TITLE = "未找到标题"

# A standalone four-digit year (18xx-20xx) that is not part of a longer number.
_YEAR_RE = re.compile(r'(?<!\d)(?:18|19|20)\d{2}(?!\d)')


def _clean(text):
    """Collapse every whitespace run (tabs, newlines, NBSP, ...) to one space.

    Besides tidying the values, this guarantees that no field can contain a
    tab or newline, which would otherwise corrupt the hand-written
    tab-separated output file.
    """
    return ' '.join(text.split())


def parse_credits(info):
    """Extract director, cast and year from an entry's description text.

    Args:
        info: Text of the description paragraph, one logical line per row.
            The credits ("导演: ... 主演: ...") are read from the first
            non-empty line only; the year is searched in the remaining lines.

    Returns:
        A ``(director, actors, year)`` tuple of strings. Each element is an
        empty string when the corresponding piece is not present.
    """
    lines = [cleaned for cleaned in map(_clean, info.splitlines()) if cleaned]
    credits = lines[0] if lines else ''
    remainder = ' '.join(lines[1:])

    director = ''
    if '导演:' in credits:
        director = credits.split('导演:', 1)[1].split('主演:', 1)[0].strip()

    actors = ''
    if '主演:' in credits:
        actors = credits.split('主演:', 1)[1].strip()

    year_match = _YEAR_RE.search(remainder)
    year = year_match.group() if year_match else ''
    return director, actors, year


def _find_year_tag(item):
    """Return the entry's ``span.year`` tag, or ``None`` when there is none."""
    playable_tag = item.find('span', class_='playable')
    if playable_tag:
        return playable_tag.find_previous_sibling('span', class_='year')
    return item.find('span', class_='year')


def parse_item(item):
    """Convert one ``div.item`` element into a movie record.

    Args:
        item: BeautifulSoup tag for a single list entry.

    Returns:
        A dict keyed by ``FIELDNAMES`` whose values are all strings.
    """
    title_tags = item.find_all('span', class_='title')
    if title_tags:
        title = _clean(title_tags[0].text)
    else:
        title = MISSING_TITLE
        print(f"在这个item里没找到标题:{item}")

    # NOTE(review): on the Douban list page the second span.title presumably
    # holds the original-language title (" / The Shawshank Redemption") while
    # span.other holds regional aliases - confirm against the live markup.
    # Entries without a second title fall back to the alias span.
    if len(title_tags) > 1:
        english_title = _clean(title_tags[1].text).lstrip('/').strip()
    else:
        other_tag = item.find('span', class_='other')
        english_title = _clean(other_tag.text.replace('/', '')) if other_tag else ''

    info_tag = item.find('p', class_='')
    # A newline separator ensures a <br> always starts a new line of text,
    # so the credits line can be told apart from the year/country/genre line.
    info = info_tag.get_text('\n') if info_tag else ''
    director, actors, info_year = parse_credits(info)

    # Prefer an explicit span.year; otherwise use the year found in the
    # description text, and only then give up.
    year_tag = _find_year_tag(item)
    if year_tag:
        year = _clean(year_tag.text).strip('()')
    else:
        year = info_year or UNKNOWN_YEAR

    return {
        '中文名': title,
        '英文名': english_title,
        '年份': year,
        '导演': director,
        '主演': actors,
    }


def fetch_movies():
    """Download every list page and return the parsed movie records.

    Returns:
        A list of movie dicts in ranking order.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-success HTTP status. Failing loudly avoids writing empty
            output files when the site rejects the request.
    """
    movies = []
    with requests.Session() as session:
        session.headers.update(HEADERS)
        for start in range(0, TOTAL_MOVIES, PAGE_SIZE):
            url = f'{BASE_URL}?start={start}'
            response = session.get(url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            movies.extend(
                parse_item(item) for item in soup.find_all('div', class_='item')
            )
    return movies


def save_movies(movies):
    """Write the records to ``movies.tst``, ``movies.csv`` and ``movies.json``.

    Args:
        movies: List of movie dicts keyed by ``FIELDNAMES``.
    """
    # NOTE(review): the content is tab-separated, so '.tst' looks like a typo
    # for '.tsv'; the name is kept so existing consumers keep working.
    with open('movies.tst', 'w', encoding='utf-8') as f:
        f.write('\t'.join(FIELDNAMES) + '\n')
        f.writelines(
            '\t'.join(movie[name] for name in FIELDNAMES) + '\n'
            for movie in movies
        )

    with open('movies.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(movies)

    with open('movies.json', 'w', encoding='utf-8') as f:
        json.dump(movies, f, ensure_ascii=False, indent=4)


def main():
    """Scrape the list, write the three output files and report completion."""
    movies = fetch_movies()
    save_movies(movies)
    print("爬取完成!已生成 movies.tst, movies.csv, movies.json")


if __name__ == '__main__':
    main()