"""Scrape the Douban Top 250 movie list and export it to three files.

The script downloads the ten list pages (25 entries each), extracts the
title, alternative title, year, director and cast of every entry, and
writes the result to ``movies.tst`` (tab-separated), ``movies.csv`` and
``movies.json`` in the current working directory.
"""
import csv
import json
import re

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent':'Mozilla/5.0(Windows NT 10.0;Win64; x64) AppleWebKit/537.36(KHTML,like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

# Column order shared by all three output files.
FIELDNAMES = ['中文名', '英文名', '年份', '导演', '主演']

# Seconds to wait for the server before giving up on a page; without a
# timeout a stalled connection would hang the script indefinitely.
REQUEST_TIMEOUT = 10

# A stand-alone four-digit year (1800-2099), used when no year element exists.
_YEAR_RE = re.compile(r'(?<!\d)(?:18|19|20)\d{2}(?!\d)')


def _extract_credit(info, label, stop_label=None):
    """Return the text that follows *label* on its own line of *info*.

    Args:
        info: Stripped text of the entry's info paragraph.
        label: Marker that introduces the value, e.g. ``'导演:'``.
        stop_label: Optional marker at which the value ends early.

    Returns:
        The stripped value, or ``''`` when *label* does not occur.
    """
    _, found, rest = info.partition(label)
    if not found:
        return ''
    if stop_label:
        rest = rest.partition(stop_label)[0]
    # Cut at the end of the line so that whatever follows the credits in
    # the same paragraph does not leak into the value.
    return rest.partition('\n')[0].strip()


def _extract_year(item, info):
    """Return the release year of an entry, or ``'未知年份'`` if unknown.

    A ``span.year`` element is preferred (looked up the same way the
    script always did). When it is absent, the first four-digit year in
    the info text after the credits line is used instead.
    """
    playable_tag = item.find('span', class_='playable')
    if playable_tag:
        year_tag = playable_tag.find_previous_sibling('span', class_='year')
    else:
        year_tag = item.find('span', class_='year')
    if year_tag:
        return year_tag.text.strip('()')

    # Skip the first (credits) line when there is more text, so digits in
    # a person's name are not mistaken for the year.
    tail = info.partition('\n')[2] or info
    match = _YEAR_RE.search(tail)
    return match.group(0) if match else "未知年份"


def _parse_item(item):
    """Convert one ``div.item`` element into a movie record (a dict)."""
    title_tag = item.find('span', class_='title')
    if title_tag:
        title = title_tag.text
    else:
        title = "未找到标题"
        print(f"在这个item里没找到标题:{item}")

    other_tag = item.find('span', class_='other')
    other_title = other_tag.text if other_tag else ""

    info_tag = item.find('p', class_='')
    info = info_tag.text.strip() if info_tag else ""

    return {
        '中文名': title,
        '英文名': other_title.replace('/', '').strip(),
        '年份': _extract_year(item, info),
        '导演': _extract_credit(info, '导演:', stop_label='主演:'),
        '主演': _extract_credit(info, '主演:'),
    }


def _fetch_page(start):
    """Download one list page and return its movie records.

    Args:
        start: Zero-based offset of the first entry on the page.

    Returns:
        A list of movie dicts; empty when the server answers with an
        HTTP error status (the page is reported and skipped).

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    url = f'https://movie.douban.com/top250?start={start}'
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    if not response.ok:
        # An error page contains no entries; say so instead of silently
        # producing a shorter result.
        print(f"skipping {url}: HTTP {response.status_code}")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    return [_parse_item(item) for item in soup.find_all('div', class_='item')]


def _tsv_field(value):
    """Flatten tabs and line breaks so *value* stays in one TSV cell."""
    return re.sub(r'[\t\r\n]+', ' ', str(value))


def _write_outputs(records):
    """Write *records* to ``movies.tst``, ``movies.csv`` and ``movies.json``."""
    with open('movies.tst', 'w', encoding='utf-8') as f:
        f.write('\t'.join(FIELDNAMES) + '\n')
        f.writelines(
            '\t'.join(_tsv_field(record[name]) for name in FIELDNAMES) + '\n'
            for record in records
        )

    with open('movies.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(records)

    with open('movies.json', 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=4)


# The script body runs at module level, exactly as before.
movies = []
for start in range(0, 250, 25):
    movies.extend(_fetch_page(start))

_write_outputs(movies)

print("爬取完成!已生成 movies.tst, movies.csv, movies.json")