"""Scrape the Douban Top 250 movie list and save it as movies.txt / .csv / .json."""
import requests
from bs4 import BeautifulSoup
import json
import csv

# Browser-like User-Agent: douban blocks the default python-requests UA.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
# Output column order shared by all three output formats.
FIELDS = ['中文名', '英文名', '年份', '导演', '主演']


def _parse_item(item):
    """Extract one movie dict (keys = FIELDS) from a <div class="item"> node."""
    title_tag = item.find('span', class_='title')
    if title_tag:
        title = title_tag.text
    else:
        title = "未找到标题"
        print(f"在这个item里没找到标题:{item}")

    other_tag = item.find('span', class_='other')
    other_title = other_tag.text if other_tag else ""

    # Director/actor credits live in the first <p> with an empty class attribute.
    info_tag = item.find('p', class_='')
    info = info_tag.text.strip() if info_tag else ""

    # The <span class="year"> sits next to an optional "playable" badge;
    # search relative to the badge when present, otherwise anywhere in the item.
    playable_tag = item.find('span', class_='playable')
    if playable_tag:
        year_tag = playable_tag.find_previous_sibling('span', class_='year')
    else:
        year_tag = item.find('span', class_='year')
    year = year_tag.text.strip('()') if year_tag else "未知年份"

    director = info.split('导演:')[1].split('主演:')[0].strip() if '导演:' in info else ''
    actors = info.split('主演:')[1].strip() if '主演:' in info else ''

    return {
        '中文名': title,
        '英文名': other_title.replace('/', '').strip(),
        '年份': year,
        '导演': director,
        '主演': actors,
    }


def scrape_top250():
    """Fetch all 10 pages (25 movies each) and return a list of movie dicts.

    Raises requests.HTTPError on a non-2xx response instead of silently
    parsing an error page.
    """
    movies = []
    for start in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={start}'
        # timeout so a stalled connection cannot hang the whole run
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        response.encoding = 'utf-8'  # douban serves UTF-8; set it explicitly
        soup = BeautifulSoup(response.text, 'html.parser')
        for item in soup.find_all('div', class_='item'):
            movies.append(_parse_item(item))
    return movies


def save_all(movies):
    """Write *movies* to movies.txt (tab-separated), movies.csv and movies.json."""
    with open('movies.txt', 'w', encoding='utf-8') as f:
        f.write('\t'.join(FIELDS) + '\n')
        for movie in movies:
            f.write('\t'.join(movie[field] for field in FIELDS) + '\n')

    # newline='' prevents the csv module from writing blank rows on Windows.
    with open('movies.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDS)
        writer.writeheader()
        writer.writerows(movies)

    # ensure_ascii=False keeps the Chinese text readable in the JSON file.
    with open('movies.json', 'w', encoding='utf-8') as f:
        json.dump(movies, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    save_all(scrape_top250())
    print("爬取完成!已生成 movies.txt, movies.csv, movies.json")