"""Scrape the Douban Top 250 movie list and export it as TXT, CSV and JSON.

Running this file as a script fetches the ten list pages, extracts one record
per movie, and writes ``movies.txt`` (tab-separated), ``movies.csv`` and
``movies.json`` into the current working directory.
"""
import csv
import json
import re

import requests
from bs4 import BeautifulSoup

# A browser-like User-Agent is sent with every request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

TOTAL_MOVIES = 250
PAGE_SIZE = 25
# Seconds to wait for a response before giving up on a page.
REQUEST_TIMEOUT = 10

# Column order shared by the TXT and CSV outputs; also the keys of each record.
FIELDNAMES = ['中文名', '英文名', '年份', '导演', '主演', '评分', '评价人数', '精选短评']

# A standalone run of exactly four digits (not part of a longer number).
_YEAR_RE = re.compile(r'(?<!\d)(\d{4})(?!\d)')


def _parse_credits(info):
    """Return ``(director, first_actor)`` parsed from the info paragraph text.

    Only the line that contains ``导演:`` is examined, so text on the
    following lines can never leak into either field. Both values are empty
    strings when no such line exists; the actor is empty when the line has no
    ``主演:`` marker.
    """
    credit_line = next(
        (line for line in info.splitlines() if '导演:' in line), ""
    )
    if not credit_line:
        return "", ""

    director_part = credit_line.split('导演:')[1].split('主演:')
    director = director_part[0].strip()
    actors = ""
    if len(director_part) > 1:
        # Only the first name of the '/'-separated cast list is kept.
        actors = director_part[1].split('/')[0].strip()
    return director, actors


def _extract_year(item, info):
    """Return the release year for *item*, or ``"未知年份"`` when not found.

    A ``span.year`` element is preferred (looked up relative to the
    ``span.playable`` element when one exists). If there is none, the first
    four-digit number on a non-credit line of *info* is used.
    """
    playable_tag = item.find('span', class_='playable')
    if playable_tag:
        year_tag = playable_tag.find_previous_sibling('span', class_='year')
    else:
        year_tag = item.find('span', class_='year')
    if year_tag:
        return year_tag.text.strip('()')

    # NOTE(review): assumes the year appears on an info line other than the
    # director/cast line - confirm against the live page markup.
    for line in info.splitlines():
        if '导演:' in line:
            continue
        match = _YEAR_RE.search(line)
        if match:
            return match.group(1)
    return "未知年份"


def _extract_people_count(item):
    """Return the rating count text with the ``人评价`` suffix removed.

    Falls back to ``"0"`` when the ``div.star`` block or its spans are
    missing instead of raising.
    """
    star_tag = item.find('div', class_='star')
    spans = star_tag.find_all('span') if star_tag else []
    if not spans:
        return "0"
    # The count is read from the last span inside the rating block.
    return spans[-1].text.replace('人评价', '').strip()


def parse_item(item):
    """Build one movie record (a dict keyed by ``FIELDNAMES``) from *item*.

    *item* is a BeautifulSoup tag for a single ``div.item`` entry. Every
    field has a placeholder default, so a partially missing entry still
    yields a complete record.
    """
    title_tag = item.find('span', class_='title')
    if title_tag:
        title = title_tag.text
    else:
        title = "未找到标题"
        print(f"在这个item里没找到标题: {item}")

    # NOTE(review): this reads span.other, which may hold alternate titles
    # rather than the English title - confirm against the page markup.
    other_tag = item.find('span', class_='other')
    other_title = other_tag.text.replace('/', '').strip() if other_tag else ""

    info_tag = item.find('p', class_='')
    info = info_tag.text.strip() if info_tag else ""

    rating_tag = item.find('span', class_='rating_num')
    quote_tag = item.find('span', class_='inq')
    director, actors = _parse_credits(info)

    return {
        '中文名': title,
        '英文名': other_title,
        '年份': _extract_year(item, info),
        '导演': director,
        '主演': actors,
        '评分': rating_tag.text if rating_tag else "0.0",
        '评价人数': _extract_people_count(item),
        '精选短评': quote_tag.text if quote_tag else "暂无短评",
    }


def parse_page(html):
    """Return the list of movie records found in one list page's HTML."""
    soup = BeautifulSoup(html, 'html.parser')
    return [parse_item(item) for item in soup.find_all('div', class_='item')]


def scrape_top250():
    """Fetch every list page and return all parsed movie records.

    A page that cannot be fetched (network error, timeout, or HTTP error
    status) is reported on stdout and skipped, so the remaining pages are
    still collected.
    """
    movies = []
    for start in range(0, TOTAL_MOVIES, PAGE_SIZE):
        url = f'https://movie.douban.com/top250?start={start}'
        try:
            response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            # Without this an error page would be parsed as if it were data.
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"请求失败, 已跳过: {url} ({exc})")
            continue

        response.encoding = 'utf-8'
        movies.extend(parse_page(response.text))
    return movies


def write_txt(movies, path='movies.txt'):
    """Write *movies* to *path* as tab-separated text with a header row."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write("\t".join(FIELDNAMES) + "\n")
        f.writelines(
            "\t".join(str(movie[name]) for name in FIELDNAMES) + "\n"
            for movie in movies
        )


def write_csv(movies, path='movies.csv'):
    """Write *movies* to *path* as CSV with a header row."""
    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(movies)


def write_json(movies, path='movies.json'):
    """Write *movies* to *path* as indented JSON, keeping non-ASCII text."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(movies, f, ensure_ascii=False, indent=4)


def main():
    """Scrape the list and produce the three output files."""
    movies = scrape_top250()
    write_txt(movies)
    write_csv(movies)
    write_json(movies)
    print("爬取完成! 已生成 movies.txt, movies.csv, movies.json")


if __name__ == "__main__":
    main()