"""Douban Top 250 scrapers (reconstructed from a corrupted patch).

Three entry points share one fetch/parse pipeline and differ only in the
output format they write:

- ``scrape_douban_top250``          -> douban_movies.csv
- ``scrape_douban_top250_to_json``  -> douban_movies.json
- ``scrape_douban_top250_to_txt``   -> douban_movies.txt

All network access goes through ``requests`` with a browser User-Agent
(Douban rejects the default one) and a polite 1-second delay per page.
"""

import csv
import json
import time

import requests
from bs4 import BeautifulSoup

# Douban blocks the default requests User-Agent; impersonate a browser.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

# Placeholder used whenever a field is absent on the page.
_MISSING = "暂无"


def _extract_movie(movie):
    """Parse one ``<div class="item">`` node into a flat dict.

    Returns keys: 排名, 电影名称, 评分, 评价人数, 经典台词, 导演, 主演,
    上映年份, 国家/地区, 类型.  Any field that cannot be found falls back
    to ``_MISSING`` instead of raising.
    """
    rank = movie.find('em').text
    title = movie.find('span', class_='title').text
    rating = movie.find('span', class_='rating_num').text

    # The last <span> under the star block holds e.g. "123456人评价";
    # strip the suffix so only the count remains.
    people_span = movie.find('div', class_='star').find_all('span')[-1]
    people = people_span.text.replace('人评价', '')

    # Not every movie has a one-line quote (<span class="inq">).
    quote_tag = movie.find('span', class_='inq')
    quote = quote_tag.text if quote_tag else _MISSING

    # The unclassed <p> holds two lines:
    #   line 0: "导演: X ... 主演: Y ..."
    #   line 1: "1994 / 美国 / 犯罪 剧情"
    info = movie.find('p', class_='').text.strip().split('\n')
    director_actor = info[0].strip()
    if '导演: ' in director_actor:
        director = director_actor.split('导演: ')[1].split('主演: ')[0].strip()
        actor = (director_actor.split('主演: ')[1].strip()
                 if '主演: ' in director_actor else _MISSING)
    else:
        director = _MISSING
        actor = _MISSING

    # Guard info[1]: a few entries collapse the info block to one line.
    parts = info[1].strip().split('/') if len(info) > 1 else []
    year = parts[0].strip() if parts else _MISSING
    area = parts[1].strip() if len(parts) > 1 else _MISSING
    genre = parts[2].strip() if len(parts) > 2 else _MISSING

    return {
        '排名': rank,
        '电影名称': title,
        '评分': rating,
        '评价人数': people,
        '经典台词': quote,
        '导演': director,
        '主演': actor,
        '上映年份': year,
        '国家/地区': area,
        '类型': genre,
    }


def _iter_pages():
    """Yield ``(page_number, url, item_nodes)`` for the 10 Top-250 pages.

    Failed requests are reported and skipped (best-effort, matching the
    original scripts); a 1-second sleep after each page avoids an IP ban.
    """
    for start in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={start}'
        try:
            response = requests.get(url, headers=_HEADERS, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"请求异常:{e}")
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        yield start // 25 + 1, url, soup.find_all('div', class_='item')
        print(f"第 {start//25 + 1} 页爬取完成,当前 URL: {url}")
        time.sleep(1)  # rate-limit between pages


def scrape_douban_top250():
    """Scrape all 250 movies and save the five basic fields as CSV."""
    fieldnames = ['排名', '电影名称', '评分', '评价人数', '经典台词']
    with open('douban_movies.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for _page, _url, movies in _iter_pages():
            for movie in movies:
                record = _extract_movie(movie)
                # Only the five CSV columns; drop the extended fields.
                writer.writerow({k: record[k] for k in fieldnames})
    print("✅ 所有数据已爬取完成,已保存为 douban_movies.csv")


def scrape_douban_top250_to_json():
    """Scrape all 250 movies (all ten fields) and save as a JSON array."""
    movies_list = []
    for _page, _url, movies in _iter_pages():
        movies_list.extend(_extract_movie(movie) for movie in movies)
    with open('douban_movies.json', 'w', encoding='utf-8') as jsonfile:
        # ensure_ascii=False keeps the Chinese text human-readable.
        json.dump(movies_list, jsonfile, ensure_ascii=False, indent=4)
    print("✅ 所有数据已爬取完成,保存为 douban_movies.json")


def scrape_douban_top250_to_txt():
    """Scrape all 250 movies and write a labelled plain-text report.

    NOTE(review): the original TXT labels were mojibake (GBK bytes decoded
    as another charset); they are reconstructed here from the field names
    used by the CSV/JSON variants — confirm against the intended output.
    """
    with open('douban_movies.txt', 'w', encoding='utf-8') as txtfile:
        for _page, _url, movies in _iter_pages():
            for movie in movies:
                r = _extract_movie(movie)
                txtfile.write(f"排名:{r['排名']}\n")
                txtfile.write(f"电影名称:{r['电影名称']}\n")
                txtfile.write(f"评分:{r['评分']}\n")
                txtfile.write(f"评价人数:{r['评价人数']}\n")
                txtfile.write(f"经典台词:{r['经典台词']}\n")
                txtfile.write(f"导演:{r['导演']}\n")
                txtfile.write(f"主演:{r['主演']}\n")
                txtfile.write(f"上映年份:{r['上映年份']}\n")
                txtfile.write(f"国家/地区:{r['国家/地区']}\n")
                txtfile.write(f"类型:{r['类型']}\n")
                txtfile.write("-" * 50 + "\n")  # record separator
    print("✅ 所有数据已爬取完成,保存为 douban_movies.txt")


if __name__ == '__main__':
    # The patch shipped these as three standalone scripts, each with its
    # own entry point; run all three output formats here.
    scrape_douban_top250()
    scrape_douban_top250_to_json()
    scrape_douban_top250_to_txt()