Files
task-2-4-regular-expression/2509165026(2).json
2026-04-02 15:55:18 +08:00

85 lines
3.1 KiB
JSON
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
from bs4 import BeautifulSoup
import time
import json
def scrape_douban_top250_to_json():
    """Scrape the Douban Top 250 movie list and save it as douban_movies.json.

    Fetches all 10 result pages (25 movies each), extracts rank, title,
    rating, vote count, quote, director, lead actors, year, region and
    genre for every movie, then dumps the records as UTF-8 JSON.

    Side effects: network requests to movie.douban.com, console progress
    output, and a JSON file written to the current working directory.
    """
    # Browser-like User-Agent — Douban rejects requests with the default one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    movies_list = []
    for start in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={start}'
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            movies = soup.find_all('div', class_='item')
            for movie in movies:
                rank = movie.find('em').text
                title = movie.find('span', class_='title').text
                rating = movie.find('span', class_='rating_num').text
                # The last <span> in the star block holds e.g. "1234567人评价";
                # strip the suffix to keep only the number.
                # NOTE(review): original read .replace('', '') — a no-op; the
                # suffix text was evidently lost in transit. Confirm against
                # the live page markup.
                people_span = movie.find('div', class_='star').find_all('span')[-1]
                people = people_span.text.replace('人评价', '')
                quote_tag = movie.find('span', class_='inq')
                quote = quote_tag.text if quote_tag else "暂无"
                # Info paragraph (<p class="">): first line is
                # "导演: X ... 主演: Y ...", second is "year / region / genre".
                info = movie.find('p', class_='').text.strip().split('\n')
                director_actor = info[0].strip()
                # NOTE(review): the original split on a bare ': ', which can
                # never separate director from actors; the '导演: '/'主演: '
                # prefixes appear to have been stripped by encoding mangling.
                if '导演: ' in director_actor:
                    director = director_actor.split('导演: ')[1].split('主演: ')[0].strip()
                    actor = director_actor.split('主演: ')[1].strip() if '主演: ' in director_actor else "暂无"
                else:
                    director = "暂无"
                    actor = "暂无"
                year_area_type = info[1].strip().split('/')
                year = year_area_type[0].strip()
                area = year_area_type[1].strip() if len(year_area_type) > 1 else "暂无"
                genre = year_area_type[2].strip() if len(year_area_type) > 2 else "暂无"
                movie_dict = {
                    "排名": rank,
                    "电影名称": title,
                    "评分": rating,
                    "评价人数": people,
                    "经典台词": quote,
                    "导演": director,
                    "主演": actor,
                    "上映年份": year,
                    "国家/地区": area,
                    "类型": genre
                }
                movies_list.append(movie_dict)
            print(f"第 {start//25 + 1} 页爬取完成当前URL: {url}")
            # Be polite to the server: pause between page fetches.
            time.sleep(1)
        except requests.exceptions.RequestException as e:
            print(f"请求异常:{e}")
            continue
    with open('douban_movies.json', 'w', encoding='utf-8') as jsonfile:
        json.dump(movies_list, jsonfile, ensure_ascii=False, indent=4)
    print("✅ 所有数据已爬取完成,保存为 douban_movies.json")
# Script entry point: run the scraper only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    scrape_douban_top250_to_json()