import re
import time

import requests

_BASE_URL = "https://movie.douban.com/top250"
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}
_PAGE_SIZE = 25
_TOTAL_MOVIES = 250
_REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can block forever

# NOTE(review): the HTML anchors in the original pattern were lost (the markup
# inside the regex literal was stripped, leaving an unterminated string and
# only three capture groups). The anchors below are reconstructed from the
# surviving capture groups and the commonly seen Top 250 markup -- confirm
# them against the live page before relying on the output.
_ITEM_SPLIT_RE = re.compile(r'<div class="item">')
_RANK_RE = re.compile(r'<em[^>]*>(\d+)</em>')
# `[^&<]` keeps the original intent of `[^&]+?`: take the primary title and
# reject the alternate-title span, which presumably starts with an entity.
_TITLE_RE = re.compile(r'<span class="title">([^&<]+)</span>')
_RATING_RE = re.compile(r'<span class="rating_num"[^>]*>([^<]+)</span>')
_QUOTE_RE = re.compile(r'class="quote">\s*<span[^>]*>(.+?)</span>', re.S)


def _parse_movies(html):
    """Extract one dict per movie entry from a single listing page.

    The page is first cut into per-movie chunks and every field is searched
    inside its own chunk. A single page-wide lazy pattern would, for an entry
    that has no quote, run on into the next entry and pair the wrong fields.

    Args:
        html: Decoded HTML text of one listing page.

    Returns:
        A list of dicts with the string keys ``rank``, ``title``, ``rating``
        and ``quote``. ``quote`` is ``""`` when the entry has none. Chunks
        missing a rank, title or rating are skipped.
    """
    movies = []
    # Element 0 of the split is everything before the first entry.
    for chunk in _ITEM_SPLIT_RE.split(html)[1:]:
        rank = _RANK_RE.search(chunk)
        title = _TITLE_RE.search(chunk)
        rating = _RATING_RE.search(chunk)
        if not (rank and title and rating):
            continue
        quote = _QUOTE_RE.search(chunk)
        movies.append({
            "rank": rank.group(1),
            "title": title.group(1).strip(),
            "rating": rating.group(1).strip(),
            "quote": quote.group(1).strip() if quote else "",
        })
    return movies


def crawl_douban_top250_regex_with_quote(delay=1.0):
    """Crawl the Douban Top 250 listing and return its movies.

    Pages are requested 25 entries at a time (``start=0, 25, ..., 225``) and
    parsed with regular expressions.

    Args:
        delay: Seconds to sleep between consecutive page requests. Use ``0``
            to disable throttling.

    Returns:
        A list of dicts with the string keys ``rank``, ``title``, ``rating``
        and ``quote``, in page order.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status (via ``raise_for_status``).
    """
    movies = []
    for start in range(0, _TOTAL_MOVIES, _PAGE_SIZE):
        # Throttle between pages only; no sleep before the first request.
        if start and delay > 0:
            time.sleep(delay)
        url = f"{_BASE_URL}?start={start}&filter="
        response = requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
        # Fail loudly instead of silently parsing an error page into nothing.
        response.raise_for_status()
        movies.extend(_parse_movies(response.text))
    return movies


if __name__ == "__main__":
    top250 = crawl_douban_top250_regex_with_quote()