"""Scrape the Douban Top 250 movie list into ``~/Desktop/top250.txt``.

The script requests ten list pages (``start`` offsets 0, 25, ..., 225),
extracts four fields per movie with a regular expression, and writes every
collected movie to a UTF-8 text file as a numbered, blank-line-separated
entry. Pages that cannot be downloaded are reported on stderr and skipped.
"""

import os
import re
import sys
from time import sleep

import requests

base_url = 'https://movie.douban.com/top250?start={}&filter='
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/120.0.0.0 Safari/537.36'
    ),
}

# NOTE(review): the HTML tag anchors of this pattern were missing from the
# reviewed text -- only the four capture groups and the literal `人评价`
# survived, which cannot match anything useful. The tags below are a
# reconstruction of the usual Top 250 list markup; verify them against the
# live page before trusting the output.
# Groups, in order: title text, rating, "<N>人评价" rating count, quote.
# NOTE(review): because of re.S, an entry lacking the last anchor would let
# the match run on into the following entry -- confirm every entry has one.
p = re.compile(
    r'<div class="item">.*?'
    r'<span class="title">(.*?)</span>.*?'
    r'<span class="rating_num"[^>]*>(.*?)</span>.*?'
    r'<span>(.*?人评价)</span>.*?'
    r'<span class="inq">(.*?)</span>',
    re.S,
)

# One [title, rating, rating_count, quote] list per movie, in page order.
m = []
for page in range(10):
    u = base_url.format(page * 25)
    try:
        r = requests.get(u, headers=headers, timeout=10)
        # An error status would otherwise be parsed like a list page and
        # silently contribute zero movies.
        r.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: report the failed page and carry on with the rest.
        print(f'skipping {u}: {exc}', file=sys.stderr)
    else:
        r.encoding = 'utf-8'
        items = p.findall(r.text)
        if not items:
            print(f'no movies matched on {u}', file=sys.stderr)
        # Keep only the part of the title before the first '/'.
        m.extend(
            [title.split('/')[0].strip(), rating, votes, quote]
            for title, rating, votes, quote in items
        )
    # Pause between requests to avoid hammering the server.
    sleep(1)

dp = os.path.join(os.path.expanduser("~"), "Desktop")
# The Desktop folder is not guaranteed to exist on every system.
os.makedirs(dp, exist_ok=True)
fp = os.path.join(dp, "top250.txt")
with open(fp, 'w', encoding='utf-8') as f:
    f.writelines(
        f'{i}. {v[0]}\n{v[1]}\n{v[2]}\n{v[3]}\n\n'
        for i, v in enumerate(m, 1)
    )