2.1 KiB
2.1 KiB
| 1 | import requests |
|---|---|
| 2 | from bs4 import BeautifulSoup |
| 3 | import csv |
| 4 | import time |
def scrape_douban_top250():
    """Scrape the Douban Top 250 movie list into ``douban_movies.csv``.

    Fetches all 10 result pages (25 movies each), extracts rank, title,
    rating, vote count and the featured quote for every movie, and writes
    them as UTF-8 CSV rows. Pages that fail to download are skipped
    (best-effort); a 1-second pause between pages avoids hammering the site.

    Returns:
        None. Side effect: writes/overwrites ``douban_movies.csv`` in the
        current working directory.
    """
    # A browser-like User-Agent is required; Douban blocks default clients.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    with open('douban_movies.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['排名', '电影名称', '评分', '评价人数', '经典台词']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Top 250 is paginated 25 per page via the ?start= offset.
        for start in range(0, 250, 25):
            url = f'https://movie.douban.com/top250?start={start}'

            # Keep the try body minimal: only the network call can raise
            # RequestException. Parsing errors should surface, not be hidden.
            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
            except requests.exceptions.RequestException:
                # Best-effort: skip a page that failed to download.
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            movies = soup.find_all('div', class_='item')
            for movie in movies:
                rank = movie.find('em').text
                title = movie.find('span', class_='title').text
                rating = movie.find('span', class_='rating_num').text

                # The vote count is the last <span> inside the star div,
                # e.g. "1234567人评价" -> "1234567".
                people_span = movie.find('div', class_='star').find_all('span')[-1]
                people = people_span.text.replace('人评价', '')

                # BUG FIX: the original wrote an undefined name `quote`,
                # raising NameError on the first movie. Some entries have
                # no .inq quote span, so fall back to an empty string.
                quote_tag = movie.find('span', class_='inq')
                quote = quote_tag.text if quote_tag else ''

                writer.writerow({
                    '排名': rank,
                    '电影名称': title,
                    '评分': rating,
                    '评价人数': people,
                    '经典台词': quote
                })

            # Be polite to the server between page fetches.
            time.sleep(1)
# Entry point: run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    scrape_douban_top250()