# Douban Top 250 scraper (requests + BeautifulSoup -> CSV)
"""Scrape the Douban Top 250 movie chart.

Fetches all 10 pages (25 movies each), extracts rank, title, rating,
tagline and credits for every movie, and writes the result to
``douban_top250_bs4.csv``.
"""

import csv
import random
import time

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent so Douban does not reject the request as a bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

movie_list = []

# Crawl 10 pages -- 250 movies in total (25 per page).
for page in range(10):
    start = page * 25
    url = f'https://movie.douban.com/top250?start={start}&filter='
    print(f'正在爬取第 {page+1} 页...')

    # Random delay between requests to reduce the chance of being banned.
    time.sleep(random.uniform(0.5, 1.5))

    try:
        # 1. Fetch the page; raise on HTTP error status.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # 2. Parse the HTML (uses the third-party ``lxml`` parser backend).
        soup = BeautifulSoup(response.text, 'lxml')
        items = soup.select('.item')

        # 3. Extract the fields for each movie on the page.
        for item in items:
            rank = item.select_one('.pic em').text        # chart position
            title = item.select_one('.hd .title').text    # primary title
            score = item.select_one('.rating_num').text   # rating
            # Some entries have no tagline; fall back to a placeholder.
            # (Look the node up once instead of twice.)
            quote_node = item.select_one('.inq')
            quote = quote_node.text if quote_node else "无"
            info = item.select_one('.bd p').text.strip()  # director / cast

            movie_list.append({
                "排名": rank,
                "电影名": title,
                "评分": score,
                "引言": quote,
                "主创信息": info,
            })

    except Exception as e:
        # Best-effort: report the failure and continue with the next page.
        print(f"第 {page+1} 页爬取失败:{e}")

# 4. Save all rows as CSV (utf-8-sig so Excel detects the encoding).
with open("douban_top250_bs4.csv", "w", encoding="utf-8-sig", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["排名", "电影名", "评分", "引言", "主创信息"])
    writer.writeheader()
    writer.writerows(movie_list)

print(f"? 爬取完成!共 {len(movie_list)} 条数据,已保存到 douban_top250_bs4.csv")