Files
task-2-3-File-Operations/20260331/text2.py
2026-03-31 11:36:26 +08:00

67 lines
2.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
from bs4 import BeautifulSoup
import csv
import time
# Request headers sent with every page fetch; the User-Agent imitates a
# desktop browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}

# Holds the data collected for every movie.
movies_data = []
def parse_info_text(info_text):
    """Extract director, year, area and genre from a movie's info paragraph.

    Args:
        info_text: Raw text of the info paragraph. It is expected to hold a
            "导演: ... 主演: ..." line followed by a "year / area / genre"
            line; blank lines and surrounding whitespace are ignored.

    Returns:
        A ``(director, year, area, genre)`` tuple of stripped strings. Any
        part that cannot be found is returned as ``""`` instead of raising,
        so a single odd entry cannot abort the crawl.
    """
    stripped_lines = [line.strip() for line in info_text.split("\n")]
    lines = [line for line in stripped_lines if line]
    director_line = lines[0] if lines else ""
    release_line = lines[1] if len(lines) > 1 else ""

    # Director is the text between the "导演:" and "主演:" markers; when
    # "主演:" is absent everything after "导演:" is kept.
    director = ""
    director_parts = director_line.split("导演:")
    if len(director_parts) > 1:
        director = director_parts[1].split("主演:")[0].strip()

    fields = [field.strip() for field in release_line.split("/")]
    # Pad short lines so the lookups below yield "" rather than IndexError.
    fields += [""] * (3 - len(fields))
    # Area and genre are read from the END of the line: when a line carries
    # more than three "/"-separated fields (presumably several release
    # years -- NOTE(review): confirm against the live page), taking the
    # first three would put the wrong values in the area and genre columns.
    return director, fields[0], fields[-2], fields[-1]


# Crawl 10 pages of 25 entries each (250 movies in total).
for page in range(10):
    start = page * 25
    url = f"https://movie.douban.com/top250?start={start}&filter="
    try:
        # Fetch and parse the page.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Every movie on the page lives in its own <div class="item">.
        for item in soup.find_all("div", class_="item"):
            try:
                rank = item.find("em").text
                title = item.find("span", class_="title").text
                score = item.find("span", class_="rating_num").text
                info_text = item.find("div", class_="bd").p.text
            except AttributeError as e:
                # find() returns None for a missing tag; skip only this
                # entry instead of losing the rest of the page.
                print(f"{page+1} 页有一条记录解析失败,已跳过:{str(e)}")
                continue
            director, year, area, genre = parse_info_text(info_text)
            movies_data.append({
                "排名": rank,
                "片名": title,
                "导演": director,
                "上映年份": year,
                "地区": area,
                "类型": genre,
                "评分": score,
            })
    except Exception as e:
        # Deliberately broad: this is the per-page boundary of a best-effort
        # crawl, so any failure is reported and the next page is tried.
        print(f"{page+1} 页爬取失败:{str(e)}")
    else:
        print(f"{page+1} 页爬取完成")
        time.sleep(1)  # pause 1 second between pages to avoid anti-crawling measures
# Export the collected rows to a CSV file.
# Column order of the output file; the names match the keys of each row dict.
fieldnames = ["排名", "片名", "导演", "上映年份", "地区", "类型", "评分"]
# "utf-8-sig" writes a UTF-8 BOM first; newline="" leaves line endings to the
# csv module.
with open("豆瓣电影Top250.csv", "w", encoding="utf-8-sig", newline="") as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    csv_writer.writeheader()
    for movie in movies_data:
        csv_writer.writerow(movie)
print("✅ 数据已保存为「豆瓣电影Top250.csv」")