Files
task-2-4-regular-expression/260402-2509165004.py
2026-04-02 15:56:38 +08:00

56 lines
1.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import csv
import json
import requests
# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
}

# Marker that opens one movie entry in the list page markup.
ITEM_MARKER = '<div class="item">'

# Fields that every entry must have: first title span, rating, vote count.
# Compiled once at import time instead of once per downloaded page.
ITEM_PATTERN = re.compile(
    r'<span class="title">(.*?)</span>.*?'
    r'<span class="rating_num" property="v:average">(.*?)</span>.*?'
    r'<span>(.*?)人评价</span>',
    re.S,
)

# One-line quote of an entry; looked up separately because it is optional.
QUOTE_PATTERN = re.compile(r'<span class="inq">(.*?)</span>', re.S)


def parse_movies(html):
    """Extract the movie records contained in one list page.

    The page is cut at every ``ITEM_MARKER`` so each entry is matched in
    isolation. With a single page-wide pattern, an entry that has no
    ``<span class="inq">`` would let the lazy ``.*?`` run on into the
    following entry, borrowing its quote and dropping that entry entirely.

    Args:
        html: Markup of a single list page.

    Returns:
        A list of dicts with the keys "电影名称", "评分", "评价人数" and
        "短评". "短评" is an empty string when the entry has no quote span.
        Entries missing the title, rating or vote count are skipped.
    """
    records = []
    # The text before the first marker belongs to no entry, hence [1:].
    for chunk in html.split(ITEM_MARKER)[1:]:
        found = ITEM_PATTERN.search(chunk)
        if found is None:
            continue
        name, score, comment_num = (part.strip() for part in found.groups())
        # Search only after the vote count, matching the field order the
        # original single pattern required.
        quote = QUOTE_PATTERN.search(chunk, found.end())
        records.append({
            "电影名称": name,
            "评分": score,
            "评价人数": comment_num,
            "短评": quote.group(1).strip() if quote else "",
        })
    return records


def fetch_page(start):
    """Download the list page beginning at offset ``start`` and return its text.

    Raises:
        requests.RequestException: on connection problems, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    url = f"https://movie.douban.com/top250?start={start}"
    print(f"正在爬取:{url}")
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    return response.text


def crawl_top250():
    """Fetch the ten list pages (offsets 0, 25, ..., 225) and parse them.

    A page that fails to download is reported and skipped so the remaining
    pages are still collected.

    Returns:
        The combined list of movie records from every page that loaded.
    """
    collected = []
    for start in range(0, 250, 25):
        try:
            page = fetch_page(start)
        except requests.RequestException as e:
            # Only network/HTTP failures are expected here; anything else is
            # a programming error and should surface.
            print(f"爬取失败:{e}")
            continue
        collected.extend(parse_movies(page))
    return collected


def save_csv(movies, path):
    """Write the records to ``path`` as CSV with a header row.

    ``utf-8-sig`` prepends a BOM to the file; ``newline=""`` is what the
    csv module requires of the file object it writes to.
    """
    fieldnames = ["电影名称", "评分", "评价人数", "短评"]
    with open(path, "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(movies)


def save_json(movies, path):
    """Write the records to ``path`` as indented, non-ASCII-escaped JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(movies, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    movie_list = crawl_top250()

    save_csv(movie_list, "douban_top250.csv")
    print("✅ CSV 文件已保存douban_top250.csv")

    save_json(movie_list, "douban_top250.json")
    print("✅ JSON 文件已保存douban_top250.json")

    print(f"\n🎉 爬取完成!共获取 {len(movie_list)} 部电影数据")