上传文件至 /
This commit is contained in:
56
260402-2509165004.py
Normal file
56
260402-2509165004.py
Normal file
@@ -0,0 +1,56 @@
|
||||
import re
|
||||
import csv
|
||||
import json
|
||||
import requests
|
||||
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
}

# The listing is requested in pages via the ``start`` query parameter:
# start = 0, 25, ..., 225.
TOTAL_MOVIES = 250
PAGE_SIZE = 25

# Literal marker that opens one movie entry in the listing HTML.
ITEM_MARKER = '<div class="item">'

# Fields expected in every entry: first title span, rating, rating count.
CORE_PATTERN = re.compile(
    r'<span class="title">(.*?)</span>.*?'
    r'<span class="rating_num" property="v:average">(.*?)</span>.*?'  # score
    r'<span>(.*?)人评价</span>',
    re.S,
)

# One-line quote; matched separately so that it can be optional.
QUOTE_PATTERN = re.compile(r'<span class="inq">(.*?)</span>', re.S)

FIELDNAMES = ["电影名称", "评分", "评价人数", "短评"]


def fetch_page(url):
    """Download *url* and return the response body as text.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx status (via ``raise_for_status``).
    """
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    return response.text


def parse_movies(html):
    """Extract movie records from one listing page.

    The page is split on the ``<div class="item">`` marker and each entry is
    parsed on its own. A single lazy regex over the whole page would, for an
    entry that has no ``inq`` span, keep scanning into the following entry:
    it would pair this entry's title/score with the next entry's quote and
    drop the next entry altogether. Parsing per entry keeps every movie and
    stores an empty string when the quote is absent.

    Args:
        html: HTML text of a listing page.

    Returns:
        A list of dicts keyed by the CSV column names (all values are
        stripped strings). Entries missing title, score, or rating count
        are skipped.
    """
    movies = []
    # Element 0 is whatever precedes the first entry marker; skip it.
    for chunk in html.split(ITEM_MARKER)[1:]:
        core = CORE_PATTERN.search(chunk)
        if core is None:
            continue
        name, score, comment_num = (field.strip() for field in core.groups())
        # Look for the quote only after the rating count, as the original
        # pattern ordering required.
        quote = QUOTE_PATTERN.search(chunk, core.end())
        comment = quote.group(1).strip() if quote else ""
        movies.append(
            {
                "电影名称": name,
                "评分": score,
                "评价人数": comment_num,
                "短评": comment,
            }
        )
    return movies


def main():
    """Scrape every listing page and export the records to CSV and JSON.

    A page whose download fails is reported and skipped; the remaining
    pages are still fetched and both output files are always written.
    """
    movie_list = []

    for start in range(0, TOTAL_MOVIES, PAGE_SIZE):
        url = f"https://movie.douban.com/top250?start={start}"
        print(f"正在爬取:{url}")

        # Only network/HTTP failures are tolerated per page; anything else
        # is a programming error and should surface.
        try:
            html = fetch_page(url)
        except requests.RequestException as e:
            print(f"爬取失败:{e}")
            continue

        movie_list.extend(parse_movies(html))

    # utf-8-sig writes a BOM at the start of the CSV file.
    with open("douban_top250.csv", "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(movie_list)

    print("✅ CSV 文件已保存:douban_top250.csv")

    with open("douban_top250.json", "w", encoding="utf-8") as f:
        json.dump(movie_list, f, ensure_ascii=False, indent=4)

    print("✅ JSON 文件已保存:douban_top250.json")
    print(f"\n🎉 爬取完成!共获取 {len(movie_list)} 部电影数据")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user