提交正则表达式爬虫任务代码
This commit is contained in:
@@ -1,45 +1,103 @@
|
||||
import requests
|
||||
import re
|
||||
import csv
|
||||
import json
|
||||
import csv
|
||||
import time
|
||||
import random
|
||||
|
||||
html_content = """
|
||||
<div class="item">
|
||||
<div class="pic">
|
||||
<em>1</em>
|
||||
<a href="https://movie.douban.com/subject/1292052/">
|
||||
<img width="100" alt="肖申克的救赎" src="https://img3.doubanio.com/view/photo/s_ratio_poster/public/p480747492.webp">
|
||||
</a>
|
||||
</div>
|
||||
<div class="info">
|
||||
<div class="hd">
|
||||
<a href="https://movie.douban.com/subject/1292052/">
|
||||
<span class="title">肖申克的救赎</span>
|
||||
<span class="title"> / The Shawshank Redemption</span>
|
||||
<span class="other"> / 月黑高飞(港) / 刺激1995(台)</span>
|
||||
</a>
|
||||
# URL template for one list page; ``start`` is the zero-based offset of the
# first movie shown on that page.
BASE_URL = "https://movie.douban.com/top250?start={}&filter="

# Request headers sent with every page fetch. The User-Agent follows the
# standard browser layout (space before each parenthesised group, "; " and
# ", " separators) so that it reads like a genuine Chrome identifier.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

# Accumulates one dict per parsed movie across all pages.
all_movies = []
|
||||
|
||||
def get_movie_data():
    """Fetch the ten Top 250 list pages and hand each one to ``parse_html``.

    Pages are requested with offsets 0, 25, ..., 225. A page that fails
    (network error or non-200 status) is reported on stdout and skipped;
    the remaining pages are still attempted.
    """
    print("正在开始爬取豆瓣Top 250 数据...")

    for i in range(10):
        url = BASE_URL.format(i * 25)

        try:
            # A timeout keeps one stalled connection from hanging the run.
            response = requests.get(url, headers=HEADERS, timeout=10)
        except requests.RequestException as e:
            print(f"发生错误:{e}")
        else:
            if response.status_code == 200:
                parse_html(response.text)
                print(f"第{i+1}页爬取完成...")
            else:
                print(f"第{i+1}页爬取失败,状态码:{response.status_code}")

        # Pause after every attempt, including failed ones, so an error
        # response is not followed immediately by another request.
        time.sleep(random.uniform(1, 2))
|
||||
|
||||
|
||||
<span class="playable">[可播放]</span>
|
||||
</div>
|
||||
<div class="bd">
|
||||
<p>
|
||||
导演: 弗兰克·德拉邦特 Frank Darabont 主演: 蒂姆·罗宾斯 Tim Robbins /...<br>
|
||||
1994 / 美国 / 犯罪 剧情
|
||||
</p>
|
||||
def _first_group(pattern, text, default):
    """Return group 1 of the first match of *pattern* in *text*, else *default*."""
    found = re.search(pattern, text, re.S)
    return found.group(1) if found else default


def _clean_info(raw):
    """Flatten the inner HTML of the info paragraph into one readable line."""
    # Imported locally because ``parse_html`` has a parameter named ``html``.
    from html import unescape

    # The paragraph uses <br> as a line break between its two lines of text.
    text = re.sub(r'<br\s*/?>', ' ', raw)
    # Decode any character references, then collapse each whitespace run to a
    # single space. Deleting whitespace outright would glue words together.
    return ' '.join(unescape(text).split())


def parse_html(html, movies=None):
    """Extract every movie entry from one list page and store it.

    Each ``<li>...</li>`` chunk containing ``class="item"`` yields a dict
    with the keys ``title``, ``rating``, ``people``, ``info`` and ``quote``.
    A field whose pattern does not match gets a placeholder value.

    Args:
        html: Markup of one list page, as text.
        movies: Optional list to append the dicts to. When omitted, the
            module-level ``all_movies`` list is used.
    """
    target = all_movies if movies is None else movies

    for li in re.findall(r'<li>.*?</li>', html, re.S):
        # Skip <li> elements that are not movie entries.
        if 'class="item"' not in li:
            continue

        info_raw = _first_group(r'<p class="">(.*?)</p>', li, None)

        target.append({
            # Only the first title span is taken.
            "title": _first_group(r'<span class="title">(.*?)</span>', li, "未知标题"),
            "rating": _first_group(r'<span class="rating_num".*?>(.*?)</span>', li, "0"),
            "people": _first_group(r'(\d+)人评价', li, "0"),
            "info": "未知信息" if info_raw is None else _clean_info(info_raw),
            "quote": _first_group(r'<span class="inq">(.*?)</span>', li, "无引言"),
        })
|
||||
|
||||
|
||||
def save_data():
    """Write the collected movies to a text file, a CSV file and a JSON file.

    Reads the module-level ``all_movies`` list and creates
    ``douban_top250.txt``, ``douban_top250.csv`` and ``douban_top250.json``
    in the current working directory, printing a message after each one.
    """
    print("正在保存数据...")

    # Plain text: one line per movie, without the "info" field.
    with open("douban_top250.txt", "w", encoding="utf-8") as txt_file:
        txt_file.writelines(
            f"电影名:{entry['title']} | 评分:{entry['rating']} | 评价人数:{entry['people']} | 引言:{entry['quote']}\n"
            for entry in all_movies
        )
    print("已保存为douban_top250.txt")

    # CSV: written with the utf-8-sig encoding, header row first.
    columns = ("title", "rating", "people", "info", "quote")
    with open("douban_top250.csv", "w", newline="", encoding="utf-8-sig") as csv_file:
        sheet = csv.writer(csv_file)
        sheet.writerow(["电影名", "评分", "评价人数", "详细信息", "引言"])
        sheet.writerows([entry[key] for key in columns] for entry in all_movies)
    print("已保存为douban_top250.csv")

    # JSON: the full list of dicts, with non-ASCII text kept as-is.
    with open("douban_top250.json", "w", encoding="utf-8") as json_file:
        json.dump(all_movies, json_file, ensure_ascii=False, indent=4)
    print("已保存为douban_top250.json")
|
||||
|
||||
|
||||
def main():
    """Run the whole job: scrape every page, then export the results."""
    get_movie_data()
    save_data()
    print("全部任务完成!")


# Only run the job when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user