提交正则表达式爬虫任务代码

This commit is contained in:
2509165039
2026-04-03 00:02:51 +08:00
parent db2c6ba437
commit 4b9fb044ef
4 changed files with 2349 additions and 38 deletions

View File

@@ -1,45 +1,103 @@
import requests
import re
import csv
import json
import csv
import time
import random
# Sample Douban "Top 250" list-item HTML used for regex practice.
# NOTE(review): the body of this triple-quoted string ALSO contains what
# looks like the real crawler code (BASE_URL, HEADERS, all_movies,
# get_movie_data, parse_html) — apparently a botched merge/paste pulled
# those lines inside the string literal, so at runtime they are inert
# text, not executable code. Confirm and move them back to module level;
# until then parse_data/save_data/__main__ below cannot work as intended.
# (Also note the duplicated `import csv` in the import block above.)
html_content = """
<div class="item">
<div class="pic">
<em>1</em>
<a href="https://movie.douban.com/subject/1292052/">
<img width="100" alt="肖申克的救赎" src="https://img3.doubanio.com/view/photo/s_ratio_poster/public/p480747492.webp">
</a>
</div>
<div class="info">
<div class="hd">
<a href="https://movie.douban.com/subject/1292052/">
<span class="title">肖申克的救赎</span>
<span class="title">&nbsp;/&nbsp;The Shawshank Redemption</span>
<span class="other">&nbsp;/&nbsp;月黑高飞(港) / 刺激1995(台)</span>
</a>
BASE_URL = "https://movie.douban.com/top250?start={}&filter="
HEADERS = {
"User-Agent":"Mozilla/5.0(Windows NT 10.0;Win64;x64) AppleWebKit/537.36(KHTML,like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
all_movies=[]
def get_movie_data():
print("正在开始爬取豆瓣Top 250 数据...")
for i in range(0,10):
start_num = i * 25
url = BASE_URL.format(start_num)
try:
response = requests.get(url,headers=HEADERS)
if response.status_code == 200:
html = response.text
parse_html(html)
print(f"{i+1}页爬取完成...")
time.sleep(random.uniform(1,2))
else:
print(f"{i+1}页爬取失败,状态码:{response.status_code}")
except Exception as e:
print(f"发生错误:{e}")
<span class="playable">[可播放]</span>
</div>
<div class="bd">
<p>
导演: 弗兰克·德拉邦特 Frank Darabont&nbsp;&nbsp;&nbsp;主演: 蒂姆·罗宾斯 Tim Robbins /...<br>
1994&nbsp;/&nbsp;美国&nbsp;/&nbsp;犯罪 剧情
</p>
def parse_html(html):
li_list = re.findall(r'<li>.*?</li>',html,re.S)
<div>
<span class="rating5-t"></span>
<span class="rating_num" property="v:average">9.7</span>
<span property="v:best" content="10.0"></span>
<span>3273519人评价</span>
</div>
for li in li_list:
if 'class="item"' not in li:
continue
<p class="quote">
<span>希望让人自由。</span>
</p>
</div>
</div>
</div>
"""
# Module-level accumulator of parsed movie dicts.  Restored here: the
# original `all_movies=[]` line was swallowed into the html_content
# string literal above, leaving the name undefined at runtime.
all_movies = []

def parse_data(html):
    """Extract one movie's fields from an HTML chunk using regexes.

    Parses title, rating, vote count, info line and quote out of *html*
    (typically one ``<li class="item">`` chunk of a Douban Top-250 list
    page), appends the resulting dict to ``all_movies`` and returns it.

    Missing fields fall back to the placeholder strings the original
    code used ("未知标题", "0", "无引言", "未知信息").
    """
    # Bug fix: the original body searched an undefined name ``li``;
    # it must search the ``html`` argument that is actually passed in.
    title_match = re.search(r'<span class="title">(.*?)</span>', html, re.S)
    title = title_match.group(1) if title_match else "未知标题"
    rating_match = re.search(r'<span class="rating_num".*?>(.*?)</span>', html, re.S)
    rating = rating_match.group(1) if rating_match else "0"
    people_match = re.search(r'(\d+)人评价', html, re.S)
    people = people_match.group(1) if people_match else "0"
    quote_match = re.search(r'<span class="inq">(.*?)</span>', html, re.S)
    quote = quote_match.group(1) if quote_match else "无引言"
    info_match = re.search(r'<p class="">(.*?)</p>', html, re.S)
    if info_match:
        # Collapse every whitespace run: the raw block is full of
        # newlines and layout padding.  (A trailing .strip() would be a
        # no-op after removing all whitespace, so it was dropped.)
        info_clean = re.sub(r'\s+', '', info_match.group(1))
    else:
        info_clean = "未知信息"
    movie = {
        "title": title,
        "rating": rating,
        "people": people,
        "info": info_clean,
        "quote": quote,
    }
    all_movies.append(movie)
    return movie
def save_data(movies=None):
    """Persist crawled movies to three files in the working directory.

    Writes ``douban_top250.txt`` (human-readable lines),
    ``douban_top250.csv`` (UTF-8 with BOM so Excel opens it correctly)
    and ``douban_top250.json`` (pretty-printed, non-ASCII preserved).

    Args:
        movies: sequence of dicts with keys ``title``/``rating``/
            ``people``/``info``/``quote``.  Defaults to the module-level
            ``all_movies`` accumulator, preserving the original no-arg
            call ``save_data()``.  The explicit parameter removes the
            hard dependency on the global (which is not even defined in
            this file as committed — it was swallowed into the
            html_content string literal).
    """
    if movies is None:
        movies = all_movies
    print("正在保存数据...")
    with open("douban_top250.txt", "w", encoding="utf-8") as f:
        for movie in movies:
            line = f"电影名:{movie['title']} | 评分:{movie['rating']} | 评价人数:{movie['people']} | 引言:{movie['quote']}\n"
            f.write(line)
    print("已保存为douban_top250.txt")
    # utf-8-sig adds a BOM so spreadsheet apps detect the encoding.
    with open("douban_top250.csv", "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(["电影名", "评分", "评价人数", "详细信息", "引言"])
        for movie in movies:
            writer.writerow([movie['title'], movie['rating'], movie['people'], movie['info'], movie['quote']])
    print("已保存为douban_top250.csv")
    with open("douban_top250.json", "w", encoding="utf-8") as f:
        json.dump(movies, f, ensure_ascii=False, indent=4)
    print("已保存为douban_top250.json")
# Script entry point: crawl every page of the Top-250 list, then write
# the txt/csv/json output files.
# NOTE(review): `get_movie_data` is not defined at module level in this
# file as committed — the crawler functions appear to have been swallowed
# into the html_content string literal above, so running this raises
# NameError.  Confirm and restore them before executing.
if __name__ == "__main__":
    get_movie_data()
    save_data()
    print("全部任务完成!")