网络数据采集(爬虫基础)

This commit is contained in:
2509165020
2026-03-13 17:49:38 +08:00
parent 8b8266a311
commit 06e4a6216d

42
爬虫1/pachong1.py.txt Normal file
View File

@@ -0,0 +1,42 @@
import requests
from bs4 import BeautifulSoup
import time
def crawl_movie_info():
    """Fetch basic info for page 1 of Douban Movie Top 250.

    Prints the title, rating and year of each movie found, then returns
    the list of parsed item tags (``bs4.Tag``).  Returns ``None`` when
    the HTTP request itself fails.
    """
    url = "https://movie.douban.com/top250"
    headers = {
        # Douban rejects requests that do not carry a browser-like User-Agent.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }

    # Keep the try-block limited to the network call: only transport /
    # HTTP errors should abort the whole scrape.
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # 如果请求失败,抛出异常
    except requests.RequestException as e:
        print(f"❌ 爬取失败:{e}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    movie_items = soup.find_all("div", class_="item")
    print(f"✅ 成功获取到 {len(movie_items)} 部电影信息!")
    print("-" * 50)

    for index, item in enumerate(movie_items, 1):
        # Be robust to page-layout changes: skip an item whose expected
        # tags are missing instead of aborting the entire run (the old
        # broad except turned one bad item into a total failure).
        title_tag = item.find("span", class_="title")
        rating_tag = item.find("span", class_="rating_num")
        bd_tag = item.find("div", class_="bd")
        info_tag = bd_tag.find("p") if bd_tag else None
        if not (title_tag and rating_tag and info_tag):
            continue

        title = title_tag.get_text()
        rating = rating_tag.get_text()
        info_line = info_tag.get_text().strip()
        # The <p> holds two lines (crew / "year / country / genre");
        # the second line starts with the 4-digit year.
        year = info_line.split("\n")[-1].strip()[:4]

        print(f"🎬 第 {index} 部:")
        print(f"   片名:{title}")
        print(f"   评分:{rating}")
        print(f"   年份:{year}")
        print("-" * 30)

    return movie_items
# Script entry point: announce the run, then perform the crawl once.
if __name__ == "__main__":
    print("🚀 开始爬取豆瓣电影 Top250 信息...")
    crawl_movie_info()