网络数据采集(爬虫基础)
This commit is contained in:
42
爬虫1/pachong1.py.txt
Normal file
42
爬虫1/pachong1.py.txt
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import time
|
||||||
|
|
||||||
|
def crawl_movie_info():
    """Scrape basic info from the first page of Douban Movie Top250.

    Fetches the listing page, parses every movie card, prints each
    movie's title, rating and year, and returns the parsed items.

    Returns:
        list: the BeautifulSoup ``div.item`` tags on success.
        None: if the request or parsing fails for any reason.
    """
    # BUG FIX: original line had a stray trailing ')' after the string
    # literal, which was a SyntaxError and stopped the file from running.
    url = "https://movie.douban.com/top250"
    headers = {
        # Browser-like User-Agent so the site does not reject the request.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # raise an exception on HTTP error status

        soup = BeautifulSoup(response.text, "html.parser")
        movie_items = soup.find_all("div", class_="item")

        print(f"✅ 成功获取到 {len(movie_items)} 部电影信息!")
        print("-" * 50)

        for index, item in enumerate(movie_items, 1):
            title = item.find("span", class_="title").get_text()
            rating = item.find("span", class_="rating_num").get_text()
            info_line = item.find("div", class_="bd").find("p").get_text().strip()
            # The last line of the info paragraph starts with the release year.
            year = info_line.split("\n")[-1].strip()[:4]

            print(f"🎬 第 {index} 部:")
            print(f" 片名:{title}")
            print(f" 评分:{rating}")
            print(f" 年份:{year}")
            print("-" * 30)

        return movie_items

    except Exception as e:
        # Broad catch is deliberate for this best-effort script:
        # report the failure and return None instead of crashing.
        print(f"❌ 爬取失败:{e}")
        return None
|
||||||
|
|
||||||
|
def _main():
    """Script entry point: announce the run, then crawl Douban Top250."""
    print("🚀 开始爬取豆瓣电影 Top250 信息...")
    crawl_movie_info()


if __name__ == "__main__":
    _main()
|
||||||
Reference in New Issue
Block a user