完成爬虫作业:豆瓣Top250 bs4+xpath两种方法

This commit is contained in:
2509165046
2026-03-26 15:58:14 +08:00
parent 82e2af26a5
commit cc2b8096f4
2 changed files with 108 additions and 0 deletions

58
douban_bs4.py.txt Normal file
View File

@@ -0,0 +1,58 @@
import requests
from bs4 import BeautifulSoup
import csv
import time
import random

# Browser-like User-Agent header: douban rejects requests with the default
# python-requests UA, so we must impersonate a real browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

movie_list = []

# Crawl 10 pages x 25 movies per page = Top 250 in total.
for page in range(10):
    start = page * 25
    url = f'https://movie.douban.com/top250?start={start}&filter='
    print(f'Fetching page {page + 1}...')
    # Random delay between requests to reduce the chance of being blocked.
    time.sleep(random.uniform(0.5, 1.5))
    try:
        # 1. Request the page; raise_for_status turns HTTP errors (403/429/5xx)
        #    into exceptions instead of silently parsing an error page.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # 2. Parse the HTML with the lxml backend.
        soup = BeautifulSoup(response.text, 'lxml')
        items = soup.select('.item')
        # 3. Extract the fields of every movie on the page.
        for item in items:
            rank = item.select_one('.pic em').text        # ranking number (1-250)
            title = item.select_one('.hd .title').text    # primary (Chinese) title
            score = item.select_one('.rating_num').text   # average rating
            inq = item.select_one('.inq')                 # one-line quote, may be absent
            quote = inq.text if inq else ""
            info = item.select_one('.bd p').text.strip()  # director / year / genre line
            movie_list.append({
                "rank": rank,
                "title": title,
                "score": score,
                "quote": quote,
                "info": info,
            })
    except Exception as e:
        # Best-effort crawl: log the failing page and continue with the rest.
        print(f"Failed to fetch page {page + 1}: {e}")

# 4. Save to CSV. utf-8-sig adds a BOM so Excel detects the encoding correctly.
with open("douban_top250_bs4.csv", "w", encoding="utf-8-sig", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["rank", "title", "score", "quote", "info"])
    writer.writeheader()
    writer.writerows(movie_list)
print(f"Done: scraped {len(movie_list)} records, saved to douban_top250_bs4.csv")

50
douban_xpath.py.txt Normal file
View File

@@ -0,0 +1,50 @@
import requests
from lxml import etree
import csv
import time
import random

# Browser-like User-Agent header: douban rejects requests with the default
# python-requests UA, so we must impersonate a real browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

movie_list = []

# Crawl 10 pages x 25 movies per page = Top 250 in total.
for page in range(10):
    start = page * 25
    url = f'https://movie.douban.com/top250?start={start}&filter='
    print(f'Fetching page {page + 1}...')
    # Random delay between requests to reduce the chance of being blocked.
    time.sleep(random.uniform(0.5, 1.5))
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Raise on HTTP errors (403/429/5xx) instead of parsing an error page;
        # keeps this script consistent with the bs4 variant.
        response.raise_for_status()
        tree = etree.HTML(response.text)
        items = tree.xpath('//div[@class="item"]')
        for item in items:
            rank = item.xpath('.//em/text()')[0]                          # ranking number
            title = item.xpath('.//span[@class="title"][1]/text()')[0]    # primary title
            score = item.xpath('.//span[@class="rating_num"]/text()')[0]  # average rating
            quote_nodes = item.xpath('.//span[@class="inq"]/text()')      # quote may be absent
            quote = quote_nodes[0] if quote_nodes else ""
            info = item.xpath('.//div[@class="bd"]/p[1]/text()')[0].strip()  # director/year/genre
            movie_list.append({
                "rank": rank,
                "title": title,
                "score": score,
                "quote": quote,
                "info": info,
            })
    except Exception as e:
        # Best-effort crawl: log the failing page and continue with the rest.
        print(f"Failed to fetch page {page + 1}: {e}")

# Save to CSV. utf-8-sig adds a BOM so Excel detects the encoding correctly.
with open("douban_top250_xpath.csv", "w", encoding="utf-8-sig", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["rank", "title", "score", "quote", "info"])
    writer.writeheader()
    writer.writerows(movie_list)
print(f"Done: scraped {len(movie_list)} records, saved to douban_top250_xpath.csv")