完成爬虫作业:豆瓣Top250 bs4+xpath两种方法
This commit is contained in:
50
douban_xpath.py.txt
Normal file
50
douban_xpath.py.txt
Normal file
@@ -0,0 +1,50 @@
|
||||
import requests
|
||||
from lxml import etree
|
||||
import csv
|
||||
import time
|
||||
import random
|
||||
|
||||
# Scrape the Douban Top 250 movie list with requests + lxml XPath and save the
# result as a CSV file.
#
# NOTE(review): every Chinese string literal in this script arrived mis-encoded
# (GBK bytes shown as UTF-8), which made the rank, score and quote column names
# one identical key, so rank and score were overwritten by the quote. The
# literals below are reconstructed from the surviving bytes and the variable
# names. The wording of the quote ("引言") and info ("基本信息") columns is a
# best guess -- confirm against the original file. The final message used to
# start with "? ", an unrecoverable lost character, which has been dropped.

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

PAGE_SIZE = 25   # movies listed per page
PAGE_COUNT = 10  # 10 pages x 25 movies = 250
OUTPUT_PATH = "douban_top250_xpath.csv"

# CSV columns, in order: rank, title, score, one-line quote, first info line.
FIELDNAMES = ["排名", "电影名", "评分", "引言", "基本信息"]
NO_QUOTE = "无"  # placeholder for movies that have no quote


def page_url(page):
    """Return the listing URL for the zero-based page index *page*."""
    start = page * PAGE_SIZE
    return f'https://movie.douban.com/top250?start={start}&filter='


def first_text(nodes, default=""):
    """Return the first element of an XPath result list, or *default* if empty."""
    return nodes[0] if nodes else default


def parse_item(item):
    """Build one CSV row (a dict keyed by FIELDNAMES) from a movie element.

    *item* is an lxml element for one ``div`` with class ``item``. A field
    missing from the markup becomes an empty string (or NO_QUOTE for the
    quote), so one incomplete entry no longer discards the rest of its page.
    """
    values = (
        first_text(item.xpath('.//em/text()')),
        first_text(item.xpath('.//span[@class="title"][1]/text()')),
        first_text(item.xpath('.//span[@class="rating_num"]/text()')),
        first_text(item.xpath('.//span[@class="inq"]/text()'), NO_QUOTE),
        # Only the first text node of the paragraph is kept, as before.
        first_text(item.xpath('.//div[@class="bd"]/p[1]/text()')).strip(),
    )
    return dict(zip(FIELDNAMES, values))


def fetch_page(page):
    """Download one listing page and return its movies as a list of rows.

    Raises requests.RequestException on network failure or an HTTP error
    status.
    """
    response = requests.get(page_url(page), headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page as "no movies".
    response.raise_for_status()
    tree = etree.HTML(response.text)
    if tree is None:
        # Guard for a body lxml cannot build a tree from -- TODO confirm
        # whether lxml returns None or raises for an empty document.
        return []
    return [parse_item(item) for item in tree.xpath('//div[@class="item"]')]


def scrape_top250():
    """Fetch every listing page and return all parsed rows.

    A page that fails is reported on stdout and skipped, so the remaining
    pages are still collected.
    """
    movies = []
    for page in range(PAGE_COUNT):
        print(f'正在爬取第 {page+1} 页...')
        # Random pause before each request to avoid hammering the site.
        time.sleep(random.uniform(0.5, 1.5))
        try:
            movies.extend(fetch_page(page))
        except (requests.RequestException, etree.LxmlError) as e:
            print(f"第 {page+1} 页爬取失败:{e}")
    return movies


def save_movies(movies, path=OUTPUT_PATH):
    """Write *movies* to *path* as CSV with a header row.

    The file is encoded as utf-8-sig (UTF-8 with a byte-order mark).
    """
    with open(path, "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(movies)


if __name__ == "__main__":
    movie_list = scrape_top250()
    save_movies(movie_list)
    print(f"爬取完成,共 {len(movie_list)} 条数据,已保存到 douban_top250_xpath.csv")
|
||||
Reference in New Issue
Block a user