完成作业爬虫三种形式
This commit is contained in:
0
douban2.py
Normal file
0
douban2.py
Normal file
0
douban3.py
Normal file
0
douban3.py
Normal file
44
wenben.txt
Normal file
44
wenben.txt
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
import requests
import re
import time

# Script: download the ten pages of https://movie.douban.com/top250,
# collect the text of every <span class="title"> element that passes the
# filters below, write the numbered list to douban_top250.txt and print
# the file back.
#
# NOTE(review): the user-facing messages below were reconstructed from
# GBK text that had been decoded as UTF-8; confirm the exact wording
# against the author's original file.

# Browser-like User-Agent header sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

all_chinese_titles = []

# Captures the text of each <span class="title">. Excluding '&' from the
# capture skips any span whose text contains an HTML entity -- presumably
# the alternate-language titles that start with "&nbsp;/&nbsp;"; verify
# against the live page markup.
pattern = r'<span class="title">([^<&]+)</span>'

# Fetch 10 pages; the `start` query parameter advances by 25 per page.
for page in range(10):
    start = page * 25
    url = f'https://movie.douban.com/top250?start={start}&filter='
    print(f'正在爬取第 {page+1} 页...')

    try:
        # A timeout keeps a stalled connection from hanging the script.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # raise on 4xx/5xx responses
    except requests.RequestException as e:
        # Stop at the first failed page; titles gathered so far are
        # still written out below.
        print(f'第 {page+1} 页爬取失败: {e}')
        break

    html = response.text
    titles = re.findall(pattern, html)

    # Drop any remaining entries that begin with '/'.
    chinese_titles = [t for t in titles if not t.startswith('/')]
    all_chinese_titles.extend(chinese_titles)

    time.sleep(1)  # pause one second between page requests

# Save the collected titles as a numbered list, one per line.
with open('douban_top250.txt', 'w', encoding='utf-8') as f:
    for i, title in enumerate(all_chinese_titles, 1):
        f.write(f'{i}. {title}\n')

print(f'已成功保存全部 {len(all_chinese_titles)} 部电影到 douban_top250.txt')

# Read the file back and print it to verify what was written.
with open('douban_top250.txt', 'r', encoding='utf-8') as f:
    print(f.read())
|
||||||
Reference in New Issue
Block a user