完成作业爬虫三种形式

This commit is contained in:
2509165016
2026-03-31 11:33:06 +08:00
parent 93e0e78a4c
commit 3728f9cfdd
3 changed files with 44 additions and 0 deletions

44
wenben.txt Normal file
View File

@@ -0,0 +1,44 @@
import requests
import re
import time
# Browser-like User-Agent sent with every request.
# NOTE(review): presumably required because the site rejects the default
# python-requests client string -- confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

# The listing is fetched as 10 pages of 25 entries each (250 in total).
PAGE_COUNT = 10
PAGE_SIZE = 25
# Seconds to wait for the server; without a timeout requests.get() can
# block forever on a stalled connection.
REQUEST_TIMEOUT = 10
# Pause between page requests so the site is not hit too quickly.
REQUEST_DELAY = 1
OUTPUT_PATH = 'douban_top250.txt'

# Compiled once instead of on every page.  The character class excludes
# '<' and '&', so a span whose text contains an HTML entity never matches.
# NOTE(review): this appears to be how alternate-language titles (written
# with a leading "&nbsp;/&nbsp;") are skipped -- confirm against live HTML.
_TITLE_PATTERN = re.compile(r'<span class="title">([^<&]+)</span>')


def extract_chinese_titles(html: str) -> list:
    """Return the primary movie titles found in one page of HTML.

    Args:
        html: Markup of a single listing page.

    Returns:
        The text of every ``<span class="title">`` element that contains
        neither ``<`` nor ``&`` and does not start with ``/``, in document
        order.  An empty list is returned when nothing matches.
    """
    return [
        title
        for title in _TITLE_PATTERN.findall(html)
        # Extra guard against alternate titles, which begin with a slash.
        if not title.startswith('/')
    ]


def fetch_page(start: int) -> str:
    """Download one listing page and return its decoded body.

    Args:
        start: Zero-based offset of the first entry on the page.

    Returns:
        The response body as text.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    url = f'https://movie.douban.com/top250?start={start}&filter='
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # turn HTTP error statuses into exceptions
    return response.text


if __name__ == '__main__':
    all_chinese_titles = []

    # Crawl the pages in order; stop at the first page that fails and keep
    # whatever was collected up to that point.
    for page in range(PAGE_COUNT):
        print(f'正在爬取第 {page+1} 页...')
        try:
            html = fetch_page(page * PAGE_SIZE)
        except requests.RequestException as e:
            print(f'第 {page+1} 页爬取失败: {e}')
            break
        all_chinese_titles.extend(extract_chinese_titles(html))
        time.sleep(REQUEST_DELAY)

    # Save the titles as a numbered list, one per line.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        f.writelines(
            f'{i}. {title}\n'
            for i, title in enumerate(all_chinese_titles, 1)
        )
    print(f'已成功保存全部 {len(all_chinese_titles)} 部电影到 douban_top250.txt')

    # Read the file back and echo it so the result can be checked by eye.
    with open(OUTPUT_PATH, 'r', encoding='utf-8') as f:
        print(f.read())