This commit is contained in:
2026-06-25 15:47:41 +08:00
parent 11e6e8a461
commit 85b531e464
3 changed files with 287 additions and 0 deletions

30
q2_1_crawler/1.py Normal file
View File

@@ -0,0 +1,30 @@
import re
import requests
from bs4 import BeautifulSoup as bs
import json
# Download the movie listing page, keep a local copy of the raw HTML, and
# extract the exam fingerprint from the page text.
header = {'User-Agent': 'Mozilla/5.0'}
url = 'https://exam.detr.top/exam-b/movies'

# requests applies no timeout by default, so one is set explicitly to keep the
# script from hanging on a stalled connection. raise_for_status() stops an
# HTTP error page from being saved and parsed as if it were the listing.
response = requests.get(url, headers=header, timeout=30)
response.raise_for_status()
html = response.text

# Save the raw page; the context manager guarantees the file is flushed and
# closed.
with open('movies.html', 'w', encoding='utf-8') as html_file:
    html_file.write(html)

# The fingerprint is the first run of non-whitespace characters following the
# "exam_fingerprint:" marker in the page.
fingerprint_match = re.search(r'exam_fingerprint:\s*(\S+)', html)
if fingerprint_match is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError(f'exam_fingerprint not found in the page fetched from {url}')
fd = fingerprint_match.group(1)
# Parse every <tr class="item-row"> of the fetched page (`html`) into a movie
# record and write the records, together with the fingerprint (`fd`), to
# movies.json.
resp = bs(html, 'html.parser').find_all('tr', class_='item-row')

movies = []
for row in resp:
    # Cell values are read positionally from the row's <td> elements, with
    # surrounding whitespace stripped.
    cells = [cell.text.strip() for cell in row.find_all('td')]
    movies.append({
        'id': int(cells[0]),
        'title': cells[1],
        'director': cells[2],
        'year': int(cells[3]),
        'rating': float(cells[4]),
        'duration': int(cells[5]),
        'genre': cells[6],
        'actors_count': int(cells[7]),
    })

# The context manager guarantees the output is flushed and closed;
# ensure_ascii=False keeps non-ASCII text readable in the UTF-8 file.
with open('movies.json', 'w', encoding='utf-8') as json_file:
    json.dump({'ID': fd, 'movies': movies}, json_file, ensure_ascii=False, indent=2)