1

2026-06-25 15:47:41 +08:00
parent 11e6e8a461
commit 85b531e464
3 changed files with 287 additions and 0 deletions
--- a/q2_1_crawler/1.py
+++ b/q2_1_crawler/1.py
@@ -0,0 +1,30 @@
+import re
+import requests
+from bs4 import BeautifulSoup as bs
+import json
+# Fetch the exam movie table, keep a raw HTML copy, and export the rows as JSON.
+header = {'User-Agent':'Mozilla/5.0'}
+url = 'https://exam.detr.top/exam-b/movies'
+
+# A timeout keeps the script from hanging; raise_for_status stops on HTTP error pages.
+response = requests.get(url, headers=header, timeout=30)
+response.raise_for_status()
+html = response.text
+with open('movies.html', 'w', encoding='utf-8') as html_file:
+    html_file.write(html)
+
+# The text following "exam_fingerprint:" in the page is exported under the "ID" key.
+fingerprint_match = re.search(r'exam_fingerprint:\s*(\S+)', html)
+if fingerprint_match is None:
+    raise ValueError('exam_fingerprint not found in page')
+fd = fingerprint_match.group(1)
+
+# Each <tr class="item-row"> is read as eight <td> cells in the column order used below.
+movies = []
+for row in bs(html, 'html.parser').find_all('tr', class_='item-row'):
+    c = [cell.text.strip() for cell in row.find_all('td')]
+    movies.append({'id': int(c[0]), 'title': c[1], 'director': c[2], 'year': int(c[3]),
+                   'rating': float(c[4]), 'duration': int(c[5]), 'genre': c[6], 'actors_count': int(c[7])})
+
+with open('movies.json', 'w', encoding='utf-8') as json_file:
+    json.dump({'ID': fd, 'movies': movies}, json_file, ensure_ascii=False, indent=2)