# Recovered from a whitespace-collapsed git patch (commit
# 85b531e464da05bd7ce2c1713ee35885a5b592d2, subject "[PATCH] 1") that creates
# q2_1_crawler/1.py, q2_1_crawler/movies.html and q2_1_crawler/movies.json.
# This is the content of q2_1_crawler/1.py.
"""Download the exam movie listing, keep the raw HTML, and export it as JSON.

Running the script writes two files into the current working directory:

* ``movies.html`` -- the page text exactly as downloaded.
* ``movies.json`` -- ``{"ID": <exam fingerprint>, "movies": [...]}`` with one
  object per ``<tr class="item-row">`` element found in the page.
"""

import json
import re

URL = 'https://exam.detr.top/exam-b/movies'
HEADERS = {'User-Agent': 'Mozilla/5.0'}
# Seconds to wait for the server before giving up; without a timeout
# ``requests`` may block indefinitely.
REQUEST_TIMEOUT = 10

# The token ends at whitespace or at the start of a tag, so a fingerprint
# written directly before a closing tag (``...: abc</p>``) is captured
# without the tag.
FINGERPRINT_RE = re.compile(r'exam_fingerprint:\s*([^\s<]+)')

# Output key and converter for each ``<td>`` of an item row, in column order.
COLUMNS = (
    ('id', int),
    ('title', str),
    ('director', str),
    ('year', int),
    ('rating', float),
    ('duration', int),
    ('genre', str),
    ('actors_count', int),
)


def fetch_html(url=URL, headers=HEADERS, timeout=REQUEST_TIMEOUT):
    """Return the body of *url* as text.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so
            an error page is never saved or parsed as if it were the listing.
        requests.RequestException: on connection problems or timeout.
    """
    # Imported here so the pure helpers below stay usable without the
    # third-party package installed.
    import requests

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.text


def extract_fingerprint(html):
    """Return the token that follows ``exam_fingerprint:`` in *html*.

    Raises:
        ValueError: if the page contains no ``exam_fingerprint:`` marker.
    """
    match = FINGERPRINT_RE.search(html)
    if match is None:
        raise ValueError('exam_fingerprint not found in the downloaded page')
    return match.group(1)


def row_to_movie(cells):
    """Convert the stripped cell texts of one table row into a movie dict.

    Args:
        cells: cell texts in column order; entries beyond the eighth are
            ignored.

    Raises:
        ValueError: if the row has fewer than eight cells, or a numeric
            column does not hold a number.
    """
    if len(cells) < len(COLUMNS):
        raise ValueError(
            f'expected at least {len(COLUMNS)} cells, got {len(cells)}: {cells!r}'
        )
    return {key: convert(text) for (key, convert), text in zip(COLUMNS, cells)}


def parse_movies(html):
    """Return one movie dict per ``<tr class="item-row">`` in *html*."""
    # Imported here for the same reason as ``requests`` in ``fetch_html``.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'html.parser')
    return [
        row_to_movie([cell.text.strip() for cell in row.find_all('td')])
        for row in soup.find_all('tr', class_='item-row')
    ]


def main():
    """Fetch the page, save it, and write the extracted data as JSON."""
    html = fetch_html()

    # Save the raw page first so it is available for inspection even when
    # the extraction below fails.
    with open('movies.html', 'w', encoding='utf-8') as html_file:
        html_file.write(html)

    payload = {'ID': extract_fingerprint(html), 'movies': parse_movies(html)}
    with open('movies.json', 'w', encoding='utf-8') as json_file:
        json.dump(payload, json_file, ensure_ascii=False, indent=2)


if __name__ == '__main__':
    main()

# The patch continues with q2_1_crawler/movies.html (new file mode 100644,
# index 0000000..3d89eeb, hunk @@ -0,0 +1,152 @@); the rendered table rows of
# that file follow below and are not Python.
+ +| 编号 | +电影名 | +导演 | +上映年份 | +评分 | +时长(分钟) | +类型 | +主演数 | +
|---|---|---|---|---|---|---|---|
| 1 | +霸王别姬 | +Frank Darabont | +2004 | +6.8 | +179 | +科幻 | +3 | +
| 2 | +放牛班的春天 | +陈凯歌 | +2019 | +6.7 | +117 | +喜剧 | +5 | +
| 3 | +泰坦尼克号 | +Robert Zemeckis | +2015 | +9.1 | +177 | +爱情 | +4 | +
| 4 | +三傻大闹宝莱坞 | +James Cameron | +2024 | +7.6 | +117 | +冒险 | +5 | +
| 5 | +阿甘正传 | +宫崎骏 | +2016 | +8.0 | +146 | +喜剧 | +3 | +
| 6 | +星际穿越 | +Christopher Nolan | +2010 | +9.2 | +168 | +爱情 | +2 | +
| 7 | +忠犬八公的故事 | +Lasse Hallström | +2016 | +9.3 | +106 | +悬疑 | +5 | +
| 8 | +肖申克的救赎 | +Rajkumar Hirani | +2018 | +7.3 | +107 | +剧情 | +4 | +
| 9 | +千与千寻 | +Christophe Barratier | +2019 | +7.3 | +118 | +动画 | +4 | +
| 10 | +盗梦空间 | +Christopher Nolan | +2008 | +9.3 | +111 | +动画 | +5 | +