diff --git a/q2_1_crawler/1.py b/q2_1_crawler/1.py new file mode 100644 index 0000000..f319e96 --- /dev/null +++ b/q2_1_crawler/1.py @@ -0,0 +1,30 @@ +import re +import requests +from bs4 import BeautifulSoup as bs +import json +header = {'User-Agent':'Mozilla/5.0'} +url = 'https://exam.detr.top/exam-b/movies' + +html = requests.get(url, headers=header).text +# print(html) +open('movies.html','w',encoding='utf-8').write(html) +# print(html) +# print('==============') +# fd = bs(html, 'html.parser').find('p', class_='meta') +fd = re.search(r'exam_fingerprint:\s*(\S+)',html).group(1) +# print(fd) + +resp = bs(html, 'html.parser').find_all('tr', class_='item-row') + +# print(resp) +movies = [] + +for i in resp: + c = [] + for r in i.find_all('td'): + c.append(r.text.strip()) + movies.append({ + 'id':int(c[0]), 'title':c[1], 'director':c[2], 'year':int(c[3]), 'rating':float(c[4]), 'duration':int(c[5]), 'genre':c[6], 'actors_count':int(c[7]) + }) + +json.dump({'ID':fd, 'movies':movies}, open('movies.json', 'w', encoding='utf-8'), ensure_ascii=False, indent=2) diff --git a/q2_1_crawler/movies.html b/q2_1_crawler/movies.html new file mode 100644 index 0000000..3d89eeb --- /dev/null +++ b/q2_1_crawler/movies.html @@ -0,0 +1,152 @@ + + + + + + + + 电影列表 + + + +

电影列表

+

数据编号:B-20260625-3464

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
编号电影名导演上映年份评分时长(分钟)类型主演数
1霸王别姬Frank Darabont20046.8179科幻3
2放牛班的春天陈凯歌20196.7117喜剧5
3泰坦尼克号Robert Zemeckis20159.1177爱情4
4三傻大闹宝莱坞James Cameron20247.6117冒险5
5阿甘正传宫崎骏20168.0146喜剧3
6星际穿越Christopher Nolan20109.2168爱情2
7忠犬八公的故事Lasse Hallström20169.3106悬疑5
8肖申克的救赎Rajkumar Hirani20187.3107剧情4
9千与千寻Christophe Barratier20197.3118动画4
10盗梦空间Christopher Nolan20089.3111动画5
+ + \ No newline at end of file diff --git a/q2_1_crawler/movies.json b/q2_1_crawler/movies.json new file mode 100644 index 0000000..20ba5dc --- /dev/null +++ b/q2_1_crawler/movies.json @@ -0,0 +1,105 @@ +{ + "ID": "B-20260625-3464", + "movies": [ + { + "id": 1, + "title": "霸王别姬", + "director": "Frank Darabont", + "year": 2004, + "rating": 6.8, + "duration": 179, + "genre": "科幻", + "actors_count": 3 + }, + { + "id": 2, + "title": "放牛班的春天", + "director": "陈凯歌", + "year": 2019, + "rating": 6.7, + "duration": 117, + "genre": "喜剧", + "actors_count": 5 + }, + { + "id": 3, + "title": "泰坦尼克号", + "director": "Robert Zemeckis", + "year": 2015, + "rating": 9.1, + "duration": 177, + "genre": "爱情", + "actors_count": 4 + }, + { + "id": 4, + "title": "三傻大闹宝莱坞", + "director": "James Cameron", + "year": 2024, + "rating": 7.6, + "duration": 117, + "genre": "冒险", + "actors_count": 5 + }, + { + "id": 5, + "title": "阿甘正传", + "director": "宫崎骏", + "year": 2016, + "rating": 8.0, + "duration": 146, + "genre": "喜剧", + "actors_count": 3 + }, + { + "id": 6, + "title": "星际穿越", + "director": "Christopher Nolan", + "year": 2010, + "rating": 9.2, + "duration": 168, + "genre": "爱情", + "actors_count": 2 + }, + { + "id": 7, + "title": "忠犬八公的故事", + "director": "Lasse Hallström", + "year": 2016, + "rating": 9.3, + "duration": 106, + "genre": "悬疑", + "actors_count": 5 + }, + { + "id": 8, + "title": "肖申克的救赎", + "director": "Rajkumar Hirani", + "year": 2018, + "rating": 7.3, + "duration": 107, + "genre": "剧情", + "actors_count": 4 + }, + { + "id": 9, + "title": "千与千寻", + "director": "Christophe Barratier", + "year": 2019, + "rating": 7.3, + "duration": 118, + "genre": "动画", + "actors_count": 4 + }, + { + "id": 10, + "title": "盗梦空间", + "director": "Christopher Nolan", + "year": 2008, + "rating": 9.3, + "duration": 111, + "genre": "动画", + "actors_count": 5 + } + ] +} \ No newline at end of file