"""Download the exam movie list page and export its table as JSON.

The page is saved verbatim to an HTML file, then the first ``<table>`` in it
is parsed row by row into a list of movie dicts and written to a JSON file.
"""
import json
import os

import requests
from bs4 import BeautifulSoup

MOVIES_URL = "https://exam.detr.top/exam-b/movies"

# Adjacent string literals are joined with nothing in between, so every
# fragment except the last must carry its own trailing space; without it the
# header collapses into "...x64)AppleWebKit/...Gecko)Chrome/...".
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/129.0.0.0 Safari/537.36"
)

# Seconds to wait for the server before giving up; requests waits forever
# when no timeout is passed.
REQUEST_TIMEOUT_SECONDS = 10

# Column order of the movie table: (JSON key, converter for the cell text).
_MOVIE_FIELDS = (
    ("id", int),
    ("title", str),
    ("director", str),
    ("year", int),
    ("rating", float),
    ("duration", int),
    ("genre", str),
    ("actors_count", int),
)


def _parse_movie_row(cells):
    """Build one movie dict from the stripped cell texts of a table row.

    Args:
        cells: Cell texts in table column order. Cells beyond the known
            columns are ignored.

    Returns:
        A dict keyed by the names in ``_MOVIE_FIELDS`` with converted values.

    Raises:
        ValueError: If the row has fewer cells than ``_MOVIE_FIELDS`` or a
            numeric cell cannot be converted.
    """
    if len(cells) < len(_MOVIE_FIELDS):
        raise ValueError(
            f"expected {len(_MOVIE_FIELDS)} cells, got {len(cells)}"
        )
    return {
        key: convert(text)
        for (key, convert), text in zip(_MOVIE_FIELDS, cells)
    }


def task_1_scrape(
    url=MOVIES_URL,
    html_path="movies.html",
    json_path="movies.json",
    timeout=REQUEST_TIMEOUT_SECONDS,
):
    """Fetch the movie page, save the raw HTML, and export the table as JSON.

    Progress and failures are reported with ``print``; the function never
    raises and always returns ``None``.

    Args:
        url: Page to download.
        html_path: Where the raw HTML is written (UTF-8).
        json_path: Where the parsed movie list is written (UTF-8, indent 4).
        timeout: Seconds to wait for the HTTP response.
    """
    headers = {"user-agent": USER_AGENT}

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Force UTF-8 instead of relying on the encoding requests guesses.
        response.encoding = "utf-8"

        if response.status_code != 200:
            print(f"[错误]请求失败,状态码:{response.status_code}")
            return

        html_content = response.text
        with open(html_path, "w", encoding="utf-8") as f:
            f.write(html_content)
        print(f"[成功]已保存{html_path}")

        table = BeautifulSoup(html_content, "html.parser").find("table")
        if table is None:
            # find() returns None when nothing matches; report that directly
            # rather than letting an AttributeError surface later.
            print(f"[错误]页面中未找到表格,未生成{json_path}")
            return

        # The first <tr> is treated as the header row and skipped.
        data_rows = table.find_all("tr")[1:]
        print(f"DEBUG:我抓到了{len(data_rows)}个电影数据")

        movies_data = []
        for row_number, row in enumerate(data_rows, start=1):
            cells = [td.get_text(strip=True) for td in row.find_all("td")]
            if not cells:
                # Rows without <td> cells (e.g. extra header rows) carry no data.
                continue
            try:
                movies_data.append(_parse_movie_row(cells))
            except ValueError as err:
                # One malformed row should not discard every other movie.
                print(f"[警告]跳过第{row_number}行:{err}")

        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(movies_data, f, ensure_ascii=False, indent=4)
        print(f"[成功]已抓取{len(movies_data)}部电影并保存至{json_path}")

    except Exception as e:
        # Top-level boundary of the task: report network, file and parsing
        # failures instead of propagating them to the caller.
        print(f"[异常]发生错误:{e}")