diff --git a/260331+2509165020 作业/movies.csv b/260331+2509165020 作业/movies.csv
new file mode 100644
index 0000000..91a10fe
--- /dev/null
+++ b/260331+2509165020 作业/movies.csv
@@ -0,0 +1 @@
+排名,中文名,英文名,评分
diff --git a/260331+2509165020 作业/movies.json b/260331+2509165020 作业/movies.json
new file mode 100644
index 0000000..0637a08
--- /dev/null
+++ b/260331+2509165020 作业/movies.json
@@ -0,0 +1 @@
+[]
\ No newline at end of file
diff --git a/260331+2509165020 作业/movies.txt b/260331+2509165020 作业/movies.txt
new file mode 100644
index 0000000..132612c
--- /dev/null
+++ b/260331+2509165020 作业/movies.txt
@@ -0,0 +1,10 @@
+肖申克的救赎
+霸王别姬
+泰坦尼克号
+阿甘正传
+千与千寻
+美丽人生
+星际穿越
+这个杀手不太冷
+盗梦空间
+楚门的世界
diff --git a/260331+2509165020 作业/test1.py b/260331+2509165020 作业/test1.py
new file mode 100644
index 0000000..15bab04
--- /dev/null
+++ b/260331+2509165020 作业/test1.py
@@ -0,0 +1,62 @@
+"""Save the first ten Chinese titles of Douban Top 250 to movies.txt."""
+
+import re
+import sys
+
+import requests
+
+URL = "https://movie.douban.com/top250"
+HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+    )
+}
+OUTPUT_PATH = "movies.txt"
+LIMIT = 10
+
+# NOTE(review): the committed pattern was a bare r'([^&]+?)', which matches a
+# single character at a time. The <span class="title"> anchors below are
+# assumed from Douban's list markup -- confirm against the live page.
+# Presumably [^&] is there to skip alternate titles starting with "&nbsp;".
+TITLE_PATTERN = re.compile(r'<span class="title">([^&]+?)</span>', re.S)
+
+
+def fetch_page(url):
+    """Return the body of *url*; raise requests.RequestException on failure."""
+    response = requests.get(url=url, headers=HEADERS, timeout=10)
+    response.raise_for_status()
+    return response.text
+
+
+def main():
+    """Fetch the page, save up to LIMIT titles and print a preview.
+
+    Returns the process exit status: 0 on success, 1 on failure.
+    """
+    try:
+        page_source = fetch_page(URL)
+    except requests.RequestException as e:
+        print(f"请求失败:{e}")
+        return 1
+    print("页面请求成功!")
+
+    target_names = TITLE_PATTERN.findall(page_source)[:LIMIT]
+    if not target_names:
+        # Keep any previous movies.txt instead of truncating it to nothing.
+        print("未匹配到任何电影名,请检查页面结构或正则表达式。")
+        return 1
+
+    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
+        f.writelines(f"{name}\n" for name in target_names)
+
+    print(f"成功爬取{len(target_names)}部电影名,已保存到movies.txt!")
+
+    print("\n爬取结果预览:")
+    for i, name in enumerate(target_names, 1):
+        print(f"{i}. {name}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/260331+2509165020 作业/test2.py b/260331+2509165020 作业/test2.py
new file mode 100644
index 0000000..a87d9fa
--- /dev/null
+++ b/260331+2509165020 作业/test2.py
@@ -0,0 +1,86 @@
+"""Save rank, titles and rating of the Douban Top 250 top ten to movies.csv."""
+
+import csv
+import html
+import re
+import sys
+
+import requests
+
+URL = "https://movie.douban.com/top250"
+HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+    )
+}
+OUTPUT_PATH = "movies.csv"
+LIMIT = 10
+
+# NOTE(review): the committed pattern had no HTML anchors left, so its lazy
+# groups could only ever capture empty strings. The tag and class names below
+# are assumed from Douban's list markup -- confirm against the live page.
+# Fields are looked up inside one entry at a time so that a movie with a
+# single title cannot borrow the title of the next entry.
+ITEM_PATTERN = re.compile(r'<div class="item">(.*?)</li>', re.S)
+RANK_PATTERN = re.compile(r"<em[^>]*>(\d+)</em>")
+TITLE_PATTERN = re.compile(r'<span class="title">(.*?)</span>', re.S)
+RATING_PATTERN = re.compile(r'<span class="rating_num"[^>]*>(.*?)</span>', re.S)
+
+
+def fetch_page(url):
+    """Return the body of *url*; raise requests.RequestException on failure."""
+    response = requests.get(url, headers=HEADERS, timeout=10)
+    response.raise_for_status()
+    return response.text
+
+
+def clean_title(text):
+    """Decode HTML entities and trim the " / " separator around a title."""
+    return html.unescape(text).replace("\xa0", " ").strip(" /")
+
+
+def parse_movies(page_source, limit=LIMIT):
+    """Return up to *limit* [rank, title, en_title, rating] rows."""
+    rows = []
+    for block in ITEM_PATTERN.findall(page_source):
+        rank = RANK_PATTERN.search(block)
+        titles = TITLE_PATTERN.findall(block)
+        rating = RATING_PATTERN.search(block)
+        if not (rank and titles and rating):
+            continue  # not a complete movie entry
+        en = clean_title(titles[1]) if len(titles) > 1 else ""
+        rows.append(
+            [rank.group(1), clean_title(titles[0]), en, rating.group(1).strip()]
+        )
+    return rows[:limit]
+
+
+def main():
+    """Fetch the page and write the parsed rows to movies.csv.
+
+    Returns the process exit status: 0 on success, 1 on failure.
+    """
+    try:
+        page_source = fetch_page(URL)
+    except requests.RequestException as e:
+        print(f"请求失败:{e}")
+        return 1
+
+    movies = parse_movies(page_source)
+    if not movies:
+        # A header-only CSV looks like success; report the failure instead.
+        print("未匹配到任何电影,请检查页面结构或正则表达式。")
+        return 1
+
+    with open(OUTPUT_PATH, "w", encoding="utf-8-sig", newline="") as f:
+        w = csv.writer(f)
+        w.writerow(["排名", "中文名", "英文名", "评分"])
+        w.writerows(movies)
+
+    print("已保存到 movies.csv")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/260331+2509165020 作业/test3.py b/260331+2509165020 作业/test3.py
new file mode 100644
index 0000000..12695ce
--- /dev/null
+++ b/260331+2509165020 作业/test3.py
@@ -0,0 +1,97 @@
+"""Save the Douban Top 250 top ten, with quotes, to movies.json."""
+
+import html
+import json
+import re
+import sys
+
+import requests
+
+URL = "https://movie.douban.com/top250"
+HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+    )
+}
+OUTPUT_PATH = "movies.json"
+LIMIT = 10
+
+# NOTE(review): the committed pattern had no HTML anchors left, so its lazy
+# groups could only ever capture empty strings. The tag and class names below
+# are assumed from Douban's list markup -- confirm against the live page.
+# Fields are looked up inside one entry at a time so that a movie with a
+# single title cannot borrow the title of the next entry.
+ITEM_PATTERN = re.compile(r'<div class="item">(.*?)</li>', re.S)
+RANK_PATTERN = re.compile(r"<em[^>]*>(\d+)</em>")
+TITLE_PATTERN = re.compile(r'<span class="title">(.*?)</span>', re.S)
+RATING_PATTERN = re.compile(r'<span class="rating_num"[^>]*>(.*?)</span>', re.S)
+# The quote stays optional, as in the committed code: a missing one becomes "".
+QUOTE_PATTERN = re.compile(r'<p class="quote">\s*<span[^>]*>(.*?)</span>', re.S)
+
+
+def fetch_page(url):
+    """Return the body of *url*; raise requests.RequestException on failure."""
+    response = requests.get(url, headers=HEADERS, timeout=10)
+    response.raise_for_status()
+    return response.text
+
+
+def clean(text):
+    """Decode HTML entities and trim surrounding whitespace."""
+    return html.unescape(text).replace("\xa0", " ").strip()
+
+
+def clean_title(text):
+    """Clean *text* and trim the " / " separator around a title."""
+    return clean(text).strip(" /")
+
+
+def parse_movies(page_source, limit=LIMIT):
+    """Return up to *limit* movie dicts parsed from *page_source*."""
+    result = []
+    for block in ITEM_PATTERN.findall(page_source):
+        rank = RANK_PATTERN.search(block)
+        titles = TITLE_PATTERN.findall(block)
+        rating = RATING_PATTERN.search(block)
+        if not (rank and titles and rating):
+            continue  # not a complete movie entry
+        quote = QUOTE_PATTERN.search(block)
+        result.append(
+            {
+                "rank": int(rank.group(1)),
+                "title": clean_title(titles[0]),
+                "en_title": clean_title(titles[1]) if len(titles) > 1 else "",
+                "rating": rating.group(1).strip(),
+                "quote": clean(quote.group(1)) if quote else "",
+            }
+        )
+    return result[:limit]
+
+
+def main():
+    """Fetch the page and write the parsed movies to movies.json.
+
+    Returns the process exit status: 0 on success, 1 on failure.
+    """
+    try:
+        page_source = fetch_page(URL)
+    except requests.RequestException as e:
+        print(f"请求失败:{e}")
+        return 1
+
+    result = parse_movies(page_source)
+    if not result:
+        # An empty JSON list looks like success; report the failure instead.
+        print("未匹配到任何电影,请检查页面结构或正则表达式。")
+        return 1
+
+    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
+        json.dump(result, f, ensure_ascii=False, indent=4)
+
+    print("已保存到 movies.json")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())