"""Scrape the first 50 entries of the Douban Movie Top 250 list.

For every movie the rank, title, leading actors and one-line quote are
collected, and the resulting list is written to ``movies.json`` as UTF-8 JSON.
"""

import json
import time

import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# The list shows 25 movies per page, so the first 50 entries span two pages
# (start=0 and start=25).
PAGE_SIZE = 25
PAGE_COUNT = 2
# Seconds to wait for the server before giving up; without a timeout a stalled
# connection would block the script indefinitely.
REQUEST_TIMEOUT = 10
# Polite delay (seconds) between page requests to avoid anti-scraping blocks.
REQUEST_DELAY = 1
OUTPUT_PATH = "movies.json"


def _extract_actors(info):
    """Return the text after the "主演:" marker on the first line of *info*.

    Falls back to "未知" when the first line carries no such marker, so the
    director text is never reported as the actor list.
    """
    first_line = info.split("\n")[0]
    if "主演:" not in first_line:
        return "未知"
    return first_line.split("主演:")[-1].strip()


def _parse_item(item, rank):
    """Convert one ``div.item`` tag into a movie record dict.

    Missing sub-elements yield placeholder values instead of raising, so a
    single malformed entry does not abort the whole run.
    """
    title_tag = item.find("span", class_="title")
    title = title_tag.text.strip() if title_tag else "未知"

    body = item.find("div", class_="bd")
    info_tag = body.find("p", class_="") if body else None
    info = info_tag.text.strip() if info_tag else ""

    quote_tag = item.find("span", class_="inq")
    quote = quote_tag.text.strip() if quote_tag else "无短评"

    return {
        "rank": rank,
        "title": title,
        "actors": _extract_actors(info),
        "quote": quote,
    }


def _fetch_page(page):
    """Download one list page and return its ``div.item`` tags.

    Raises:
        requests.HTTPError: if the server answers with an error status, so a
            blocked request is reported instead of being parsed as an empty
            page.
        requests.Timeout: if the server does not answer within
            ``REQUEST_TIMEOUT`` seconds.
    """
    url = f"https://movie.douban.com/top250?start={page * PAGE_SIZE}"
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return soup.find_all("div", class_="item")


def scrape_top_movies():
    """Return the movie records of the first ``PAGE_COUNT`` list pages."""
    movies = []
    for page in range(PAGE_COUNT):
        for idx, item in enumerate(_fetch_page(page)):
            # Rank is derived from the position on the page, not read from it.
            movies.append(_parse_item(item, page * PAGE_SIZE + idx + 1))
        # No need to wait once the last page has been fetched.
        if page < PAGE_COUNT - 1:
            time.sleep(REQUEST_DELAY)
    return movies


def main():
    """Scrape the list and save it to ``movies.json``."""
    movies = scrape_top_movies()
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        json.dump(movies, f, ensure_ascii=False, indent=2)
    print("爬取完成,数据已保存到 movies.json")


if __name__ == "__main__":
    main()