完成作业一

This commit is contained in:
2509165025
2026-04-02 15:55:18 +08:00
parent 7286ca5cd2
commit f4f94c5077
4 changed files with 209 additions and 0 deletions

67
爬虫.txt.txt Normal file
View File

@@ -0,0 +1,67 @@
import requests
from bs4 import BeautifulSoup
import time
def scrape_douban_top250_to_txt():
    """Scrape the Douban Top 250 movie list into ``douban_movies.txt``.

    Walks the 10 paginated list pages (25 movies each, ``start`` = 0..225),
    extracts rank, title, rating, vote count, quote, director, cast, year,
    region and genre for every movie, and writes one formatted record per
    movie to the UTF-8 text file. A request failure on one page is reported
    and that page is skipped; the remaining pages are still fetched.

    Returns:
        None. Side effect: creates/overwrites ``douban_movies.txt``.
    """
    # NOTE(review): the original file's Chinese string literals were destroyed
    # by an encoding round-trip (mojibake); the literals below are the standard
    # strings for this well-known Douban scraping exercise — verify against the
    # intended output format.
    headers = {
        # Identify as a desktop browser; Douban rejects the default UA.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    with open('douban_movies.txt', 'w', encoding='utf-8') as txtfile:
        for start in range(0, 250, 25):
            url = f'https://movie.douban.com/top250?start={start}'
            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
                # BUG FIX: the original call was missing its closing parenthesis.
                movies = soup.find_all('div', class_='item')
                for movie in movies:
                    rank = movie.find('em').text
                    title = movie.find('span', class_='title').text
                    rating = movie.find('span', class_='rating_num').text
                    # The last <span> in the star block holds e.g. "123456人评价".
                    people_span = movie.find('div', class_='star').find_all('span')[-1]
                    people = people_span.text.replace('人评价', '')
                    quote_tag = movie.find('span', class_='inq')
                    # Some movies have no quote; fall back to a placeholder.
                    quote = quote_tag.text if quote_tag else '无'
                    # First line: "导演: X 主演: Y"; second line: "year / area / genre".
                    info = movie.find('p', class_='').text.strip().split('\n')
                    director_actor = info[0].strip()
                    if '导演: ' in director_actor:
                        director = director_actor.split('导演: ')[1].split('主演: ')[0].strip()
                        actor = director_actor.split('主演: ')[1].strip() if '主演: ' in director_actor else '未知'
                    else:
                        director = '未知'
                        actor = '未知'
                    year_area_type = info[1].strip().split('/')
                    year = year_area_type[0].strip()
                    area = year_area_type[1].strip() if len(year_area_type) > 1 else '未知'
                    # BUG FIX: the original fallback string literal was unterminated.
                    genre = year_area_type[2].strip() if len(year_area_type) > 2 else '未知'
                    txtfile.write(f'排名：{rank}\n')
                    txtfile.write(f'电影名称：{title}\n')
                    txtfile.write(f'评分：{rating}\n')
                    txtfile.write(f'评价人数：{people}\n')
                    txtfile.write(f'经典台词：{quote}\n')
                    txtfile.write(f'导演：{director}\n')
                    txtfile.write(f'主演：{actor}\n')
                    txtfile.write(f'上映年份：{year}\n')
                    txtfile.write(f'地区：{area}\n')
                    txtfile.write(f'类型：{genre}\n')
                    txtfile.write('-' * 50 + '\n')  # record separator
                print(f'第 {start//25 + 1} 页爬取完成，当前URL: {url}')
                time.sleep(1)  # throttle: 1 s between pages to avoid an IP ban
            except requests.exceptions.RequestException as e:
                # Best-effort: report the failed page and move on to the next one.
                print(f'请求异常：{e}')
                continue
    print('所有电影数据爬取完成，保存为 douban_movies.txt')
# Script entry point: run the scraper only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    scrape_douban_top250_to_txt()