完成作业一
This commit is contained in:
0
2509165025
Normal file
0
2509165025
Normal file
57
2509165025.CSV
Normal file
57
2509165025.CSV
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
import requests
from bs4 import BeautifulSoup
import csv
import time


def scrape_douban_top250():
    """Scrape the Douban Top 250 movie list and save it to douban_movies.csv.

    Writes one CSV row per movie: rank, title, rating, vote count and the
    signature quote (a "暂无" placeholder when a movie has no quote).
    Pages that fail to download are skipped with a message rather than
    aborting the whole run.
    """
    headers = {
        # Browser-like User-Agent — Douban rejects the default requests UA.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    with open('douban_movies.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['排名', '电影名称', '评分', '评价人数', '经典台词']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # The Top 250 list is paginated 25 items per page via `start`.
        for start in range(0, 250, 25):
            url = f'https://movie.douban.com/top250?start={start}'

            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()

                soup = BeautifulSoup(response.text, 'html.parser')
                movies = soup.find_all('div', class_='item')

                for movie in movies:
                    rank = movie.find('em').text
                    title = movie.find('span', class_='title').text
                    rating = movie.find('span', class_='rating_num').text
                    # The last <span> in the star block reads "N人评价";
                    # strip the suffix to keep just the number.
                    people_span = movie.find('div', class_='star').find_all('span')[-1]
                    people = people_span.text.replace('人评价', '')
                    # Not every movie has a quote span.
                    quote_tag = movie.find('span', class_='inq')
                    quote = quote_tag.text if quote_tag else "暂无"

                    writer.writerow({
                        '排名': rank,
                        '电影名称': title,
                        '评分': rating,
                        '评价人数': people,
                        '经典台词': quote
                    })

                print(f"第 {start//25 + 1} 页爬取完成,当前 URL: {url}")
                time.sleep(1)  # be polite: 1s between pages to avoid an IP ban

            except requests.exceptions.RequestException as e:
                print(f"请求异常:{e}")
                continue

    print("✅ 所有数据已爬取完成,已保存为 douban_movies.csv")


if __name__ == '__main__':
    scrape_douban_top250()
|
||||||
|
Can't render this file because it contains an unexpected character in line 37 and column 62.
|
85
2509165026(2).json
Normal file
85
2509165026(2).json
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
import requests
from bs4 import BeautifulSoup
import time
import json


def scrape_douban_top250_to_json():
    """Scrape the Douban Top 250 movie list and save it to douban_movies.json.

    Collects, per movie: rank, title, rating, vote count, quote, director,
    lead actors, release year, country/region and genre, then dumps the
    whole list as pretty-printed UTF-8 JSON. Pages that fail to download
    are skipped with a message.
    """
    headers = {
        # Browser-like User-Agent — Douban rejects the default requests UA.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    movies_list = []

    # The Top 250 list is paginated 25 items per page via `start`.
    for start in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={start}'

        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            movies = soup.find_all('div', class_='item')

            for movie in movies:
                rank = movie.find('em').text
                title = movie.find('span', class_='title').text
                rating = movie.find('span', class_='rating_num').text
                # The last <span> in the star block reads "N人评价".
                people_span = movie.find('div', class_='star').find_all('span')[-1]
                people = people_span.text.replace('人评价', '')
                quote_tag = movie.find('span', class_='inq')
                quote = quote_tag.text if quote_tag else "暂无"

                # The unclassed <p> holds two lines:
                #   "导演: X 主演: Y" / "year / country / genre"
                info = movie.find('p', class_='').text.strip().split('\n')
                director_actor = info[0].strip()

                if '导演: ' in director_actor:
                    director = director_actor.split('导演: ')[1].split('主演: ')[0].strip()
                    actor = director_actor.split('主演: ')[1].strip() if '主演: ' in director_actor else "暂无"
                else:
                    director = "暂无"
                    actor = "暂无"

                year_area_type = info[1].strip().split('/')
                year = year_area_type[0].strip()
                # Some entries omit country and/or genre; default to "暂无".
                area = year_area_type[1].strip() if len(year_area_type) > 1 else "暂无"
                genre = year_area_type[2].strip() if len(year_area_type) > 2 else "暂无"

                movie_dict = {
                    "排名": rank,
                    "电影名称": title,
                    "评分": rating,
                    "评价人数": people,
                    "经典台词": quote,
                    "导演": director,
                    "主演": actor,
                    "上映年份": year,
                    "国家/地区": area,
                    "类型": genre
                }
                movies_list.append(movie_dict)

            print(f"第 {start//25 + 1} 页爬取完成,当前URL: {url}")
            time.sleep(1)  # be polite: 1s between pages to avoid an IP ban

        except requests.exceptions.RequestException as e:
            print(f"请求异常:{e}")
            continue

    with open('douban_movies.json', 'w', encoding='utf-8') as jsonfile:
        # ensure_ascii=False keeps the Chinese text readable in the file.
        json.dump(movies_list, jsonfile, ensure_ascii=False, indent=4)

    print("✅ 所有数据已爬取完成,保存为 douban_movies.json")


if __name__ == '__main__':
    scrape_douban_top250_to_json()
|
||||||
67
爬虫.txt.txt
Normal file
67
爬虫.txt.txt
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
import requests
from bs4 import BeautifulSoup
import time


def scrape_douban_top250_to_txt():
    """Scrape the Douban Top 250 movie list and save it to douban_movies.txt.

    Writes one labelled line per field (rank, title, rating, votes, quote,
    director, actors, year, country/region, genre) followed by a dashed
    separator per movie. Pages that fail to download are skipped.

    NOTE(review): the original file was GBK-mojibaked; all Chinese string
    literals below were reconstructed from the sibling CSV/JSON scripts —
    confirm against the intended output format. A missing ')' on the
    `find_all` call and an unterminated string literal were also fixed.
    """
    headers = {
        # Browser-like User-Agent — Douban rejects the default requests UA.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    with open('douban_movies.txt', 'w', encoding='utf-8') as txtfile:
        # The Top 250 list is paginated 25 items per page via `start`.
        for start in range(0, 250, 25):
            url = f'https://movie.douban.com/top250?start={start}'

            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()

                soup = BeautifulSoup(response.text, 'html.parser')
                # Fixed: original was missing the closing parenthesis here.
                movies = soup.find_all('div', class_='item')

                for movie in movies:
                    rank = movie.find('em').text
                    title = movie.find('span', class_='title').text
                    rating = movie.find('span', class_='rating_num').text
                    # The last <span> in the star block reads "N人评价".
                    people_span = movie.find('div', class_='star').find_all('span')[-1]
                    people = people_span.text.replace('人评价', '')
                    quote_tag = movie.find('span', class_='inq')
                    quote = quote_tag.text if quote_tag else "暂无"

                    # The unclassed <p> holds two lines:
                    #   "导演: X 主演: Y" / "year / country / genre"
                    info = movie.find('p', class_='').text.strip().split('\n')
                    director_actor = info[0].strip()

                    if '导演: ' in director_actor:
                        director = director_actor.split('导演: ')[1].split('主演: ')[0].strip()
                        actor = director_actor.split('主演: ')[1].strip() if '主演: ' in director_actor else "暂无"
                    else:
                        director = "暂无"
                        actor = "暂无"

                    year_area_type = info[1].strip().split('/')
                    year = year_area_type[0].strip()
                    area = year_area_type[1].strip() if len(year_area_type) > 1 else "暂无"
                    genre = year_area_type[2].strip() if len(year_area_type) > 2 else "暂无"

                    txtfile.write(f"排名:{rank}\n")
                    txtfile.write(f"电影名称:{title}\n")
                    txtfile.write(f"评分:{rating}\n")
                    txtfile.write(f"评价人数:{people}\n")
                    txtfile.write(f"经典台词:{quote}\n")
                    txtfile.write(f"导演:{director}\n")
                    txtfile.write(f"主演:{actor}\n")
                    txtfile.write(f"上映年份:{year}\n")
                    txtfile.write(f"国家/地区:{area}\n")
                    txtfile.write(f"类型:{genre}\n")
                    txtfile.write("-" * 50 + "\n")  # separator between movies

                print(f"第 {start//25 + 1} 页爬取完成,当前URL: {url}")
                time.sleep(1)  # be polite: 1s between pages to avoid an IP ban

            except requests.exceptions.RequestException as e:
                print(f"请求异常:{e}")
                continue

    print("✅ 所有数据已爬取完成,保存为 douban_movies.txt")


if __name__ == '__main__':
    scrape_douban_top250_to_txt()
|
||||||
Reference in New Issue
Block a user