# 44 lines · 1.5 KiB · Python
import json

import requests
from bs4 import BeautifulSoup
# Scrape the movie listing page, keep a copy of the raw HTML in movies.html,
# and export one record per movie to movies.json.

# Request headers: send a desktop-browser user agent with the request.
head = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/129.0.0.0 Safari/537.36"
}
web_url = "https://exam.detr.top/exam-b/movies"

# A timeout keeps the script from hanging forever on an unresponsive server;
# raise_for_status() stops an HTTP error page from being saved and parsed as
# if it were the real listing.
res = requests.get(web_url, headers=head, timeout=10)
res.raise_for_status()
res.encoding = "utf-8"  # force UTF-8 decoding of the response body
source_html = res.text

# Persist the raw page; the context manager closes the file even if the
# write fails.
with open("movies.html", "w", encoding="utf-8") as html_file:
    html_file.write(source_html)

html_parse = BeautifulSoup(source_html, "html.parser")
# Every <div class="item"> is treated as one movie entry.
all_movie = html_parse.find_all("div", class_="item")

movie_info = []
for movie in all_movie:
    movie_id = movie.get("data-id")
    m_title = movie.find("span", class_="title").get_text().strip()

    # Only the first line of the "bd" block is used. It is split on "/" and
    # the first five fields are read, in order, as director, year, duration,
    # genre and actor count.
    # NOTE(review): this field order is taken from how the values are used
    # here, not from a documented page schema; confirm against the page.
    info_block = movie.find("div", class_="bd")
    info_text = info_block.get_text().strip().split("\n")[0]
    split_data = [text.strip() for text in info_text.split("/")]
    # Raises ValueError if the line holds fewer than five fields.
    director, year, duration, genre, actor_text = split_data[:5]
    actor_num = int(actor_text)

    # The rating is read from the <p class="quote"> element as a float.
    score = float(movie.find("p", class_="quote").get_text().strip())

    single_movie = {
        "id": movie_id,
        "title": m_title,
        "director": director,
        "year": year,
        "rating": score,
        "duration": duration,
        "genre": genre,
        "actors_count": actor_num,
    }
    movie_info.append(single_movie)

# ensure_ascii=False keeps non-ASCII text readable in the output file.
with open("movies.json", "w", encoding="utf-8") as json_file:
    json.dump(movie_info, json_file, ensure_ascii=False, indent=2)

print("爬虫抓取完成,已生成movies.html、movies.json两个文件")