上传文件至 /

This commit is contained in:
2026-04-02 16:04:48 +08:00
parent 727fe4d282
commit 2bd96675c0

48
260402 2505155046.py Normal file
View File

@@ -0,0 +1,48 @@
import requests
import re
import time
# Marks the start of one ranked entry; a page is sliced at these markers so a
# match can never borrow fields from a neighbouring entry.
_RANK_RE = re.compile(r'<em class="">(\d+)</em>')

# Title, rating and (optionally) quote of one entry. The title class excludes
# "&", which skips title spans that contain HTML entities.
_DETAIL_RE = re.compile(
    r'<span class="title">([^&]+?)</span>.*?'
    r'<span class="rating_num" property="v:average">(\d\.\d)</span>'
    r'(?:.*?<p class="quote">(.+?)</p>)?',
    re.S,
)


def _parse_top250_page(html):
    """Return one movie dict per ranked entry found in a listing page's HTML."""
    rank_hits = list(_RANK_RE.finditer(html))
    movies = []
    for index, hit in enumerate(rank_hits):
        # An entry's markup runs from its rank marker up to the next marker
        # (or the end of the page for the last entry).
        if index + 1 < len(rank_hits):
            segment_end = rank_hits[index + 1].start()
        else:
            segment_end = len(html)
        detail = _DETAIL_RE.search(html, hit.end(), segment_end)
        if detail is None:
            # No title/rating inside this entry's markup; skip it.
            continue
        title, rating, quote = detail.groups()
        movies.append({
            "rank": hit.group(1),
            "title": title,
            "rating": rating,
            # Entries without a quote paragraph get an empty string.
            "quote": (quote or "").strip(),
        })
    return movies


def crawl_douban_top250_regex_with_quote(session=None, timeout=10):
    """Scrape rank, title, rating and quote from the Douban Top 250 listing.

    Requests the ten listing pages (``start=0, 25, ..., 225``) and extracts
    the entries with regular expressions. Each entry is matched only within
    its own slice of the page, so an entry that has no quote paragraph is
    still returned (with ``quote == ""``) instead of being merged with the
    following entry.

    Args:
        session: Optional object exposing a ``requests``-style
            ``get(url, headers=..., timeout=...)`` method, e.g. a
            ``requests.Session``. Defaults to the ``requests`` module.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dicts with the string-valued keys ``"rank"``, ``"title"``,
        ``"rating"`` and ``"quote"``, in page order.

    Raises:
        Whatever ``response.raise_for_status()`` raises for a 4xx/5xx reply
        (``requests.HTTPError`` with the default client), plus any
        connection or timeout error raised by ``get``.
    """
    base_url = "https://movie.douban.com/top250"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    http = requests if session is None else session
    movies = []
    for start in range(0, 250, 25):
        url = f"{base_url}?start={start}&filter="
        response = http.get(url, headers=headers, timeout=timeout)
        # Fail loudly on an error page rather than parsing it into nothing.
        response.raise_for_status()
        movies.extend(_parse_top250_page(response.text))
    return movies
if __name__ == "__main__":
    # Script entry point: run the full crawl and keep the resulting list of
    # movie dicts in `top250`.
    top250 = crawl_douban_top250_regex_with_quote()