"""Douban Top 250 scrapers (reconstructed from a corrupted patch).

Three entry points share one fetch/parse pipeline and differ only in the
output format they write:

- ``scrape_douban_top250``          -> douban_movies.csv
- ``scrape_douban_top250_to_json``  -> douban_movies.json
- ``scrape_douban_top250_to_txt``   -> douban_movies.txt

All network access goes through ``requests`` with a browser User-Agent
(Douban rejects the default one) and a polite 1-second delay per page.
"""

import csv
import json
import time

import requests
from bs4 import BeautifulSoup

# Douban blocks the default requests User-Agent; impersonate a browser.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

# Placeholder used whenever a field is absent on the page.
_MISSING = "暂无"


def _extract_movie(movie):
    """Parse one ``<div class="item">`` node into a flat dict.

    Returns keys: 排名, 电影名称, 评分, 评价人数, 经典台词, 导演, 主演,
    上映年份, 国家/地区, 类型.  Any field that cannot be found falls back
    to ``_MISSING`` instead of raising.
    """
    rank = movie.find('em').text
    title = movie.find('span', class_='title').text
    rating = movie.find('span', class_='rating_num').text

    # The last <span> under the star block holds e.g. "123456人评价";
    # strip the suffix so only the count remains.
    people_span = movie.find('div', class_='star').find_all('span')[-1]
    people = people_span.text.replace('人评价', '')

    # Not every movie has a one-line quote (<span class="inq">).
    quote_tag = movie.find('span', class_='inq')
    quote = quote_tag.text if quote_tag else _MISSING

    # The unclassed <p> holds two lines:
    #   line 0: "导演: X ... 主演: Y ..."
    #   line 1: "1994 / 美国 / 犯罪 剧情"
    info = movie.find('p', class_='').text.strip().split('\n')
    director_actor = info[0].strip()
    if '导演: ' in director_actor:
        director = director_actor.split('导演: ')[1].split('主演: ')[0].strip()
        actor = (director_actor.split('主演: ')[1].strip()
                 if '主演: ' in director_actor else _MISSING)
    else:
        director = _MISSING
        actor = _MISSING

    # Guard info[1]: a few entries collapse the info block to one line.
    parts = info[1].strip().split('/') if len(info) > 1 else []
    year = parts[0].strip() if parts else _MISSING
    area = parts[1].strip() if len(parts) > 1 else _MISSING
    genre = parts[2].strip() if len(parts) > 2 else _MISSING

    return {
        '排名': rank,
        '电影名称': title,
        '评分': rating,
        '评价人数': people,
        '经典台词': quote,
        '导演': director,
        '主演': actor,
        '上映年份': year,
        '国家/地区': area,
        '类型': genre,
    }


def _iter_pages():
    """Yield ``(page_number, url, item_nodes)`` for the 10 Top-250 pages.

    Failed requests are reported and skipped (best-effort, matching the
    original scripts); a 1-second sleep after each page avoids an IP ban.
    """
    for start in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={start}'
        try:
            response = requests.get(url, headers=_HEADERS, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"请求异常:{e}")
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        yield start // 25 + 1, url, soup.find_all('div', class_='item')
        print(f"第 {start//25 + 1} 页爬取完成,当前 URL: {url}")
        time.sleep(1)  # rate-limit between pages


def scrape_douban_top250():
    """Scrape all 250 movies and save the five basic fields as CSV."""
    fieldnames = ['排名', '电影名称', '评分', '评价人数', '经典台词']
    with open('douban_movies.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for _page, _url, movies in _iter_pages():
            for movie in movies:
                record = _extract_movie(movie)
                # Only the five CSV columns; drop the extended fields.
                writer.writerow({k: record[k] for k in fieldnames})
    print("✅ 所有数据已爬取完成,已保存为 douban_movies.csv")


def scrape_douban_top250_to_json():
    """Scrape all 250 movies (all ten fields) and save as a JSON array."""
    movies_list = []
    for _page, _url, movies in _iter_pages():
        movies_list.extend(_extract_movie(movie) for movie in movies)
    with open('douban_movies.json', 'w', encoding='utf-8') as jsonfile:
        # ensure_ascii=False keeps the Chinese text human-readable.
        json.dump(movies_list, jsonfile, ensure_ascii=False, indent=4)
    print("✅ 所有数据已爬取完成,保存为 douban_movies.json")


def scrape_douban_top250_to_txt():
    """Scrape all 250 movies and write a labelled plain-text report.

    NOTE(review): the original TXT labels were mojibake (GBK bytes decoded
    as another charset); they are reconstructed here from the field names
    used by the CSV/JSON variants — confirm against the intended output.
    """
    with open('douban_movies.txt', 'w', encoding='utf-8') as txtfile:
        for _page, _url, movies in _iter_pages():
            for movie in movies:
                r = _extract_movie(movie)
                txtfile.write(f"排名:{r['排名']}\n")
                txtfile.write(f"电影名称:{r['电影名称']}\n")
                txtfile.write(f"评分:{r['评分']}\n")
                txtfile.write(f"评价人数:{r['评价人数']}\n")
                txtfile.write(f"经典台词:{r['经典台词']}\n")
                txtfile.write(f"导演:{r['导演']}\n")
                txtfile.write(f"主演:{r['主演']}\n")
                txtfile.write(f"上映年份:{r['上映年份']}\n")
                txtfile.write(f"国家/地区:{r['国家/地区']}\n")
                txtfile.write(f"类型:{r['类型']}\n")
                txtfile.write("-" * 50 + "\n")  # record separator
    print("✅ 所有数据已爬取完成,保存为 douban_movies.txt")


if __name__ == '__main__':
    # The patch shipped these as three standalone scripts, each with its
    # own entry point; run all three output formats here.
    scrape_douban_top250()
    scrape_douban_top250_to_json()
    scrape_douban_top250_to_txt()