From 4c5f702d61c3377dd0ed81af5109773ca5b64b65 Mon Sep 17 00:00:00 2001
From: 2509165025 <2509165025@student.edu.cn>
Date: Thu, 26 Mar 2026 15:46:10 +0800
Subject: [PATCH] Complete assignment 1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 2509165025.py => 26 5025.py |  0
 爬虫/爬虫.py (2).txt         | 30 -------------
 爬虫/爬虫.py.txt             | 88 ------------------------------------
 3 files changed, 118 deletions(-)
 rename 2509165025.py => 26 5025.py (100%)
 delete mode 100644 爬虫/爬虫.py (2).txt
 delete mode 100644 爬虫/爬虫.py.txt

diff --git a/2509165025.py b/26 5025.py
similarity index 100%
rename from 2509165025.py
rename to 26 5025.py
diff --git a/爬虫/爬虫.py (2).txt b/爬虫/爬虫.py (2).txt
deleted file mode 100644
index 00dee05..0000000
--- a/爬虫/爬虫.py (2).txt
+++ /dev/null
@@ -1,30 +0,0 @@
-import requests
-from bs4 import BeautifulSoup
-import time
-
-headers = {
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
-}
-
-all_movies = []
-
-for page in range(0, 250, 25):
-    url = f"https://movie.douban.com/top250?start={page}&filter="
-    print(f"Scraping page {page//25 + 1}: {url}")
-
-    response = requests.get(url, headers=headers)
-    response.encoding = "utf-8"
-    soup = BeautifulSoup(response.text, "html.parser")
-
-
-    items = soup.find_all("div", class_="item")
-    for item in items:
-        title = item.find("span", class_="title").get_text(strip=True)
-        all_movies.append(title)
-        print(title)
-
-
-    time.sleep(1)
-
-
-print(f"\nScraped {len(all_movies)} movies in total")
\ No newline at end of file
diff --git a/爬虫/爬虫.py.txt b/爬虫/爬虫.py.txt
deleted file mode 100644
index 939569d..0000000
--- a/爬虫/爬虫.py.txt
+++ /dev/null
@@ -1,88 +0,0 @@
-import requests
-from bs4 import BeautifulSoup
-import time
-import csv
-
-
-headers = {
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
-}
-
-def fetch_news_list(url):
-    """Fetch the title, link, and publish time of each item on the news list page."""
-    try:
-        response = requests.get(url, headers=headers, timeout=10)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-        news_list = []
-
-        for item in soup.select('ul.feed-card-list li.feed-card-item'):
-            title_tag = item.select_one('a.feed-card-link')
-            if not title_tag:
-                continue
-            title = title_tag.get_text(strip=True)
-            link = title_tag['href']
-
-            if not link.startswith('http'):
-                link = 'https://news.sina.com.cn' + link
-
-            time_tag = item.select_one('span.feed-card-time')
-            publish_time = time_tag.get_text(strip=True) if time_tag else 'unknown time'
-
-            news_list.append({
-                'title': title,
-                'link': link,
-                'publish_time': publish_time
-            })
-        return news_list
-    except Exception as e:
-        print(f"Failed to fetch the news list: {e}")
-        return []
-
-def fetch_news_content(news_url):
-    """Open the news detail page and extract the article body."""
-    try:
-        response = requests.get(news_url, headers=headers, timeout=10)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-
-        content = ''
-        for p in soup.select('div.article p'):
-            content += p.get_text(strip=True) + '\n'
-        return content if content else 'no article body'
-    except Exception as e:
-        print(f"Failed to fetch the article body: {e}")
-        return ''
-
-def save_to_csv(news_data, filename='sina_news.csv'):
-    """Save the scraped news items to a CSV file."""
-    with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
-        writer = csv.DictWriter(f, fieldnames=['title', 'link', 'publish_time', 'content'])
-        writer.writeheader()
-        writer.writerows(news_data)
-    print(f"✅ News saved to {filename}")
-
-if __name__ == "__main__":
-
-    target_url = "https://news.sina.com.cn/china/"
-    print("Starting to scrape the news list...")
-    news_list = fetch_news_list(target_url)
-
-    if not news_list:
-        print("No news list retrieved; stopping")
-        exit()
-
-
-    news_data = []
-    for i, news in enumerate(news_list[:10], 1):  # only scrape the first 10 items
-        print(f"Scraping item {i}: {news['title']}")
-        content = fetch_news_content(news['link'])
-        news['content'] = content
-        news_data.append(news)
-        time.sleep(1)  # wait 1 second to avoid being blocked for requesting too fast
-
-
-    save_to_csv(news_data)
-    print("🎉 Scraping complete!")
\ No newline at end of file