import requests
from bs4 import BeautifulSoup
import time
import csv


# Shared request headers: present a desktop-browser User-Agent so the site
# serves the normal HTML page instead of rejecting the default requests UA.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

def fetch_news_list(url):
    """Fetch the news listing page and extract title, link and publish time.

    Args:
        url: URL of the Sina news listing page.

    Returns:
        A list of dicts with keys 'title', 'link', 'publish_time';
        an empty list on any request/parse failure.
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        news_list = []

        for item in soup.select('ul.feed-card-list li.feed-card-item'):
            title_tag = item.select_one('a.feed-card-link')
            if not title_tag:
                continue  # skip feed cards that have no link anchor
            title = title_tag.get_text(strip=True)
            link = title_tag['href']

            # Some hrefs are site-relative; normalize to absolute URLs.
            if not link.startswith('http'):
                link = 'https://news.sina.com.cn' + link

            time_tag = item.select_one('span.feed-card-time')
            publish_time = time_tag.get_text(strip=True) if time_tag else '未知时间'

            news_list.append({
                'title': title,
                'link': link,
                'publish_time': publish_time
            })
        return news_list
    except Exception as e:
        # Best-effort scraper: report the failure and return an empty list
        # so the caller can decide to stop instead of crashing.
        print(f"获取新闻列表失败: {e}")
        return []

def fetch_news_content(news_url):
    """Fetch a news detail page and return its article body text.

    Args:
        news_url: Absolute URL of the article page.

    Returns:
        Paragraph texts joined with newlines (trailing newline kept, matching
        the original accumulation); '无正文内容' when no paragraphs are found;
        '' on request/parse failure.
    """
    try:
        response = requests.get(news_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect paragraphs once and join them, instead of repeated
        # string concatenation in a loop.
        paragraphs = [p.get_text(strip=True) for p in soup.select('div.article p')]
        content = '\n'.join(paragraphs) + '\n' if paragraphs else ''
        return content if content else '无正文内容'
    except Exception as e:
        print(f"获取新闻正文失败: {e}")
        return ''

def save_to_csv(news_data, filename='sina_news.csv'):
    """Write the scraped news records to a CSV file.

    Args:
        news_data: List of dicts with keys title/link/publish_time/content.
        filename: Output CSV path. utf-8-sig is used so Excel detects the
            encoding correctly.
    """
    with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=['title', 'link', 'publish_time', 'content'])
        writer.writeheader()
        writer.writerows(news_data)
        # BUG FIX: the f-string had no placeholder and printed a literal
        # "(unknown)"; interpolate the actual output path.
        print(f"✅ 新闻已保存到 {filename}")

if __name__ == "__main__":

    target_url = "https://news.sina.com.cn/china/"
    print("开始爬取新闻列表...")
    news_list = fetch_news_list(target_url)

    if not news_list:
        print("未获取到新闻列表,结束爬取")
        # SystemExit works even when the site module's exit() helper is
        # unavailable (e.g. under python -S or frozen builds).
        raise SystemExit()

    news_data = []
    for i, news in enumerate(news_list[:10], 1):  # only scrape the first 10 items
        print(f"正在爬取第 {i} 条: {news['title']}")
        content = fetch_news_content(news['link'])
        news['content'] = content
        news_data.append(news)
        time.sleep(1)  # throttle: 1s delay to avoid being blocked for rapid requests

    save_to_csv(news_data)
    print("🎉 爬取完成!")