Files
task-2-1-data-collection/爬虫/爬虫.py.txt
2026-03-13 18:10:21 +08:00

88 lines
2.9 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
from bs4 import BeautifulSoup
import time
import csv
# Browser-like User-Agent so the site serves the normal desktop page
# instead of blocking the default requests UA.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}
def fetch_news_list(url):
    """Fetch title, link, and publish time from a Sina news list page.

    Args:
        url: URL of the news list page to scrape.

    Returns:
        A list of dicts with keys 'title', 'link', 'publish_time'.
        Returns an empty list on any request or parse failure.
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        news_list = []
        for item in soup.select('ul.feed-card-list li.feed-card-item'):
            title_tag = item.select_one('a.feed-card-link')
            # Skip cards without a usable anchor (ads/placeholders); the
            # href check prevents a KeyError below from aborting the run.
            if not title_tag or not title_tag.get('href'):
                continue
            title = title_tag.get_text(strip=True)
            link = title_tag['href']
            # Relative links need the site prefix to be fetchable later.
            if not link.startswith('http'):
                link = 'https://news.sina.com.cn' + link
            time_tag = item.select_one('span.feed-card-time')
            publish_time = time_tag.get_text(strip=True) if time_tag else '未知时间'
            news_list.append({
                'title': title,
                'link': link,
                'publish_time': publish_time
            })
        return news_list
    except Exception as e:
        # Best-effort scraping: report the failure and return an empty
        # list so the caller decides whether to abort.
        print(f"获取新闻列表失败: {e}")
        return []
def fetch_news_content(news_url):
    """Fetch the article body text from a news detail page.

    Args:
        news_url: URL of the article detail page.

    Returns:
        All paragraph texts, each followed by a newline; the string
        '无正文内容' when no paragraphs are found; '' on request failure.
    """
    try:
        response = requests.get(news_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Build the text with a single join instead of repeated string
        # concatenation (which is quadratic in the number of paragraphs).
        content = ''.join(
            p.get_text(strip=True) + '\n'
            for p in soup.select('div.article p')
        )
        return content if content else '无正文内容'
    except Exception as e:
        # Best-effort: report and return '' so the caller can continue
        # with the remaining articles.
        print(f"获取新闻正文失败: {e}")
        return ''
def save_to_csv(news_data, filename='sina_news.csv'):
    """Write scraped news records to a CSV file.

    Args:
        news_data: iterable of dicts with keys 'title', 'link',
            'publish_time', 'content'.
        filename: output path; defaults to 'sina_news.csv'.
    """
    # utf-8-sig writes a BOM so Excel opens the Chinese text correctly.
    with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=['title', 'link', 'publish_time', 'content'])
        writer.writeheader()
        writer.writerows(news_data)
    # Bug fix: the message previously printed a literal placeholder
    # instead of the actual output path.
    print(f"✅ 新闻已保存到 {filename}")
if __name__ == "__main__":
target_url = "https://news.sina.com.cn/china/"
print("开始爬取新闻列表...")
news_list = fetch_news_list(target_url)
if not news_list:
print("未获取到新闻列表,结束爬取")
exit()
news_data = []
for i, news in enumerate(news_list[:10], 1): # 只爬前10条
print(f"正在爬取第 {i} 条: {news['title']}")
content = fetch_news_content(news['link'])
news['content'] = content
news_data.append(news)
time.sleep(1) # 延迟1秒避免请求过快被封
save_to_csv(news_data)
print("🎉 爬取完成!")