"""Scrape the Douban Movie Top 250 list (https://movie.douban.com/top250).

Reconstructed from a whitespace-mangled git diff that added two standalone
scripts (douban_bs4.py and douban_xpath.py).  Both implementations are kept:
one parses each page with BeautifulSoup/CSS selectors, the other with
lxml/XPath.  Each still writes its results to its own CSV file, exactly as
the original scripts did.
"""
import csv
import random
import time

import requests
from bs4 import BeautifulSoup
from lxml import etree

# Pretend to be a regular desktop browser so the request is not rejected.
HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/120.0.0.0 Safari/537.36')
}

# CSV column order shared by both scrapers: rank, title, score, quote, credits.
FIELDNAMES = ["排名", "电影名", "评分", "引言", "主创信息"]


def _fetch_page(page):
    """Download one 25-entry result page (0-based index) and return its HTML.

    Sleeps a random 0.5-1.5 s first to avoid being rate-limited, then raises
    requests.HTTPError on a non-2xx response.  (Bug fix: the XPath original
    never called raise_for_status, so error pages were parsed silently.)
    """
    time.sleep(random.uniform(0.5, 1.5))
    url = f'https://movie.douban.com/top250?start={page * 25}&filter='
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    return response.text


def _parse_bs4(html):
    """Extract the movie dicts from one page using BeautifulSoup."""
    soup = BeautifulSoup(html, 'lxml')
    movies = []
    for item in soup.select('.item'):
        # A few films have no tagline; fall back to "无" as the originals did.
        quote_tag = item.select_one('.inq')
        movies.append({
            "排名": item.select_one('.pic em').text,
            "电影名": item.select_one('.hd .title').text,
            "评分": item.select_one('.rating_num').text,
            "引言": quote_tag.text if quote_tag else "无",
            "主创信息": item.select_one('.bd p').text.strip(),
        })
    return movies


def _parse_xpath(html):
    """Extract the movie dicts from one page using lxml + XPath."""
    tree = etree.HTML(html)
    movies = []
    for item in tree.xpath('//div[@class="item"]'):
        quote = item.xpath('.//span[@class="inq"]/text()')
        movies.append({
            "排名": item.xpath('.//em/text()')[0],
            "电影名": item.xpath('.//span[@class="title"][1]/text()')[0],
            "评分": item.xpath('.//span[@class="rating_num"]/text()')[0],
            "引言": quote[0] if quote else "无",
            "主创信息": item.xpath('.//div[@class="bd"]/p[1]/text()')[0].strip(),
        })
    return movies


def scrape(parse, pages=10):
    """Scrape *pages* result pages with the given per-page parse function.

    A failed page is logged and skipped so the remaining pages still run,
    matching the per-page try/except of the original scripts.  Returns the
    combined list of movie dicts (up to pages * 25 entries).
    """
    movie_list = []
    for page in range(pages):
        print(f'正在爬取第 {page+1} 页...')
        try:
            movie_list.extend(parse(_fetch_page(page)))
        except Exception as e:
            print(f"第 {page+1} 页爬取失败:{e}")
    return movie_list


def save_csv(movies, path):
    """Write the movie dicts to *path*.

    utf-8-sig (BOM) is kept so Excel detects the encoding and the Chinese
    column headers display correctly.
    """
    with open(path, "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(movies)


def main():
    """Run both scrapers, writing the same two CSV files as the originals."""
    for parse, path in ((_parse_bs4, "douban_top250_bs4.csv"),
                        (_parse_xpath, "douban_top250_xpath.csv")):
        movies = scrape(parse)
        save_csv(movies, path)
        # The original message began with a mojibake "?" (a lost emoji in the
        # mangled diff); the placeholder character is dropped here.
        print(f"爬取完成!共 {len(movies)} 条数据,已保存到 {path}")


if __name__ == "__main__":
    main()