From 256d2cb21d0decfec52caa9fe629bdb4747ba16d Mon Sep 17 00:00:00 2001
From: 2509165016 <2509165016@student.edu.cn>
Date: Thu, 26 Mar 2026 15:35:50 +0800
Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E6=88=90=E4=BD=9C=E4=B8=9A2=EF=BC=9A?=
 =?UTF-8?q?=E7=88=AC=E8=99=AB?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 2509165016 task-2-1-data-collection.py | 31 +++++++++++++++++++
 爬虫2.txt                              | 42 --------------------------
 爬虫3.py.txt                           | 26 ----------------
 3 files changed, 31 insertions(+), 68 deletions(-)
 create mode 100644 2509165016 task-2-1-data-collection.py
 delete mode 100644 爬虫2.txt
 delete mode 100644 爬虫3.py.txt

diff --git a/2509165016 task-2-1-data-collection.py b/2509165016 task-2-1-data-collection.py
new file mode 100644
index 0000000..be93b47
--- /dev/null
+++ b/2509165016 task-2-1-data-collection.py
@@ -0,0 +1,31 @@
+import requests
+from bs4 import BeautifulSoup
+
+# Scrape one page of a Douban doulist and print every movie title link.
+a = 0
+url = f'https://www.douban.com/doulist/3936288/?start={a}&sort=time&playable=0&sub_type='
+
+headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
+
+# A browser-like User-Agent is required: Douban rejects the default
+# python-requests UA with an empty/blocked response.
+
+# print(soup)
+# print('---------------------------')
+# print('---------------------------')
+# print('---------------------------')
+
+response = requests.get(url, headers=headers, timeout=10)
+response.encoding = 'utf-8'
+soup = BeautifulSoup(response.text, 'html.parser')
+print(soup)
+for b in soup.find_all('a'):
+    # print(a)
+    href = b.get('href', '')
+    if '/subject/' in href:
+        title = b.get_text(strip=True)
+        print(title)
+# XPath of a sample title link: //*[@id="17923343"]/div/div[2]/div[4]/a
+
+c = soup.select('a')
+print(c)
+
diff --git a/爬虫2.txt b/爬虫2.txt
deleted file mode 100644
index b9ec78c..0000000
--- a/爬虫2.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-# 人工智能数据服务 - 作业二
-# 学号:你的学号
-# 姓名:你的姓名
-
-print("=" * 40)
-print("学号:2509165016")
-print("姓名:郭宇涵")
-print("Hello, Git!")
-print("我已经学会使用 Git 提交作业啦!")
-print("=" * 40)
-
-import requests
-from bs4 
import BeautifulSoup as bs -print('-------------') -url = 'https://www.baidu.com' -params = {'key':'value'} - -response = requests.get(url,params=params) - -print(response.status_code) -print('-------------') -html_content = response.text -print(html__content) -print('-------------') -soup = bs(html_content,'lxml') -print(soup) -print('-------------') -title = soup.find('title').string -print(title) -print('-------------') -links = soup.find_all('a') -print(links) -print('-------------') -for link in links: - # print("11111111") - print("链接:",linl.get('href')) -div_elements = soup.select('div.di') -print(div_elements) -for div in div_elements: - print('div:',div.text) - - diff --git a/爬虫3.py.txt b/爬虫3.py.txt deleted file mode 100644 index bb0b733..0000000 --- a/爬虫3.py.txt +++ /dev/null @@ -1,26 +0,0 @@ -import requests -from bs4 import BeautifulSoup - -headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', -} - -url = 'https://www.douban.com/doulist/3936288/?start=25&sort=time&playable=0&sub_type=' -response = requests.get(url, headers=headers, timeout=10) -response.encoding = 'utf-8' -for page in range(10): - url = f'https://movie.douban.com/top250?start={page*25}' - print(f'正在爬取第 {page+1} 页:{url}') - -# print(response.status_code) -soup = BeautifulSoup(response.text, 'html.parser') - -movies = [] - -for a in soup.find_all('a'): - href = a.get('href', '') - if '?' in href: - title = a.get_text(strip=True) - print(title) - movies.append(title) - \ No newline at end of file