Compare commits
6 Commits
82e389e988
...
4bd9d42ae3
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4bd9d42ae3 | ||
|
|
cc2b8096f4 | ||
|
|
82e2af26a5 | ||
|
|
dc174b313e | ||
|
|
3d7d3eed00 | ||
| 14787e562d |
27
2.py
Normal file
27
2.py
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
"""Demo script: fetch https://www.baidu.com and walk through basic
BeautifulSoup usage (status code, raw HTML, title, anchors, a CSS select).

Output is printed to stdout, separated by dashed/equals divider lines.
"""
import requests
from bs4 import BeautifulSoup as bs

print('------------')

url = 'https://www.baidu.com'
params = {'key': 'value'}

# timeout= keeps the script from hanging forever on a dead connection
response = requests.get(url, params=params, timeout=10)

print(response.status_code)
print('------------')

html_content = response.text
print(html_content)
print('------------')

soup = bs(html_content, 'lxml')
print(soup)
print('============')

# find() returns None when the tag is absent — guard before reading .string
# (the original `soup.find('title').string` raised AttributeError in that case)
title_tag = soup.find('title')
title = title_tag.string if title_tag else None
print(title)
print('============')

links = soup.find_all('a')
print(links)
print('============')

for link in links:
    # print("11111111")
    print("链接:", link.get('href'))

# NOTE(review): 'div.di' looks like a typo for a real class name — kept as-is
div_element = soup.select('div.di')
print(div_element)
for div in div_element:
    print('div:', div.text)
|
||||||
18
26.03.24_48.py
Normal file
18
26.03.24_48.py
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
"""Scrape a Douban doulist page and collect movie titles.

Every anchor on the page is printed; anchors whose href contains
'/subject' are treated as movie links and their text is collected.
"""
import requests
from bs4 import BeautifulSoup

REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}
LIST_URL = 'https://www.douban.com/doulist/3936288/'

resp = requests.get(LIST_URL, headers=REQUEST_HEADERS, timeout=10)
resp.encoding = 'utf-8'
print(resp.status_code)

page = BeautifulSoup(resp.text, 'html.parser')

movies = []
for anchor in page.find_all('a'):
    print(anchor)
    # guard clause: skip anchors that are not movie-detail links
    link = anchor.get('href', '')
    if '/subject' not in link:
        continue
    name = anchor.get_text(strip=True)
    print(name)
    movies.append(name)

print('----------')
print(movies)
|
||||||
58
douban_bs4.py.txt
Normal file
58
douban_bs4.py.txt
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
"""Scrape Douban Top 250 movies with requests + BeautifulSoup, save to CSV.

Fetches 10 pages (25 movies each), extracts rank/title/score/quote/info
per movie, and writes the result to douban_top250_bs4.csv.

NOTE(review): the original comments and runtime strings (CSV field names,
progress messages) were mojibake — GBK bytes decoded with the wrong
encoding — and unrecoverable as written. They are replaced here with
readable English equivalents; this changes the CSV header text.
"""
import requests
from bs4 import BeautifulSoup
import csv
import time
import random

# Browser-like User-Agent so the site does not reject the request outright.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

movie_list = []

# Fetch 10 pages, 25 movies per page -> 250 movies total.
for page in range(10):
    start = page * 25
    url = f'https://movie.douban.com/top250?start={start}&filter='
    print(f'Fetching page {page + 1}...')

    # Random delay between requests to avoid being rate-limited/blocked.
    time.sleep(random.uniform(0.5, 1.5))

    try:
        # 1. Request the page; raise on HTTP error statuses.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # 2. Parse the page.
        soup = BeautifulSoup(response.text, 'lxml')
        items = soup.select('.item')

        # 3. Extract each movie's fields.
        for item in items:
            rank = item.select_one('.pic em').text        # ranking number
            title = item.select_one('.hd .title').text    # movie title
            score = item.select_one('.rating_num').text   # rating
            # The one-line quote is missing for some movies — select once,
            # then fall back (original called select_one('.inq') twice).
            quote_tag = item.select_one('.inq')
            quote = quote_tag.text if quote_tag else ""
            info = item.select_one('.bd p').text.strip()  # director/year/genre blob

            movie = {
                "rank": rank,
                "title": title,
                "score": score,
                "quote": quote,
                "info": info,
            }
            movie_list.append(movie)

    except Exception as e:
        print(f"Failed to fetch page {page + 1}: {e}")

# 4. Save to CSV; utf-8-sig adds a BOM so Excel detects the encoding.
with open("douban_top250_bs4.csv", "w", encoding="utf-8-sig", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["rank", "title", "score", "quote", "info"])
    writer.writeheader()
    writer.writerows(movie_list)

print(f"Done: scraped {len(movie_list)} movies, saved to douban_top250_bs4.csv")
|
||||||
50
douban_xpath.py.txt
Normal file
50
douban_xpath.py.txt
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
"""Scrape Douban Top 250 movies with requests + lxml XPath, save to CSV.

XPath twin of the BeautifulSoup version: fetches 10 pages (25 movies
each), extracts rank/title/score/quote/info per movie, and writes the
result to douban_top250_xpath.csv.

NOTE(review): the original comments and runtime strings (CSV field names,
progress messages) were mojibake — GBK bytes decoded with the wrong
encoding — and unrecoverable as written. They are replaced here with
readable English equivalents; this changes the CSV header text.
"""
import requests
from lxml import etree
import csv
import time
import random

# Browser-like User-Agent so the site does not reject the request outright.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

movie_list = []

# Fetch 10 pages, 25 movies per page -> 250 movies total.
for page in range(10):
    start = page * 25
    url = f'https://movie.douban.com/top250?start={start}&filter='
    print(f'Fetching page {page + 1}...')

    # Random delay between requests to avoid being rate-limited/blocked.
    time.sleep(random.uniform(0.5, 1.5))

    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Raise on HTTP error statuses (consistent with the bs4 version).
        response.raise_for_status()
        tree = etree.HTML(response.text)
        items = tree.xpath('//div[@class="item"]')

        for item in items:
            rank = item.xpath('.//em/text()')[0]
            title = item.xpath('.//span[@class="title"][1]/text()')[0]
            score = item.xpath('.//span[@class="rating_num"]/text()')[0]
            # The one-line quote is missing for some movies.
            quote_hits = item.xpath('.//span[@class="inq"]/text()')
            quote = quote_hits[0] if quote_hits else ""
            info = item.xpath('.//div[@class="bd"]/p[1]/text()')[0].strip()

            movie = {
                "rank": rank,
                "title": title,
                "score": score,
                "quote": quote,
                "info": info,
            }
            movie_list.append(movie)

    except Exception as e:
        print(f"Failed to fetch page {page + 1}: {e}")

# Save to CSV; utf-8-sig adds a BOM so Excel detects the encoding.
with open("douban_top250_xpath.csv", "w", encoding="utf-8-sig", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["rank", "title", "score", "quote", "info"])
    writer.writeheader()
    writer.writerows(movie_list)

print(f"Done: scraped {len(movie_list)} movies, saved to douban_top250_xpath.csv")
|
||||||
9
import requests.py
Normal file
9
import requests.py
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
"""Minimal requests demo: GET https://douyin.com with a query parameter
and report success or failure (in Chinese) based on the status code."""
import requests

TARGET_URL = 'https://douyin.com'
QUERY = {'key': 'value'}

response = requests.get(TARGET_URL, params=QUERY)

# Only a 200 counts as success; anything else reports the status code.
if response.status_code != 200:
    print(f"请求失败,状态码:{response.status_code}")
else:
    html_content = response.text
    print("请求成功,获取到HTML内容")
|
||||||
Reference in New Issue
Block a user