想给家里老人爬取些广场舞的视频,遇到问题,求大神指导
问题描述:
打算先爬取视频内容合集页大概100左右数量的单个视频页的地址,再转到单个视频页爬取视频,但是返回的结果是空值,试过其他几种定位方式,都不行
import requests
from bs4 import BeautifulSoup
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding' : 'gzip',
'DNT' : '1',
'Connection' : 'close',
'Accept-Language':'zh-CN'
}
f
url_address = 'https://haokan.baidu.com/author/1622522456467855'
item_collection_responde = requests.get(url = url_address, headers = headers)
item_collection_soup = BeautifulSoup(item_collection_responde.content, 'lxml')
item_collection = item_collection_soup.find_all('a', class_= 'card-item-link skeleton')
print(item_collection)
也试过第二步,直接在单个视频页抓取 <video>标签里的下载地址,也是返回空值
答
我又研究了一下,发现可以直接爬取好看视频网站的json数据
import requests
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding' : 'gzip',
'DNT' : '1',
'Connection' : 'close',
'Accept-Language':'zh-CN'
}
url_address = 'https://haokan.baidu.com/author/1622522456467855'
ctime = ""
while True:
resjson = requests.get(url = f'{url_address}?_format=json&rn=16&ctime={ctime}&_api=1', headers = headers).json()
response = resjson['data']['response']
for it in response['results']:
print("视频名称:",it['content']['title'])
print("标清视频地址:",it['content']['video_list'].get('sd','无'))
print("高清视频地址:",it['content']['video_list'].get('hd','无'))
print("超清视频地址:",it['content']['video_list'].get('sc','无'))
print()
if response['has_more'] == 0:
print("爬取结束")
break
ctime = response['ctime']
答
import requests
from bs4 import BeautifulSoup
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding' : 'gzip',
'DNT' : '1',
'Connection' : 'close',
'Accept-Language':'zh-CN'
}
url_address = 'https://haokan.baidu.com/author/1622522456467855'
item_collection_responde = requests.get(url = url_address, headers = headers)
item_collection_soup = BeautifulSoup(item_collection_responde.content, 'lxml')
item_collection = item_collection_soup.find_all('a', class_= 'card-item-link')
print(item_collection)
for item in item_collection:
print(item.get("href"))
答
去掉 skeleton
item_collection_soup.find_all('a', class_= 'card-item-link')
答
第二步,单个视频页中<video>标签里的地址是用js动态生成的,需要用selenium模块抓取
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding' : 'gzip',
'DNT' : '1',
'Connection' : 'close',
'Accept-Language':'zh-CN'
}
url_address = 'https://haokan.baidu.com/author/1622522456467855'
item_collection_responde = requests.get(url = url_address, headers = headers)
item_collection_soup = BeautifulSoup(item_collection_responde.content, 'lxml')
item_collection = item_collection_soup.find_all('a', class_= 'card-item-link')
driver = webdriver.Chrome()
for item in item_collection:
driver.get(item.get("href"))
time.sleep(3)
videourl = driver.find_element_by_xpath('//*[@id="mse"]/video').get_attribute("src")
print(videourl)
答
楼下正解。
答
高清视频地址: https://vd4.bdstatic.com/mda-mbju87g1y0g5bjuq/hd/cae_h264_nowatermark/1613860273/mda-mbju87g1y0g5bjuq.mp4?auth_key=1614085642-0-0-2ca457b29bde0146d07c317164733b86&bcevod_channel=searchbox_feed&pd=1&pt=3
高清视频地址: https://vd3.bdstatic.com/mda-mbjm3dca9spqewen/hd/cae_h264_nowatermark/1613805083/mda-mbjm3dca9spqewen.mp4?auth_key=1614085642-0-0-a0c953cf7f4f48deb7183ea13ecaa1e0&bcevod_channel=searchbox_feed&pd=1&pt=3
高清视频地址: https://vd2.bdstatic.com/mda-mbipdzajhx3g5qbi/hd/cae_h264_nowatermark/1613775626/mda-mbipdzajhx3g5qbi.mp4?auth_key=1614085642-0-0-90d9230aa1606d5aa3961a7ada5190f8&bcevod_channel=searchbox_feed&pd=1&pt=3
Traceback (most recent call last):
File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 699, in urlopen
httplib_response = self._make_request(
File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 382, in _make_request
self._validate_conn(conn)
File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 1010, in _validate_conn
conn.connect()
File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connection.py", line 411, in connect
self.sock = ssl_wrap_socket(
File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\util\ssl_.py", line 428, in ssl_wrap_socket
ssl_sock = _ssl_wrap_socket_impl(
File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\util\ssl_.py", line 472, in _ssl_wrap_socket_impl
return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\ssl.py", line 500, in wrap_socket
return self.sslsocket_class._create(
File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\ssl.py", line 1040, in _create
self.do_handshake()
File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\ssl.py", line 1309, in do_handshake
self._sslobj.do_handshake()
ssl.SSLEOFError: EOF occurred in violation of protocol (_ssl.c:1125)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\requests\adapters.py", line 439, in send
resp = conn.urlopen(
File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 755, in urlopen
retries = retries.increment(
File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\util\retry.py", line 573, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='vd2.bdstatic.com', port=443): Max retries exceeded with url: /mda-mbipdzajhx3g5qbi/hd/cae_h264_nowatermark/1613775626/mda-mbipdzajhx3g5qbi.mp4?auth_key=1614085642-0-0-90d9230aa1606d5aa3961a7ada5190f8&bcevod_channel=searchbox_feed&pd=1&pt=3 (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1125)')))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "f:/CodeWar/spider/20210222-广场舞(升级版).py", line 35, in <module>
video_dance = requests.get(high_pix_address, headers=headers)
File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\requests\api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\requests\api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\requests\sessions.py", line 542, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\requests\sessions.py", line 655, in send
r = adapter.send(request, **kwargs)
File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\requests\adapters.py", line 514, in send
raise SSLError(e, request=request)
requests.exceptions.SSLError: HTTPSConnectionPool(host='vd2.bdstatic.com', port=443): Max retries exceeded with url: /mda-mbipdzajhx3g5qbi/hd/cae_h264_nowatermark/1613775626/mda-mbipdzajhx3g5qbi.mp4?auth_key=1614085642-0-0-90d9230aa1606d5aa3961a7ada5190f8&bcevod_channel=searchbox_feed&pd=1&pt=3 (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1125)')))
刚开始能正常爬取完列表,在测试下载文件的时候,总是会出现停顿,然后跳出上面这些报错,下载的文件数量,每次都不一样,有时候能爬40多个,50多个,有时候又只能爬取几个,试着把下载文件的代码段注释掉,也会中途停一下,然后报错结束
import requests
import os
import threading
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding' : 'gzip',
'DNT' : '1',
'Connection' : 'close',
'Accept-Language':'zh-CN'
}
url_address = 'https://haokan.baidu.com/author/1622522456467855'
path = 'F:/CodeWar/spider/广场舞/'
end_name = '.mp4'
high_pix = []
ctime = ""
while True:
resjson = requests.get(url = f'{url_address}?_format=json&rn=16&ctime={ctime}&_api=1', headers = headers, timeout = 500).json()
print(resjson)
response = resjson['data']['response']
for it in response['results']:
#print("视频名称:",it['content']['title'])
#print("标清视频地址:",it['content']['video_list'].get('sd','无'))
print("高清视频地址:",it['content']['video_list'].get('hd','无'))
#print("超清视频地址:",it['content']['video_list'].get('sc','无'))
print()
file_name = it['content']['title']
high_pix_address = it['content']['video_list'].get('hd','无')
high_pix.append(high_pix_address)
video_dance = requests.get(high_pix_address, headers=headers)
if video_dance.status_code == 200:
save_filename = os.path.join(path, file_name)
full_name = save_filename + end_name
with open(full_name, 'wb') as f:
f.write(video_dance.content)
if response['has_more'] == 0:
print("爬取结束")
break
ctime = response['ctime']
print(len(high_pix))