想给家里老人爬取些广场舞的视频,遇到问题,求大神指导

问题描述:

打算先从视频合集页爬取大约100个单个视频页的地址,再进入各个视频页爬取视频;但是返回的结果是空值,试过其他几种定位方式,也都不行。

import requests

from bs4 import BeautifulSoup

# Browser-like request headers sent with the page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip',
    'DNT': '1',
    'Connection': 'close',
    'Accept-Language': 'zh-CN',
}

# Author page that lists the individual video pages.
url_address = 'https://haokan.baidu.com/author/1622522456467855'
item_collection_responde = requests.get(url=url_address, headers=headers)
# Parse the raw response bytes with the lxml parser.
item_collection_soup = BeautifulSoup(item_collection_responde.content, 'lxml')
# Look up <a> tags by the class string "card-item-link skeleton".
# NOTE(review): the surrounding write-up reports that this lookup comes back
# empty for this page.
item_collection = item_collection_soup.find_all('a', class_='card-item-link skeleton')

print(item_collection)

也试过第二步,直接在单个视频页抓取 <video>标签里的下载地址,也是返回空值

我又研究了一下,发现可以直接爬取好看视频网站的json数据

 

import requests

# Browser-like request headers sent with every listing request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip',
    'DNT': '1',
    'Connection': 'close',
    'Accept-Language': 'zh-CN',
}

url_address = 'https://haokan.baidu.com/author/1622522456467855'

# (printed label, key in the item's "video_list" mapping), in print order.
QUALITY_LABELS = (
    ("标清视频地址:", 'sd'),
    ("高清视频地址:", 'hd'),
    ("超清视频地址:", 'sc'),
)

# Paging cursor: empty for the first request, afterwards the "ctime" value
# taken from the previous response.
ctime = ""
while True:
    page_url = f'{url_address}?_format=json&rn=16&ctime={ctime}&_api=1'
    resjson = requests.get(url=page_url, headers=headers).json()
    response = resjson['data']['response']

    for it in response['results']:
        content = it['content']
        print("视频名称:", content['title'])
        video_list = content['video_list']
        for label, key in QUALITY_LABELS:
            # Fall back to '无' when this quality is not offered.
            print(label, video_list.get(key, '无'))
        print()

    # The response signals the last page with has_more == 0.
    if response['has_more'] == 0:
        print("爬取结束")
        break
    ctime = response['ctime']

import requests

from bs4 import BeautifulSoup

# Browser-like request headers sent with the page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip',
    'DNT': '1',
    'Connection': 'close',
    'Accept-Language': 'zh-CN',
}

url_address = 'https://haokan.baidu.com/author/1622522456467855'

# Download the author page and parse the raw bytes with the lxml parser.
item_collection_response = requests.get(url_address, headers=headers)
item_collection_soup = BeautifulSoup(item_collection_response.content, 'lxml')

# Select anchors by the "card-item-link" class.
item_collection = item_collection_soup.find_all('a', class_='card-item-link')
print(item_collection)

# Print the link target of every matched anchor, one per line.
for link in item_collection:
    print(link.get("href"))

把 class_ 参数里的 skeleton 去掉即可:

item_collection_soup.find_all('a', class_= 'card-item-link')

第二步,单个视频页中<video>标签里的地址是用js动态生成的,需要用selenium模块抓取 

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time

# Browser-like request headers sent with the listing-page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip',
    'DNT': '1',
    'Connection': 'close',
    'Accept-Language': 'zh-CN',
}

# Step 1: collect the links to the individual video pages from the author page.
url_address = 'https://haokan.baidu.com/author/1622522456467855'
item_collection_responde = requests.get(url=url_address, headers=headers)
item_collection_soup = BeautifulSoup(item_collection_responde.content, 'lxml')
item_collection = item_collection_soup.find_all('a', class_='card-item-link')

# Step 2: open every video page in a real browser and read the <video> src.
driver = webdriver.Chrome()
try:
    for item in item_collection:
        href = item.get("href")
        if not href:
            # Anchor without a target: nothing to open.
            continue
        driver.get(href)
        # Fixed pause so the page's scripts can fill in the <video> element
        # before it is read.
        time.sleep(3)
        # find_element("xpath", ...) replaces find_element_by_xpath, which
        # newer Selenium releases no longer provide.
        videourl = driver.find_element('xpath', '//*[@id="mse"]/video').get_attribute("src")
        print(videourl)
finally:
    # Always shut the browser down, even when a page fails to load or the
    # element is missing.
    driver.quit()


 

楼下正解。

高清视频地址: https://vd4.bdstatic.com/mda-mbju87g1y0g5bjuq/hd/cae_h264_nowatermark/1613860273/mda-mbju87g1y0g5bjuq.mp4?auth_key=1614085642-0-0-2ca457b29bde0146d07c317164733b86&bcevod_channel=searchbox_feed&pd=1&pt=3

高清视频地址: https://vd3.bdstatic.com/mda-mbjm3dca9spqewen/hd/cae_h264_nowatermark/1613805083/mda-mbjm3dca9spqewen.mp4?auth_key=1614085642-0-0-a0c953cf7f4f48deb7183ea13ecaa1e0&bcevod_channel=searchbox_feed&pd=1&pt=3

高清视频地址: https://vd2.bdstatic.com/mda-mbipdzajhx3g5qbi/hd/cae_h264_nowatermark/1613775626/mda-mbipdzajhx3g5qbi.mp4?auth_key=1614085642-0-0-90d9230aa1606d5aa3961a7ada5190f8&bcevod_channel=searchbox_feed&pd=1&pt=3

Traceback (most recent call last):
  File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 699, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 382, in _make_request
    self._validate_conn(conn)
  File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 1010, in _validate_conn
    conn.connect()
  File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connection.py", line 411, in connect
    self.sock = ssl_wrap_socket(
  File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\util\ssl_.py", line 428, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(
  File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\util\ssl_.py", line 472, in _ssl_wrap_socket_impl
    return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
  File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\ssl.py", line 500, in wrap_socket
    return self.sslsocket_class._create(
  File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\ssl.py", line 1040, in _create
    self.do_handshake()
  File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\ssl.py", line 1309, in do_handshake
    self._sslobj.do_handshake()
ssl.SSLEOFError: EOF occurred in violation of protocol (_ssl.c:1125)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\requests\adapters.py", line 439, in send
    resp = conn.urlopen(
  File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 755, in urlopen
    retries = retries.increment(
  File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\util\retry.py", line 573, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='vd2.bdstatic.com', port=443): Max retries exceeded with url: /mda-mbipdzajhx3g5qbi/hd/cae_h264_nowatermark/1613775626/mda-mbipdzajhx3g5qbi.mp4?auth_key=1614085642-0-0-90d9230aa1606d5aa3961a7ada5190f8&bcevod_channel=searchbox_feed&pd=1&pt=3 (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1125)')))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "f:/CodeWar/spider/20210222-广场舞(升级版).py", line 35, in <module>
    video_dance = requests.get(high_pix_address, headers=headers)
  File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\requests\api.py", line 76, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\requests\api.py", line 61, in request
    return session.request(method=method, url=url, **kwargs)
  File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\requests\sessions.py", line 542, in request
    resp = self.send(prep, **send_kwargs)
  File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\requests\sessions.py", line 655, in send
    r = adapter.send(request, **kwargs)
  File "C:\Users\long\AppData\Local\Programs\Python\Python38\lib\site-packages\requests\adapters.py", line 514, in send
    raise SSLError(e, request=request)
requests.exceptions.SSLError: HTTPSConnectionPool(host='vd2.bdstatic.com', port=443): Max retries exceeded with url: /mda-mbipdzajhx3g5qbi/hd/cae_h264_nowatermark/1613775626/mda-mbipdzajhx3g5qbi.mp4?auth_key=1614085642-0-0-90d9230aa1606d5aa3961a7ada5190f8&bcevod_channel=searchbox_feed&pd=1&pt=3 (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1125)')))

 

刚开始能正常爬取完列表,但在测试下载文件的时候,总是会出现停顿,然后跳出上面这些报错。每次下载到的文件数量都不一样:有时候能爬40多个、50多个,有时候又只能爬取几个。试着把下载文件的代码段注释掉,也会中途停一下,然后报错结束。

 

import os
import threading
import time

import requests

# Browser-like request headers sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip',
    'DNT': '1',
    'Connection': 'close',
    'Accept-Language': 'zh-CN',
}

url_address = 'https://haokan.baidu.com/author/1622522456467855'

path = 'F:/CodeWar/spider/广场舞/'
end_name = '.mp4'

MAX_ATTEMPTS = 3             # tries per request before giving up
RETRY_DELAY = 2              # base pause in seconds, multiplied by the attempt number
DOWNLOAD_TIMEOUT = (10, 60)  # (connect, read) seconds for video downloads
CHUNK_SIZE = 1024 * 1024     # bytes written to disk per chunk

# Characters that Windows does not allow in file names, each replaced by "_".
INVALID_FILENAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})


def _with_retries(action, *args):
    """Call ``action(*args)``, retrying when requests reports a network error.

    Returns whatever ``action`` returns. After ``MAX_ATTEMPTS`` failed tries
    the last ``requests.exceptions.RequestException`` is re-raised.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            return action(*args)
        except requests.exceptions.RequestException as err:
            print(f"请求失败(第{attempt}/{MAX_ATTEMPTS}次):{err}")
            if attempt == MAX_ATTEMPTS:
                raise
            # Pause a little longer after each failure before trying again.
            time.sleep(RETRY_DELAY * attempt)


def _fetch_listing(ctime):
    """Return the decoded JSON of one listing page for the given cursor."""
    listing_url = f'{url_address}?_format=json&rn=16&ctime={ctime}&_api=1'
    return requests.get(url=listing_url, headers=headers, timeout=500).json()


def _download_once(video_url, target_path):
    """Stream one video to ``target_path``.

    The data is written to a ``.part`` file first and renamed only after the
    whole body has arrived, so an interrupted transfer never leaves a
    truncated file under the final name.
    """
    part_path = target_path + '.part'
    try:
        with requests.get(video_url, headers=headers, stream=True,
                          timeout=DOWNLOAD_TIMEOUT) as video_dance:
            video_dance.raise_for_status()
            with open(part_path, 'wb') as f:
                for chunk in video_dance.iter_content(chunk_size=CHUNK_SIZE):
                    f.write(chunk)
        os.replace(part_path, target_path)
    finally:
        # Only present when the transfer did not complete.
        if os.path.exists(part_path):
            os.remove(part_path)


# Make sure the output directory exists before the first file is written.
os.makedirs(path, exist_ok=True)

high_pix = []   # HD URLs found across all pages
ctime = ""      # paging cursor: empty first, then "ctime" from each response
while True:
    resjson = _with_retries(_fetch_listing, ctime)
    print(resjson)
    response = resjson['data']['response']
    for it in response['results']:
        content = it['content']
        high_pix_address = content['video_list'].get('hd')
        print("高清视频地址:", high_pix_address if high_pix_address is not None else '无')
        print()
        if not high_pix_address:
            # No HD variant for this item: there is nothing to download.
            continue
        high_pix.append(high_pix_address)

        # Titles may contain characters that are illegal in Windows file names.
        file_name = content['title'].translate(INVALID_FILENAME_CHARS)
        full_name = os.path.join(path, file_name) + end_name
        try:
            _with_retries(_download_once, high_pix_address, full_name)
        except requests.exceptions.RequestException:
            # One video that keeps failing must not abort the whole crawl.
            print("下载失败,已跳过:", file_name)

    # The response signals the last page with has_more == 0.
    if response['has_more'] == 0:
        print("爬取结束")
        break
    ctime = response['ctime']
print(len(high_pix))