爬虫 4 selenium

- cookie的处理
- 手动处理
- cookie从抓包工具中捕获封装到headers中
- 自动处理
- session对象。
- 代理
- 代理服务器
- 进行请求转发
- 代理ip：port作用到get、post方法的proxies = {'http':'ip:port'}中
- 代理池（列表）
- 验证码的识别
- 超级鹰
- 模拟登陆
- 验证码的识别
- 动态请求参数
- cookie
- 单线程+多任务异步协程
- 协程
- 如果一个函数的定义被async修饰后，则该函数调用后会返回一个协程对象。
- 任务对象：
- 就是对协程对象的进一步封装
- 绑定回调
- task.add_done_callback(func):func(task):task.result()
- 事件循环对象
- 事件循环对象是用来装载任务对象。该对象被启动后，则会异步的处理调用其内部装载的每一个任务对象。（将任务对象手动进行挂起操作）
- async，await
- 注意事项：在特殊函数内部不可以出现不支持异步模块的代码，否则会中断整个异步的效果！！！
- aiohttp支持异步请求的模块

- selenium模块在爬虫中的使用
- 概念：是一个基于浏览器自动化的模块。
- 爬虫之间的关联：
- 便捷的捕获到动态加载到的数据。（可见即可得）
- 实现模拟登陆
- 环境安装：pip install selenium
- 基本使用：
- 准备好某一款浏览器的驱动程序：http://chromedriver.storage.googleapis.com/index.html
- 版本的映射关系：https://blog.****.net/huilan_same/article/details/51896672

# Demo: open JD.com in Chrome, search for a product, scroll to the bottom
# of the result page and print the rendered page source.
from selenium import webdriver
from time import sleep

bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('https://www.jd.com/')
    sleep(1)

    # Locate the search box by its id and type the query.
    search_input = bro.find_element_by_id('key')
    search_input.send_keys('mac pro')

    # NOTE(review): the XPath literal was truncated in this copy of the
    # script (it read '//*[@>'), which made the file unparsable. The locator
    # below is a best-guess reconstruction of the search button's path --
    # confirm it against the live page before relying on it.
    btn = bro.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
    btn.click()
    sleep(2)

    # Run JavaScript in the page: scroll to the bottom of the document.
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(2)

    page_text = bro.page_source
    print(page_text)

    sleep(2)
finally:
    # Always close the browser, even if one of the steps above raised.
    bro.quit()

selenium 基础示例

# Demo: open Baidu, change the "results per page" search setting, accept the
# confirmation alert, then run a search and open one of the results.
from selenium import webdriver
from time import sleep

# Path of the browser driver. The r'' prefix keeps backslashes in a path
# from being interpreted as escape sequences.
driver = webdriver.Chrome(r'chromedriver.exe')
try:
    # Open the Baidu home page.
    driver.get("http://www.baidu.com")

    # Look up the "设置" (Settings) entry and open the settings menu.
    print(driver.find_elements_by_link_text('设置'))
    driver.find_element_by_id('s-usersetting-top').click()

    # Open the "搜索设置" (Search settings) panel.
    driver.find_elements_by_link_text('搜索设置')[0].click()
    sleep(2)

    # Select "50 results per page".
    # NOTE(review): both XPath literals in this section were truncated in
    # this copy of the script (they read '//*[@>'). The locators below are
    # guesses -- a commented-out line in the original referenced the id
    # 'nr_2', which suggests sibling ids for the radio buttons. Verify both
    # against the live page.
    driver.find_element_by_xpath('//*[@id="nr_3"]').click()
    sleep(2)

    # Click "save settings".
    driver.find_element_by_xpath('//*[@id="se-setting-7"]/a[2]').click()

    # Saving raises a browser alert: accept() confirms it, dismiss() would
    # cancel it. The switch_to.alert accessor is used here for consistency
    # with the switch_to.frame(...) style used elsewhere in this file.
    driver.switch_to.alert.accept()
    sleep(3)

    # Type the query into Baidu's search box.
    driver.find_element_by_id('kw').send_keys('美女')
    sleep(3)

    # Click the search button.
    driver.find_element_by_id('su').click()
    sleep(3)

    # Open the first result whose link text matches exactly.
    driver.find_elements_by_link_text('美女_海量精选高清图片_百度图片')[0].click()
    sleep(5)
finally:
    # Close the browser even if a step above failed.
    driver.quit()

# Demo: collect the page source of the first four result pages by clicking
# "next page" in the browser, then parse every page with lxml and print the
# title/number pair of each list entry.
from selenium import webdriver
from time import sleep
from lxml import etree

bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('http://125.35.6.84:81/xk/')
    sleep(1)

    # Source of the first page, followed by three more pages.
    page_text = bro.page_source
    page_text_list = [page_text]
    for i in range(3):
        bro.find_element_by_id('pageIto_next').click()  # go to the next page
        sleep(1)
        page_text_list.append(bro.page_source)

    for page_text in page_text_list:
        tree = etree.HTML(page_text)
        # NOTE(review): this XPath literal was truncated in this copy of the
        # script (it read '//ul[@>'). The expression below is a best-guess
        # reconstruction of the result list locator -- confirm against the
        # page markup.
        li_list = tree.xpath('//ul[@id="gzlist"]/li')
        for li in li_list:
            title = li.xpath('./dl/@title')[0]
            num = li.xpath('./ol/@title')[0]
            print(title+':'+num)

    sleep(2)
finally:
    # Close the browser even if navigation or parsing failed.
    bro.quit()

# Demo: drag an element that lives inside an iframe, using an action chain
# (drag = press and hold + a series of small moves + release).
from selenium import webdriver
from time import sleep
from selenium.webdriver import ActionChains

bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    # The draggable element is inside the 'iframeResult' frame, so the
    # driver has to switch into that frame before it can be located.
    bro.switch_to.frame('iframeResult')
    div_tag = bro.find_element_by_id('draggable')

    action = ActionChains(bro)
    action.click_and_hold(div_tag)
    for i in range(5):
        # perform() makes the chain execute immediately.
        action.move_by_offset(17,5).perform()
        sleep(0.5)

    # The original only queued release() and never performed it, so the
    # mouse button was never let go. A fresh chain is used so that only the
    # release is sent.
    # NOTE(review): this assumes `action` still holds its earlier steps
    # after perform() -- confirm against the ActionChains documentation.
    ActionChains(bro).release().perform()
    sleep(3)
finally:
    # Close the browser even if a step above failed.
    bro.quit()

# Demo: simulated login on 12306. The login page is screenshotted, the
# captcha image is cropped out and sent to the Chaojiying recognition
# client, and the returned coordinates are clicked inside the captcha
# before the login form is filled in and submitted.
from selenium import webdriver
from time import sleep
from PIL import Image
from selenium.webdriver import ActionChains
from Cjy import Chaojiying_Client


def get_text(imgPath, imgType):
    """Return the 'pic_str' field of Chaojiying's answer for an image.

    imgPath: path of the image file to upload.
    imgType: type code passed through to ``PostPic`` unchanged.
    """
    # NOTE(review): account credentials are hard-coded here; consider
    # loading them from configuration instead of keeping them in source.
    chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')
    # Read the image through a context manager so the handle is closed.
    with open(imgPath, 'rb') as img_file:
        im = img_file.read()
    return chaojiying.PostPic(im, imgType)['pic_str']


bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('https://kyfw.12306.cn/otn/login/init')
    sleep(5)

    # Screenshot the whole page; the captcha is cropped out of it below.
    bro.save_screenshot('main.png')

    # NOTE(review): this XPath literal was truncated in this copy of the
    # script (it read '//*[@>'). The locator below is a best-guess
    # reconstruction of the captcha image's path -- confirm it against the
    # live page.
    code_img_tag = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
    location = code_img_tag.location
    size = code_img_tag.size

    # Crop box (left, upper, right, lower) of the captcha in the screenshot.
    rangle = (int(location['x']),int(location['y']),int(location['x']+size['width']),int(location['y']+size['height']))

    with Image.open('./main.png') as screenshot:
        frame = screenshot.crop(rangle)
        frame.save('code.png')

    # The answer looks like '55,70|267,133' (one 'x,y' pair per target) or
    # a single 'x,y'; it is turned into [[55, 70], [267, 133]].
    # split('|') also yields the single-pair case as a one-element list, so
    # one loop covers both forms.
    result = get_text('./code.png',9004)
    all_list = []
    for pair in result.split('|'):
        coords = pair.split(',')
        all_list.append([int(coords[0]), int(coords[1])])
    print(all_list)

    # Click every returned point, offset relative to the captcha element.
    for a in all_list:
        x = a[0]
        y = a[1]
        ActionChains(bro).move_to_element_with_offset(code_img_tag,x,y).click().perform()
        sleep(1)

    bro.find_element_by_id('username').send_keys('123456')
    sleep(1)
    bro.find_element_by_id('password').send_keys('67890000000')
    sleep(1)
    bro.find_element_by_id('loginSub').click()
    sleep(5)
finally:
    # Close the browser even if recognition or a page interaction failed.
    bro.quit()

# Demo: run Chrome without a visible window (headless) and print the
# rendered source of a page.
from selenium import webdriver
from time import sleep
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# Passed as `options=` to match the other Chrome(...) call in this file
# that supplies an options object.
driver = webdriver.Chrome(r'chromedriver.exe',options=chrome_options)
try:
    driver.get('https://www.cnblogs.com/')
    print(driver.page_source)
finally:
    # The original never quit the driver; with no visible window the
    # browser process would otherwise be left running unnoticed.
    driver.quit()

# Demo: start Chrome with the 'enable-automation' switch excluded and open
# Taobao. Presumably this is meant to make the automated browser harder for
# the site to detect -- verify against the intended use.
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from time import sleep

# Switches that Chrome should NOT be started with.
excluded_switches = ['enable-automation']

option = ChromeOptions()
option.add_experimental_option('excludeSwitches', excluded_switches)

driver = webdriver.Chrome(r'chromedriver.exe', options=option)
driver.get('https://www.taobao.com/')

相关推荐