scrapy框架之中间件Middleware
scrapy框架的中间件主要有两个,一个是SpiderMiddleware(爬虫中间件),一个是DownloaderMiddleware(下载中间件)
由于请求对象和响应对象的数据通常在下载中间件中就能处理好,一般不需要使用爬虫中间件。
下载中间件主要用到的方法有三个:
process_request:用来处理正常的请求对象的数据
process_response:用来处理响应对象的数据
process_exception:用来处理抛异常的请求对象的数据
其他的初始化类的对象的方法及打开日志的方法可有可无
按照惯例:
1.通常将请求头User-Agent的设置代码放在process_request中:便于为每个正常的请求对象设置新的请求头
2.将代理ip信息放置在process_exception中:一旦请求对象出现请求异常,将其请求ip替换为代理ip去继续发送请求。
3.对于动态数据加载的页面数据,无法直接通过一次请求获取全部数据,可以使用selenium
scrapy中使用selenium编码:
a.spider构造方法中创建一个浏览器对象(作为当前spider的一个属性)
b.重写spider中的closed,在该方法中执行浏览器关闭操作
c.在下载中间件的process_response中,通过spider参数获取浏览器对象
d.在中间件的process_response中定制基于浏览器的自动化操作代码
e.实例化一个响应对象,将page_source中的页面源码封装到该对象中
f.返回该响应对象
以下是对中间件的简单配置:
from scrapy import signals
import random
from scrapy.http import HtmlResponse
from time import sleep
class VideoDownloaderMiddleware(object):
    """Downloader middleware: random User-Agent, Selenium rendering, proxy retry.

    * ``process_request`` assigns a random User-Agent to every outgoing request.
    * ``process_response`` reloads the requested URL in the spider's Selenium
      browser (``spider.bow``) and returns the rendered page source instead of
      the downloaded response.
    * ``process_exception`` assigns a random proxy to a failed request and
      hands the request back to Scrapy so that it is sent again.
    """

    # Pool of User-Agent strings; one is chosen at random for each request.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        "Opera/8.0 (Windows NT 5.1; U; en)",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    ]

    # Proxy pool used when the failed request's URL scheme is "http".
    # NOTE(review): Scrapy's documentation shows proxy values that include a
    # scheme (e.g. "http://host:port"); confirm that these bare "host:port"
    # entries are accepted by the Scrapy version in use.
    PROXY_HTTP = [
        '111.206.6.101:80',
        '39.137.107.98:80',
        '120.210.219.101:8080',
    ]

    # Proxy pool used for every other URL scheme (in practice "https").
    PROXY_HTTPS = [
        '218.60.8.83:3129',
        '218.28.238.165:3128',
        '120.234.63.196:3128',
    ]

    # Maximum number of times a single request is re-sent through a proxy.
    # Without a bound, a request that keeps failing would be retried forever.
    MAX_PROXY_RETRIES = 3

    def process_request(self, request, spider):
        """Attach a randomly chosen User-Agent header to ``request``.

        Returns ``None`` so Scrapy continues processing the request normally.
        """
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        print(request.headers['User-Agent'])
        return None

    def process_response(self, request, response, spider):
        """Replace ``response`` with the page as rendered by Selenium.

        The URL of ``request`` is loaded in the browser stored on the spider
        (``spider.bow``) and the resulting page source is wrapped in a new
        ``HtmlResponse``. The originally downloaded ``response`` is not used.
        """
        bow = spider.bow
        bow.get(url=request.url)  # load the URL this request points at
        page_text = bow.page_source
        return HtmlResponse(
            url=bow.current_url,
            body=page_text,
            encoding='utf-8',
            request=request,
        )

    def process_exception(self, request, exception, spider):
        """Retry a failed request through a randomly chosen proxy.

        Returns the same ``request`` (with ``meta['proxy']`` set) so Scrapy
        reschedules it, or ``None`` once ``MAX_PROXY_RETRIES`` attempts have
        been used, which lets Scrapy's normal exception handling take over.
        """
        print("代理池配置文件被读取")

        # The attempt counter lives in request.meta so it travels with the
        # request across reschedules.
        attempts = request.meta.get('proxy_retry_times', 0)
        if attempts >= self.MAX_PROXY_RETRIES:
            return None
        request.meta['proxy_retry_times'] = attempts + 1

        # Choose the proxy pool that matches the request's URL scheme.
        if request.url.split(':')[0] == "http":
            request.meta['proxy'] = random.choice(self.PROXY_HTTP)
        else:
            request.meta['proxy'] = random.choice(self.PROXY_HTTPS)

        # The request was already seen by the scheduler's duplicate filter;
        # mark it so the rescheduled copy is not silently dropped.
        request.dont_filter = True
        # Returning the request (instead of None) is what makes Scrapy send
        # it again with the proxy applied.
        return request
scrapy爬虫部分代码:
# -*- coding: utf-8 -*-
import scrapy
from video.items import VideoItem
from selenium import webdriver
class MvSpider(scrapy.Spider):
    """Spider that reads a movie list page and follows each movie's detail page.

    A Selenium Chrome browser is created in the constructor and stored as
    ``self.bow``, and is shut down again in ``closed``.
    """

    name = 'mv'
    # allowed_domains = ['www.piaohua.com/']
    start_urls = ['http://www.88ys.cc/dianying/1.html']

    def __init__(self, *args, **kwargs):
        """Initialise the spider and start the Selenium browser.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so that spider arguments supplied by Scrapy keep working.
        """
        super(MvSpider, self).__init__(*args, **kwargs)
        # Browser object stored on the spider as ``self.bow``.
        # NOTE(review): the path separators were missing in the original
        # literal (apparently lost in copy/paste); confirm this points at the
        # local chromedriver executable.
        self.bow = webdriver.Chrome(executable_path=r'C:\Users\asaxh\Desktop\chromedriver.exe')

    def detail_parse(self, response):
        """Fill in the detail fields of the item passed via ``response.meta``.

        Yields the completed item.
        """
        item = response.meta['item']
        info = '//div[@class="ct-c"]/dl'
        year = response.xpath(info + '/dd[3]/text()').extract_first()
        country = response.xpath(info + '/dd[2]/text()').extract_first()
        # A movie can carry several type tags; join them into one string.
        type_list = response.xpath(info + '/dt//a/text()').extract()
        movie_type = " ".join(type_list)
        actor = response.xpath(info + '/dt[3]/text()').extract_first()
        about = response.xpath('//div[@class="ee"]/text()').extract_first()
        item['year'] = year
        item['country'] = country
        item['type'] = movie_type
        item['actor'] = actor
        item['about'] = about
        print(item)
        yield item

    def parse(self, response):
        """Yield one detail-page request per movie link on the list page.

        Each request carries its own ``VideoItem`` (with ``name`` already
        set) in ``meta['item']``.
        """
        links = response.xpath('//div[@class="index-area clearfix"]/ul/li/a')
        for link in links:
            href = link.xpath('./@href').extract_first()
            if href is None:
                # Without an href there is no detail page to request.
                continue
            # A fresh item per movie: a single shared item would be
            # overwritten by later iterations before the callbacks run.
            item = VideoItem()
            item['name'] = link.xpath('./@title').extract_first()
            yield scrapy.Request(
                url='http://www.88ys.cc' + href,
                callback=self.detail_parse,
                meta={'item': item},
            )

    def closed(self, spider):
        """Quit the Selenium browser when the spider closes.

        Scrapy passes the close reason as the single argument; it is not
        used here. The parameter name is kept for compatibility.
        """
        self.bow.quit()