Crawler Project: Scraping Amazon Product Information

# -*- coding: utf-8 -*-
# Scrapy settings for AMAZON project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
DB="amazon"
COLLECTION="goods"
HOST="localhost"
PORT=27017
USER="root"
PWD="123456"
FILE_PATH="goods.txt"
BOT_NAME = 'AMAZON'
SPIDER_MODULES = ['AMAZON.spiders']
NEWSPIDER_MODULE = 'AMAZON.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'AMAZON (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'AMAZON.middlewares.AmazonSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'AMAZON.middlewares.DownMiddleware1': 200,
# 'AMAZON.middlewares.DownMiddleware2': 300,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'AMAZON.pipelines.MongoPipeline': 200,
'AMAZON.pipelines.FilePipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
DUPEFILTER_CLASS = 'AMAZON.cumstomdupefilter.MyDupeFilter'
settings.py (configuration file)
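The last line of the settings points at a custom dupefilter, AMAZON.cumstomdupefilter.MyDupeFilter, but the post never shows that module. A minimal sketch of what it might look like, assuming it simply wraps Scrapy's built-in RFPDupeFilter and prints what gets filtered (this is an illustration, not the author's actual code):

# cumstomdupefilter.py -- illustrative sketch, not the original module
from scrapy.dupefilter import RFPDupeFilter  # newer Scrapy versions import from scrapy.dupefilters

class MyDupeFilter(RFPDupeFilter):
    def request_seen(self, request):
        # Delegate the fingerprint check to the stock RFPDupeFilter and
        # print whenever a duplicate request is dropped.
        seen = super(MyDupeFilter, self).request_seen(request)
        if seen:
            print('duplicate request filtered: %s' % request.url)
        return seen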
from scrapy.cmdline import execute
# execute(['scrapy', 'crawl', 'amazon','--nolog'])
# execute(['scrapy', 'crawl', 'amazon',])
#scrapy crawl amazon -a keyword=iphone8手机
execute(['scrapy', 'crawl', 'amazon','-a','keyword=iphone8手机','--nolog'])
# execute(['scrapy', 'crawl', 'baidu',])
# execute(['scrapy', 'crawl', 'baidu','--nolog'])
entrypoint.py
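Running python entrypoint.py from the project root is equivalent to typing scrapy crawl amazon -a keyword=iphone8手机 --nolog in a shell; the value passed with -a ends up as the keyword argument of the spider's __init__ below.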
# -*- coding: utf-8 -*-
import scrapy
from urllib.parse import urlencode
from AMAZON.items import AmazonItem
# from scrapy.http import Request
# from scrapy.spiders import Spider,CrawlSpider,XMLFeedSpider,CSVFeedSpider,SitemapSpider
# from scrapy.selector import HtmlXPathSelector #response.xpath
# print(Spider is scrapy.Spider)
# print(XMLFeedSpider is scrapy.XMLFeedSpider)
# print(Request is scrapy.Request)
# from scrapy.dupefilter import RFPDupeFilter
# from scrapy.core.scheduler import Scheduler
class AmazonSpider(scrapy.Spider):
name = 'amazon'
allowed_domains = ['www.amazon.cn']
start_urls = ['http://www.amazon.cn/',]
#self.settings.get()
custom_settings = {
"BOT_NAME" : 'EGON_AMAZON',
        'REQUEST_HEADERS':{
},
}
def __init__(self,keyword,*args,**kwargs):
super(AmazonSpider,self).__init__(*args,**kwargs)
self.keyword=keyword
def start_requests(self):
        '''
        The first request the spider sends when it starts.
        :return:
        '''
url='https://www.amazon.cn/s/ref=nb_sb_noss_1/461-4093573-7508641?' #https://www.amazon.cn/ref=nb_sb_noss_null
url+=urlencode({"field-keywords" : self.keyword})
print(url)
        yield scrapy.Request(url,
                             callback=self.parse_index,  # call back to self.parse_index once the page has been downloaded
                             dont_filter=False,
                             )
def parse_index(self, response):
# print('============>',self.settings['NEWSPIDER_MODULE'])
# print('============>',self.settings['BOT_NAME'])
        # print('============>',self.settings['REQUEST_HEADERS'])
        # self.logger.warn('============>%s' %self.settings['REQUEST_HEADERS'])
# print('======>',response.request.meta,response.meta)
# print('======>',response.request.url,response.url)
        # print('%s parse result: %s' % (response.url, len(response.body)))
        detail_urls = response.xpath('//*[contains(@id,"result_")]/div/div[3]/div[1]/a/@href').extract()  # //*[contains(@id,"result_")]/div/div[3]/div[1]/a/@href
        # print(detail_urls)  # all detail-page urls parsed from the listing
        for detail_url in detail_urls:  # the extracted urls come back as a list
            # for each product url, issue a new request and hand it to the detail callback
            yield scrapy.Request(url=detail_url,  # request detail_url to parse the product page
                                 callback=self.parse_detail
                                 )
        # The results span several pages, so request the next page and reuse this callback.
        # NOTE: the id selector below was garbled in the original post; "pagnNextLink" is the id
        # Amazon commonly uses for the next-page link and is filled in here as an assumption.
        next_url = response.urljoin(response.xpath('//*[@id="pagnNextLink"]/@href').extract_first())
        # print(next_url)
yield scrapy.Request(url=next_url,
callback=self.parse_index
)
    def parse_detail(self, response):
        # print('%s detail page parse result: %s' % (response.url, len(response.body)))
        # NOTE: the id selectors below were garbled in the original post; "productTitle",
        # "priceblock_ourprice" and "ddmMerchantMessage" are ids Amazon commonly uses for the
        # title, price and delivery blocks -- they are filled in here as assumptions.
        name = response.xpath('//*[@id="productTitle"]//text()').extract_first('').strip()  # product name: take the text under the title node and strip it
        price = response.xpath('//*[@id="priceblock_ourprice"]/text()').extract_first()  # price information
        delivery_method = ''.join(response.xpath('//*[@id="ddmMerchantMessage"]//text()').extract())  # walk all descendants and join their text together
        print(response.url)
        print(name)
        print(price)
        print(delivery_method)
        item = AmazonItem()  # build an item object
        item["name"] = name  # the keys here must match the fields defined in AmazonItem
        item["price"] = price  # add the price to the item
        item["delivery_method"] = delivery_method
        return item

    def close(spider, reason):
        print('spider finished')
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class AmazonItem(scrapy.Item):
# define the fields for your item here like:
name = scrapy.Field()
price = scrapy.Field()
delivery_method = scrapy.Field()
items.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem
from pymongo import MongoClient
class MongoPipeline(object):
def __init__(self,db,collection,host,port,user,pwd):
self.db=db
self.collection=collection
self.host=host
self.port=port
self.user=user
self.pwd=pwd
@classmethod
def from_crawler(cls, crawler):
"""
        Scrapy first checks via getattr whether we defined a custom from_crawler;
        if so, it calls it to build the pipeline instance.
"""
db = crawler.settings.get('DB')
collection = crawler.settings.get('COLLECTION')
host = crawler.settings.get('HOST')
port = crawler.settings.getint('PORT')
user = crawler.settings.get('USER')
pwd = crawler.settings.get('PWD')
return cls(db,collection,host,port,user,pwd)
def open_spider(self,spider):
"""
        Runs once, right after the spider starts.
        """
        print('==============>spider just started, connecting to the database')
self.client = MongoClient("mongodb://%s:%s@%s:%s" %(
self.user,
self.pwd,
self.host,
self.port
))
def process_item(self, item, spider):
        # Persist the item here.
        # Returning the item lets later pipelines keep processing it.
        d = dict(item)  # convert the item yielded by the spider into a plain dict
        if all(d.values()):  # only store records where every field has a value
            self.client[self.db][self.collection].insert_one(d)  # save the record; save() was removed in PyMongo 4 and insert_one is equivalent here since d has no _id
        # returning lets processing continue down the pipeline chain
        return item
        # To discard the item so no later pipeline sees it:
        # raise DropItem()
def close_spider(self,spider):
"""
        Runs once when the spider closes.
        """
        print('==============>spider finished, closing the database connection')
self.client.close()
class FilePipeline(object):
def __init__(self, file_path):
self.file_path=file_path
@classmethod
def from_crawler(cls, crawler):
"""
        Scrapy first checks via getattr whether we defined a custom from_crawler;
        if so, it calls it to build the pipeline instance.
        """
        file_path = crawler.settings.get('FILE_PATH')  # path of the output file
return cls(file_path)
def open_spider(self, spider):
"""
        Runs once, right after the spider starts.
        """
        print('==============>spider just started, opening a file to write results to')
        self.fileobj = open(self.file_path, 'w', encoding='utf-8')
def process_item(self, item, spider):
        # Persist the item here.
        # Returning the item lets later pipelines keep processing it.
        d = dict(item)
        if all(d.values()):
            self.fileobj.write("%s\n" % str(d))  # one record per line
        return item
        # To discard the item so no later pipeline sees it:
        # raise DropItem()
def close_spider(self, spider):
"""
        Runs once when the spider closes.
        """
        print('==============>spider finished')
self.fileobj.close()
pipelines.py
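Both pipelines import DropItem but only mention it in a comment. A minimal sketch of how a dedicated validation pipeline could use it, assuming you would rather reject incomplete records outright than silently skip them (the class name ValidateItemPipeline is made up for this illustration and is not part of the original project):

from scrapy.exceptions import DropItem

class ValidateItemPipeline(object):
    def process_item(self, item, spider):
        # Same completeness test as the pipelines above, but raising
        # DropItem stops any later pipeline from seeing the record.
        if not all(dict(item).values()):
            raise DropItem('missing field(s) in %r' % dict(item))
        return item

To take effect it would be registered in ITEM_PIPELINES with a priority lower than 200 so that it runs before MongoPipeline and FilePipeline.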
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
from scrapy.http import Response
from scrapy.exceptions import IgnoreRequest
from AMAZON.proxy_handle import get_proxy, delete_proxy  # helpers from the proxy-pool module
class AmazonSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class DownMiddleware1(object):
def process_request(self, request, spider):
"""
请求需要被下载时,经过所有下载器中间件的process_request调用
:param request:
:param spider:
:return:
None,继续后续中间件去下载;
Response对象,停止process_request的执行,开始执行process_response
Request对象,停止中间件的执行,将Request重新调度器
raise IgnoreRequest异常,停止process_request的执行,开始执行process_exception
"""
# spider.name
print('下载中间件1')
# request.meta['proxy']='http://user:pwd@ip:port'
request.meta['download_timeout']=10 #超时等待时间
request.meta['proxy']='http://'+get_proxy() #获取一个代理ip
print(request.meta)
# return Response('http://www.xxx.com')
# print(request.dont_filter)
# return request
# raise IgnoreRequest
# raise TimeoutError
def process_response(self, request, response, spider):
"""
spider处理完成,返回时调用
:param response:
:param result:
:param spider:
:return:
Response 对象:转交给其他中间件process_response
Request 对象:停止中间件,request会被重新调度下载
raise IgnoreRequest 异常:调用Request.errback
"""
print('response1')
return response
def process_exception(self, request, exception, spider):
"""
当下载处理器(download handler)或 process_request() (下载中间件)抛出异常
:param response:
:param exception:
:param spider:
:return:
None:继续交给后续中间件处理异常;
Response对象:停止后续process_exception方法
Request对象:停止中间件,request将会被重新调用下载
"""
print('异常1')
# return None
# 删旧代理 delelte request.meta['proxy']
old_proxy=request.meta['proxy'].split("//")[-1] #切出xx.xx.xx.x:port
delete_proxy(old_proxy) #删除没有用的代理
request.meta['proxy']='http://'+get_proxy() #重新给一个代理,继续发起request请求
return request
# class DownMiddleware2(object):
#     def process_request(self, request, spider):
#         """
#         Called for every request that needs downloading, passing through each downloader middleware's process_request.
#         :param request:
#         :param spider:
#         :return:
#             None: continue to the next middleware and then download
#             Response object: stop running process_request and start running process_response
#             Request object: stop the middleware chain and reschedule the Request
#             raise IgnoreRequest: stop running process_request and start running process_exception
#         """
#         print('download middleware 2')
#
#     def process_response(self, request, response, spider):
#         """
#         Called with the downloaded response on its way back to the spider.
#         :param response:
#         :param result:
#         :param spider:
#         :return:
#             Response object: handed on to the other middlewares' process_response
#             Request object: stop the middleware chain; the request is rescheduled for download
#             raise IgnoreRequest: Request.errback is called
#         """
#         print('response2')
#         # return response
#         # return request
#         # raise IgnoreRequest
#
#     def process_exception(self, request, exception, spider):
#         """
#         Called when the download handler or process_request() (a downloader middleware) raises an exception.
#         :param response:
#         :param exception:
#         :param spider:
#         :return:
#             None: hand the exception on to the remaining middlewares
#             Response object: stop running the remaining process_exception methods
#             Request object: stop the middleware chain; the request will be rescheduled for download
#         """
#         print('exception 2')
#         return None
middlewares.py
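DownMiddleware1 above relies on get_proxy and delete_proxy from AMAZON.proxy_handle, another module the post never shows. A minimal sketch, assuming a local proxy-pool HTTP service that hands out proxies as "ip:port" strings (the address and endpoint paths below are assumptions, not the author's actual code):

# proxy_handle.py -- illustrative sketch; the pool address and endpoints are assumed
import requests

POOL_URL = 'http://127.0.0.1:5010'  # assumed address of a local proxy-pool service

def get_proxy():
    # Ask the pool for one usable proxy, returned as an "ip:port" string.
    return requests.get(POOL_URL + '/get/').text.strip()

def delete_proxy(proxy):
    # Tell the pool to discard a proxy that failed, so it is not handed out again.
    requests.get(POOL_URL + '/delete/', params={'proxy': proxy})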