
# Program entry file: start.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os, sys
BASEPATH = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
print(BASEPATH)
sys.path.append(BASEPATH)
from core import SpiderMan
if __name__ == '__main__':
    s = SpiderMan.SpiderMan()
    s.crawl()  # "async" became a reserved word in Python 3.7, so the method is named crawl
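# The double dirname above climbs two levels up from start.py and puts the
# project root on sys.path so "from core import ..." resolves. A hypothetical
# layout consistent with that (inferred from the imports, not stated in the
# original):
#
#   project/
#   ├── bin/
#   │   └── start.py
#   └── core/
#       ├── SpiderMan.py
#       ├── UrlManager.py
#       ├── Htmldown.py
#       └── Htmlparser.py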
# Spider scheduler: core/SpiderMan.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
from gevent import monkey
monkey.patch_all()  # patch blocking I/O so the pool's threads cooperate as greenlets
from concurrent.futures import ThreadPoolExecutor
from core.UrlManager import UrlManager
from core.Htmldown import Htmldown
from core.Htmlparser import Htmlparser
# from core.DataOutput import DataOutput
class SpiderMan:
    def __init__(self):
        self.manager = UrlManager()     # URL manager
        self.downloader = Htmldown()    # HTML downloader
        self.parser = Htmlparser()      # HTML parser
        # self.output = DataOutput()

    def index_work(self):
        '''
        Crawl the Lingxiao index page
        '''
        url = 'http://www.lingxiaozhishang.com'
        self.manager.oldurls.add(url)  # mark the index page itself as visited
        html_dict = self.downloader.down_page(url)  # fetch with the downloader
        if html_dict is None:
            print("Failed to crawl the index page")
            return None
        new_urls = self.parser.parser_index(html_dict, url)  # parse the second-level links
        self.manager.add_urls(new_urls)  # queue every chapter <a> link found
        print("Finished crawling the index page + all article URLs")

    def crawl(self):
        '''
        Start the coroutines
        '''
        self.index_work()
        pool = ThreadPoolExecutor(10)  # a pool of ten workers (greenlets after monkey.patch_all)
        while True:
            url = self.manager.get_url()  # take a URL from the URL manager
            if url is None:
                break
            # submit the download; the parser runs as the completion callback
            pool.submit(self.downloader.down_page, url).add_done_callback(self.parser.parser_page)
        pool.shutdown(wait=True)  # the pool must be shut down at the end
        print("All done -----------------------")
# URL manager: core/UrlManager.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
class UrlManager:
    def __init__(self):
        self.newurls = set()  # URLs waiting to be crawled
        self.oldurls = set()  # URLs already crawled

    def add_url(self, newurl):
        '''
        Add one chapter URL
        :return:
        '''
        if newurl not in self.oldurls:
            self.newurls.add(newurl)

    def add_urls(self, newurls):
        '''
        Add several chapter URLs
        :param newurls:
        :return:
        '''
        if len(newurls) == 0:
            return
        for url in newurls:
            self.add_url(url)

    def get_url(self):
        '''
        Take one chapter URL out of the queue
        :return:
        '''
        try:
            url = self.newurls.pop()
            self.oldurls.add(url)
            return url
        except KeyError:  # the set is empty: nothing left to crawl
            return None

    def has_oldurls(self):
        '''
        Return the number of chapter URLs already crawled
        :return:
        '''
        return len(self.oldurls)
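# A quick demonstration of the dedup contract: once a URL has been handed out
# (and so sits in oldurls), re-adding it is silently skipped. The URLs below
# are made up for illustration.
if __name__ == '__main__':
    m = UrlManager()
    m.add_urls(['http://example.com/book/1.html', 'http://example.com/book/2.html'])
    first = m.get_url()   # moves the URL from newurls to oldurls
    m.add_url(first)      # ignored: already crawled
    print(m.has_oldurls())  # -> 1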
# HTML downloader: core/Htmldown.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
class Htmldown:
    def down_page(self, url):
        '''
        Download the page content
        '''
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0'}
        r = requests.get(url, headers=headers)
        r.encoding = 'utf8'
        if r.status_code == 200:
            return r.text
        return None  # make the implicit failure value explicit
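# down_page sets no timeout and lets connection errors propagate into the
# worker. A hedged variant with a timeout and basic retries, reusing the
# requests import above; the retry count and 10-second timeout are
# illustrative choices, not from the original:
def down_page_safely(url, retries=3):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0'}
    for _ in range(retries):
        try:
            r = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:  # DNS failure, refused connection, timeout...
            continue
        r.encoding = 'utf8'
        if r.status_code == 200:
            return r.text
    return None  # same contract as down_page: None signals failure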
# HTML parser: core/Htmlparser.py. Parsed chapters are written straight to files; they really should be persisted to MongoDB.
#!/usr/bin/python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
class Htmlparser:
    def parser_index(self, html_conf, url):
        '''
        Parse the index page into absolute chapter URLs
        '''
        soup = BeautifulSoup(html_conf, 'html.parser')
        list_a = soup.find(class_="chapterlist").find_all('a')
        new_urls = []
        for a in list_a:
            # url  = http://www.lingxiaozhishang.com
            # href = /book/439.html
            new_url = "%s%s" % (url, a.attrs["href"])
            new_urls.append(new_url)
        return new_urls

    def parser_page(self, future):
        '''
        Parse a chapter page (runs as a Future completion callback)
        :param future:
        :return:
        '''
        html_conf = future.result()
        if html_conf is None:  # the download failed, nothing to parse
            return
        soup = BeautifulSoup(html_conf, 'html.parser')
        title = soup.find('h1').get_text()
        text = soup.find(id="BookText").get_text()
        filepath = r"C:\Users\Administrator\Desktop\Articledb\%s.txt" % title
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(text)
        print("%s downloaded" % title)