Python3网络爬虫
# 最近在实验楼学习了爬取妹子图,发现在运行的时候不是很流畅,有些图片下载会失败,下面是整理后的代码。
# coding: utf-8
"""Entry script: fetch the tag links from the meizitu front page and
crawl the front page plus every tag page, one thread per tag, via
meizi_series_nextpage.nextpage."""

import re
import threading
from urllib.request import urlopen
from urllib.error import HTTPError

from bs4 import BeautifulSoup

import meizi_series_nextpage


def loadurl(url):
    """Fetch *url* and return the raw HTML bytes, or '' on any error.

    The original returned the HTTPError object itself on failure, which
    defeated the caller's ``html == ''`` retry check; returning '' makes
    the retry loop actually work.  A timeout is added to match the
    companion module's loader.
    """
    try:
        conn = urlopen(url, timeout=5)
        return conn.read()
    except HTTPError as e:
        print('HTTP error loading %s: %s' % (url, e))
        return ''
    except Exception as e:
        print("unkown exception in conn.read() %s " % e)
        return ''


def meizi(url, path):
    """Scrape tag links from the front page at *url* and crawl each one.

    Downloads are saved under *path*; per-tag crawling is delegated to
    meizi_series_nextpage.nextpage, run in one thread per tag URL.
    """
    # 获取首页标签 — retry until the front page loads ('' signals failure)
    print('start open meiziwang')
    html = ''
    while True:
        html = loadurl(url)
        if html == '':
            print('load', url, 'error')
            continue
        else:
            break
    # Explicit parser avoids bs4's "no parser specified" warning and
    # keeps behavior identical across machines.
    mnvtp = BeautifulSoup(html, 'html.parser')
    taglists = mnvtp.findAll("div", {"class": "tags"})
    taglistss = re.findall('<a.*?href="(.*?)".*?>', '%s' % taglists)
    # Dedupe; sorted so the printed list and thread order are deterministic.
    tag_urls = sorted(set(taglistss))
    print(tag_urls)
    print(len(tag_urls))
    print('open meiziwang over')
    # Crawl the front page itself first, then each tag page concurrently.
    meizi_series_nextpage.nextpage(url, path)
    threads = []
    # NOTE: loop variable renamed — the original reused `url`, shadowing
    # the function parameter.
    for tag_url in tag_urls:
        t = threading.Thread(target=meizi_series_nextpage.nextpage,
                             args=(tag_url, path))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    # 'D:\MeiZi\' in the original was a syntax error: the trailing
    # backslash escapes the closing quote.  '\\' is the intended literal.
    meizi('http://www.meizitu.com', 'D:\\MeiZi\\')
    print('Spider Stop')
# coding: utf-8
"""Per-tag crawler: enumerate a tag page's pagination links and hand
every page URL to meizi_series_getpage.tag_series."""

import re
from urllib.request import urlopen
from urllib.error import HTTPError

from bs4 import BeautifulSoup

import meizi_series_getpage


# 同样的,这里是加载链接防超时
def loadurl(url):
    """Fetch *url* with a 5s timeout; return the HTML bytes, or '' on error.

    The original only printed the exception and fell off the end,
    returning None — which never matched the caller's ``html == ''``
    retry test and would crash BeautifulSoup.  Errors now return ''.
    """
    try:
        conn = urlopen(url, timeout=5)
        return conn.read()
    except HTTPError as e:
        print(e)
        return ''
    except Exception as e:
        print(e)
        return ''


def nextpage(url, path):
    """Collect all pagination URLs of the tag page *url* and crawl each.

    Images for this tag are stored in a subdirectory of *path* named
    after the page's filename stem.
    """
    # 获取首页尾部标签 — last URL component, e.g. 'xxx.html'
    nextweibu = re.split("/", url)
    # 获取头部文件 — everything up to and including '/a/'
    nexthead = re.split("/a/", url)
    nexthead = nexthead[0] + "/a/"
    # 创建首页路径 — the original `"\"` was a syntax error (the backslash
    # escaped the closing quote); '\\' is the intended path separator.
    path = path + "\\" + nextweibu[-1].split(".", 1)[0]
    # Retry until the page loads ('' signals a fetch failure).
    while True:
        html = loadurl(url)
        if html == '':
            print('load', url, 'error')
            continue
        else:
            break
    # 获取子标签 — pagination links live in div#wp_page_numbers.
    mnvtp = BeautifulSoup(html, 'html.parser')
    taglists = mnvtp.findAll("div", {"id": "wp_page_numbers"})
    taglists = re.findall('<a.*?href="(.*?)".*?>', '%s' % taglists)
    taglists = sorted(set(taglists))
    if taglists == []:
        # Single-page tag: no pagination block, crawl the page itself.
        taglists = [nextweibu[-1]]
    # 获取单个首页所有标签完整url路径
    print("正在获取首页所有子标签Url:%s" % url)
    completeurl = sorted(nexthead + tail for tail in taglists)
    for pageurl in completeurl:
        print("正在获取子标签下所有套图url路径")
        meizi_series_getpage.tag_series(pageurl, path)