Python静态网页爬虫之XPath(简单的博客更新提醒功能)
python静态网页爬虫之xpath(简单的博客更新提醒功能)
直接上代码:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# author: Alan
"""Poll a blog front page and report when its newest entry changes.

The page is fetched with requests, three text fragments are pulled out with
XPath, and the combined string is compared against a local record file.
Anything not yet in the record file is printed and appended to it.
"""
import datetime
import os
import time

import requests
from lxml import etree


class xxoohelper(object):
    """Fetch a page, extract a summary of it, and remember what was seen.

    Args:
        url: Page to monitor.
        record_file: Text file in which already-reported entries are kept.
        timeout: Seconds to wait for the HTTP request before giving up.
    """

    def __init__(self, url='http://www.cnblogs.com/alan-babyblog/',
                 record_file='myblog.txt', timeout=10):
        self.url = url
        self.record_file = record_file
        self.timeout = timeout

    def getSource(self):
        """Download the monitored page and return its body as bytes.

        Raises:
            requests.RequestException: on connection problems, a timeout,
                or an HTTP error status.
        """
        # Without a timeout a stalled connection would block the monitor
        # indefinitely.
        response = requests.get(self.url, timeout=self.timeout)
        # Do not try to scrape an error page.
        response.raise_for_status()
        # ``.content`` is the raw bytes of the body (``.text`` would be a
        # decoded str); the bytes are handed to the parser unchanged.
        return response.content

    @staticmethod
    def _first_text(selector, path):
        """Return the stripped first result of *path*, or raise IndexError."""
        matches = selector.xpath(path)
        if not matches:
            raise IndexError(
                'no text node matched XPath {0!r}'.format(path))
        return matches[0].strip()

    def getContent(self, html):
        """Extract title, body text and post time from *html* as one str.

        The XPath expressions go from the outer containers inwards and are
        tied to the layout of the monitored page.

        Raises:
            IndexError: if any of the three expressions matches nothing.
        """
        selector = etree.HTML(html)
        title = self._first_text(selector, '//div[1]/div[2]/a/text()')
        content = self._first_text(
            selector, '//div[1]/div[2]/div[1]/div/div[1]/div[3]/div/text()')
        post_time = self._first_text(
            selector, '//div[1]/div[2]/div[1]/div/div[1]/div[5]/text()')
        return title + content + post_time

    def tosave(self, text):
        """Append *text* plus a trailing newline to the record file."""
        # Explicit UTF-8 so non-ASCII text does not depend on the locale.
        with open(self.record_file, 'a', encoding='utf-8') as f:
            f.write('{0}\n'.format(text))

    def tocheck(self, data):
        """Return True if *data* has not been recorded yet, else False.

        A missing record file means nothing has been recorded.
        """
        if not os.path.exists(self.record_file):
            return True
        with open(self.record_file, 'r', encoding='utf-8') as f:
            recorded = f.read()
        # Match the whole entry at a line start instead of comparing single
        # lines, so an entry that itself contains newlines is still found.
        needle = data + '\n'
        seen = recorded.startswith(needle) or ('\n' + needle) in recorded
        return not seen


def main(poll_seconds=30):
    """Check the page every *poll_seconds* seconds until interrupted."""
    helper = xxoohelper()
    while True:
        try:
            content = helper.getContent(helper.getSource())
        except (requests.RequestException, IndexError) as exc:
            # A failed fetch or a layout change must not end the monitor;
            # report it and try again on the next cycle.
            print(str(datetime.datetime.now()), '抓取失败:', exc)
        else:
            if helper.tocheck(content):
                post_time = str(datetime.datetime.now())
                print(post_time, '有新内容\n', content)
                helper.tosave(content)
            else:
                print('扫描中......')
        time.sleep(poll_seconds)


if __name__ == '__main__':
    main()