Personal Assignment: Top-Conference Hot Words, Progress 1.3
Learning progress:
Target task:
Crawl the papers with Python and store them in the database
Estimated time:
1 day
Completion:
Finished crawling the papers with Python and storing them in the database
Python file:
import requests
import pymysql
import time
from jieba.analyse import extract_tags
from lxml import etree

# Database connection (database "lunwen"; results go into table "result")
db = pymysql.connect(host="localhost", user="root", passwd="123asd..00",
                     database="lunwen", charset='utf8')
cursor = db.cursor()


class Spider():
    def __init__(self):
        # Crawl one conference day at a time; switch the active URL by hand
        self.url = 'https://openaccess.thecvf.com/CVPR2018?day=2018-06-21'
        # self.url = 'https://openaccess.thecvf.com/CVPR2018?day=2018-06-20'
        # self.url = 'https://openaccess.thecvf.com/CVPR2018?day=2018-06-19'
        # self.url = 'https://openaccess.thecvf.com/CVPR2019?day=2019-06-18'
        # self.url = 'https://openaccess.thecvf.com/CVPR2019?day=2019-06-19'
        # self.url = 'https://openaccess.thecvf.com/CVPR2019?day=2019-06-20'
        # self.url = 'https://openaccess.thecvf.com/ICCV2019?day=2019-10-29'
        # self.url = 'https://openaccess.thecvf.com/ICCV2019?day=2019-10-31'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        }
        r = requests.get(self.url, headers=self.headers)
        r.encoding = r.apparent_encoding
        self.html = r.text

    def lxml_find(self):
        '''Parse the day page with lxml and crawl every paper listed on it.'''
        number = 1
        start = time.time()
        selector = etree.HTML(self.html)
        # Relative links to all paper detail pages on this day page
        titles = selector.xpath('//dt[@class="ptitle"]/a/@href')
        # Resume from the 201st entry; the first 200 were stored in an earlier run
        for each in titles[200:]:
            title0 = each.strip()
            # Drop the leading "content_cvpr_2018" (17 chars) and rebuild the full URL
            chaolianjie = "https://openaccess.thecvf.com/content_cvpr_2018" + title0[17:]
            req = requests.get(chaolianjie, headers=self.headers)
            req.encoding = req.apparent_encoding
            selector1 = etree.HTML(req.text)
            # Element ids as used on openaccess.thecvf.com paper pages
            title = selector1.xpath('//div[@id="papertitle"]/text()')
            abst = selector1.xpath('//div[@id="abstract"]/text()')
            zuozhe = selector1.xpath('//div[@id="authors"]/b/i/text()')
            hre0 = selector1.xpath('//a/@href')
            # The 6th link on a paper page is the PDF; strip its "../.." prefix
            # (index-based, so fragile if the page layout changes)
            hre = "https://openaccess.thecvf.com" + hre0[5][5:]
            # Top TF-IDF keyword of the abstract (topK=1, so one iteration)
            for keyword, weight in extract_tags(abst[0].strip(), topK=1, withWeight=True):
                print('%s %s' % (keyword, weight))
            va = [title[0].strip(), hre, abst[0].strip(), zuozhe[0].strip(),
                  "2018-06-21", keyword]
            # Column "keywork" matches the existing table definition
            sql = ("insert into result (title,link,abstract,zuozhe,time,keywork) "
                   "values (%s,%s,%s,%s,%s,%s)")
            cursor.execute(sql, va)
            db.commit()
            print("Crawled " + str(number) + " papers")
            number = number + 1
        end = time.time()
        print('Total time:', end - start)


if __name__ == '__main__':
    spider = Spider()
    spider.lxml_find()
    cursor.close()
    db.close()
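The crawler assumes the result table already exists in the lunwen database. For reference, a minimal sketch of a matching table definition, run once before crawling; the column names (including the keywork spelling) follow the insert statement above, while the column types are my assumption, not taken from the actual schema:

import pymysql

db = pymysql.connect(host="localhost", user="root", passwd="123asd..00",
                     database="lunwen", charset='utf8')
cursor = db.cursor()
# Assumed types; adjust to the real schema if it differs
cursor.execute("""
    create table if not exists result (
        id       int auto_increment primary key,
        title    varchar(500),
        link     varchar(500),
        abstract text,
        zuozhe   varchar(500),
        time     varchar(20),
        keywork  varchar(100)
    )
""")
db.commit()
cursor.close()
db.close()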
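Since the day pages are currently switched by commenting and uncommenting self.url, one possible refinement is to drive the same crawl from a list of (url, date) pairs. A sketch, assuming Spider is refactored to take the URL and date as constructor parameters (it does not today); note that the hard-coded content_cvpr_2018 prefix and the stored date string would also have to change along with the URL:

# Hypothetical driver loop; Spider(url, day) is an assumed refactored signature
DAY_PAGES = [
    ('https://openaccess.thecvf.com/CVPR2018?day=2018-06-19', '2018-06-19'),
    ('https://openaccess.thecvf.com/CVPR2018?day=2018-06-20', '2018-06-20'),
    ('https://openaccess.thecvf.com/CVPR2018?day=2018-06-21', '2018-06-21'),
    ('https://openaccess.thecvf.com/CVPR2019?day=2019-06-18', '2019-06-18'),
]

for url, day in DAY_PAGES:
    spider = Spider(url, day)  # each run crawls one day page
    spider.lxml_find()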
Problems encountered:
None