Personal Assignment: Top-Conference Hot Words, Progress 1.3
Learning progress:
Target task:
Crawl the papers with Python and store them in the database
Estimated time:
1 day
Completion:
Finished crawling the papers with Python and storing them in the database
Python file:
import requests
import pymysql
import time
from jieba.analyse import extract_tags
from lxml import etree

# Database connection (database "lunwen"; results go into table "result")
db = pymysql.connect(host="localhost", user="root", passwd="123asd..00",
                     database="lunwen", charset='utf8')
cursor = db.cursor()


class Spider():
    def __init__(self):
        # Crawl one conference day at a time; switch the active URL by hand
        self.url = 'https://openaccess.thecvf.com/CVPR2018?day=2018-06-21'
        # self.url = 'https://openaccess.thecvf.com/CVPR2018?day=2018-06-20'
        # self.url = 'https://openaccess.thecvf.com/CVPR2018?day=2018-06-19'
        # self.url = 'https://openaccess.thecvf.com/CVPR2019?day=2019-06-18'
        # self.url = 'https://openaccess.thecvf.com/CVPR2019?day=2019-06-19'
        # self.url = 'https://openaccess.thecvf.com/CVPR2019?day=2019-06-20'
        # self.url = 'https://openaccess.thecvf.com/ICCV2019?day=2019-10-29'
        # self.url = 'https://openaccess.thecvf.com/ICCV2019?day=2019-10-31'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        }
        r = requests.get(self.url, headers=self.headers)
        r.encoding = r.apparent_encoding
        self.html = r.text

    def lxml_find(self):
        '''Parse the day page with lxml and crawl every paper listed on it.'''
        number = 1
        start = time.time()
        selector = etree.HTML(self.html)
        # Relative links to all paper detail pages on this day page
        titles = selector.xpath('//dt[@class="ptitle"]/a/@href')
        # Resume from the 201st entry; the first 200 were stored in an earlier run
        for each in titles[200:]:
            title0 = each.strip()
            # Drop the leading "content_cvpr_2018" (17 chars) and rebuild the full URL
            chaolianjie = "https://openaccess.thecvf.com/content_cvpr_2018" + title0[17:]
            req = requests.get(chaolianjie, headers=self.headers)
            req.encoding = req.apparent_encoding
            selector1 = etree.HTML(req.text)
            # Element ids as used on openaccess.thecvf.com paper pages
            title = selector1.xpath('//div[@id="papertitle"]/text()')
            abst = selector1.xpath('//div[@id="abstract"]/text()')
            zuozhe = selector1.xpath('//div[@id="authors"]/b/i/text()')
            hre0 = selector1.xpath('//a/@href')
            # The 6th link on a paper page is the PDF; strip its "../.." prefix
            # (index-based, so fragile if the page layout changes)
            hre = "https://openaccess.thecvf.com" + hre0[5][5:]
            # Top TF-IDF keyword of the abstract (topK=1, so one iteration)
            for keyword, weight in extract_tags(abst[0].strip(), topK=1, withWeight=True):
                print('%s %s' % (keyword, weight))
            va = [title[0].strip(), hre, abst[0].strip(), zuozhe[0].strip(),
                  "2018-06-21", keyword]
            # Column "keywork" matches the existing table definition
            sql = ("insert into result (title,link,abstract,zuozhe,time,keywork) "
                   "values (%s,%s,%s,%s,%s,%s)")
            cursor.execute(sql, va)
            db.commit()
            print("Crawled " + str(number) + " papers")
            number = number + 1
        end = time.time()
        print('Total time:', end - start)


if __name__ == '__main__':
    spider = Spider()
    spider.lxml_find()
    cursor.close()
    db.close()
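The crawler assumes the result table already exists in the lunwen database. For reference, a minimal sketch of a matching table definition, run once before crawling; the column names (including the keywork spelling) follow the insert statement above, while the column types are my assumption, not taken from the actual schema:

import pymysql

db = pymysql.connect(host="localhost", user="root", passwd="123asd..00",
                     database="lunwen", charset='utf8')
cursor = db.cursor()
# Assumed types; adjust to the real schema if it differs
cursor.execute("""
    create table if not exists result (
        id       int auto_increment primary key,
        title    varchar(500),
        link     varchar(500),
        abstract text,
        zuozhe   varchar(500),
        time     varchar(20),
        keywork  varchar(100)
    )
""")
db.commit()
cursor.close()
db.close()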
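Since the day pages are currently switched by commenting and uncommenting self.url, one possible refinement is to drive the same crawl from a list of (url, date) pairs. A sketch, assuming Spider is refactored to take the URL and date as constructor parameters (it does not today); note that the hard-coded content_cvpr_2018 prefix and the stored date string would also have to change along with the URL:

# Hypothetical driver loop; Spider(url, day) is an assumed refactored signature
DAY_PAGES = [
    ('https://openaccess.thecvf.com/CVPR2018?day=2018-06-19', '2018-06-19'),
    ('https://openaccess.thecvf.com/CVPR2018?day=2018-06-20', '2018-06-20'),
    ('https://openaccess.thecvf.com/CVPR2018?day=2018-06-21', '2018-06-21'),
    ('https://openaccess.thecvf.com/CVPR2019?day=2019-06-18', '2019-06-18'),
]

for url, day in DAY_PAGES:
    spider = Spider(url, day)  # each run crawls one day page
    spider.lxml_find()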
Problems encountered:
None