'''二级静态页面的爬取'''
from urllib import request
import re
import time
import random
import pymysql
class DianyingtiantangSpider:
    """Scrape movie titles and download links from dytt8.net.

    Two-level crawl: list pages (level 1) yield movie names and detail-page
    links; each detail page (level 2) yields the download link. Results are
    stored in the MySQL table ``film`` as (name, download_link) rows.
    """

    def __init__(self):
        # List-page URL template; {} is the 1-based page number.
        self.url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
        # Pick one desktop User-Agent per spider instance to vary the
        # request fingerprint.
        self.headers = {'User-Agent': random.choice([
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201'
        ])}
        # NOTE(review): credentials are hard-coded — move to config/env vars
        # for anything beyond a local experiment.
        self.db = pymysql.connect(host='localhost', port=3306, user='root',
                                  passwd='123456', db='dianyingdb',
                                  charset='utf8')
        self.cursor = self.db.cursor()

    def get_page(self, url):
        """Fetch *url* and return its HTML as text.

        Decodes as 'gbk' (a strict superset of the 'gb2312' originally used
        here) so fewer legitimate characters are silently dropped by the
        'ignore' error handler.
        """
        req = request.Request(url=url, headers=self.headers)
        # Context manager closes the HTTP response; the original leaked
        # the connection.
        with request.urlopen(req) as res:
            return res.read().decode('gbk', 'ignore')

    @staticmethod
    def _extract_films(html):
        """Return [(detail_href, movie_name), ...] parsed from a list page."""
        pattern = re.compile(
            '<table width="100%".*?<td width="5%".*?<a href="(.*?)".*?ulink">(.*?)</a>.*?</table>',
            re.S)
        return pattern.findall(html)

    def parse_page(self, html):
        """Parse one list page, follow each detail link, and save the rows."""
        result_list = []
        for detail_href, name in self._extract_films(html):
            film_name = name.strip()
            film_link = 'https://www.dytt8.net{}'.format(detail_href.strip())
            download_link = self.parse_two_page(film_link)
            result_list.append([film_name, download_link])
        self.save_page(result_list)

    @staticmethod
    def _extract_download_link(two_html):
        """Return the first download link on a detail page, or '' if none.

        The original code indexed ``[0]`` unconditionally, so a detail page
        without the expected table cell raised IndexError and aborted the
        whole crawl.
        """
        pattern = re.compile('<td style="WORD-WRAP.*?>.*?>(.*?)</a>', re.S)
        links = pattern.findall(two_html)
        return links[0].strip() if links else ''

    def parse_two_page(self, film_link):
        """Fetch a detail page and return its download link ('' if absent)."""
        return self._extract_download_link(self.get_page(film_link))

    def save_page(self, result_list):
        """Bulk-insert (name, download_link) rows into table ``film``."""
        ins = 'insert into film values(%s,%s)'
        self.cursor.executemany(ins, result_list)
        self.db.commit()

    def main(self):
        """Crawl list pages 1-4, replacing the previous contents of ``film``."""
        # Clear old rows before re-crawling.
        self.cursor.execute('delete from film')
        self.db.commit()
        try:
            # (Removed the original dead `i = 1` / `i += 1` around this
            # loop — `range` already drives the counter.)
            for page in range(1, 5):
                html = self.get_page(self.url.format(page))
                self.parse_page(html)
                print('第{}页爬取成功'.format(page))
                # Polite random delay between pages.
                time.sleep(random.randint(1, 3))
        finally:
            # Release DB resources even if a page fails mid-crawl; the
            # original skipped cleanup on any exception.
            self.cursor.close()
            self.db.close()
if __name__ == '__main__':
    # Time the full crawl run.
    start_time = time.time()
    DianyingtiantangSpider().main()
    elapsed = time.time() - start_time
    print('程序执行时间为:%.2f' % elapsed)