1 import parsel
2 import pymysql
3 from lxml import etree
4 import re
5 import requests
def download_data(url, cookie):
    """Download the obfuscated page plus the CSS and SVG files needed to decode it.

    Files written to the current working directory:

    * ``01 原始网页_加密.html`` - the raw (still obfuscated) page at ``url``.
    * ``02 css样式.css`` - the first ``s3plus.meituan`` stylesheet linked
      from that page.
    * ``03 svg对照表<name>.svg`` - one file per ``[class^="<name>"]`` rule in
      the stylesheet that declares a ``background-image``.

    :param url: address of the page to fetch.
    :param cookie: value sent as the ``Cookie`` request header.
    :return: None
    :raises IndexError: if the page contains no matching stylesheet link.
        Any exception raised by ``requests.get`` (including timeouts)
        propagates unchanged.
    """
    headers = {
        "Cookie": cookie,
        "Referer": "http://www.dianping.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    timeout = 10

    # Fetch the raw page.
    page_html = requests.get(url=url, headers=headers, timeout=timeout).text
    with open('01 原始网页_加密.html', 'w', encoding='utf-8') as f:
        f.write(page_html)

    # Fetch the stylesheet that positions the SVG glyphs.
    css_links = re.findall('<link rel="stylesheet" type="text/css" href="(//s3plus.meituan.*?)">', page_html)
    if not css_links:
        # Kept as IndexError so callers that relied on the old ``[0]``
        # failure still see the same exception type.
        raise IndexError('no s3plus.meituan stylesheet link found in the downloaded page')
    css_response = requests.get('https:' + css_links[0], timeout=timeout).text
    with open('02 css样式.css', 'w', encoding='utf-8') as f:
        f.write(css_response)

    # Fetch every SVG lookup table referenced by a [class^="..."] rule.
    # The brackets, caret and parentheses are regex metacharacters and must
    # be escaped; unescaped, "[class^=...]" is read as a character class and
    # the two groups no longer capture (class prefix, svg url).
    svg_rules = re.findall(
        r'\[class\^="(.*?)"\]\s*\{.*?background-image:\s*url\((.*?)\);',
        css_response,
    )
    for name, svg_path in svg_rules:
        svg_response = requests.get('https:' + svg_path, timeout=timeout).text
        with open(f'03 svg对照表{name}.svg', 'w', encoding='utf-8') as f:
            f.write(svg_response)
def crack_data(prefix='zpd', glyph_width=14):
    """Build the mapping from obfuscated CSS class names to real characters.

    Reads ``03 svg对照表<prefix>.svg`` and ``02 css样式.css`` from the current
    working directory. Each ``.<prefix>xxx`` CSS rule carries a negative
    ``background`` offset ``-X.0px -Y.0px``; the character it stands for is
    found in the first SVG text row whose y coordinate is at least ``Y``, at
    column ``X // glyph_width``.

    :param prefix: class-name prefix of the rules to decode; also selects the
        SVG file to read. It changes whenever the site rotates its
        stylesheet, so pass the prefix of the SVG file actually downloaded.
    :param glyph_width: horizontal pixel distance between two characters in
        an SVG row.
    :return: dict mapping CSS class name to the single character it hides.
        Classes whose y offset is larger than every row are omitted.
    """
    with open(f'03 svg对照表{prefix}.svg', 'r', encoding='utf-8') as f:
        svg_html = f.read()
    sel = parsel.Selector(svg_html)

    # path id -> y coordinate (second space-separated token of the "d" attribute).
    path_dict = {}
    for path in sel.css('path'):
        path_dict[path.css('path::attr(id)').get()] = path.css('path::attr(d)').get().split(' ')[1]

    # y coordinate -> row of characters; the n-th textPath is paired with
    # the path whose id is n (1-based).
    zpd_svg_dict = {}
    for count, text in enumerate(sel.css('textPath'), start=1):
        zpd_svg_dict[path_dict[str(count)]] = text.css('textPath::text').get()
    print(zpd_svg_dict)

    with open('02 css样式.css', 'r', encoding='utf-8') as f:
        css_html = f.read()

    # "\." and "\d" need their backslashes: a bare "(d+)" only matches the
    # letter "d" and never a pixel offset. Whitespace is matched loosely so
    # both minified and pretty-printed stylesheets are accepted.
    css_pattern = re.compile(
        r'\.(' + re.escape(prefix) + r'[^\s{]*)\s*\{\s*'
        r'background:\s*-(\d+)\.0px\s+-(\d+)\.0px;\s*\}'
    )
    css_paths = css_pattern.findall(css_html)
    print(css_paths)

    # The row search below needs rows in ascending y order; sort explicitly
    # instead of relying on the order they appear in the SVG.
    rows = sorted((int(row_y), row_text) for row_y, row_text in zpd_svg_dict.items())

    last_map = {}
    for css_name, x, y in css_paths:
        index = int(x) // glyph_width
        offset_y = int(y)
        for row_y, row_text in rows:
            if offset_y <= row_y:
                last_map[css_name] = row_text[index]
                break
    return last_map
85
def decryption(last_map):
    """Return the downloaded page with obfuscated glyph tags replaced by text.

    Reads ``01 原始网页_加密.html`` from the current working directory and
    replaces every ``<svgmtsi class="X"></svgmtsi>`` element with
    ``last_map["X"]``. Each replacement is printed as it is made.

    :param last_map: dict mapping CSS class name to the character it hides.
    :return: the page HTML as a string. Tags whose class is missing from
        ``last_map`` are left untouched rather than raising ``KeyError``, so
        an empty or partial map still yields a usable document.
    """
    with open('01 原始网页_加密.html', 'r', encoding='utf-8') as f:
        ret = f.read()

    def _decode(match):
        # Swap one tag for its character, or keep the tag if it is unknown.
        css_class = match.group(1)
        if css_class not in last_map:
            return match.group(0)
        print(css_class, last_map[css_class])
        return last_map[css_class]

    # A single pass over the document instead of one full-document
    # str.replace per tag occurrence.
    return re.sub('<svgmtsi class="(.*?)"></svgmtsi>', _decode, ret)
def write_data(ret):
    """Extract the reviews from decoded HTML and insert them into MySQL.

    For every review ``<li>`` the reviewer name, star score, time, shop name
    and comment text are read, printed, and inserted into the ``dianping``
    table of the ``review`` database. Each row is committed individually, so
    rows written before a failure are kept.

    :param ret: decoded page HTML (a string).
    :return: None
    :raises IndexError: if a review lacks one of the expected elements.
    """
    # This div is only present on some reviews and shifts the position of the
    # comment div, so drop it before parsing. Leading whitespace is matched
    # loosely because the amount of indentation in the page is not fixed.
    ret = re.sub(r'\s*<div class="richtitle">消费后评价</div>', '', ret)
    etre = etree.HTML(ret)
    # NOTE(review): the XPath on this line was truncated in the reviewed
    # source ('//*[@>). The expression below was reconstructed from how each
    # ``li`` is used in the loop (it must have a child div of class
    # "main-review") - confirm against the live page structure.
    li_list = etre.xpath('//li[./div[@class="main-review"]]')

    # NOTE(review): credentials are hard-coded; consider loading them from
    # configuration or the environment.
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, database='review',
                         charset='utf8mb4')
    sql = 'insert into dianping(name,score,time,shop_name,comment)values(%s,%s,%s,%s,%s)'
    try:
        with db.cursor() as cursor:
            for li in li_list:
                name = li.xpath('./div[@class="main-review"]/div[1]/a/text()')[0].strip()
                score = re.findall('sml-rank-stars sml-str(.*?) star', li.xpath('./div[1]/div[2]/span[1]/@class')[0])[0].strip()
                # Relative paths (".//") keep the lookup inside this review.
                # A leading "//" searches the whole document, which forced a
                # running index that misaligns as soon as one review lacks
                # the element.
                review_time = li.xpath('.//div[@class="misc-info clearfix"]/span[1]/text()')[0].strip()
                shop_name = li.xpath('.//div[@class="misc-info clearfix"]/span[2]/text()')[0].strip()
                comment = ','.join(i.replace('\n', '').strip() for i in li.xpath('./div/div[4]/text()'))
                print(name, score, review_time, shop_name, comment)
                # Parameterized insert: values are never spliced into the SQL text.
                cursor.execute(sql, (name, score, review_time, shop_name, comment))
                db.commit()
    finally:
        # Close the connection even if parsing or an insert fails.
        db.close()
133
134
if __name__ == '__main__':
    # The cookie expires from time to time and must then be replaced.
    # NOTE(review): this is a session cookie committed to source; consider
    # loading it from configuration instead.
    cookie = "s_ViewType=10; _lxsdk_cuid=175e331ad79c8-0996df2d570671-46460e2a-1fa400-175e331ad79c8; _lxsdk=175e331ad79c8-0996df2d570671-46460e2a-1fa400-175e331ad79c8; _hc.v=c4dfac1c-01af-6a87-d803-2cd6b8db107a.1605834485; fspop=test; ctu=ef0b64e4cabf67f148563284ea8c8d0555a008f7ca0dee097831c90b52822812; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1605834487,1605835298,1606093773; cy=2; cye=beijing; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1606098153; expand=yes; _lxsdk_s=175f2cc7d23-6-9d5-75e%7C%7C532"
    # Review page of a single shop; swap in another shop's URL as needed.
    url = 'http://www.dianping.com/shop/130096343/review_all'

    # Best-effort pipeline: a failed step is reported and the next step still
    # runs. The underlying exception is printed as well, because the fixed
    # message alone hides the real cause (network error, missing file, ...).
    try:
        download_data(url, cookie)
    except Exception as exc:
        # Too many requests trigger a captcha page, which is not handled.
        print('出现验证码验证', exc)

    map_dict = {}
    try:
        map_dict = crack_data()
    except Exception as exc:
        print('css类属性发生变化', exc)

    ret = decryption(map_dict)
    write_data(ret)