1 """
2 原生爬虫
3
4 爬虫前奏:
5 明确目的
6 找到数据对应的网页
7 分析网页的结构找到数据所在的标签位置
8
9 模拟HTTP请求,向服务器发送这个请求,获取到服务器返回给我们的HTML
10 用正则表达式提取我们要的数据(名字、人气<热度>)
11
12 参考文档:
13 https://blog.****.net/qq_38151401/article/details/93018656
14
15 思路:
16
17 (1)获取网页内容
18
19 (2)分析所要获取的数据格式
20
21 (3)获取相应的数据
22
23 (4)将数据转化为所需要的格式
24
25 (5)数据展现
26 """
1 #样例:原生爬虫爬取虎牙的王者荣耀板块,进行主播人气排序
2 #拓展爬虫框架:BeautifulSoup,Scrapy
3 # 爬虫、反爬虫、反反爬虫 ip容易被封,代理IP库
4 import re
5 from urllib import request
6 import ssl
7 #断点调试
class Spider():
    """Crawl Huya's Honor of Kings category and rank streamers by popularity.

    The pipeline, driven by :meth:`go`, is: download the category page, cut
    it into per-streamer HTML fragments with regular expressions, extract
    the nickname and popularity text from each fragment, sort by popularity
    (highest first) and print the ranking.
    """

    # Page that is crawled.
    url = 'https://www.huya.com/g/wzry'
    # One streamer fragment: everything between the "txt" span and the next
    # closing </li>. [\s\S] matches any character, including newlines, so the
    # lazy group can span several lines of markup.
    root_pattern = r'<span class="txt">([\s\S]*?)</li>'
    # Streamer nickname, read from the title attribute of the "nick" element.
    name_pattern = r'<i class="nick" title="([\s\S]*?)">'
    # Popularity text inside the "js-num" element, e.g. "473.4万".
    number_pattern = r'<i class="js-num">([\s\S]*?)</i>'

    def __fetch_content(self):
        """Download ``Spider.url`` and return the page source as text.

        Returns:
            The response body decoded as UTF-8.
        """
        # NOTE: certificate verification is intentionally skipped, but only
        # for this request; the process-wide default HTTPS context is no
        # longer replaced. check_hostname must be cleared before verify_mode.
        context = ssl.create_default_context()
        context.check_hostname = False
        context.verify_mode = ssl.CERT_NONE
        # The context manager closes the connection even if read() fails.
        with request.urlopen(Spider.url, context=context) as response:
            raw = response.read()
        return raw.decode('utf-8')

    def __analysis(self, htmls):
        """Extract the raw name/number matches for every streamer fragment.

        Args:
            htmls: Full page source as a string.

        Returns:
            A list of dicts ``{'name': [...], 'number': [...]}`` where each
            value is the list of regex matches found in one fragment. The
            list is empty when no fragment matches.
        """
        anchors = []
        for fragment in re.findall(Spider.root_pattern, htmls):
            anchors.append({
                'name': re.findall(Spider.name_pattern, fragment),
                'number': re.findall(Spider.number_pattern, fragment),
            })
        return anchors

    def __refine(self, anchors):
        """Flatten the match lists produced by the analysis step.

        Fragments for which the name or the number was not found are
        skipped instead of raising ``IndexError``.

        Args:
            anchors: Iterable of ``{'name': [...], 'number': [...]}`` dicts.

        Yields:
            ``{'name': str, 'number': str}`` with the first match of each
            field; the name is stripped of surrounding whitespace.
        """
        for anchor in anchors:
            if not anchor['name'] or not anchor['number']:
                continue
            yield {
                'name': anchor['name'][0].strip(),
                'number': anchor['number'][0],
            }

    def __sort(self, anchors):
        """Return *anchors* as a new list ordered by popularity, highest first."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Return the numeric popularity of *anchor* for use as a sort key.

        Thousands separators are removed and the ``万`` suffix is applied as
        a factor of 10,000, e.g. ``'1,404.5万'`` -> ``14045000.0`` and
        ``'9,876'`` -> ``9876.0``.

        Args:
            anchor: Dict with a ``'number'`` entry holding the popularity text.

        Raises:
            ValueError: If the text is not a number once ``,`` and ``万``
                have been removed.
        """
        text = str(anchor['number']).strip().replace(',', '')
        in_ten_thousands = '万' in text
        number = float(text.replace('万', ''))
        if in_ten_thousands:
            number *= 10000
        return number

    def __show(self, anchors):
        """Print one ``rank N:name number`` line per anchor, in order."""
        for rank, anchor in enumerate(anchors, start=1):
            print('rank {}:{} {}'.format(rank, anchor['name'], anchor['number']))

    def go(self):
        """Run the whole pipeline: fetch, analyse, refine, sort and print."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)
81
# Script entry point: build the crawler and run its pipeline. These are
# top-level statements, so they execute whenever the module is loaded,
# including on import.
spider = Spider()
spider.go()
84
85 """
86 <li class="game-live-item" g>
87 <a href="https://www.huya.com/688" class="video-info " target="_blank">
88 <img class="pic" data-original="//live-cover.msstatic.com/huyalive/1259515661837-1259515661837-4682562792811659264-2519031447130-10057-A-0-1/20200723205252.jpg?x-oss-process=image/resize,limit_0,m_fill,w_338,h_190/sharpen,80/format,jpg/interlace,1/quality,q_90" src="//live-cover.msstatic.com/huyalive/1259515661837-1259515661837-4682562792811659264-2519031447130-10057-A-0-1/20200723205252.jpg?x-oss-process=image/resize,limit_0,m_fill,w_338,h_190/sharpen,80/format,jpg/interlace,1/quality,q_90" data-default-img="338x190" alt="张大仙的直播" title="张大仙的直播">
89
90 <em class="tag tag-recommend">超级明星</em>
91
92 <div class="item-mask"></div>
93 <i class="btn-link__hover_i"></i>
94 <p class="tag-right">
95
96 <!-- 手机开播 -->
97
98 <!-- VR直播 -->
99
100 <!-- 无损音质 || 蓝光 -->
101 <em class="tag-blue">蓝光8M</em>
102
103
104
105 </p>
106 </a>
107 <a href="https://www.huya.com/688" class="title" title="大仙来啦" target="_blank">大仙来啦</a>
108 <span class="txt">=============================================================================
109 <span class="avatar fl">
110 <img data-original="https://huyaimg.msstatic.com/avatar/1016/b9/b6824c9d5593f03f5b5c4f71189023_180_135.jpg" src="https://huyaimg.msstatic.com/avatar/1016/b9/b6824c9d5593f03f5b5c4f71189023_180_135.jpg" data-default-img="84x84" alt="张大仙" title="张大仙">
111 <i class="nick" title="张大仙">张大仙</i>
112 </span>
113 <span class="num">
114 <i class="num-icon"></i>
115 <i class="js-num">1,404.5万</i></span>
116 </span>
117 </li>
118
119 """