Python爬网——获取安卓手机统计数据

Python爬网——获取安卓手机统计数据

[本文出自天外归云的博客园]

1. 在安卓网上对热门机型进行爬网,取前五十:

# -*- coding: utf-8 -*-
import requests,re
from bs4 import BeautifulSoup

def get_rank_list(limit=50, max_pages=9):
    """Return handset names scraped from the hiapk mobile product listing.

    Listing pages are fetched in order (page 1, 2, ...) and the ``title``
    attribute of the link inside every ``<dt>`` under the element with
    ``id="content"`` is collected.  Names matching ``iphone``
    (case-insensitive) are skipped.

    Args:
        limit: Maximum number of names to return (default 50).
        max_pages: Maximum number of listing pages to request (default 9).

    Returns:
        A list of at most ``limit`` handset names, in page order.
    """
    rank_list = []
    if limit <= 0 or max_pages <= 0:
        # Nothing can be collected, so avoid opening a session at all.
        return rank_list

    with requests.Session() as session:
        for page_num in range(1, max_pages + 1):
            url = ("http://product.hiapk.com/mobile/p" + str(page_num)
                   + "-s1-list.html")
            response = session.get(url)
            soup = BeautifulSoup(response.content, "lxml")
            content = soup.find(id='content')
            if content is None:
                # No listing container on this page: treat it as the end
                # of the listing rather than crashing on None.
                break
            for item in content.find_all('dt'):
                link = item.find('a')
                phone_name = link.get('title') if link is not None else None
                if not phone_name:
                    continue
                # Skip iPhone entries but keep scanning the rest of the page.
                if re.search('iphone', phone_name, re.IGNORECASE):
                    continue
                rank_list.append(phone_name)
                if len(rank_list) >= limit:
                    # Full: return now so no further page is requested.
                    return rank_list
    return rank_list

if __name__ == '__main__':
    # Print every collected handset name on its own line, in ranking order.
    ranked_phones = get_rank_list()
    for phone_name in ranked_phones:
        print(phone_name)

2. 在talkingdata上对安卓手机统计数据进行分类爬取:

# -*- coding: utf-8 -*-
import requests,re,sys
from bs4 import BeautifulSoup

'''
    type:
        1-按品牌排名
        2-按机型排名
        3-按分辨率排名
        4-按操作系统排名
        5-按运营商排名
        6-按网络排名
'''
def rank_crawl(type):
    """Fetch one TalkingData terminal ranking page and return its entry titles.

    Args:
        type: Value sent as the ``terminalType`` query parameter:
            1 - ranking by brand
            2 - ranking by model
            3 - ranking by resolution
            4 - ranking by operating system
            5 - ranking by carrier
            6 - ranking by network
            (The name shadows the builtin; it is kept so existing callers
            keep working.)

    Returns:
        A list with the ``title`` attribute of every ``<a>`` found under the
        element with ``id="list-content"``, in document order.  The list is
        empty when the page has no such element.
    """
    url = 'http://mi.talkingdata.com/terminals.html?terminalType=' + str(type)
    # The context manager closes the session once the page is parsed.
    with requests.Session() as session:
        response = session.get(url)
        soup = BeautifulSoup(response.content, "lxml")

    list_content = soup.find(id='list-content')
    if list_content is None:
        return []
    # Anchors without a title attribute carry no ranking entry; skip them.
    return [anchor['title']
            for anchor in list_content.find_all('a')
            if anchor.has_attr('title')]

if __name__ == '__main__':
    # Command-line category name -> terminalType value passed to rank_crawl.
    rank_types = {
        'brand': 1,
        'model': 2,
        'resolution': 3,
        'system': 4,
        'operator': 5,
        'network': 6,
    }
    if len(sys.argv) < 2 or sys.argv[1] not in rank_types:
        # Fail with a readable usage line instead of IndexError/KeyError.
        choices = '|'.join(sorted(rank_types, key=rank_types.get))
        sys.exit('usage: ' + sys.argv[0] + ' ' + choices)

    # Crawl only the requested category; the other five are never printed.
    for one in rank_crawl(rank_types[sys.argv[1]]):
        # Output stays GBK-encoded as before; characters GBK cannot
        # represent are replaced instead of aborting the whole listing.
        print(one.encode("gbk", "replace"))