Analyzing Ajax requests to scrape keyword image galleries from Toutiao (今日头条)

# Goal: scrape image galleries for a keyword search on Toutiao
# Approach:
# 1. Analyze the target site
# 2. Construct the Ajax request, fetch the index page with requests, and parse the JSON response to get the article URLs
# 3. Request each article URL, extract the image URLs and the title with a regex + BeautifulSoup, download the images and save the record to a database (MongoDB this time)
# 4. Loop over multiple pages with a process pool and scrape them all

# Question 1: why do we need to construct the request?
# As an example, the URL actually requested behind the first screen of results is:
# http://www.toutiao.com/search_content/?offset=20&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1
# The long tail of parameters is the 'settings' of the request: the keyword, how many results to load, and so on, and it starts out as a dict.
# Typing that query string out by hand would be tedious, so we encode the dict into query-string form and attach it to the request.
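# A quick illustration (values assumed) of what urlencode does with such a dict:
#   >>> from urllib.parse import urlencode
#   >>> urlencode({'offset': 20, 'format': 'json', 'keyword': '街拍', 'count': 20})
#   'offset=20&format=json&keyword=%E8%A1%97%E6%8B%8D&count=20'
# which is exactly the query-string format seen in the URL above.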
import os
from json import JSONDecodeError
from multiprocessing.pool import Pool

import requests
from urllib.parse import urlencode
import json
import pymongo

from bs4 import BeautifulSoup

from requests.exceptions import RequestException
import re
from config import *
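# config.py is not shown in this post; a rough sketch of what it is assumed to
# contain (names taken from how they are used below, values are placeholders):
#   MONGO_URL = 'localhost'
#   MONGO_DB = 'toutiao'
#   MONGO_TABLE = 'jiepai'      # collection used by save_to_mongo()
#   KEYWORDS = '街拍'           # search keyword
#   GROUP_START = 1             # first page group to fetch
#   GROUP_END = 20              # last page group to fetch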
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]


def get_index_page(offset, keyword):
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 1
    }
    data = urlencode(data)
    url = 'http://www.toutiao.com/search_content/?' + data
    # print(url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except RequestException:
        print('Failed to fetch the index page!')
        return None

def parse_index_page(html):

    # json.dumps() serializes a Python object into a JSON string;
    # json.loads() parses a JSON string back into a Python object.
    # The index response is JSON, so load it into a dict first.
    try:
        data = json.loads(html)
    except JSONDecodeError:
        return
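    # A tiny illustration (assumed, heavily trimmed response) of what json.loads returns here:
    #   json.loads('{"count": 20, "data": [{"article_url": "http://toutiao.com/a123"}]}')
    #   -> {'count': 20, 'data': [{'article_url': 'http://toutiao.com/a123'}]}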
    if data and 'data' in data.keys():
        for item in data.get('data'):
            yield item.get('article_url')


def get_detail_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except RequestException:
        return None


def save_to_mongo(result):
    if db[MONGO_TABLE].insert(result):
        print('Saved to MongoDB:', result)
        return True
    else:
        return False

def parse_detail_page(html, url):
    soup = BeautifulSoup(html, 'lxml')
    title = soup.title.string
    pattern = re.compile(r'var gallery = (.*?);', re.S)
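    # The detail page embeds the gallery data as JSON inside a <script> block,
    # roughly of this shape (inferred from the parsing below; values made up):
    #   var gallery = {"sub_images": [{"url": "http://p3.pstatp.com/origin/abc123"}, ...], ...};
    # The capture group grabs everything between 'var gallery = ' and the trailing ';'.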
    result = re.findall(pattern, html)
    if result:
        images = []
        for i in result:
            # each match is a JSON string holding the gallery data
            i = json.loads(i)
            sub_images = i.get('sub_images')
            if sub_images:
                for k in sub_images:
                    images.append(k.get('url'))
        return {
            'title': title,
            'url': url,
            'images': images
        }

def download_image(result):
    image_list = result.get('images')
    image_title = result.get('title')
    print('Downloading gallery: %s' % image_title)

    if image_title not in os.listdir(path='.'):
        os.mkdir(image_title)
        os.chdir(image_title)
        for image in image_list:
            try:
                response = requests.get(image)
                if response.status_code == 200:
                    filename = image.split('/')[-1] + '.jpg'
                    with open(filename, 'wb') as f:
                        f.write(response.content)
                        print('Downloading image: %s' % image)
            except RequestException:
                # skip this image instead of returning, so we still chdir back below
                continue
        os.chdir(os.pardir)  # return to the parent directory

def main(offset):
    html = get_index_page(offset, KEYWORDS)
    if not html:
        return
    for url in parse_index_page(html):
        # print(url)
        html = get_detail_page(url)
        if html:
            result = parse_detail_page(html, url)
            if result:
                # print(result)
                # save_to_mongo(result)
                download_image(result)


if __name__ == '__main__':

    groups = [i * 20 for i in range(GROUP_START, GROUP_END + 1)]
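    # For example (placeholder config values), GROUP_START = 1 and GROUP_END = 3
    # give groups = [20, 40, 60], i.e. one offset value per batch of 20 results.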
    pool = Pool()
    pool.map(main, groups)

# For comparison: the same scraper as written by a more experienced author
import json
import os
from urllib.parse import urlencode
import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
import re
from multiprocessing import Pool
from hashlib import md5
from json.decoder import JSONDecodeError
from config import *

client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]


def get_page_index(offset, keyword):
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def download_image(url):
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except ConnectionError:
        return None


def save_image(content):
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
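    # For illustration: md5(b'hello').hexdigest() is '5d41402abc4b2a76b9719d911017c592',
    # so an image whose bytes were b'hello' would be saved under that name + '.jpg' in
    # the current directory; naming files by a hash of their content deduplicates
    # images that are downloaded more than once.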
    print(file_path)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def parse_page_index(text):
    try:
        data = json.loads(text)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        pass


def get_page_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def parse_page_detail(html, url):
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    images_pattern = re.compile('var gallery = (.*?);', re.S)
    result = re.search(images_pattern, html)
    if result:
        data = json.loads(result.group(1))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images
            }


def save_to_mongo(result):
    if db[MONGO_TABLE].insert(result):
        print('Successfully Saved to Mongo', result)
        return True
    return False


def main(offset):
    text = get_page_index(offset, KEYWORD)
    urls = parse_page_index(text)
    for url in urls:
        html = get_page_detail(url)
        if html:
            result = parse_page_detail(html, url)
            if result:
                save_to_mongo(result)


if __name__ == '__main__':
    pool = Pool()
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()