今日头条网页图片爬取

 1 import requests,os,json,re
 2 from urllib import request
 3 from day3.mysql_text import mysql_conn
 4 for i in range(0,60,20):
 5     url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'.format(i)
 6     print(url)
 7 
 8     headers = {
 9         'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'
10     }
11 
12     response = requests.get(url,headers=headers)
13     html_json_dict = response.json()
14 
15 
16     # 获取dict中的data key对应的列表
17 
18 
19 
20     data_list = html_json_dict['data']
21 
22     # 获取列表中含有article_url的值
23     for data_item in data_list:
24         if 'article_url' in data_item:
25             article_url = data_item['article_url']
26 
27             response = requests.get(article_url,headers=headers)
28 
29             html_str = response.text
30             pattern = r'gallery: JSON.parse((.*)),'
31 
32             match_res = re.search(pattern, html_str)
33 
34             # 新建文件夹
35             if not os.path.exists('downloads'):
36                 os.mkdir('downloads')
37 
38             if match_res:
39                 # print(match_res.group(1))
40                 json_origin = match_res.group(1)
41                 a1 = json.loads(json_origin)
42                 # print(a1,type(a1))
43                 a2 = json.loads(a1)
44                 # print(a2['sub_images'])
45                 for a2_list in a2['sub_images']:
46                     image_url = a2_list['url']
47 
48                     filename = 'downloads/' + image_url.split('/')[-1] + '.jpg'
49                     print(filename)
50                     request.urlretrieve(image_url, filename)
51 
52             else:
53                 pass