1 import requests,os,json,re
2 from urllib import request
3 from day3.mysql_text import mysql_conn
# Scrape three pages of Toutiao search results (offsets 0, 20, 40) for the
# URL-encoded keyword in the query string, open every result that carries an
# 'article_url', and download each image listed in that article's embedded
# gallery JSON into the local 'downloads' folder.

# Seconds to wait on each HTTP request; without a timeout requests.get can
# block forever on an unresponsive server.
REQUEST_TIMEOUT = 10

# The request headers never change, so build them once instead of per page.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'
}

# Matches the inline `gallery: JSON.parse(<payload>),` assignment in an article
# page and captures <payload>. The call parentheses are escaped so they match
# literally; left unescaped they act as a second regex group, the capture then
# includes the parentheses themselves, and json.loads rejects it.
pattern = re.compile(r'gallery: JSON\.parse\((.*)\),')

# Create the output folder once, up front; exist_ok avoids the
# check-then-create race of os.path.exists + os.mkdir.
os.makedirs('downloads', exist_ok=True)

for i in range(0, 60, 20):
    url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'.format(i)
    print(url)

    search_response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    html_json_dict = search_response.json()

    # Get the list stored under the 'data' key; a missing or null entry is
    # treated as an empty page instead of raising.
    data_list = html_json_dict.get('data') or []

    for data_item in data_list:
        # Only the entries that contain 'article_url' are followed.
        if 'article_url' not in data_item:
            continue
        article_url = data_item['article_url']

        article_response = requests.get(
            article_url, headers=headers, timeout=REQUEST_TIMEOUT
        )
        match_res = pattern.search(article_response.text)
        if not match_res:
            # No gallery payload found on this page; nothing to download.
            continue

        # The payload is decoded twice: the first pass is expected to yield a
        # string that is itself JSON, the second pass yields the gallery dict.
        gallery = json.loads(json.loads(match_res.group(1)))

        for image in gallery.get('sub_images', []):
            image_url = image['url']

            # Name the file after the last path segment of the image URL.
            filename = 'downloads/' + image_url.split('/')[-1] + '.jpg'
            print(filename)
            request.urlretrieve(image_url, filename)