1 import os
2 import requests
3 from urllib.parse import urlencode
4 from hashlib import md5
5 from multiprocessing.pool import Pool
6
# Inclusive range of result pages to fetch. Each page holds 20 items,
# so the offsets submitted to the pool are GROUP_START*20 .. GROUP_END*20.
GROUP_START = 1
GROUP_END = 5
9
def get_page(offset):
    """Fetch one page of Toutiao gallery search results as parsed JSON.

    Args:
        offset: pagination offset passed to the search endpoint
                (multiples of 20 in this script).

    Returns:
        The decoded JSON dict on HTTP 200, otherwise None (including
        connection errors and timeouts).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        'from': 'gallery',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # A timeout keeps a stalled connection from blocking this pool
        # worker forever (the original call could hang indefinitely).
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.json()
        return None
    except requests.RequestException:
        # RequestException covers ConnectionError AND Timeout/HTTPError;
        # the original only caught ConnectionError, so a timeout would
        # have killed the worker.
        return None
27
def get_images(json):
    """Yield {'image': url, 'title': title} dicts from one result page.

    Args:
        json: the dict returned by get_page(), or None on fetch failure.
              (Parameter name shadows the stdlib module but is kept for
              backward compatibility with keyword callers.)

    Yields:
        One dict per image URL found in each item's 'image_list'.
        A None/empty page yields nothing instead of raising, and items
        without an 'image_list' (e.g. ad/text entries) are skipped
        instead of raising TypeError as the original did.
    """
    if not json:
        # get_page() returns None on failure; treat it as an empty page.
        return
    data = json.get('data')
    if not data:
        return
    for item in data:
        image_list = item.get('image_list')
        if not image_list:
            # Entry carries no images; nothing to yield for it.
            continue
        title = item.get('title')
        for image in image_list:
            yield {
                'image': image.get('url'),
                'title': title,
            }
41
def save_image(item):
    """Download one image and store it under a directory named by its title.

    The file name is the MD5 of the response body, so an identical image
    downloaded twice is detected and skipped.

    Args:
        item: dict with 'image' (protocol-relative or full URL) and
              'title' keys, as produced by get_images().
    """
    # Fall back to a fixed directory when the title is missing/empty,
    # instead of crashing on os.mkdir(None).
    img_dir = item.get('title') or 'untitled'
    # exist_ok avoids the check-then-create race between pool workers
    # saving into the same title directory concurrently.
    os.makedirs(img_dir, exist_ok=True)
    try:
        local_image_url = item.get('image')
        if not local_image_url:
            return
        # Ask the CDN for the full-size image rather than the list thumbnail.
        new_image_url = local_image_url.replace('list', 'large')
        # Only prepend a scheme for protocol-relative URLs ('//p3...');
        # the original unconditional 'http:' + url breaks URLs that
        # already carry a scheme.
        if new_image_url.startswith('//'):
            new_image_url = 'http:' + new_image_url
        response = requests.get(new_image_url, timeout=10)
        if response.status_code == 200:
            file_path = '{0}/{1}.{2}'.format(
                img_dir, md5(response.content).hexdigest(), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print('Already Downloaded', file_path)
    except requests.RequestException:
        # Covers Timeout as well as ConnectionError.
        print('Failed to save image')
58
def main(offset):
    """Pool-worker entry point: fetch one result page and save its images.

    Args:
        offset: pagination offset forwarded to get_page().
    """
    page = get_page(offset)
    if page is None:
        # get_page() returns None on network failure / non-200; the
        # original passed that None straight into get_images() and
        # crashed the worker with AttributeError.
        return
    for item in get_images(page):
        print(item)
        save_image(item)
64
if __name__ == '__main__':
    # Fan the page offsets out across a worker pool: each page holds
    # 20 items, so offsets step by 20 over the configured page range.
    offsets = [page * 20 for page in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, offsets)
    pool.close()
    pool.join()