1 # 目标:抓取今日头条关键字美图
2 # 思路:
3 # 一、分析目标站点
4 # 二、构造ajax请求,用requests请求到索引页的内容,正则+BeautifulSoup得到索引url
5 # 三、对索引url请求,得到图片url与标题,下载并保存到数据库,本次使用MongoDB
6 # 四、开启循环与多进程,对多页内容遍历与抓取
7
8 #问题一、为什么要构造请求
9 #为什么要构造请求,举个例子,第一屏的内容我们看到的实际url是:
10 # http://www.toutiao.com/search_content/?offset=20&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1
11 # 后面有一大串参数,这些参数就是请求的一些‘设定’,表示关键词,加载的页数,等等,是一个字典的形式,
12 # 如果人为去传这些数据显然十分繁琐,我们需要将这字典编码成一定格式加载请求函数里面。
13 import os
14 from json import JSONDecodeError
15 from multiprocessing.pool import Pool
16
17 import requests
18 from urllib.parse import urlencode
19 import json
20 import pymongo
21
22 from bs4 import BeautifulSoup
23
24 from requests.exceptions import RequestException
25 import re
26 from config import *
27
# Module-level MongoDB handle; MONGO_URL / MONGO_DB come from config.py.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
30
def get_index_page(offset, keyword):
    """Fetch one page of the Toutiao search-content index as raw text.

    Args:
        offset: pagination offset (multiples of 20).
        keyword: search keyword to query for.

    Returns:
        The response body on HTTP 200, otherwise None (also on any
        request-level failure).
    """
    params = urlencode({
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 1,
    })
    url = 'http://www.toutiao.com/search_content/?' + params
    try:
        resp = requests.get(url)
    except RequestException:
        print('请求不到索引页面!')
        return None
    return resp.text if resp.status_code == 200 else None
52
53
def parse_index_page(html):
    """Parse the JSON index page and yield each article URL.

    Args:
        html: raw JSON text returned by get_index_page().

    Yields:
        The 'article_url' value of every entry under the top-level 'data'
        key. Yields nothing when the text is not valid JSON (the site can
        return an HTML error page) or lacks a 'data' key.
    """
    # Fix: JSONDecodeError is imported at the top of this file but was never
    # used — malformed responses previously crashed the whole crawl.
    try:
        data = json.loads(html)
    except JSONDecodeError:
        return
    if data and 'data' in data.keys():
        for item in data.get('data'):
            yield item.get('article_url')
63
64
def get_detail_page(url):
    """Download a detail (gallery) page; return its HTML, or None on failure."""
    try:
        resp = requests.get(url)
    except RequestException:
        return None
    return resp.text if resp.status_code == 200 else None
74
def save_to_mongo(result):
    """Insert one crawled record into the configured MongoDB collection.

    Args:
        result: dict with 'title', 'url' and 'images' keys.

    Returns:
        True when the insert succeeded, False otherwise.
    """
    # Fix: the config constant is MONGO_TABLE (as used elsewhere in this
    # file); MONG_TABLE was a typo that raised NameError at runtime.
    if db[MONGO_TABLE].insert(result):
        print('存储到MongoDB成功',result)
        return True
    return False
81
def parse_detail_page(html, url):
    """Extract the page title and gallery image URLs from a detail page.

    Args:
        html: HTML text of the detail page.
        url: the page URL (stored alongside the result).

    Returns:
        dict with 'title', 'url' and 'images' keys when a gallery script
        was found in the page, otherwise None (implicit).
    """
    soup = BeautifulSoup(html, 'lxml')
    # Robustness: pages without a <title> tag previously raised
    # AttributeError on soup.title.string.
    title = soup.title.string if soup.title else ''
    pattern = re.compile(r'var gallery = (.*?);', re.S)
    result = re.findall(pattern, html)
    if result:
        images = []
        for raw in result:
            gallery = json.loads(raw)
            # Robustness: 'sub_images' may be absent — iterating None
            # previously raised TypeError.
            for sub in gallery.get('sub_images') or []:
                images.append(sub.get('url'))
        return {
            'title': title,
            'url': url,
            'images': images
        }
102
def download_image(result):
    """Download every image of one gallery into a directory named by its title.

    Args:
        result: dict produced by parse_detail_page() with 'title' and
            'images' keys.

    A directory that already exists is treated as an already-downloaded
    gallery and skipped (same as the original behaviour).
    """
    image_list = result.get('images')
    image_title = result.get('title')
    print('正在下载:%s'%image_title)

    if image_title in os.listdir(path='.'):
        return
    os.mkdir(image_title)
    for image in image_list:
        try:
            response = requests.get(image)
        except RequestException:
            # Fix: a single failed image used to 'return None', aborting the
            # rest of the gallery AND skipping the final os.chdir(os.pardir),
            # which left the process cwd stuck inside the gallery directory.
            continue
        if response.status_code == 200:
            # Build the path explicitly instead of os.chdir()-ing in and out,
            # so the working directory never changes.
            filename = os.path.join(image_title, image.split('/')[-1] + '.jpg')
            with open(filename, 'wb') as f:
                f.write(response.content)
            print('正在下载:%s'%image)
125
126
def main(offset):
    """Crawl one index page: fetch it, then download each gallery it lists."""
    index_html = get_index_page(offset, KEYWORDS)
    for article_url in parse_index_page(index_html):
        detail_html = get_detail_page(article_url)
        if not detail_html:
            continue
        record = parse_detail_page(detail_html, article_url)
        if record:
            download_image(record)
139
140
141
if __name__ == '__main__':

    # One offset per page of 20 results, GROUP_START..GROUP_END inclusive.
    groups = [i * 20 for i in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
    # Fix: release the worker processes once the map completes (the original
    # left the pool open).
    pool.close()
    pool.join()
1 #对比老司机所写
2 import json
3 import os
4 from urllib.parse import urlencode
5 import pymongo
6 import requests
7 from bs4 import BeautifulSoup
8 from requests.exceptions import ConnectionError
9 import re
10 from multiprocessing import Pool
11 from hashlib import md5
12 from json.decoder import JSONDecodeError
13 from config import *
14
# MongoDB handle for this (second) version of the script.
# connect=False defers the actual connection until first use — per the
# pymongo docs this avoids sharing one connection across the forked
# multiprocessing.Pool workers used below.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]
17
18
def get_page_index(offset, keyword):
    """Request one page of the search-content index; return its text or None."""
    query = urlencode({
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    })
    url = 'http://www.toutiao.com/search_content/' + '?' + query
    try:
        response = requests.get(url)
    except ConnectionError:
        print('Error occurred')
        return None
    if response.status_code == 200:
        return response.text
    return None
39
40
def download_image(url):
    """Fetch one image URL and hand its bytes to save_image()."""
    print('Downloading', url)
    try:
        resp = requests.get(url)
    except ConnectionError:
        return None
    if resp.status_code == 200:
        save_image(resp.content)
    return None
50
51
def save_image(content):
    """Write image bytes to <cwd>/<md5-of-content>.jpg, skipping duplicates.

    Args:
        content: raw image bytes.

    Naming files by the md5 of their content deduplicates identical images
    automatically: an existing file is never rewritten.
    """
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        # 'with' already closes the file; the original's explicit f.close()
        # inside the with-block was redundant and has been removed.
        with open(file_path, 'wb') as f:
            f.write(content)
59
60
def parse_page_index(text):
    """Yield every 'article_url' from the index JSON; ignore malformed JSON."""
    try:
        payload = json.loads(text)
    except JSONDecodeError:
        return
    if payload and 'data' in payload.keys():
        for entry in payload.get('data'):
            yield entry.get('article_url')
69
70
def get_page_detail(url):
    """GET a detail page; return its body on HTTP 200, otherwise None."""
    try:
        resp = requests.get(url)
    except ConnectionError:
        print('Error occurred')
        return None
    return resp.text if resp.status_code == 200 else None
80
81
def parse_page_detail(html, url):
    """Parse a detail page: extract title + gallery URLs, downloading each image.

    Args:
        html: detail page HTML.
        url: page URL, stored in the returned record.

    Returns:
        dict with 'title', 'url' and 'images' keys when a gallery script was
        found in the page, otherwise None (implicit).
    """
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    images_pattern = re.compile('var gallery = (.*?);', re.S)
    result = re.search(images_pattern, html)
    if result:
        data = json.loads(result.group(1))
        # Fix: 'images' was only bound inside the 'sub_images' branch, so a
        # gallery blob without that key hit UnboundLocalError at the return.
        images = []
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
        return {
            'title': title,
            'url': url,
            'images': images
        }
99
100
def save_to_mongo(result):
    """Insert one record into MongoDB; report and return success as a bool."""
    inserted = db[MONGO_TABLE].insert(result)
    if inserted:
        print('Successfully Saved to Mongo', result)
        return True
    return False
106
107
def main(offset):
    """Crawl one index page: parse its article URLs and store each gallery."""
    index_text = get_page_index(offset, KEYWORD)
    for article_url in parse_page_index(index_text):
        detail_html = get_page_detail(article_url)
        record = parse_page_detail(detail_html, article_url)
        if record:
            save_to_mongo(record)
115
116
# Fix: guard the pool launch. Without this, multiprocessing's 'spawn' start
# method (the default on Windows, and on macOS since Python 3.8) re-imports
# this module in every worker, which re-runs the map and raises RuntimeError.
if __name__ == '__main__':
    pool = Pool()
    groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
    pool.map(main, groups)
    pool.close()
    pool.join()