python爬取京东商品评论,数据稍微多点就出现报错JSONDecodeError Traceback (most recent call last)
问题描述:
爬取得数据稍微多一点就会报错
代码
#程序文件Pex3_r_5.py
#爬取京东产品评论
# 导入必要的包
import requests
import json #JSON(JavaScript Object Notation) 是一种轻量级的数据交换格式,易于人阅读和编写
import pandas as pd
# header这个的作用在于伪装成浏览器进行操作,有些网页识别到不是浏览器就不能访问,User-Agent能伪装
# User-Agent可以用不同个,一般在刚刚找网页网址url的Headers的下面就有,当然也可以使用手机的,可网页搜索找到不同的User-Agent,都能进行相应操作
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"}
b=[]
for page in range(0,1999+1):
#https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=100018640796&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1
url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=64119968113&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1'.format(page)
# 我们可以简单的解析这个网址,前面不动,后面的我们点击下一页,看会出现什么改变
#https://rate.tmall.com/list_detail_rate.htm?itemId=40522681118&spuId=932989129&sellerId=2106754117&order=3¤tPage=1&append=0&content=1&tagId=&posi=&picture=&groupId=&ua=098%23E1hvEvvnvPOvUpCkvvvvvjiWPFLwljEmRFFwAjEUPmPh0jrRRFdhgjn8RLzwljrnnUvCvCLNtAxG4DdNzY%2F3SP1fJYYIzYSw54QCvvyvvOA8tvvvyXmVvpvhvUCvpvgCvvLMMQvvuvhvmvvvpLdyf92nkvhvC9hvpyPyAb9Cvm9vvhCvvvvvvvvvp09vvvjUvvCHhQvv9pvvvhZLvvvCfvvvBBWvvvH%2BmvhvLv8K8A%2BaaXgArqhl7E7tRu0Adch%2Ba4mAdBkK4Z7xfXeK5dUfbjxrV8g7rjlUzn97RqJ6EvLvqbVQKfE9ZKFE%2BFuTRogRiNoAdXKKN6wvvpvVvUCvpvvvRvhvCvvvphmevpvhvvCCB29CvvpvvhCvdvhvmpmvjC9gvvmwNQ%3D%3D&needFold=0&_ksTS=1628759080568_464&callback=jsonp465
#https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=5225346&score=0&sortType=5&page=1&pageSize=10&isShadowSku=0&rid=0&fold=1
# 我们发现只有page在变化,根据这个我们可以进行翻页爬取,我们先进行第一页的操作
# 先向浏览器发送请求
response = requests.get(url, headers=header)
data = response.text
# 解码 JSON 数据。该函数返回 Python 字段的数据类型。
jd = json.loads(data.lstrip('fetchJSON_comment98(').rstrip(');'))
#用于去掉左边和右边字符串的
data_list = jd['comments']
for data in data_list:
buyer_id = data['id'] # 评论买家id
content = data['content'] # 评论内容
time = data['creationTime'] # 评论时间
#print(content)
#直接写入记事本文本文件
#f1=open("1.txt","a")
#f1.writelines(str(buyer_id) + content +"\n")
b.append([buyer_id,content,time])
c=pd.DataFrame(b,columns=["id","content","creationTime"])
#print(b)
f=pd.ExcelWriter('reuqest6.xlsx') #创建文件对象
c.to_excel(f,"sheet1") #把c写入Excel文件
#c.to_excel(f,"sheet2") #c再写入另一个表单中
f.save()
运行结果及报错内容
JSONDecodeError Traceback (most recent call last)
<ipython-input-23-5415769acf43> in <module>
20 data = response.text
21 # 解码 JSON 数据。该函数返回 Python 字段的数据类型。
---> 22 jd = json.loads(data.lstrip('fetchJSON_comment98(').rstrip(');'))
23 #用于去掉左边和右边字符串的
24 data_list = jd['comments']
~\anaconda3\lib\json\__init__.py in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
355 parse_int is None and parse_float is None and
356 parse_constant is None and object_pairs_hook is None and not kw):
--> 357 return _default_decoder.decode(s)
358 if cls is None:
359 cls = JSONDecoder
~\anaconda3\lib\json\decoder.py in decode(self, s, _w)
335
336 """
--> 337 obj, end = self.raw_decode(s, idx=_w(s, 0).end())
338 end = _w(s, end).end()
339 if end != len(s):
~\anaconda3\lib\json\decoder.py in raw_decode(self, s, idx)
353 obj, end = self.scan_once(s, idx)
354 except StopIteration as err:
--> 355 raise JSONDecodeError("Expecting value", s, err.value) from None
356 return obj, end
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
答
反爬措施吧.返回的不再是JSON数据格式了.debug看下