python爬取京东商品评论,数据稍微多点就出现报错JSONDecodeError Traceback (most recent call last)

python爬取京东商品评论,数据稍微多点就出现报错JSONDecodeError   Traceback (most recent call last)

问题描述:

爬取得数据稍微多一点就会报错
代码

#程序文件Pex3_r_5.py
#爬取京东产品评论
# 导入必要的包
import requests
import json  #JSON(JavaScript Object Notation) 是一种轻量级的数据交换格式,易于人阅读和编写
import pandas as pd
# header这个的作用在于伪装成浏览器进行操作,有些网页识别到不是浏览器就不能访问,User-Agent能伪装
# User-Agent可以用不同个,一般在刚刚找网页网址url的Headers的下面就有,当然也可以使用手机的,可网页搜索找到不同的User-Agent,都能进行相应操作
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"}
b=[]
for page in range(0,1999+1):
    #https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=100018640796&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1
    url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=64119968113&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1'.format(page)
    # 我们可以简单的解析这个网址,前面不动,后面的我们点击下一页,看会出现什么改变
    #https://rate.tmall.com/list_detail_rate.htm?itemId=40522681118&spuId=932989129&sellerId=2106754117&order=3&currentPage=1&append=0&content=1&tagId=&posi=&picture=&groupId=&ua=098%23E1hvEvvnvPOvUpCkvvvvvjiWPFLwljEmRFFwAjEUPmPh0jrRRFdhgjn8RLzwljrnnUvCvCLNtAxG4DdNzY%2F3SP1fJYYIzYSw54QCvvyvvOA8tvvvyXmVvpvhvUCvpvgCvvLMMQvvuvhvmvvvpLdyf92nkvhvC9hvpyPyAb9Cvm9vvhCvvvvvvvvvp09vvvjUvvCHhQvv9pvvvhZLvvvCfvvvBBWvvvH%2BmvhvLv8K8A%2BaaXgArqhl7E7tRu0Adch%2Ba4mAdBkK4Z7xfXeK5dUfbjxrV8g7rjlUzn97RqJ6EvLvqbVQKfE9ZKFE%2BFuTRogRiNoAdXKKN6wvvpvVvUCvpvvvRvhvCvvvphmevpvhvvCCB29CvvpvvhCvdvhvmpmvjC9gvvmwNQ%3D%3D&needFold=0&_ksTS=1628759080568_464&callback=jsonp465
    #https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=5225346&score=0&sortType=5&page=1&pageSize=10&isShadowSku=0&rid=0&fold=1
    # 我们发现只有page在变化,根据这个我们可以进行翻页爬取,我们先进行第一页的操作
    # 先向浏览器发送请求
    response = requests.get(url, headers=header)
    data = response.text
    # 解码 JSON 数据。该函数返回 Python 字段的数据类型。
    jd = json.loads(data.lstrip('fetchJSON_comment98(').rstrip(');'))
    #用于去掉左边和右边字符串的
    data_list = jd['comments']
    for data in data_list:
        buyer_id = data['id']  # 评论买家id
        content = data['content'] # 评论内容
        time = data['creationTime']  # 评论时间
        #print(content)
        #直接写入记事本文本文件
        #f1=open("1.txt","a")
        #f1.writelines(str(buyer_id) + content +"\n")
        b.append([buyer_id,content,time])
c=pd.DataFrame(b,columns=["id","content","creationTime"])
#print(b)
f=pd.ExcelWriter('reuqest6.xlsx')  #创建文件对象
c.to_excel(f,"sheet1")  #把c写入Excel文件
#c.to_excel(f,"sheet2")  #c再写入另一个表单中
f.save()


运行结果及报错内容

JSONDecodeError                           Traceback (most recent call last)
<ipython-input-23-5415769acf43> in <module>
     20     data = response.text
     21     # 解码 JSON 数据。该函数返回 Python 字段的数据类型。
---> 22     jd = json.loads(data.lstrip('fetchJSON_comment98(').rstrip(');'))
     23     #用于去掉左边和右边字符串的
     24     data_list = jd['comments']

~\anaconda3\lib\json\__init__.py in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    355             parse_int is None and parse_float is None and
    356             parse_constant is None and object_pairs_hook is None and not kw):
--> 357         return _default_decoder.decode(s)
    358     if cls is None:
    359         cls = JSONDecoder

~\anaconda3\lib\json\decoder.py in decode(self, s, _w)
    335 
    336         """
--> 337         obj, end = self.raw_decode(s, idx=_w(s, 0).end())
    338         end = _w(s, end).end()
    339         if end != len(s):

~\anaconda3\lib\json\decoder.py in raw_decode(self, s, idx)
    353             obj, end = self.scan_once(s, idx)
    354         except StopIteration as err:
--> 355             raise JSONDecodeError("Expecting value", s, err.value) from None
    356         return obj, end

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

反爬措施吧.返回的不再是JSON数据格式了.debug看下