python+xpath+requests爬取维基百科“历史上的今天”

# NOTE(review): this snippet is a damaged paste and is NOT runnable as shown:
#   * it begins mid-statement ("a')"); the imports and the statement that
#     creates `fhout` are not visible here -- presumably an open(..., 'a')
#     call, TODO confirm against the original post;
#   * all line breaks and indentation were lost, so the loop-body boundaries
#     below are inferred, not certain;
#   * the XPath literal assigned to `records` is corrupted ('//div[@>) and
#     must be restored from the original before the script can work.
#
# What the visible statements do:
#   * Start from 2016-01-01 and add day offsets 196..365, i.e. 2016-07-15
#     through 2016-12-31 (2016 is a leap year).
#   * For each date, build the page title "<month>月<day>日", percent-encode
#     it with urllib.parse.quote, append it to `baseurl`, and fetch the page
#     with requests.get.
#   * Parse the HTML with etree.HTML and read the sub-entry texts of the first
#     table-of-contents section (class "toclevel-1 tocsection-1"); their count
#     is `num`.
#   * Write "data_<month>_<day>=[" to `fhout`, then walk i = num..1 and, for
#     each `records` node list in reverse order, append the node's string
#     value to `content` and write it to `fhout` wrapped in single quotes
#     (the text is not escaped), followed by "] ".
#   * Print `content`, append it to `contents`, and finally close `fhout`.
#
# NOTE(review): the inner "for i in range(num, 0, -1)" reuses the outer loop
# variable name `i`; consider renaming it when the script is restored.
# NOTE(review): requests.get is called without a timeout or status check.
a') baseurl = 'https://zh.wikipedia.org/wiki/' begin_date = datetime.datetime.strptime('2016-01-01', "%Y-%m-%d") contents=[] for i in range(196,366): content = [] mid_date = begin_date + datetime.timedelta(days=i) thedate = str(mid_date.month) + '月' + str(mid_date.day) + '日' print(thedate) urlthedate = urllib.parse.quote(thedate) url = baseurl + urlthedate print(url) html = requests.get(url).text.encode("utf-8") tree = etree.HTML(html) ul = tree.xpath('//li[@class="toclevel-1 tocsection-1"]/ul/li/a/span[@class="toctext"]/text()') num = len(ul) fhout.write("data_"+str(mid_date.month)+"_"+str(mid_date.day)+"=[") for i in range(num, 0, -1): records = tree.xpath('//div[@>) ulen = len(records) for j in range(ulen-1,-1,-1): content.append(records[j].xpath('string(.)')) fhout.write("'"+records[j].xpath('string(.)')+"' ") fhout.write("] ") print(content) contents.append(content) fhout.close()

相关推荐