#-*-coding:utf-8-*-
# @Time :2021/4/22 7:08
# @Author:shuaichao
# @File :.py
# @Software: PyCharm
from bs4 import BeautifulSoup  # HTML parsing and data extraction
import re  # regular expressions
import urllib.request, urllib.error  # build requests and fetch page data
import pymysql
import traceback
import time
import requests
import json
# Fetch a page with a browser User-Agent and return its HTML as a string.
def askUrl(url):
    head = {
# "Cookie": "pgv_pvid = 2445437098;RK = IWJFENCj / 2;ptcz = 0dc31e9c452a0701259378ea4d93881f2a4d4ab7d29d637d6da1b0b24d857f4c;Qs_lvt_323937 = 1588214559;Qs_pv_323937 = 3783410537228747000;pgv_pvi = 5491528704;eas_sid = t196y05258V4B6g478m7t073P2;luin = o0775929901;lskey = 000100001264ed0bece633b72b741fb54e5137a729bfa3647db8a18c0ee96579fd05aff03206e6cafbeb0f88",
# "Connection": "keep-alive",
# "Cache-Control": "max-age = 0",
# "Accept-Language": "zh - CN, zh;q = 0.9",
# "Accept-Encoding": "gzip, deflate, br",
# "Accept": "text / html, application / xhtml + xml, application / xml;q = 0.9, image / webp, image / apng, * / *;q = 0.8",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
}
if __name__ == '__main__':
request = urllib.request.Request(url, headers=head)
html = ""
try:
response = urllib.request.urlopen(request)
html = response.read().decode("utf-8")
except urllib.error.URLError as e:
if hasattr(e,"code"):
print(e.code)
if hasattr(e,"reason"):
print(e.reasen)
return html
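
# Note: requests is already imported below, so the same fetch could also be
# written roughly as follows (a sketch only; this helper is not used anywhere
# else in the script, and the header dict name is illustrative):
#
#   def ask_url_requests(url):
#       resp = requests.get(url, headers=head, timeout=10)
#       resp.encoding = "utf-8"
#       return resp.text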
# Download a page and return it as a BeautifulSoup document.
def get_info(baseurl):
    html = askUrl(baseurl)
    bs = BeautifulSoup(html, "html.parser")
    return bs

# Find elements by class with BeautifulSoup and also return them as a string.
def transport(bs, info):
    ex_info = bs.find_all(class_=info)
    info = str(ex_info)
    return ex_info, info
# Open a MySQL connection and return the connection together with a cursor.
def get_conn():
    conn = pymysql.connect(
        host="localhost",
        user="root",
        passwd="qwer1234",
        db="news",
        charset="utf8mb4"
    )
    cursor = conn.cursor()
    return conn, cursor
# Close the database cursor and connection.
def close_conn(conn, cursor):
    if cursor:
        cursor.close()
    if conn:
        conn.close()
# Insert the collected news records into the database.
def update_news(allinfo):
    cursor = None
    conn = None
    try:
        conn, cursor = get_conn()
        sql = "insert into new(title, article, type) values(%s,%s,%s)"
        print(f"{time.asctime()} start updating the latest data")
        for item in allinfo:
            cursor.execute(sql, item)
        conn.commit()
        print(f"{time.asctime()} finished updating the latest data")
    except Exception:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
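
# A rough sketch of the `new` table that update_news() writes into, inferred
# only from the INSERT statement above; the column types and sizes are
# assumptions and should be adjusted to the real schema:
#
#   CREATE TABLE IF NOT EXISTS new (
#       id INT AUTO_INCREMENT PRIMARY KEY,
#       title VARCHAR(255),
#       article TEXT,
#       type VARCHAR(50)
#   ) DEFAULT CHARSET = utf8mb4;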
if __name__=="__main__":
head = {
# "Cookie": "pgv_pvid = 2445437098;RK = IWJFENCj / 2;ptcz = 0dc31e9c452a0701259378ea4d93881f2a4d4ab7d29d637d6da1b0b24d857f4c;Qs_lvt_323937 = 1588214559;Qs_pv_323937 = 3783410537228747000;pgv_pvi = 5491528704;eas_sid = t196y05258V4B6g478m7t073P2;luin = o0775929901;lskey = 000100001264ed0bece633b72b741fb54e5137a729bfa3647db8a18c0ee96579fd05aff03206e6cafbeb0f88",
# "Connection": "keep-alive",
# "Cache-Control": "max-age = 0",
# "Accept-Language": "zh - CN, zh;q = 0.9",
# "Accept-Encoding": "gzip, deflate, br",
# "Accept": "text / html, application / xhtml + xml, application / xml;q = 0.9, image / webp, image / apng, * / *;q = 0.8",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
}
#存放所有的新闻网址
linkall = []
#所有存放新闻的.js文件
linkJQ = []
#所有超链接id
Linkid = []
#所有超链接Authorid
LinkAid = []
#存放所有标题
allTitle = []
#存放所有文章
allArticle = []
#存放所有图片链接
allImg = []
#汇总所有存入mysql的数据
allinfo = []
#制作每个js网页的链接
for i in range(1,10):
linkJQ.append('https://cis.sohu.com/cis/feeds?callback=jQuery112404940224114573859_1619226100800&clientType=3&suv=2011032041009993&pvId=1619226100991dZepSty&sceneParam=%5B%7B%22page%22%3A'+str(i)+'%2C%22size%22%3A24%2C%22spm%22%3A%22smpc.travel-home.feed%22%7D%5D&refererSpm=smpc.travel-home.feed&refererPath=%2F')
res = requests.get(linkJQ[i-1], headers=head)
response_data = json.loads(res.text.replace('jQuery112404940224114573859_1619226100800(', '')[:-1])
#存入每个新闻的id和authorid
for index, value in enumerate(response_data['smpc.travel-home.feed']['data']):
if int(response_data['smpc.travel-home.feed']['data'][index]['resourceData']['id']) > 1000000:
Linkid.append(response_data['smpc.travel-home.feed']['data'][index]['resourceData']['id'])
LinkAid.append(str(response_data['smpc.travel-home.feed']['data'][index]['resourceData']['contentData']['authorId']))
#制作旅游新闻所有网址
for index,value in enumerate(Linkid):
linkall.append('https://www.sohu.com/a/'+str(Linkid[index])+'_'+str(LinkAid[index])+'?scm=1004.768163804164063232.0.0.4162&spm=smpc.travel-home.feed.5.1619267001122I92VC4c')
#最后一个链接是广告,删除
linkall.pop()
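    # Judging from the keys accessed above, the feed response (after the JSONP
    # wrapper is stripped) is expected to look roughly like this sketch; the
    # real payload carries many more fields:
    #
    #   {"smpc.travel-home.feed": {"data": [
    #       {"resourceData": {"id": 123456789,
    #                         "contentData": {"authorId": 987654321}}},
    #       ...
    #   ]}}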
    # start scraping the main data
    for index, value in enumerate(linkall):
        bs = get_info(value)
        title = bs.select("h1")
        article = bs.select("article > p")
        if title and article:
            article_text = ''
            # concatenate the paragraphs of the article body
            for item in range(1, len(article)):
                article_text += article[item].get_text()
            # utf8mb4 characters can take up to 4 bytes, so estimate the byte size
            if len(article_text) * 4 > 21000:
                print("Exceeds the storable length")
                continue
            # article = article[0].get_text().replace("返回搜狐,查看更多", "").replace("责任编辑:", "").replace("\n", "")
            # add the article body to the overall article list
            allArticle.append(article_text.replace("返回搜狐,查看更多", "").replace("责任编辑:", ""))
            # add the title to the overall title list
            allTitle.append(title[0].get_text().strip().replace("原创", "").replace("\n", ""))
            print(index)
            print(value)
            print(title[0].get_text().strip().replace("原创", ""))
            # add the image links to the overall image list
            # ex_info, info = transport(bs, "ql-align-center")
            # findImg = re.compile(r'<p class="ql-align-center"><img max-width="600" src="(.*?)"/></p>')
            # Img = re.findall(findImg, info)
            # if Img:
            #     allImg.append(Img)
            # else:
            #     allImg.append("")
        else:
            # no usable title or article body on this page, skip it
            print(index)
            print(value)
    # for item in linkall:
    #     allinfo.append([item])
    # assemble (title, article, type) rows; the type column is fixed to '旅游' (travel)
    for index, value in enumerate(allTitle):
        allinfo.append([value])
        allinfo[index].append(allArticle[index])
        allinfo[index].append('旅游')
    for item in allinfo:
        print(item)
    update_news(allinfo)