Data Collection: Assignment 6
Assignment 1
Crawl the Douban Movie Top 250 with the requests and BeautifulSoup libraries.
Screenshots
Code
import re
import threading
import urllib.request

import pymysql
import requests
from bs4 import BeautifulSoup


class Douban():
    def start_up(self):
        try:
            self.con = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='axx123123',
                                       charset='utf8', db='mydb')
            self.cursor = self.con.cursor()
            try:
                # drop the table if it already exists
                self.cursor.execute("drop table Douban")
            except:
                pass
            try:
                sql = ("create table Douban (排名 varchar(4),电影名称 varchar(256),导演 varchar(256),主演 varchar(256)"
                       ",上映时间 varchar(256),国家 varchar(256),电影类型 varchar(256),评分 int,评价人数 int,引用 varchar(512),文件路径 varchar(512))")
                self.cursor.execute(sql)
            except:
                pass
        except Exception as err:
            print(err)

    def closeUp(self):
        try:
            self.con.commit()
            self.con.close()
        except Exception as err:
            print(err)

    def insertDB(self, rank, name, director, mainactor, time, country, type, score, rateCount, quote, path):
        try:
            self.cursor.execute("insert into Douban (排名,电影名称,导演,主演,上映时间,国家,电影类型,评分,评价人数,引用,文件路径) "
                                "values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                                (rank, name, director, mainactor, time, country, type, score, rateCount, quote, path))
        except Exception as err:
            print(err)

    def DouBanSpider(self, start_url):
        global threads
        try:
            urls = []
            r = requests.get(start_url, headers=headers)
            html = r.text
            soup = BeautifulSoup(html, 'html.parser')
            movielist = soup.select("ol li")
            for movie in movielist:
                rank = movie.select("em")[0].text
                name = movie.select("div[class='hd'] span")[0].text
                # the director and lead actors are only on the detail page
                detailurl = movie.select("div[class='hd'] a")[0]['href']
                req = requests.get(detailurl, headers=headers)
                html_1 = req.text
                soup_1 = BeautifulSoup(html_1, 'html.parser')
                director = soup_1.select("div[id='info'] a[rel='v:directedBy']")[0].text
                mainactor = soup_1.select("div[id='info'] a[rel='v:starring']")[0].text
                # the info line looks like "1994 / 美国 / 犯罪 剧情"
                time = re.findall(r'\d+.+', movie.select("div[class='bd'] p")[0].text.strip())[0].split("/")[0].strip()
                country = re.findall(r'\d+.+', movie.select("div[class='bd'] p")[0].text.strip())[0].split("/")[1].strip()
                type = re.findall(r'\d+.+', movie.select("div[class='bd'] p")[0].text.strip())[0].split("/")[2].strip()
                score = movie.select("div[class='star'] span[class='rating_num']")[0].text
                rateCount = movie.select("div[class='star'] span")[3].text
                rateCount = re.findall(r'\d+', rateCount)[0]
                if movie.select("p[class='quote'] span"):
                    quote = movie.select("p[class='quote'] span")[0].text
                else:
                    quote = ''
                print(rank, name, director, mainactor, time, country, type, score, rateCount, quote, name + '.jpg')
                self.insertDB(rank, name, director, mainactor, time, country, type, score, rateCount, quote, name + '.jpg')
            # download every poster on the list page, one thread per image
            images = soup.select("img")
            for image in images:
                try:
                    url = image['src']
                    jpgname = image['alt']
                    if url not in urls:
                        urls.append(url)
                        print(url)
                        T = threading.Thread(target=self.download, args=(url, jpgname))
                        T.setDaemon(False)
                        T.start()
                        threads.append(T)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def download(self, url, name):
        try:
            # keep the file extension if the url ends with one (e.g. ".jpg")
            if url[len(url) - 4] == '.':
                ext = url[len(url) - 4:]
            else:
                ext = ''
            req = urllib.request.Request(url, headers=headers)
            data = urllib.request.urlopen(req, timeout=100)
            data = data.read()
            with open('./images/' + name + ext, 'wb') as f:
                f.write(data)
            print("downloaded " + name + ext)
        except Exception as err:
            print(err)


headers = {
    'cookie': 'bid=MVprhlNrC9g; douban-fav-remind=1; __yadk_uid=XLbeJ2b65whlEmF7XR2tyJzVjnr0e7lx; ll="108300"; _vwo_uuid_v2=D2CBDAAF0CB60468FE6426EF13E90F383|a7dcf5048e170aa20a86966b60bd5c21; push_noty_num=0; push_doumail_num=0; __utmv=30149280.22726; __gads=ID=7cf2842854f39a40-22818e09ebc4009c:T=1606274137:RT=1606274137:S=ALNI_Mbwadx0Pb_JPKO7BASVfooEmVTUPQ; dbcl2="227263608:srZZtUzuoiY"; ck=TtEH; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1606296668%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_id.100001.8cb4=1bae15819dcbb00a.1600416620.6.1606296668.1606271160.; _pk_ses.100001.8cb4=*; __utma=30149280.1094101319.1600416624.1606273588.1606296668.8; __utmc=30149280; __utmz=30149280.1606296668.8.6.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=30149280.2.10.1606296668',
    'user-agent': 'Mozilla/5.0'}
threads = []
a = Douban()
a.start_up()
for page in range(0, 10):  # Top 250 = 10 pages of 25 movies
    a.DouBanSpider("https://movie.douban.com/top250?start=" + str(page * 25) + "&filter=")
for t in threads:
    t.join()
a.closeUp()
Reflections
It has been so long since I last used these libraries that I had forgotten almost everything, and I am still a bit rusty with regular expressions. Fields such as the director and lead actors are not on the list page, so each movie's detail page has to be opened to fetch them. After using XPath so much I had also forgotten the CSS selector syntax: [attr=value] matches on an attribute, and descendant nodes are selected with a space, as in the sketch below.
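A minimal BeautifulSoup sketch of those two selector rules; the HTML snippet is invented purely for illustration:

from bs4 import BeautifulSoup

# toy HTML shaped like one Top 250 list entry
html = "<ol><li><div class='hd'><span>肖申克的救赎</span></div></li></ol>"
soup = BeautifulSoup(html, 'html.parser')
# [attr=value] filters by attribute; the space selects descendant nodes
print(soup.select("div[class='hd'] span")[0].text)  # -> 肖申克的救赎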
Assignment 2
Crawl the ShanghaiRanking (软科) university rankings, follow each university's detail link, download and save the university logo, and collect the official website URL, the university profile, and related information.
Screenshots
Code
Myspider.py
import scrapy
import requests
from bs4 import UnicodeDammit
from ..items import GetcollegeItem


class MySpider(scrapy.Spider):
    name = 'mySpider'

    def start_requests(self):
        url = 'https://www.shanghairanking.cn/rankings/bcur/2020'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            collegelist = selector.xpath("//table[@class='rk-table']/tbody/tr")
            for college in collegelist:
                # open the university's detail page
                detailUrl = "https://www.shanghairanking.cn" + college.xpath("./td[@class='align-left']/a/@href").extract_first()
                print(detailUrl)
                req = requests.get(detailUrl)
                req.encoding = 'utf-8'
                text = req.text
                selector_1 = scrapy.Selector(text=text)
                # extract the fields
                sNo = college.xpath("./td[position()=1]/text()").extract_first().strip()
                print(sNo)
                schoolName = selector_1.xpath("//div[@class='univ-name']/text()").extract_first()
                print(schoolName)
                city = college.xpath("./td[position()=3]/text()").extract_first().strip()
                print(city)
                officialUrl = selector_1.xpath("//div[@class='univ-website']/a/text()").extract_first()
                print(officialUrl)
                info = selector_1.xpath("//div[@class='univ-introduce']/p/text()").extract_first()
                print(info)
                mFile = sNo + '.jpg'
                # fetch and save the logo image
                src = selector_1.xpath("//td[@class='univ-logo']/img/@src").extract_first()
                req_1 = requests.get(src)
                image = req_1.content
                picture = open("D:/PycharmProjects/爬虫6/getCollege/getCollege/image/" + mFile, "wb")
                picture.write(image)
                picture.close()
                # fill the item
                item = GetcollegeItem()
                item['sNo'] = sNo if sNo else ""
                item['schoolName'] = schoolName if schoolName else ""
                item['city'] = city if city else ""
                item['officialUrl'] = officialUrl if officialUrl else ""
                item['info'] = info if info else ""
                item['mFile'] = mFile if mFile else ""
                yield item
        except Exception as err:
            print(err)
pipelines.py
import pymysql


class GetcollegePipeline:
    def open_spider(self, spider):
        try:
            self.con = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='axx123123',
                                       charset='utf8', db='mydb')
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            try:
                # drop the table if it already exists
                self.cursor.execute("drop table college")
            except:
                pass
            try:
                sql = ("create table college (sNo int,schoolName varchar(256),city varchar(256),officialUrl varchar(256)"
                       ",info varchar(1024),mFile varchar(256))")
                self.cursor.execute(sql)
            except:
                pass
        except Exception as err:
            print(err)

    def close_spider(self, spider):
        self.con.commit()
        self.con.close()
        print("closed")

    def process_item(self, item, spider):
        try:
            self.cursor.execute("insert into college (sNo,schoolName,city,officialUrl,info,mFile) values (%s,%s,%s,%s,%s,%s)",
                                (item['sNo'], item['schoolName'], item['city'], item['officialUrl'], item['info'], item['mFile']))
        except Exception as err:
            print(err)
        return item
Reflections
I had not reviewed Scrapy much either; during testing I did not even change settings.py. Myspider.py does the scraping and the image downloads, and pipelines.py writes the records to the database. To get text content with XPath you have to add text() inside the XPath expression, and the usage differs somewhat from Selenium's, as the sketch below shows.
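A minimal sketch of that difference; the HTML string is invented for illustration:

import scrapy

# toy fragment shaped like the ranking site's university-name node
sel = scrapy.Selector(text="<div class='univ-name'>清华大学</div>")
# Scrapy's XPath must select the text node explicitly with text()
print(sel.xpath("//div[@class='univ-name']/text()").extract_first())
# Selenium instead locates the element and reads its .text property:
#   driver.find_element_by_xpath("//div[@class='univ-name']").text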
Assignment 3
Use the Selenium framework plus MySQL storage to simulate logging in to the MOOC site (icourse163.org), fetch the information of the courses already taken in my own account, and save it in MySQL.
Screenshots
Code
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pymysql
import time
import datetime
import sqlite3


class MySpider():
    def startUp(self, url):
        # initialize the Chrome browser
        chrome_options = Options()
        # chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(chrome_options=chrome_options)
        self.No = 1
        # initialize the database
        try:
            self.con = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='axx123123',
                                       charset='utf8', db='mydb')
            self.cursor = self.con.cursor()
            try:
                # drop the table if it already exists
                self.cursor.execute("drop table mooc")
            except:
                pass
            try:
                sql = ("create table mooc (Mid int,cCourse varchar(256),cCollege varchar(256),cTeacher varchar(256)"
                       ",cTeam varchar(256),cCount varchar(256),cProcess varchar(256),cBrief varchar(256))")
                self.cursor.execute(sql)
            except:
                pass
        except Exception as err:
            print(err)
        time.sleep(3)
        self.driver.get(url)

    def closeUp(self):
        try:
            self.con.commit()
            self.con.close()
            self.driver.close()
        except Exception as err:
            print(err)

    def insertDB(self, Mid, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief):
        try:
            self.cursor.execute("insert into mooc (Mid, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief) "
                                "values (%s,%s,%s,%s,%s,%s,%s,%s)",
                                (Mid, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief))
        except Exception as err:
            print(err)

    def showDB(self):
        try:
            con = sqlite3.connect("Mooc.db")
            cursor = con.cursor()
            print("%-16s%-16s%-16s%-16s%-16s%-16s%-16s%-16s" % (
                "Mid", "cCourse", "cCollege", "cTeacher", "cTeam", "cCount", "cProcess", "cBrief"))
            cursor.execute("select * from Mooc order by Mid")
            rows = cursor.fetchall()
            for row in rows:
                print("%-16s %-16s %-16s %-16s %-16s %-16s %-16s %-16s" % (
                    row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]))
            con.close()
        except Exception as err:
            print(err)

    def login(self):
        self.driver.maximize_window()
        try:
            # click the login entry
            self.driver.find_element_by_xpath("//div[@id='g-body']/div[@id='app']/div[@class='_1FQn4']"
                                              "/div[@class='_2g9hu']/div[@class='_2XYeR']/div[@class='_2yDxF _3luH4']"
                                              "/div[@class='_1Y4Ni']/div[@role='button']").click()
            time.sleep(3)
            # other login methods
            self.driver.find_element_by_xpath("//span[@class='ux-login-set-scan-code_ft_back']").click()
            time.sleep(3)
            # log in by phone number
            self.driver.find_element_by_xpath("//ul[@class='ux-tabs-underline_hd']/li[position()=2]").click()
            time.sleep(3)
            # switch into the iframe that holds the login form
            zx = self.driver.find_elements_by_tag_name("iframe")[1].get_attribute("id")
            self.driver.switch_to.frame(zx)
            # enter the account
            self.driver.find_element_by_xpath("//div[@class='u-input box']/input[@type='tel']").send_keys("xxxxxxxx")
            time.sleep(1)
            # enter the password
            self.driver.find_element_by_xpath(
                "//div[@class='u-input box']/input[@class='j-inputtext dlemail']").send_keys("xxxxxxxx")
            time.sleep(1)
            # click the login button
            self.driver.find_element_by_xpath("//a[@class='u-loginbtn btncolor tabfocus ']").click()
            time.sleep(3)
            # click "My Courses"
            self.driver.find_element_by_xpath("//div[@class='_3uWA6']").click()
            time.sleep(3)
        except Exception as err:
            print(err)

    def processSpider(self):
        try:
            list = self.driver.find_elements_by_xpath("//div[@class='course-panel-body-wrapper']/div")
            print(list)
            for li in list:
                # click the course card
                li.click()
                time.sleep(2)
                new_tab = self.driver.window_handles[-1]
                self.driver.switch_to.window(new_tab)
                time.sleep(2)
                # click through to the course detail page
                self.driver.find_element_by_xpath("//h4[@class='f-fc3 courseTxt']").click()
                time.sleep(2)
                new_new_tab = self.driver.window_handles[-1]
                self.driver.switch_to.window(new_new_tab)
                id = self.No
                Course = self.driver.find_element_by_xpath("//*[@id='g-body']/div[1]/div/div[3]/div/div[1]/div[1]/span[1]").text
                College = self.driver.find_element_by_xpath("//*[@id='j-teacher']/div/a/img").get_attribute("alt")
                Teacher = self.driver.find_element_by_xpath("//*[@id='j-teacher']/div/div/div[2]/div/div/div/div/div/h3").text
                Teamlist = self.driver.find_elements_by_xpath("//*[@id='j-teacher']/div/div/div[2]/div/div[@class='um-list-slider_con']/div")
                Team = ''
                for name in Teamlist:
                    main_name = name.find_element_by_xpath("./div/div/h3[@class='f-fc3']").text
                    Team += str(main_name) + " "
                Count = self.driver.find_element_by_xpath("//*[@id='course-enroll-info']/div/div[2]/div[1]/span").text
                Count = Count.split(" ")[1]
                # the id values in the next two XPaths were lost when the post was published
                Process = self.driver.find_element_by_xpath("//*[@]/div/div[1]/div[2]/div[1]/span[2]").text
                Brief = self.driver.find_element_by_xpath("//*[@]").text
                time.sleep(2)
                # close the course detail window and go back one window
                self.driver.close()
                pre_tab = self.driver.window_handles[1]
                self.driver.switch_to.window(pre_tab)
                time.sleep(2)
                # close the course window and go back to the course list
                self.driver.close()
                pre_pre_tab = self.driver.window_handles[0]
                self.driver.switch_to.window(pre_pre_tab)
                time.sleep(2)
                self.No += 1
                self.insertDB(id, Course, College, Teacher, Team, Count, Process, Brief)
            try:
                time.sleep(2)
                # go to the next page of courses, if any
                nextpage = self.driver.find_element_by_xpath("//a[@class='th-bk-main-gh']")
                time.sleep(2)
                nextpage.click()
                self.processSpider()
            except:
                self.driver.find_element_by_xpath("//a[@class='th-bk-disable-gh']")
        except Exception as err:
            print(err)

    def executeSpider(self, url):
        starttime = datetime.datetime.now()
        print("Spider starting......")
        self.startUp(url)
        print("Spider processing......")
        self.login()
        self.processSpider()
        print("Spider closing......")
        self.closeUp()
        print("Spider completed......")
        endtime = datetime.datetime.now()
        elapsed = (endtime - starttime).seconds
        print("Total ", elapsed, " seconds elapsed")
url = "https://www.icourse163.org/"
spider = MySpider()
spider.executeSpider(url)
Reflections
Basically the same as the previous assignment; the XPaths on the course detail page barely changed. One thing to note: after logging in, clicking "My Courses" navigates within the current window, so no switch is needed, but clicking a course and then its detail page each opens a new window, so on the way back you have to close twice and switch twice, as sketched below.
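A minimal sketch of that open/close/switch pattern, assuming driver is a live Selenium WebDriver and element is the clickable course entry (both hypothetical names here):

def scrape_in_new_window(driver, element):
    # remember the starting window so we can come back to it
    origin = driver.current_window_handle
    element.click()                                     # opens a new window
    driver.switch_to.window(driver.window_handles[-1])  # focus the newest window
    data = driver.title                                 # ... scrape what is needed ...
    driver.close()                                      # close the detail window
    driver.switch_to.window(origin)                     # switch back explicitly
    return data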