2018-8-10 Crawler course, day 4
- Scraping British and American TV series from Douban Movies (homework)
import json

import requests


class GetDouBanMovies(object):
    __instance = None

    def __new__(cls, *args, **kwargs):
        # Singleton: create the instance only on the first call, then reuse it
        if cls.__instance is None:
            cls.__instance = super().__new__(cls)
        return cls.__instance

    def __init__(self):
        self.temp_urls = [
            {
                "url": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_english_hot/items?os=ios&for_mobile=1&start={}&count=18&loc_id=108288&_=1533811595869",
                "Referer": "https://m.douban.com/tv/british"
            },
            {
                "url": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_american_hot/items?os=ios&for_mobile=1&start={}&count=18&loc_id=108288&_=1533818802677",
                "Referer": "https://m.douban.com/tv/american"
            }
        ]
        self.headers = {
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko)"
                          " Version/11.0 Mobile/15A372 Safari/604.1",
            "Referer": "https://m.douban.com/tv/british"
        }

    def __prase_url(self, url):
        response = requests.get(url, headers=self.headers, timeout=5)
        assert response.status_code == 200
        html_str = response.content.decode()
        ret = json.loads(html_str)
        # The movie entries live under the "subject_collection_items" key
        self.subject_collection_items = ret.get("subject_collection_items")
        return self.subject_collection_items

    def prase_url(self, url):
        try:
            movies_lists = self.__prase_url(url)
        except Exception:
            movies_lists = None
        return movies_lists

    def write_str(self, movies_lists):
        # Open the file once and append one JSON object per line
        with open("./british2/read.txt", "a", encoding="utf-8") as f:
            for temp_movie in movies_lists:
                f.write(json.dumps(temp_movie, ensure_ascii=False))
                f.write("\n")

    def run(self):
        for next_url in self.temp_urls:
            start_num = 0
            self.headers["Referer"] = next_url["Referer"]
            while True:
                url = next_url["url"].format(start_num)
                print(url)
                movies_lists = self.prase_url(url)
                if not movies_lists:  # request failed or no more data
                    break
                self.write_str(movies_lists)
                # A full page holds 18 items; a shorter page is the last one
                if len(movies_lists) != 18:
                    break
                start_num += 18


if __name__ == "__main__":
    s1 = GetDouBanMovies()
    s1.run()
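- Because __new__ caches the instance, constructing the class twice yields the same object. A quick sanity check of the singleton behavior (not part of the original homework):
a = GetDouBanMovies()
b = GetDouBanMovies()
print(a is b)  # True: both names refer to the single cached instance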
- Regular expressions
- Three methods of the re module (a short sketch of sub and compile follows the examples below)
- re.findall()  # returns a list of all matches
- re.sub()      # returns the string with matches replaced
- re.compile()  # pre-compiles the pattern to improve matching efficiency
- Default mode
import re
string_a = '''<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<meta http-equiv="content-type" content="text/html;charset=utf-8">
<meta content="always" name="referrer">
<meta name="theme-color" content="#2932e1">'''
ret = re.findall(r'<.*>', string_a)
print(ret)
- Result (by default "." does not match newlines, so each line produces its own match)
['<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">', '<meta http-equiv="content-type" content="text/html;charset=utf-8">', '<meta content="always" name="referrer">', '<meta name="theme-color" content="#2932e1">']
- re.S mode (DOTALL: "." matches newlines too)
string_a = '''<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<meta http-equiv="content-type" content="text/html;charset=utf-8">
<meta content="always" name="referrer">
<meta name="theme-color" content="#2932e1">'''
ret = re.findall(r'<.*>', string_a, re.S)
print(ret)
- Result (the greedy ".*" now spans line breaks, so the whole block is one match)
['<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<meta http-equiv="content-type" content="text/html;charset=utf-8">
<meta content="always" name="referrer">
<meta name="theme-color" content="#2932e1">']
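- The examples above only exercise re.findall(). A minimal sketch of the other two methods from the list (the pattern and input strings here are illustrative):
import re
pattern = re.compile(r'\d+')             # compile once, reuse many times
print(pattern.findall('a1b22c333'))      # ['1', '22', '333']
print(re.sub(r'\d+', '#', 'a1b22c333'))  # 'a#b#c#', sub returns a new string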
- Raw strings in Python
- The r prefix turns off escape processing: a special sequence stands for its literal characters (e.g. r"\n" is a backslash plus the letter n, not a newline), which is why regex patterns are usually written as raw strings.
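- A two-line illustration of the difference:
print(len('\n'))   # 1: a single newline character
print(len(r'\n'))  # 2: a backslash followed by the letter n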
- Processing HTML pages with XPath
from lxml import etree

text = """
<div> <ul>
<li class="item-1"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul> </div>
"""
html = etree.HTML(text)
li_lists = html.xpath("//li")  # a list of Element objects, e.g. [<Element li at 0x7fc520c7d088>, ...]
item = {}
for li in li_lists:
    # extract the href attribute value, guarding against a missing <a> tag
    key = li.xpath("./a/@href")[0] if li.xpath("./a/@href") else None
    # extract the link text
    value = li.xpath("./a/text()")[0] if li.xpath("./a/text()") else None
    item[key] = value
print(item)
handled_html_str = etree.tostring(html).decode()  # convert the Element tree back to a string
print(handled_html_str)
- Summary:
- etree.HTML(text) parses the input string and repairs it into well-formed HTML (closing open tags, adding the missing wrapper elements);
- etree.tostring(html).decode() converts an Element object back into a string;
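- A minimal sketch of the repair behavior (the broken fragment is invented for illustration):
from lxml import etree
broken = "<div><p>unclosed paragraph<li>stray item</div>"
print(etree.tostring(etree.HTML(broken)).decode())
# lxml wraps the fragment in <html><body>...</body></html> and closes the open tags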
- Scraping data from Guokr (guokr.com)
import json
import re

import requests


class GetNews(object):
    __instance = None

    def __new__(cls, *args, **kwargs):
        # Singleton, same pattern as GetDouBanMovies above
        if cls.__instance is None:
            cls.__instance = super().__new__(cls)
        return cls.__instance

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko)"
                          " Version/11.0 Mobile/15A372 Safari/604.1",
        }

    def __prase_url(self, url):
        response = requests.get(url, headers=self.headers, timeout=5)
        assert response.status_code == 200
        html_str = response.content.decode()
        # Each question looks like <h2><a target="_blank" href="...">title</a></h2>;
        # the two groups capture (href, title) pairs
        news_lists = re.findall(r"""<h2><a target="_blank" href="(.*?)">(.*?)</a></h2>""", html_str)
        return news_lists

    def prase_url(self, url):
        try:
            news_lists = self.__prase_url(url)
        except Exception:
            news_lists = None
        return news_lists

    def write_str(self, news_lists):
        # Open the file once and append one JSON object per line
        with open("./news/read.txt", "a", encoding="utf-8") as f:
            for temp_news in news_lists:
                f.write(json.dumps(temp_news, ensure_ascii=False))
                f.write("\n")

    def run(self):
        # Alternative: a `while True` loop that increments page and breaks once page > 100
        for page in range(1, 5):
            print(page)
            url = "https://www.guokr.com/ask/highlight/?page={}".format(page)
            print(url)
            news_lists = self.prase_url(url)
            if news_lists:  # skip pages whose request failed
                self.write_str(news_lists)


if __name__ == "__main__":
    n1 = GetNews()
    n1.run()
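- Note: because the pattern in __prase_url has two capture groups, re.findall() returns a list of (href, title) tuples rather than whole matches. A standalone illustration (the HTML snippet is invented):
import re
html_str = '<h2><a target="_blank" href="/ask/1/">Why is the sky blue?</a></h2>'
print(re.findall(r'<h2><a target="_blank" href="(.*?)">(.*?)</a></h2>', html_str))
# [('/ask/1/', 'Why is the sky blue?')]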