002 requests的使用方法以及xpath和beautifulsoup4提取数据

1、直接使用url、不带headers的请求

import requests

# Fetch the Baidu home page with a bare GET: only the URL, no custom headers.
target_url = 'http://www.baidu.com'
reply = requests.get(target_url)

# `content` is the raw body as bytes, so it must be decoded to get a str.
decoded_body = reply.content.decode()
print(decoded_body)

# `text` hands back the body as a str directly.
text_body = reply.text
print(text_body)

只有url的代码

import requests

# Request the Baidu home page while presenting a desktop-browser User-Agent.
browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}
target_url = 'http://www.baidu.com'

# The request is issued with the GET method.
reply = requests.get(target_url, headers=browser_headers)
sent_request = reply.request

# 1. Headers that were sent with the request
print(sent_request.headers)

# 2. Headers that came back with the response
print(reply.headers)

# 3. HTTP status code
print(reply.status_code)

# 4. Cookies attached to the request (read from the private `_cookies` attribute)
print(sent_request._cookies)

# 5. Cookies carried by the response
print(reply.cookies)

import requests

# The search keyword is written straight into the URL's query string.
search_url = 'http://www.baidu.com/s?wd=你好'
browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

# The request is issued with the GET method.
reply = requests.get(search_url, headers=browser_headers)

# Decode the raw body bytes and print the resulting page source.
print(reply.content.decode())

import requests

# Base URL of the search endpoint. The query string is NOT written here;
# requests builds and percent-encodes it from `params` below.
# NOTE: the path has to be '/s' for the `wd` keyword to be treated as a
# search; with the bare '/' path the request targets the home page instead.
url = 'http://www.baidu.com/s'

# Query-string parameters: `wd` carries the search keyword.
params = {
    'wd': '你好',
}

# Present a desktop-browser User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

# Issue the GET request; the final URL becomes <url>?wd=<encoded keyword>.
response = requests.get(url, headers=headers, params=params)

# `content` is bytes, so decode it before printing.
data = response.content.decode()
print(data)

import requests

url = 'http://baidu.com'

# Present a desktop-browser User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

# Query-string parameters appended to the URL by requests.
params = {
    'wd': '你好'
}

# requests picks the proxy by the scheme of the URL being requested, so an
# 'https'-only mapping is silently ignored for the 'http://' URL above.
# Register the proxy for both schemes, and spell out the proxy's own scheme,
# which requests expects proxy URLs to include.
proxy_url = 'http://153.232.156.201:8080'
free_proxy = {
    'http': proxy_url,
    'https': proxy_url,
}

# Send the request through the proxy.
response = requests.get(url, headers=headers, params=params, proxies=free_proxy)

# `content` is bytes; decode it before printing.
data = response.content
print(data.decode())

import requests

login_url = 'http://iclass.ncut.edu.cn/iclass/'

# Present a desktop-browser User-Agent.
browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

# Form fields submitted in the body of the request.
form_fields = {
    'login': '17159010225',
    'password': '******',
}

# The form is sent with the POST method.
reply = requests.post(login_url, headers=browser_headers, data=form_fields)
page_bytes = reply.content

# Save the raw response bytes unchanged (binary mode, no decoding step).
with open('01 登录界面.html', 'wb') as out_file:
    out_file.write(page_bytes)

import requests
from lxml import etree

url = 'https://www.qiushibaike.com/text/'

# Present a desktop-browser User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

response = requests.get(url, headers=headers)
data = response.content

# 1. Parse the raw HTML bytes into an element tree that supports XPath.
xpath_data = etree.HTML(data)

# 2. Select every <div> directly under the div whose class is "col1".
div_list = xpath_data.xpath('//div[@class="col1"]/div')

for div in div_list:
    # xpath() returns a list of matches, which is empty when this entry does
    # not contain the expected "second <a> / <h2>" author markup.
    author_nodes = div.xpath('.//div[@class="author clearfix"]/a[2]/h2/text()')
    if not author_nodes:
        # Skip entries without a matching author node instead of crashing
        # the whole run with an IndexError on [0].
        continue

    # strip() with no argument removes newlines and tabs as well as spaces,
    # so the name prints cleanly whatever whitespace surrounds the text.
    author = author_nodes[0].strip()

    # Print the author name.
    print(author)

from bs4 import BeautifulSoup

# Sample markup: one <div> wrapping a row of navigation links.
# Adjacent string literals are joined into a single string.
html_doc = (
    ' <div >'
    ' <a href="/" target="_blank" rel="nofollow">热门</a>'
    ' <a href="/hot/" target="_blank">24小时</a>'
    ' <a href="/imgrank/" target="_blank">热图</a>'
    ' <a >文字</a>'
    ' <a href="/history/" target="_blank">穿越</a>'
    ' <a href="/pic/" target="_blank">糗图</a>'
    ' <a href="/textnew/" target="_blank">新鲜</a>'
    ' </div> '
)

# 1. Parse the markup with the lxml parser.
soup = BeautifulSoup(html_doc, 'lxml')

# 2. Pretty-printed form of the document (computed but not printed here).
pretty_markup = soup.prettify()

# 3. Attribute-style access gives the first <a> tag in the document.
first_link = soup.a
print(first_link)

# 4. Text inside that tag.
print(first_link.string)

# 5. Value of one of its attributes.
print(first_link['target'])

from bs4 import BeautifulSoup

# Sample markup: one <div> wrapping a row of navigation links.
# The wrapper carries class="menu" and one link carries id="highlight" so
# that the '.menu' and '#highlight' selectors below have something to match;
# without those attributes the two calls only ever print None and [].
html_doc = (
    ' <div class="menu">'
    ' <a href="/" target="_blank" rel="nofollow">热门</a>'
    ' <a href="/hot/" target="_blank">24小时</a>'
    ' <a href="/imgrank/" target="_blank">热图</a>'
    ' <a id="highlight">文字</a>'
    ' <a href="/history/" target="_blank">穿越</a>'
    ' <a href="/pic/" target="_blank">糗图</a>'
    ' <a href="/textnew/" target="_blank">新鲜</a>'
    ' </div> '
)

# 1. Parse the markup with the lxml parser.
soup = BeautifulSoup(html_doc, 'lxml')

# 2. General-purpose lookup methods.

# find() returns the first tag that satisfies the query.
print(soup.find(name='a'))
print(soup.find(attrs={"target": '_blank'}))

# find_all() returns a list of tag objects; `limit` caps how many are returned.
print(soup.find_all(name='a', limit=3))

# select_one() returns the first tag matching a CSS selector (here: class "menu").
print(soup.select_one('.menu'))

# select() returns a list of every tag matching the CSS selector.
print(soup.select('#highlight'))
print(soup.select('a[target="_blank"]'))

002 requests的使用方法以及xpath和beautifulsoup4提取数据

相关推荐