# Extracting information
# Call the string attribute to get a node's text value
# Use the name attribute to get a node's name
# Call attrs to get all attributes of an HTML node
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister">Lacie</a> and
<a href="http://example.com/tillie" class="sister">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html, 'lxml')

print(soup.title.name)  # select the title node, then call name to get the node's name
# Output: title
print(soup.title.string)  # call string to get the title node's text value
# Output: The Dormouse's story

print(soup.p.attrs)  # call attrs to get all attributes of the p node
# Output: {'class': ['title'], 'name': 'dromouse'}

print(soup.p.attrs['name'])  # get the name attribute
# Output: dromouse
print(soup.p['name'])  # get the name attribute
# Output: dromouse
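# Supplementary note (not from the original post): string only returns text when a node has exactly one child; if a tag contains several children (text plus sub-tags), string returns None and get_text() is the safer choice. A minimal sketch with a made-up fragment:
from bs4 import BeautifulSoup

html = '<p>Hello <b>world</b></p>'  # <p> has two children: a text node and a <b> tag
soup = BeautifulSoup(html, 'lxml')

print(soup.p.string)      # None, because <p> has more than one child
print(soup.b.string)      # world, <b> has a single text child
print(soup.p.get_text())  # Hello world, the concatenated text of all descendants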
# Nested selection
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
"""

soup = BeautifulSoup(html, 'lxml')
print(soup.head.title)
print(type(soup.head.title))
print(soup.head.title.string)

# Output:
<title>The Dormouse's story</title>
<class 'bs4.element.Tag'>
The Dormouse's story
# Associated selection
# 1. Child and descendant nodes
# The contents attribute returns a list of the node's direct child nodes.
from bs4 import BeautifulSoup

html = """
<html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<!-- Elsie -->
</a>
,
<a class="sister" href="http://example.com/lacie" id="link2">
Lacie
</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">
Tillie
</a>
;
and they lived at the bottom of a well.
</p>
<p class="story">
...
</p>
</body>
</html>
"""

soup = BeautifulSoup(html, 'lxml')
# After selecting a node element, call its contents attribute to get its direct children
print(soup.p.contents)

# Output:
['
Once upon a time there were three little sisters; and their names were
', <a class="sister" href="http://example.com/elsie" id="link1">
<!-- Elsie -->
</a>, '
,
', <a class="sister" href="http://example.com/lacie" id="link2">
Lacie
</a>, '
and
', <a class="sister" href="http://example.com/tillie" id="link3">
Tillie
</a>, '
;
and they lived at the bottom of a well.
']
# The result is a list whose elements are the selected node's direct children (grandchild nodes are not included)
Direct child nodes: the contents attribute
# The children attribute returns an iterator. It contains the same nodes as contents; only the return type differs.
from bs4 import BeautifulSoup

html = """
<html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
,
<a class="sister" href="http://example.com/lacie" id="link2">
Lacie
</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">
Tillie
</a>
;
and they lived at the bottom of a well.
</p>
<p class="story">
...
</p>
</body>
</html>
"""

soup = BeautifulSoup(html, 'lxml')
print(soup.p.children)  # Output: <list_iterator object at 0x1159b7668>
for i, child in enumerate(soup.p.children):
    print(i, child)


# Output of the for loop:
0
Once upon a time there were three little sisters; and their names were

1 <a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
2
,

3 <a class="sister" href="http://example.com/lacie" id="link2">
Lacie
</a>
4
and

5 <a class="sister" href="http://example.com/tillie" id="link3">
Tillie
</a>
6
;
and they lived at the bottom of a well.

Direct child nodes: the children attribute
# The descendants attribute walks all children recursively, yielding every descendant node.
from bs4 import BeautifulSoup

html = """
<html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
,
<a class="sister" href="http://example.com/lacie" id="link2">
Lacie
</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">
Tillie
</a>
;
and they lived at the bottom of a well.
</p>
<p class="story">
...
</p>
</body>
</html>
"""

soup = BeautifulSoup(html, 'lxml')
print(soup.p.descendants)  # Output: <generator object Tag.descendants at 0x1131d0048>
for i, child in enumerate(soup.p.descendants):
    print(i, child)


# Output of the for loop:
0
Once upon a time there were three little sisters; and their names were

1 <a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
2

3 <span>Elsie</span>
4 Elsie
5

6
,

7 <a class="sister" href="http://example.com/lacie" id="link2">
Lacie
</a>
8
Lacie

9
and

10 <a class="sister" href="http://example.com/tillie" id="link3">
Tillie
</a>
11
Tillie

12
;
and they lived at the bottom of a well.

Getting all descendant nodes
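# Supplementary sketch (not part of the original example): descendants also yields plain-text NavigableString nodes, so it is common to keep only Tag elements with isinstance. The fragment below is made up for illustration:
from bs4 import BeautifulSoup
from bs4.element import Tag

html = '<p><a href="http://example.com/elsie"><span>Elsie</span></a></p>'
soup = BeautifulSoup(html, 'lxml')

tags_only = [node for node in soup.p.descendants if isinstance(node, Tag)]
print(tags_only)
# [<a href="http://example.com/elsie"><span>Elsie</span></a>, <span>Elsie</span>]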
# 2. Parent and ancestor nodes
from bs4 import BeautifulSoup

html = """
<html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
</p>
<p class="story">
...
</p>
</body>
</html>
"""

soup = BeautifulSoup(html, 'lxml')
print(soup.a.parent)


# Output:
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
</p>
parent: get a node's direct parent node
from bs4 import BeautifulSoup

html = """
<html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
</p>
<p class="story">
...
</p>
</body>
</html>
"""

soup = BeautifulSoup(html, 'lxml')
print(soup.a.parents, type(soup.a.parents), list(enumerate(soup.a.parents)), sep='\n')


# Output:
<generator object PageElement.parents at 0x11c76e048>

<class 'generator'>

[(0, <p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
</p>), (1, <body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
</p>
<p class="story">
...
</p>
</body>), (2, <html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
</p>
<p class="story">
...
</p>
</body>
</html>), (3, <html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
</p>
<p class="story">
...
</p>
</body>
</html>
)]
parents: get all ancestor nodes
# This uses the built-in enumerate() function
# enumerate() pairs each element of an iterable (such as a list, tuple, or string) with its index, yielding both the index and the value; it is usually used in a for loop.
a = ["恕", "我", "直", "言", "在", "坐", "的", "各", "位", "都", "是", "爱", "学", "习", "的"]
print(a)  # Output: ['恕', '我', '直', '言', '在', '坐', '的', '各', '位', '都', '是', '爱', '学', '习', '的']
b = enumerate(a)
print(enumerate(a))  # Output: <enumerate object at 0x11a1f8b40>
print(list(b))
# [(0, '恕'), (1, '我'), (2, '直'), (3, '言'), (4, '在'), (5, '坐'), (6, '的'), (7, '各'), (8, '位'), (9, '都'),
# (10, '是'), (11, '爱'), (12, '学'), (13, '习'), (14, '的')]

for m, n in enumerate(a):
    print(m, n)
# Output of the for loop:
0 恕
1 我
2 直
3 言
4 在
5 坐
6 的
7 各
8 位
9 都
10 是
11 爱
12 学
13 习
14 的
The enumerate() built-in function
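# A small supplementary sketch (not from the original post): enumerate() also accepts a start argument, so counting can begin at a value other than 0
a = ["恕", "我", "直", "言"]
for m, n in enumerate(a, start=1):
    print(m, n)
# Output of the for loop:
# 1 恕
# 2 我
# 3 直
# 4 言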
# 3. Sibling nodes
from bs4 import BeautifulSoup

html = """
<html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
,
<a class="sister" href="http://example.com/lacie" id="link2">
Lacie
</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">
Tillie
</a>
;
and they lived at the bottom of a well.
</p>
<p class="story">
...
</p>
</body>
</html>
"""

soup = BeautifulSoup(html, 'lxml')
print(
    # get the next sibling element
    {'Next Sibling': soup.a.next_sibling},
    # get the previous sibling element
    {'Previous Sibling': soup.a.previous_sibling},
    # return all following sibling elements
    {'Next Siblings': list(enumerate(soup.a.next_siblings))},
    # return all preceding sibling elements
    {'Previous Siblings': list(enumerate(soup.a.previous_siblings))},

    sep='\n'
)


# Output:
{'Next Sibling': '
,
'}

{'Previous Sibling': '
Once upon a time there were three little sisters; and their names were
'}

{'Next Siblings': [(0, '
,
'), (1, <a class="sister" href="http://example.com/lacie" id="link2">
Lacie
</a>), (2, '
and
'), (3, <a class="sister" href="http://example.com/tillie" id="link3">
Tillie
</a>), (4, '
;
and they lived at the bottom of a well.
')]}

{'Previous Siblings': [(0, '
Once upon a time there were three little sisters; and their names were
')]}
Getting sibling nodes
# 4. Extracting information
from bs4 import BeautifulSoup

html = """
<html>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Bob</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
</p>
</body>
</html>
"""

soup = BeautifulSoup(html, 'lxml')
print(
    'Next Sibling:',

    [soup.a.next_sibling],  # get the next sibling node
    type(soup.a.next_sibling),  # the type of the next sibling node
    # <class 'bs4.element.NavigableString'>
    [soup.a.next_sibling.string],  # get the next sibling node's text content
    sep='\n'
)

print(
    'Parent:',

    [type(soup.a.parents)],  # parents yields all ancestor nodes
    # <class 'generator'>
    [list(soup.a.parents)[0]],  # get the first ancestor node
    # <p class="story">
    # Once upon a time there were three little sisters; and their names were
    # <a class="sister" href="http://example.com/elsie" id="link1">Bob</a>
    # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
    # </p>
    [list(soup.a.parents)[0].attrs['class']],  # get the value of the first ancestor node's class attribute
    # ['story']
    sep='\n'
)

# The results are wrapped in lists so the output is easier to read


# Output:
Next Sibling:
['
']
<class 'bs4.element.NavigableString'>
['
']
Parent:
[<class 'generator'>]
[<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Bob</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
</p>]
[['story']]
Method selectors
-
find_all(name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
# Queries all elements that match the given criteria
from bs4 import BeautifulSoup

html = """
<div>
<ul>
<li class="item-O"><a href="linkl.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
"""

soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(name='li'),
      type(soup.find_all(name='li')[0]),
      sep='\n')


# Output:
[<li class="item-O"><a href="linkl.html">first item</a></li>, <li class="item-1"><a href="link2.html">second item</a></li>, <li class="item-inactive"><a href="link3.html">third item</a></li>, <li class="item-1"><a href="link4.html">fourth item</a></li>, <li class="item-0"><a href="link5.html">fifth item</a>
</li>]

<class 'bs4.element.Tag'>


# The return value is a list whose elements are the matching "li" nodes; each element is of type bs4.element.Tag


# Iterate over the li nodes and query the a node inside each one
from bs4 import BeautifulSoup

html = """
<div>
<ul>
<li class="item-O"><a href="linkl.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
"""

soup = BeautifulSoup(html, 'lxml')
li = soup.find_all(name='li')

for a in li:
    print(a.find_all(name='a'))

# Output:
[<a href="linkl.html">first item</a>]
[<a href="link2.html">second item</a>]
[<a href="link3.html">third item</a>]
[<a href="link4.html">fourth item</a>]
[<a href="link5.html">fifth item</a>]
The name parameter
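# Supplementary sketch (not from the original post): the name argument also accepts a compiled regular expression or a list of tag names; the fragment below is made up for illustration
import re
from bs4 import BeautifulSoup

html = '<div><span>one</span><a href="a.html">two</a><p>three</p></div>'
soup = BeautifulSoup(html, 'lxml')

print(soup.find_all(name=re.compile('^s')))  # tags whose name starts with "s"
# [<span>one</span>]
print(soup.find_all(name=['a', 'p']))  # tags whose name is in the given list
# [<a href="a.html">two</a>, <p>three</p>]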
from bs4 import BeautifulSoup

html = """
<div>
<ul>
<li class="item-O"><a href="linkl.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
"""

soup = BeautifulSoup(html, 'lxml')

print(soup.find_all(attrs={'class': 'item-0'}))
print(soup.find_all(attrs={'href': 'link5.html'}))


# Output:
[<li class="item-0"><a href="link5.html">fifth item</a>
</li>]
[<a href="link5.html">fifth item</a>]

# The attrs parameter lets you query by specific attributes
# find_all(attrs={'attribute name': 'attribute value', ...})
The attrs parameter
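# Supplementary note (not from the original post): common attributes can also be passed directly as keyword arguments; because class is a Python keyword, it is written as class_
from bs4 import BeautifulSoup

html = '<p id="first">hello</p><p class="lead">world</p><p class="lead">again</p>'  # made-up fragment
soup = BeautifulSoup(html, 'lxml')

print(soup.find_all(id='first'))  # equivalent to attrs={'id': 'first'}
# [<p id="first">hello</p>]
print(soup.find_all(class_='lead'))  # class_ avoids clashing with the class keyword
# [<p class="lead">world</p>, <p class="lead">again</p>]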
from bs4 import BeautifulSoup
import re

html = """
<div class="panel">
<div class="panel-body">
<a>Hello, this is a link</a>
<a>Hello, this is a link, too</a>
<div/>
<div/>
"""

soup = BeautifulSoup(html, 'lxml')

# A compiled regular expression object
regular = re.compile('link')

# The text parameter matches a node's text; it accepts either a string or a compiled regular expression
print(soup.find_all(text=regular))

# Plain regex matching against the raw HTML, for comparison
print(re.findall(regular, html))


# Output:
['Hello, this is a link', 'Hello, this is a link, too']
['link', 'link']
The text parameter
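# Supplementary note: since Beautiful Soup 4.4.0 the same argument is also exposed under the name string, with text kept as an alias; a minimal sketch
from bs4 import BeautifulSoup
import re

html = '<a>Hello, this is a link</a><a>Hello, this is a link, too</a>'
soup = BeautifulSoup(html, 'lxml')

print(soup.find_all(string=re.compile('link')))  # equivalent to text=re.compile('link')
# ['Hello, this is a link', 'Hello, this is a link, too']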
-
find(name=None, attrs={}, recursive=True, text=None, **kwargs)
Returns only the first element that matches the given criteria
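# A minimal sketch of find(), reusing a trimmed-down version of the list markup from the find_all examples above: it returns a single Tag (the first match) rather than a list, and None when nothing matches
from bs4 import BeautifulSoup

html = """
<ul>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
</ul>
"""

soup = BeautifulSoup(html, 'lxml')
print(soup.find(name='li'))  # only the first matching li node
# <li class="item-1"><a href="link2.html">second item</a></li>
print(soup.find(attrs={'class': 'item-1'}))  # first node whose class is item-1
# <li class="item-1"><a href="link2.html">second item</a></li>
print(soup.find(name='span'))  # no match, so None is returned
# None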
CSS selectors