# Initialise a PyQuery document from an HTML string.
# (Pasted line-number prefixes removed: they made the statements unparseable
# and were also embedded in the HTML literal.)
html = '''
<div>
<ul>
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''

from pyquery import PyQuery as pq
doc = pq(html)
# Select every <li> element in the document and print the matches.
print(doc('li'))

# Initialise a PyQuery document from a URL.
# NOTE: this performs a live HTTP request, so it needs network access.
from pyquery import PyQuery as pq
doc = pq(url="http://www.baidu.com")
# Print the <head> element of the fetched page.
print(doc("head"))

# Initialise a PyQuery document from a local file.
# NOTE: "demo.html" is resolved relative to the current working directory
# and must exist for this section to run.
from pyquery import PyQuery as pq
doc = pq(filename="demo.html")
# Print every <li> element found in the file.
print(doc('li'))

# Basic CSS selectors.
html = '''
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
# In a selector, an id is prefixed with "#" and a class with ".".
# This matches <li> elements inside .list inside #container.
print(doc('#container .list li'))

# Finding elements.
# Child / descendant elements.
html = '''
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
print(type(items))
print(items)
# find() searches the descendants of the current selection.
lis = items.find('li')
print(type(lis))
print(lis)

# children() returns only the direct children.
lis = items.children()
print(type(lis))
print(lis)

# children() also accepts a selector to filter the direct children.
lis = items.children('.active')
print(lis)

# Parent element.
html = '''
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
# parent() returns the direct parent of the selection (here #container).
container = items.parent()
print(type(container))
print(container)

# Ancestor elements: the same list, now wrapped in an extra .wrap <div>.
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
# parents() returns all ancestors, not just the direct parent.
parents = items.parents()
print(type(parents))
print(parents)

# parents() also accepts a selector to keep only matching ancestors.
parents = items.parents('.wrap')
print(parents)
# Sibling elements.
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
# Note: there is no space between ".item-0" and ".active", so the selector
# requires both classes on the same element (a space would mean "descendant").
li = doc('.list .item-0.active')
print(li.siblings())

# siblings() also accepts a selector to filter the siblings.
print(li.siblings('.active'))

# Traversal.
# Single element: a selection that matches one node can be used directly.
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)

# Multiple elements: iterate over every match.
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
# items() lets us loop over the matches, each one as its own PyQuery object.
lis = doc('li').items()
print(type(lis))
# The loop body must be indented; the pasted original had lost its
# indentation, which is an IndentationError.
for li in lis:
    print(li)

# Extracting information.
# Attributes.
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)
# Two equivalent ways to read an attribute: method call or attribute access.
print(a.attr('href'))
print(a.attr.href)

# Text content of the selected <a>.
print(a.text())

# Inner HTML (reuses the `html` string assigned in the preceding section).
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
# html() returns the markup inside the <li> tag, without the tag itself.
print(li.html())

# DOM manipulation (reuses the `html` string assigned earlier in the script).
# Adding and removing classes.
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.remove_class('active')
print(li)
li.add_class('active')
print(li)

# Setting an attribute and an inline CSS property on the same element.
li.attr('name', 'link')
print(li)
li.css('font-size', '14px')
print(li)

# Removing nodes.
html = '''
<div class = "wrap">
Hello,World
<p>This is a paragraph</p>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
wrap = doc('.wrap')
# Text of .wrap before the <p> is removed.
print(wrap.text())
# Drop the <p> child, then print the text again to show the difference.
wrap.find('p').remove()
print(wrap.text())

# Pseudo-class selectors.
html = '''
<div class = "wrap">
<div id = "container">
<ul class = "list">
<li class = "item-0">first item</li>
<li class = "item-1"><a href = "link2.html">second item</a></li>
<li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
<li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
<li class = "item-0"><a href = "link5.html">fifthth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
# <li> that is the first child of its parent.
li = doc('li:first-child')
print(li)
# <li> that is the last child of its parent.
li = doc('li:last-child')
print(li)
# <li> that is the second child of its parent (:nth-child counts from 1).
li = doc('li:nth-child(2)')
print(li)
# Matches whose index is greater than 2 (:gt counts from 0).
li = doc('li:gt(2)')
print(li)
# <li> at even child positions, counting from 1 (2nd, 4th, ...).
li = doc('li:nth-child(2n)')
print(li)
# <li> whose text contains "second".
li = doc('li:contains(second)')
print(li)