爬蟲筆記pyquery

原創

2020-06-23 00:23

構建doc

from pyquery import PyQuery as pq

url = "https://news.163.com"
# doc = pq(url) # 傳入url時，會抓取該網址並解析返回的html文檔
html = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>pyquery test</title>
</head>
<body>
    <p class="text">this is test page</p>
    <ul id="list">
        <li id="baidu" class="meta tt"><a href="http://www.baidu.com"></a></li>
        <li id="163" class="meta"><a href="http://www.163.com"></a></li>
        <li id="google"><a href="http://www.google.com"></a></li>
        <li id="fb"><a href="http://www.facebook.com"></a></li>
    </ul>
</body>
</html>'''
doc = pq(html)

屬性選擇器

# 定位id爲baidu的標籤
doc("#baidu")
doc('[id="baidu"]')
doc.find("*[id='baidu']")

樣式選擇器

# 定位class爲meta tt的元素
doc(".meta")(".tt")
doc(".meta").filter(".tt") 
doc(".meta.tt")

鏈式調用

# 定位id爲list的元素下、class同時含meta和tt的li下面的a標籤的href
doc("#list li.meta.tt a").attr.href

多屬性定位

<a id="a" class="b c">
如果定位此標籤可以通過此方法定位
doc("#a.b.c")

僞類選擇器

僞類選擇器可以按元素的類型、狀態或位置(例如 input 的 type、是否 checked、索引)來匹配元素

# 匹配所有按鈕輸入元素和按鈕元素
d = pq(('<div><input type="button"></input><button></button></div>'))
d(':button')
# <input type="button"/> <button/>
# 匹配所有複選框輸入元素
d = PyQuery('<div><input type="checkbox"/></div>')
d('input:checkbox')
# <input>
# 匹配所有被選中(checked)的元素
d = PyQuery('<div><input checked="checked"/></div>')
d('input:checked')
# <input>

contains

匹配包含指定文本的標籤。即使該文本只出現在一個標籤中，也可能返回多個元素，因爲包含該標籤的父(祖先)標籤也會被算進去

doc(":contains('bbb')")

disabled

匹配所有禁用標籤

d = PyQuery('<div><input disabled="disabled"/></div>')
d('input:disabled')
[<input>]

empty

匹配所有不包含其他元素的標籤

d = PyQuery('<div><h1><span>title</span></h1><h2/></div>')
d(':empty')
[<h2>]

eq

按索引匹配

d = PyQuery('<div><h1 class="first"/><h1 class="last"/></div>')
>>> d('h1:eq(0)')
[<h1.first>]
>>> d('h1:eq(1)')
[<h1.last>]

even

匹配偶數索引元素

d = PyQuery('<div><p></p><p class="last"></p></div>')
>>> d('p:even')
[<p>]

odd

匹配奇數索引元素(索引從0開始計算)

first

匹配第一個

d = PyQuery('<div><p class="first"></p><p></p></div>')
>>> d('p:first')
[<p.first>]

gt

匹配索引大於指定值的

d = PyQuery('<div><p class="first"></p><p></p></div>')
>>> d('p:gt(0)')
[<p>]

lt

匹配索引小於指定值的

d = PyQuery('<div><p class="first"></p><p></p></div>')
>>> d('p:lt(1)')
[<p.first>]

has()

匹配含有符合指定選擇器的後代元素的元素；參數是一個選擇器，可以是類/屬性選擇器，也可以是標籤名

>>> from pyquery import PyQuery
>>> d = PyQuery('<div class="foo"><div class="bar"></div></div>')
>>> d('.foo:has(".baz")')
[]
>>> d('.foo:has(".foo")')
[]
>>> d('.foo:has(".bar")')
[<div.foo>]
>>> d('.foo:has(div)')
[<div.foo>]

header

匹配h1-h6標籤

>>> from pyquery import PyQuery
>>> d = PyQuery('<div><h1>title</h1></div>')
>>> d(':header')
[<h1>]

hidden

匹配所有隱藏input標籤

>>> from pyquery import PyQuery
>>> d = PyQuery('<div><input type="hidden"/></div>')
>>> d('input:hidden')
[<input>]

image

匹配所有type爲image的Input標籤

>>> from pyquery import PyQuery
>>> d = PyQuery('<div><input type="image"/></div>')
>>> d('input:image')
[<input>]

input

匹配所有input標籤

last

最後一個

parent

匹配包含其他元素的元素(作爲parent的元素)

pseudo

翻譯僞元素?

password radio reset submit select text

input爲以上類型的
d(‘input:xxxx’)

操縱

each

對每個節點應用函數(apply func on each node)

# 爲所有a標籤添加haha類
doc("a").each(lambda i,e:pq(e).add_class("haha"))

遍歷

如果要選擇帶點(.)的ID，則需要轉義點：

d = pq('<p id="hello.you"><a/></p><p id="test"><a/></p>')
>>> d(r'#hello\.you')

make_links_absolute

將鏈接的相對地址轉成絕對地址

html = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>pyquery test</title>
</head>
<body>
    <h1>test</h1>
    <p class="text">this is test page</p>
    <ul id="list">
        <li id="baidu" class="meta tt"><a href="/lala">bbb</a></li>
        <li id="163" class="meta"><a href="/wawa"></a></li>
        <li id="google" display="hidden"><a href="http://www.google.com"></a></li>
        <li id="fb"><a href="http://www.facebook.com"></a></li>
    </ul>
</body>
</html>'''
doc = pq(html,parser="html")
doc.make_links_absolute(base_url="https://www.baidu.com")

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.