Python 爬蟲學習第四課

Python 爬蟲學習之 PyQuery 庫

PyQuery練習一

# =========== PyQuery exercise 1 ===================
# Sample HTML fixture used by the selector examples below: a <div id="container">
# wrapping a <ul class="list"> with five <li> entries, two of which also carry
# the "active" class.
html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>
'''

from pyquery import PyQuery as pq 
# Build a PyQuery document directly from a URL (this performs a network request).
doc = pq(url='http://www.baidu.com', encoding='utf-8')

# Show the whole parsed document.
print(doc)

# Show the Python type of the parsed document object.
doc_type = type(doc)
print(doc_type)

# Select the <head> element with a tag selector and show it.
head_section = doc('head')
print(head_section)

# Re-initialise the document from the inline HTML fixture defined above.
doc = pq(html)

# Descendant selector chain (id -> class -> tag), then a bare tag selector.
print(doc('#container .list li'))
print(doc('li'))

# Select the element(s) with class "list"; the selection is reused below.
list_node = doc('.list')
print(type(list_node))
print(list_node)

# find(): look up 'li' elements underneath the selection.
print(list_node.find('li'))

# children(): without an argument, then restricted by the '.active' selector.
print(list_node.children())
print(list_node.children('.active'))

PyQuery練習二

#===========Pyquery練習二===================
#爬取豆瓣讀書(https://book.douban.com/)信息

import requests
from pyquery import PyQuery as pq

# Download the Douban Books home page.
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error response into an exception instead of
# silently parsing an error page into an empty result.
# NOTE(review): the site may reject the default requests User-Agent; if this
# raises an HTTPError, try passing a browser-like `headers=` value - confirm.
response = requests.get('https://book.douban.com/', timeout=10)
response.raise_for_status()

# Parse the raw response bytes into a PyQuery document.
doc = pq(response.content)

# Collect one dict per '.info' element with the keys 'title', 'link', 'author'.
book_items = []
for li in doc('.info ').items():
    # Select the title anchor once and reuse it for both text and href.
    title_link = li('.title a')
    # Evaluate the author text once instead of up to three times.
    author = li('div.author').text()
    if author is not None:
        # Remove every run of whitespace (spaces, newlines) from the author text.
        author = "".join(author.split())
    # else: keep None as-is, mirroring the original fallback.
    # (Presumably some PyQuery versions return None when nothing matches.)
    book_items.append({
        'title': title_link.text(),
        'link': title_link.attr('href'),
        'author': author,
    })

print(book_items)
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章