Scraping Qiushibaike (糗事百科) with Python

import requests
from lxml import etree
import pymongo

class QiushiSpider:
    def __init__(self):
        # URL of the page to crawl
        self.url = "https://www.qiushibaike.com/text/page/8/"
        self.headers = {"User-Agent": "Mozilla/5.0"}
        # connect to a local MongoDB instance and select the collection used to store results
        self.conn = pymongo.MongoClient("localhost", 27017)
        self.db = self.conn.Baike
        self.myset = self.db.baikeset1

    def getPage(self):
        # fetch the page and hand the HTML off to the parser
        res = requests.get(self.url, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        self.parsePage(html)

    def parsePage(self, html):
        parseHtml = etree.HTML(html)
        # base XPath: one node per joke
        base_list = parseHtml.xpath('//div[contains(@id,"qiushi_tag_")]')
        # iterate over each joke node
        for children in base_list:
            # user nickname
            username = children.xpath('./div/a/h2')[0].text.strip()
            # joke content
            content = children.xpath('./a/div[@class="content"]/span')[0].text.strip()
            # number of "funny" votes
            laughf_num = children.xpath('.//span/i')[0].text.strip()
            # number of comments
            ping_num = children.xpath('.//i[@class="number"]')[1].text.strip()
            d = {
                "username": username,
                "content": content,
                "laughf_num": laughf_num,
                "ping_num": ping_num,
            }
            # insert_one replaces the deprecated Collection.insert
            self.myset.insert_one(d)

if __name__ == "__main__":
    q = QiushiSpider()
    q.getPage()
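
After a run, you can confirm that the documents actually landed in MongoDB with a quick query. This is a minimal sketch, assuming a local MongoDB instance on the default port and the Baike database / baikeset1 collection names used above:

import pymongo

conn = pymongo.MongoClient("localhost", 27017)
myset = conn.Baike.baikeset1

# print how many jokes were stored and show a few of them
print(myset.count_documents({}))
for doc in myset.find().limit(3):
    print(doc["username"], doc["laughf_num"], doc["ping_num"])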