爬蟲解析庫:XPath

import os
from urllib import parse

import requests
from fake_useragent import UserAgent
from lxml import etree


class TBImage:
    """Interactive crawler that saves images found in Baidu Tieba threads.

    Call chain: ``purpose`` (prompts for a keyword and a page range) ->
    ``downLoadPage`` (collects thread links from one listing page) ->
    ``downLoadImageLink`` (collects image URLs from one thread) ->
    ``dowLoadImage`` (fetches one image and writes it to ``image_dir``).

    Network errors raised by ``requests`` are not caught here; they
    propagate to the caller and abort the crawl.
    """

    # Directory the images are written to; created on demand.
    image_dir = "./Image/"
    # Seconds to wait on each HTTP request. Without a timeout a single
    # stalled connection would block the whole crawl indefinitely.
    timeout = 30

    def __init__(self):
        """Set up the request headers and the listing base URL."""
        # Presumably an Internet Explorer user-agent string supplied by
        # fake_useragent -- verify against the installed library version.
        self.ua = UserAgent().ie
        self.headers = {
            "User-Agent": self.ua
        }
        self.url = "https://tieba.baidu.com/f?kw="

    def purpose(self):
        """Prompt for keyword and page range, then crawl each listing page.

        Raises:
            ValueError: if either page number typed by the user is not an
                integer.
        """
        key = input("請輸入你要查詢的關鍵字:")
        first_text = input("請輸入你要查詢的起始頁:")
        last_text = input("請輸入你要查詢的終止頁:")
        keyword = parse.quote(key)
        # Convert once, after all prompts have been answered.
        first_page = int(first_text)
        last_page = int(last_text)
        for page in range(first_page, last_page + 1):
            # The "pn" query value advances by 50 per page (presumably the
            # number of threads on one listing page); page 1 maps to pn=0.
            offset = (page - 1) * 50
            url = self.url + keyword + "&ie=utf-8&pn=" + str(offset)
            print("正在加載第{}頁。。。".format(page))
            self.downLoadPage(url)
        print("下載已完成,謝謝使用!")

    def downLoadPage(self, url):
        """Fetch one listing page and crawl every thread linked from it.

        Args:
            url: Full URL of the listing page.
        """
        response = requests.get(
            url=url, headers=self.headers, timeout=self.timeout
        )
        html = etree.HTML(response.text)
        # @class is compared as an exact string, so the trailing space in
        # "j_th_tit " is part of the match and must be kept.
        links = html.xpath('//a[@class="j_th_tit "]/@href')
        for item in links:
            # NOTE(review): assumes each href is a site-relative path that
            # can be appended to this host -- confirm against live markup.
            fullurl = "http://www.tieba.com" + item
            print("正在查找{}中圖片鏈接".format(fullurl))
            self.downLoadImageLink(fullurl)

    def downLoadImageLink(self, url):
        """Fetch one thread page and download every image it references.

        Args:
            url: Full URL of the thread page.
        """
        response = requests.get(
            url=url, headers=self.headers, timeout=self.timeout
        )
        html = etree.HTML(response.text)
        image_links = html.xpath('//img[@class="BDE_Image"]//@src')
        for image in image_links:
            print("正在下載圖片{}".format(image))
            self.dowLoadImage(image)

    def dowLoadImage(self, url):
        """Download one image and store it under ``image_dir``.

        Args:
            url: Full URL of the image.
        """
        response = requests.get(
            url=url, headers=self.headers, timeout=self.timeout
        )
        # The file name is simply the last nine characters of the URL.
        # NOTE(review): two URLs sharing that suffix overwrite each other.
        filename = url[-9:]
        self._save_image(filename, response.content)

    def _save_image(self, filename, data):
        """Write *data* to ``image_dir/filename``, creating the directory.

        Args:
            filename: Name of the file inside ``image_dir``.
            data: Raw bytes to write.
        """
        # Create the target directory first; opening a file inside a
        # missing directory raises FileNotFoundError.
        os.makedirs(self.image_dir, exist_ok=True)
        with open(os.path.join(self.image_dir, filename), "wb") as f:
            f.write(data)


if __name__ == "__main__":
    # Script entry point: build the crawler and start the interactive run.
    TBImage().purpose()

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章