爬蟲解析庫:XPath

import os
from urllib import parse

import requests
from fake_useragent import UserAgent
from lxml import etree


class TBImage():
    """Interactive crawler that downloads images posted in a Baidu Tieba forum.

    The crawl has three levels: forum listing pages -> thread pages ->
    images embedded in each thread. Images are written to ``IMAGE_DIR``.
    """

    # Origin used to resolve the (usually relative) thread links found on a
    # listing page; kept in line with the host used by ``self.url``.
    BASE_URL = "https://tieba.baidu.com"
    # Step of the ``pn`` query parameter between consecutive listing pages.
    PAGE_SIZE = 50
    # Directory that receives the downloaded images (created on demand).
    IMAGE_DIR = "./Image"
    # Seconds to wait for a server before giving up on a request.
    TIMEOUT = 10

    def __init__(self):
        """Pick a User-Agent string and prepare the request headers."""
        self.ua = UserAgent().ie
        self.headers = {
            "User-Agent": self.ua
        }
        self.url = "https://tieba.baidu.com/f?kw="

    @staticmethod
    def _page_offsets(first_page, last_page):
        """Return the ``pn`` offsets for pages ``first_page``..``last_page``.

        Pages are 1-based and the range is inclusive; an empty list is
        returned when ``last_page`` is smaller than ``first_page``.
        """
        return [
            page * TBImage.PAGE_SIZE
            for page in range(first_page - 1, last_page)
        ]

    @staticmethod
    def _filename_from_url(url):
        """Derive a filesystem-safe file name from an image URL.

        The last component of the URL path is used (query string ignored).
        When the path has no usable last component, the final nine
        characters of the URL are used instead. Every character that is not
        alphanumeric or one of ``._-`` is replaced by ``_`` so the result
        can never contain a path separator.
        """
        candidate = os.path.basename(parse.unquote(parse.urlsplit(url).path))
        if not candidate.strip("."):
            candidate = url[-9:]
        safe = "".join(
            ch if ch.isalnum() or ch in "._-" else "_" for ch in candidate
        )
        # Leading dots would yield hidden files or the ".." directory entry.
        return safe.lstrip(".") or "image"

    def _fetch_html(self, url):
        """GET ``url`` and return the parsed lxml tree.

        NOTE(review): ``etree.HTML`` may return ``None`` for an empty
        document, so callers check for that before running XPath queries.
        """
        response = requests.get(
            url=url, headers=self.headers, timeout=self.TIMEOUT
        )
        return etree.HTML(response.text)

    def purpose(self):
        """Prompt for a keyword and a page range, then crawl those pages.

        Raises:
            ValueError: if either page number is not an integer.
        """
        key = input("請輸入你要查詢的關鍵字：")
        raw_first = input("請輸入你要查詢的起始頁：")
        raw_last = input("請輸入你要查詢的終止頁：")
        # Convert once up front instead of on every loop iteration.
        first_page = int(raw_first)
        last_page = int(raw_last)
        keyword = parse.quote(key)
        offsets = self._page_offsets(first_page, last_page)
        for page_no, offset in enumerate(offsets, start=first_page):
            url = self.url + keyword + "&ie=utf-8&pn=" + str(offset)
            print("正在加載第{}頁。。。".format(page_no))
            self.downLoadPage(url)
        print("下載已完成，謝謝使用！")

    def downLoadPage(self, url):
        """Fetch one listing page and crawl every thread linked from it."""
        html = self._fetch_html(url)
        if html is None:
            return
        # The trailing space inside the class value is part of the XPath
        # match and is kept exactly as in the original query.
        for href in html.xpath('//a[@class="j_th_tit "]/@href'):
            # urljoin copes with both relative and absolute hrefs.
            fullurl = parse.urljoin(self.BASE_URL, href)
            print("正在查找{}中圖片鏈接".format(fullurl))
            self.downLoadImageLink(fullurl)

    def downLoadImageLink(self, url):
        """Fetch one thread page and download each ``BDE_Image`` image."""
        html = self._fetch_html(url)
        if html is None:
            return
        for image in html.xpath('//img[@class="BDE_Image"]//@src'):
            print("正在下載圖片{}".format(image))
            self.dowLoadImage(image)

    def dowLoadImage(self, url):
        """Download a single image and store it under ``IMAGE_DIR``.

        Responses with an error status are skipped so that an error page is
        never written to disk as if it were an image.
        """
        response = requests.get(
            url=url, headers=self.headers, timeout=self.TIMEOUT
        )
        if not response.ok:
            print("圖片{}下載失敗（狀態碼{}），已跳過".format(url, response.status_code))
            return
        # The output directory is not guaranteed to exist on a fresh run.
        os.makedirs(self.IMAGE_DIR, exist_ok=True)
        target = os.path.join(self.IMAGE_DIR, self._filename_from_url(url))
        with open(target, "wb") as f:
            f.write(response.content)


if __name__ == "__main__":
    # Script entry point: build the crawler and start the interactive run.
    TBImage().purpose()

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

爬蟲解析庫:XPath

物理機開關機

前端使用 Konva 實現可視化設計器（15）- 自定義連接點、連接優化

python3設置默認pip源

xpath無法解析部分javascript獲取結果爲空問題

python利用裝飾器打印日誌、打印報錯UnicodeEncodeError

python3 post json數據類型請求

win10右擊菜單添加在此處打開cmd命令窗口

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結