關於PyQt5中PyQtWebEngine的最新使用

大家知道,用 Python 做爬蟲時,如果遇到需要 JavaScript 動態渲染的頁面,就需要藉助 PyQt 之類的框架。然而 PyQt5 相比 PyQt4 做了很大的改動,似乎難用了許多:沒有了 findAllElements 方法,無法直接查找元素;用 lxml 加載頁面之後,得到的元素樹又與原文檔失去了關聯。那麼該如何動態修改頁面元素呢?下面給出我的改進用法(透過 runJavaScript 直接操作頁面中的元素),希望對大家有所幫助:

import csv
import time
try:
    from PySide2.QtWidgets import QApplication
    from PySide2.QtWebEngineWidgets import *
    from PySide2.QtCore import *
except ImportError:
    from PyQt5.QtWidgets import QApplication
    from PyQt5.QtCore import QUrl, QEventLoop, QTimer
    from PyQt5.QtWebEngineWidgets import QWebEngineView,QWebEnginePage
import lxml.html


class BrowserRender(QWebEngineView):
    """Web view that loads a page, snapshots its HTML and drives it via JavaScript.

    The rendered HTML is fetched asynchronously with ``page().toHtml`` and
    parsed with lxml into ``self.tree``. ``find``/``attr``/``text`` operate on
    that parsed snapshot only; ``setSearchItem`` and ``click`` act on the live
    page by running JavaScript against element ids taken from the snapshot.

    Every asynchronous Qt call is turned into a blocking one by starting
    ``self.app.exec_()`` and quitting it from the completion callback.
    """

    # Defines myFunction(id, value): sets the value of the element with that
    # id, then marks the second child of #page_size as selected and sets its
    # text to 1000, and returns the element's resulting value.
    _SET_VALUE_JS = '''
        function myFunction(id, value)
        {
            document.getElementById(id).value = value;
            document.getElementById('page_size').children[1].selected = true
            document.getElementById('page_size').children[1].innerText = 1000
            return document.getElementById(id).value;
        }
        '''

    # Defines myFunction(id): clicks the element with that id and returns id.
    _CLICK_JS = '''
        function myFunction(id)
        {
            document.getElementById(id).click();
            return id
        }
        '''

    def __init__(self, display=True):
        """Create the web view, reusing an existing QApplication if there is one.

        Args:
            display: When true, show the browser window.
        """
        # The application object must exist before any widget is built.
        # Reuse a running instance instead of constructing a second one.
        self.app = QApplication.instance() or QApplication([])
        QWebEngineView.__init__(self)
        self.html = ''
        self.tree = None  # lxml element for the last snapshot, or None
        if display:
            self.show()  # show the browser

    def open(self, url, timeout=60):
        """Load *url* and snapshot its HTML once loading has finished.

        Blocks until the view emits ``loadFinished`` or *timeout* seconds
        elapse. On success ``self.html`` and ``self.tree`` are refreshed; on
        timeout a message is printed and the previous snapshot is kept.

        Args:
            url: Address to load.
            timeout: Maximum number of seconds to wait for the load.
        """
        loop = QEventLoop()
        timer = QTimer()
        timer.setSingleShot(True)
        timer.timeout.connect(loop.quit)
        self.loadFinished.connect(loop.quit)
        self.load(QUrl(url))
        # QTimer takes whole milliseconds; int() also accepts float timeouts.
        timer.start(int(timeout * 1000))
        loop.exec_()  # delay here until download finished or timer fired

        if not timer.isActive():
            # The single-shot timer already fired: the load did not finish.
            # Return without starting the application loop, because no
            # callback is pending that would ever quit it.
            print('Request timed out:' + url)
            return

        timer.stop()
        self._refresh_html()

    def _refresh_html(self):
        """Request the page's current HTML and block until ``callable`` ran."""
        self.page().toHtml(self.callable)
        self.app.exec_()

    def _run_js(self, script):
        """Run *script* in the page and block until ``js_callback`` ran."""
        self.page().runJavaScript(script, self.js_callback)
        self.app.exec_()

    @staticmethod
    def _js_literal(value):
        """Return ``str(value)`` as a quoted, escaped JavaScript string literal."""
        import json  # local import keeps the class independent of file-level imports

        return json.dumps(str(value))

    def callable(self, data):
        """Store the HTML delivered by ``toHtml`` and release the waiting loop.

        Args:
            data: HTML source of the page as text.
        """
        try:
            self.html = data
            self.tree = lxml.html.fromstring(data)
        except lxml.etree.ParserError:
            # lxml rejects an empty document; keep the raw text, drop the tree.
            self.tree = None
        finally:
            # Always quit, otherwise the caller blocked in exec_() never returns.
            self.app.quit()

    def get_html(self):
        """Shortcut to return the HTML of the last snapshot."""
        return self.html

    def find(self, pattern):
        """Return the snapshot elements matching the CSS selector *pattern*.

        Returns an empty list when no snapshot has been parsed yet.
        """
        if self.tree is None:
            return []
        return self.tree.cssselect(pattern)

    def attr(self, pattern, name, value):
        """Set attribute *name* to *value* on matching snapshot elements.

        Only the parsed lxml tree is modified; the live page is not touched.
        """
        for e in self.find(pattern):
            e.attrib.update({name: value})

    def text(self, pattern, value):
        """Set the text of matching snapshot elements to *value*.

        Only the parsed lxml tree is modified; the live page is not touched.
        """
        for e in self.find(pattern):
            e.text = value

    def setSearchItem(self, pattern, search_value):
        """Set the live value of each matching element via JavaScript.

        Matches come from the parsed snapshot and are addressed in the page
        by their ``id`` attribute. The script also changes the second option
        of ``#page_size`` (see ``_SET_VALUE_JS``).

        Args:
            pattern: CSS selector evaluated against the snapshot.
            search_value: Value to assign; converted to text.

        Raises:
            KeyError: If a matching element has no ``id`` attribute.
        """
        value_js = self._js_literal(search_value)
        for e in self.find(pattern):
            call = 'myFunction({}, {});'.format(
                self._js_literal(e.attrib['id']), value_js)
            # Wait after each script so every callback is consumed by its own
            # loop, and so no loop is started when nothing matched.
            self._run_js(self._SET_VALUE_JS + call)

    def click(self, pattern):
        """Click each matching element in the live page via JavaScript.

        Matches come from the parsed snapshot and are addressed in the page
        by their ``id`` attribute.

        Raises:
            KeyError: If a matching element has no ``id`` attribute.
        """
        for e in self.find(pattern):
            call = 'myFunction({});'.format(self._js_literal(e.attrib['id']))
            self._run_js(self._CLICK_JS + call)

    def js_callback(self, result):
        """Print the script's result and release the waiting loop."""
        print(result)
        self.app.quit()

    def wait_load(self, pattern, timeout=60):
        """Wait until *pattern* matches in the page and return the matches.

        The current snapshot is checked first. While nothing matches, the
        HTML is re-fetched and re-parsed. The deadline is only checked between
        refreshes.

        Args:
            pattern: CSS selector to look for.
            timeout: Maximum number of seconds to keep retrying.

        Returns:
            The list of matching elements, or ``None`` after printing a
            message if the deadline passed without a match.
        """
        # monotonic() is immune to wall-clock adjustments during the wait.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            self.app.processEvents()

            matches = self.find(pattern)
            if matches:
                return matches
            self._refresh_html()
        print('Wait load timed out')
        return None

def main():
    """Search the example site for every entry and save the names to a CSV file.

    Fills the search box with ``.``, submits the form, waits for the result
    links and writes one name per row to ``countries_or_districts.csv``.
    If no results appear before the wait times out, the file is left empty.
    """
    br = BrowserRender()
    br.open('http://example.python-scraping.com/search')
    br.attr('#search_term', 'value', '.')
    br.setSearchItem('#search_term', '.')
    br.text('#page_size option:checked', '1000')
    br.click('#search')

    # wait_load returns None on timeout; treat that as "no results".
    elements = br.wait_load('#results a') or []

    # newline='' is what the csv module expects (it avoids blank rows on
    # Windows); the context manager guarantees the file is flushed and closed.
    with open('countries_or_districts.csv', 'w', newline='',
              encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        for e in elements:
            writer.writerow([e.text_content().strip()])


# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

如有疑問,可以留言諮詢。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章