Spider 動態頁面的抓取方式

 重寫下載中間件:首次登錄時以有頭(可見窗口)模式啓動瀏覽器,手動登錄並保存 cookies 後,再改用無頭模式啓動瀏覽器。

       示例:

class SeleniumDownloaderMiddleware(object):
    """Scrapy downloader middleware that renders Taobao pages with Selenium.

    Workflow implemented below:

    * No ``cookies.json`` yet: the browser opens the site, the code waits a
      fixed time so the user can log in by hand, then the session cookies
      are written to ``cookies.json``. The login browser is shut down and
      replaced by a headless one for the rest of the run.
    * ``cookies.json`` present: the saved cookies are injected once, then
      the browser runs a search, opens the first result, switches to its
      review tab and the rendered HTML is returned to Scrapy.

    Only requests whose ``meta['page']`` is the string ``'0'`` are handled;
    every other request is left to Scrapy's regular downloader.

    NOTE: nothing in this class shuts the browser down when the crawl ends.
    """

    # File the login cookies are persisted to (relative to the working dir).
    COOKIE_FILE = 'cookies.json'
    # Location of the chromedriver binary.
    DRIVER_PATH = r'E:\LiuLanQi\chromedriver.exe'
    # Seconds the user is given to complete the manual login.
    LOGIN_WAIT_SECONDS = 30

    def __init__(self, headless=False):
        """Prepare the headless options and launch the initial browser.

        Args:
            headless: start the initial browser without a window. The
                default (``False``) opens a visible window, which is what a
                manual login needs.
        """
        self.chrome_options = Options()
        self.chrome_options.add_argument('--headless')
        self.chrome_options.add_argument('--disable-gpu')
        self.url = 'http://www.taobao.com'
        self.load_cookies = False
        self.driver = self._start_driver(headless)

    def _start_driver(self, headless):
        """Return a new Chrome driver, headless when *headless* is true."""
        if headless:
            return webdriver.Chrome(executable_path=self.DRIVER_PATH,
                                    chrome_options=self.chrome_options)
        return webdriver.Chrome(executable_path=self.DRIVER_PATH)

    def process_request(self, request, spider):
        """Render the request in the browser when ``meta['page'] == '0'``.

        Args:
            request: the Scrapy request being downloaded.
            spider: the spider that issued the request (unused).

        Returns:
            An ``HtmlResponse`` built from the browser's current URL and
            page source, or ``None`` when the request is not handled here
            or when this call was spent capturing the login cookies.
        """
        # .get() so requests that never set 'page' are passed through
        # instead of raising KeyError.
        if request.meta.get('page') != '0':
            return None

        self.driver.get(url=self.url)
        if not os.path.exists(self.COOKIE_FILE):
            self._save_login_cookies()
            return None

        if not self.load_cookies:
            self._restore_cookies()
            self.load_cookies = True

        url, page_source = self._browse_to_reviews()
        # NOTE(review): this pause runs after the snapshot was taken, so it
        # only slows the crawl down - confirm it is still wanted.
        time.sleep(5)
        # Build the response from the rendered source; the body is text, so
        # the encoding has to be given explicitly.
        return HtmlResponse(url=url, body=page_source, encoding='utf-8', request=request)

    def _save_login_cookies(self):
        """Wait for the manual login, persist the cookies, go headless."""
        time.sleep(self.LOGIN_WAIT_SECONDS)
        # Serialize before opening the file so a failure here does not leave
        # an empty cookies.json behind.
        serialized = json.dumps(self.driver.get_cookies())
        with open(self.COOKIE_FILE, 'w') as f:
            f.write(serialized)
        # The login window is no longer needed. Replace it instead of
        # leaving self.driver pointing at a closed session, which would
        # break the next handled request.
        self.driver.quit()
        self.driver = self._start_driver(headless=True)
        self.load_cookies = False

    def _restore_cookies(self):
        """Replace the browser's cookies with the ones saved at login."""
        # Drop the cookies created by the first, anonymous page load.
        self.driver.delete_all_cookies()
        with open(self.COOKIE_FILE, 'r') as f:
            saved_cookies = json.loads(f.read())
        for cookie in saved_cookies:
            self.driver.add_cookie({
                'domain': '.taobao.com',  # the domain needs the leading dot
                'name': cookie['name'],
                'value': cookie['value'],
                'path': '/',
                'expires': None
            })

    def _browse_to_reviews(self):
        """Search, open the first item and its review tab.

        Returns:
            A ``(current_url, page_source)`` tuple taken after the review
            tab was opened.
        """
        driver = self.driver
        # Reloading with the restored cookies gives a logged-in session.
        driver.get('https://www.taobao.com')
        time.sleep(5)

        # Type the keyword into the search box and submit.
        search_box = driver.find_element_by_css_selector('#q')
        search_box.send_keys('口紅')
        time.sleep(3)
        driver.find_element_by_css_selector(
            '#J_TSearchForm > div.search-button > button').click()
        time.sleep(5)

        # Narrow the results by clicking the brand link.
        driver.find_element_by_link_text('紀梵希').click()
        time.sleep(10)

        # Open the first item of the result list.
        driver.find_element_by_css_selector(
            '#mainsrp-itemlist > div > div > div:nth-child(1) > div:nth-child(1)').click()
        time.sleep(20)

        # Switch to the second tab of the item page (the reviews).
        driver.find_element_by_css_selector('#J_TabBar > li:nth-child(2)').click()
        time.sleep(5)

        # Look up the first review's text node; the lookup raises if the
        # reviews did not render, so a page without them is not returned.
        driver.find_element_by_css_selector(
            '#J_Reviews > div > div.rate-grid > table > tbody > tr:nth-child(1) > td.tm-col-master > div.tm-rate-content > div.tm-rate-fulltxt')

        return driver.current_url, driver.page_source

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章