Python crawler: scraping links behind JS redirects

Implementation steps

1. Extract the latest administrative division codes from the civil affairs data site (mca.gov.cn)

# Characteristics

1. The newest entry is always listed first

# Code implementation

import requests
from lxml import etree
import re

url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
html = requests.get(url, headers=headers).text
parse_html = etree.HTML(html)
article_list = parse_html.xpath('//a[@class="artitlelist"]')

for article in article_list:
    title = article.xpath('./@title')[0]
    # Match links whose title contains the target keyword. The keyword was
    # lost in the original post; '代碼' ("code") is an assumption here
    if re.findall(r'.*?代碼.*?', title, re.S):
        # Stop at the first match: the first link is always the newest
        two_link = 'http://www.mca.gov.cn' + article.xpath('./@href')[0]
        print(two_link)
        break

Extract the real link from the second-level page link (anti-scraping measure: the response embeds JS that redirects to a new link)

1. Send a request to the second-level page link, get the response, and inspect the embedded JS code

2. Extract the real second-level page link with a regex, as sketched below
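For reference, the second-level response carries no data at all, only a short script that redirects the browser. A minimal sketch of the pattern the regex targets (the URL in the sample is invented for illustration):

import re

# Hypothetical response body; real pages embed a script tag of this shape
sample = '<script>window.location.href="http://www.mca.gov.cn/article/sj/xzqh/2019/example.html";</script>'
real_link = re.findall(r'window\.location\.href="(.*?)"', sample, re.S)[0]
print(real_link)  # http://www.mca.gov.cn/article/sj/xzqh/2019/example.html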

# Sketch of the approach

two_html = requests.get(two_link, headers=headers).text
# Extract the real link from the second-level response (the redirect
# target that the embedded JS points to)
new_two_link = re.findall(r'window\.location\.href="(.*?)"', two_html, re.S)[0]

3. Query the database to check whether this link has already been crawled, making the crawler incremental

1. Create a version table in the database to store the crawled links (see the sketch after this list)

2. On every run, check against the records in the version table to see whether the link has already been crawled
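The insert statement used further down writes a single value, so the version table only needs one column. A minimal sketch of creating it; the column name and width are assumptions:

import pymysql

db = pymysql.connect(host='127.0.0.1', user='root', password='123456', database='govdb')
cursor = db.cursor()
# One-column schema inferred from "insert into version values(%s)"
cursor.execute('create table if not exists version (link varchar(500))')
db.commit()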

# Sketch

cursor.execute('select * from version')
result = cursor.fetchall()
if result and result[-1][0] == two_link:
    print('Already up to date')
else:
    # There is an update: start crawling, then insert the new
    # link into the version table as the latest record
    ...

Code implementation

'''Site data scraper (incremental crawler)'''
import requests
from lxml import etree
import re
import pymysql


class Govement(object):
    def __init__(self):
        self.one_url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
        self.db = pymysql.connect(host='127.0.0.1', user='root', password='123456', database='govdb')
        self.cursor = self.db.cursor()

    # Get the real second-level page link and compare it against the database
    def get_two_link(self):
        html = requests.get(self.one_url, headers=self.headers).text
        # The real second-level URL is hidden here and generated by a JS
        # script; save the response to a local file to inspect it
        parse_html = etree.HTML(html)
        a_list = parse_html.xpath('//a[@class="artitlelist"]')
        two_link = None
        for a in a_list:
            title = a.xpath('./@title')[0]
            # Match the first title containing the target keyword; the first
            # link is always the newest. The keyword was lost in the original
            # post; '代碼' ("code") is an assumption here
            if re.findall(r'.*?代碼.*?', title, re.S):
                two_link = 'http://www.mca.gov.cn' + a.xpath('./@href')[0]
                break

        # Extract the real second-level page link from two_link's response
        two_html = requests.get(two_link, headers=self.headers).text
        # The real link is the redirect target loaded via JS
        real_two_link = re.findall(r'window\.location\.href="(.*?)"', two_html, re.S)[0]
        # Incremental crawling: only fetch when the link is new
        self.cursor.execute('select * from version')
        result = self.cursor.fetchall()
        if result and result[-1][0] == real_two_link:
            print('Already up to date')
        else:
            self.get_data(real_two_link)
            self.cursor.execute('insert into version values(%s)', [real_two_link])
            self.db.commit()

    # Extract the data directly with xpath
    def get_data(self, real_two_link):
        real_two_html = requests.get(real_two_link, headers=self.headers).text
        parse_html = etree.HTML(real_two_html)
        # Base xpath: the node list with one entry per record
        tr_list = parse_html.xpath('//tr[@height="19"]')
        for tr in tr_list:
            # Build the dict inside the loop so each row gets its own copy
            city_info = {
                'code': tr.xpath('./td[2]/text()'),
                'name': tr.xpath('./td[3]/text()'),
            }
            print(city_info)


if __name__ == '__main__':
    spider = Govement()
    spider.get_two_link()
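Note that xpath() returns a list of text nodes, so the dicts printed above hold lists. If scalar values are preferred, a minimal tweak to the loop in get_data (assuming at most one text node per cell):

for tr in tr_list:
    code = tr.xpath('./td[2]/text()')
    name = tr.xpath('./td[3]/text()')
    city_info = {
        'code': code[0].strip() if code else None,
        'name': name[0].strip() if name else None,
    }
    print(city_info)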

# coding=utf-8
from bs4 import BeautifulSoup
from selenium import webdriver
import time

# Use selenium with the PhantomJS headless browser
driver = webdriver.PhantomJS(executable_path="D:\\python\\phantomjs-2.1.1\\bin\\phantomjs.exe")
driver.maximize_window()
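Note: newer Selenium releases removed PhantomJS support, so the setup above only works with older versions. Against a current Selenium, headless Chrome is the usual stand-in; a sketch, assuming chromedriver is installed and on PATH:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')  # run Chrome without a visible window
driver = webdriver.Chrome(options=options)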

  

# Log in to Qzone (QQ空間) and scrape "shuoshuo" posts
def get_shuoshuo(qq):
    driver.get('http://user.qzone.qq.com/{}/311'.format(qq))
    time.sleep(5)

    try:
        driver.find_element_by_id('login_div')
        a = True
        print("Login required...")
    except:
        a = False
        print("No login required...")


    if a:
        driver.switch_to.frame('login_frame')
        driver.find_element_by_id('switcher_plogin').click()
        driver.find_element_by_id('u').clear()  # username field
        driver.find_element_by_id('u').send_keys('your QQ number')
        driver.find_element_by_id('p').clear()  # password field
        driver.find_element_by_id('p').send_keys('your QQ password')
        driver.find_element_by_id('login_button').click()
        time.sleep(3)
    driver.implicitly_wait(3)

      

    print("驗證權限...")  

    try:  

        driver.find_element_by_id('QM_OwnerInfo_Icon')  

        b = True  

    except:  

        b = False  

          

    if b == True:  

        print("獲取說說...")  

        driver.switch_to.frame('app_canvas_frame')  

        content = driver.find_elements_by_css_selector('.content')  

        stime = driver.find_elements_by_css_selector('.c_tx.c_tx3.goDetail')  

        for con,sti in zip(content,stime):  

            data = {  

                'time':sti.text,  

                'shuos':con.text  

            }  

            print(data)  

        pages = driver.page_source  

        #print(pages)  

        soup = BeautifulSoup(pages,'lxml')  

  

    cookie = driver.get_cookies()
    cookie_list = []
    for c in cookie:
        ck = "{0}={1};".format(c['name'], c['value'])
        cookie_list.append(ck)

    cookie_str = ''
    for c in cookie_list:
        cookie_str += c
    print('Cookies:', cookie_str)
    print("========== Done ==========")

    driver.close()
    driver.quit()
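The cookie string assembled above can be reused outside the browser, for example with requests. A sketch, assuming get_shuoshuo is changed to return cookie_str instead of only printing it:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0',
    'Cookie': cookie_str,  # assumed to be returned by get_shuoshuo
}
resp = requests.get('http://user.qzone.qq.com/', headers=headers)
print(resp.status_code)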

  

if __name__ == '__main__':
    get_shuoshuo('123456')  # hypothetical QQ number; replace with a real one
