Crawling a site's page URLs with Python, statically and dynamically

To crawl all page URLs of a site, extract the href attribute from each page's <a> tags and recurse into every discovered page, eventually covering the whole site. Once a URL is obtained, request it and use the response status code as a simple check that the page opens normally. Note that crawled URLs must be deduplicated before being stored; otherwise duplicates pile up and the recursion never terminates.
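A note on that deduplication step: membership tests on a list are O(n) per lookup, so for a site with many pages a set is the more natural container for the seen URLs. A minimal sketch of that bookkeeping, with hypothetical names:

seen = set()

def should_visit(url):
    # Set membership is O(1); returns True only the first time a URL is seen.
    if url in seen:
        return False
    seen.add(url)
    return True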

Different pages deliver their data differently: some render it statically into the HTML, others fetch it dynamically, so the crawling method has to match the page type. How do you tell whether data arrives via an ajax request? Right-click, view the page source, and search the source for the data you need. If it appears there, the page is static; if it does not, the data is usually fetched by an ajax request.
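The same check can be scripted: fetch the raw HTML with requests, which executes no JavaScript, and search it for a known piece of page content. A minimal sketch; the URL and marker string are placeholders:

import requests

def is_static(url, marker):
    # requests returns the raw HTML only, without running any scripts,
    # so content that is missing here is almost certainly loaded via ajax.
    resp = requests.get(url, timeout=10)
    return marker in resp.text

print(is_static("https://study.163.com", "some text visible on the page"))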

Below, crawling the page URLs of NetEase Cloud Classroom (study.163.com) serves as a simple example:

1. Crawling page URLs from static pages

Any URL whose data is not fetched via ajax can be crawled statically. The code is as follows:

from lxml import etree
import requests

home_url = "https://study.163.com"
HTTP = 'https:'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
}
all_urls = []

def get_url(current_url):
    urls = []
    resp = requests.get(current_url, headers=headers)
    status_code = resp.status_code
    if status_code != 200:
        print("Page returned a non-200 status! URL: %s" % current_url)
        # 'a+' creates the file if it does not exist, so no extra check is needed.
        with open("error_url.txt", 'a+') as fp:
            fp.write(str(status_code) + "::")
            fp.write(current_url + "\n")
        return

    htmlElement = etree.HTML(resp.content.decode('utf-8'))
    aList = htmlElement.xpath('//a/@href')
    if len(aList) == 0:
        print("No URLs detected on the current page")
        return
    for a in aList:
        # Protocol-relative links on the target domain: prepend the scheme.
        if a.startswith("//study.163.com"):
            url = HTTP + a
        # Absolute links that stay on the target domain.
        elif a.startswith("http://study.163.com") or a.startswith("https://study.163.com"):
            url = a
        else:
            url = ""
        if url:
            urls.append(url)
    for url in urls:
        if url not in all_urls:
            print(url)
            all_urls.append(url)
            with open("ykt_url.txt", 'a+') as fp:
                fp.write(url + "\n")
            # Recurse into the newly discovered page.
            get_url(url)

if __name__ == '__main__':
    get_url(home_url)
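One caveat about this recursive design: Python's default recursion limit is about 1000 frames, so on a site of any real size get_url will raise RecursionError before the crawl finishes. An iterative breadth-first variant with an explicit queue avoids this; a minimal sketch, reusing the names defined above:

from collections import deque

def crawl(start_url):
    queue = deque([start_url])
    while queue:
        current_url = queue.popleft()
        # Skip pages that were already visited.
        if current_url in all_urls:
            continue
        all_urls.append(current_url)
        resp = requests.get(current_url, headers=headers)
        if resp.status_code != 200:
            continue
        htmlElement = etree.HTML(resp.content.decode('utf-8'))
        if htmlElement is None:
            continue
        for a in htmlElement.xpath('//a/@href'):
            # Same domain filter as above.
            if a.startswith("//study.163.com"):
                queue.append(HTTP + a)
            elif a.startswith("http://study.163.com") or a.startswith("https://study.163.com"):
                queue.append(a)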

 

2. Crawling page URLs from dynamic pages

Running the crawl above, you will find that many page URLs are never captured: much of the page data is returned by ajax requests and loaded dynamically, so the static approach cannot see it, and a large number of pages are missed.

Pages that rely on ajax requests can be handled with selenium. Launching each page through a webdriver simulates a normal browser visit, which makes the dynamically loaded data available. The page is also inspected along the way: if it redirects to a login page, a login is performed first and the data is fetched afterwards. A rough implementation:

from lxml import etree
import requests
import os
from selenium import webdriver
import time

home_url = "https://study.163.com"
HTTP = 'https:'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
}
all_urls = []


class GetYktUrlAjax:
    driver_path = r"D:\chromedriver\78\chromedriver.exe"

    def __init__(self):
        # Give the browser the same user-agent that requests uses.
        options = webdriver.ChromeOptions()
        options.add_argument('user-agent=%s' % headers['user-agent'])
        self.driver = webdriver.Chrome(executable_path=GetYktUrlAjax.driver_path, options=options)

    def click_agree(self):
        # Dismiss the consent pop-up if one appears.
        try:
            self.driver.find_element_by_xpath("//span[@class='ux-btn th-bk-main ux-btn- ux-btn- ux-modal-btn um-modal-btn_ok th-bk-main']").click()
        except Exception:
            pass

    def get_url_status(self, url):
        try:
            resp = requests.get(url, headers=headers)
            status_code = resp.status_code
        except Exception:
            print("Request for the current URL failed; returning custom code 9999")
            status_code = 9999
        return status_code

    def get_url(self, my_current_url):
        urls = []
        self.driver.get(my_current_url)
        time.sleep(2)
        self.click_agree()

        # If redirected to the login page, log in first.
        if self.driver.current_url.startswith("https://study.163.com/member/login.htm"):
            print("Landed on the login page")
            time.sleep(1)
            # Switch to the account/password login tab.
            self.driver.find_element_by_xpath('//ul[@class="ux-tabs-underline_hd"]/li[last()]').click()

            # The login form lives inside an iframe.
            div = self.driver.find_element_by_id("j-ursContainer-0")
            iframe = div.find_element_by_xpath(".//iframe")
            self.driver.switch_to.frame(iframe)
            time.sleep(1)
            username = self.driver.find_element_by_xpath("//input[@name='email']")
            username.clear()
            username.send_keys("[email protected]")
            password = self.driver.find_element_by_xpath("//input[@name='password']")
            password.clear()
            password.send_keys("******")
            login_btn = self.driver.find_element_by_id('dologin')
            login_btn.click()
            self.driver.switch_to.default_content()
            time.sleep(3)
        time.sleep(0.5)

        try:
            self.driver.maximize_window()
        except Exception:
            pass

        # Check the page's request status.
        status_code = self.get_url_status(my_current_url)
        if status_code != 200:
            print("Page status looks wrong, please check! URL: %s" % my_current_url)
            with open("error_url.txt", 'a+') as fp:
                fp.write(my_current_url + "\n")

        # Parse the rendered page source, which now includes ajax-loaded content.
        source = self.driver.page_source
        htmlElement = etree.HTML(source)
        aList = htmlElement.xpath('//a/@href')
        if len(aList) == 0:
            print("No URLs detected on the current page")
            return
        for a in aList:
            # Protocol-relative links on the target domains: prepend the scheme.
            if a.startswith("//study.163.com") or a.startswith("//mooc.study.163.com") or a.startswith("//course.study.163.com"):
                url = HTTP + a
            # Absolute links on the target domains.
            elif a.startswith("http://study.163.com") or a.startswith("https://study.163.com") or a.startswith(
                    "http://mooc.study.163.com") or a.startswith("https://mooc.study.163.com") or a.startswith(
                    "http://course.study.163.com") or a.startswith("https://course.study.163.com"):
                url = a
            else:
                url = ""
            if url and url not in all_urls:
                print(url)
                all_urls.append(url)
                urls.append(url)
                with open("ykt_url_ajax.txt", 'a+') as fp:
                    fp.write(url + "\n")

        # Recurse into the newly discovered URLs.
        for real_url in urls:
            self.get_url(real_url)

def read_file_to_urls():
    print("Loading the URLs saved in the file into all_urls")
    if not os.path.exists("ykt_url_ajax.txt"):
        return
    with open("ykt_url_ajax.txt", 'r') as fr:
        for line in fr:
            url = line.strip()
            if url and url not in all_urls:
                all_urls.append(url)



if __name__ == '__main__':
    # Resume from the URLs saved by an earlier run, if any.
    read_file_to_urls()
    crawler = GetYktUrlAjax()
    if len(all_urls) == 0:
        crawler.get_url(home_url)
    else:
        # Iterate over a snapshot; get_url itself recurses into new pages.
        for url in list(all_urls):
            crawler.get_url(url)

The approach above does capture URLs from dynamically rendered pages. But selenium needs time to launch and render every page, so crawling an entire site this way is very slow, and the script is worth optimizing.
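An easy first optimization is to run Chrome headless and skip image downloads, which removes the visible browser window and cuts page-load time. A sketch of the extra ChromeOptions, under the same selenium/chromedriver setup as above:

options = webdriver.ChromeOptions()
# No visible browser window.
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# 2 = block images, which are not needed for extracting hrefs.
options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})
options.add_argument('user-agent=%s' % headers['user-agent'])
driver = webdriver.Chrome(executable_path=GetYktUrlAjax.driver_path, options=options)

Beyond that, reusing a single browser instance for the whole crawl (as the __main__ block above already does) helps, and selenium could be reserved as a fallback for only those pages the plain requests pass failed to cover.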
