爬取網站所有頁面的url,可通過獲取<a>標籤裏面的href的方式,進行遞歸操作,從而獲取到整個網站的url。拿到url後,可請求該url,根據頁面返回的狀態碼來簡單驗證頁面是否可正常打開。這裏需要注意的是,需要將抓取到的url進行重複過濾,避免存入重複的url,導致無限遞歸。
由於不同網頁獲取數據的方式不同,有的是靜態獲取,有的是動態獲取,所以需要根據網頁的類型,使用不同的方法進行數據的抓取。那麼,怎麼判斷是否是通過ajax獲取到的數據呢?可以右鍵查看網頁源代碼,在源代碼中搜索需要的數據,如果能搜索出來,則是靜態獲取,如果搜索不出來,那麼通常是通過ajax請求獲取的數據。
下面通過抓取雲課堂的頁面url的例子來進行簡單的示例:
1、靜態頁面抓取頁面url
不是通過ajax請求獲取到的數據的url,都可以通過靜態的方式抓取。代碼如下:
from lxml import etree
import requests
import os
home_url = "https://study.163.com"
HTTP='https:'
headers = {
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
}
all_urls = []
def get_url(current_url):
urls = []
resp = requests.get(current_url, headers=headers)
status_code = resp.status_code
if status_code != 200:
print("當前頁面狀態值不等於200!頁面url爲:%s" %current_url )
with open("error_url.txt",'a+') as fp:
fp.write(str(status_code) + "::")
fp.write(current_url + "\n")
fp.close()
htmlElement = etree.HTML(resp.content.decode('utf-8'))
aList = htmlElement.xpath('//a/@href')
if len(aList) == 0:
print("當前頁面未檢測到url")
return
for a in aList:
if a.startswith("//study.163.com") or a.startswith("//*.study.163.com"):
url = HTTP + a
urls.append(url)
elif a.startswith("http://study.163.com") or a.startswith("https://study.163.com") or a.startswith(
"http://*.study.163.com") or a.startswith("https://*.study.163.com"):
url = a
urls.append(url)
else:
url = ""
if url:
urls.append(url)
for url in urls:
if url not in all_urls:
print(url)
all_urls.append(url)
if not os.path.exists("ykt_url.txt"):
os.system(r"type nul>ykt_url.txt")
with open("ykt_url.txt", 'a+') as fp:
fp.write(url + "\n")
fp.close()
get_url(url)
2、動態頁面抓取頁面url
通過上述抓取會發現,有很多頁面的url並沒有抓取到,說明很多頁面數據是通過ajax請求返回的,這部分數據是動態獲取的,通過上述方式進行抓取是獲取不到這部分數據的,從而丟失大量頁面請求。
對於有ajax請求的頁面,可通過selenium來實現。通過webdriver啓動頁面,模擬正常頁面訪問,從而達到獲取動態數據的目的。對頁面進行判斷,對於需要登錄的頁面進行登錄操作,然後再進行數據的獲取。大體實現如下:
from lxml import etree
import requests
import os
from selenium import webdriver
import time
driver_path = r"D:\chromedriver\78\chromedriver.exe"
driver = webdriver.Chrome(driver_path)
home_url = "https://study.163.com"
HTTP='https:'
headers = {
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
all_urls = []
class GetYktUrlAjax():
driver_path = r"D:\chromedriver\78\chromedriver.exe"
def __init__(self):
options = webdriver.ChromeOptions()
options.add_argument('user-agent=%s' %headers['user-agent'])
self.driver = webdriver.Chrome(executable_path=GetYktUrlAjax.driver_path,options=options)
def click_agree(self):
try:
self.driver.find_element_by_xpath("//span[@class='ux-btn th-bk-main ux-btn- ux-btn- ux-modal-btn um-modal-btn_ok th-bk-main']").click()
except:
pass
def get_url_status(self,url):
try:
resp = requests.get(url,headers=headers)
status_code = resp.status_code
except:
print("當前url請求異常,返回一個自定義的code 9999")
status_code = 9999
return status_code
def get_url(self,my_current_url):
urls = []
self.driver.get(my_current_url)
time.sleep(2)
self.click_agree()
# 如果重定向到登錄頁面,則進行登錄操作
if self.driver.current_url.startswith("https://study.163.com/member/login.htm"):
print("進入登錄頁面")
time.sleep(1)
self.driver.find_element_by_xpath('//ul[@class="ux-tabs-underline_hd"]/li[last()]').click()
div = self.driver.find_element_by_id("j-ursContainer-0")
iframe = div.find_element_by_xpath(".//iframe")
self.driver.switch_to.frame(iframe)
time.sleep(1)
username = self.driver.find_element_by_xpath("//input[@name='email']")
username.clear()
username.send_keys("[email protected]")
password = self.driver.find_element_by_xpath("//input[@name='password']")
password.clear()
password.send_keys("******")
login_btn = self.driver.find_element_by_id('dologin')
login_btn.click()
self.driver.switch_to.default_content()
time.sleep(3)
time.sleep(0.5)
try:
self.driver.maximize_window()
except:
pass
# 判斷頁面url請求狀態
status_code = self.get_url_status(my_current_url)
if status_code != 200:
print("當前頁面狀態有誤,請及時查看!頁面url: %s" %my_current_url)
if not os.path.exists("error_url.txt"):
os.system(r"type nul>error_url.txt")
with open("error_url.txt", 'a+') as fp:
fp.write(my_current_url + "\n")
source = self.driver.page_source
# print(source)
htmlElement = etree.HTML(source)
aList = htmlElement.xpath('//a/@href')
if len(aList) == 0:
print("當前頁面未檢測到url")
return
for a in aList:
if a.startswith("//study.163.com") or a.startswith("//mooc.study.163.com") or a.startswith("//course.study.163.com"):
url = HTTP + a
elif a.startswith("http://study.163.com") or a.startswith("https://study.163.com") or a.startswith(
"http://mooc.study.163.com") or a.startswith("https://mooc.study.163.com") or a.startswith(
"http://course.study.163.com") or a.startswith("https://course.study.163.com"):
url = a
else:
url = ""
if url and url not in all_urls:
print(url)
all_urls.append(url)
if not os.path.exists("ykt_url_ajax.txt"):
os.system(r"type nul>ykt_url_ajax.txt")
with open("ykt_url_ajax.txt", 'a+') as fp:
fp.write(url + "\n")
fp.close()
for real_url in urls:
self.get_url(url)
def read_file_to_urls():
print("將文件中的url讀入all_urls中")
if not os.path.exists("ykt_url_ajax.txt"):
os.system(r"type nul>ykt_url_ajax.txt")
with open("ykt_url_ajax.txt",'r') as fr:
while True:
url = fr.readline()
if not url:
break
if url and (url != '\n') and (url not in all_urls):
all_urls.append(url.split('\n')[0])
if __name__ == '__main__':
read_file_to_urls()
if len(all_urls) == 0:
GetYktUrlAjax().get_url(home_url)
else:
for url in all_urls:
GetYktUrlAjax().get_url(url)
上述方法可實現動態網頁數據的獲取。但是由於selenium啓動頁面需要時間,所以如果需要抓取全站url,將會非常耗時,可考慮對其進行優化。