Scraping Baidu search result URLs with Selenium

I wrote a simple Selenium scraper that collects search result URLs, which should be handy for automated vulnerability testing. I would rather have used Google, since Google dork syntax feels far more useful than Baidu's, but I can't afford a proxy. The script drives a headless Chrome through Baidu's result pages, pulls each result's redirect link out with PyQuery, resolves it to the real target URL, and appends it to url.txt.

Code

# -*- coding: utf-8 -*-
"""
Created on Sat May  2 15:17:58 2020

@author: 14504
"""


from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import quote
from pyquery import PyQuery as pq 
import requests
import time

url_save_path = "./url.txt"
SearchInformation = "inurl: (admin)"   # Baidu search query
startPage = 1   # first page to crawl
endPage = 1     # last page to crawl

# run Chrome in headless mode
options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)

# browser = webdriver.Chrome()   # use this line instead to watch the browser
wait = WebDriverWait(browser, 10)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
    }



def searchURL(page):
    # page is Baidu's result offset: 0, 10, 20, ...
    pageSchema = "&pn=" + str(page)
    url = "https://www.baidu.com/s?wd=" + quote(SearchInformation) + pageSchema
    try:
        browser.get(url)
        urlnum = geturl()
        return urlnum

    except TimeoutException:
        print("Request timed out")
        return 0

def geturl():
    urlnum = 0
    html = browser.page_source
    doc = pq(html)
    # each organic result lives in a .result.c-container block
    items = doc('div#content_left .result.c-container').items()
    for item in items:
        BDurl = item.children('div.f13 > a').attr('href')
        real_url = urlDecode(BDurl)
        if real_url == "":
            print("none")
        else:
            saveTotxt(real_url)
            urlnum = urlnum + 1
    print("Scraped " + str(urlnum) + " URLs from this page\n")
    return urlnum
    
# resolve a Baidu redirect link to the real target URL
def urlDecode(BDurl):
    try:
        # Baidu result links are redirects; the target is in the Location header
        res = requests.get(BDurl, headers=headers, allow_redirects=False)
        Real_url = res.headers['Location']
        return Real_url
    except requests.exceptions.ConnectionError as e:
        print('ConnectionError', e.args)
        return ""

    except requests.exceptions.MissingSchema as e:
        print('Schema is none', e.args)
        return ""

    except Exception:
        return ""
        

def saveTotxt(real_url):
    with open(url_save_path, 'a', encoding='utf-8') as file:
        file.write(real_url)
        file.write("\n")

def main():
    urlsum = 0
    for page in range(startPage - 1, endPage):
        print("Crawling page " + str(page + 1))
        offset = page * 10   # Baidu's pn parameter counts results, 10 per page
        urlnum = searchURL(offset)
        urlsum = urlnum + urlsum
        time.sleep(1)

    print("Scraped " + str(urlsum) + " URLs in total")


main()
browser.quit()
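
Since every run appends to url.txt, duplicates pile up across runs. As a small follow-up, here is a minimal sketch (assuming the one-URL-per-line file written by saveTotxt above; the helper name is my own) that loads the results back and drops duplicates before handing them to a scanner:

# dedupe.py - minimal sketch: deduplicate the URLs collected in url.txt
# (assumes the one-URL-per-line layout written by saveTotxt above)

def load_unique_urls(path="./url.txt"):
    seen = set()
    unique = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            url = line.strip()
            if url and url not in seen:
                seen.add(url)
                unique.append(url)
    return unique

if __name__ == '__main__':
    urls = load_unique_urls()
    print("kept", len(urls), "unique URLs")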

 
