A Python Crawler Example: Collecting the Sub-sites of a Website

My goal is to refine an existing whitelist. Given a list of common domains (Baidu, Tencent, Sohu, and so on), the task is to crawl each domain and collect all of its sub-sites; Tencent, for example, also covers Tencent Video, WeChat, QQ, Tencent News, and more.

The input is an .xls file containing the whitelist of common domains; the output is an .xls file listing every site found while refining that whitelist.
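Judging from how GetXlsToDetail reads the sheet below (column 0 holds the site name, column 1 the bare domain), the input file is assumed to look roughly like this; the rows shown here are purely illustrative:

    百度    baidu.com
    騰訊    qq.com
    搜狐    sohu.com

The code is as follows: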

import tldextract, requests, xlwt, time, random, sys
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from xlrd import open_workbook

# Initialise the shared state: result lists, HTTP headers, and the output workbook
def init():
    global headers, workbook, table, row_now, get_url, todo_url, get_domin, count_layer
    get_url = []      # [url, title] pairs collected so far
    todo_url = []     # candidate URLs queued for a deeper crawl (unused in this version)
    get_domin = []    # netlocs already recorded, used to skip duplicates
    count_layer = 0
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
    }
    workbook = xlwt.Workbook(encoding='utf-8')
    table = workbook.add_sheet("name", cell_overwrite_ok=True)
    row_now = 0       # next free row in the output sheet

# Fetch the page at now_url, work out its real encoding, and return it parsed with BeautifulSoup
def GetUrlCodeBS(now_url):
    req = requests.get(now_url, headers=headers, timeout=10)
    # requests falls back to ISO-8859-1 when the response carries no charset header,
    # so in that case sniff the encoding from the page content instead
    if req.encoding == 'ISO-8859-1':
        encodings = requests.utils.get_encodings_from_content(req.text)
        if encodings:
            encoding = encodings[0]
        else:
            encoding = req.apparent_encoding
    else:
        encoding = req.encoding
    encode_content = req.content.decode(encoding, 'ignore').encode('utf-8', 'ignore')
    return BeautifulSoup(encode_content.decode('utf-8'), 'html.parser')
    
# Decide whether the given href is worth queueing as a candidate URL
# (only relative paths survive these filters; anything absolute-looking is skipped)
def TodoUrl(now_url, domain_url):
    global todo_url
    if 'javascript' in now_url:
        return 0
    elif '/' == now_url:
        return 0
    elif '#' in now_url:
        return 0
    elif 'com' in now_url:
        return 0
    elif 'cn' in now_url:
        return 0
    elif '{' in now_url:
        return 0
    else:
        todo_url.append(domain_url + now_url)

# Fetch the <title> text of the page at now_url; return False on any failure
def GetUrlTitle(now_url):
    try:
        soup = GetUrlCodeBS(now_url)
        return soup.head.title.text
    except Exception:
        return False

# Collect all sub-sites linked from the front page of domain_url
def GetSubdomain(domain_url, domain_name):
    global get_url, todo_url, get_domin, row_now
    now_domain = tldextract.extract(domain_url).domain
    soup = GetUrlCodeBS(domain_url)
    message = soup.find_all('a')
    # record the seed domain itself as the first entry
    get_url.append([domain_url, domain_name])
    for data in message:
        try:
            # skip anchors that have no href attribute
            if data.has_attr('href'):
                # handle the different kinds of href content separately
                if 'http' in data['href']:
                    data_url = urlparse(data['href']).scheme + '://' + urlparse(data['href']).netloc
                    # if the link's registered domain matches the current page's,
                    # treat the link as a sub-site or sibling site of this page
                    if tldextract.extract(data_url).domain == now_domain:
                        # discard the URL if its host has already been recorded
                        if urlparse(data_url).netloc in get_domin:
                            continue
                        # use the anchor text as the title; skip links whose text
                        # is empty or too long to be a site name
                        if len(data.text) > 10 or len(data.text) == 0:
                            continue
                        # an earlier variant fetched the real page title instead:
                        # if '/' in data['href'].replace('//', ''):
                        #     url_title = GetUrlTitle(data_url)
                        #     if url_title == False:
                        #         continue
                        url_title = data.text
                        get_domin.append(urlparse(data_url).netloc)
                        get_url.append([data_url, url_title])
                        table.write(row_now, 0, url_title)
                        table.write(row_now, 1, data_url)
                        row_now = row_now + 1
                elif 'www' in data['href']:
                    # protocol-relative links such as //www.example.com
                    data_url = 'https://' + urlparse(data['href']).netloc
                    if tldextract.extract(data_url).domain == now_domain:
                        if urlparse(data_url).netloc in get_domin:
                            continue
                        if len(data.text) > 10 or len(data.text) == 0:
                            continue
                        url_title = data.text
                        get_domin.append(urlparse(data_url).netloc)
                        get_url.append([data_url, url_title])
                        table.write(row_now, 0, url_title)
                        table.write(row_now, 1, data_url)
                        row_now = row_now + 1
                # else:
                #     TodoUrl(data['href'], domain_url)  # queue relative links for a deeper crawl
        except Exception:
            continue
    # for url_new in todo_url:
    #     time.sleep(random.random() * 10)
    #     GetSubdomain_Sub(url_new)

# Read the domain list from the .xls file and refine each entry in turn
def GetXlsToDetail():
    global get_url, get_domin
    # use a distinct name so the global output workbook is not shadowed
    in_workbook = open_workbook(r'../changjianyuming/res_data/常見域名列表_綜合其他.xls')
    sheet = in_workbook.sheet_by_index(0)
    for i in range(sheet.nrows):
        name, domain = sheet.row_values(i)[0], sheet.row_values(i)[1]
        # try the URL variants in order until one of them can be fetched
        for prefix in ('https://www.', 'http://www.', 'https://', 'http://'):
            try:
                GetSubdomain(prefix + domain, name)
                break
            except Exception:
                continue
        else:
            print('Failed to fetch the page for ' + domain + '!')
            continue
        # reset the per-domain bookkeeping before the next entry
        get_url = []
        get_domin = []
        

# Main entry point
if __name__ == '__main__':
    init()
    GetXlsToDetail()
    workbook.save('../changjianyuming/res_data/常見域名列表_綜合其他_細化.xls')
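One caveat about the helpers above: TodoUrl rejects any href containing the substring 'com' or 'cn', which also throws away harmless relative paths such as /community. A minimal sketch of a stricter filter, using urllib.parse.urljoin to resolve relative links instead of substring checks (normalize_link is a hypothetical helper, not part of the script above):

from urllib.parse import urljoin, urlparse

def normalize_link(base_url, href):
    # drop javascript pseudo-links, bare fragments, and template placeholders
    if href.startswith(('javascript:', '#')) or '{' in href:
        return None
    # urljoin resolves relative paths and protocol-relative //host links
    absolute = urljoin(base_url, href)
    parsed = urlparse(absolute)
    if parsed.scheme not in ('http', 'https') or not parsed.netloc:
        return None
    return parsed.scheme + '://' + parsed.netloc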

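To try the crawler on a single site before feeding it the whole whitelist, something along these lines should work (the qq.com seed mirrors the Tencent example from the introduction; the output file name is arbitrary):

init()
GetSubdomain('https://www.qq.com', '騰訊')
workbook.save('qq_test.xls')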
 
