筆者的目的是對已有的白名單進行細化處理。比如現在有常見域名名單(百度、騰訊、搜狐等等),筆者要做的是對每一個域名爬取其所有的子網站,比如騰訊對應的還有騰訊視頻、微信、QQ、騰訊新聞等等。
筆者的輸入是一個包含常見域名白名單的xls文件,輸出是一個包含白名單細化後的所有網站的xls文件。代碼如下:
import tldextract, requests, xlwt, time, random, sys
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from xlrd import open_workbook
# Initialise the module-level state shared by all crawler functions.
def init():
    """Create the global crawl state and the output xls workbook.

    Globals created:
      headers     -- HTTP headers (desktop Chrome UA) sent with every request
      workbook    -- xlwt workbook the results are written into
      table       -- the single sheet ("name") inside workbook
      row_now     -- next free row index in table
      get_url     -- list of [url, title] pairs collected so far
      todo_url    -- candidate URLs queued by TodoUrl for a deeper crawl
      get_domin   -- netlocs already recorded, used for de-duplication
      count_layer -- crawl-depth counter (currently only initialised)
    """
    # NOTE(review): the original also declared a global 'url' here, but it
    # was never assigned or read anywhere in the file, so it is dropped.
    global headers, workbook, table, row_now, get_url, todo_url, get_domin, count_layer
    get_url = []
    todo_url = []
    get_domin = []
    count_layer = 0
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
    }
    workbook = xlwt.Workbook(encoding='utf-8')
    table = workbook.add_sheet("name", cell_overwrite_ok=True)
    row_now = 0
# Fetch a URL and return the page parsed by BeautifulSoup.
def GetUrlCodeBS(now_url):
    """Download *now_url* and return a BeautifulSoup of its content.

    Handles the common mojibake case for Chinese sites: when the server
    sends no charset, requests falls back to ISO-8859-1, so the real
    encoding is recovered from the document itself (<meta charset>) or,
    failing that, from chardet's guess (apparent_encoding).

    Raises whatever requests raises on network errors or timeouts;
    callers are expected to catch those.
    """
    # timeout added so one dead host cannot hang the whole crawl forever.
    req = requests.get(now_url, headers=headers, timeout=10)
    if req.encoding == 'ISO-8859-1':
        # No charset in the HTTP headers -- inspect the page content.
        encodings = requests.utils.get_encodings_from_content(req.text)
        encoding = encodings[0] if encodings else req.apparent_encoding
    else:
        encoding = req.encoding
    # Decode once with the detected encoding. The original's extra
    # encode('utf-8')/decode('utf-8') round trip was a no-op, since any
    # decoded str encodes losslessly to UTF-8.
    # The parser is named explicitly so bs4 does not guess (and warn),
    # and results do not depend on which parsers happen to be installed.
    return BeautifulSoup(req.content.decode(encoding, 'ignore'), 'html.parser')
# Decide whether a raw href should be queued for a deeper crawl.
def TodoUrl(now_url, domain_url):
    """Screen the href value *now_url*.

    Returns 0 (rejected) for javascript pseudo-links, a bare '/',
    in-page anchors ('#'), absolute-looking links (containing 'com' or
    'cn') and template placeholders ('{'); otherwise appends
    domain_url + now_url to the global todo_url queue.
    """
    global todo_url
    if now_url == '/':
        return 0
    # Any one of these substrings disqualifies the link.
    for needle in ('javascript', '#', 'com', 'cn', '{'):
        if needle in now_url:
            return 0
    todo_url.append(domain_url + now_url)
# Return the <title> text of a page, or False when it cannot be fetched.
def GetUrlTitle(now_url):
    """Best-effort lookup of the page title of *now_url*.

    Returns the title string, or False when the page cannot be fetched
    or has no <head><title>. The False-on-failure contract of the
    original is kept so callers can test the result truthily.
    """
    try:
        soup = GetUrlCodeBS(now_url)
        return soup.head.title.text
    except Exception:
        # Narrowed from a bare 'except:': a bare except also swallows
        # KeyboardInterrupt/SystemExit, which should abort the crawl.
        return False
# Collect the sub-sites of one whitelist domain: scan the landing page's
# <a> links and record those whose registered domain matches the page's own.
def _record_site(data_url, link_tag, now_domain):
    """Record data_url as a found sub-site when it belongs to now_domain
    and has not been seen before.

    The anchor's text is used as the site title; anchors with empty text
    or text longer than 10 characters are skipped (too long to be a site
    name). On success one row (title, url) is written to the xls sheet
    and the netloc is remembered for de-duplication.
    """
    global get_url, get_domin, row_now
    if tldextract.extract(data_url).domain != now_domain:
        return
    netloc = urlparse(data_url).netloc
    if netloc in get_domin:
        return
    title = link_tag.text
    if len(title) > 10 or len(title) == 0:
        return
    get_domin.append(netloc)
    get_url.append([data_url, title])
    table.write(row_now, 0, title)
    table.write(row_now, 1, data_url)
    # BUG FIX: row_now was missing from the original's 'global'
    # declaration, so this assignment made it function-local and
    # table.write raised UnboundLocalError -- silently swallowed by the
    # bare except, leaving the output sheet empty.
    row_now += 1

def GetSubdomain(domain_url, domain_name):
    """Fetch domain_url and record every same-domain sub-site linked
    from it (title + URL) via _record_site.

    domain_url  -- full URL of a whitelist entry, e.g. 'https://www.qq.com'
    domain_name -- human-readable name from the xls whitelist

    Raises on network failure of the entry page itself; individual
    malformed links are skipped best-effort.
    """
    global get_url
    now_domain = tldextract.extract(domain_url).domain
    soup = GetUrlCodeBS(domain_url)
    # The whitelist entry itself is always recorded first.
    get_url.append([domain_url, domain_name])
    for link in soup.find_all('a'):
        try:
            # Skip anchors without an href attribute. (The original used
            # hasattr(), which is always true on bs4 Tags because
            # Tag.__getattr__ does a child-tag find and returns None;
            # has_attr() is the correct membership test.)
            if not link.has_attr('href'):
                continue
            href = link['href']
            if 'http' in href:
                parts = urlparse(href)
                _record_site(parts.scheme + '://' + parts.netloc, link, now_domain)
            elif 'www' in href:
                # Scheme-less 'www' links: assume https, as the original did.
                _record_site('https://' + urlparse(href).netloc, link, now_domain)
        except Exception:
            # Best-effort crawl: any single malformed link is skipped.
            continue
# Read the whitelist xls and refine each entry into its sub-sites.
def GetXlsToDetail():
    """For every (name, domain) row of the whitelist spreadsheet, try a
    few URL spellings until one can be crawled by GetSubdomain.

    Tries https/http each with and without a 'www.' prefix, in the same
    order as the original nested try/except pyramid; prints a failure
    message when every variant fails. Resets the per-domain bookkeeping
    lists after each row.
    """
    global get_url, get_domin
    # Renamed from 'workbook' so the module-level output workbook
    # (saved by the main block) is not shadowed inside this function.
    source_book = open_workbook(r'../changjianyuming/res_data/常見域名列表_綜合其他.xls')
    sheet = source_book.sheet_by_index(0)
    for i in range(sheet.nrows):
        row = sheet.row_values(i)
        name, domain = row[0], row[1]
        for prefix in ('https://www.', 'http://www.', 'https://', 'http://'):
            try:
                GetSubdomain(prefix + domain, name)
                break
            except Exception:
                continue
        else:
            # Every URL variant failed for this row.
            print('獲取' + domain + '網頁代碼失敗!')
        get_url = []
        get_domin = []
# Entry point: build the global state, refine the whitelist, then save
# the collected (title, url) rows to a new xls file.
if __name__ == '__main__':
    init()
    GetXlsToDetail()
    workbook.save('../changjianyuming/res_data/常見域名列表_綜合其他_細化.xls')