Python Crawler: Querying the China National Vulnerability Database of Information Security (CNNVD)

This post is for technical discussion only. If it infringes any rights, please contact the author and it will be removed.

In this post, I crawl the China National Vulnerability Database of Information Security (CNNVD) to collect vulnerability information, saving each vulnerability entry as a separate JSON file.
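
All of the snippets below rely on a few module-level names (headers, url, all_page) and an init() helper that the post itself does not show. The following is a minimal sketch of that shared setup; the User-Agent value and the output-directory handling are assumptions, not the original author's code:

# Shared setup assumed by the snippets below (a sketch; the original init() is not shown)
import os
import json
import time
import requests
from bs4 import BeautifulSoup

url = 'http://www.cnnvd.org.cn'          # base URL; detail pages are linked with relative paths
headers = {'User-Agent': 'Mozilla/5.0'}  # assumed minimal request headers
all_page = 1                             # filled in by get_all_page()

def init():
    # Assumed: create the output directory for the JSON files if it does not exist
    os.makedirs('./cve_json', exist_ok=True)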

1. Getting the Total Page Count

First, get the total number of pages so that a for loop can walk through every page of vulnerability data:

# Get the total number of pages
def get_all_page():
    global all_page
    req = requests.get('http://www.cnnvd.org.cn/web/vulnerability/querylist.tag', headers=headers, timeout=40)
    soup = BeautifulSoup(req.text, "lxml")
    # The pager link text contains the total entry count after a full-width colon
    message = soup.find('div', class_='page').find('a')
    if not hasattr(message, 'text'):
        all_page = 1
    else:
        # 10 entries per page, so derive the page count from the total entry count
        all_page = int(int(message.text.split(':')[1].replace(',', '')) / 10) + 1
2. Getting All Vulnerability Links on the Current Page

After getting the total page count, collect the URL of each vulnerability on the current page and process them one by one. Exception handling is added here: when an entry still fails after several attempts, its index on the page is printed:

# Get all vulnerability links on the current page
def get_now_page_all_url(now_url):
    req = requests.get(now_url, headers=headers, timeout=40)
    soup = BeautifulSoup(req.text, "lxml")
    # Each <li> in the result list is one vulnerability entry
    message = soup.find('div', class_='list_list').find('ul').find_all('li')
    j = 0
    for data in message:
        i = 0
        while True:
            try:
                get_vulnerability_detail(data.div.a['href'])
            except Exception:
                if i > 3:
                    # Give up on this entry after repeated failures and print its index
                    print(str(j) + '***', end='\t')
                    break
                i = i + 1
                continue
            break
        j = j + 1
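
The retry counter above can also be factored into a small helper that retries an arbitrary callable a fixed number of times. A minimal sketch follows; the helper name, the attempt count, and the back-off delay are my own choices, not from the original post:

# Hypothetical helper: call func(*args) and retry up to `attempts` times before giving up
def fetch_with_retry(func, *args, attempts=3):
    for attempt in range(attempts):
        try:
            return func(*args)
        except Exception:
            # Wait briefly before the next attempt
            time.sleep(1)
    # Every attempt failed; let the caller decide how to report it
    raise RuntimeError('failed after %d attempts' % attempts)

# Example use inside the loop above:
# fetch_with_retry(get_vulnerability_detail, data.div.a['href'])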

3. Extracting the Vulnerability Details and Saving Them to a JSON File

When extracting the details of a vulnerability, each field is checked individually; if a field is missing or empty, it defaults to '暫無' ("not available"). The fields are collected into a dict and then serialized to JSON:

# Get the details of one vulnerability and save them to a JSON file
def get_vulnerability_detail(url_now):
    req = requests.get(url + url_now, headers=headers, timeout=40)
    soup = BeautifulSoup(req.text, "lxml")
    info = {}
    detail = soup.find('div', class_='detail_xq w770')
    # Vulnerability title
    info["cve_name"] = detail.find('h2').text
    # CNNVD number: the span text contains a full-width colon, keep the part after it
    if not hasattr(detail.find('ul').span, 'text'):
        info["cnnvd_no"] = '暫無'
    elif detail.find('ul').find('span').text == '':
        info["cnnvd_no"] = '暫無'
    else:
        info["cnnvd_no"] = detail.find('ul').find('span').text.split(':')[1]
    # The remaining <li> items map to: cnnvd_level, cve_no, catag, start_time,
    # threat_cata, update_time, company, and from
    message = detail.find('ul').find_all('li')
    if not hasattr(message[1].a, 'text'):
        info["cnnvd_level"] = '暫無'
    elif message[1].a.text.replace('	','').replace('\n','').replace('\r','')== '':
        info["cnnvd_level"] = '暫無'
    else:
        info["cnnvd_level"] = message[1].a.text.replace('	','').replace('\n','').replace('\r','')
    if not hasattr(message[2].a, 'text'):
        info["cve_no"] = info["cnnvd_no"]
    elif message[2].a.text.replace('	','').replace('\n','').replace('\r','')== '':
        info["cve_no"] = info["cnnvd_no"]
    else:
        info["cve_no"] = message[2].a.text.replace('	','').replace('\n','').replace('\r','')
    if not hasattr(message[3].a, 'text'):
        info["catag"] = '暫無'
    elif message[3].a.text.replace('	','').replace('\n','').replace('\r','')== '':
        info["catag"] = '暫無'
    else:
        info["catag"] = message[3].a.text.replace('	','').replace('\n','').replace('\r','')
    if not hasattr(message[4].a, 'text'):
        info["start_time"] = '暫無'
    elif message[4].a.text.replace('	','').replace('\n','').replace('\r','')== '':
        info["start_time"] = '暫無'
    else:
        info["start_time"] = message[4].a.text.replace('	','').replace('\n','').replace('\r','')
    if not hasattr(message[5].a, 'text'):
        info["threat_cata"] = '暫無'
    elif message[5].a.text.replace('	','').replace('\n','').replace('\r','') == '':
        info["threat_cata"] = '暫無'
    else:
        info["threat_cata"] = message[5].a.text.replace('	','').replace('\n','').replace('\r','')
    if not hasattr(message[6].a, 'text'):
        info["update_time"] = '暫無'
    elif message[6].a.text.replace('	','').replace('\n','').replace('\r','')== '':
        info["update_time"] = '暫無'
    else:
        info["update_time"] = message[6].a.text.replace('	','').replace('\n','').replace('\r','') 
    if not hasattr(message[7].a, 'text'):
        info["company"] = '暫無'
    elif message[7].a.text.replace('	','').replace('\n','').replace('\r','')== '':
        info["company"] = '暫無'
    else:
        info["company"] = message[7].a.text.replace('	','').replace('\n','').replace('\r','')
    if not hasattr(message[8].a, 'text'):
        info["from"] = '暫無'
    elif message[8].a.text.replace('	','').replace('\n','').replace('\r','')== '':
        info["from"] = '暫無'
    else:
        info["from"] = message[8].a.text.replace('	','').replace('\n','').replace('\r','')
    # Vulnerability description paragraphs
    message_0 = soup.find('div', class_='d_ldjj').find_all('p')
    introduction = ''
    i = 0
    for data_0 in message_0:
        introduction = introduction + data_0.text.replace('	','').replace('\n','').replace('\r','')
        i = i + 1
    if i == 0 or introduction=='':
        info["introduction"] = '暫無'
    else:
        info["introduction"] = introduction.replace('\n','')
    # Vulnerability bulletin paragraphs, joined with '|'
    message_1 = soup.find('div', class_='d_ldjj m_t_20').find_all('p')
    bulletin = ''
    i = 0
    for data_1 in message_1:
        if i > 0:
            bulletin = bulletin + '|'
        bulletin = bulletin + data_1.text.replace('	','').replace('\n','').replace('\r','')
        i = i + 1
    if i == 0 or bulletin=='':
        info["bulletin"] = '暫無'
    else:
        info["bulletin"] = bulletin.replace('\n','')
    # Reference links, joined with '|'
    message_2 = soup.find_all('div', class_='d_ldjj m_t_20')[1].find_all('p')
    reference = ''
    i = 0
    for data_2 in message_2:
        if i > 0:
            reference = reference + '|'
        reference = reference + data_2.text.replace('	','').replace('\n','').replace('\r','')        
        i = i + 1
    if i == 0 or reference=='':
        info["reference"] = '暫無'
    else:
        info["reference"] = reference.replace('\n','')
    # Affected products, joined with '|'
    message_3 = soup.find_all('div', class_='d_ldjj m_t_20')[2].find_all('li')
    victim = ''
    i = 0
    for data_3 in message_3:
        if i > 0:
            victim = victim + '|'
        victim = victim + data_3.div.a.text.replace('	','').replace('\n','').replace('\r','')
        i = i+ 1
    if i == 0 or victim=='':
        info["victim"] ='暫無'
    else:
        info["victim"] = victim.replace('\n','')
    # Patch information, joined with '|'
    message_4 = soup.find_all('div', class_='d_ldjj m_t_20')[3].find_all('li')
    patch = ''
    i = 0
    for data_4 in message_4:
        if i > 0:
            patch = patch + '|'
        patch = patch + data_4.div.a.text.replace('	','').replace('\n','').replace('\r','')
        i = i+ 1
    if i == 0 or patch=='':
        info["patch"] ='暫無'
    else:
        info["patch"] = patch
    # Serialize the collected fields and write them to ./cve_json/<cve_no>.json
    jsonData = json.dumps(info, ensure_ascii=False)
    with open('./cve_json/' + str(info["cve_no"]) + '.json', 'w', encoding='utf-8') as fileObject:
        fileObject.write(jsonData)
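
Every metadata field above follows the same pattern: take the link text of an <li>, strip whitespace characters, and fall back to a default when the value is missing or empty. The repeated if/elif/else blocks could therefore be collapsed into a small helper. This is only a sketch of that idea, assuming the same tag layout the original code expects:

# Sketch: cleaned link text of one metadata <li>, with a default for missing or empty values
def li_text(item, default='暫無'):
    if not hasattr(item.a, 'text'):
        return default
    text = item.a.text.replace('\t', '').replace('\n', '').replace('\r', '')
    return text if text != '' else default

# Usage inside get_vulnerability_detail, after message = ...find_all('li'):
# info["cnnvd_level"] = li_text(message[1])
# info["cve_no"] = li_text(message[2], default=info["cnnvd_no"])
# info["catag"] = li_text(message[3])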

4. The Main Function

The main function loops over every page. When a page fails to load because of network problems, it is retried until it has been fetched successfully.

# Main entry point
if __name__ == '__main__':
    init()
    get_all_page()
    # The start page is hard-coded here (e.g. to resume an interrupted crawl);
    # use range(1, all_page + 1) to crawl from the first page
    for now_page in range(4219, all_page + 1):
        while True:
            try:
                print(now_page, end='\t')
                get_now_page_all_url('http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=' + str(now_page) + '&repairLd=')
                print('end', end='\t')
            except Exception:
                # Wait briefly and retry the same page on failure
                print('error', end='\t')
                time.sleep(3)
                continue
            break
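
Because a full crawl takes a long time, the hard-coded start page in the loop above can be replaced by a small checkpoint file so that an interrupted run resumes where it left off. The following is a sketch of that idea; the checkpoint file name and helper names are my own, not from the original post:

# Hypothetical resume helpers: remember the last successfully crawled page in a text file
CHECKPOINT = './last_page.txt'

def load_start_page():
    try:
        with open(CHECKPOINT) as f:
            return int(f.read().strip()) + 1
    except (OSError, ValueError):
        return 1

def save_last_page(page):
    with open(CHECKPOINT, 'w') as f:
        f.write(str(page))

In the main loop, the range would then start at load_start_page(), and save_last_page(now_page) would be called once a page has been processed successfully.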

 
