一、工程目錄

二、原理解析

Hive和Impala是兩個最常用的大數據查詢工具。它們的主要區別是：Hive適合對實時性要求不太高的業務，對資源的要求較低；而Impala由於採用了全新的架構，處理速度非常快，但相應地對資源的消耗也比較大，適合實時性要求高的業務。

在我測試過程中發現，有些時候即使通過shell命令檢測到Hive或者Impala的進程正在運行，卻無法訪問它們的web頁面，這其實也不能算是正常運行，因爲內部還是無法完成任務。利用這一點，我們可以通過爬蟲工具BeautifulSoup獲取Hive和Impala的web頁面信息，繼而轉化成字符串，丟到Grafana裏，即可實現監控功能。

小弟也是新手一個，代碼也只是僅僅實現了功能而已，大神看了希望不要見笑哈~~

三、代碼解析

1、findkeytab.py

import os
import re

def findkeytab(process_dir='/var/run/cloudera-scm-agent/process'):
    """Return the hive.keytab path of the newest HiveServer2 process directory.

    Directory names under *process_dir* start with a numeric id followed by
    ``-`` and end with ``hive-HIVESERVER2``; the directory with the largest id
    is taken as the current one (per the original author's note, CDH creates a
    new numbered directory on every restart).

    Args:
        process_dir: directory to scan; defaults to the CDH agent process path.

    Returns:
        ``<process_dir>/<newest dir>/hive.keytab`` as a string.

    Raises:
        OSError: if *process_dir* cannot be listed.
        IOError: if no HiveServer2 process directory exists. The previous
            implementation silently returned a path built from the first
            directory entry in that case.
    """
    # Compiled once; the numeric prefix is captured so non-numeric names are
    # ignored instead of crashing int().
    pattern = re.compile(r'^(\d+)-.*hive-HIVESERVER2$')
    newest_id = -1
    newest_dir = None
    for entry in os.listdir(process_dir):
        matched = pattern.match(entry)
        if matched is None:
            continue
        process_id = int(matched.group(1))
        if process_id > newest_id:
            newest_id = process_id
            newest_dir = entry
    if newest_dir is None:
        raise IOError('no hive-HIVESERVER2 process directory found under ' + process_dir)
    # Assemble the keytab path
    return process_dir + '/' + newest_dir + '/hive.keytab'

2、kerberos2.py

import requests
import commands
import kerberos

class KerberosTicket:
    """Obtain a Kerberos ticket with kinit and prepare a Negotiate header.

    After construction ``auth_header`` holds ``"Negotiate <token>"`` built
    from the GSSAPI client context for *service*.
    """

    def __init__(self, service, keytab, pricipal):
        """Run ``kinit -kt <keytab> <pricipal>`` and start the GSSAPI handshake.

        Args:
            service: service principal passed to ``kerberos.authGSSClientInit``.
            keytab: path of the keytab file handed to kinit.
            pricipal: principal to authenticate as (parameter name kept as-is
                for compatibility with existing callers).

        Exits the process with status 1 when kinit cannot be run or fails.
        """
        # Imported locally so this block does not depend on the file's import list.
        import subprocess
        import sys

        # Pass an argument list (no shell) so paths or principals containing
        # spaces or shell metacharacters are handed to kinit verbatim.
        try:
            proc = subprocess.Popen(['kinit', '-kt', keytab, pricipal],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT,
                                    universal_newlines=True)
            output = proc.communicate()[0]
            status = proc.returncode
        except OSError as err:
            # kinit is not installed / not executable
            status, output = 1, str(err)

        if output.endswith('\n'):
            output = output[:-1]

        if status != 0:
            print ("kinit ERROR:")
            print (output)
            # Non-zero status so supervisors can tell the start-up failed.
            sys.exit(1)

        __, krb_context = kerberos.authGSSClientInit(service)
        kerberos.authGSSClientStep(krb_context, "")
        self._krb_context = krb_context
        self.auth_header = ("Negotiate " + kerberos.authGSSClientResponse(krb_context))

    def verify_response(self, auth_header):
        """Finish the handshake with the Negotiate details found in *auth_header*.

        Args:
            auth_header: comma-separated authentication fields (presumably the
                server's WWW-Authenticate value -- confirm against the caller).

        Raises:
            ValueError: if no ``Negotiate`` field is present.
            RuntimeError: if this ticket was already used for verification.
        """
        # Handle comma-separated lists of authentication fields
        for field in auth_header.split(","):
            kind, __, details = field.strip().partition(" ")
            if kind.lower() == "negotiate":
                auth_details = details.strip()
                break
        else:
            raise ValueError("Negotiate not found in %s" % auth_header)
        # Finish the Kerberos handshake; the context is single-use, so it is
        # cleared before the final step.
        krb_context = self._krb_context
        if krb_context is None:
            raise RuntimeError("Ticket already used for verification")
        self._krb_context = None
        kerberos.authGSSClientStep(krb_context, auth_details)

3、get_clustername.py

#!/usr/bin/env python
def enabled():
    """Return the enabled flag of this module, which is always False."""
    return False

# Predefined mapping of cluster name -> list of IP specs. A spec is either
# "a.b.c.x-y" (inclusive range on the last octet) or a single "a.b.c.x".
cluster_dict = {
    'cluster1': ["192.168.159.11-13"],
}


def _expand_ip_spec(spec):
    """Expand one IP spec from cluster_dict into the list of addresses it covers."""
    prefix, _, last_octet = spec.strip().rpartition('.')
    start, dash, end = last_octet.partition('-')
    first = int(start)
    # A spec without '-' describes exactly one address.
    final = int(end) if dash else first
    return ['%s.%d' % (prefix, octet) for octet in range(first, final + 1)]


def get_cluster_name(ip):
    """Return the name of the cluster whose IP specs contain *ip*.

    Matching is done on the textual form of the address, exactly as the
    expanded specs spell it.

    Args:
        ip: dotted IPv4 address as a string.

    Returns:
        The matching key of ``cluster_dict``, or the string ``"null"`` when no
        cluster contains the address. If several clusters match, the last one
        encountered while iterating ``cluster_dict`` is returned.
    """
    cluster_name = "null"
    for name, specs in cluster_dict.items():
        for spec in specs:
            if ip in _expand_ip_spec(spec):
                cluster_name = str(name)
    return cluster_name

4、hive_jmx.py

import requests
import kerberos
import socket

# Output all metric values (for testing only)
'''for i in range(len(js['beans'])):
   prename=js['beans'][i]['name'].split('=')[1]
   for key in (js['beans'][i]):
       if key == "modelerType":
           continue
       lastname=key
       name=prename+'.'+lastname
       timestamp = str(int(time.time()))
       value = str(js['beans'][i][key])
       tag='hive'
       print name+' '+timestamp+' '+value+' '+tag'''

# Output only the selected metric values

def hiveserver2_jmx(url):
    """Fetch the HiveServer2 JMX JSON and return the metrics listed in /root/test.txt.

    Each line of /root/test.txt holds a metric name and an attribute name
    separated by a single space. Lines with fewer than two fields (for example
    blank lines) are skipped.

    Args:
        url: URL of the JMX endpoint, requested with a Kerberos Negotiate header.

    Returns:
        A list of ``[metric_name, value_as_str, {}]`` entries, one for every
        bean named ``metrics:name=<metric_name>`` that matches a listed metric,
        ordered by bean and then by line. The trailing empty dict is a
        placeholder the caller fills with tags.
    """
    # Kerberos authentication
    # NOTE(review): "[email protected]" looks like an e-mail-obfuscation
    # artefact of the page this code was copied from; presumably the real
    # value is HTTP/<host>@<REALM> -- confirm before running.
    __, krb_context = kerberos.authGSSClientInit("HTTP/[email protected]")
    kerberos.authGSSClientStep(krb_context, "")
    negotiate_details = kerberos.authGSSClientResponse(krb_context)
    headers = {"Authorization": "Negotiate " + negotiate_details}

    r = requests.get(url, headers=headers, verify=False)
    beans = r.json()['beans']

    # The attributes to monitor are kept in a file; parse it once and make
    # sure the handle is closed even if parsing fails.
    wanted = []
    with open("/root/test.txt") as metric_file:
        for line in metric_file:
            fields = line.split(' ')
            if len(fields) < 2:
                continue
            wanted.append((fields[0].strip(), fields[1].strip()))

    jmx_list = []
    for bean in beans:
        for metric_name, value_name in wanted:
            if bean['name'] == "metrics:name=" + metric_name:
                jmx_list.append([metric_name, str(bean[value_name]), {}])
    return jmx_list

5、hive_query.py

import requests
import kerberos
import time
from bs4 import BeautifulSoup


# Module-level state shared by the scraping functions in this module.
# Cleared and refilled by hiveserver2_jsp_last() on every call.
last_query_list = []
# Cleared and refilled by hiveserver2_jsp_open() on every call.
open_query_list = []
# Cleared and refilled by hiveserver2_jsp_session() on every call.
active_session_list = []
# Page-level totals ("Total number of ...") collected by the three scrapers.
total_dict = {}
# ClosedTimestamp remembered by hiveserver2_jsp_last() to skip queries already reported.
latest_query_timestamp=0

# Fetch a Hive monitoring page with a Kerberos Negotiate header and parse it with BeautifulSoup
def getsoup(url):
    """Request *url* and return the response body parsed with html5lib."""
    # NOTE(review): "[email protected]" looks like an e-mail-obfuscation
    # artefact of the page this code was copied from -- confirm the principal.
    _result, context = kerberos.authGSSClientInit("HTTP/[email protected]")
    kerberos.authGSSClientStep(context, "")
    token = kerberos.authGSSClientResponse(context)
    response = requests.get(
        url,
        headers={"Authorization": "Negotiate " + token},
        verify=False,
    )
    return BeautifulSoup(response.content, "html5lib")

# Collect the session information shown on the page
def hiveserver2_jsp_session(url):
    """Scrape the 'Active Sessions' table of the page at *url*.

    Refills the module-level ``active_session_list`` and returns it. Every row
    becomes a list holding the text of the cells after the third one, followed
    by a dict with ``UserName``, ``IP`` and ``OperationCount``. A row with a
    single cell is read as the total line and stored in ``total_dict``.
    """
    page = getsoup(url)
    del active_session_list[:]
    column_keys = ('UserName', 'IP', 'OperationCount')
    for heading in page.findAll('h2'):
        if heading.string != 'Active Sessions':
            continue
        table = heading.parent.find('table')
        for row in table.findAll('tr'):
            cells = row.findAll('td')
            values = []
            tags = {}
            if len(cells) == 1:
                # Summary row: keep the part after the colon.
                total_dict['Total number of sessions'] = cells[0].string.split(':')[1]
            else:
                for position, cell in enumerate(cells):
                    text = cell.getText().strip()
                    if position < len(column_keys):
                        tags[column_keys[position]] = text
                    else:
                        values.append(text)
            if tags:
                values.append(tags)
            if values:
                active_session_list.append(values)
    return active_session_list

# Collect the queries that are currently executing
def hiveserver2_jsp_open(url):
    """Scrape the 'Open Queries' table of the page at *url*.

    Refills the module-level ``open_query_list`` and returns it. Only rows
    containing a link are recorded; each becomes a list of the second cell's
    text, the value after ``=`` in the row's first link (once per cell beyond
    the seventh), and a dict of the remaining columns. A row with a single
    cell is read as the total line and stored in ``total_dict``.
    """
    page = getsoup(url)
    del open_query_list[:]
    text_columns = {
        0: 'UserName',
        2: 'ExecutionEngine',
        3: 'State',
        5: 'Opened',
        6: 'Latency',
    }
    for heading in page.findAll('h2'):
        if heading.string != 'Open Queries':
            continue
        table = heading.parent.find('table')
        for row in table.findAll('tr'):
            cells = row.findAll('td')
            if len(cells) == 1:
                total_dict['Total number of open queries'] = row.find('td').string.split(':')[1]
            if row.find('a') is None:
                continue
            entry = []
            tags = {}
            for position, cell in enumerate(cells):
                if position in text_columns:
                    tags[text_columns[position]] = cell.getText().strip()
                elif position == 1:
                    entry.append(cell.getText().strip())
                elif position == 4:
                    # Opened time, converted to an epoch-seconds string
                    opened = time.strptime(cell.getText().strip(), '%a %b %d %H:%M:%S %Z %Y')
                    tags['OpenedTimestamp'] = str(int(time.mktime(opened)))
                else:
                    entry.append(row.find('a').get('href').split('=')[1])
            entry.append(tags)
            open_query_list.append(entry)
    return open_query_list

# Collect the queries that have already finished
def hiveserver2_jsp_last(url):
    """Scrape the 'Last Max 25 Closed Queries' table of the page at *url*.

    Refills the module-level ``last_query_list`` and returns it. Only rows
    containing a link are considered; each becomes a list of the second cell's
    text, the value after ``=`` in the row's first link, and a dict of the
    remaining columns. Rows whose ``ClosedTimestamp`` is not newer than the
    last entry returned by the previous non-empty call are skipped, so a query
    is reported only once. Rows without a closed-time cell are skipped as
    well, because they cannot be compared.
    """
    global latest_query_timestamp
    soup = getsoup(url)
    if len(last_query_list) != 0:
        latest_query_timestamp = int(last_query_list[-1][2]['ClosedTimestamp'])
    del last_query_list[:]
    for h2 in soup.findAll('h2'):
        if h2.string != 'Last Max 25 Closed Queries':
            continue
        tab = h2.parent.find('table')
        for tr in tab.findAll('tr'):
            cells = tr.findAll('td')
            if len(cells) == 1:
                last_query_num = tr.find('td').string.split(':')[1]
                total_dict['Total number of last queries'] = last_query_num
            if tr.find('a') is None:
                continue
            row = []
            tags = {}
            # Reset per row so a short row never reuses the previous row's value.
            closed_timestamp = None
            for count, td in enumerate(cells):
                if count == 0:
                    tags['UserName'] = td.getText().strip()
                elif count == 1:
                    row.append(td.getText().strip())
                elif count == 2:
                    tags['ExecutionEngine'] = td.getText().strip()
                elif count == 3:
                    tags['State'] = td.getText().strip()
                elif count == 4:
                    tags['Opened'] = td.getText().strip()
                elif count == 5:
                    dt = td.getText().strip()
                    closed_timestamp = str(int(time.mktime(time.strptime(dt, '%a %b %d %H:%M:%S %Z %Y'))))
                    tags['ClosedTimestamp'] = closed_timestamp
                elif count == 6:
                    tags['Latency'] = td.getText().strip()
                else:
                    row.append(tr.find('a').get('href').split('=')[1])
            row.append(tags)
            if closed_timestamp is None:
                continue
            # To avoid duplicate output, skip anything that was already reported.
            if int(closed_timestamp) <= latest_query_timestamp:
                continue
            last_query_list.append(row)
    return last_query_list


# Return the overall totals
def hiveserver2_jsp_total():
    """Return the module-level ``total_dict`` filled in by the scraping functions."""
    return total_dict

6、impala_metrics.py

import requests
import re
from bs4 import BeautifulSoup

# Get metrics from an Impala daemon.
# As with Hive, BeautifulSoup parses the metrics page of the three kinds of Impala nodes.
def impala_metrics(url):
    """Scrape an Impala /metrics page and return the metrics selected for that service.

    The first two cells of every table row are read as metric name and value.
    Values of the form ``<number> KB|MB|GB`` are converted to a whole number
    of bytes; ``<number> B`` keeps just the number. The five characters after
    the second ``:`` of *url* select the file listing the wanted metric names
    (one per line): 25000 -> /root/impalad.txt, 25010 -> /root/statestored.txt,
    25020 -> /root/catalogd.txt.

    Args:
        url: metrics URL including an explicit port, e.g. ``http://host:25000/metrics``.

    Returns:
        Dict of metric name -> value string for the wanted metrics. Empty when
        the port is not one of the three known ones (a message is printed).
    """
    unit_factors = {'KB': 1024, 'MB': 1024 * 1024, 'GB': 1024 * 1024 * 1024}
    metric_files = {
        '25000': '/root/impalad.txt',
        '25010': '/root/statestored.txt',
        '25020': '/root/catalogd.txt',
    }

    r = requests.get(url)
    # Sizes given in GB, MB or KB are all converted to bytes
    pattern = re.compile(r'\-?\d*\.\d*\s(MB|KB|GB|B)')
    soup = BeautifulSoup(r.content, "html5lib")
    metric_dict = {}
    for tr in soup.findAll('tr'):
        cells = tr.findAll('td')
        if len(cells) < 2:
            # Header rows and single-cell rows carry no name/value pair.
            continue
        name = cells[0].getText().strip()
        raw = cells[1].getText().strip()
        if pattern.match(raw):
            fields = raw.split(' ')
            number, unit = fields[0], fields[1]
            factor = unit_factors.get(unit)
            if factor is None:
                value = number
            else:
                # int() drops the fraction; slicing the float's text form
                # (the previous approach) cut real digits off values such as
                # 1536.0 and broke on exponent notation.
                value = str(int(float(number) * factor))
        else:
            value = raw
        metric_dict[name] = value

    # Open the file matching the service port to find the metric names to monitor
    serviceport = url.split(':')[2][0:5]
    path = metric_files.get(serviceport)
    if path is None:
        print ("can't find impala service")
        return {}
    with open(path) as metric_file:
        wanted = set(line.strip() for line in metric_file)
    return dict((key, value) for key, value in metric_dict.items() if key in wanted)

7、impala_queries.py

import requests
from bs4 import BeautifulSoup
import time

# EndTime remembered by impala_query_closed() to skip queries already reported.
latest_query_timestamp=0
# Cleared and refilled by impala_query_closed() on every call.
closed_query_list = []

# Collect the running sessions. Impala usually finishes so quickly that,
# outside a production environment, there is typically no data here.
def impala_session(url):
    """Scrape every table row of the page at *url* into session entries.

    Each row with at least one cell becomes a list holding the text of the
    sixth cell (when present) followed by a dict of the other cells keyed by
    column name. The ``StartTime`` and ``LastAccessed`` cells are converted to
    epoch-seconds strings.
    """
    response = requests.get(url)
    page = BeautifulSoup(response.content, "html5lib")
    columns = ['SessionType', 'OpenQueries', 'TotalQueries', 'User', 'DelegatedUser',
               'SessionID', 'NetworkAddress', 'Default Database', 'StartTime',
               'LastAccessed', 'IdleTimeout', 'Expired', 'Closed', 'RefCount', 'Action']
    time_columns = (8, 9)
    sessions = []
    for row in page.findAll('tr'):
        cells = row.findAll('td')
        if len(cells) == 0:
            continue
        entry = []
        tags = {}
        for position, cell in enumerate(cells):
            text = cell.getText().strip()
            if position == 5:
                entry.append(text)
            elif position in time_columns:
                parsed = time.strptime(text, '%Y-%m-%d %H:%M:%S')
                tags[columns[position]] = str(int(time.mktime(parsed)))
            else:
                tags[columns[position]] = text
        if tags:
            entry.append(tags)
        if entry:
            sessions.append(entry)
    return sessions

# Collect the queries that have already finished
def impala_query_closed(url):
    """Scrape the completed-queries table(s) of the page at *url*.

    Refills the module-level ``closed_query_list`` and returns it. Each row
    becomes a list of the statement text, the value after ``=`` in the link of
    the twelfth cell, and a dict of the remaining columns with ``StartTime``
    and ``EndTime`` as epoch-seconds strings. Scanning of a table stops at the
    first row whose ``EndTime`` is not newer than the first row returned by
    the previous non-empty call, so a query is reported only once.

    The watermark is stored before returning rather than re-read from the
    returned list on the next call, because callers are free to modify the
    returned dicts (the main script deletes ``EndTime`` from them).
    """
    global latest_query_timestamp

    def to_epoch(text):
        # Keep only two digits of the seconds field, then parse as local time.
        parts = text.split(':')
        trimmed = parts[0] + ':' + parts[1] + ':' + parts[2][0:2]
        return str(int(time.mktime(time.strptime(trimmed, '%Y-%m-%d %H:%M:%S'))))

    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html5lib")
    th_list = ['User', 'DefaultDb', 'Statement', 'QueryType', 'StartTime', 'EndTime',
               'Duration', 'ScanProgress', 'State', 'RowsFetched', 'ResourcePool']

    del closed_query_list[:]
    newest_end_time = None
    for h3 in soup.findAll('h3'):
        if 'Completed' not in h3.getText():
            continue
        table = h3.find_next('table')
        for tr in table.findAll('tr'):
            row = []
            tags = {}
            for count, td in enumerate(tr.findAll('td')):
                if count == 2:
                    row.append(td.getText())
                elif count == 4:
                    tags['StartTime'] = to_epoch(td.getText().strip())
                elif count == 5:
                    tags['EndTime'] = to_epoch(td.getText().strip())
                elif count == 11:
                    row.append(td.find('a').get('href').split('=')[1])
                else:
                    tags[th_list[count]] = td.getText().strip()
            if len(tags) != 0:
                row.append(tags)
            if len(row) != 0:
                # To avoid duplicate output, stop at the first query already reported.
                if int(tags['EndTime']) <= latest_query_timestamp:
                    break
                if newest_end_time is None:
                    newest_end_time = int(tags['EndTime'])
                closed_query_list.append(row)
    if newest_end_time is not None:
        latest_query_timestamp = newest_end_time
    return closed_query_list

8、prin_.py(千萬不要叫print.py，會跟python自帶包衝突，坑了我好久)

import hive_query
import hive_jmx
import impala_queries
import impala_metrics
import get_clusternmae
import time
import socket
import kerberos2
import findkeytab

def printdict(dict):
    for key in dict:
        print key + '=' + dict[key],
    print

def writedict(dict,file):
    """Write *dict* to *file* as ``key=value`` pairs joined by commas plus a newline.

    Nothing is written for an empty dict.
    """
    total = len(dict)
    for position, key in enumerate(dict, 1):
        file.write(key + '=' + dict[key])
        # The last pair ends the line, every other pair is followed by a comma.
        file.write('\n' if position == total else ',')

krb = kerberos2.KerberosTicket('HTTP/'+socket.gethostname()+'@HADOOP.COM', findkeytab.findkeytab(), 'HTTP/'+socket.gethostname()+'@HADOOP.COM')

while True:
    timestamp = str(int(time.time()))
    common_tag_dict={}
    endpoint=socket.gethostname()
    ipaddr=socket.gethostbyname(endpoint)
    clustername = get_clusternmae.get_cluster_name(ipaddr)

    common_tag_dict['endpoint']=endpoint
    common_tag_dict['cluster'] = clustername


    #monitor hive active session
    active_session_list = []
    active_session_list = hive_query.hiveserver2_jsp_session('http://'+endpoint+':10002/hiveserver2.jsp')
    if len(active_session_list) != 0:
        for l in active_session_list:
            print 'hive.hiveserver2.session.ActiveTime ' + l[0] + ' ',
            l[2].update(common_tag_dict)
            del l[2]['IP']
            printdict(l[2])

            print 'hive.hiveserver2.session.IdleTime ' + l[1] + ' ',
            l[2].update(common_tag_dict)
            printdict(l[2])

    # monitor hive opening queries
    open_query_list = []
    open_query_list = hive_query.hiveserver2_jsp_open('http://'+endpoint+':10002/hiveserver2.jsp')
    if len(open_query_list) != 0:
        for l in open_query_list:
            print 'hive.hiveserver2.query.opening-uuid '+l[1]+' '+timestamp,
            l[2].update(common_tag_dict)
            printdict(l[2])

    #monitor hive closed queries
    last_query_list = []
    last_query_list = hive_query.hiveserver2_jsp_last('http://'+endpoint+':10002/hiveserver2.jsp')
    if len(last_query_list) != 0:
        for l in last_query_list:
            file = open("/root/hive_query.txt",'a')
            file.write('uuid='+l[1]+','+'Sql='+l[0]+',')
            writedict(l[2],file)
            file.close()
            del l[2]['Latency']
            print 'hive.hiveserver2.query.closed-uuid '+l[1]+' '+timestamp,
            l[2].update(common_tag_dict)
            printdict(l[2])

    #monitor the total hive data
    total_dict = hive_query.hiveserver2_jsp_total()
    for key in total_dict:
        if key == 'Total number of sessions':
            print 'hive.hiveserver2.total.session'+total_dict['Total number of sessions']+' '+timestamp,
            printdict(common_tag_dict)
        if key == 'Total number of open queries':
            print 'hive.hiveserver2.total.openingquery'+total_dict['Total number of open queries']+' '+timestamp,
            printdict(common_tag_dict)
        if key == 'Total number of last queries':
            print 'hive.hiveserver2.total.closedquery'+total_dict['Total number of last queries']+' '+timestamp,
            printdict(common_tag_dict)

    #monitor the hive jmx
    jmx_list = hive_jmx.hiveserver2_jmx('http://'+endpoint+':10002/jmx')
    for i in range (len(jmx_list)):
        print 'hive.'+jmx_list[i][0]+' '+ jmx_list[i][1]+' '+timestamp,
        jmx_list[i][2].update(common_tag_dict)
        printdict(jmx_list[i][2])


    #monitor impala metrics
    impalad_url='http://'+ipaddr+':25000/metrics'
    impala_impalad_metrics = impala_metrics.impala_metrics(impalad_url)
    for key in impala_impalad_metrics:
        print 'impala.'+key+' '+impala_impalad_metrics[key]+' '+timestamp,
        printdict(common_tag_dict)

    statestored_url = 'http://' + ipaddr + ':25010/metrics'
    impala_statestored_metrics = impala_metrics.impala_metrics(statestored_url)
    for key in impala_statestored_metrics:
        print 'impala.'+key +' '+ impala_statestored_metrics[key]+' '+timestamp,
        printdict(common_tag_dict)

    catalogd_url = 'http://' + ipaddr + ':25020/metrics'
    impala_catalogd_metrics = impala_metrics.impala_metrics(catalogd_url)
    for key in impala_catalogd_metrics:
        print 'impala.'+key +' '+ impala_catalogd_metrics[key]+' '+timestamp,
        printdict(common_tag_dict)



    # monitor impala closed queries
    impala_closed_query_list = []
    impala_closed_query_list = impala_queries.impala_query_closed('http://'+ipaddr+':25000/queries#')
    if len(impala_closed_query_list)!=0:
        for l in impala_closed_query_list:
            file = open("/root/impala_query.txt",'a')
            file.write(l[1]+' '+l[0]+'\n')
            file.write('uuid=' + l[1] + ',' + 'Sql=' + l[0]+',')
            writedict(l[2], file)
            file.close()
            del l[2]['ResourcePool']
            del l[2]['ScanProgress']
            del l[2]['DefaultDb']
            del l[2]['EndTime']
            del l[2]['QueryType']
            print 'impala.query.closed-uuid '+l[1]+' '+timestamp,
            l[2].update(common_tag_dict)
            printdict(l[2])

    # monitor impala session
    impala_session_list=[]
    impala_session_list = impala_queries.impala_session('http://'+ipaddr+':25000/sessions')
    if len(impala_session_list)!=0:
        for l in impala_session_list:
            print 'impala.active.session'+' '+ l[0] + ' '+timestamp,
            l[1].update(common_tag_dict)
            printdict(l[1])
    time.sleep(10)

大數據平臺運維-----Kerberos環境下Hive及Impala監控腳本的開發

一、工程目錄

二、原理解析

三、代碼解析

1、findkeytab.py

2、kerberos2.py

3、get_clustername.py

4、hive_jmx.py

5、hive_query.py

6、impala_metrics.py

7、impala_queries.py

8、prin_.py(千萬不要叫print.py，會跟python自帶包衝突，坑了我好久)

kafka使用mysql進行認證管理

修改源碼使kafka-console-consumer.sh支持從指定時間開始消費

kafka只讓Producer自動創建Topic同時禁止consumer自動創建Topic

Maven編譯系列（一）——Plugin

大數據平臺部署-----ambari在線和離線安裝

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結