一、背景
使用Python,打通Impala通道,實現取數自動化,或是作爲數據分析的數據源。
二、Apache Impala
Impala是一個開源的,基於Hadoop的分析型數據庫。
Impala可以查詢存儲在HDFS或者HBase中的數據。
Impala通過專用分佈式查詢引擎,繞過MapReduce直接訪問數據,查詢性能遠高於Hive。
三、impyla
基於HiveServer2 實現的分佈式查詢引擎(如Impala、Hive)的Python客戶端。
完全符合DB API 2.0(PEP 249)規範。
支持Kerberos、LDAP認證以及SSL加密連接。
支持將數據轉換爲pandas的DataFrame,輕鬆集成到Python數據棧(如scikit-learn、matplotlib等)。
四、類封裝
from impala.dbapi import connect
from impala.error import ProgrammingError
from utils.db.sql import SQL
class Impala(SQL):
    """Thin impyla wrapper that runs one SQL statement per ``execute`` call.

    The connection and cursor are opened lazily by ``execute`` and, unless
    the caller asks otherwise, released again as soon as the call finishes.
    """

    # Returned by ``execute`` when the statement yields no rows to fetch.
    DESC_EXEC_SUCCESS = "執行成功"

    def __init__(self, host, port, database, user, password=None):
        """Store the connection settings; nothing is opened here.

        :param host: host (IP) passed to ``impala.dbapi.connect``
        :param port: port passed to ``impala.dbapi.connect``
        :param database: default database name
        :param user: user name passed when the cursor is opened
        :param password: kept on the instance only; ``get_connect`` does not
            forward it to ``connect``
        """
        self.host = host
        self.port = port
        self.database = database
        self.user = user
        self.password = password
        self.connect = None
        self.cursor = None

    def get_connect(self, timeout=600):
        """Open a connection and keep it in ``self.connect``.

        :param timeout: timeout value handed to ``impala.dbapi.connect``
        """
        self.connect = connect(
            host=self.host,
            port=self.port,
            timeout=timeout,
            database=self.database,
        )

    def get_cursor(self, dictify=False):
        """Open a cursor on the current connection and keep it in ``self.cursor``.

        :param dictify: when True, impyla returns each row as a dict keyed by
            column name instead of a tuple
        """
        self.cursor = self.connect.cursor(
            user=self.user,
            dictify=dictify,
        )

    def close(self):
        """Close the cursor and the connection, whichever of them exist.

        Safe to call when nothing is open. The connection is still closed and
        both attributes are still reset if closing the cursor raises.
        """
        try:
            if self.cursor is not None:
                self.cursor.close()
        finally:
            try:
                if self.connect is not None:
                    self.connect.close()
            finally:
                self.cursor = None
                self.connect = None

    def execute(self, sql, auto_close=True, dictify=False):
        """Run ``sql`` and return its rows.

        :param sql: statement to run
        :param auto_close: close the cursor and connection when the call ends,
            whether it succeeded or raised
        :param dictify: forwarded to ``get_cursor``; only takes effect when
            this call has to open a new cursor
        :return: the list from ``fetchall()``, or ``DESC_EXEC_SUCCESS`` when
            fetching raises ``ProgrammingError``
        """
        try:
            if not self.connect:
                self.get_connect()
            if not self.cursor:
                self.get_cursor(dictify=dictify)
            self.cursor.execute(sql)
            try:
                result = self.cursor.fetchall()
            except ProgrammingError:
                # Nothing to fetch (e.g. a DDL statement): report success instead.
                result = self.DESC_EXEC_SUCCESS
        finally:
            # Release resources on the error path too, so a failed statement
            # does not leave a connection behind.
            if auto_close:
                self.close()
        return result
五、使用例子
from utils.db.impala import Impala
# Connection settings below are placeholders; substitute your own cluster's values.
impala = Impala(
    host="10.123.0.11",
    port=21050,  # must be within 1-65535; 21050 is impalad's default HiveServer2 port
    database="fields",
    user="unclebean",
)
sql = "select 1 as a, 2 as b union all select 3 as a, 4 as b"
# auto_close=True releases the cursor and connection once the rows are fetched.
result = impala.execute(sql, auto_close=True)
print(result)
默認的返回結果是一個列表,列表中每個元素代表一行結果,類型爲元組,如上面返回的結果:[(1, 2), (3, 4)]
若想行結果變爲字典,而非元組,則獲取遊標時需傳入參數dictify=True,如
sql = "select 1 as a, 2 as b union all select 3 as a, 4 as b"
# dictify is an option of the impyla cursor, so open the cursor by hand;
# execute() reuses an already-open connection and cursor.
impala.get_connect()
impala.cursor = impala.connect.cursor(user=impala.user, dictify=True)
result = impala.execute(sql, auto_close=True)
print(result)
則返回結果變爲:[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]
六、tkinter封裝,實現一鍵刷新元數據和按日統計數據量
import os
from datetime import datetime
from menu.menu import EMenu
from utils.db.impala import Impala
class MenuImpala(EMenu):
    """"Impala" cascade menu with two commands driven by the clipboard.

    Both commands take a table name from the clipboard: one runs
    ``invalidate metadata`` on it, the other additionally counts its rows
    per ``data_date``.
    """

    LABEL_NAME = "Impala"
    LABEL_NAME_INVALIDATE_METADATA = "Invalidate Metadata"
    LABEL_NAME_COUNT_BY_DAY = "Count By Day"
    DESC_SSH_CMD_FAILED = "執行錯誤"

    def __init__(self, master=None, cnf=None, **kw):
        """Build the Impala client from ``self.conf`` and register the menu entries.

        :param master: parent menu; the cascade is added to it
        :param cnf: option dict forwarded to the base class (``None`` means ``{}``)
        :param kw: extra options forwarded to the base class
        """
        # Use a fresh dict per call rather than a shared mutable default.
        super().__init__(master=master, cnf={} if cnf is None else cnf, **kw)
        self.impala = Impala(
            host=self.conf.impala.HOST,
            port=self.conf.impala.PORT,
            database=self.conf.impala.DATABASE,
            user=self.conf.impala.USER,
        )
        master.add_cascade(label=self.LABEL_NAME, menu=self)  # top-level entry
        self.add_command(  # sub-entry: refresh metadata
            label=self.LABEL_NAME_INVALIDATE_METADATA,
            command=self.invalidate_metadata,
        )
        self.add_command(  # sub-entry: row count per day
            label=self.LABEL_NAME_COUNT_BY_DAY,
            command=self.count_by_day,
        )

    # NOTE(review): thread_run is defined on EMenu, outside this file;
    # presumably it runs the command on a worker thread — confirm.
    @EMenu.thread_run(LABEL_NAME_COUNT_BY_DAY)
    def count_by_day(self):
        """Menu command: refresh the table's metadata, then count rows per ``data_date``."""
        table_name = self.get_table_name_from_clip()
        # Keep the connection open so the count query below reuses it.
        self.invalidate_table(table_name, auto_close=False)
        sql = "select data_date,count(1) from {} group by data_date order by data_date desc".format(
            table_name
        )
        self.stdout(sql, with_time=" - ")
        result = self.impala.execute(sql)  # default auto_close=True ends the session
        result = "\n".join(map(str, result))  # one result row per line
        self.stdout("{} -> {}".format(sql, result), with_time=" - ")
        self.msg_box_info(result)

    @EMenu.thread_run(LABEL_NAME_INVALIDATE_METADATA)
    def invalidate_metadata(self):
        """Menu command: refresh metadata of the table named in the clipboard."""
        self.invalidate_table(self.get_table_name_from_clip())

    def invalidate_table(self, table_name, auto_close=True):
        """Run ``invalidate metadata <table_name>`` and log the statement and its result.

        :param table_name: table name interpolated into the statement
        :param auto_close: forwarded to ``Impala.execute``
        """
        sql = "invalidate metadata {}".format(table_name)
        self.stdout(sql, with_time=" - ")
        result = self.impala.execute(sql=sql, auto_close=auto_close)
        self.stdout("{} -> {}".format(sql, result), with_time=" - ")

    def get_table_name_from_clip(self):
        """Return a ``db.table`` name built from the clipboard text.

        Surrounding whitespace is removed. A name without a dot gets the part
        before its first underscore prepended as the database name
        (``ods_user`` becomes ``ods.ods_user``).

        :raises ValueError: if the text is not a plain, optionally
            backtick-quoted, ``table`` or ``db.table`` identifier; the name is
            interpolated into SQL, so anything else is refused
        """
        import re  # local import: only this method needs it

        table_name = self.paste().strip()
        if not re.fullmatch(r"(?:`?\w+`?\.)?`?\w+`?", table_name, flags=re.ASCII):
            raise ValueError("invalid table name in clipboard: {!r}".format(table_name))
        if "." not in table_name:
            bare = table_name.strip("`")
            table_name = bare.split("_")[0] + "." + bare
        return table_name
七、完整代碼
GitHub上搜索TheUncleWhoGrowsBeans