目錄
- 背景簡介
- 需求分析
- 項目分析
- 注意事項
一:背景簡介
大家都知道mysql做隨機查詢的速度比較快,而類似於hive之類這種基於大數據背景的離線計算相對較慢,而hive比較適合於做數據分析即OLAP,mysql,hbase,oracle之類的偏向於OLTP,cloudera公司也考慮到了這個問題,所以開源了kudu,kudu的隨機查詢能力和數據分析能力較之傳統的數據庫和數據倉庫,性能都比較優異。以往企業爲了能夠提供線上業務快速響應能力,需要在關係型數據庫或者非關係型數據庫當中維護一套庫表,爲了結合線上的業務,需要做數據分析,來指引公司發展的戰略方向,這時候又要維護一套庫表到數據倉庫當中,中間需要維護非常複雜的ETL操作,不管對於人力,機器資源的支出都是非常巨大的。kudu可以較好解決以上問題,不管是線上業務,還是提供數據分析的數據源,kudu都能以較優秀的能力支持,詳細信息見kudu官網
二:需求分析
介紹完背景以後,目前主要的工作是如何把傳統型關係數據庫和hive中的數據遷移到kudu當中來,注意雖然impala同時可以結合hive和kudu使用,但是調用的是兩套不同的數據源。我將功能大致分爲以下步驟來進行實現:
- 利用kudu api創建與mysql和hive對應表結構的庫表
- 從源表當中讀取數據源
- 對數據源進行封裝成kudu的格式
- 插入到kudu表裏面
三:項目分析
根據需求分析的步驟來,我們第一步應該要利用kudu api創建與mysql和hive對應表結構的庫表,kudu支持的數據類型如下圖所示:
所以類似於mysql的datetime類型需要轉換一下,這裏值得一提的是,當mysql的數據類型爲tinyint,smallint時,最好將其轉換成int32類型,還有就是將decimal類型轉換成double類型,一般來說double的精度已經夠大了,不會出現數據丟失精度的問題,做以上轉換的原因並不是kudu不支持int8,int16,decimal,而是當kudu使用以上數據類型經常會出現類型轉換異常。爲了更好的讓讀者讀懂代碼,我先將用到的Utils方法給出:
HiveUtil如下:
package com.dmall.data.initial_mysql2kudu;
import com.alibaba.druid.pool.DruidDataSource;
import com.dmall.data.util.LogHelper;
import org.slf4j.Logger;
import java.sql.*;
import java.util.*;
import java.util.Date;
/**
 * Connection utility for the big-data (Hive) platform.
 *
 * <p>Holds one shared Druid pool ({@link #hiveDataSource}) configured in the static
 * initializer; the package-private constructor only overrides the URL and credentials
 * on that shared pool (the static block has always run before any constructor).
 *
 * @ClassName: com.dmall.survey.util.HiveUtil
 * @author tao.wen
 * @date 2018-10-29 13:21:32
 */
public class HiveUtil {
    private static final Logger logger = LogHelper.LOG_KUDU_JOB;
    /** Shared HIVE data source, initialised once in the static block. */
    public static DruidDataSource hiveDataSource;

    HiveUtil(String hive_url, String hive_username, String hive_password) {
        // Mutates the shared static pool; connection settings apply to all callers.
        hiveDataSource.setUrl(hive_url);
        hiveDataSource.setUsername(hive_username);
        hiveDataSource.setPassword(hive_password);
    }

    static {
        try {
            hiveDataSource = new DruidDataSource();
            hiveDataSource.setDriverClassName("org.apache.hive.jdbc.HiveDriver");
            hiveDataSource.setInitialSize(5);
            hiveDataSource.setMinIdle(1);
            hiveDataSource.setMaxActive(10);
            hiveDataSource.setRemoveAbandoned(true);                // leaked-connection detection
            hiveDataSource.setRemoveAbandonedTimeout(180);
            hiveDataSource.setMaxWait(600000);                      // max wait for a pooled connection (ms)
            hiveDataSource.setMinEvictableIdleTimeMillis(300000);
            hiveDataSource.setTimeBetweenEvictionRunsMillis(60000); // idle-connection eviction check interval (ms)
            hiveDataSource.setValidationQuery("show tables");       // keep-alive validation query
            hiveDataSource.setTestWhileIdle(true);
            hiveDataSource.setTestOnBorrow(true);
        } catch (Exception e) {
            logger.error("init hiveDataSource exception: ", e);
        }
    }

    /**
     * Executes a SELECT and materialises the full result set in memory.
     *
     * @param sql query to execute
     * @return one entry per row; each row is a list of [columnName, value, columnTypeName] triples
     * @throws Exception if obtaining a connection or running the query fails
     */
    public static List<List<List<Object>>> select(String sql) throws Exception {
        List<List<List<Object>>> list = new ArrayList<>();
        logger.info("execute sql:{}", sql);
        long startTime = System.currentTimeMillis();
        // try-with-resources closes rs, pstmt and conn in the right order even when an
        // earlier close() throws; the previous hand-rolled finally chain could leak the
        // connection if rs.close() threw first.
        try (Connection conn = hiveDataSource.getConnection();
             PreparedStatement pstmt = conn.prepareStatement(sql);
             ResultSet rs = pstmt.executeQuery()) {
            ResultSetMetaData md = rs.getMetaData(); // result-set metadata
            int columnCount = md.getColumnCount();
            while (rs.next()) {
                List<List<Object>> rowData = new ArrayList<>(columnCount);
                for (int i = 1; i <= columnCount; i++) {
                    List<Object> column = new ArrayList<>(3);
                    column.add(md.getColumnName(i));
                    column.add(rs.getObject(i));
                    column.add(md.getColumnTypeName(i));
                    rowData.add(column);
                }
                list.add(rowData);
            }
        }
        logger.info("hive select sql execute times:{}", System.currentTimeMillis() - startTime);
        return list;
    }

    /**
     * Executes an insert or update statement.
     *
     * @param sql statement to execute (truncated to 500 chars in the log)
     * @throws Exception if execution fails
     */
    public static void save(String sql) throws Exception {
        String sqlLog = sql.length() > 500 ? (sql.substring(0, 500) + "...") : sql;
        logger.info("execute sql:{}", sqlLog);
        long startTime = System.currentTimeMillis();
        try (Connection conn = hiveDataSource.getConnection();
             PreparedStatement pstmt = conn.prepareStatement(sql)) {
            pstmt.execute();
        }
        logger.info("hive save sql execute times:{}", System.currentTimeMillis() - startTime);
    }

    /**
     * Runs a single-value query (typically {@code SELECT count(1) ...}) and returns the
     * first column of the first row parsed as a Long.
     *
     * @param sql counting query
     * @return the first value, or {@code null} when the query returns no rows
     * @throws SQLException if the query fails
     */
    public static Long getTableRowNumber(String sql) throws SQLException {
        Long tableRowNumber = null;
        logger.info("execute sql:{}", sql);
        long startTime = System.currentTimeMillis();
        try (Connection conn = hiveDataSource.getConnection();
             PreparedStatement pstmt = conn.prepareStatement(sql);
             ResultSet rs = pstmt.executeQuery()) {
            if (rs.next()) {
                tableRowNumber = Long.parseLong(String.valueOf(rs.getObject(1)));
            }
        }
        logger.info("hive select sql execute times:{}", System.currentTimeMillis() - startTime);
        return tableRowNumber;
    }

    /** @return the shared Druid data source */
    public static DruidDataSource getDsDataSource() {
        return hiveDataSource;
    }
}
JdbcUtil如下:
package com.dmall.data.initial_mysql2kudu;
import com.alibaba.druid.pool.DruidDataSource;
import com.dmall.data.util.LogHelper;
import org.slf4j.Logger;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
 * Simple JDBC utility for the MySQL source database.
 *
 * <p>Holds one shared Druid pool ({@link #dsDataSource}) configured in the static
 * initializer; the package-private constructor only overrides the URL and credentials
 * on that shared pool (the static block has always run before any constructor).
 *
 * @ClassName: com.dmall.survey.util.JdbcUtil
 * @author tao.wen
 * @date 2018-10-29 13:21:39
 */
public class JdbcUtil {
    private static final Logger logger = LogHelper.LOG_KUDU_JOB;
    /** Shared MySQL data source, initialised once in the static block. */
    public static DruidDataSource dsDataSource;

    JdbcUtil(String mysql_url, String mysql_username, String mysql_password) {
        // Batch rewriting and multi-statement support are required by callers.
        dsDataSource.setUrl(mysql_url + "?rewriteBatchedStatements=true&allowMultiQueries=true");
        dsDataSource.setUsername(mysql_username);
        dsDataSource.setPassword(mysql_password);
    }

    /**
     * Database type selector used by {@link #save(DBtype, String)}.
     *
     * @ClassName: com.dmall.survey.util.DBtype
     * @author tao.wen
     * @date 2018-10-29 13:33:04
     */
    public enum DBtype { CG }

    static {
        try {
            dsDataSource = new DruidDataSource();
            dsDataSource.setDriverClassName("com.mysql.jdbc.Driver");
            dsDataSource.setInitialSize(5);
            dsDataSource.setMinIdle(1);
            dsDataSource.setMaxActive(10);
            dsDataSource.setRemoveAbandoned(true);                // leaked-connection detection
            dsDataSource.setRemoveAbandonedTimeout(30);
            dsDataSource.setMaxWait(60000);                       // max wait for a pooled connection (ms)
            dsDataSource.setTimeBetweenEvictionRunsMillis(60000); // idle-connection eviction check interval (ms)
            dsDataSource.setValidationQuery("SELECT 'x'");        // keep-alive validation query
            dsDataSource.setTestWhileIdle(true);
            dsDataSource.setTestOnBorrow(true);
        } catch (Exception e) {
            // No "{}" placeholder: SLF4J only prints the stack trace when the Throwable
            // is passed as the trailing argument without a matching placeholder.
            logger.error("init datasource exception: ", e);
        }
    }

    /**
     * Executes a SELECT and materialises the full result set in memory.
     *
     * @param sql query to execute
     * @return one entry per row; each row is a list of [columnName, value, columnTypeName] triples
     * @throws Exception if obtaining a connection or running the query fails
     */
    public static List<List<List<Object>>> select(String sql) throws Exception {
        List<List<List<Object>>> list = new ArrayList<>();
        logger.info("execute sql:{}", sql);
        long startTime = System.currentTimeMillis();
        // try-with-resources closes rs, pstmt and conn in order even when an earlier
        // close() throws; the previous finally chain could leak the connection.
        try (Connection conn = dsDataSource.getConnection();
             PreparedStatement pstmt = conn.prepareStatement(sql);
             ResultSet rs = pstmt.executeQuery()) {
            ResultSetMetaData md = rs.getMetaData(); // result-set metadata
            int columnCount = md.getColumnCount();
            while (rs.next()) {
                List<List<Object>> rowData = new ArrayList<>(columnCount);
                for (int i = 1; i <= columnCount; i++) {
                    List<Object> column = new ArrayList<>(3);
                    column.add(md.getColumnName(i));
                    column.add(rs.getObject(i));
                    column.add(md.getColumnTypeName(i));
                    rowData.add(column);
                }
                list.add(rowData);
            }
        }
        logger.info("jdbc select sql execute times:{}", System.currentTimeMillis() - startTime);
        return list;
    }

    /**
     * Executes an insert or update statement against the selected database type.
     *
     * @param dbtype target database; only {@link DBtype#CG} is supported — the original
     *               code dereferenced a null connection (NPE) for any other value,
     *               now rejected with a descriptive exception
     * @param sql statement to execute (truncated to 500 chars in the log)
     * @throws Exception if execution fails
     */
    public static void save(DBtype dbtype, String sql) throws Exception {
        if (!DBtype.CG.equals(dbtype)) {
            throw new IllegalArgumentException("unsupported dbtype: " + dbtype);
        }
        String sqlLog = sql.length() > 500 ? (sql.substring(0, 500) + "...") : sql;
        logger.info("execute sql:{}", sqlLog);
        long startTime = System.currentTimeMillis();
        try (Connection conn = dsDataSource.getConnection();
             PreparedStatement pstmt = conn.prepareStatement(sql)) {
            pstmt.execute();
        }
        logger.info("jdbc save sql execute times:{}", System.currentTimeMillis() - startTime);
    }

    /** @return the shared Druid data source */
    public static DruidDataSource getDsDataSource() {
        return dsDataSource;
    }
}
以上Util方法因人而異,自己可以做個性化更改,接下來是創建kudu表的案例代碼:
package com.dmall.data.initial_mysql2kudu;
import com.alibaba.druid.pool.DruidDataSource;
import com.dmall.data.kudu.KuduAgentClient;
import com.dmall.data.util.LogHelper;
import org.apache.kudu.ColumnSchema;
import org.apache.kudu.Schema;
import org.apache.kudu.Type;
import org.apache.kudu.client.CreateTableOptions;
import org.apache.kudu.client.KuduClient;
import org.apache.kudu.client.KuduException;
import org.apache.kudu.shaded.com.google.common.collect.ImmutableList;
import org.slf4j.Logger;
import java.sql.*;
import java.util.ArrayList;
import java.util.List;
/**
 * CLI entry point that creates a kudu table mirroring the schema of an existing
 * mysql or hive table.
 *
 * <p>Args: [0] jdbc url, [1] username, [2] password, [3] db name, [4] table name,
 * [5] primary-key column name, [6] method ("createTableForMysql" | "createTableForHive").
 */
public class createKuduTable {
    /** Kudu master quorum. */
    private static String masterHost = "idc-10-248-3-71.ddw.dmall.com:7051,idc-10-248-3-72.ddw.dmall.com:7051,idc-10-248-3-73.ddw.dmall.com:7051";
    private static Logger logger = LogHelper.LOG_KUDU_JOB;
    static KuduAgentClient agent = null;
    private static String mysqlORhive_url = null;
    private static String mysqlORhive_username = null;
    private static String mysqlORhive_password = null;
    private static String mysqlORhive_dbname = null;
    private static String mysqlORhive_tablename = null;
    private static String tableKeyName = null;

    public static void main(String[] args) {
        try {
            agent = new KuduAgentClient(masterHost);
            mysqlORhive_url = args[0];
            mysqlORhive_username = args[1];
            mysqlORhive_password = args[2];
            mysqlORhive_dbname = args[3];
            mysqlORhive_tablename = args[4];
            tableKeyName = args[5];
            String executeMothod = args[6];
            logger.info("###get input data type :{}->client:{}", executeMothod, agent);
            switch (executeMothod) {
                case "createTableForMysql":
                    createTableForMysql(mysqlORhive_dbname, mysqlORhive_tablename, tableKeyName);
                    break;
                case "createTableForHive":
                    createTableForHive(mysqlORhive_dbname, mysqlORhive_tablename, tableKeyName);
                    break;
                default:
                    break;
            }
        } catch (Exception e) {
            e.printStackTrace();
            logger.info("###Execute error message {}", e);
        }
    }

    /** Drops the kudu table when it already exists so it can be recreated cleanly. */
    private static void dropIfExists(KuduClient client, String fullTableName) throws KuduException {
        if (client.tableExists(fullTableName)) {
            logger.info("存在重複表,刪除現有表");
            client.deleteTable(fullTableName);
        }
    }

    /**
     * Maps a source column type name (hive lowercase or mysql uppercase) to a kudu type.
     *
     * <p>tinyint/smallint are deliberately widened to INT32 and decimal to DOUBLE:
     * INT8/INT16/DECIMAL kudu columns proved fragile with type-cast errors in practice.
     *
     * @throws Exception for a type with no kudu mapping — the original code fell
     *         through with a null type and later NPE'd inside ColumnSchemaBuilder
     */
    private static Type toKuduType(String columnType) throws Exception {
        String t = columnType.toLowerCase();
        Type kuduType;
        if (t.startsWith("tinyint") || t.startsWith("smallint") || t.startsWith("int")) {
            kuduType = Type.INT32;
        } else if (t.startsWith("bigint")) {
            kuduType = Type.INT64;
        } else if (t.startsWith("string") || t.startsWith("varchar") || t.startsWith("datetime")) {
            // kudu has no native datetime here; stored as its string form
            kuduType = Type.STRING;
        } else if (t.startsWith("float")) {
            kuduType = Type.FLOAT;
        } else if (t.startsWith("double") || t.startsWith("decimal")) {
            kuduType = Type.DOUBLE;
        } else {
            throw new Exception("no kudu type mapping for column type: " + columnType);
        }
        logger.info("column_type :{} and column_kudu_type :{}", columnType, kuduType);
        return kuduType;
    }

    /**
     * Reads column metadata (name, type name, is-primary-key) for the given table by
     * preparing a SELECT * and inspecting its ResultSetMetaData.
     *
     * @param fromHive read from the hive pool (true) or the mysql pool (false); hive
     *                 column names come back as "table.column" and are stripped
     * @return per-column [name, typeName, isPrimaryKey] triples, in table order
     */
    private static List<List> readColumnMeta(boolean fromHive, String dbName, String tableName, String tableKeyName) throws Exception {
        String sql = "SELECT * FROM " + dbName + "." + tableName;
        List<List> list = new ArrayList<>();
        logger.info("execute sql:{}", sql);
        DruidDataSource ds = fromHive ? HiveUtil.getDsDataSource() : JdbcUtil.getDsDataSource();
        // try-with-resources closes rs/pstmt/conn in order; the original finally block
        // closed the connection BEFORE the result set.
        try (Connection conn = ds.getConnection();
             PreparedStatement pstmt = conn.prepareStatement(sql);
             ResultSet rs = pstmt.executeQuery()) {
            ResultSetMetaData md = rs.getMetaData();
            int columnCount = md.getColumnCount();
            for (int i = 1; i <= columnCount; i++) {
                String columnName = md.getColumnName(i);
                if (fromHive && columnName.contains(".")) {
                    columnName = columnName.split("\\.")[1]; // strip hive's "table." prefix
                }
                List<Object> column = new ArrayList<>(3);
                column.add(columnName);
                column.add(md.getColumnTypeName(i));
                column.add(columnName.equals(tableKeyName));
                list.add(column);
            }
        }
        logger.info("讀取的mysql元數據信息爲:{}", list);
        return list;
    }

    /**
     * Builds the kudu schema from the collected metadata and creates the table with a
     * 2-bucket hash partition on the key column and 3 replicas.
     *
     * <p>NOTE(review): kudu requires key columns to come first in the schema — the
     * source-table key column must be its first column or createTable will fail.
     */
    private static void createKuduTableFromMeta(KuduClient client, String fullTableName, List<List> meta, String tableKeyName) throws Exception {
        List<ColumnSchema> columns = new ArrayList<>(meta.size());
        for (List column_info : meta) {
            String columnName = String.valueOf(column_info.get(0));
            String columnType = String.valueOf(column_info.get(1));
            boolean isPrimaryKey = Boolean.parseBoolean(String.valueOf(column_info.get(2)));
            String t = columnType.toLowerCase();
            if (isPrimaryKey && (t.startsWith("double") || t.startsWith("float") || t.startsWith("boolean"))) {
                throw new Exception("主鍵類型不能爲double,float,boolean");
            }
            logger.info("columnName is :{} and columnType is :{} and isPrimaryKey is :{}", columnName, columnType, isPrimaryKey);
            Type kuduType = toKuduType(columnType);
            ColumnSchema.ColumnSchemaBuilder builder =
                    new ColumnSchema.ColumnSchemaBuilder(columnName, kuduType).nullable(false);
            if (isPrimaryKey) {
                logger.info("this column is keyColumn:{}", columnName);
                builder.key(true);
            } else {
                logger.info("this column is not keyColumn:{}", columnName);
            }
            columns.add(builder.build());
        }
        Schema schema = new Schema(columns);
        ImmutableList<String> hashKeys = ImmutableList.of(tableKeyName);
        CreateTableOptions tableOptions = new CreateTableOptions();
        // hash partitioning + replication factor
        tableOptions.addHashPartitions(hashKeys, 2);
        tableOptions.setNumReplicas(3);
        client.createTable(fullTableName, schema, tableOptions);
    }

    /** Creates a kudu table mirroring a hive table's schema. */
    private static void createTableForHive(String mysqlORhive_dbname, String mysqlORhive_tablename, String tableKeyName) throws Exception {
        KuduClient client = agent.getKdClient();
        String fullName = mysqlORhive_dbname + "." + mysqlORhive_tablename;
        dropIfExists(client, fullName);
        DruidDataSource hiveDataSource = HiveUtil.getDsDataSource();
        hiveDataSource.setUrl(mysqlORhive_url);
        hiveDataSource.setUsername(mysqlORhive_username);
        hiveDataSource.setPassword(mysqlORhive_password);
        List<List> meta = readColumnMeta(true, mysqlORhive_dbname, mysqlORhive_tablename, tableKeyName);
        createKuduTableFromMeta(client, fullName, meta, tableKeyName);
    }

    /** Creates a kudu table mirroring a mysql table's schema. */
    private static void createTableForMysql(String mysqlORhive_dbname, String mysqlORhive_tablename, String tableKeyName) throws Exception {
        KuduClient client = agent.getKdClient();
        String fullName = mysqlORhive_dbname + "." + mysqlORhive_tablename;
        dropIfExists(client, fullName);
        DruidDataSource dsDataSource = JdbcUtil.getDsDataSource();
        dsDataSource.setUrl(mysqlORhive_url + "?rewriteBatchedStatements=true&allowMultiQueries=true");
        dsDataSource.setUsername(mysqlORhive_username);
        dsDataSource.setPassword(mysqlORhive_password);
        List<List> meta = readColumnMeta(false, mysqlORhive_dbname, mysqlORhive_tablename, tableKeyName);
        createKuduTableFromMeta(client, fullName, meta, tableKeyName);
    }
}
代碼的入口做了很多配置入口,主要是做一個通用的工具,當然讀者也可以根據自己需求做適當的更改,在創建kudu表的過程中主要是要指定kudu的columnType,主鍵,非空等重要參數,這裏需要注意的坑是:由於我們這邊是從mysql讀取的元數據結構封裝成List集合,大家都知道List集合是有序的,這樣也會導致建kudu表時與mysql的元數據結構是一致,然而當我們指定的kudu primaryKey所對應的column如果不是kudu表結構的第一列時,就會報以下錯誤:
不得不說,kudu這一點還是挺坑的,既然知道這個坑,各位小夥伴們,在建表的時候就要注意了,記得把主鍵對應的列放到第一列,好啦表建好了,接下來就需要插入數據了。
插入數據的大概邏輯也是先從mysql或者hive的數據源讀取,把每行封裝成一個rowdata,再進行kudu格式的一個轉換,主要是column_value和kudu_column_type,這裏主要涉及到一點,其實就是巨量數據遷移的時候要注意內存爆滿的現象,我這裏給出的解決方案可能比較蠢,代神們有自己的想法也可以自己實現,我的解決方案是,如果hive的話,因爲hive不支持limit m,n的格式,所以只能用row_number over()的方式打上序號,再針對序號做where,如果是mysql的話,直接limit m,n就好了,每次讀取的批次視自己公司情況而定,我定的是10000條。還有一個坑就是:
agent.upsert(mysqlORhive_dbname + "." + mysqlORhive_tablename, agent.getKdClient(), myrows01);
如果你是用kudu api創建的表,使用以上api插入數據是可以的,但是如果你是用impala-shell創建的kudu表,使用以上api插入數據是會報錯的,提示找不到對應table,原因是什麼呢?還是文章開頭說的,impala管理的kudu的元數據,但是並不是完全互通的,當你使用impala-shell創建的表,在kudu裏是找不到的,所以需要改成如下:
agent.upsert("impala::"+mysqlORhive_dbname + "." + mysqlORhive_tablename, agent.getKdClient(), myrows01);
詳細代碼如下:
package com.dmall.data.initial_mysql2kudu;
import com.dmall.data.kudu.KuduAgentClient;
import com.dmall.data.kudu.KuduColumn;
import com.dmall.data.kudu.KuduRow;
import com.dmall.data.util.LogHelper;
import org.apache.kudu.Type;
import org.slf4j.Logger;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
 * Batch-migrates rows from a mysql or hive source table into an existing kudu table.
 *
 * <p>Args: [0] jdbc url, [1] username, [2] password, [3] db name, [4] table name,
 * [5] primary-key column name, [6] method ("mysql2kudu" | "hive2kudu").
 *
 * @author ywf
 * @Title: KuduDataTest
 * @ProjectName dmall-data-mart
 * @date 2019/1/23 16:55
 **/
public class initial_InsertKudu {
    /** Kudu master quorum. */
    private static String masterHost = "idc-10-248-3-71.ddw.dmall.com:7051,idc-10-248-3-72.ddw.dmall.com:7051,idc-10-248-3-73.ddw.dmall.com:7051";
    private static Logger logger = LogHelper.LOG_KUDU_JOB;
    static KuduAgentClient agent = null;
    /** Rows fetched from the source per batch, to bound memory use on large tables. */
    private static final long BATCH_SIZE = 10000L;
    private static String mysqlORhive_url = null;
    private static String mysqlORhive_username = null;
    private static String mysqlORhive_password = null;
    private static String mysqlORhive_dbname = null;
    private static String mysqlORhive_tablename = null;
    private static String tableKeyName = null;

    public static void main(String[] args) {
        try {
            agent = new KuduAgentClient(masterHost);
            mysqlORhive_url = args[0];
            mysqlORhive_username = args[1];
            mysqlORhive_password = args[2];
            mysqlORhive_dbname = args[3];
            mysqlORhive_tablename = args[4];
            tableKeyName = args[5];
            String executeMothod = args[6];
            logger.info("###get input data type :{}->client:{}", executeMothod, agent);
            switch (executeMothod) {
                case "mysql2kudu":
                    mysql2kudu(mysqlORhive_dbname, mysqlORhive_tablename);
                    break;
                case "hive2kudu":
                    hive2kudu(mysqlORhive_dbname, mysqlORhive_tablename);
                    break;
                default:
                    break;
            }
        } catch (Exception e) {
            e.printStackTrace();
            logger.info("###Execute error message {}", e);
        }
    }

    /**
     * Counts the rows of the source table.
     *
     * @param fromHive read from the hive pool (true) or the mysql pool (false)
     * @return the row count, or 0 when the count query yields no row
     */
    private static long countRows(boolean fromHive, String dbName, String tableName) throws Exception {
        String rowNum = "SELECT count(1) FROM " + dbName + "." + tableName;
        logger.info("execute sql:{}", rowNum);
        // try-with-resources: the original never closed the count statement/result set
        try (Connection conn = (fromHive ? HiveUtil.getDsDataSource() : JdbcUtil.getDsDataSource()).getConnection();
             PreparedStatement pstmt = conn.prepareStatement(rowNum);
             ResultSet rs = pstmt.executeQuery()) {
            if (rs.next()) {
                long total = Long.parseLong(String.valueOf(rs.getObject(1)));
                logger.info("table行數爲:{}", total);
                return total;
            }
        }
        return 0L;
    }

    /**
     * Maps one JDBC column (name + value + type name) onto a KuduColumn, substituting
     * sentinel defaults (-1 / "null") for NULL or empty values so kudu's NOT NULL
     * columns can always be written. Type names are matched case-insensitively so the
     * same mapping serves hive (lowercase) and mysql (uppercase) metadata.
     */
    private static KuduColumn toKuduColumn(String columnName, Object value, String typeName) {
        String t = typeName.toLowerCase();
        // "".equals(value): the original used reference comparison (value == ""),
        // which never matches a freshly-read database string
        boolean empty = (value == null || "".equals(value));
        Type kuduType = null;
        Object kuduValue = value;
        if (t.startsWith("tinyint")) {
            kuduType = Type.INT8;
            kuduValue = empty ? -1 : Integer.parseInt(String.valueOf(value));
        } else if (t.startsWith("smallint")) {
            kuduType = Type.INT16;
            kuduValue = empty ? -1 : Integer.parseInt(String.valueOf(value));
        } else if (t.startsWith("int")) {
            kuduType = Type.INT32;
            kuduValue = empty ? -1 : Integer.parseInt(String.valueOf(value));
        } else if (t.startsWith("bigint")) {
            kuduType = Type.INT64;
            kuduValue = empty ? -1L : Long.parseLong(String.valueOf(value));
        } else if (t.startsWith("string") || t.startsWith("varchar") || t.startsWith("datetime")) {
            kuduType = Type.STRING;
            kuduValue = empty ? "null" : String.valueOf(value);
        } else if (t.startsWith("float")) {
            kuduType = Type.FLOAT;
            kuduValue = empty ? -1f : Float.parseFloat(String.valueOf(value));
        } else if (t.startsWith("double") || t.startsWith("decimal")) {
            kuduType = Type.DOUBLE;
            kuduValue = empty ? -1d : Double.parseDouble(String.valueOf(value));
        } else {
            // placeholder fix: the original message had no {} and dropped both args
            logger.info("no kudu type mapping for column type {}", typeName);
        }
        KuduColumn c01 = new KuduColumn();
        c01.setColumnName(columnName).setColumnValue(kuduValue).setColumnType(kuduType);
        return c01;
    }

    /**
     * Runs one batch SELECT and upserts every returned row into kudu.
     *
     * <p>The connection is scoped to the batch: the original re-acquired a pooled
     * connection every iteration without closing the previous one, which exhausts a
     * pool with maxActive=10 after ten batches.
     *
     * @param fromHive         read from the hive pool (true) or the mysql pool (false)
     * @param sql              batch SELECT to execute
     * @param firstColumnIndex 1-based index of the first real data column (2 for hive,
     *                         skipping the synthetic rnum column)
     */
    private static void upsertBatch(boolean fromHive, String sql, int firstColumnIndex,
                                    String dbName, String tableName) throws Exception {
        try (Connection conn = (fromHive ? HiveUtil.getDsDataSource() : JdbcUtil.getDsDataSource()).getConnection();
             PreparedStatement pstmt = conn.prepareStatement(sql);
             ResultSet rs = pstmt.executeQuery()) {
            ResultSetMetaData md = rs.getMetaData();
            int columnCount = md.getColumnCount();
            while (rs.next()) {
                List<KuduColumn> row = new ArrayList<>(columnCount);
                for (int i = firstColumnIndex; i <= columnCount; i++) {
                    String columnName = md.getColumnName(i);
                    if (fromHive && columnName.contains(".")) {
                        columnName = columnName.split("\\.")[1]; // strip hive's "table." prefix
                    }
                    row.add(toKuduColumn(columnName, rs.getObject(i), md.getColumnTypeName(i)));
                }
                KuduRow myrows01 = new KuduRow();
                myrows01.setRows(row);
                // For a kudu table created via impala-shell, prefix the name with "impala::";
                // tables created through the kudu api need no prefix.
                agent.upsert(dbName + "." + tableName, agent.getKdClient(), myrows01);
            }
        }
    }

    /**
     * Migrates a hive table into kudu in BATCH_SIZE chunks. Hive lacks "limit m,n",
     * so rows are windowed with row_number() ordered by the key column.
     *
     * <p>Fix: the original loop compared the uncapped end index against the total row
     * count and silently dropped the final partial batch (data loss for any table whose
     * size is not a multiple of BATCH_SIZE); iterate on the begin index instead.
     */
    private static void hive2kudu(String mysqlORhive_dbname, String mysqlORhive_tablename) throws Exception {
        new HiveUtil(mysqlORhive_url, mysqlORhive_username, mysqlORhive_password);
        long tableRowNumber = countRows(true, mysqlORhive_dbname, mysqlORhive_tablename);
        long beginIndex = 1L;
        while (beginIndex <= tableRowNumber) {
            long endIndex = Math.min(beginIndex + BATCH_SIZE - 1L, tableRowNumber);
            String sql = "select * from (select (row_number() over (order by " + tableKeyName + ")) as rnum ,* from "
                    + mysqlORhive_dbname + "." + mysqlORhive_tablename
                    + ")t where rnum between " + beginIndex + " and " + endIndex;
            logger.info("execute sql:{}", sql);
            // first data column is 2: column 1 is the synthetic rnum
            upsertBatch(true, sql, 2, mysqlORhive_dbname, mysqlORhive_tablename);
            beginIndex = endIndex + 1L;
        }
    }

    /**
     * Migrates a mysql table into kudu in BATCH_SIZE chunks via LIMIT offset,count.
     *
     * <p>Fixes: MySQL LIMIT takes (offset, rowCount) — the original passed the absolute
     * end index as the row count (overlapping, ever-growing batches), advanced the
     * offset by one too many (skipping one row per batch), and dropped the final
     * partial batch. A plain offset loop does all three correctly.
     */
    public static void mysql2kudu(String mysqlORhive_dbname, String mysqlORhive_tablename) throws Exception {
        new JdbcUtil(mysqlORhive_url, mysqlORhive_username, mysqlORhive_password);
        long tableRowNumber = countRows(false, mysqlORhive_dbname, mysqlORhive_tablename);
        long offset = 0L;
        while (offset < tableRowNumber) {
            String sql = "SELECT * FROM " + mysqlORhive_dbname + "." + mysqlORhive_tablename
                    + " limit " + offset + "," + BATCH_SIZE;
            logger.info("execute sql:{}", sql);
            upsertBatch(false, sql, 1, mysqlORhive_dbname, mysqlORhive_tablename);
            offset += BATCH_SIZE;
        }
    }
}
四:注意事項
- 使用kudu api創建表以後,在impala-shell裏面是看不到的,需要創建外部表,在做下映射才行,具體做法如下:
- CREATE EXTERNAL TABLE dmall_pos_sale.payment
STORED AS KUDU
TBLPROPERTIES (
'kudu.table_name' = 'dmall_pos_sale.payment'
); - 在做mysql和hive到kudu數據類型映射的時候,注意類型的轉換。
- 創建表時,指定主鍵的列,必須要放到kudu表結構的第一列,不然也會拋異常。
- 針對插入大數據量時要注意內存的溢出問題。
- 使用agent.upsert方法時要注意是否需要在表名前加上impala::
以上就是全部內容,原創作品,轉載請註明出處
謝謝!