Hive知識總結

hive
--------------
   數據倉庫,在線分析處理。
   HiveQL,類似sql語言。
   表,metadata->rdbms.
   hive處理的數據是hdfs.
   MR,聚合操作。

1）內部表,2）管理表,3）託管表
-----------------------
內部表的特點：hive,drop ,數據也刪除

外部表
-------------------------
hive表結構。

分區表；
---------------
目錄.
where 縮小查詢範圍。

bucket表
---------------
   文件。
   hash
   clustered by ''

join（鏈接查詢）
-------------
水平進行查詢

union
-------------
select id（聯合查詢，可以查不同的表）

hive
-------------------
   select id,name from customers union select id,orderno from orders（訂單號） ;
   $>hive                           //hive --service cli
   $>hive --service hiveserver2       //啓動hiveserver2，10000 [thriftServer]
   $>hive --service beeline       //beeline

hive使用jdbc協議實現遠程訪問
-----------------------------

hive
------------
$hive>CREATE TABLE t3(id int,name string,age int) PARTITIONED BY (Year INT, Month INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;

export
---------
$hive>EXPORT TABLE customers TO '/user/centos/tmp.txt'; //導出表結構+數據。

//order全排序
$hive>select * from orders order by id asc ;

//sort,map端排序,本地有序。
$hive>select * from orders sort by id asc ;

   //DISTRIBUTE BY類似於mysql的group by,進行分區操作。
   //select cid , ... from orders distribute by cid sort by name ;           //注意順序.
   $hive>select id,orderno,cid from orders distribute by cid sort by cid desc ;

//cluster by ===> distribute by cid sort by cid

函數（主要講解hive中的函數）
----------------
   mysql>select concat('tom',1000) ;
   $hive>select current_database(),current_user() ;
   $hive>tab                               //查看幫助

設置作業參數
---------------
   $hive>set hive.exec.reducers.bytes.per.reducer=xxx           //設置reducetask的字節數。
   $hive>set hive.exec.reducers.max=0                           //設置reduce task的最大任務數
   $hive>set mapreduce.job.reduces=0                           //設置reducetask個數。

動態分區
---------------
   動態分區模式:strict-嚴格模式，插入時至少指定一個靜態分區，nonstrict-非嚴格模式-可以不指定靜態分區。
   set hive.exec.dynamic.partition.mode=nonstrict           //設置非嚴格模式
   $hive>INSERT OVERWRITE TABLE employees PARTITION (country, state) SELECT ..., se.cnty, se.st FROM staged_employees se WHERE se.cnty = 'US';

159

hive事務處理在>0.13.0之後支持行級事務。
---------------------------------------
   1.所有事務自動提交。
   2.只支持orc格式。面向列的數據格式（相當於hbase）
   3.使用bucket表。
   4.配置hive參數，使其支持事務。
$hive>SET hive.support.concurrency = true;   （支持併發）
$hive>SET hive.enforce.bucketing = true;       （強制支持bucket處理）
$hive>SET hive.exec.dynamic.partition.mode = nonstrict;   （動態分區）
$hive>SET hive.txn.manager = org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
$hive>SET hive.compactor.initiator.on = true;
$hive>SET hive.compactor.worker.threads = 1;

5.使用事務性操作
$hive>CREATE TABLE tx(id int,name string,age int) CLUSTERED BY (id) INTO 3 BUCKETS ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' stored as orc TBLPROPERTIES ('transactional'='true');

聚合處理
---------------
   $hive>select cid,count(*) c ,max(price) from orders group by cid having c > 1 ;

wordcount
----------------
   $hive>select t.word,count(*) c from ((select explode(split(line, ' ')) as word from doc) as t) group by t.word order by c desc limit 2 ;

創建新表:stats(word string,c int) ;
將查詢結果插入到指定表中。

view:視圖,虛表
-----------
//創建視圖
$hive>create view v1 as select a.id aid,a.name ,b.id bid , b.`order` from customers a left outer join default.tt b on a.id = b.cid ;   //order是保留字,作爲列名需用反引號

   //查看視圖
   $hive>show tables ;
   $hive>select * from v1 ;

Map端連接
-------------------
   $hive>set hive.auto.convert.join=true           //設置自動轉換連接,默認開啓了。
   //使用mapjoin連接暗示實現mapjoin
   $hive>select /*+ mapjoin(customers) */ a.*,b.* from customers a left outer join orders b on a.id = b.cid ;

調優
--------------------
   1.explain
       使用explain查看查詢計劃
       hive>explain [extended] select count(*) from customers ;
       hive>explain select t.name , count(*) from (select a.name ,b.id,b.orderno from customers a ,orders b where a.id = b.cid) t group by t.name ;

//設置limit優化，避免全表查詢.
hive>set hive.limit.optimize.enable=true

       //本地模式
       $hive>set mapred.job.tracker=local;           //
       $hive>set hive.exec.mode.local.auto=true   //自動本地模式,測試

//並行執行,同時執行不存在依賴關係的階段。??
$hive>set hive.exec.parallel=true //

       //嚴格模式,
       $hive>set hive.mapred.mode=strict           //1.分區表必須指定分區進行查詢
                                                   //2.order by時必須使用limit子句。
                                                   //3.不允許笛卡爾積.

//設置MR的數量
hive> set hive.exec.reducers.bytes.per.reducer=750000000; //設置reduce處理的字節數。

//JVM重用
$hive>set mapreduce.job.jvm.numtasks=1 //-1沒有限制，使用大量小文件。

       //UDF
       //User define function,用戶自定義函數
       //current_database(),current_user();

       //顯示所有函數
       $hive>show functions;
       $hive>select array(1,2,3) ;

//顯示指定函數的幫助(函數名後不帶括號)
$hive>desc function current_database;

//表生成函數,多行函數。
$hive>select explode(split(str,exp)); //split按照exp切割str得到數組,explode將數組展開爲多行.

自定義函數
------------------
1.創建類，繼承UDF
package com.it18zhang.hivedemo.udf;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;

       /**
        * Custom Hive function that adds integers.
        *
        * The two-argument and three-argument forms shown in the description
        * below correspond to the two evaluate overloads of this class.
        */
       @Description(name = "myadd",
               value = "myadd(int a , int b) ==> return a + b ",
               extended = "Example:\n"
                       + " myadd(1,1) ==> 2 \n"
                       + " myadd(1,2,3) ==> 6;")
       public class AddUDF extends UDF {

           /**
            * Adds two integers.
            *
            * @param a first operand
            * @param b second operand
            * @return a + b
            */
           public int evaluate(int a, int b) {
               final int sum = a + b;
               return sum;
           }

           /**
            * Adds three integers.
            *
            * @param a first operand
            * @param b second operand
            * @param c third operand
            * @return a + b + c
            */
           public int evaluate(int a, int b, int c) {
               final int firstTwo = a + b;
               return firstTwo + c;
           }
       }
   2.打成jar包。
       cmd>cd {classes所在目錄}
       cmd>jar cvf HiveDemo.jar -C x/x/x/x/classes/ .
   3.添加jar包到hive的類路徑
       //添加jar到類路徑
       $>cp /mnt/hgfs/downloads/bigdata/data/HiveDemo.jar /soft/hive/lib

   3.重進入hive
       $>....

   4.創建臨時函數
       //
       CREATE TEMPORARY FUNCTION myadd AS 'com.it18zhang.hivedemo.udf.AddUDF';

   5.在查詢中使用自定義函數
       $hive>select myadd(1,2) ;

   6.定義日期函數
       1)定義類
       /**
        * Hive UDF that renders a date as text.
        *
        * With no pattern supplied, the default pattern is used; with no date
        * supplied, the server's current system time is formatted.
        */
       public class ToCharUDF extends UDF {
           /**
            * Default output pattern. HH is the 24-hour clock (00-23); the
            * lowercase hh is the 12-hour clock and, without an AM/PM marker,
            * would render 16:53:55 as 04:53:55.
            */
           private static final String DEFAULT_PATTERN = "yyyy/MM/dd HH:mm:ss";

           /**
            * Formats the server's current system time, e.g. 2017/03/21 16:53:55.
            *
            * @return the current time rendered with the default pattern
            */
           public String evaluate() {
               return evaluate(new Date(), DEFAULT_PATTERN);
           }

           /**
            * Formats the given date with the default pattern.
            *
            * @param date the date to format; may be null
            * @return the formatted text, or null when date is null
            */
           public String evaluate(Date date) {
               return evaluate(date, DEFAULT_PATTERN);
           }

           /**
            * Formats the given date with a caller-supplied pattern.
            *
            * @param date the date to format; may be null
            * @param frt  a SimpleDateFormat pattern; may be null
            * @return the formatted text, or null when either argument is null
            * @throws IllegalArgumentException if frt is not a valid pattern
            */
           public String evaluate(Date date, String frt) {
               // NULL in, NULL out (the usual SQL convention) rather than a
               // NullPointerException from SimpleDateFormat.
               if (date == null || frt == null) {
                   return null;
               }
               // SimpleDateFormat is not thread-safe, so build one per call.
               SimpleDateFormat sdf = new SimpleDateFormat();
               sdf.applyPattern(frt);
               return sdf.format(date);
           }
       }

       2)導出jar包，通過命令添加到hive的類路徑(不需要重進hive)。
           $hive>add jar /mnt/hgfs/downloads/bigdata/data/HiveDemo-1.0-SNAPSHOT.jar

       3)註冊函數
           $hive>CREATE TEMPORARY FUNCTION to_char AS 'com.it18zhang.hivedemo.udf.ToCharUDF';
           $hive>CREATE TEMPORARY FUNCTION to_date AS 'com.it18zhang.hivedemo.udf.ToDateUDF';

定義Nvl函數
------------------
package com.it18zhang.hivedemo.udf;

   import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
   import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
   import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
   import org.apache.hadoop.hive.ql.metadata.HiveException;
   import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
   import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils;
   import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;

   /**
    * Custom NULL-handling function: evaluates to its first argument unless
    * that argument is null, in which case it evaluates to the second one.
    */
   public class Nvl extends GenericUDF {
       // Resolves the common return type of both arguments and converts values to it.
       private GenericUDFUtils.ReturnObjectInspectorResolver returnOIResolver;
       // Inspectors of the call's arguments, captured in initialize() for use in evaluate().
       private ObjectInspector[] argumentOIs;

       /**
        * Validates the call and determines the return type.
        *
        * @param arguments inspectors describing the arguments of the call
        * @return the inspector produced by the resolver for both arguments
        * @throws UDFArgumentLengthException if the call does not have exactly 2 arguments
        * @throws UDFArgumentTypeException   if the resolver rejects the two argument types
        */
       public ObjectInspector initialize(ObjectInspector[] arguments)
               throws UDFArgumentException {
           argumentOIs = arguments;
           // Check the number of arguments.
           if (arguments.length != 2) {
               throw new UDFArgumentLengthException(
                       "The operator 'NVL' accepts 2 arguments.");
           }
           // NOTE(review): the `true` flag presumably allows type conversion
           // between the two arguments - confirm against GenericUDFUtils.
           returnOIResolver = new GenericUDFUtils.ReturnObjectInspectorResolver(true);
           // Check the argument types: both must be accepted by the resolver.
           if (!(returnOIResolver.update(arguments[0]) && returnOIResolver
                   .update(arguments[1]))) {
               throw new UDFArgumentTypeException(2,
                       "The 1st and 2nd args of function NVL should have the same type, "
                               + "but they are different: \"" + arguments[0].getTypeName()
                               + "\" and \"" + arguments[1].getTypeName() + "\"");
           }
           return returnOIResolver.get();
       }

       /**
        * Returns the first argument, or the second when the first is null.
        *
        * @param arguments the deferred argument values of the current row
        * @return the chosen value, converted by the resolver where necessary
        * @throws HiveException if obtaining an argument value fails
        */
       public Object evaluate(DeferredObject[] arguments) throws HiveException {
           Object retVal = returnOIResolver.convertIfNecessary(arguments[0].get(), argumentOIs[0]);
           // The second argument is only fetched when the first one is null.
           if (retVal == null) {
               retVal = returnOIResolver.convertIfNecessary(arguments[1].get(),
                       argumentOIs[1]);
           }
           return retVal;
       }

       /**
        * Builds the text describing this call, e.g. "if a is null returns b".
        *
        * @param children the display strings of the two arguments
        * @return the description of the call
        */
       public String getDisplayString(String[] children) {
           StringBuilder sb = new StringBuilder();
           sb.append("if ");
           sb.append(children[0]);
           sb.append(" is null ");
           // Trailing space keeps "returns" from running into the second operand.
           sb.append("returns ");
           sb.append(children[1]);
           return sb.toString();
       }
   }

   2)添加jar到類路徑
       ...
   3)註冊函數
       $hive>CREATE TEMPORARY FUNCTION nvl AS 'com.it18zhang.hivedemo.udf.Nvl';

大數據MR模型以及代碼實現

kubernetes分佈式安裝部署-簡介

redis簡介-安裝部署-命令說明

UWSGI的作用

大數據—zookeeper介紹-配置-安裝-使用命令

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結