Hive SQL案例分析

數據籌備

7369,SMITH,CLERK,7902,1980-12-17 00:00:00,800,\N,20
7499,ALLEN,SALESMAN,7698,1981-02-20 00:00:00,1600,300,30
7521,WARD,SALESMAN,7698,1981-02-22 00:00:00,1250,500,30
7566,JONES,MANAGER,7839,1981-04-02 00:00:00,2975,\N,20
7654,MARTIN,SALESMAN,7698,1981-09-28 00:00:00,1250,1400,30
7698,BLAKE,MANAGER,7839,1981-05-01 00:00:00,2850,\N,30
7782,CLARK,MANAGER,7839,1981-06-09 00:00:00,2450,\N,10
7788,SCOTT,ANALYST,7566,1987-04-19 00:00:00,1500,\N,20
7839,KING,PRESIDENT,\N,1981-11-17 00:00:00,5000,\N,10
7844,TURNER,SALESMAN,7698,1981-09-08 00:00:00,1500,0,30
7876,ADAMS,CLERK,7788,1987-05-23 00:00:00,1100,\N,20
7900,JAMES,CLERK,7698,1981-12-03 00:00:00,950,\N,30
7902,FORD,ANALYST,7566,1981-12-03 00:00:00,3000,\N,20
7934,MILLER,CLERK,7782,1982-01-23 00:00:00,1300,\N,10

-- Employee table stored as plain text: one record per line, eight
-- comma-separated fields in the column order below. Collection/map delimiters
-- are declared even though no column here is a complex type.
CREATE TABLE t_employee (
    empno    INT,
    ename    STRING,
    job      STRING,
    mgr      INT,
    hiredate TIMESTAMP,
    sal      DECIMAL(7,2),
    comm     DECIMAL(7,2),
    deptno   INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '|'
    MAP KEYS TERMINATED BY '>'
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

10,ACCOUNTING,NEW YORK
20,RESEARCH,DALLAS
30,SALES,CHICAGO
40,OPERATIONS,BOSTON

-- Department table stored as plain text: one record per line, three
-- comma-separated fields (deptno, dname, loc).
CREATE TABLE t_dept (
    deptno INT,
    dname  STRING,
    loc    STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '|'
    MAP KEYS TERMINATED BY '>'
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

0: jdbc:hive2://CentOS:10000> select empno,ename,job,mgr,hiredate,sal,comm,deptno  from t_employee;
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| empno  |  ename  |    job     |  mgr  |        hiredate        |  sal  | comm  | deptno  |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| 7369   | SMITH   | CLERK      | 7902  | 1980-12-17 00:00:00.0  | 800   | NULL  | 20      |
| 7499   | ALLEN   | SALESMAN   | 7698  | 1981-02-20 00:00:00.0  | 1600  | 300   | 30      |
| 7521   | WARD    | SALESMAN   | 7698  | 1981-02-22 00:00:00.0  | 1250  | 500   | 30      |
| 7566   | JONES   | MANAGER    | 7839  | 1981-04-02 00:00:00.0  | 2975  | NULL  | 20      |
| 7654   | MARTIN  | SALESMAN   | 7698  | 1981-09-28 00:00:00.0  | 1250  | 1400  | 30      |
| 7698   | BLAKE   | MANAGER    | 7839  | 1981-05-01 00:00:00.0  | 2850  | NULL  | 30      |
| 7782   | CLARK   | MANAGER    | 7839  | 1981-06-09 00:00:00.0  | 2450  | NULL  | 10      |
| 7788   | SCOTT   | ANALYST    | 7566  | 1987-04-19 00:00:00.0  | 1500  | NULL  | 20      |
| 7839   | KING    | PRESIDENT  | NULL  | 1981-11-17 00:00:00.0  | 5000  | NULL  | 10      |
| 7844   | TURNER  | SALESMAN   | 7698  | 1981-09-08 00:00:00.0  | 1500  | 0     | 30      |
| 7876   | ADAMS   | CLERK      | 7788  | 1987-05-23 00:00:00.0  | 1100  | NULL  | 20      |
| 7900   | JAMES   | CLERK      | 7698  | 1981-12-03 00:00:00.0  | 950   | NULL  | 30      |
| 7902   | FORD    | ANALYST    | 7566  | 1981-12-03 00:00:00.0  | 3000  | NULL  | 20      |
| 7934   | MILLER  | CLERK      | 7782  | 1982-01-23 00:00:00.0  | 1300  | NULL  | 10      |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
14 rows selected (0.047 seconds)
0: jdbc:hive2://CentOS:10000> select deptno,dname,loc from t_dept;
+---------+-------------+-----------+--+
| deptno  |    dname    |    loc    |
+---------+-------------+-----------+--+
| 10      | ACCOUNTING  | NEW YORK  |
| 20      | RESEARCH    | DALLAS    |
| 30      | SALES       | CHICAGO   |
| 40      | OPERATIONS  | BOSTON    |
+---------+-------------+-----------+--+
4 rows selected (0.046 seconds)

-- Partitioned variant of the employee table: same columns as t_employee except
-- that deptno is declared as the partition column instead of a regular field.
CREATE TABLE t_employee_partition (
    empno    INT,
    ename    STRING,
    job      STRING,
    mgr      INT,
    hiredate TIMESTAMP,
    sal      DECIMAL(7,2),
    comm     DECIMAL(7,2)
)
PARTITIONED BY (deptno INT)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '|'
    MAP KEYS TERMINATED BY '>'
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

0: jdbc:hive2://CentOS:10000> set hive.exec.dynamic.partition.mode=nonstrict;
0: jdbc:hive2://CentOS:10000> INSERT OVERWRITE TABLE t_employee_partition PARTITION (deptno)  SELECT empno,ename,job,mgr,hiredate,sal,comm,deptno FROM t_employee;

SQL查詢

單表查詢

0: jdbc:hive2://CentOS:10000> select empno,ename,job,mgr,hiredate,sal,comm,deptno  from t_employee;
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| empno  |  ename  |    job     |  mgr  |        hiredate        |  sal  | comm  | deptno  |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| 7369   | SMITH   | CLERK      | 7902  | 1980-12-17 00:00:00.0  | 800   | NULL  | 20      |
| 7499   | ALLEN   | SALESMAN   | 7698  | 1981-02-20 00:00:00.0  | 1600  | 300   | 30      |
| 7521   | WARD    | SALESMAN   | 7698  | 1981-02-22 00:00:00.0  | 1250  | 500   | 30      |
| 7566   | JONES   | MANAGER    | 7839  | 1981-04-02 00:00:00.0  | 2975  | NULL  | 20      |
| 7654   | MARTIN  | SALESMAN   | 7698  | 1981-09-28 00:00:00.0  | 1250  | 1400  | 30      |
| 7698   | BLAKE   | MANAGER    | 7839  | 1981-05-01 00:00:00.0  | 2850  | NULL  | 30      |
| 7782   | CLARK   | MANAGER    | 7839  | 1981-06-09 00:00:00.0  | 2450  | NULL  | 10      |
| 7788   | SCOTT   | ANALYST    | 7566  | 1987-04-19 00:00:00.0  | 1500  | NULL  | 20      |
| 7839   | KING    | PRESIDENT  | NULL  | 1981-11-17 00:00:00.0  | 5000  | NULL  | 10      |
| 7844   | TURNER  | SALESMAN   | 7698  | 1981-09-08 00:00:00.0  | 1500  | 0     | 30      |
| 7876   | ADAMS   | CLERK      | 7788  | 1987-05-23 00:00:00.0  | 1100  | NULL  | 20      |
| 7900   | JAMES   | CLERK      | 7698  | 1981-12-03 00:00:00.0  | 950   | NULL  | 30      |
| 7902   | FORD    | ANALYST    | 7566  | 1981-12-03 00:00:00.0  | 3000  | NULL  | 20      |
| 7934   | MILLER  | CLERK      | 7782  | 1982-01-23 00:00:00.0  | 1300  | NULL  | 10      |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
14 rows selected (0.056 seconds)

WHERE查詢

0: jdbc:hive2://CentOS:10000> SELECT empno,ename,job,mgr,hiredate,sal,comm,deptno FROM t_employee WHERE empno > 7782 AND deptno = 10;
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| empno  |  ename  |    job     |  mgr  |        hiredate        |  sal  | comm  | deptno  |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| 7839   | KING    | PRESIDENT  | NULL  | 1981-11-17 00:00:00.0  | 5000  | NULL  | 10      |
| 7934   | MILLER  | CLERK      | 7782  | 1982-01-23 00:00:00.0  | 1300  | NULL  | 10      |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
2 rows selected (0.067 seconds)

DISTINCT查詢

0: jdbc:hive2://CentOS:10000> select distinct(job) from t_employee;
+------------+--+
|    job     |
+------------+--+
| ANALYST    |
| CLERK      |
| MANAGER    |
| PRESIDENT  |
| SALESMAN   |
+------------+--+

分區查詢

0: jdbc:hive2://CentOS:10000> SELECT empno,ename,job,mgr,hiredate,sal,comm,deptno FROM t_employee_partition e  WHERE e.deptno >= 20 AND e.deptno <= 40;
+--------+---------+-----------+-------+------------------------+-------+-------+---------+--+
| empno  |  ename  |    job    |  mgr  |        hiredate        |  sal  | comm  | deptno  |
+--------+---------+-----------+-------+------------------------+-------+-------+---------+--+
| 7369   | SMITH   | CLERK     | 7902  | 1980-12-17 00:00:00.0  | 800   | NULL  | 20      |
| 7566   | JONES   | MANAGER   | 7839  | 1981-04-02 00:00:00.0  | 2975  | NULL  | 20      |
| 7788   | SCOTT   | ANALYST   | 7566  | 1987-04-19 00:00:00.0  | 1500  | NULL  | 20      |
| 7876   | ADAMS   | CLERK     | 7788  | 1987-05-23 00:00:00.0  | 1100  | NULL  | 20      |
| 7902   | FORD    | ANALYST   | 7566  | 1981-12-03 00:00:00.0  | 3000  | NULL  | 20      |
| 7499   | ALLEN   | SALESMAN  | 7698  | 1981-02-20 00:00:00.0  | 1600  | 300   | 30      |
| 7521   | WARD    | SALESMAN  | 7698  | 1981-02-22 00:00:00.0  | 1250  | 500   | 30      |
| 7654   | MARTIN  | SALESMAN  | 7698  | 1981-09-28 00:00:00.0  | 1250  | 1400  | 30      |
| 7698   | BLAKE   | MANAGER   | 7839  | 1981-05-01 00:00:00.0  | 2850  | NULL  | 30      |
| 7844   | TURNER  | SALESMAN  | 7698  | 1981-09-08 00:00:00.0  | 1500  | 0     | 30      |
| 7900   | JAMES   | CLERK     | 7698  | 1981-12-03 00:00:00.0  | 950   | NULL  | 30      |
+--------+---------+-----------+-------+------------------------+-------+-------+---------+--+
11 rows selected (0.123 seconds)

LIMIT查詢

0: jdbc:hive2://CentOS:10000> SELECT empno,ename,job,mgr,hiredate,sal,comm,deptno FROM t_employee  ORDER BY sal DESC LIMIT 5;
+--------+--------+------------+-------+------------------------+-------+-------+---------+--+
| empno  | ename  |    job     |  mgr  |        hiredate        |  sal  | comm  | deptno  |
+--------+--------+------------+-------+------------------------+-------+-------+---------+--+
| 7839   | KING   | PRESIDENT  | NULL  | 1981-11-17 00:00:00.0  | 5000  | NULL  | 10      |
| 7902   | FORD   | ANALYST    | 7566  | 1981-12-03 00:00:00.0  | 3000  | NULL  | 20      |
| 7566   | JONES  | MANAGER    | 7839  | 1981-04-02 00:00:00.0  | 2975  | NULL  | 20      |
| 7698   | BLAKE  | MANAGER    | 7839  | 1981-05-01 00:00:00.0  | 2850  | NULL  | 30      |
| 7782   | CLARK  | MANAGER    | 7839  | 1981-06-09 00:00:00.0  | 2450  | NULL  | 10      |
+--------+--------+------------+-------+------------------------+-------+-------+---------+--+
5 rows selected (14.294 seconds)

GROUP BY查詢

0: jdbc:hive2://CentOS:10000> set hive.map.aggr=true;
0: jdbc:hive2://CentOS:10000> SELECT deptno,SUM(sal) as total FROM t_employee GROUP BY deptno;
+---------+--------+--+
| deptno  | total  |
+---------+--------+--+
| 10      | 8750   |
| 20      | 9375   |
| 30      | 9400   |
+---------+--------+--+
3 rows selected (12.645 seconds)

hive.map.aggr控制程序如何進行聚合。默認值爲true（Hive 0.3及以後版本）。設置爲true時，Hive會在map階段就執行一次聚合。這可以提高聚合效率，但需要消耗更多內存。

ORDER AND SORT

可以使用ORDER BY或者SORT BY對查詢結果進行排序，排序字段可以是整型也可以是字符串：如果是整型，則按照大小排序；如果是字符串，則按照字典序排序。ORDER BY 和 SORT BY 的區別如下：使用ORDER BY時會有一個Reducer對全部查詢結果進行排序，可以保證數據的全局有序性；使用SORT BY時只會在每個Reducer中進行排序，這可以保證每個Reducer的輸出數據是有序的，但不能保證全局有序。由於ORDER BY的時間可能很長，如果你設置了嚴格模式(hive.mapred.mode = strict)，則其後面必須再跟一個LIMIT子句。

sort by

0: jdbc:hive2://CentOS:10000> set mapreduce.job.reduces=2;
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal from t_employee sort by sal desc;
+--------+---------+-------+--+
| empno  |  ename  |  sal  |
+--------+---------+-------+--+
| 7902   | FORD    | 3000  |
| 7566   | JONES   | 2975  |
| 7844   | TURNER  | 1500  |
| 7788   | SCOTT   | 1500  |
| 7521   | WARD    | 1250  |
| 7654   | MARTIN  | 1250  |
| 7876   | ADAMS   | 1100  |
| 7900   | JAMES   | 950   |
| 7369   | SMITH   | 800   |
| 7839   | KING    | 5000  |
| 7698   | BLAKE   | 2850  |
| 7782   | CLARK   | 2450  |
| 7499   | ALLEN   | 1600  |
| 7934   | MILLER  | 1300  |
+--------+---------+-------+--+
14 rows selected (14.474 seconds)

order by

0: jdbc:hive2://CentOS:10000> set mapreduce.job.reduces=3;
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal from t_employee order by sal desc;
+--------+---------+-------+--+
| empno  |  ename  |  sal  |
+--------+---------+-------+--+
| 7839   | KING    | 5000  |
| 7902   | FORD    | 3000  |
| 7566   | JONES   | 2975  |
| 7698   | BLAKE   | 2850  |
| 7782   | CLARK   | 2450  |
| 7499   | ALLEN   | 1600  |
| 7844   | TURNER  | 1500  |
| 7788   | SCOTT   | 1500  |
| 7934   | MILLER  | 1300  |
| 7654   | MARTIN  | 1250  |
| 7521   | WARD    | 1250  |
| 7876   | ADAMS   | 1100  |
| 7900   | JAMES   | 950   |
| 7369   | SMITH   | 800   |
+--------+---------+-------+--+
14 rows selected (13.049 seconds)
0: jdbc:hive2://CentOS:10000> set hive.mapred.mode = strict;
No rows affected (0.004 seconds)
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal from t_employee order by sal desc;
Error: Error while compiling statement: FAILED: SemanticException 1:48 In strict mode, if ORDER BY is specified, LIMIT must also be specified. Error encountered near token 'sal' (state=42000,code=40000)
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal from t_employee order by sal desc limit 5; 
+--------+--------+-------+--+
| empno  | ename  |  sal  |
+--------+--------+-------+--+
| 7839   | KING   | 5000  |
| 7902   | FORD   | 3000  |
| 7566   | JONES  | 2975  |
| 7698   | BLAKE  | 2850  |
| 7782   | CLARK  | 2450  |
+--------+--------+-------+--+
5 rows selected (12.468 seconds)

HAVING過濾

0: jdbc:hive2://CentOS:10000> SELECT deptno,SUM(sal) total FROM t_employee GROUP BY deptno HAVING SUM(sal)>9000;
+---------+--------+--+
| deptno  | total  |
+---------+--------+--+
| 30      | 9400   |
| 20      | 9375   |
+---------+--------+--+
2 rows selected (18.361 seconds)

DISTRIBUTE BY

默認情況下，MapReduce程序會對Map輸出結果的Key值進行散列，並均勻分發到所有Reducer上。如果想要把具有相同Key值的數據分發到同一個Reducer進行處理，這就需要使用DISTRIBUTE BY子句。需要注意的是，DISTRIBUTE BY雖然能保證具有相同Key值的數據分發到同一個Reducer，但是不能保證數據在Reducer上是有序的。

0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal, deptno  FROM t_employee distribute BY deptno;
+--------+---------+-------+---------+--+
| empno  |  ename  |  sal  | deptno  |
+--------+---------+-------+---------+--+
| 7654   | MARTIN  | 1250  | 30      |
| 7900   | JAMES   | 950   | 30      |
| 7698   | BLAKE   | 2850  | 30      |
| 7521   | WARD    | 1250  | 30      |
| 7844   | TURNER  | 1500  | 30      |
| 7499   | ALLEN   | 1600  | 30      |
| 7934   | MILLER  | 1300  | 10      |
| 7839   | KING    | 5000  | 10      |
| 7782   | CLARK   | 2450  | 10      |
| 7788   | SCOTT   | 1500  | 20      |
| 7566   | JONES   | 2975  | 20      |
| 7876   | ADAMS   | 1100  | 20      |
| 7902   | FORD    | 3000  | 20      |
| 7369   | SMITH   | 800   | 20      |
+--------+---------+-------+---------+--+
14 rows selected (15.504 seconds)
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal, deptno  FROM t_employee distribute BY deptno sort by sal desc;
+--------+---------+-------+---------+--+
| empno  |  ename  |  sal  | deptno  |
+--------+---------+-------+---------+--+
| 7698   | BLAKE   | 2850  | 30      |
| 7499   | ALLEN   | 1600  | 30      |
| 7844   | TURNER  | 1500  | 30      |
| 7521   | WARD    | 1250  | 30      |
| 7654   | MARTIN  | 1250  | 30      |
| 7900   | JAMES   | 950   | 30      |
| 7839   | KING    | 5000  | 10      |
| 7782   | CLARK   | 2450  | 10      |
| 7934   | MILLER  | 1300  | 10      |
| 7902   | FORD    | 3000  | 20      |
| 7566   | JONES   | 2975  | 20      |
| 7788   | SCOTT   | 1500  | 20      |
| 7876   | ADAMS   | 1100  | 20      |
| 7369   | SMITH   | 800   | 20      |
+--------+---------+-------+---------+--+
14 rows selected (16.528 seconds)

CLUSTER BY

如果SORT BY和DISTRIBUTE BY指定的是相同字段，且SORT BY排序規則是ASC，此時可以使用CLUSTER BY進行替換。

0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal, deptno  FROM t_employee cluster by deptno;
+--------+---------+-------+---------+--+
| empno  |  ename  |  sal  | deptno  |
+--------+---------+-------+---------+--+
| 7934   | MILLER  | 1300  | 10      |
| 7839   | KING    | 5000  | 10      |
| 7782   | CLARK   | 2450  | 10      |
| 7876   | ADAMS   | 1100  | 20      |
| 7788   | SCOTT   | 1500  | 20      |
| 7369   | SMITH   | 800   | 20      |
| 7566   | JONES   | 2975  | 20      |
| 7902   | FORD    | 3000  | 20      |
| 7844   | TURNER  | 1500  | 30      |
| 7499   | ALLEN   | 1600  | 30      |
| 7698   | BLAKE   | 2850  | 30      |
| 7654   | MARTIN  | 1250  | 30      |
| 7521   | WARD    | 1250  | 30      |
| 7900   | JAMES   | 950   | 30      |
+--------+---------+-------+---------+--+
14 rows selected (25.847 seconds)

表Join查詢

Hive支持內連接，外連接，左外連接，右外連接，笛卡爾連接，這和傳統數據庫中的概念是一致的。需要特別強調：JOIN語句的關聯條件必須用ON指定，不能用WHERE指定，否則就會先做笛卡爾積，再過濾，這會導致你得不到預期的結果。

內連接

0: jdbc:hive2://CentOS:10000>  SELECT e.empno,e.ename,e.sal,d.dname,d.deptno FROM t_employee e JOIN t_dept d ON e.deptno = d.deptno WHERE e.empno=7369;
+----------+----------+--------+-----------+-----------+--+
| e.empno  | e.ename  | e.sal  |  d.dname  | d.deptno  |
+----------+----------+--------+-----------+-----------+--+
| 7369     | SMITH    | 800    | RESEARCH  | 20        |
+----------+----------+--------+-----------+-----------+--+
1 row selected (10.419 seconds)

外連接

0: jdbc:hive2://CentOS:10000>  SELECT e.empno,e.ename,e.sal,d.dname,d.deptno FROM t_employee e LEFT OUTER JOIN t_dept d ON e.deptno = d.deptno;
+----------+----------+--------+-------------+-----------+--+
| e.empno  | e.ename  | e.sal  |   d.dname   | d.deptno  |
+----------+----------+--------+-------------+-----------+--+
| 7369     | SMITH    | 800    | RESEARCH    | 20        |
| 7499     | ALLEN    | 1600   | SALES       | 30        |
| 7521     | WARD     | 1250   | SALES       | 30        |
| 7566     | JONES    | 2975   | RESEARCH    | 20        |
| 7654     | MARTIN   | 1250   | SALES       | 30        |
| 7698     | BLAKE    | 2850   | SALES       | 30        |
| 7782     | CLARK    | 2450   | ACCOUNTING  | 10        |
| 7788     | SCOTT    | 1500   | RESEARCH    | 20        |
| 7839     | KING     | 5000   | ACCOUNTING  | 10        |
| 7844     | TURNER   | 1500   | SALES       | 30        |
| 7876     | ADAMS    | 1100   | RESEARCH    | 20        |
| 7900     | JAMES    | 950    | SALES       | 30        |
| 7902     | FORD     | 3000   | RESEARCH    | 20        |
| 7934     | MILLER   | 1300   | ACCOUNTING  | 10        |
+----------+----------+--------+-------------+-----------+--+
14 rows selected (11.424 seconds)
0: jdbc:hive2://CentOS:10000>  SELECT e.empno,e.ename,e.sal,d.dname,d.deptno FROM t_employee e RIGHT OUTER JOIN t_dept d ON e.deptno = d.deptno;
+----------+----------+--------+-------------+-----------+--+
| e.empno  | e.ename  | e.sal  |   d.dname   | d.deptno  |
+----------+----------+--------+-------------+-----------+--+
| 7782     | CLARK    | 2450   | ACCOUNTING  | 10        |
| 7839     | KING     | 5000   | ACCOUNTING  | 10        |
| 7934     | MILLER   | 1300   | ACCOUNTING  | 10        |
| 7369     | SMITH    | 800    | RESEARCH    | 20        |
| 7566     | JONES    | 2975   | RESEARCH    | 20        |
| 7788     | SCOTT    | 1500   | RESEARCH    | 20        |
| 7876     | ADAMS    | 1100   | RESEARCH    | 20        |
| 7902     | FORD     | 3000   | RESEARCH    | 20        |
| 7499     | ALLEN    | 1600   | SALES       | 30        |
| 7521     | WARD     | 1250   | SALES       | 30        |
| 7654     | MARTIN   | 1250   | SALES       | 30        |
| 7698     | BLAKE    | 2850   | SALES       | 30        |
| 7844     | TURNER   | 1500   | SALES       | 30        |
| 7900     | JAMES    | 950    | SALES       | 30        |
| NULL     | NULL     | NULL   | OPERATIONS  | 40        |
+----------+----------+--------+-------------+-----------+--+
15 rows selected (11.063 seconds)
0: jdbc:hive2://CentOS:10000>  SELECT e.empno,e.ename,e.sal,d.dname,d.deptno FROM t_employee e FULL OUTER JOIN t_dept d ON e.deptno = d.deptno;
+----------+----------+--------+-------------+-----------+--+
| e.empno  | e.ename  | e.sal  |   d.dname   | d.deptno  |
+----------+----------+--------+-------------+-----------+--+
| 7934     | MILLER   | 1300   | ACCOUNTING  | 10        |
| 7839     | KING     | 5000   | ACCOUNTING  | 10        |
| 7782     | CLARK    | 2450   | ACCOUNTING  | 10        |
| 7876     | ADAMS    | 1100   | RESEARCH    | 20        |
| 7788     | SCOTT    | 1500   | RESEARCH    | 20        |
| 7369     | SMITH    | 800    | RESEARCH    | 20        |
| 7566     | JONES    | 2975   | RESEARCH    | 20        |
| 7902     | FORD     | 3000   | RESEARCH    | 20        |
| 7844     | TURNER   | 1500   | SALES       | 30        |
| 7499     | ALLEN    | 1600   | SALES       | 30        |
| 7698     | BLAKE    | 2850   | SALES       | 30        |
| 7654     | MARTIN   | 1250   | SALES       | 30        |
| 7521     | WARD     | 1250   | SALES       | 30        |
| 7900     | JAMES    | 950    | SALES       | 30        |
| NULL     | NULL     | NULL   | OPERATIONS  | 40        |
+----------+----------+--------+-------------+-----------+--+
15 rows selected (24.703 seconds)

LEFT SEMI JOIN

LEFT SEMI JOIN （左半連接）是 IN/EXISTS 子查詢的一種更高效的實現。

JOIN 子句中右邊的表只能在 ON 子句中設置過濾條件;
查詢結果只包含左邊表的數據，所以只能SELECT左表中的列。

0: jdbc:hive2://CentOS:10000> SELECT e.empno,e.ename,e.deptno FROM t_employee e LEFT SEMI JOIN t_dept d ON e.deptno = d.deptno AND d.loc="NEW YORK";
+----------+----------+-----------+--+
| e.empno  | e.ename  | e.deptno  |
+----------+----------+-----------+--+
| 7782     | CLARK    | 10        |
| 7839     | KING     | 10        |
| 7934     | MILLER   | 10        |
+----------+----------+-----------+--+
3 rows selected (10.119 seconds)

JOIN優化

STREAMTABLE

在多表進行join的時候，如果每個ON子句都使用到共同的列，此時Hive會進行優化，將多表JOIN在同一個map / reduce作業上進行。同時假定查詢的最後一個表是最大的一個表，在對每行記錄進行JOIN操作時，它將嘗試將其他的表緩存起來，然後掃描最後那個表進行計算。因此用戶需要保證查詢的表的大小從左到右是依次增加的。

-- Three-way join in which both ON clauses use b.key.
SELECT a.val, b.val, c.val
FROM a
JOIN b ON (a.key = b.key)
JOIN c ON (c.key = b.key)

然而用戶並非需要總是把最大的表放在查詢語句的最後面，Hive提供了/*+ STREAMTABLE() */標誌，使用該標識來指出大表，能避免數據表過大導致佔用內存過多而產生的問題。示例如下：

0: jdbc:hive2://CentOS:10000> SELECT /*+ STREAMTABLE(e) */ e.empno,e.ename,d.dname,d.deptno FROM t_employee e JOIN t_dept d ON e.deptno = d.deptno WHERE job='CLERK';
+----------+----------+-------------+-----------+--+
| e.empno  | e.ename  |   d.dname   | d.deptno  |
+----------+----------+-------------+-----------+--+
| 7369     | SMITH    | RESEARCH    | 20        |
| 7876     | ADAMS    | RESEARCH    | 20        |
| 7900     | JAMES    | SALES       | 30        |
| 7934     | MILLER   | ACCOUNTING  | 10        |
+----------+----------+-------------+-----------+--+
4 rows selected (11.645 seconds)

MAPJOIN

如果在進行join操作時，有一個表很小，則可以將join操作調整到map階段執行。這就是典型的極大表和極小表關聯問題。有兩種解決方式：1. 增加 /*+ MAPJOIN(b) */ 標示，如下例所示；2. 設置參數 hive.optimize.bucketmapjoin = true（該參數針對分桶表的map join；若希望Hive依據表大小自動將普通join轉換爲map join，可設置 hive.auto.convert.join = true）。

0: jdbc:hive2://CentOS:10000> SELECT /*+ MAPJOIN(d) */ e.empno, e.ename,d.dname FROM t_employee e  JOIN t_dept d ON d.deptno = e.deptno;
+----------+----------+-------------+--+
| e.empno  | e.ename  |   d.dname   |
+----------+----------+-------------+--+
| 7369     | SMITH    | RESEARCH    |
| 7499     | ALLEN    | SALES       |
| 7521     | WARD     | SALES       |
| 7566     | JONES    | RESEARCH    |
| 7654     | MARTIN   | SALES       |
| 7698     | BLAKE    | SALES       |
| 7782     | CLARK    | ACCOUNTING  |
| 7788     | SCOTT    | RESEARCH    |
| 7839     | KING     | ACCOUNTING  |
| 7844     | TURNER   | SALES       |
| 7876     | ADAMS    | RESEARCH    |
| 7900     | JAMES    | SALES       |
| 7902     | FORD     | RESEARCH    |
| 7934     | MILLER   | ACCOUNTING  |
+----------+----------+-------------+--+
14 rows selected (11.416 seconds)

開窗函數

0: jdbc:hive2://CentOS:10000> select e.empno ,e.ename,e.sal,e.deptno,rank() over(partition by e.deptno order by e.sal desc) as rank from t_employee e; 
+----------+----------+--------+-----------+-------+--+
| e.empno  | e.ename  | e.sal  | e.deptno  | rank  |
+----------+----------+--------+-----------+-------+--+
| 7839     | KING     | 5000   | 10        | 1     |
| 7782     | CLARK    | 2450   | 10        | 2     |
| 7934     | MILLER   | 1300   | 10        | 3     |
| 7902     | FORD     | 3000   | 20        | 1     |
| 7566     | JONES    | 2975   | 20        | 2     |
| 7788     | SCOTT    | 1500   | 20        | 3     |
| 7876     | ADAMS    | 1100   | 20        | 4     |
| 7369     | SMITH    | 800    | 20        | 5     |
| 7698     | BLAKE    | 2850   | 30        | 1     |
| 7499     | ALLEN    | 1600   | 30        | 2     |
| 7844     | TURNER   | 1500   | 30        | 3     |
| 7654     | MARTIN   | 1250   | 30        | 4     |
| 7521     | WARD     | 1250   | 30        | 4     |
| 7900     | JAMES    | 950    | 30        | 6     |
+----------+----------+--------+-----------+-------+--+
0: jdbc:hive2://CentOS:10000> select e.empno ,e.ename,e.sal,e.deptno,dense_rank() over(partition by e.deptno order by e.sal desc) as rank from t_employee e; 
+----------+----------+--------+-----------+-------+--+
| e.empno  | e.ename  | e.sal  | e.deptno  | rank  |
+----------+----------+--------+-----------+-------+--+
| 7839     | KING     | 5000   | 10        | 1     |
| 7782     | CLARK    | 2450   | 10        | 2     |
| 7934     | MILLER   | 1300   | 10        | 3     |
| 7902     | FORD     | 3000   | 20        | 1     |
| 7566     | JONES    | 2975   | 20        | 2     |
| 7788     | SCOTT    | 1500   | 20        | 3     |
| 7876     | ADAMS    | 1100   | 20        | 4     |
| 7369     | SMITH    | 800    | 20        | 5     |
| 7698     | BLAKE    | 2850   | 30        | 1     |
| 7499     | ALLEN    | 1600   | 30        | 2     |
| 7844     | TURNER   | 1500   | 30        | 3     |
| 7654     | MARTIN   | 1250   | 30        | 4     |
| 7521     | WARD     | 1250   | 30        | 4     |
| 7900     | JAMES    | 950    | 30        | 5     |
+----------+----------+--------+-----------+-------+--+
14 rows selected (24.262 seconds)

Cube分析

0: jdbc:hive2://CentOS:10000> select e.deptno,e.job,avg(e.sal) avg,max(e.sal) max,min(e.sal) min from t_employee e group by e.deptno,e.job with cube;
+-----------+------------+--------------+-------+-------+--+
| e.deptno  |   e.job    |     avg      |  max  |  min  |
+-----------+------------+--------------+-------+-------+--+
| NULL      | ANALYST    | 2250         | 3000  | 1500  |
| 10        | CLERK      | 1300         | 1300  | 1300  |
| 20        | CLERK      | 950          | 1100  | 800   |
| 30        | CLERK      | 950          | 950   | 950   |
| 20        | ANALYST    | 2250         | 3000  | 1500  |
| NULL      | PRESIDENT  | 5000         | 5000  | 5000  |
| 10        | PRESIDENT  | 5000         | 5000  | 5000  |
| NULL      | SALESMAN   | 1400         | 1600  | 1250  |
| NULL      | MANAGER    | 2758.333333  | 2975  | 2450  |
| 30        | SALESMAN   | 1400         | 1600  | 1250  |
| 10        | MANAGER    | 2450         | 2450  | 2450  |
| 20        | MANAGER    | 2975         | 2975  | 2975  |
| 30        | MANAGER    | 2850         | 2850  | 2850  |
| NULL      | NULL       | 1966.071429  | 5000  | 800   |
| NULL      | CLERK      | 1037.5       | 1300  | 800   |
| 10        | NULL       | 2916.666667  | 5000  | 1300  |
| 20        | NULL       | 1875         | 3000  | 800   |
| 30        | NULL       | 1566.666667  | 2850  | 950   |
+-----------+------------+--------------+-------+-------+--+
18 rows selected (25.037 seconds)

行轉列

1,語文,100
1,數學,100
1,英語,100
2,數學,79
2,語文,80
2,英語,100

-- Student score table stored as plain text: one record per line, three
-- comma-separated fields (id, course, score).
CREATE TABLE t_student (
    id     INT,
    course STRING,
    score  DOUBLE
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '|'
    MAP KEYS TERMINATED BY '>'
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

0: jdbc:hive2://CentOS:10000> select * from t_student;
+---------------+-------------------+------------------+--+
| t_student.id  | t_student.course  | t_student.score  |
+---------------+-------------------+------------------+--+
| 1             | 語文                | 100.0            |
| 1             | 數學                | 100.0            |
| 1             | 英語                | 100.0            |
| 2             | 數學                | 79.0             |
| 2             | 語文                | 80.0             |
| 2             | 英語                | 100.0            |
+---------------+-------------------+------------------+--+
6 rows selected (0.05 seconds)

0: jdbc:hive2://CentOS:10000> select id,max(case course when '語文' then score else 0 end) as chinese,max(case course when '數學' then score else 0 end ) as math,max(case course when '英語' then score else 0 end ) as english from t_student group by id ;

+-----+----------+--------+----------+--+
| id  | chinese  |  math  | english  |
+-----+----------+--------+----------+--+
| 1   | 100.0    | 100.0  | 100.0    |
| 2   | 80.0     | 79.0   | 100.0    |
+-----+----------+--------+----------+--+
2 rows selected (25.617 seconds)

-- Collapse each student's rows into a single comma-separated "course:score" string.
-- String literals use ASCII single quotes; the non-ASCII alias is backtick-quoted.
SELECT id,concat_ws(',', collect_set(concat(course, ':', score))) `成績` FROM t_student GROUP BY id;

Hive數據傾斜

數據傾斜是進行大數據計算時最經常遇到的問題之一。當我們在執行HiveQL或者運行MapReduce作業時候，如果遇到一直卡在map100%,reduce99%一般就是遇到了數據傾斜的問題。數據傾斜其實是進行分佈式計算的時候，某些節點的計算能力比較強或者需要計算的數據比較少，早早執行完了，某些節點計算的能力較差或者由於此節點需要計算的數據比較多，導致出現其他節點的reduce階段任務執行完成，但是這種節點的數據處理任務還沒有執行完成。

group by：我使用Hive對數據做一些類型統計的時候，遇到過某種類型的數據量特別多，而其他類型的數據量特別少的情況。當按照類型進行group by的時候，會將相同group by字段的數據拉取到同一個節點進行聚合，而當其中某一組的數據量過大時，會出現其他組的計算已經完成而這一組還沒計算完成，其他節點一直等待這個節點的任務執行完成，所以會看到一直map 100% reduce 99%的情況。

解決方法：

set hive.map.aggr=true;
set hive.groupby.skewindata=true;

原理：
hive.map.aggr=true：這個配置項代表是否在map端進行聚合。hive.groupby.skewindata=true：當選項設定爲 true 時，生成的查詢計劃會有兩個 MR Job。第一個 MR Job 中，Map 的輸出結果集合會隨機分佈到 Reduce 中，每個 Reduce 做部分聚合操作，並輸出結果，這樣處理的結果是相同的 Group By Key 有可能被分發到不同的 Reduce 中，從而達到負載均衡的目的；第二個 MR Job 再根據預處理的數據結果按照 Group By Key 分佈到 Reduce 中（這個過程可以保證相同的 Group By Key 被分佈到同一個 Reduce 中），最後完成最終的聚合操作。

Hive On Hbase

-- Hive external table mapped onto the HBase table "baizhi:t_employee" through
-- the HBase storage handler. The mapping string lists one entry per Hive column:
-- ":key" first, then seven qualifiers in column family cf1.
-- NOTE(review): presumably entries bind to Hive columns by position (":key" -> empno);
-- confirm against the Hive HBase integration documentation.
-- NOTE(review): this reuses the name t_employee from the text-file table defined
-- earlier in this document; confirm the two are created in different databases.
CREATE EXTERNAL TABLE t_employee (
    empno    INT,
    ename    STRING,
    job      STRING,
    mgr      INT,
    hiredate TIMESTAMP,
    sal      DECIMAL(7,2),
    comm     DECIMAL(7,2),
    deptno   INT
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:name,cf1:job,cf1:mgr,cf1:hiredate,cf1:sal,cf1:comm,cf1:deptno")
TBLPROPERTIES ("hbase.table.name" = "baizhi:t_employee");

需要替換hive-hbase-handler-1.2.2.jar

Hive SQL案例分析

數據籌備

SQL查詢

單表查詢

WHERE查詢

DISTINCT查詢

分區查詢

LIMIT查詢

GROUP BY查詢

ORDER AND SORT

DISTRIBUTE BY

CLUSTER BY

表Join查詢

JOIN優化

開窗函數

Cube分析

行轉列

Hive數據傾斜

Hive On Hbase

Python實現大麥網搶票的四大關鍵技術點解析

salesforce零基礎學習（一百三十八）零碎知識點小總結（十）

SpringBoot數據庫讀寫分離

#Spark流計算-章節1

Apache Flink 狀態管理教案

Apache Sqoop （最新版本）

Apache Spark-[概述和安裝]

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結