數據籌備
7369,SMITH,CLERK,7902,1980-12-17 00:00:00,800,\N,20
7499,ALLEN,SALESMAN,7698,1981-02-20 00:00:00,1600,300,30
7521,WARD,SALESMAN,7698,1981-02-22 00:00:00,1250,500,30
7566,JONES,MANAGER,7839,1981-04-02 00:00:00,2975,\N,20
7654,MARTIN,SALESMAN,7698,1981-09-28 00:00:00,1250,1400,30
7698,BLAKE,MANAGER,7839,1981-05-01 00:00:00,2850,\N,30
7782,CLARK,MANAGER,7839,1981-06-09 00:00:00,2450,\N,10
7788,SCOTT,ANALYST,7566,1987-04-19 00:00:00,1500,\N,20
7839,KING,PRESIDENT,\N,1981-11-17 00:00:00,5000,\N,10
7844,TURNER,SALESMAN,7698,1981-09-08 00:00:00,1500,0,30
7876,ADAMS,CLERK,7788,1987-05-23 00:00:00,1100,\N,20
7900,JAMES,CLERK,7698,1981-12-03 00:00:00,950,\N,30
7902,FORD,ANALYST,7566,1981-12-03 00:00:00,3000,\N,20
7934,MILLER,CLERK,7782,1982-01-23 00:00:00,1300,\N,10
-- Employee table stored as comma-delimited text; the eight columns follow the
-- field order of the sample rows listed above.
-- The collection-item and map-key delimiters are declared although every
-- column here is a scalar type.
CREATE TABLE t_employee (
    empno    INT,
    ename    STRING,
    job      STRING,
    mgr      INT,
    hiredate TIMESTAMP,
    sal      DECIMAL(7,2),
    comm     DECIMAL(7,2),
    deptno   INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '|'
    MAP KEYS TERMINATED BY '>'
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
10,ACCOUNTING,NEW YORK
20,RESEARCH,DALLAS
30,SALES,CHICAGO
40,OPERATIONS,BOSTON
-- Department table stored as comma-delimited text; the three columns follow
-- the field order of the sample rows listed above.
CREATE TABLE t_dept (
    DEPTNO INT,
    DNAME  STRING,
    LOC    STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '|'
    MAP KEYS TERMINATED BY '>'
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
0: jdbc:hive2://CentOS:10000> select empno,ename,job,mgr,hiredate,sal,comm,deptno from t_employee;
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| empno | ename | job | mgr | hiredate | sal | comm | deptno |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| 7369 | SMITH | CLERK | 7902 | 1980-12-17 00:00:00.0 | 800 | NULL | 20 |
| 7499 | ALLEN | SALESMAN | 7698 | 1981-02-20 00:00:00.0 | 1600 | 300 | 30 |
| 7521 | WARD | SALESMAN | 7698 | 1981-02-22 00:00:00.0 | 1250 | 500 | 30 |
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 00:00:00.0 | 2975 | NULL | 20 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-09-28 00:00:00.0 | 1250 | 1400 | 30 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 00:00:00.0 | 2850 | NULL | 30 |
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 00:00:00.0 | 2450 | NULL | 10 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-04-19 00:00:00.0 | 1500 | NULL | 20 |
| 7839 | KING | PRESIDENT | NULL | 1981-11-17 00:00:00.0 | 5000 | NULL | 10 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-09-08 00:00:00.0 | 1500 | 0 | 30 |
| 7876 | ADAMS | CLERK | 7788 | 1987-05-23 00:00:00.0 | 1100 | NULL | 20 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-03 00:00:00.0 | 950 | NULL | 30 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 00:00:00.0 | 3000 | NULL | 20 |
| 7934 | MILLER | CLERK | 7782 | 1982-01-23 00:00:00.0 | 1300 | NULL | 10 |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
14 rows selected (0.047 seconds)
0: jdbc:hive2://CentOS:10000> select deptno,dname,loc from t_dept;
+---------+-------------+-----------+--+
| deptno | dname | loc |
+---------+-------------+-----------+--+
| 10 | ACCOUNTING | NEW YORK |
| 20 | RESEARCH | DALLAS |
| 30 | SALES | CHICAGO |
| 40 | OPERATIONS | BOSTON |
+---------+-------------+-----------+--+
4 rows selected (0.046 seconds)
-- Partitioned variant of the employee table: deptno is declared as the
-- partition column instead of a regular column, so it does not appear in the
-- column list.
CREATE TABLE t_employee_partition (
    empno    INT,
    ename    STRING,
    job      STRING,
    mgr      INT,
    hiredate TIMESTAMP,
    sal      DECIMAL(7,2),
    comm     DECIMAL(7,2)
)
PARTITIONED BY (deptno INT)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '|'
    MAP KEYS TERMINATED BY '>'
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
0: jdbc:hive2://CentOS:10000> set hive.exec.dynamic.partition.mode=nonstrict;
0: jdbc:hive2://CentOS:10000> INSERT OVERWRITE TABLE t_employee_partition PARTITION (deptno) SELECT empno,ename,job,mgr,hiredate,sal,comm,deptno FROM t_employee;
SQL查詢
單表查詢
0: jdbc:hive2://CentOS:10000> select empno,ename,job,mgr,hiredate,sal,comm,deptno from t_employee;
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| empno | ename | job | mgr | hiredate | sal | comm | deptno |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| 7369 | SMITH | CLERK | 7902 | 1980-12-17 00:00:00.0 | 800 | NULL | 20 |
| 7499 | ALLEN | SALESMAN | 7698 | 1981-02-20 00:00:00.0 | 1600 | 300 | 30 |
| 7521 | WARD | SALESMAN | 7698 | 1981-02-22 00:00:00.0 | 1250 | 500 | 30 |
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 00:00:00.0 | 2975 | NULL | 20 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-09-28 00:00:00.0 | 1250 | 1400 | 30 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 00:00:00.0 | 2850 | NULL | 30 |
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 00:00:00.0 | 2450 | NULL | 10 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-04-19 00:00:00.0 | 1500 | NULL | 20 |
| 7839 | KING | PRESIDENT | NULL | 1981-11-17 00:00:00.0 | 5000 | NULL | 10 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-09-08 00:00:00.0 | 1500 | 0 | 30 |
| 7876 | ADAMS | CLERK | 7788 | 1987-05-23 00:00:00.0 | 1100 | NULL | 20 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-03 00:00:00.0 | 950 | NULL | 30 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 00:00:00.0 | 3000 | NULL | 20 |
| 7934 | MILLER | CLERK | 7782 | 1982-01-23 00:00:00.0 | 1300 | NULL | 10 |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
14 rows selected (0.056 seconds)
WHERE查詢
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,job,mgr,hiredate,sal,comm,deptno FROM t_employee WHERE empno > 7782 AND deptno = 10;
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| empno | ename | job | mgr | hiredate | sal | comm | deptno |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
| 7839 | KING | PRESIDENT | NULL | 1981-11-17 00:00:00.0 | 5000 | NULL | 10 |
| 7934 | MILLER | CLERK | 7782 | 1982-01-23 00:00:00.0 | 1300 | NULL | 10 |
+--------+---------+------------+-------+------------------------+-------+-------+---------+--+
2 rows selected (0.067 seconds)
DISTINCT查詢
0: jdbc:hive2://CentOS:10000> select distinct(job) from t_employee;
+------------+--+
| job |
+------------+--+
| ANALYST |
| CLERK |
| MANAGER |
| PRESIDENT |
| SALESMAN |
+------------+--+
分區查詢
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,job,mgr,hiredate,sal,comm,deptno FROM t_employee_partition e WHERE e.deptno >= 20 AND e.deptno <= 40;
+--------+---------+-----------+-------+------------------------+-------+-------+---------+--+
| empno | ename | job | mgr | hiredate | sal | comm | deptno |
+--------+---------+-----------+-------+------------------------+-------+-------+---------+--+
| 7369 | SMITH | CLERK | 7902 | 1980-12-17 00:00:00.0 | 800 | NULL | 20 |
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 00:00:00.0 | 2975 | NULL | 20 |
| 7788 | SCOTT | ANALYST | 7566 | 1987-04-19 00:00:00.0 | 1500 | NULL | 20 |
| 7876 | ADAMS | CLERK | 7788 | 1987-05-23 00:00:00.0 | 1100 | NULL | 20 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 00:00:00.0 | 3000 | NULL | 20 |
| 7499 | ALLEN | SALESMAN | 7698 | 1981-02-20 00:00:00.0 | 1600 | 300 | 30 |
| 7521 | WARD | SALESMAN | 7698 | 1981-02-22 00:00:00.0 | 1250 | 500 | 30 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-09-28 00:00:00.0 | 1250 | 1400 | 30 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 00:00:00.0 | 2850 | NULL | 30 |
| 7844 | TURNER | SALESMAN | 7698 | 1981-09-08 00:00:00.0 | 1500 | 0 | 30 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-03 00:00:00.0 | 950 | NULL | 30 |
+--------+---------+-----------+-------+------------------------+-------+-------+---------+--+
11 rows selected (0.123 seconds)
LIMIT查詢
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,job,mgr,hiredate,sal,comm,deptno FROM t_employee ORDER BY sal DESC LIMIT 5;
+--------+--------+------------+-------+------------------------+-------+-------+---------+--+
| empno | ename | job | mgr | hiredate | sal | comm | deptno |
+--------+--------+------------+-------+------------------------+-------+-------+---------+--+
| 7839 | KING | PRESIDENT | NULL | 1981-11-17 00:00:00.0 | 5000 | NULL | 10 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-03 00:00:00.0 | 3000 | NULL | 20 |
| 7566 | JONES | MANAGER | 7839 | 1981-04-02 00:00:00.0 | 2975 | NULL | 20 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-05-01 00:00:00.0 | 2850 | NULL | 30 |
| 7782 | CLARK | MANAGER | 7839 | 1981-06-09 00:00:00.0 | 2450 | NULL | 10 |
+--------+--------+------------+-------+------------------------+-------+-------+---------+--+
5 rows selected (14.294 seconds)
GROUP BY查詢
0: jdbc:hive2://CentOS:10000> set hive.map.aggr=true;
0: jdbc:hive2://CentOS:10000> SELECT deptno,SUM(sal) as total FROM t_employee GROUP BY deptno;
+---------+--------+--+
| deptno | total |
+---------+--------+--+
| 10 | 8750 |
| 20 | 9375 |
| 30 | 9400 |
+---------+--------+--+
3 rows selected (12.645 seconds)
hive.map.aggr
控制程序如何進行聚合。默認值爲true(Hive 0.3及之後的版本)。如果設置爲true,Hive會在map階段就執行一次聚合。這可以提高聚合效率,但需要消耗更多內存。
ORDER AND SORT
可以使用ORDER BY或者SORT BY對查詢結果進行排序,排序字段可以是整型也可以是字符串:如果是整型,則按照大小排序;如果是字符串,則按照字典序排序。ORDER BY 和 SORT BY 的區別如下:使用ORDER BY時會有一個Reducer對全部查詢結果進行排序,可以保證數據的全局有序性;使用SORT BY時只會在每個Reducer中進行排序,這可以保證每個Reducer的輸出數據是有序的,但不能保證全局有序。由於ORDER BY的時間可能很長,如果你設置了嚴格模式(hive.mapred.mode = strict),則其後面必須再跟一個LIMIT子句。
- sort by
0: jdbc:hive2://CentOS:10000> set mapreduce.job.reduces=2;
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal from t_employee sort by sal desc;
+--------+---------+-------+--+
| empno | ename | sal |
+--------+---------+-------+--+
| 7902 | FORD | 3000 |
| 7566 | JONES | 2975 |
| 7844 | TURNER | 1500 |
| 7788 | SCOTT | 1500 |
| 7521 | WARD | 1250 |
| 7654 | MARTIN | 1250 |
| 7876 | ADAMS | 1100 |
| 7900 | JAMES | 950 |
| 7369 | SMITH | 800 |
| 7839 | KING | 5000 |
| 7698 | BLAKE | 2850 |
| 7782 | CLARK | 2450 |
| 7499 | ALLEN | 1600 |
| 7934 | MILLER | 1300 |
+--------+---------+-------+--+
14 rows selected (14.474 seconds)
- order by
0: jdbc:hive2://CentOS:10000> set mapreduce.job.reduces=3;
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal from t_employee order by sal desc;
+--------+---------+-------+--+
| empno | ename | sal |
+--------+---------+-------+--+
| 7839 | KING | 5000 |
| 7902 | FORD | 3000 |
| 7566 | JONES | 2975 |
| 7698 | BLAKE | 2850 |
| 7782 | CLARK | 2450 |
| 7499 | ALLEN | 1600 |
| 7844 | TURNER | 1500 |
| 7788 | SCOTT | 1500 |
| 7934 | MILLER | 1300 |
| 7654 | MARTIN | 1250 |
| 7521 | WARD | 1250 |
| 7876 | ADAMS | 1100 |
| 7900 | JAMES | 950 |
| 7369 | SMITH | 800 |
+--------+---------+-------+--+
14 rows selected (13.049 seconds)
0: jdbc:hive2://CentOS:10000> set hive.mapred.mode = strict;
No rows affected (0.004 seconds)
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal from t_employee order by sal desc;
Error: Error while compiling statement: FAILED: SemanticException 1:48 In strict mode, if ORDER BY is specified, LIMIT must also be specified. Error encountered near token 'sal' (state=42000,code=40000)
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal from t_employee order by sal desc limit 5;
+--------+--------+-------+--+
| empno | ename | sal |
+--------+--------+-------+--+
| 7839 | KING | 5000 |
| 7902 | FORD | 3000 |
| 7566 | JONES | 2975 |
| 7698 | BLAKE | 2850 |
| 7782 | CLARK | 2450 |
+--------+--------+-------+--+
5 rows selected (12.468 seconds)
HAVING過濾
0: jdbc:hive2://CentOS:10000> SELECT deptno,SUM(sal) total FROM t_employee GROUP BY deptno HAVING SUM(sal)>9000;
+---------+--------+--+
| deptno | total |
+---------+--------+--+
| 30 | 9400 |
| 20 | 9375 |
+---------+--------+--+
2 rows selected (18.361 seconds)
DISTRIBUTE BY
默認情況下,MapReduce程序會對Map輸出結果的Key值進行散列,並均勻分發到所有Reducer上。如果想要把具有相同Key值的數據分發到同一個Reducer進行處理,這就需要使用DISTRIBUTE BY子句。需要注意的是,DISTRIBUTE BY雖然能保證具有相同Key值的數據分發到同一個Reducer,但是不能保證數據在Reducer上是有序的。
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal, deptno FROM t_employee distribute BY deptno;
+--------+---------+-------+---------+--+
| empno | ename | sal | deptno |
+--------+---------+-------+---------+--+
| 7654 | MARTIN | 1250 | 30 |
| 7900 | JAMES | 950 | 30 |
| 7698 | BLAKE | 2850 | 30 |
| 7521 | WARD | 1250 | 30 |
| 7844 | TURNER | 1500 | 30 |
| 7499 | ALLEN | 1600 | 30 |
| 7934 | MILLER | 1300 | 10 |
| 7839 | KING | 5000 | 10 |
| 7782 | CLARK | 2450 | 10 |
| 7788 | SCOTT | 1500 | 20 |
| 7566 | JONES | 2975 | 20 |
| 7876 | ADAMS | 1100 | 20 |
| 7902 | FORD | 3000 | 20 |
| 7369 | SMITH | 800 | 20 |
+--------+---------+-------+---------+--+
14 rows selected (15.504 seconds)
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal, deptno FROM t_employee distribute BY deptno sort by sal desc;
+--------+---------+-------+---------+--+
| empno | ename | sal | deptno |
+--------+---------+-------+---------+--+
| 7698 | BLAKE | 2850 | 30 |
| 7499 | ALLEN | 1600 | 30 |
| 7844 | TURNER | 1500 | 30 |
| 7521 | WARD | 1250 | 30 |
| 7654 | MARTIN | 1250 | 30 |
| 7900 | JAMES | 950 | 30 |
| 7839 | KING | 5000 | 10 |
| 7782 | CLARK | 2450 | 10 |
| 7934 | MILLER | 1300 | 10 |
| 7902 | FORD | 3000 | 20 |
| 7566 | JONES | 2975 | 20 |
| 7788 | SCOTT | 1500 | 20 |
| 7876 | ADAMS | 1100 | 20 |
| 7369 | SMITH | 800 | 20 |
+--------+---------+-------+---------+--+
14 rows selected (16.528 seconds)
CLUSTER BY
如果SORT BY和DISTRIBUTE BY指定的是相同字段,且SORT BY排序規則是ASC,此時可以使用CLUSTER BY進行替換。
0: jdbc:hive2://CentOS:10000> SELECT empno,ename,sal, deptno FROM t_employee cluster by deptno;
+--------+---------+-------+---------+--+
| empno | ename | sal | deptno |
+--------+---------+-------+---------+--+
| 7934 | MILLER | 1300 | 10 |
| 7839 | KING | 5000 | 10 |
| 7782 | CLARK | 2450 | 10 |
| 7876 | ADAMS | 1100 | 20 |
| 7788 | SCOTT | 1500 | 20 |
| 7369 | SMITH | 800 | 20 |
| 7566 | JONES | 2975 | 20 |
| 7902 | FORD | 3000 | 20 |
| 7844 | TURNER | 1500 | 30 |
| 7499 | ALLEN | 1600 | 30 |
| 7698 | BLAKE | 2850 | 30 |
| 7654 | MARTIN | 1250 | 30 |
| 7521 | WARD | 1250 | 30 |
| 7900 | JAMES | 950 | 30 |
+--------+---------+-------+---------+--+
14 rows selected (25.847 seconds)
表Join查詢
Hive支持內連接,左外連接,右外連接,全外連接,笛卡爾連接,這和傳統數據庫中的概念是一致的。需要特別強調:JOIN語句的關聯條件必須用ON指定,不能用WHERE指定,否則就會先做笛卡爾積,再過濾,這會導致你得不到預期的結果。
- 內連接
0: jdbc:hive2://CentOS:10000> SELECT e.empno,e.ename,e.sal,d.dname,d.deptno FROM t_employee e JOIN t_dept d ON e.deptno = d.deptno WHERE e.empno=7369;
+----------+----------+--------+-----------+-----------+--+
| e.empno | e.ename | e.sal | d.dname | d.deptno |
+----------+----------+--------+-----------+-----------+--+
| 7369 | SMITH | 800 | RESEARCH | 20 |
+----------+----------+--------+-----------+-----------+--+
1 row selected (10.419 seconds)
- 外連接
0: jdbc:hive2://CentOS:10000> SELECT e.empno,e.ename,e.sal,d.dname,d.deptno FROM t_employee e LEFT OUTER JOIN t_dept d ON e.deptno = d.deptno;
+----------+----------+--------+-------------+-----------+--+
| e.empno | e.ename | e.sal | d.dname | d.deptno |
+----------+----------+--------+-------------+-----------+--+
| 7369 | SMITH | 800 | RESEARCH | 20 |
| 7499 | ALLEN | 1600 | SALES | 30 |
| 7521 | WARD | 1250 | SALES | 30 |
| 7566 | JONES | 2975 | RESEARCH | 20 |
| 7654 | MARTIN | 1250 | SALES | 30 |
| 7698 | BLAKE | 2850 | SALES | 30 |
| 7782 | CLARK | 2450 | ACCOUNTING | 10 |
| 7788 | SCOTT | 1500 | RESEARCH | 20 |
| 7839 | KING | 5000 | ACCOUNTING | 10 |
| 7844 | TURNER | 1500 | SALES | 30 |
| 7876 | ADAMS | 1100 | RESEARCH | 20 |
| 7900 | JAMES | 950 | SALES | 30 |
| 7902 | FORD | 3000 | RESEARCH | 20 |
| 7934 | MILLER | 1300 | ACCOUNTING | 10 |
+----------+----------+--------+-------------+-----------+--+
14 rows selected (11.424 seconds)
0: jdbc:hive2://CentOS:10000> SELECT e.empno,e.ename,e.sal,d.dname,d.deptno FROM t_employee e RIGHT OUTER JOIN t_dept d ON e.deptno = d.deptno;
+----------+----------+--------+-------------+-----------+--+
| e.empno | e.ename | e.sal | d.dname | d.deptno |
+----------+----------+--------+-------------+-----------+--+
| 7782 | CLARK | 2450 | ACCOUNTING | 10 |
| 7839 | KING | 5000 | ACCOUNTING | 10 |
| 7934 | MILLER | 1300 | ACCOUNTING | 10 |
| 7369 | SMITH | 800 | RESEARCH | 20 |
| 7566 | JONES | 2975 | RESEARCH | 20 |
| 7788 | SCOTT | 1500 | RESEARCH | 20 |
| 7876 | ADAMS | 1100 | RESEARCH | 20 |
| 7902 | FORD | 3000 | RESEARCH | 20 |
| 7499 | ALLEN | 1600 | SALES | 30 |
| 7521 | WARD | 1250 | SALES | 30 |
| 7654 | MARTIN | 1250 | SALES | 30 |
| 7698 | BLAKE | 2850 | SALES | 30 |
| 7844 | TURNER | 1500 | SALES | 30 |
| 7900 | JAMES | 950 | SALES | 30 |
| NULL | NULL | NULL | OPERATIONS | 40 |
+----------+----------+--------+-------------+-----------+--+
15 rows selected (11.063 seconds)
0: jdbc:hive2://CentOS:10000> SELECT e.empno,e.ename,e.sal,d.dname,d.deptno FROM t_employee e FULL OUTER JOIN t_dept d ON e.deptno = d.deptno;
+----------+----------+--------+-------------+-----------+--+
| e.empno | e.ename | e.sal | d.dname | d.deptno |
+----------+----------+--------+-------------+-----------+--+
| 7934 | MILLER | 1300 | ACCOUNTING | 10 |
| 7839 | KING | 5000 | ACCOUNTING | 10 |
| 7782 | CLARK | 2450 | ACCOUNTING | 10 |
| 7876 | ADAMS | 1100 | RESEARCH | 20 |
| 7788 | SCOTT | 1500 | RESEARCH | 20 |
| 7369 | SMITH | 800 | RESEARCH | 20 |
| 7566 | JONES | 2975 | RESEARCH | 20 |
| 7902 | FORD | 3000 | RESEARCH | 20 |
| 7844 | TURNER | 1500 | SALES | 30 |
| 7499 | ALLEN | 1600 | SALES | 30 |
| 7698 | BLAKE | 2850 | SALES | 30 |
| 7654 | MARTIN | 1250 | SALES | 30 |
| 7521 | WARD | 1250 | SALES | 30 |
| 7900 | JAMES | 950 | SALES | 30 |
| NULL | NULL | NULL | OPERATIONS | 40 |
+----------+----------+--------+-------------+-----------+--+
15 rows selected (24.703 seconds)
LEFT SEMI JOIN
LEFT SEMI JOIN (左半連接)是 IN/EXISTS 子查詢的一種更高效的實現。
- JOIN 子句中右邊的表只能在 ON 子句中設置過濾條件;
- 查詢結果只包含左邊表的數據,所以只能SELECT左表中的列。
0: jdbc:hive2://CentOS:10000> SELECT e.empno,e.ename,e.deptno FROM t_employee e LEFT SEMI JOIN t_dept d ON e.deptno = d.deptno AND d.loc="NEW YORK";
+----------+----------+-----------+--+
| e.empno | e.ename | e.deptno |
+----------+----------+-----------+--+
| 7782 | CLARK | 10 |
| 7839 | KING | 10 |
| 7934 | MILLER | 10 |
+----------+----------+-----------+--+
3 rows selected (10.119 seconds)
JOIN優化
- STREAMTABLE
在多表進行join的時候,如果每個ON子句都使用到共同的列,此時Hive會進行優化,將多表JOIN在同一個map / reduce作業上進行。同時假定查詢的最後一個表是最大的一個表,在對每行記錄進行JOIN操作時,它將嘗試將其他的表緩存起來,然後掃描最後那個表進行計算。因此用戶需要保證查詢的表的大小從左到右是依次增加的。
SELECT a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key) JOIN c ON (c.key = b.key);
然而用戶並非需要總是把最大的表放在查詢語句的最後面,Hive提供了 /*+ STREAMTABLE() */ 標識,使用該標識來指出大表,能避免數據表過大導致佔用內存過多而產生的問題。示例如下:
0: jdbc:hive2://CentOS:10000> SELECT /*+ STREAMTABLE(e) */ e.empno,e.ename,d.dname,d.deptno FROM t_employee e JOIN t_dept d ON e.deptno = d.deptno WHERE job='CLERK';
+----------+----------+-------------+-----------+--+
| e.empno | e.ename | d.dname | d.deptno |
+----------+----------+-------------+-----------+--+
| 7369 | SMITH | RESEARCH | 20 |
| 7876 | ADAMS | RESEARCH | 20 |
| 7900 | JAMES | SALES | 30 |
| 7934 | MILLER | ACCOUNTING | 10 |
+----------+----------+-------------+-----------+--+
4 rows selected (11.645 seconds)
- MAPJOIN
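如果在進行join操作時,有一個表很小,則可以將join操作調整到map階段執行。這就是典型的極大表和極小表關聯問題。有兩種解決方式:1.增加 /*+ MAPJOIN(b) */ 標示;2.設置參數 hive.auto.convert.join = true,由Hive自動將小表的join轉換爲map join(如果關聯的表都已按關聯鍵分桶,還可以再設置 hive.optimize.bucketmapjoin = true 啓用bucket map join)。示例如下: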
0: jdbc:hive2://CentOS:10000> SELECT /*+ MAPJOIN(d) */ e.empno, e.ename,d.dname FROM t_employee e JOIN t_dept d ON d.deptno = e.deptno;
+----------+----------+-------------+--+
| e.empno | e.ename | d.dname |
+----------+----------+-------------+--+
| 7369 | SMITH | RESEARCH |
| 7499 | ALLEN | SALES |
| 7521 | WARD | SALES |
| 7566 | JONES | RESEARCH |
| 7654 | MARTIN | SALES |
| 7698 | BLAKE | SALES |
| 7782 | CLARK | ACCOUNTING |
| 7788 | SCOTT | RESEARCH |
| 7839 | KING | ACCOUNTING |
| 7844 | TURNER | SALES |
| 7876 | ADAMS | RESEARCH |
| 7900 | JAMES | SALES |
| 7902 | FORD | RESEARCH |
| 7934 | MILLER | ACCOUNTING |
+----------+----------+-------------+--+
14 rows selected (11.416 seconds)
開窗函數
0: jdbc:hive2://CentOS:10000> select e.empno ,e.ename,e.sal,e.deptno,rank() over(partition by e.deptno order by e.sal desc) as rank from t_employee e;
+----------+----------+--------+-----------+-------+--+
| e.empno | e.ename | e.sal | e.deptno | rank |
+----------+----------+--------+-----------+-------+--+
| 7839 | KING | 5000 | 10 | 1 |
| 7782 | CLARK | 2450 | 10 | 2 |
| 7934 | MILLER | 1300 | 10 | 3 |
| 7902 | FORD | 3000 | 20 | 1 |
| 7566 | JONES | 2975 | 20 | 2 |
| 7788 | SCOTT | 1500 | 20 | 3 |
| 7876 | ADAMS | 1100 | 20 | 4 |
| 7369 | SMITH | 800 | 20 | 5 |
| 7698 | BLAKE | 2850 | 30 | 1 |
| 7499 | ALLEN | 1600 | 30 | 2 |
| 7844 | TURNER | 1500 | 30 | 3 |
| 7654 | MARTIN | 1250 | 30 | 4 |
| 7521 | WARD | 1250 | 30 | 4 |
| 7900 | JAMES | 950 | 30 | 6 |
+----------+----------+--------+-----------+-------+--+
0: jdbc:hive2://CentOS:10000> select e.empno ,e.ename,e.sal,e.deptno,dense_rank() over(partition by e.deptno order by e.sal desc) as rank from t_employee e;
+----------+----------+--------+-----------+-------+--+
| e.empno | e.ename | e.sal | e.deptno | rank |
+----------+----------+--------+-----------+-------+--+
| 7839 | KING | 5000 | 10 | 1 |
| 7782 | CLARK | 2450 | 10 | 2 |
| 7934 | MILLER | 1300 | 10 | 3 |
| 7902 | FORD | 3000 | 20 | 1 |
| 7566 | JONES | 2975 | 20 | 2 |
| 7788 | SCOTT | 1500 | 20 | 3 |
| 7876 | ADAMS | 1100 | 20 | 4 |
| 7369 | SMITH | 800 | 20 | 5 |
| 7698 | BLAKE | 2850 | 30 | 1 |
| 7499 | ALLEN | 1600 | 30 | 2 |
| 7844 | TURNER | 1500 | 30 | 3 |
| 7654 | MARTIN | 1250 | 30 | 4 |
| 7521 | WARD | 1250 | 30 | 4 |
| 7900 | JAMES | 950 | 30 | 5 |
+----------+----------+--------+-----------+-------+--+
14 rows selected (24.262 seconds)
Cube分析
0: jdbc:hive2://CentOS:10000> select e.deptno,e.job,avg(e.sal) avg,max(e.sal) max,min(e.sal) min from t_employee e group by e.deptno,e.job with cube;
+-----------+------------+--------------+-------+-------+--+
| e.deptno | e.job | avg | max | min |
+-----------+------------+--------------+-------+-------+--+
| NULL | ANALYST | 2250 | 3000 | 1500 |
| 10 | CLERK | 1300 | 1300 | 1300 |
| 20 | CLERK | 950 | 1100 | 800 |
| 30 | CLERK | 950 | 950 | 950 |
| 20 | ANALYST | 2250 | 3000 | 1500 |
| NULL | PRESIDENT | 5000 | 5000 | 5000 |
| 10 | PRESIDENT | 5000 | 5000 | 5000 |
| NULL | SALESMAN | 1400 | 1600 | 1250 |
| NULL | MANAGER | 2758.333333 | 2975 | 2450 |
| 30 | SALESMAN | 1400 | 1600 | 1250 |
| 10 | MANAGER | 2450 | 2450 | 2450 |
| 20 | MANAGER | 2975 | 2975 | 2975 |
| 30 | MANAGER | 2850 | 2850 | 2850 |
| NULL | NULL | 1966.071429 | 5000 | 800 |
| NULL | CLERK | 1037.5 | 1300 | 800 |
| 10 | NULL | 2916.666667 | 5000 | 1300 |
| 20 | NULL | 1875 | 3000 | 800 |
| 30 | NULL | 1566.666667 | 2850 | 950 |
+-----------+------------+--------------+-------+-------+--+
18 rows selected (25.037 seconds)
行轉列
1,語文,100
1,數學,100
1,英語,100
2,數學,79
2,語文,80
2,英語,100
-- Score table used by the row-to-column example: one row per (id, course)
-- pair, loaded from the comma-delimited sample rows listed above.
CREATE TABLE t_student (
    id     INT,
    course STRING,
    score  DOUBLE
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '|'
    MAP KEYS TERMINATED BY '>'
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
0: jdbc:hive2://CentOS:10000> select * from t_student;
+---------------+-------------------+------------------+--+
| t_student.id | t_student.course | t_student.score |
+---------------+-------------------+------------------+--+
| 1 | 語文 | 100.0 |
| 1 | 數學 | 100.0 |
| 1 | 英語 | 100.0 |
| 2 | 數學 | 79.0 |
| 2 | 語文 | 80.0 |
| 2 | 英語 | 100.0 |
+---------------+-------------------+------------------+--+
6 rows selected (0.05 seconds)
0: jdbc:hive2://CentOS:10000> select id,max(case course when '語文' then score else 0 end) as chinese,max(case course when '數學' then score else 0 end ) as math,max(case course when '英語' then score else 0 end ) as english from t_student group by id ;
+-----+----------+--------+----------+--+
| id | chinese | math | english |
+-----+----------+--------+----------+--+
| 1 | 100.0 | 100.0 | 100.0 |
| 2 | 80.0 | 79.0 | 100.0 |
+-----+----------+--------+----------+--+
2 rows selected (25.617 seconds)
SELECT id,concat_ws(',', collect_set(concat(course, ':', score))) AS `成績` FROM t_student GROUP BY id;
Hive數據傾斜
數據傾斜是進行大數據計算時最經常遇到的問題之一。當我們在執行HiveQL或者運行MapReduce作業時候,如果遇到一直卡在map100%,reduce99%一般就是遇到了數據傾斜的問題。數據傾斜其實是進行分佈式計算的時候,某些節點的計算能力比較強或者需要計算的數據比較少,早早執行完了,某些節點計算的能力較差或者由於此節點需要計算的數據比較多,導致出現其他節點的reduce階段任務執行完成,但是這種節點的數據處理任務還沒有執行完成。
group by,我使用Hive對數據做一些類型統計的時候遇到過某種類型的數據量特別多,而其他類型數據的數據量特別少。當按照類型進行group by的時候,會將相同的group by字段的reduce任務需要的數據拉取到同一個節點進行聚合,而當其中某一組的數據量過大時,會出現其他組的計算已經完成而這裏還沒計算完成,其他節點一直等待這個節點的任務執行完成,所以會看到一直map 100% reduce 99%的情況。
解決方法:
set hive.map.aggr=true;
set hive.groupby.skewindata=true;
原理:
hive.map.aggr=true 這個配置項代表是否在map端進行聚合。hive.groupby.skewindata=true 當選項設定爲 true,生成的查詢計劃會有兩個 MR Job。第一個 MR Job 中,Map 的輸出結果集合會隨機分佈到 Reduce 中,每個 Reduce 做部分聚合操作,並輸出結果,這樣處理的結果是相同的 Group By Key 有可能被分發到不同的 Reduce 中,從而達到負載均衡的目的;第二個 MR Job 再根據預處理的數據結果按照 Group By Key 分佈到 Reduce 中(這個過程可以保證相同的 Group By Key 被分佈到同一個 Reduce 中),最後完成最終的聚合操作。
Hive On Hbase
-- External Hive table handled by the HBase storage handler and bound to the
-- HBase table "baizhi:t_employee".
-- "hbase.columns.mapping" carries eight entries for the eight Hive columns:
-- ":key" followed by seven qualifiers in column family cf1.
-- Presumably the entries pair with the columns by position, making empno the
-- HBase row key; confirm against the Hive HBase integration documentation.
-- NOTE(review): this table shares its name with the text-file table created
-- earlier in this document; confirm they are meant to live in different
-- databases.
CREATE EXTERNAL TABLE t_employee (
    empno    INT,
    ename    STRING,
    job      STRING,
    mgr      INT,
    hiredate TIMESTAMP,
    sal      DECIMAL(7,2),
    comm     DECIMAL(7,2),
    deptno   INT
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
    "hbase.columns.mapping" = ":key,cf1:name,cf1:job,cf1:mgr,cf1:hiredate,cf1:sal,cf1:comm,cf1:deptno"
)
TBLPROPERTIES (
    "hbase.table.name" = "baizhi:t_employee"
);
需要替換hive-hbase-handler-1.2.2.jar