數據倉庫
OLAP //online analytical processing.
//數據量大,併發低,延遲高。
hive //基於hadoop mr,延遲高,效率低。
sql //類似sql語句。
數據庫
mysql,
OLTP //在線事務處理。acid
//事務並發現象: dirty read | unrepeatable read | phantom read
//隔離級別:read uncommitted | read committed | repeatable read | serializable
//延遲低.
hive
蜂巢(hive的本義)。
數據倉庫。
使用sql方式讀、寫、管理駐留在分佈式存儲系統上的大型數據集的數據倉庫軟件。
命令行方式和driver方式。
最初由facebook開發,
處理結構化數據。
[不是]
關係型數據庫
OLTP
實時查詢和行級更新。
[特點]
存儲schema文件在數據庫中,處理的是hdfs數據。
OLAP
提供SQL語句,稱爲HiveQL / HQL
可伸縮、可擴展
hive概念
1.user interface
shell
web ui
2.metastore
元數據庫
存儲庫、表、列、類型等信息。
存放在關係數據庫中。
3.HiveQL process engine
處理引擎,替代傳統MR方式。
4.執行引擎
處理查詢,生成MR的結果。
5.hdfs | hbase
存儲存放地。
hive與hadoop的交互流程
client同hive driver(select id,name,age from customers ;)
-->通過編譯器進行編譯
-->查詢metastore
-->執行引擎
-->hadoop mr
hive安裝
1.下載
2.tar
$>tar -xzvf apache-hive-2.1.0-bin.tar.gz -C /soft/
$>ln -s apache-hive-2.1.0-bin hive
3.環境變量
$>sudo nano /etc/profile
...
HIVE_HOME=/soft/hive
PATH=$PATH:$HIVE_HOME/bin
4.source /etc/profile
hive配置元數據到mysql中,不使用默認的derby庫。
1.複製mysql驅動到hive的lib下.
cp mysql-connector-java-5.1.17.jar /soft/hive/lib
2.創建hive配置文件
[/soft/hive/conf/hive-site.xml]
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>root</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
</property>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://192.168.231.1:3306/hive6</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<property>
<name>hive.server2.authentication</name>
<value>NONE</value>
</property>
<property>
<name>hive.server2.enable.doAs</name>
<value>false</value>
</property>
初始化hive元數據到mysql
$>bin/schematool -dbType mysql -initSchema
hive的組件
database //hdfs目錄,/user/hive/warehouse/${dbname}.db
table //hdfs目錄,/user/hive/warehouse/${dbname}.db/${table}
進入shell
//進入hive命令行
$>hive
//hive操作
$hive>create database mydb ; //建庫
$hive>use mydb ;
$hive>create table customers(id int,name string , age int) ; //建表
$hive>show databases ;
$hive>show tables ;
$hive>desc customers ;
使用hive的hiveserver2服務實現hive cs訪問
1.啓動hiveserver2
$>hive/bin/hiveserver2 &
2.查看hiveserver2是否啓動完成
netstat -anop | grep 10000
3.啓動beeline命令行
$>hive/bin/beeline
$beeline>!help
$beeline>!connect jdbc:hive2://localhost:10000/mydb ;
$beeline>show databases ;
遠程通過jdbc連接訪問hiveserver2,進行hive數據倉庫操作
1.創建模塊添加依賴
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.it18zhang</groupId>
<artifactId>my-hive-day01</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>2.1.0</version>
</dependency>
</dependencies>
</project>
2.編寫測試代碼
package com.hive.test;
import org.junit.Test;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
/**
* 測試hive
*/
public class TestHive {
@Test
public void getConn() throws Exception {
String driver= "org.apache.hive.jdbc.HiveDriver" ;
Class.forName(driver) ;
Connection conn = DriverManager.getConnection("jdbc:hive2://s100:10000/mydb") ;
Statement st = conn.createStatement();
ResultSet rs = st.executeQuery("select id,name,age from customers order by id desc");
while(rs.next()){
int id = rs.getInt(1) ;
String name = rs.getString(2) ;
int age = rs.getInt(3) ;
System.out.println(id + " ," + name + "," + age);
}
rs.close();
conn.close();
}
}
3.執行並查看web ui
mr.
hive數據類型
1.基本數據類型
TINYINT //byte 1
SMALLINT //short 2
INT //int 4
BIGINT //long 8
FLOAT //float 4
DOUBLE //double 8
DECIMAL //decimal 精度和刻度decimal(10,3)
BINARY //二進制
BOOLEAN //TRUE | FALSE
STRING //字符串
CHAR //定長 <= 255
VARCHAR //變長 <=65535.
DATE //日期 '2013-01-01'
TIMESTAMP //時間戳 '2013-01-01 12:00:01.345'
2.複雜類型
ARRAY //數組 ['apple','orange','mango']
MAP //map {1:"apple",2: "orange"}
STRUCT //結構體 {1, "apple"}
NAMED STRUCT //命名結構體{"apple":"gala","weightkg":1}
UNION //組合 {2:["apple","orange"]}
3.準備數據
Michael|Montreal,Toronto|Male,30|DB:80|Product:Developer Lead
Will|Montreal|Male,35|Perl:85|Product:Lead,Test:Lead
Shelley|New York|Female,27|Python:80|Test:Lead,COE:Architect
Lucy|Vancouver|Female,57|Sales:89,HR:94|Sales:Lead
4.創建複雜表
CREATE TABLE employee
(
name string,
arr ARRAY<string>,
struc STRUCT<sex:string,age:int>,
map1 MAP<string,int>,
map2 MAP<string,ARRAY<string>>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
5.加載本地數據到hive
$hive>load data local inpath '/home/centos/employee.txt' into table employee ;
//
$hive>select * from employee ;
//重命名
$hive>alter table employee rename to emp;
$hive>select name,arr from emp ;
//查詢數組指定元素
$hive>select name,arr[0] from emp ;
//查詢結構體
$hive>select name,struc.sex from emp ;
//查詢map指定的key
$hive>select name,map1["DB"] from emp ;
//查詢map2指定的k
$hive>select map2["Product"][0] from emp ;
6.使用hive實現word count
6.1)創建表
CREATE TABLE docs
(
line string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',' ;
6.2)加載數據到docs
load data local inpath '/home/centos/1.txt' into table docs ;
6.3)單詞統計
//單詞統計
select t.word,count(*) from (select explode(split(line," ")) word from docs) t group by t.word ;
//降序topN查詢
select t.word,count(*) cnt from (select explode(split(line," ")) word from docs) t group by t.word order by cnt desc limit 3 ;
創建表完整語法
CREATE TABLE employee
(
name string,
arr ARRAY<string>,
struc STRUCT<sex:string,age:int>,
map1 MAP<string,int>,
map2 MAP<string,ARRAY<string>>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|' //默認\001
COLLECTION ITEMS TERMINATED BY ',' //默認\002
MAP KEYS TERMINATED BY ':' //默認\003
LINES TERMINATED BY '\n' //行結束符
STORED AS TEXTFILE; //
explode
UDTF,表生成函數。
可以應用於array或者map.
類型轉換函數
cast('124' as int); //轉換成整數
concat('12','12','23','35') //字符串連接函數
DDL
create database if not exists xx ;
//創建數據庫定義位置和屬性,以及註釋。
CREATE DATABASE IF NOT EXISTS myhivebook
COMMENT 'hive database demo'
LOCATION '/hdfs/directory'
WITH DBPROPERTIES ('creator'='dayongd','date'='2015-01-01')
//查看庫信息
desc database myhivebook ;
//刪除庫
drop database myhivebook ;
load data
//1.從本地加載,複製過程
load data local inpath '/x/x/x/x/1.xt' into table docs ;
//2.從hdfs加載,移動過程
load data inpath '/x/x/x/x/1.xt' into table docs ;
內部表和外部表
1.內部表
託管表,刪除表時,數據和表結構都刪除,默認內部表。
2.外部表
刪除時,只刪除表結構。數據還在。
CTAS:Create the table as select
//攜帶數據
create table emp2 as select name from emp ;
like語句建表,只有數據結構,沒有數據
create table emp3 like emp ;
truncate
快速清空表。
truncate table emp2 ;
分區表:分區是目錄。
//創建分區表
CREATE TABLE custs
(
id int,
name string ,
age int
)
PARTITIONED BY (prov string, city string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
//添加分區
alter table custs add PARTITION (prov='hebei', city='baoding') PARTITION (prov='hebei', city='shijiazhuang');
//查看分區
SHOW PARTITIONS custs;
//刪除分區
alter table custs drop partition(prov='hebei',city='shijiazhuang') ;
//加載數據到分區
load data local inpath '/home/centos/cust.txt' into table custs partition(prov='hebei',city='baoding') ;
//按照分區查詢
select * from custs where city = 'baoding' ;
桶表:桶表是文件。
//創建桶表
CREATE TABLE buck
(
id int,
name string ,
age int
)
CLUSTERED BY (id) INTO 3 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
//桶的數量確定標準
避免桶內的數據量過大或者過小,一般以數據塊的2倍爲宜。
//設置reduce個數(與桶數一致)和強行分桶
set mapred.reduce.tasks = 2;
set hive.enforce.bucketing = true;
連接
//創建orders表。
CREATE TABLE orders
(
id int,
orderno string ,
price float,
cid int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
//笛卡爾積
select a.*,b.* from custs a , orders b ;
//內連接
select a.*,b.* from custs a inner join orders b on a.id = b.cid ;
//左外連接
select a.*,b.* from custs a left outer join orders b on a.id = b.cid ;
//半連接(left semi join),只返回左表中在右表有匹配的行
select a.* from custs a left semi join orders b on a.id = b.cid ;
//右外
select a.*,b.* from custs a right outer join orders b on a.id = b.cid ;
//全外
select a.*,b.* from custs a full outer join orders b on a.id = b.cid ;
Map端連接
//連接暗示/*+ MAPJOIN(employee) */
SELECT /*+ MAPJOIN(employee) */ c.* FROM custs c CROSS JOIN orders o WHERE c.id <> o.cid;
//通過設置自動map端連接轉換,實現map連接
set hive.auto.convert.join=true
SELECT c.* FROM custs c CROSS JOIN orders o WHERE c.id <> o.cid;
union查詢
//union all(不去重;不帶all的union會去重)
select id,name from custs union all select id,orderno from orders ;
//去重
select distinct cid from orders ;