Hive學習筆記
筆記內容主要來自Hive編程指南
HiveQL:視圖
Hive視圖是一個邏輯結構:它不像表那樣實際存儲數據,只保存查詢定義。本筆記所依據的Hive版本暫不支持物化視圖(Hive 3.0起已支持)。
使用視圖來降低查詢複雜度
-- A Hive query with a nested subquery, written in Hive's FROM-first form.
FROM (
    SELECT *
    FROM people
    JOIN cart
        ON (cart.people_id = people.id)
    WHERE firstname = 'john'
) a
SELECT a.lastname
WHERE a.id = 3;
-- The nested subquery from the previous example, captured as a view.
CREATE VIEW shorter_join AS
    SELECT *
    FROM people
    JOIN cart
        ON (cart.people_id = people.id)
    WHERE firstname = 'john';
-- The view can now be queried just like a table.
SELECT lastname
FROM shorter_join
WHERE id = 3;
使用視圖來限制基於條件過濾的數據
-- Views can be used to restrict data access so that sensitive information
-- cannot be queried freely. This is the underlying table holding that data.
CREATE TABLE userinfo (
    firstname STRING,
    lastname  STRING,
    ssn       STRING,
    password  STRING
);
-- The view exposes only the name columns; ssn and password are left out.
CREATE VIEW safer_user_info AS
    SELECT
        firstname,
        lastname
    FROM userinfo;
動態分區中的視圖和 map 類型
示例數據
-- External text-file table whose single column is a string-to-string map.
-- Delimiters: '\004' between fields, '\001' between map entries,
-- '\002' between a map key and its value.
CREATE EXTERNAL TABLE dynamictable (
    cols MAP<STRING, STRING>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\004'
    COLLECTION ITEMS TERMINATED BY '\001'
    MAP KEYS TERMINATED BY '\002'
STORED AS TEXTFILE;
-- View that pulls the state, city and part entries out of the map for rows
-- whose "type" entry equals "request".
CREATE VIEW orders (state, city, part) AS
    SELECT
        cols["state"],
        cols["city"],
        cols["part"]
    FROM dynamictable
    WHERE cols["type"] = "request";
-- A second view over the same table; its WHERE clause keeps only the rows
-- whose "type" entry equals "response".
-- NOTE(review): `time` may be a reserved word on newer Hive releases -- confirm
-- whether it needs backtick quoting on the target version.
CREATE VIEW shipments (time, part) AS
    SELECT
        cols["time"],
        cols["part"]
    FROM dynamictable
    WHERE cols["type"] = "response";
視圖零碎
-- Create a view using the optional IF NOT EXISTS, COMMENT and TBLPROPERTIES
-- clauses. The SELECT body is elided ("...") in these notes.
CREATE VIEW IF NOT EXISTS shipments (time, part)
    COMMENT 'Time and parts for shipments.'
    TBLPROPERTIES ('creator' = 'me')
AS SELECT ...;
-- Drop the view; IF EXISTS makes the statement tolerate a missing view.
DROP VIEW IF EXISTS shipments;
-- Update the view's metadata by setting a table property.
ALTER VIEW shipments
SET TBLPROPERTIES ('created_at' = 'some_timestamp');
HiveQL:索引
-- Base table used by the index examples; partitioned by country and state.
CREATE TABLE employees (
    name         STRING,
    salary       FLOAT,
    subordinates ARRAY<STRING>,
    deductions   MAP<STRING,FLOAT>,
    address      STRUCT<street:STRING, city:STRING, state:STRING, zip:INT>
)
PARTITIONED BY (country STRING, state STRING);
-- Compact index built on the partition column country only.
-- WITH DEFERRED REBUILD defers the build: the index stays empty until
-- ALTER INDEX ... REBUILD is run.
-- NOTE(review): the COMMENT text mentions "country and name", but the indexed
-- column list is just (country).
-- NOTE(review): PARTITIONED BY is listed in the index design document but not
-- in the LanguageManual CREATE INDEX syntax -- confirm the target Hive version
-- accepts this clause.
CREATE INDEX employees_index
    ON TABLE employees (country)
    AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
    WITH DEFERRED REBUILD
    IDXPROPERTIES ('creator' = 'me', 'created_at' = 'some_time')
    IN TABLE employees_index_table
    PARTITIONED BY (country, name)
    COMMENT 'Employees indexed by country and name.';
-- Bitmap indexes are commonly used on columns with few distinct values.
-- NOTE(review): this reuses the index name and index table of the compact
-- example, so it is presumably an alternative to it rather than a follow-up.
-- NOTE(review): PARTITIONED BY is listed in the index design document but not
-- in the LanguageManual CREATE INDEX syntax -- confirm the target Hive version
-- accepts this clause.
CREATE INDEX employees_index
    ON TABLE employees (country)
    AS 'BITMAP'
    WITH DEFERRED REBUILD
    IDXPROPERTIES ('creator' = 'me', 'created_at' = 'some_time')
    IN TABLE employees_index_table
    PARTITIONED BY (country, name)
    COMMENT 'Employees indexed by country and name.';
重建索引
-- When DEFERRED REBUILD was specified, a new index starts out empty. The first
-- build, or any later rebuild, is triggered with ALTER INDEX ... REBUILD.
-- Omitting the PARTITION clause rebuilds the index for all partitions.
-- A workflow can run this statement to rebuild the corresponding index.
-- Unlike CREATE INDEX, ALTER INDEX names the table directly after ON; there is
-- no TABLE keyword in its syntax.
ALTER INDEX employees_index
    ON employees
    PARTITION (country = 'US')
    REBUILD;
顯示索引
-- Show the indexes defined on any column of the table.
SHOW FORMATTED INDEX ON employees;
-- The plural keyword is spelled INDEXES; it lists information for multiple indexes.
SHOW FORMATTED INDEXES ON employees;
刪除索引
-- Dropping an index also drops its index table.
-- DROP INDEX names the table directly after ON; there is no TABLE keyword in
-- its syntax (unlike CREATE INDEX).
DROP INDEX IF EXISTS employees_index ON employees;
實現一個定製化的索引處理器
Hive Wiki 頁面具有實現一個定製化的索引處理器的完整的例子,鏈接是 https://cwiki.apache.org/confluence/display/Hive/IndexDev#CREATE_INDEX 其中還包括了索引的初步設計文檔。