HBase 底層原理詳解(深度好文,建議收藏)

{"type":"doc","content":[{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"HBase簡介","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HBase 是一個分佈式的、面向列的開源數據庫。建立在 HDFS 之上。Hbase的名字的來源是 Hadoop database,即 Hadoop 數據庫。HBase 的計算和存儲能力取決於 Hadoop 集羣。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"它介於 NoSql 和 RDBMS 之間,僅能通過主鍵(row key)和主鍵的 range 來檢索數據,僅支持單行事務(可通過 Hive 支持來實現多表 join 等複雜操作)。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HBase中表的特點:","attrs":{}}]},{"type":"numberedlist","attrs":{"start":null,"normalizeStart":1},"content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":1,"align":null,"origin":null},"content":[{"type":"text","text":"大:一個表可以有上十億行,上百萬列","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":2,"align":null,"origin":null},"content":[{"type":"text","text":"面向列:面向列(族)的存儲和權限控制,列(族)獨立檢索。","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":3,"align":null,"origin":null},"content":[{"type":"text","text":"稀疏:","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"對於爲空(null)的列,並不佔用存儲空間,因此,表可以設計的非常稀疏","attrs":{}},{"type":"text","text":"。","attrs":{}}]}],"attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"HBase底層原理","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"系統架構","attrs":{}}]},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/64/64b66b06b6790d89a8e0eb9a8e933f68.png","alt":"HBase系統架構","title":"HBase系統架構","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HBase系統架構","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"根據這幅圖,解釋下HBase中各個組件","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"Client","attrs":{}}]},{"type":"numberedlist","attrs":{"start":null,"normalizeStart":1},"content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":1,"align":null,"origin":null},"content":[{"type":"text","text":"包含訪問hbase的接口,","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Client維護着一些cache來加快對hbase的訪問","attrs":{}},{"type":"text","text":",比如regione的位置信息.","attrs":{}}]}],"attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"Zookeeper","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HBase可以使用內置的Zookeeper,也可以使用外置的,在實際生產環境,爲了保持統一性,一般使用外置Zookeeper。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Zookeeper在HBase中的作用:","attrs":{}}]},{"type":"numberedlist","attrs":{"start":null,"normalizeStart":1},"content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":1,"align":null,"origin":null},"content":[{"type":"text","text":"保證任何時候,集羣中只有一個master","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":2,"align":null,"origin":null},"content":[{"type":"text","text":"存貯所有Region的尋址入口","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":3,"align":null,"origin":null},"content":[{"type":"text","text":"實時監控Region Server的狀態,將Region server的上線和下線信息實時通知給Master","attrs":{}}]}],"attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"HMaster","attrs":{}}]},{"type":"numberedlist","attrs":{"start":null,"normalizeStart":1},"content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":1,"align":null,"origin":null},"content":[{"type":"text","text":"爲Region server分配region","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":2,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"負責region server的負載均衡","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":3,"align":null,"origin":null},"content":[{"type":"text","text":"發現失效的region server並重新分配其上的region","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":4,"align":null,"origin":null},"content":[{"type":"text","text":"HDFS上的垃圾文件回收","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":5,"align":null,"origin":null},"content":[{"type":"text","text":"處理schema更新請求","attrs":{}}]}],"attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"HRegion Server","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HRegion server","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"維護HMaster分配給它的region","attrs":{}},{"type":"text","text":",處理對這些region的IO請求","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HRegion server負責切分在運行過程中變得過大的region","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"從圖中可以看到,","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Client訪問HBase上數據的過程並不需要HMaster參與","attrs":{}},{"type":"text","text":"(尋址訪問Zookeeper和HRegion server,數據讀寫訪問HRegione server)","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"HMaster僅僅維護者table和HRegion的元數據信息,負載很低。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"HBase的表數據模型","attrs":{}}]},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/a5/a5ea27dd8097742bcdcff5f8bf0cc860.png","alt":"HBase的表結構","title":"HBase的表結構","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HBase的表結構","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"行鍵 Row Key","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"與nosql數據庫一樣,row key是用來檢索記錄的主鍵。訪問hbase table中的行,只有三種方式:","attrs":{}}]},{"type":"numberedlist","attrs":{"start":null,"normalizeStart":1},"content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":1,"align":null,"origin":null},"content":[{"type":"text","text":"通過單個row key訪問","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":2,"align":null,"origin":null},"content":[{"type":"text","text":"通過row key的range","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":3,"align":null,"origin":null},"content":[{"type":"text","text":"全表掃描","attrs":{}}]}],"attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Row Key 行鍵可以是任意字符串(","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"最大長度是 64KB","attrs":{}},{"type":"text","text":",實際應用中長度一般爲 10-100bytes),在hbase內部,row key保存爲字節數組。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Hbase會對錶中的數據按照rowkey排序(字典順序)","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"存儲時,數據按照Row key的字典序(byte order)排序存儲。設計key時,要充分排序存儲這個特性,將經常一起讀取的行存儲放到一起。(位置相關性)。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"注意:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"字典序對int排序的結果是","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"1,10,100,11,12,13,14,15,16,17,18,19,2,20,21 … 。","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"要保持整形的自然序,行鍵必須用0作左填充。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"行的一次讀寫是原子操作 (不論一次讀寫多少列)","attrs":{}},{"type":"text","text":"。這個設計決策能夠使用戶很容易的理解程序在對同一個行進行併發更新操作時的行爲。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"列族 Column Family","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"HBase表中的每個列,都歸屬於某個列族","attrs":{}},{"type":"text","text":"。列族是表的schema的一部分(而列不是),","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"必須在使用表之前定義","attrs":{}},{"type":"text","text":"。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"列名都以列族作爲前綴。例如 courses:history , courses:math 都屬於 courses 這個列族。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"訪問控制、磁盤和內存的使用統計都是在列族層面進行的。列族越多,在取一行數據時所要參與IO、搜尋的文件就越多,所以,如果沒有必要,不要設置太多的列族。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"列 Column","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"列族下面的具體列,屬於某一個ColumnFamily,類似於在mysql當中創建的具體的列。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"時間戳 Timestamp","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HBase中通過row和columns確定的爲一個存貯單元稱爲cell。每個 cell都保存着同一份數據的多個版本。版本通過時間戳來索引。時間戳的類型是 64位整型。","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"時間戳可以由hbase(在數據寫入時自動 )賦值","attrs":{}},{"type":"text","text":",此時時間戳是精確到毫秒的當前系統時間。時間戳也可以由客戶顯式賦值。如果應用程序要避免數據版本衝突,就必須自己生成具有唯一性的時間戳。","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"每個 cell中,不同版本的數據按照時間倒序排序","attrs":{}},{"type":"text","text":",即最新的數據排在最前面。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"爲了避免數據存在過多版本造成的的管理 (包括存貯和索引)負擔,hbase提供了兩種數據版本回收方式:","attrs":{}}]},{"type":"numberedlist","attrs":{"start":null,"normalizeStart":1},"content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":1,"align":null,"origin":null},"content":[{"type":"text","text":"保存數據的最後n個版本","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":2,"align":null,"origin":null},"content":[{"type":"text","text":"保存最近一段時間內的版本(設置數據的生命週期TTL)。","attrs":{}}]}],"attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"用戶可以針對每個列族進行設置。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"單元 Cell","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"由{row key, column( = +
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章