Hadoop之HDFS 內部機制知多少?

{"type":"doc","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"在前一篇\"","attrs":{}},{"type":"link","attrs":{"href":"https://xie.infoq.cn/article/d1f9dca888d6b119fd484c894","title":""},"content":[{"type":"text","text":"Hadoop的MapReduce到底有什麼問題","attrs":{}}]},{"type":"text","text":"\"裏,我們一起回顧了MapReduce內部機制和存在的問題。在本文中,主要討論Hadoop裏另外一個重要組件HDFS的架構和高可用相關機制。感興趣的同學也可進一步閱讀","attrs":{}},{"type":"link","attrs":{"href":"https://hadoop.apache.org/docs/current/","title":""},"content":[{"type":"text","text":"官方HDFS設計文檔","attrs":{}}]},{"type":"text","text":"。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HDFS設計的目的就是分佈式環境下海量數據的存儲。其中最重要的目標就是:","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"系統的高可用","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"數據一致性","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"高併發 
","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"HDFS的架構與工作機制","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HDFS的架構圖如下:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HDFS主要由Namenode和DataNodes組成:","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"NameNode職責","attrs":{}},{"type":"text","text":":","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 扮演的是整個分佈式存儲的大腦角色。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 存儲HDFS所有的metadata信息,比如Namespace的名字,文件的replicas的個數等。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 執行所有文件操作系統等的動作並向DataNode發相應的Block指令,比如打開、關閉、重命名、複製等操作。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 
負責Block和DataNode之間的mapping關係。","attrs":{}}]},{"type":"blockquote","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"NameNode的角色類似文件系統裏的","attrs":{}},{"type":"codeinline","content":[{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"文件控制塊","attrs":{}}],"marks":[{"type":"italic"}],"attrs":{}},{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"角色,在linux裏文件控制塊會記錄着文件權限、所有者、修改時間和文件大小等文件屬性信息,以及文件數據塊硬盤地址索引。 HDFS的Block Size從2.7.3版本開始默認值從64M更改爲128M。","attrs":{}}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"DataNodes職責","attrs":{}},{"type":"text","text":":","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 響應Client的讀寫請求。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 執行來自NameNode的block操作請求,比如複製,刪除,新建等命令。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 
負責向NameNode彙報自己的Heartbeat和BlockReport。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"HDFS的HA","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"元數據方面","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * NameNode在HDFS裏的重要性不言而喻,如果NameNode掛了或者元數據丟失了,那麼整個HDFS也就癱了,因此非常需要有HA機制。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * HDFS採取的方案是: ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"主備雙活NameNode + Zookeeper集羣(Master選舉) + Journal(共享存儲)","attrs":{}}],"attrs":{}},{"type":"text","text":"。","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"文件數據方面","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 
數據通過","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"replicas","attrs":{}}],"attrs":{}},{"type":"text","text":"冗餘來保證HA。","attrs":{}}]},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/36/36ffaae14d107166578fca5303cd07f4.png","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"更詳細的信息可以參考文章","attrs":{}},{"type":"link","attrs":{"href":"https://developer.ibm.com/zh/articles/os-cn-hadoop-name-node","title":""},"content":[{"type":"text","text":"HDFS的HA機制","attrs":{}}]},{"type":"text","text":"。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"主備NameNode + 
自動主備切換","attrs":{}}]},{"type":"blockquote","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"HDFS也可以通過手動切換主備,本文主要關注通過","attrs":{}},{"type":"codeinline","content":[{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"ZK進行輔助Master選舉","attrs":{}}],"marks":[{"type":"italic"}],"attrs":{}},{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"的方式進行主備切換。","attrs":{}}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"建鎖結點","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"當NameNode節點需要申請成爲主結點時,需要通過ZK進行Master選舉,即通過搶佔的方式在ZK裏建立對應的鎖結點。建立鎖結點成功,那麼說明選主成功。其中鎖結點信息包括兩部分:","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"臨時結點","attrs":{}},{"type":"text","text":": ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"/hadoop-ha/${dfs.nameservices}/ActiveStandbyElectorLock","attrs":{}}],"attrs":{}},{"type":"text","text":" ","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 如果ZK在一定的時間內收不到對應的NameNode的心跳,會將這個臨時結點刪掉。","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"持久結點","attrs":{}},{"type":"text","text":": ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"/hadoop-ha/${dfs.nameservices}/ActiveBreadCrumb","attrs":{}}],"attrs":{}},{"type":"text","text":" 
","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 持久結點會在成爲主結點時同時創建。建立持久結點的目的是爲了解決NameNode和ZK之間通信假死帶來的腦裂問題。持久結點裏會記錄NameNode的地址。當發生腦裂時,下一個被選爲主結點的NameNode會去查看是不是存在持久結點,如果存在,就會採取Fencing的措施,來防止腦裂問題。具體的Fencing方法有:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 通過調用舊的Active NameNode的HAServiceProtocolRPC來去transition舊的NameNode爲StandBy狀態。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 通過SSH方式登錄到對應的NameNode機器上Kill掉對應的進程。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 執行用戶自定義的Shell腳本去隔離進程。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"註冊Watch監聽","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"當NameNode申請成爲主結點失敗時,會向ZK註冊一個監聽事件,來監聽對應的鎖節點的目錄變化,當然主要監聽的是NodeDelete事件,會用來觸發自動主備切換事件。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"自動主備切換","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"NameNode的自動主備切換主要由","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"ZKFailoverController","attrs":{}}],"attrs":{}},{"type":"text","text":", 
","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"HealthMonitor","attrs":{}}],"attrs":{}},{"type":"text","text":"和","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"ActiveStandbyElector","attrs":{}}],"attrs":{}},{"type":"text","text":"這3個組件來協同實現。","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"codeinline","content":[{"type":"text","text":"ZKFailoverController","attrs":{}}],"attrs":{}},{"type":"text","text":"啓動時會創建HealthMonitor和ActiveStandbyElector兩個組件,並向這兩個組件註冊對應的回調方法。","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"codeinline","content":[{"type":"text","text":"HealthMonitor","attrs":{}}],"attrs":{}},{"type":"text","text":"主要是用來監控NameNode的健康狀態,如果檢測到有狀態變化,會調用回調函數來通知ZKFailoverController進行自動的主備選舉。","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"codeinline","content":[{"type":"text","text":"ActiveStandbyElector","attrs":{}}],"attrs":{}},{"type":"text","text":"主要是負責和ZK交互, 裏面封裝了ZK相關的處理邏輯,當ZK master選舉完成,會回調ZKFailoverController的相應方法來進行NameNode的主備狀態切換。","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"具體的主備切換流程如下(可參考上面的HA圖):","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Active NameNode","attrs":{}},{"type":"text","text":" 
","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"① Active NameNode上的HealthMonitor發現NameNode上狀態發生變化,比如沒有響應。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"② 通過回調ZKFailoverController函數通知。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"③ ZKFailoverController收到通知後會調用ActiveStandbyElector去刪除掉在ZK集羣上創建的鎖結點。對於正常情況下關閉的Active NameNode,也會將持久鎖結點一併刪除。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"④ ActiveStandbyElector call ZK集羣刪除對應的鎖結點。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"⑤ 當刪除結點成功後,ActiveStandbyElector會回調ZKFailoverController方法進行通知。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"⑥ ZKFailoverController會去將Active NameNode的狀態切換爲Standby。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Standby NameNode","attrs":{}},{"type":"text","text":" ","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"① Standby NameNode在第一次主備競選時在ZK建立鎖結點失敗時會註冊Watch監聽。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"② 當Active NameNode進行主備切換刪除鎖結點,NodeDelete的事件觸發Standby NameNode的ActiveStandbyElector的自動創建鎖結點,申請成爲主結點的動作。 
","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"③ 當申請主結點被ZK通過後,會回調ZKFailoverController進行NameNode的狀態切換。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"④ ZKFailoverController調NameNode方法將狀態從Standby更新爲Active。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"⑤ NameNode從Journal集羣裏Sync最新元數據EditLog信息。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"⑥ 當所有的元數據信息整體對齊後,此時的NameNode纔會真正對外提供服務。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"以上是正常情況下的主備切換流程。當Active NameNode整個機器宕機,或者和ZK失去通信後,根據ZK臨時節點的特性,鎖節點也會自動刪除,自動觸發主備切換。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"腦裂和Fencing","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Zookeeper的工程實踐裏會經常出現“假死”的情況,即客戶端到服務端的心跳不能正常發出,通訊出現問題。這樣當超時超過設置的Session Timeout參數時,Zookeeper就會認爲客戶端已經掛掉了,會自動關閉session,刪除鎖節點,從而引發分佈式系統裏的雙主或者腦裂的情況。比如HDFS裏,會觸發自動的主備切換,而實際上原來的Active NameNode還是好的,這樣就存在兩個Active NameNode在工作。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HDFS 
HA裏解決腦裂問題的方式就是在ZK裏建立持久結點,並配合Fencing機制,可以閱讀","attrs":{}},{"type":"link","attrs":{"href":"#建鎖結點","title":""},"content":[{"type":"text","text":"持久結點","attrs":{}}]},{"type":"text","text":"部分。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"具體到主備切換機制裏,當Standby結點執行到步驟②時,會發現ZK上存在持久鎖結點,那就會採取Fencing機制。當成功將原來的Active NameNode隔離(Kill或者進程隔離等),纔會真正去call ZKFailoverController進行狀態切換。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"Journal共享存儲元數據","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Active NameNode向Journal集羣結點同步寫EditLog元數據,具體可參考","attrs":{}},{"type":"link","attrs":{"href":"#元數據的高併發修改","title":""},"content":[{"type":"text","text":"元數據的高併發修改","attrs":{}}]},{"type":"text","text":"部分。","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"而Standby 
NameNode則是定時從Journal集羣同步EditLog元數據到本地。","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"在發生NameNode主備切換的時候,需要將Standby的NameNode的元數據同Journal集羣結點的信息完全對齊後纔可對外提供數據。","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"blockquote","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"Journal本身也是分佈集羣來通過Paxos算法來提供分佈式數據一致性的保障。只有多數據結點通過投票以後才認爲真正的數據寫成功。","attrs":{}}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"元數據保護","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"可以通過維護多份FSImage(落盤) + EditLog 副本來防止元數據損壞。","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"HDFS的數據一致性","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"元數據一致性","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"主備雙活NameNode之間的元數據","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 
通過Journal共享存儲EditLog,每次切換主備時只有對齊EditLog以後才能對外提供服務。","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"內存與磁盤裏元數據","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"內存裏的數據 = 最新的FSImage + EditLog","attrs":{}}],"attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 當有元數據修改需要寫入內存時,需要先往EditLog裏記錄元數據的操作記錄。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 當EditLog數據滿了以後,會將EditLog應用到FSImage裏並和內存裏的數據做同步,生成新的FSImage,清空EditLog。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"數據一致性","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HDFS會對寫入的所有數據計算校驗和(checksum),並在讀取數據時驗證。","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"寫入的時候會往DataNode發Checksum值,最後一個寫的DataNode會負責檢查所有負責寫的DataNode的數據正確性。","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"讀數據的時候,客戶端也會去和存儲在DataNode中的校驗和進行比較。","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"HDFS高併發","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content
":[{"type":"text","text":"元數據的高併發修改","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"主要的流程圖如下:","attrs":{}}]},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/7c/7ca40c5e5b067fe5e43adf3c140af5a9.png","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"參考","attrs":{}},{"type":"link","attrs":{"href":"https://juejin.cn/post/6844903713966915598","title":""},"content":[{"type":"text","text":"博文","attrs":{}}]}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"主要的過程","attrs":{}},{"type":"text","text":": 
","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"當有多個線程排隊申請修改元數據時,會需要經過兩階段的對元數據資源申請加鎖的過程。","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"第一次申請鎖成功的線程,會首先生成","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"全局唯一且遞增","attrs":{}}],"attrs":{}},{"type":"text","text":"的txid來作爲這次元數據的標識,將元數據修改的信息(EditLog的transaction信息)寫入到當下其中一個Buffer裏(沒有擔任刷數據到磁盤的角色的Buffer裏)。然後第一次快速釋放鎖。","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"此時前一步中的線程接着發起第二次加鎖請求:","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 如果請求失敗(比如現在有其他的線程正在寫Buffer)會將自己休眠1s然後再發起新的加鎖請求。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 如果第二次請求加鎖成功,會先check是否有線程正在進行刷磁盤的操作:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 如果是,那麼就快速釋放第二次加鎖然後再把自己休眠等待下次加鎖請求(因爲已經有人在刷磁盤了,爲了不阻塞其他線程寫Buffer,先釋放鎖信息)。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 如果不是,那麼會接着check是否自己的EditLog信息已經由在後面的其他線程刷進磁盤裏:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 如果是,那麼就直接釋放第二次加鎖請求直接線程退出,因爲不再需要它做任何事情;","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 
如果還沒刷進去,那麼就由該線程擔任起切換Buffer並刷數據到磁盤和Journal集羣結點的重任。在切換Buffer以後,該線程會進行第二次釋放鎖的動作,這樣其他線程可以繼續往切換後的Buffer寫數據了。在慢慢刷數據到本地磁盤或者通過網絡刷數據到Journal結點的過程中,不會阻塞其他線程同時的寫請求,提高併發量。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"主要的方法","attrs":{}},{"type":"text","text":":","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"codeinline","content":[{"type":"text","text":"分段加鎖機制 + 內存雙緩衝機制","attrs":{}}],"attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 分段加鎖是指:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 第一階段是在寫內存緩衝區的申請對修改加鎖。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 第二段是在申請刷緩衝區的數據到磁盤、Journal集羣資格的時候申請加鎖。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 整個過程中只有一個鎖,保護的元數據的資源。當開始刷數據時,會立刻釋放鎖,不會阻塞後續其他往內存緩衝區寫數據的線程。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 內存雙緩存:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 緩衝1用來當下的寫入Log。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 
緩衝2用來讀取已經寫入的刷到磁盤和Journal結點。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 兩個緩存會交換角色(需要時機判斷)","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"緩衝數據批量刷","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"磁盤+網絡","attrs":{}}],"attrs":{}},{"type":"text","text":"優化","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"多線程併發吞吐量支持","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"Reference","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https://hadoop.apache.org/docs/current/","title":""},"content":[{"type":"text","text":"Hadoop文檔","attrs":{}}]}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html","title":""},"content":[{"type":"text","text":"HDFS 
Design文檔","attrs":{}}]}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https://developer.ibm.com/zh/articles/os-cn-hadoop-name-node","title":""},"content":[{"type":"text","text":"HDFS的HA機制","attrs":{}}]}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"更多大數據相關分享,可在微信公衆號搜索“","attrs":{}},{"type":"text","marks":[{"type":"color","attrs":{"color":"#FF7021","name":"orange"}},{"type":"strong","attrs":{}}],"text":"數據元素","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"”或掃描下方二維碼。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/d5/d5863c0edfb19d09b5999079fd703d7c.gif","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}}]}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章