Hadoop之HDFS 内部机制知多少?

{"type":"doc","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"在前一篇\"","attrs":{}},{"type":"link","attrs":{"href":"https://xie.infoq.cn/article/d1f9dca888d6b119fd484c894","title":""},"content":[{"type":"text","text":"Hadoop的MapReduce到底有什么问题","attrs":{}}]},{"type":"text","text":"\"里,我们一起回顾了MapReduce内部机制和存在的问题。在本文中,主要讨论Hadoop里另外一个重要组件HDFS的架构和高可用相关机制。感兴趣的同学也可进一步阅读","attrs":{}},{"type":"link","attrs":{"href":"https://hadoop.apache.org/docs/current/","title":""},"content":[{"type":"text","text":"官方HDFS设计文档","attrs":{}}]},{"type":"text","text":"。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HDFS设计的目的就是分布式环境下海量数据的存储。其中最重要的目标就是:","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"系统的高可用","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"数据一致性","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"高并发 ","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"HDFS的架构与工作机制","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HDFS的架构图如下:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HDFS主要由Namenode和DataNodes组成:","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"NameNode职责","attrs":{}},{"type":"text","text":":","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 扮演的是整个分布式存储的大脑角色。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 存储HDFS所有的metadata信息,比如Namespace的名字,文件的replicas的个数等。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 执行所有文件操作系统等的动作并向DataNode发相应的Block指令,比如打开、关闭、重命名、复制等操作。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 负责Block和DataNode之间的mapping关系。","attrs":{}}]},{"type":"blockquote","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"NameNode的角色类似文件系统里的","attrs":{}},{"type":"codeinline","content":[{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"文件控制块","attrs":{}}],"marks":[{"type":"italic"}],"attrs":{}},{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"角色,在linux里文件控制块会记录着文件权限、所有者、修改时间和文件大小等文件属性信息,以及文件数据块硬盘地址索引。 HDFS的Block Size从2.7.3版本开始默认值从64M更改为128M。","attrs":{}}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"DataNodes职责","attrs":{}},{"type":"text","text":":","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 响应Client的读写请求。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 执行来自NameNode的block操作请求,比如复制,删除,新建等命令。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 负责向NameNode汇报自己的Heartbeat和BlockReport。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"HDFS的HA","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"元数据方面","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * NameNode在HDFS里的重要性不言而喻,如果NameNode挂了或者元数据丢失了,那么整个HDFS也就瘫了,因此非常需要有HA机制。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * HDFS采取的方案是: ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"主备双活NameNode + Zookeeper集群(Master选举) + Journal(共享存储)","attrs":{}}],"attrs":{}},{"type":"text","text":"。","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"文件数据方面","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 数据通过","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"replicas","attrs":{}}],"attrs":{}},{"type":"text","text":"冗余来保证HA。","attrs":{}}]},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/36/36ffaae14d107166578fca5303cd07f4.png","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"更详细的信息可以参考文章","attrs":{}},{"type":"link","attrs":{"href":"https://developer.ibm.com/zh/articles/os-cn-hadoop-name-node","title":""},"content":[{"type":"text","text":"HDFS的HA机制","attrs":{}}]},{"type":"text","text":"。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"主备NameNode + 自动主备切换","attrs":{}}]},{"type":"blockquote","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"HDFS也可以通过手动切换主备,本文主要关注通过","attrs":{}},{"type":"codeinline","content":[{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"ZK进行辅助Master选举","attrs":{}}],"marks":[{"type":"italic"}],"attrs":{}},{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"的方式进行主备切换。","attrs":{}}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"建锁结点","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"当NameNode节点需要申请成为主结点时,需要通过ZK进行Master选举时,通过抢占在ZK里建立对应的锁结点。建立锁结点成功,那么说明选主成功。其中锁结点信息包括两部分:","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"临时结点","attrs":{}},{"type":"text","text":": ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"/hadoop-ha/${dfs.nameservices}/ActiveStandbyElectorLock","attrs":{}}],"attrs":{}},{"type":"text","text":" ","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 如果ZK在一定的时间内收到不到对应的NameNode的心跳,会将这个临时结点删掉。","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"持久结点","attrs":{}},{"type":"text","text":": ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"/hadoop-ha/${dfs.nameservices}/ActiveBreadCrumb","attrs":{}}],"attrs":{}},{"type":"text","text":" ","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 持久结点会在成为主结点是同时创建。建立持久结点的目的是为了NameNode和ZK之间通信假死带来脑裂问题。持久结点里会记录NameNode的地址。当发生脑裂时,下一个被选为主结点的NameNode会去查看是不是存在持久结点,如果存在,就会采取Fencing的措施,来防止脑裂问题。具体的Fencing方法有:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 通过调用旧的Active NameNode的HAServiceProtocolRPC来去transition旧的NameNode为StandBy状态。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 通过SSH方式登录到对应的NameNode机器上Kill掉对应的进程。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 执行用户自定义的Shell脚本去隔离进程。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"注册Watch监听","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"当NameNode申请成为主结点失败时,会向ZK注册一个监听事件,来监听对应的锁节点的目录变化,当然主要监听的是NodeDelete事件,会用来触发自动主备切换事件。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"自动主备切换","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"NameNode的自动主备切换主要由","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"ZKFailoverController","attrs":{}}],"attrs":{}},{"type":"text","text":", ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"HealthMontior","attrs":{}}],"attrs":{}},{"type":"text","text":"和","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"ActiveStandbyElector","attrs":{}}],"attrs":{}},{"type":"text","text":"这3个组件来协同实现。","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"codeinline","content":[{"type":"text","text":"ZKFailoverController","attrs":{}}],"attrs":{}},{"type":"text","text":"启动时会创建HealthMonitor和ActiveStandbyElector两个组件,并向这两个组件注册对应的回调方法。","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"codeinline","content":[{"type":"text","text":"HealthMonitor","attrs":{}}],"attrs":{}},{"type":"text","text":"主要是用来监控NameNode的健康状态,如果检测到有状态变化,会调用回调函数来通知ZKFailoverController进行自动的主备选举。","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"codeinline","content":[{"type":"text","text":"ActiveStandbyElector","attrs":{}}],"attrs":{}},{"type":"text","text":"主要是负责和ZK交互, 里面封装了ZK相关的处理逻辑,当ZK master选举完成,会回调ZKFailoverController的相应方法来进行NameNode的主备状态切换。","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"具体的主备切换流程如下(可参考上面的HA图):","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Active NameNode","attrs":{}},{"type":"text","text":" ","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"① Active NameNode上的HealthMonitor发现NameNode上状态发生变化,比如没有响应。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"② 通过回调ZKFailoverController函数通知。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"③ ZKFailoverController收到通知后会调用ActiveStandbyElector去删除掉在ZK集群上创建的锁结点。对于正常情况下关闭的Active NameNode,也会将持久锁结点一并删除。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"④ ActiveStandbyElector call ZK集群删除对应的锁结点。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"⑤ 当删除结点成功后,AcitveStandbyElector会回调ZKFailoverController方法进行通知。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"⑥ ZKFailoverController会去将Active NameNode的状态切换为Standby。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Standby NameNode","attrs":{}},{"type":"text","text":" ","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"① Standby NameNode在第一次主备竞选时在ZK建立锁结点失败时会注册Watch监听。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"② 当Active NameNode进行主备切换删除锁结点,NodeDelete的事件触发Standby NameNode的ActiveStandByElector的自动创建锁结点,申请成为主结点的动作。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"③ 当申请主结点被ZK通过后,会回调ZKFailoverController进行NameNode的状态切换。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"④ ZKFailoverController调NameNode方法将状态从Standby更新为Active。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"⑤ NameNode从Journal集群里Sync最新元数据EditLog信息。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"⑥ 当所有的元数据信息整体对齐后,此时的NameNode才会真正对外提供服务。 ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"以上是正常情况下的主备切换流程。当Active NameNode整个机器宕机,或者和ZK失去通信后,根据ZK临时节点的特性,锁节点也会自动删除,自动触发主备切换。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":4},"content":[{"type":"text","text":"脑裂和Fencing","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Zookeeper的工程实践里会经常出现“假死”的情况,即客户端到服务端的心跳不能正常发出,通讯出现问题。这样当超时超过设置的Session Timeout参数时,Zookeeper就会认为客户端已经挂掉了,会自动关闭session,删除锁节点,从而引发分布式系统里的双主或者脑裂的情况。比如HDFS里,会触发自动的主备切换,而实际上原来的Active NameNode还是好的,这样就存在两个Active NameNode在工作。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HDFS HA里解决脑裂问题就是在ZK里建立持久结点通过Fencing机制,可以阅读","attrs":{}},{"type":"link","attrs":{"href":"#建锁结点","title":""},"content":[{"type":"text","text":"持久结点","attrs":{}}]},{"type":"text","text":"。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"具体到主备切换机制里,当Standby结点在② 时,会发现ZK上存在永久锁结点,那就会采取Fencing机制。当成功将原来的Active NameNode隔离(Kill或者进程隔离等),才会真正去call ZKFaioverController进行状态切换。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"Journal共享存储元数据","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Active NameNode向Journal集群结点同步写EditLog元数据,具体可参考","attrs":{}},{"type":"link","attrs":{"href":"#元数据的高并发修改","title":""},"content":[{"type":"text","text":"元数据的高并发修改","attrs":{}}]},{"type":"text","text":"部分。","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"而Standby NameNode则是定时从Journal集群同步EditLog元数据到本地。","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"在发生NameNode主备切换的时候,需要将Standby的NameNode的元数据同Journal集群结点的信息完全对齐后才可对外提供数据。","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"blockquote","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"Journal本身也是分布集群来通过Paxos算法来提供分布式数据一致性的保障。只有多数据结点通过投票以后才认为真正的数据写成功。","attrs":{}}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"元数据保护","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"可以通过维护多份FSImage(落盘) + EditLog 副本来防止元数据损坏。","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"HDFS的数据一致性","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"元数据一致性","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"主备双活NameNode之间的元数据","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 通过Journal共享存储EditLog,每次切换主备时只有对齐EditLog以后才能对外提供服务。","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"内存与磁盘里元数据","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"内存里的数据 = 最新的FSImage + EditLog","attrs":{}}],"attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 当有元数据修改时,往内存写时,需要先往EditLog里记录元数据的操作记录。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 当EditLog数据满了以后,会将EditLog应用FSImage里并和内存里的数据做同步,生成新的FSImage,清空EditLog。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"数据一致性","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"HDFS会对写入的所有数据计算校验和(checksum),并在读取数据时验证。","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"写入的时候会往DataNode发Checksum值,最后一个写的DataNode会负责检查所有负责写的DataNode的数据正确性。","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"读数据的时候,客户端也会去和存储在DataNode中的校验和进行比较。","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"HDFS高并发","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"元数据的高并发修改","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"主要的流程图如下:","attrs":{}}]},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/7c/7ca40c5e5b067fe5e43adf3c140af5a9.png","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"参考","attrs":{}},{"type":"link","attrs":{"href":"https://juejin.cn/post/6844903713966915598","title":""},"content":[{"type":"text","text":"博文","attrs":{}}]}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"主要的过程","attrs":{}},{"type":"text","text":": ","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"当有多个线程排除申请修改元数据时,会需要经过两阶段的对元数据资源申请加锁的过程。","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"第一次申请锁成功的线程,会首先生成","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"全局唯一且递增","attrs":{}}],"attrs":{}},{"type":"text","text":"的txid来作为这次元数据的标识,将元数据修改的信息(EditLog的transaction信息)写入到当下其中一个Buffer里(没有担任刷数据到磁盘的角色的Buffer里)。然后第一次快速释放锁。","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"此时前一步中的线程接着发起第二次加锁请求:","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 如果请求失败(比如现在正在有其他的线程正在写Buffer)会将自己休眠1s然后再发起新的加锁请求。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 如果第二次请求加锁成功,会先check是否有线程正在进行刷磁盘的操作:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 如果是,那么就快速释放第二次加锁然后再把自己休眠等待下次加锁请求(因为已经有人在刷磁盘了,为了不阻塞其他线程写Buffer,先释放锁信息)。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 如果不是,那么会接着check是否自己的EditLog信息已经由在后面的其他线程刷进磁盘里:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 如果是,那么就直接释放第二次加锁请求直接线程退出,因为不再需要它做任何事情;","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 如果还没刷进去,那么就由该线程担任起切换Buffer并刷数据到磁盘和Journal集群结点的重任。在切换Buffer以后,该线程会进行第二次释放锁的动作,这样其他线程可以继续往切换后的Buffer写数据了。在慢慢刷数据到本地磁盘或者通过网络刷数据到Journal结点的过程中,不会阻塞其他线程同时的写请求,提高并发量。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"主要的方法","attrs":{}},{"type":"text","text":":","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"codeinline","content":[{"type":"text","text":"分段加锁机制 + 内存双缓冲机制","attrs":{}}],"attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 分段加锁是指:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 第一阶段是在写内存缓冲区的申请对修改加锁。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 第二段是在申请刷缓冲区的数据到磁盘、Journal集群资格的时候申请加锁。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 整个过程中只有一个锁,保护的元数据的资源。当开始刷数据时,会立刻释放锁,不会阻塞后续其他往内存缓冲区写数据的线程。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 内存双缓存:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 缓冲1用来当下的写入Log。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 缓冲2用来读取已经写入的刷到磁盘和Journal结点。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" * 两个缓存会交换角色(需要时机判断)","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"缓冲数据批量刷","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"磁盘+网络","attrs":{}}],"attrs":{}},{"type":"text","text":"优化","attrs":{}}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"多线程并发吞吐量支持","attrs":{}}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"Reference","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https://hadoop.apache.org/docs/current/","title":""},"content":[{"type":"text","text":"Hadoop文档","attrs":{}}]}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html","title":""},"content":[{"type":"text","text":"HDFS Design文档","attrs":{}}]}]}],"attrs":{}},{"type":"listitem","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https://developer.ibm.com/zh/articles/os-cn-hadoop-name-node","title":""},"content":[{"type":"text","text":"HDFS的HA机制","attrs":{}}]}]}],"attrs":{}}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"更多大数据相关分享,可在微信公众号搜索“","attrs":{}},{"type":"text","marks":[{"type":"color","attrs":{"color":"#FF7021","name":"orange"}},{"type":"strong","attrs":{}}],"text":"数据元素","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"”或扫描下方二维码。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/d5/d5863c0edfb19d09b5999079fd703d7c.gif","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}}]}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章