数据架构:概念与冷热分离

{"type":"doc","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"封面图片来自51CTO.com的文章:","attrs":{}},{"type":"link","attrs":{"href":"http://www.100ec.cn/detail--6069294.html","title":"","type":null},"content":[{"type":"text","text":"淘宝大数据分析应对双十一来袭","attrs":{}}]},{"type":"text","text":"。引用请注明来源。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":1},"content":[{"type":"text","text":"一 什么是数据架构","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" 关于架构,大家都有了解和理解。通常一个业务或项目,在做架构设计时,可能会包含业务架构和技术架构。其中技术架构是我们作为开发角色,在做设计时重点的工作内容。但还有架构类型的划分方式,会包括业务架构、技术架构、数据架构和应用架构四种。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" 数据架构管理的内容包括管理对象、管理流程、管理组织,管理对象又包括数据标准、数据模型、数据库、数据质量。总之,数据架构就是由一定的管理组织,通过一系列管理流程,来实现对数据对象的管理。数据架构构成如下图所示:","attrs":{}}]},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/f5/f50e16e0f52838740077310c15ac4f95.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":1},"content":[{"type":"text","text":"二 为什么需要数据架构","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" “经验来源于实践”。经历过一个或多个中大型项目/产品生命周期的朋友,大多会有这样的经验。在项目早期时,为了快速验证,会以尽快上线运行为最主要的目标,架构设计会有数据结构部分,但不会过多设计。在项目快速发展之后,频繁的表结构变更、数据类型变化会带来一系列的问题,尤其是当可能发生拆库、分表等动作之后,带来几个典型的数据问题:","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"2.1 数据标准不一致","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"列名相同,数据类型不同;","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"列名相同,数据类型相同,长度不同;","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"列名定义没有统一标准,识别困难;","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"列名定义不统一,类型不统一,长度不同;","attrs":{}}]}]}],"attrs":{}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"2.2 数据模型混乱","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"表、字段缺乏注释;","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"表无主键、允许为NULL列;","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"表关系不清晰;","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"不合理的冗余设计;","attrs":{}}]}]}],"attrs":{}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"2.3 性能问题","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"对表结构、索引理解、使用不当;","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"SQL的编写与开发者的技术水平有关,当sql编写不当且缺乏审核导致带入线上,就会导致性能问题","attrs":{}}]}]}],"attrs":{}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"2.4 数据缺乏安全管理","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"表结构规范;","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"索引合理性设计、创建检查;","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"SQL质量;","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"数据安全管理(插入、删除、更新,以及批量查询动作)","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":1},"content":[{"type":"text","text":"三 数据架构生命周期","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" 数据架构的重点是数据的标准化处理,这会贯穿于系统/项目的整个生命周期。包括数据架构设计阶段、开发阶段、迁移阶段、测试阶段等等。","attrs":{}}]},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/19/195f606a5440f8940cece2de9370d0f7.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"heading","attrs":{"align":null,"level":1},"content":[{"type":"text","text":"四 数据冷热分离","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"4.1 大数据存储方案","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" 关于大数据量的存储方案,常用的有分库分表方案,可以选用多种分库分表技术和中间件来实现。但有一个问题,当单表数据量到达多少的时候执行分库分表?","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"4.2 单表上限2000w的起源","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" 一直有这样一种说法,MySQL 单表数据量大于 2000 万行,性能会明显下降。","attrs":{}},{"type":"link","attrs":{"href":"https://zhuanlan.zhihu.com/p/370031862","title":"","type":null},"content":[{"type":"text","text":"冷热分离之 OTS 表格存储实战","attrs":{}}]},{"type":"text","text":"这篇文章中给出了来源:这个传闻据说最早起源于百度。“具体情况大概是这样的,当年的 DBA 测试 MySQL性能时发现,当单表的量在 2000 万行量级的时候,SQL 操作的性能急剧下降,因此,结论由此而来。然后又据说百度的工程师流动到业界的其它公司,随之也带去了这个信息,所以,就在业界流传开这么一个说法。再后来,阿里巴巴《Java 开发手册》提出单表行数超过 500 万行或者单表容量超过2GB,才推荐进行分库分表。对此,有阿里的黄金铁律支撑,所以,很多人设计大数据存储时,多会以此为标准,进行分表操作。”","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" 背后的原理,大家也可以仔细阅读这篇文章,简单来说,这个说法源于Mysql的InnoDB引擎的存储结构和索引结构。记录数过多时导致B+树高度过高从而需要多次IO,导致性能明显下降。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"4.3 冷热分离","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"4.3.1 数据的冷热划分","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" 首先,绝大部分场景,数据都可以分为“冷数据”和“热数据”。数据划分的原则,可以根据时间远近、热点/非热点用户等等。例如在以往项目中的实例,用户通常只访问一段时间之内的数据,例如近一周或一个月。如果数据不做划分,必然会导致一定程度上的性能、成本损耗。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"4.3.2 冷热分离好处","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" 通过合理的冷热分离设计,可以达到的好处:","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"降低单表数据量,","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"提升单表性能","attrs":{}},{"type":"text","text":";","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"大量业务冷数据转冷存,","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"存储成本可以降低很多,至少50%+","attrs":{}},{"type":"text","text":"。","attrs":{}}]}]}],"attrs":{}},{"type":"heading","attrs":{"align":null,"level":1},"content":[{"type":"text","text":"五 冷热分离方案","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" 需要考虑的包括存储方案、数据迁移方案,另外需要做历史查询时也需要支持聚合查询和自动的冷热查询路由。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"5.1 存储方案","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" 存储方案,包括本地方案和云方案。本地的存储介质,通常是硬盘,但通常机械硬盘会受限于磁盘空间和IO瓶颈,这也是单表限制的主要原因。所以一般处于性能提升的考虑,会使用固态硬盘(SSD)。但SSD成本较高(远高于机械硬盘),所以不适合海量数据存储,这时候就需要考虑磁盘阵列等等。另外,磁带也是一种方案,但仅适合历史数据的持久化保存,和必要时做数据恢复,本身并不适合查询。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" 如果能够接受云方案,那么可选的就有云硬盘作为DB的存储介质、或者是云服务上提供的冷热存储(blob、表格存储)。阿里云的OTS就是一种表格存储实现,其技术架构如下图所示:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/d4/d42ae92995a1cc93ae5886f0932f9e15.jpeg","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"5.2 迁移方案","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" 数据有冷热划分,那么就会有界限、生命周期。新的数据写入时,其属性是“热”的;当到达某个时间节点或预设阈值时,就需要把数据迁移到“冷”数据存储。这里又涉及到几个问题:","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"冷热数据标记","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"迁移方法。时效性保障","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"数据一致性","attrs":{}}]}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"另外,为了保证冷热数据迁移过程中业务系统的稳定性,在数据迁移的过程中还一定要做到:","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"可灰度","attrs":{}},{"type":"text","text":"[降低影响,提前发现],","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"可一键回滚","attrs":{}},{"type":"text","text":"[快速止血]。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":1},"content":[{"type":"text","text":"总结","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":" 本文介绍了数据架构的概念、意义,以及数据的冷热分离,并阐述了冷热分离方案和注意事项。本篇作为综述,在后续系列文章中会通过实际案例来进一步探究数据架构的内容。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":1},"content":[{"type":"text","text":"参考文章","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"http://blog.sina.com.cn/s/blog_4d22b9720102xhrr.html","title":"","type":null},"content":[{"type":"text","text":"数据架构-什么是数据架构","attrs":{}}]}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}}]}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章