Google大规模监控系统--Monarch

{"type":"doc","content":[{"type":"blockquote","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Monarch是Google的大规模监控系统,服务于Google全球大规模实时业务监控,其实现为超大规模时序数据库集群,被公认为当今全球最大规模的时序数据库。Monarch本身没有开源,主要信息来源于Google在2020年8月份发表在PVLDB上的一篇论文:","attrs":{}},{"type":"link","attrs":{"href":"https://www.vldb.org/pvldb/vol13/p3181-adams.pdf","title":"","type":null},"content":[{"type":"text","text":"Monarch: Google’s Planet-Scale In-Memory Time Series Database","attrs":{}}]},{"type":"text","text":"。这篇文章是Medium上一篇介绍Google Monarch的文章的中文翻译,原文链接:","attrs":{}},{"type":"link","attrs":{"href":"https://medium.com/geekculture/understanding-monarch-googles-planet-scale-monitoring-system-60e59b63ac0c","title":"","type":null},"content":[{"type":"text","text":"Understanding Monarch, Google’s Planet-Scale Monitoring System","attrs":{}}]},{"type":"text","text":"。","attrs":{}}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"以下为正文,斜体字是我加的备注:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Monarch是Google开发的大规模内存时序数据库,用于Google的大部分内部系统(如Spanner, BigTable, Colossus, 
BlobStore)的可靠监控系统。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"与任何其他的Google服务一样,它必须被设计为大规模、高可用并且支持区域局部性。但与其他服务不同的是,Monarch需要尽可能少地依赖其他Google服务,这一点非常重要,因为其他服务都在使用Monarch对自己进行监控,因此如果有相互依赖关系,任何一方的服务中断都会影响到另一方。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Monarch是一种必须支持高可用性和分区容错性的服务,因此在最终一致性有延迟的情况下,它通过向客户服务提供一定的提示来解决低一致性的问题。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"blockquote","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"这意味着Monarch是AP系统,支持最终一致性而不是强一致性,这也是大规模分布式系统设计的普遍做法。","attrs":{}}]}],"attrs":{}},{"type":"heading","attrs":{"align":null,"level":5},"content":[{"type":"text","text":"Monarch的数据存储","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"数据以两种格式存储:","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Leaves","attrs":{}},{"type":"text","text":"(叶子)组件在内存中存储实际的监控数据","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Logs","attrs":{}},{"type":"text","text":"(日志)是持久化存储,可用于在组件失败时重放事件","attrs":{}}]}]}],"attrs":{}},{"type":"blockquote","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"co
ntent":[{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"Leaf是内存中的缓存存储模块,Log用来做持久化和高可用,在服务失效的时候可用于快速恢复数据。","attrs":{}}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":5},"content":[{"type":"text","text":"Monarch的数据获取","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Data Ingestion Pipeline(数据接收管道)遵循如下原则:","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"将客户端服务的数据存储在距离服务运行的zone越近越好,从而减少网络延迟","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"将客户服务的数据存储在同一个Leaf中,因为数据查询很有可能被集中在该Leaf上,以获得更快的查询响应","attrs":{}}]}]}],"attrs":{}},{"type":"blockquote","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"就近原则和内聚原则,从而尽可能提高访问性能。","attrs":{}}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/36/369eab95f35f9ac3b588355be7c64186.webp","alt":"图片","title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","marks":[{"type":"size","attrs":{"size":9}}],"text":"Monarch模块架构","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"以下情况将触发数据传输:","attrs":{}}]}
,{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Ingestion Router(数据接收路由器)","attrs":{}},{"type":"text","text":"将数据发送给叶子路由器","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Leaf Router(叶子路由器)","attrs":{}},{"type":"text","text":"将数据发送给叶子","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Range Assigner(分配器)","attrs":{}},{"type":"text","text":"决定存储数据的叶子","attrs":{}}]}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"数据接收路由器根据位置字段将时间序列数据分区,叶子路由器根据分配器的决定将数据分布在叶子上。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/c3/c30910e451a0d64d36208d64e43f376f.webp","alt":"图片","title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","marks":[{"type":"size","attrs":{"size":9}}],"text":"数据格式模型","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"接收到的数据包含以下几类:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origi
n":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Targets(目标)","attrs":{}},{"type":"text","text":"用于识别数据生成的节点/服务/组件。以上图为例,目标字符串","attrs":{}},{"type":"text","marks":[{"type":"italic","attrs":{}},{"type":"strong","attrs":{}}],"text":"ComputeTask::sql-dba::db.server::aa::0876","attrs":{}},{"type":"text","text":"表示数据库服务器的Borg任务。目标字符串的格式对于决定数据在叶子之间的存储位置很重要,因为目标范围用于决定叶子之间的字典分片和负载均衡。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Metrics(指标)","attrs":{}},{"type":"text","text":"以key-value(键值对)的格式包含指标信息,其中键是目标的指标类型,值是基于时间序列的数据点。支持的度量类型有boolean、int64、double、string、distribution或tuple。度量值可以是cumulative(累计值),也可以是gauge(测量值)。使用累计值的优点是偶尔的数据丢失对统计分布的影响不大。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"blockquote","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"italic","attrs":{}},{"type":"strong","attrs":{}}],"text":"distribution","attrs":{}},{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"实际上是一个double list,由许多个 bucket组成,每个bucket里有一个double值,用于表示数据分布范围的统计。举个栗子:有一个访问延迟的distribution定义了4个bucket: 0-10ms, 10-20ms, 20-30ms, 30ms以上,每个bucket里的double数值定义了在这个延迟范围内的访问次数。","attrs":{}}]},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/0b/0bb702264f995e2b9055438819787e8c.webp","alt":"图片","title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"size","attrs":{"size":9}}],"text":" 
distribution类型示例","attrs":{}}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"数据可以以","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Delta Time Series(增量时间序列)","attrs":{}},{"type":"text","text":"的格式发送,这样只需要发送时间序列数据中的增量,而不是整个指标数据。这减少了数据的连续输入,并且只需要在数据有变化的时候进行处理。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Bucketing","attrs":{}},{"type":"text","text":"在将数据发送到数据接收管道之前的一段时间内负责聚合数据,从而减少网络处理,并且可以实现批量插入。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Admission 
windows","attrs":{}},{"type":"text","text":"用于丢弃在一定时间后接收到的数据,从而避免处理过期数据的压力。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"blockquote","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"因为Bucketing会在一定时间内聚合数据,然后存到叶子并且持久化,这样如果有数据来晚了,对应的Bucket已经完成了处理,就需要将迟到的数据丢弃。","attrs":{}}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":5},"content":[{"type":"text","text":"数据查询","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Monarch提供了一个全局联合查询引擎,所有查询都可以在全局级别启动,Monarch负责将查询路由到存储相关数据的叶子,并整合来自叶子的响应。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"上面的模块图显示了查询相关的模块:","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Mixers","attrs":{}},{"type":"text","text":"将查询分解为子查询,并合并来自子查询的响应。根Mixer接收查询并将它们发送到zone mixer,zone mixer进一步将其发送到叶子,从而形成查询树。Mixers还查询索引服务器,将查询限制在数据所在的zone或叶子。","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Index 
servers","attrs":{}},{"type":"text","text":"(索引服务器)索引每个zone和叶子的数据,这些数据可用于确定查询针对哪些叶子。","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Evaluators","attrs":{}},{"type":"text","text":"从standing query生成响应,并将数据写回叶子。","attrs":{}}]}]}],"attrs":{}},{"type":"blockquote","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"italic","attrs":{}}],"text":"Standing query类似SQL中的View(视图)的概念,用户可以自定义standing query,evaluator会定期查询standing query的结果,并写回对应的叶子,从而可以加快查询速度。","attrs":{}}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Monarch支持如下查询语义:","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Fetch","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Filter","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Join","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Align","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"GroupBy","attrs":{}}]}]}],"attrs":{}},{"type":"paragraph","at
trs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Ad-hoc queries","attrs":{}},{"type":"text","text":"是来自系统外部用户的查询。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Standing queries","attrs":{}},{"type":"text","text":"是类似于其他数据库系统中的视图的查询,定期计算并存储到Monarch中,以获得更快的查询响应。根据查询的广度,standing query可以在zone或root级别进行评估,将查询空间最小化到特定zone的叶子,从而获得更好的性能。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Level analysis","attrs":{}},{"type":"text","text":"用于对查询进行级别分析,基于不同的级别将查询打破重组,用以进行认证和获得更好的查询局部性。查询级别可以根据上面提到的查询树进行定义。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"Replica Resolution","attrs":{}},{"type":"text","text":"用于找出响应查询的最佳副本。因为查询负载、系统配置、数据完整性等方面可能存在差异,因此某个副本可能更适合响应查询请求。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"User 
Isolation","attrs":{}},{"type":"text","text":"用来限制任一用户在系统中可以使用的内存量,以便其他遵守规则的用户不受影响。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":5},"content":[{"type":"text","text":"性能","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Monarch分布在横跨五大洲的38个zone,执行大约400,000个任务。","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"截至2019年7月,Monarch存储了近9500亿时间序列,其高度优化的数据结构消耗了约750TB内存。","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"2019年7月,Monarch的内部部署每秒接收约4.4TB的数据。","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"Monarch持续指数级增长,截至2019年7月,每秒服务超过600万次查询。","attrs":{}}]}]}],"attrs":{}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/84/846307d96094e00dab302466266af345.webp","alt":"图片","title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","marks":[{"type":"size","attrs":{"size":9}}],"text":"性能","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"size","attrs":{"size":9}}],"text":"封面图片来自:https://unsplash.com/@seanstratton","attrs":{}}]}]}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章