Prometheus 監控 Kafka 指標

重要指標

以下配置爲 Prometheus 的報警規則(rules)。
Kafka 的監控指標通過 jmx-exporter 暴露,供 Prometheus 採集。

# Prometheus alerting rule group for Kafka metrics exposed through jmx-exporter.
# NOTE(review): in a complete Prometheus rule file this list item must sit
# under a top-level `groups:` key — confirm the surrounding file provides it.
- name: kafka_server
  rules:
  # Under-replicated partitions: fires once the 1-minute average of the
  # UnderReplicatedPartitions series has been at or above 1 for one minute.
  - alert: 'UnderReplicatedPartitions複製不足的分區數'
    expr: 'avg_over_time(kafka_server_ReplicaManager_Value{name="UnderReplicatedPartitions",}[1m]) >= 1'
    for: 1m
    labels:
      job: kafka
    annotations:
      summary: '{{ $labels.app }} app UnderReplicatedPartitions'
      description: 'app: {{ $labels.app }} ,Instance: {{ $labels.instance }} 複製不足的分區數: {{ $value }}'
  # Active controller count: sums the 1-minute average of ActiveControllerCount
  # per `app` and fires when that total has differed from 1 for one minute.
  - alert: 'ActiveControllerCount集羣中控制器的數量'
    expr: 'sum(avg_over_time(kafka_controller_KafkaController_Value{name="ActiveControllerCount",}[1m])) by(app) != 1'
    for: 1m
    labels:
      job: kafka
    annotations:
      summary: '{{ $labels.app }} app ActiveControllerCount'
      description: 'app: {{ $labels.app }}, 集羣中控制器的數量異常:{{ $value }}'
  # Offline partitions: fires once the 1-minute average of
  # OfflinePartitionsCount has been above 0 for one minute.
  - alert: 'OfflinePartitionsCount沒有活動領導者的分區數'
    expr: 'avg_over_time(kafka_controller_KafkaController_Value{name="OfflinePartitionsCount",}[1m]) > 0'
    for: 1m
    labels:
      job: kafka
    annotations:
      summary: '{{ $labels.app }} app OfflinePartitionsCount'
      description: 'app: {{ $labels.app }}, OfflinePartitionsCount沒有活動領導者的分區數, count:{{ $value }}'
  # Outbound traffic: takes the 1-minute average of the BytesOutPerSec
  # one-minute rate, divides it by 1024 twice and fires when the result has
  # been at or above 450 for one minute.
  # The `topic=""` matcher keeps only series whose topic label is empty or
  # absent (presumably the broker-wide total — confirm against the exporter).
  - alert: 'BytesOutPerSec出網絡流量M/s'
    expr: 'avg_over_time(kafka_server_BrokerTopicMetrics_OneMinuteRate{name="BytesOutPerSec",topic=""}[1m]) / 1024 /1024 >= 450'
    for: 1m
    labels:
      job: kafka
    annotations:
      summary: '{{ $labels.app }} 網絡流量出異常'
      description: 'app: {{ $labels.app }}, instance:{{ $labels.instance }}, 出網絡流量M/s大於 450, 當前值爲:{{ $value }}'
  # Inbound traffic: takes the 1-minute average of the BytesInPerSec
  # one-minute rate, divides it by 1024 twice and fires when the result has
  # been at or above 150 for one minute.
  # The `topic=""` matcher keeps only series whose topic label is empty or
  # absent (presumably the broker-wide total — confirm against the exporter).
  # The annotations say "入" (inbound) so notifications name the correct
  # traffic direction for this alert.
  - alert: 'BytesInPerSec入網絡流量M/s'
    expr: 'avg_over_time(kafka_server_BrokerTopicMetrics_OneMinuteRate{name="BytesInPerSec",topic=""}[1m]) / 1024 /1024 >= 150'
    for: 1m
    labels:
      job: kafka
    annotations:
      summary: '{{ $labels.app }} 網絡流量入異常'
      description: 'app: {{ $labels.app }}, instance:{{ $labels.instance }}, 入網絡流量M/s大於 150, 當前值爲:{{ $value }}'
  # Request-handler idle ratio: fires once the 1-minute average of
  # RequestHandlerAvgIdlePercent has been at or below 0.3 for one minute.
  - alert: 'RequestHandlerAvgIdlePercent請求處理程序線程空閒的平均時間百分比'
    expr: 'avg_over_time(kafka_server_KafkaRequestHandlerPool_OneMinuteRate{name="RequestHandlerAvgIdlePercent",}[1m]) <= 0.3'
    for: 1m
    labels:
      job: kafka
    annotations:
      summary: '{{ $labels.app }} 請求處理程序線程空閒的平均時間百分比'
      description: 'app: {{ $labels.app }}, instance:{{ $labels.instance }},請求處理程序線程空閒百分比低於30%, 當前值爲:{{ $value }}'
  # Network-processor idle ratio: fires once the 1-minute average of
  # NetworkProcessorAvgIdlePercent has been at or below 0.3 for one minute.
  # `for: 1m` and the `job: kafka` label match every other rule in this group;
  # without `for`, a single evaluation at or below the threshold would fire.
  - alert: 'NetworkProcessorAvgIdlePercent網絡處理器線程空閒的平均時間百分比'
    expr: 'avg_over_time(kafka_network_SocketServer_Value{name="NetworkProcessorAvgIdlePercent",}[1m]) <= 0.3'
    for: 1m
    labels:
      job: kafka
    annotations:
      summary: '{{ $labels.app }} 網絡處理器線程空閒的平均時間百分比'
      description: 'app: {{ $labels.app }}, instance:{{ $labels.instance }},網絡處理器線程空閒的平均時間百分比低於30%, 當前值爲:{{ $value }}'
  # Established connections: sums the 1-minute average of connection_count on
  # the PLAINTEXT listener per (instance, app) and fires when the total has
  # been above 3000 for one minute.
  - alert: 'connection_count 已建立的連接數'
    expr: 'sum(avg_over_time(kafka_server_socket_server_metrics_connection_count{listener="PLAINTEXT",}[1m])) by (instance,app) > 3000'
    for: 1m
    labels:
      job: kafka
    annotations:
      summary: '{{ $labels.app }} 已建立的連接數'
      description: 'app: {{ $labels.app }}, instance:{{ $labels.instance }},已建立的連接數大於3000, 當前值爲:{{ $value }}'
  # New connections per second: sums the 1-minute average of
  # connection_creation_rate per (instance, app) and fires when the total has
  # been above 100 for one minute.
  # `app` is kept in the grouping because both annotations render
  # {{ $labels.app }}; aggregating by instance alone drops that label and the
  # notification would show an empty app name.
  - alert: 'connection_creation 每秒新建連接數'
    expr: 'sum(avg_over_time(kafka_server_socket_server_metrics_connection_creation_rate[1m])) by (instance, app) > 100'
    for: 1m
    labels:
      job: kafka
    annotations:
      summary: '{{ $labels.app }} 每秒新建連接數'
      description: 'app: {{ $labels.app }}, instance:{{ $labels.instance }},每秒新建連接數大於100, 當前值爲:{{ $value }}'
  # Request-queue wait (Produce requests): fires once the 1-minute average of
  # the 999thPercentile series for RequestQueueTimeMs has been above 5000 for
  # one minute.
  - alert: 'RequestQueueTimeMs 請求在請求隊列中等待的時間'
    expr: 'avg_over_time(kafka_network_RequestMetrics_999thPercentile{name="RequestQueueTimeMs",request="Produce",}[1m]) > 5000'
    for: 1m
    labels:
      job: kafka
    annotations:
      summary: '{{ $labels.app }} 請求在請求隊列中等待的時間'
      description: 'app: {{ $labels.app }}, instance:{{ $labels.instance }},請求在請求隊列中等待的時間大於5000ms, 當前值爲:{{ $value }}'
  # Leader processing time (Produce requests): fires once the 1-minute average
  # of the 999thPercentile series for LocalTimeMs has been above 5000 for one
  # minute.
  - alert: 'LocalTimeMs leader處理請求的時間'
    expr: 'avg_over_time(kafka_network_RequestMetrics_999thPercentile{name="LocalTimeMs",request="Produce",}[1m])  > 5000'
    for: 1m
    labels:
      job: kafka
    annotations:
      summary: '{{ $labels.app }} leader處理請求的時間'
      description: 'app: {{ $labels.app }}, instance:{{ $labels.instance }}, leader處理請求的時間大於5000ms, 當前值爲:{{ $value }}'
  # Follower wait time (Produce requests): fires once the 1-minute average of
  # the 999thPercentile series for RemoteTimeMs has been above 1000 for one
  # minute.
  - alert: 'RemoteTimeMs 請求等待follower的時間'
    expr: 'avg_over_time(kafka_network_RequestMetrics_999thPercentile{name="RemoteTimeMs",request="Produce",}[1m]) > 1000'
    for: 1m
    labels:
      job: kafka
    annotations:
      summary: '{{ $labels.app }} 請求等待follower的時間'
      description: 'app: {{ $labels.app }}, instance:{{ $labels.instance }}, 請求等待follower的時間大於1000ms, 當前值爲:{{ $value }}'
  # Response-queue wait (Produce requests): fires once the 1-minute average of
  # the 999thPercentile series for ResponseQueueTimeMs has been above 1000 for
  # one minute.
  - alert: 'ResponseQueueTimeMs 請求在響應隊列中等待的時間'
    expr: 'avg_over_time(kafka_network_RequestMetrics_999thPercentile{name="ResponseQueueTimeMs",request="Produce",}[1m]) > 1000'
    for: 1m
    labels:
      job: kafka
    annotations:
      summary: '{{ $labels.app }} 請求在響應隊列中等待的時間'
      description: 'app: {{ $labels.app }}, instance:{{ $labels.instance }}, 請求在響應隊列中等待的時間大於1000ms, 當前值爲:{{ $value }}'
  # Response send time (Produce requests): fires once the 1-minute average of
  # the 999thPercentile series for ResponseSendTimeMs has been above 1000 for
  # one minute.
  - alert: 'ResponseSendTimeMs 發送響應的時間'
    expr: 'avg_over_time(kafka_network_RequestMetrics_999thPercentile{name="ResponseSendTimeMs",request="Produce",}[1m]) > 1000'
    for: 1m
    labels:
      job: kafka
    annotations:
      summary: '{{ $labels.app }} 發送響應的時間'
      description: 'app: {{ $labels.app }}, instance:{{ $labels.instance }}, 發送響應的時間大於1000ms, 當前值爲:{{ $value }}'
  # Incoming message rate: fires once the 1-minute average of the
  # MessagesInPerSec one-minute rate has been above 200000 for one minute.
  # The `topic=""` matcher keeps only series whose topic label is empty or
  # absent (presumably the broker-wide total — confirm against the exporter).
  - alert: 'MessagesInPerSec 彙總傳入消息速率'
    expr: 'avg_over_time(kafka_server_BrokerTopicMetrics_OneMinuteRate{name="MessagesInPerSec",topic=""}[1m]) > 200000'
    for: 1m
    labels:
      job: kafka
    annotations:
      summary: '{{ $labels.app }} 彙總傳入消息速率'
      description: 'app: {{ $labels.app }}, instance:{{ $labels.instance }}, 彙總傳入消息速率大於200000 m/s, 當前值爲:{{ $value }}'

其他

# Log flush rate and time.
kafka_log_LogFlushStats_OneMinuteRate{name="LogFlushRateAndTimeMs",}	
# Number of messages by which replicas lag behind, summed per topic and app.
sum(kafka_server_FetcherLagMetrics_Value) by(topic, app)

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章