Building a distributed monitoring system based on Prometheus

Planned versions

Product name  Version  URL
prometheus 2.22.1 https://github.com/prometheus/prometheus/releases/tag/v2.22.1
alertmanager v0.21.0 https://github.com/prometheus/alertmanager/releases/tag/v0.21.0
consul 1.8.5 docker.io/consul
consulR latest https://github.com/qist/registy-consul-service/releases
victoriametrics v1.50.2 https://github.com/VictoriaMetrics/VictoriaMetrics/releases
kube-prometheus v0.43.2 https://github.com/prometheus-operator/kube-prometheus
kube-prometheus  modified version  https://github.com/qist/k8s/tree/master/k8s-yaml/kube-prometheus
grafana v7.3.2 docker.io/grafana/grafana

Deployment environments

Environment  Deployment IP  Deployment method
Alibaba Cloud account 1  10.8.23.80  binary deployment
Alibaba Cloud account 2  172.16.4.141  binary deployment
Huawei Cloud  10.9.12.133  binary deployment
Alibaba Cloud ACK  kube-prometheus  K8S deployment
Office IDC  kube-prometheus  K8S deployment
Monitoring aggregation  192.168.2.220  binary deployment
grafana  K8S deployment

Network connectivity

1. Alibaba Cloud accounts 1 and 2 are interconnected through Alibaba Cloud CEN (Cloud Enterprise Network).
2. Alibaba Cloud 1 reaches Huawei Cloud and the office IDC over IPsec *** (openswan installation).
3. On Alibaba Cloud 1, add custom routes and publish them so the CEN can reach the other networks; on Huawei Cloud, disable the NIC security check on the server running openswan, then configure the routes (see the sketch below).
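A minimal sketch of the route configuration on the openswan host (the subnet mask and gateway below are placeholders, not taken from this setup):
# example: route the Huawei Cloud subnet through the IPsec tunnel gateway
ip route add 10.9.12.0/24 via <tunnel-gateway-ip>
# persist the route on CentOS (file name depends on the interface)
echo "10.9.12.0/24 via <tunnel-gateway-ip>" >> /etc/sysconfig/network-scripts/route-eth0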

Deploying the monitoring system on Alibaba Cloud 1, 2 and Huawei Cloud (binary mode)

##### Binary deployment directory: /apps/<service-name>
##### Download
cd /apps
 wget https://github.com/prometheus/prometheus/releases/download/v2.22.1/prometheus-2.22.1.linux-amd64.tar.gz
 wget https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz
 wget https://github.com/qist/registy-consul-service/releases/download/release/consulR
#### Install Docker (used to run consul)
yum-config-manager --add-repo http://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo

yum install docker-ce

# Start consul
docker run -d --restart=always -p 8500:8500 -e CONSUL_BIND_INTERFACE='eth0' --name=consulone consul agent -server -bootstrap -ui -client='0.0.0.0'
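A quick sanity check that consul is up before wiring it into prometheus (assuming the default client port 8500):
# consul should report a leader and list itself as a member
curl http://127.0.0.1:8500/v1/status/leader
docker exec consulone consul members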

# Deploy prometheus
cd /apps
mkdir -p prometheus/{bin,conf,data}
tar -xvf prometheus-2.22.1.linux-amd64.tar.gz
mv prometheus-2.22.1.linux-amd64/* prometheus/bin
Configure prometheus
cd prometheus/conf
vim   prometheus.yml
# my global config
global:
  scrape_interval: 1m
  scrape_timeout: 1m
  evaluation_interval: 10s
  external_labels:
    environment: aliyun1 # environment name; recommended when running multiple environments

alerting:
  alertmanagers:
    - static_configs:
      - targets: ['127.0.0.1:9093'] # alertmanager address

rule_files:
  - "/apps/prometheus/conf/rules/*.yaml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  #- job_name: 'prometheus'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    #static_configs:
    #- targets: ['localhost:9090']

  - job_name: 'consul-prometheus'   # job using consul service discovery
    scrape_interval: 30s
    scrape_timeout: 30s
    consul_sd_configs:
    - server: '127.0.0.1:8500'
      services: []
    relabel_configs:
    - source_labels: [__meta_consul_service]
      regex: "consul|aliyun-exporter" # 需要過濾的自動發現service 名
      action: drop
    - source_labels: [__meta_consul_service]
      separator: ;
      regex: (.*)
      target_label: service 
      replacement: $1
      action: replace
    - source_labels: [__meta_consul_service]
      separator: ;
      regex: (.*)
      target_label: job # rewrite the job label to the service name registered in consul
      replacement: $1
      action: replace
    - source_labels: [__meta_consul_service_id]
      separator: ;
      regex: (.*)
      target_label: service_name # label added for alerting
      replacement: $1
      action: replace

  - job_name: 'aliyun-exporter' # Alibaba Cloud API monitoring; pulling these metrics is slow, so this job scrapes once per minute and is kept separate
    scrape_interval: 60s
    scrape_timeout: 60s
    consul_sd_configs:
    - server: '127.0.0.1:8500'
      services: []
    relabel_configs:
    - source_labels: [__meta_consul_service]
      regex: "aliyun-exporter"  # 需要監控 consul service
      action: keep
    - source_labels: [__meta_consul_service]
      separator: ;
      regex: (.*)
      target_label: service
      replacement: $1
      action: replace
    - source_labels: [__meta_consul_service]
      separator: ;
      regex: (.*)
      target_label: job
      replacement: $1
      action: replace
    - source_labels: [__meta_consul_service_id]
      separator: ;
      regex:  (.*)
      replacement: $1
      target_label: service_name # label added for alerting
      action: replace
# Configure prometheus startup parameters
vim prometheus
PROMETHEUS_OPTS="--web.console.templates=/apps/prometheus/bin/consoles \
--web.console.libraries=/apps/prometheus/bin/console_libraries \
--config.file=/apps/prometheus/conf/prometheus.yml \
--storage.tsdb.path=/apps/prometheus/data/prometheus \
--storage.tsdb.retention.time=1d \
--storage.tsdb.min-block-duration=2h \
--storage.tsdb.max-block-duration=2h \
--web.enable-lifecycle \
--storage.tsdb.no-lockfile \
--web.route-prefix=/"
# Create alerting rules
mkdir -p /apps/prometheus/conf/rules
cd /apps/prometheus/conf/rules
vim node-rules.yaml
groups:
  - name: linux Disk Alerts
    rules:
      - alert: NodeDiskUseage
        expr: 100 - (node_filesystem_avail_bytes{fstype=~"ext4|xfs",mountpoint!="/apps/docker/overlay",mountpoint!="/var/lib/docker/devicemapper",mountpoint!="/var/lib/docker/containers"} / node_filesystem_size_bytes{fstype=~"ext4|xfs",mountpoint!="/apps/docker/overlay",mountpoint!="/var/lib/docker/devicemapper",mountpoint!="/var/lib/docker/containers"} * 100) > 90
        for: 1m
        labels:
          severity: high
        annotations:
          summary: "{{ $labels.instance }} Partition utilization too high"
          description: "{{ $labels.instance }} Partition usage greater than 90%(Currently used:{{$value}}%)"
      - alert: DiskIoPerformance
        expr: 100 - (avg by(instance,device,job,service,service_name) (irate(node_disk_io_time_seconds_total[1m])) * 100) < 60
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }} The IO utilization rate of incoming disk is too high"
          description: "{{ $labels.instance }} The incoming disk IO is greater than 60%(Currently used:{{$value}})"

  - name: linux Cpu
    rules:
      - alert: UserCpuUsage
        expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode='user'}[5m]))*100) by (instance,job,service,service_name) >50
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.instance }}"
          description: "{{ $labels.instance }} User CPU Use greater than 50%(Currently used:{{$value}}%)"
      - alert: CpuUsage
        expr: 100 - (avg by(instance,job,service,service_name) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }} CPU The utilization rate is too high"
          description: "{{ $labels.instance }} CPU Use greater than 60%(Currently used:{{$value}}%)"
      - alert: NodeCPUUsage95
        expr: 100 - (avg by(instance,job,service,service_name) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.instance }} CPU The utilization rate is too high"
          description: "{{ $labels.instance }} CPU Use greater than 95%(Currently used:{{$value}}%)"
  - name: linux Memory
    rules:
      #- alert:  MemoryLow
      #  expr: (1 - (node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes)))* 100>80
      #  for: 1m
      #  labels:
      #    severity: high
      #  annotations:
      #    summary: "{{ $labels.instance }} High memory usage"
      #    description: "{{ $labels.instance }} Memory greater than 90%(Currently used:{{$value}}%)"
      - alert: NodeMemoryUsageTooHigh95
        expr: (1 - (node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes)))* 100>95
        for: 1m
        labels:
          severity: high
        annotations:
          summary: "{{ $labels.instance }} High memory usage"
          description: "{{ $labels.instance }} Memory greater than 95%(Currently used:{{$value}}%)"
  - name: linux Clock
    rules:
      - alert: NodeClockSkewDetected
        annotations:
          message: Clock on {{ $labels.instance }} is out of sync by more than 300s.
            Ensure NTP is configured correctly on this host.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclockskewdetected
          summary: Clock skew detected.
        expr: |
          (
            node_timex_offset_seconds > 0.05
          and
            deriv(node_timex_offset_seconds[5m]) >= 0
          )
          or
          (
            node_timex_offset_seconds < -0.05
          and
            deriv(node_timex_offset_seconds[5m]) <= 0
          )
        for: 10m
        labels:
          severity: warning
      - alert: NodeClockNotSynchronising
        annotations:
          message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP
            is configured on this host.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising
          summary: Clock not synchronising.
        expr: |
          min_over_time(node_timex_sync_status[5m]) == 0
        for: 10m
        labels:
          severity: warning
  - name: Instance Down
    rules:
      - alert: InstanceDown
        expr: (up{job!="node-exporter",job!="windows-exporter"}) == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minute."
  - name: Node Down
    rules:
      - alert: NodeDown
        expr: (up{job=~"node-exporter|windows-exporter"}) == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Node {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minute."
  - name: linux Network err
    rules:
      - alert: NodeNetworkReceiveErrs
        annotations:
          description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
            {{ printf "%.0f" $value }} receive errors in the last two minutes.'
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs
          summary: Network interface is reporting many receive errors.
        expr: |
          increase(node_network_receive_errs_total[2m]) > 10
        for: 1h
        labels:
          severity: warning
      - alert: NodeNetworkTransmitErrs
        annotations:
          description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
            {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs
          summary: Network interface is reporting many transmit errors.
        expr: |
          increase(node_network_transmit_errs_total[2m]) > 10
        for: 1h
        labels:
          severity: warning
      - alert: NodeHighNumberConntrackEntriesUsed
        annotations:
          description: '{{ $value | humanizePercentage }} of conntrack entries are used'
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused
          summary: Number of conntrack entries is getting close to the limit
        expr: |
          (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
        labels:
          severity: warning
vim  windows-rules.yaml
groups:
- name: Windows Disk Alerts
  rules:

  # Sends an alert when disk space usage is above 80%
  #- alert: DiskSpaceUsage
  #  expr: 100.0 - 100 * (windows_logical_disk_free_bytes / windows_logical_disk_size_bytes) > 80
  #  for: 10m
  #  labels:
  #    severity: high
  #  annotations:
  #    summary: "Disk Space Usage (instance {{ $labels.instance }})"
  #    description: "Disk Space on Drive is used more than 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: NodeDiskUseage
    expr: 100.0 - 100 * (windows_logical_disk_free_bytes / windows_logical_disk_size_bytes) > 90
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: "Disk Space Usage (instance {{ $labels.instance }})"
      description: "Disk Space on Drive is used more than 95%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: DiskFilling
    expr: 100 * (windows_logical_disk_free_bytes / windows_logical_disk_size_bytes) < 15 and predict_linear(windows_logical_disk_free_bytes[6h], 4 * 24 * 3600) < 0
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Disk full in four days (instance {{ $labels.instance }})"
      description: "{{ $labels.volume }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

- name: Windows Cpu
  rules:
  - alert: CpuUsage
    expr: 100 - (avg by (instance) (irate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 80
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "CPU Usage (instance {{ $labels.instance }})"
      description: "CPU Usage is more than 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: NodeCPUUsage95
    expr: 100 - (avg by (instance) (irate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 95
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "CPU Usage (instance {{ $labels.instance }})"
      description: "CPU Usage is more than 95%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

- name: Windows Memory
  rules:
  # Alert on hosts that have exhausted all available physical memory
  - alert: MemoryExhausted
    expr: windows_os_physical_memory_free_bytes == 0
    for: 10m
    labels:
      severity: high
    annotations:
      summary: "Host {{ $labels.instance }} is out of memory"
      description: "{{ $labels.instance }} has exhausted all available physical memory"

  #- alert: MemoryLow
  #  expr: 100 - 100 * windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes > 80
  #  for: 10m
  #  labels:
  #    severity: warning
  #  annotations:
  #    summary: "Memory usage for host {{ $labels.instance }} is greater than 80%"
  - alert: NodeMemoryUsageTooHigh95
    expr: 100 - 100 * windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes > 95
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Memory usage for host {{ $labels.instance }} is greater than 95%"
  #- name: Microsoft SQL Server Alerts
  #rules:
  #- alert: SQL Server Agent DOWN
  #  expr: windows_service_state{instance="SQL",exported_name="sqlserveragent",state="running"} == 0
  #  for: 3m
  #  labels:
  #    severity: high
  #  annotations:
  #    summary: "Service {{ $labels.exported_name }} down"
  #    description: "Service {{ $labels.exported_name }} on instance {{ $labels.instance }} has been down for more than 3 minutes."
vim docker-exporter.yaml
groups:
- name: DockerContainer
  rules:
  - alert: DockerContainerDown
    expr: rate(container_last_seen{id=~"/docker/.+"}[5m]) < 0.5
    for: 1m
    labels:
      severity: critical
     # Prometheus templates apply here in the annotation and label fields of the alert.
    annotations:
      description: 'Container down on host {{ $labels.service_name }}: image {{ $labels.image }}, container name {{ $labels.name }}'
      summary: 'Container {{ $labels.instance }} down'
vim  mysql-rules.yaml
groups:
- name: GaleraAlerts
  rules:
  - alert: MySQLGaleraNotReady
    expr: mysql_global_status_wsrep_ready != 1
    for: 5m
    labels:
      severity: warning
    annotations:
      description: '{{$labels.job}} on {{$labels.instance}} is not ready.'
      summary: Galera cluster node not ready
  - alert: MySQLGaleraOutOfSync
    expr: (mysql_global_status_wsrep_local_state != 4 and mysql_global_variables_wsrep_desync
      == 0)
    for: 5m
    labels:
      severity: warning
    annotations:
      description: '{{$labels.job}} on {{$labels.instance}} is not in sync ({{$value}}
        != 4).'
      summary: Galera cluster node out of sync
  - alert: MySQLGaleraDonorFallingBehind
    expr: (mysql_global_status_wsrep_local_state == 2 and mysql_global_status_wsrep_local_recv_queue
      > 100)
    for: 5m
    labels:
      severity: warning
    annotations:
      description: '{{$labels.job}} on {{$labels.instance}} is a donor (hotbackup)
        and is falling behind (queue size {{$value}}).'
      summary: xtradb cluster donor node falling behind
  - alert: MySQLReplicationNotRunning
    expr: mysql_slave_status_slave_io_running == 0 or mysql_slave_status_slave_sql_running
      == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      description: Slave replication (IO or SQL) has been down for more than 2 minutes.
      summary: Slave replication is not running
  - alert: MySQLReplicationLag
    expr: (mysql_slave_lag_seconds > 30) and on(instance) (predict_linear(mysql_slave_lag_seconds[5m],
      60 * 2) > 0)
    for: 1m
    labels:
      severity: critical
    annotations:
      description: The mysql slave replication has fallen behind and is not recovering
      summary: MySQL slave replication is lagging
  - alert: MySQLReplicationLag
    expr: (mysql_heartbeat_lag_seconds > 30) and on(instance) (predict_linear(mysql_heartbeat_lag_seconds[5m],
      60 * 2) > 0)
    for: 1m
    labels:
      severity: critical
    annotations:
      description: The mysql slave replication has fallen behind and is not recovering
      summary: MySQL slave replication is lagging
  - alert: MySQLInnoDBLogWaits
    expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
    labels:
      severity: warning
    annotations:
      description: The innodb logs are waiting for disk at a rate of {{$value}} /
        second
      summary: MySQL innodb log writes stalling
### ... plus any other monitoring rules you need
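Before loading them into prometheus, the rule files can be validated with promtool, which ships in the same release tarball:
/apps/prometheus/bin/promtool check rules /apps/prometheus/conf/rules/*.yaml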
Create the prometheus user
useradd prometheus -s /sbin/nologin -M
chown -R prometheus:prometheus /apps/prometheus
Configure the systemd unit file
vim /usr/lib/systemd/system/prometheus.service
[Unit]
Description=prometheus
[Service]
LimitNOFILE=1024000
LimitNPROC=1024000
LimitCORE=infinity
LimitMEMLOCK=infinity
EnvironmentFile=-/apps/prometheus/conf/prometheus
ExecStart=/apps/prometheus/bin/prometheus $PROMETHEUS_OPTS
Restart=on-failure
KillMode=process
User=prometheus
[Install]
WantedBy=multi-user.target
# Start prometheus
systemctl start prometheus
# Enable at boot
systemctl enable prometheus
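Because --web.enable-lifecycle is set, later configuration and rule changes can be reloaded without a restart, and the discovered targets can be inspected over the HTTP API (a quick check, assuming the default listen port 9090):
# reload configuration after editing prometheus.yml or the rule files
curl -X POST http://127.0.0.1:9090/-/reload
# summarize the health of all discovered targets
curl -s http://127.0.0.1:9090/api/v1/targets | grep -o '"health":"[a-z]*"' | sort | uniq -c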
# Deploy consulR to register monitored services with consul; it must run on every monitored node
cd /apps
chmod +x consulR
mv consulR /tmp
mkdir -p consulR/{bin,conf,logs}
mv /tmp/consulR consulR/bin
# Configure consulR to register prometheus in consul
# Configuration reference: https://github.com/qist/registy-consul-service/blob/master/conf/consul.yaml
cd consulR/conf
vim prometheus-exporter.yaml
System:
  ServiceName: registy-consul-service
  ListenAddress: 0.0.0.0
#  Port: 9984 # comment out when running multiple exporters
  FindAddress: 10.8.23.80:80
Logs:
  LogFilePath: /apps/consulR/logs/info.log
  LogLevel: trace
Consul:
  Address: 10.8.23.80:8500,10.8.23.80:8500,10.8.23.80:8500
  Token:
  CheckHealth: /
  CheckType: tcp
  CheckTimeout: 5s
  CheckInterval: 5s
  CheckDeregisterCriticalServiceAfter: false
  CheckDeregisterCriticalServiceAfterTime: 30s
Service:
  Tag: prometheus-exporter
  Address:
  Port: 9090
# Register node-exporter
vim node-exporter.yaml
System:
  ServiceName: registy-consul-service
  ListenAddress: 0.0.0.0
#  Port: 9984
  FindAddress: 10.8.23.80:80
Logs:
  LogFilePath: /apps/consulR/logs/info.log
  LogLevel: trace
Consul:
  Address: 10.8.23.80:8500,10.8.23.80:8500,10.8.23.80:8500
  Token:
  CheckHealth: /
  CheckType: tcp
  CheckTimeout: 5s
  CheckInterval: 5s
  CheckDeregisterCriticalServiceAfter: false
  CheckDeregisterCriticalServiceAfterTime: 30s
Service:
  Tag: node-exporter
  Address:
  Port: 9100
# Create the systemd template unit
vim /usr/lib/systemd/system/consulR@.service
[Unit]
Description=ConsulR process %i
[Service]
#LimitNOFILE=1024000
#LimitNPROC=1024000
LimitCORE=infinity
LimitMEMLOCK=infinity
ExecStart=/apps/consulR/bin/consulR -confpath=/apps/consulR/conf/%i.yaml
ProtectHome=true
ProtectSystem=full
PrivateTmp=true
TasksMax=infinity
Restart=on-failure
StartLimitInterval=30min
StartLimitBurst=30
RestartSec=20s
[Install]
WantedBy=multi-user.target
# Start consulR and register with consul; the monitored service process must be started first
systemctl start consulR@node-exporter
systemctl start consulR@prometheus-exporter
# Enable at boot
systemctl enable consulR@node-exporter
systemctl enable  consulR@prometheus-exporter
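The registrations can then be verified against consul's catalog API (assuming consulR registers each exporter under its tag name, e.g. node-exporter):
# node-exporter and prometheus-exporter should appear in the service list
curl http://10.8.23.80:8500/v1/catalog/services
# list the instances registered for node-exporter
curl http://10.8.23.80:8500/v1/catalog/service/node-exporter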
# Deploy alertmanager
cd /apps
tar -xvf alertmanager-0.21.0.linux-amd64.tar.gz
mkdir -p alertmanager/{bin,conf,data}
mv alertmanager-0.21.0.linux-amd64/* alertmanager/bin
# Create the configuration file
cd alertmanager/conf
vim alertmanager.yml
"global":
  "resolve_timeout": "1m"

"route":
  "group_by": ["alertname","container_name","namespace","severity","pod_name","instance","service_name","environment"]
  "group_wait": "30s"
  "group_interval": "30m"
  "repeat_interval": "3h"
  "receiver": "web.hook"
  routes:
  - "receiver": "web.hook"
    "group_by": ["alertname","container_name","namespace","severity","pod_name","instance","service_name","environment"]
    "group_wait": "10s"
    "group_interval": "5m"
    "repeat_interval": "30m"
    match_re:
      "severity": "critical"
  - "receiver": "web.hook"
    "group_by": ["alertname","container_name","namespace","severity","pod_name","instance","service_name","environment"]
    "group_wait": "10s"
    "group_interval": "30m"
    "repeat_interval": "1h"
    match_re:
      "severity": "high"
"receivers":
- "name": "web.hook"
  "webhook_configs":
  - "url": "http://xxxxxxx.xxxxx.com/k8smntoauth/api/alert_api/alert/prometheus/"  # 專用報警平臺,可以參考其它配置 alertmanager 
    "http_config":
      "bearer_token": ""
"inhibit_rules":
  - "source_match":
      "severity": "critical"
    "target_match":
      "severity": "warning"
    "equal": ["alertname", "dev", "instance"]
# alertmanager startup options
vim alertmanager
ALERTMANAGER_OPT="--config.file=/apps/alertmanager/conf/alertmanager.yml \
                  --storage.path=/apps/alertmanager/data \
                  --data.retention=120h \
                  --web.listen-address=:9093 \
                  --web.route-prefix=/"
# Configure the alertmanager systemd unit
vim /usr/lib/systemd/system/alertmanager.service
[Unit]
Description=alertmanager
[Service]
LimitNOFILE=1024000
LimitNPROC=1024000
LimitCORE=infinity
LimitMEMLOCK=infinity
EnvironmentFile=-/apps/alertmanager/conf/alertmanager
ExecStart=/apps/alertmanager/bin/alertmanager $ALERTMANAGER_OPT
Restart=on-failure
KillMode=process
User=prometheus
[Install]
WantedBy=multi-user.target
# Set the run user
chown -R prometheus:prometheus /apps/alertmanager
# Start alertmanager
systemctl start alertmanager
# Enable at boot
systemctl enable alertmanager
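To verify routing and the webhook receiver end to end, a test alert can be pushed with amtool (the label values here are arbitrary examples):
# fire a dummy critical alert against the local alertmanager
/apps/alertmanager/bin/amtool alert add alertname=TestAlert severity=critical instance=test --alertmanager.url=http://127.0.0.1:9093
# confirm it shows up
/apps/alertmanager/bin/amtool alert query --alertmanager.url=http://127.0.0.1:9093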
# At this point the binary deployment is complete; the other binary environments (Alibaba Cloud 2, Huawei Cloud) are set up the same way

K8S cluster deployment (kube-prometheus)

# Download the code
git clone https://github.com/prometheus-operator/kube-prometheus.git
# or: git clone https://github.com/qist/k8s.git
# The main file to modify is prometheus-prometheus.yaml
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  labels:
    prometheus: k8s
  name: k8s
  namespace: monitoring
spec:
  alerting:
    alertmanagers:
    - name: alertmanager-main
      namespace: monitoring
      port: web
  image: quay.io/prometheus/prometheus:v2.22.1
  nodeSelector:
    kubernetes.io/os: linux
  podMonitorNamespaceSelector: {}
  podMonitorSelector: {}
  replicas: 2
  # drop the auto-generated external_labels
  replicaExternalLabelName: ""
  prometheusExternalLabelName: ""
  # external labels used for multi-environment alerting
  externalLabels:
    environment: k8s # environment name; must be changed for each environment
  #secrets:  # uncomment to add etcd and istio HTTPS monitoring
  #- etcd-certs
  #- istio-certs
  #configMaps: # uncomment for bulk site monitoring with blackbox-exporter
  #- prometheus-files-discover
  resources:
    requests:
      memory: 4096Mi
  retention: 2d
  #storage:  # uncomment to use external storage
  #  volumeClaimTemplate:
  #    spec:
  #      accessModes:
  #      - ReadWriteOnce
  #      resources:
  #        requests:
  #          storage: 50Gi
  #      storageClassName: alicloud-disk-ssd
  #      volumeMode: Filesystem
  ruleSelector:
    matchLabels:
      prometheus: k8s
      role: alert-rules
  additionalScrapeConfigs:
    name: additional-configs
    key: prometheus-additional.yaml   # see the qist repo: k8s/k8s-yaml/kube-prometheus/prometheus
  securityContext:
    fsGroup: 2000
    runAsNonRoot: true
    runAsUser: 1000
  serviceAccountName: prometheus-k8s
  serviceMonitorNamespaceSelector: {}
  serviceMonitorSelector: {}
  version: v2.22.1
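additionalScrapeConfigs references a secret named additional-configs, which must exist before this Prometheus object is applied; a minimal sketch of creating it from a local prometheus-additional.yaml (the full file is in the qist repo mentioned above):
kubectl -n monitoring create secret generic additional-configs \
  --from-file=prometheus-additional.yaml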
# Apply everything else as-is
kubectl apply -f setup/.
kubectl apply -f .
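Once applied, the stack can be checked with kubectl:
# the operator, prometheus-k8s-0/1, alertmanager and the exporters should become Ready
kubectl -n monitoring get pods
# the Prometheus custom resource defined above
kubectl -n monitoring get prometheus k8s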

Deploy victoriametrics

# Why victoriametrics: it compresses stored data very well, whereas prometheus uses a lot of disk space
# the prometheus federation setup produces several hundred GB of monitoring data per day
# victoriametrics stores only a few GB per day thanks to compression, and its configuration can be refreshed on its own
# Both prometheus remote_write and prometheus federation are possible; federation (scraping /federate) is used here because remote_write consumes a lot of bandwidth whenever victoriametrics restarts
# prometheus remote_write example
remote_write:
- url: http://192.168.2.220:8428/api/v1/write
  remote_timeout: 30s
  queue_config:
    capacity: 20000
    max_shards: 50
    min_shards: 1
    max_samples_per_send: 10000
    batch_send_deadline: 60s
    min_backoff: 30ms
    max_backoff: 100ms
# For K8S clusters, add the following to prometheus-prometheus.yaml
  remoteWrite:
  - url: http://192.168.2.220:8428/api/v1/write 
    batchSendDeadline: 60s
    capacity: 20000
    maxBackoff: 100ms
    maxSamplesPerSend: 10000
    maxShards: 50
    minBackoff: 30ms
    minShards: 1
# victoriametrics is compatible with the prometheus configuration format
# node 192.168.2.220
# Download victoriametrics; both cluster and single-node versions exist, the single-node version is used here
cd /apps
wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/v1.50.2/victoria-metrics-v1.50.2.tar.gz
tar -xvf victoria-metrics-v1.50.2.tar.gz
# Create the runtime directories
mkdir -p victoriametrics/{bin,conf,data}
mv victoria-metrics-prod victoriametrics/bin/
# Create the configuration file
cd victoriametrics/conf
vim prometheus.yml
# my global config
global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'victoriametrics'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
    - targets: ['localhost:8428'] # static target

  - job_name: 'federate'
    scrape_interval: 30s
    scrape_timeout: 30s
    honor_labels: true
    metrics_path: '/federate'

    params:
      'match[]':
         #- '{job=~"prometheus.*"}'
         - '{environment=~".*"}'   # pull all environments
    static_configs:
      - targets:
        - '192.168.201.62:9090' # ACK service IP
        - '172.100.41.118:9090' # office IDC K8S service IP

  - job_name: 'aliyun-federate'
    scrape_interval: 30s
    scrape_timeout: 30s
    honor_labels: true
    metrics_path: '/federate'

    params:
      'match[]':
         - '{job=~"prometheus.*"}'  # 基於job 模式拉取
         - '{job=~"aliyun.*"}'
         - '{job=~"windows.*"}'
    static_configs:
      - targets:
        - '10.8.23.80:9090'
        - '172.16.4.141:9090'

  - job_name: 'huaweiyun-federate'
    scrape_interval: 30s
    scrape_timeout: 30s
    honor_labels: true
    metrics_path: '/federate'

    params:
      'match[]':
         - '{job=~"huaweiyun.*"}' # 基於job 模式拉取
         - '{job=~"prometheus.*"}'
    static_configs:
      - targets:
        - '10.9.12.133:9090'
  - job_name: 'redis-federate'
    scrape_interval: 30s
    scrape_timeout: 30s
    honor_labels: true
    metrics_path: '/federate'

    params:
      'match[]':
#         - '{job=~"mysql.*"}'
         - '{job=~"redis.*"}'   # 基於job 模式拉取
    static_configs:
      - targets:
        - '10.8.23.80:9090'
        - '172.16.4.141:9090'

  - job_name: 'mysql-ddd-federate'
    scrape_interval: 60s
    scrape_timeout: 60s
    honor_labels: true
    metrics_path: '/federate'

    params:
      'match[]':
         - '{instance=~"10.8.27.*",job=~"mysql.*"}' # 基於ip 地址+job 拉取監控數據量很大這樣分區
    static_configs:
      - targets:
        - '10.8.23.80:9090'

  - job_name: 'mysql-usd-federate'
    scrape_interval: 30s
    scrape_timeout: 30s
    honor_labels: true
    metrics_path: '/federate'

    params:
      'match[]':
         - '{instance=~"10.8.12.*",job=~"mysql.*"}' # 基於ip 地址+job 拉取監控數據量很大這樣分區
    static_configs:
      - targets:
        - '10.8.23.80:9090'

  - job_name: 'mysql-web-federate'
    scrape_interval: 60s
    scrape_timeout: 60s
    honor_labels: true
    metrics_path: '/federate'

    params:
      'match[]':
         - '{instance=~"10.8.28.*|10.8.26.*|10.8.38.*|10.8.40.*",job=~"mysql.*"}' # 基於ip 地址+job 拉取監控數據量很大這樣分區
    static_configs:
      - targets:
        - '10.8.23.80:9090'

  - job_name: 'mysql-bs-federate'
    scrape_interval: 60s
    scrape_timeout: 60s
    honor_labels: true
    metrics_path: '/federate'

    params:
      'match[]':
         - '{job=~"mysql.*"}'
    static_configs:
      - targets:
        - '172.16.4.141:9090'

  - job_name: 'mysql-huaweiyun'
    scrape_interval: 30s
    scrape_timeout: 30s
    honor_labels: true
    metrics_path: '/federate'

    params:
      'match[]':
         - '{job=~"mysql.*"}'  # 基於job 模式拉取
    static_configs:
      - targets:
        - '10.9.12.133:9090'

  - job_name: 'docker-federate'
    scrape_interval: 30s
    scrape_timeout: 30s
    honor_labels: true
    metrics_path: '/federate'

    params:
      'match[]':
         - '{job=~"alertmanager.*"}' # 基於job 模式拉取
         - '{job=~"consul.*"}'
         - '{job=~"docker.*"}'
         - '{job=~"elasticsearch.*"}'
         - '{job=~"haproxy.*"}'
         - '{job=~"nginx-vts.*"}'
         - '{job=~"rabbitmq.*"}'
    static_configs:
      - targets:
        - '10.8.23.80:9090'
        - '172.16.4.141:9090'
        - '10.9.12.133:9090'

  - job_name: 'node-federate'
    scrape_interval: 30s
    scrape_timeout: 30s
    honor_labels: true
    metrics_path: '/federate'

    params:
      'match[]':
         - '{job=~"node.*"}' # 基於job 模式拉取
    static_configs:
      - targets:
        - '10.8.23.80:9090'
        - '172.16.4.141:9090'
        - '10.9.12.133:9090'

  - job_name: 'dns-federate'
    scrape_interval: 30s
    scrape_timeout: 30s
    honor_labels: true
    metrics_path: '/federate'

    params:
      'match[]':
         - '{job=~"coredns.*"}'  # 基於job 模式拉取
         - '{job=~"online.*"}'
    static_configs:
      - targets:
        - '10.8.23.80:9090'
        - '172.16.4.141:9090'
        - '10.9.12.133:9090'

  - job_name: 'blackbox-federate'
    scrape_interval: 30s
    scrape_timeout: 30s
    honor_labels: true
    metrics_path: '/federate'

    params:
      'match[]':
         - '{job=~"blackbox-.*"}' # 基於job 模式拉取
    static_configs:
      - targets:
        - '10.8.23.80:9090'
        - '172.16.4.141:9090'
        - '10.9.12.133:9090'
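Each federate job above can be tested by hand against its source prometheus before victoriametrics starts scraping it (a quick check, using the node job as an example):
curl -G 'http://10.8.23.80:9090/federate' --data-urlencode 'match[]={job=~"node.*"}' | head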
# Configure victoriametrics startup parameters
vim victoriametrics
VICTORIAMETRICS_OPT="-http.connTimeout=5m \
-influx.maxLineSize=100MB \
-import.maxLineLen=100MB \
-maxConcurrentInserts=20000 \
-maxInsertRequestSize=100MB \
-maxLabelsPerTimeseries=200 \
-insert.maxQueueDuration=5m \
-dedup.minScrapeInterval=60s \
-bigMergeConcurrency=20 \
-retentionPeriod=180d \
-search.maxQueryDuration=10m \
-search.maxQueryLen=30MB \
-search.maxQueueDuration=60s \
-search.maxConcurrentRequests=32 \
-storageDataPath=/apps/victoriametrics/data \
-promscrape.streamParse=true \
-promscrape.config=/apps/victoriametrics/conf/prometheus.yml \
-promscrape.configCheckInterval=30s \
-promscrape.consulSDCheckInterval=30s \
-promscrape.discovery.concurrency=2000 \
-promscrape.fileSDCheckInterval=30s \
-promscrape.maxScrapeSize=100MB \
"
# Configure the systemd unit file
vim /usr/lib/systemd/system/victoriametrics.service
[Unit]
Description=victoriametrics
[Service]
LimitNOFILE=1024000
LimitNPROC=1024000
LimitCORE=infinity
LimitMEMLOCK=infinity
EnvironmentFile=-/apps/victoriametrics/conf/victoriametrics
ExecStart=/apps/victoriametrics/bin/victoria-metrics-prod $VICTORIAMETRICS_OPT
Restart=on-failure
KillMode=process
[Install]
WantedBy=multi-user.target
# Start
systemctl start victoriametrics
# Enable at boot
systemctl enable victoriametrics
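victoriametrics exposes a Prometheus-compatible query API on port 8428, so the federated data can be checked directly:
# count the scraped "up" series per environment label
curl -sG 'http://127.0.0.1:8428/api/v1/query' --data-urlencode 'query=count(up) by (environment)'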

Deploy grafana

# Grafana can be deployed as a binary or in K8S; it was already running in K8S here, so instead of redeploying it only the data source was changed
# See the URL below for reference; remember to add external storage
https://github.com/qist/k8s/tree/master/k8s-yaml/kube-prometheus/grafana
# the environment-dashboards directory contains the multi-environment dashboards
# import them with import-dashboards.sh
# select prometheus as the grafana data source type
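Pointing grafana's prometheus data source at victoriametrics can also be done through the grafana HTTP API instead of the UI (a sketch; the admin credentials and grafana address are placeholders):
# add a prometheus-type data source that reads from victoriametrics
curl -X POST -H 'Content-Type: application/json' \
  'http://admin:<password>@<grafana-host>:3000/api/datasources' \
  -d '{"name":"victoriametrics","type":"prometheus","url":"http://192.168.2.220:8428","access":"proxy","isDefault":true}'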

[Screenshots: Prometheus-based distributed monitoring system; prometheus-alertmanager multi-environment configuration]


