[系统运维] prometheus+grafana+AlertManager部署

开发: C++知识库 Java知识库 JavaScript Python PHP知识库人工智能区块链大数据移动开发嵌入式开发工具数据结构与算法开发测试游戏开发网络协议系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑笔记本显卡显示器固态硬盘硬盘耳机手机 iphone vivo oppo 小米华为单反装机图拉丁

-> 系统运维 -> prometheus+grafana+AlertManager部署 -> 正文阅读

[系统运维]prometheus+grafana+AlertManager部署

文章目录

一、二进制部署

1、部署prometheus

1、下载二进制包

# Download the Prometheus 2.33.4 linux-amd64 release tarball
# (the URL is prefixed with github.do, presumably a GitHub download proxy).
wget https://github.do/https://github.com/prometheus/prometheus/releases/download/v2.33.4/prometheus-2.33.4.linux-amd64.tar.gz
# Unpack the tarball into the current directory.
tar zxf prometheus-2.33.4.linux-amd64.tar.gz 
# Move the unpacked directory to its install location.
mv prometheus-2.33.4.linux-amd64 /usr/local/prometheus

2、创建prometheus用户及数据存放目录

# Create the prometheus account without a home directory (-M) and with a non-login shell.
useradd -M -s /sbin/nologin prometheus
# Create the data directory, including missing parents.
mkdir -p /data/prometheus
# Hand both the install directory and the data directory to the prometheus account.
chown -R prometheus:prometheus /usr/local/prometheus /data/prometheus

3、使用systemd来管理prometheus服务

# vim /usr/lib/systemd/system/prometheus.service
[Unit]
Description=Prometheus
After=network.target

[Service]
Type=simple
# Let the Prometheus process run on at most 4 threads.
Environment="GOMAXPROCS=4"
User=prometheus
Group=prometheus
ExecReload=/bin/kill -HUP $MAINPID
# systemd has no inline-comment syntax, and a line continuation only works when
# the backslash is the very last character on the line. The flags are therefore
# explained here instead of next to each one:
#   --config.file                  configuration file to load
#   --storage.tsdb.path            directory where the time-series data is stored
#   --storage.tsdb.retention.time  how long data is kept (replaces the deprecated
#                                  --storage.tsdb.retention flag)
#   --web.external-url             URL under which Prometheus is reached externally
#   --web.listen-address           address and port the server listens on
#   --web.enable-lifecycle         enables reloading the configuration over HTTP
ExecStart=/usr/local/prometheus/prometheus \
  --config.file=/usr/local/prometheus/prometheus.yml \
  --storage.tsdb.path=/data/prometheus \
  --storage.tsdb.retention.time=30d \
  --web.console.libraries=/usr/local/prometheus/console_libraries \
  --web.console.templates=/usr/local/prometheus/consoles \
  --web.external-url=http://192.168.1.125:8848/prometheus \
  --web.route-prefix="/" \
  --web.listen-address=0.0.0.0:9090 \
  --web.read-timeout=5m \
  --web.max-connections=10 \
  --query.max-concurrency=20 \
  --query.timeout=2m \
  --web.enable-lifecycle
# Sandboxing: private /tmp and /dev, no access to home directories, no privilege
# escalation; ProtectSystem=full makes system directories read-only, so the data
# directory is explicitly listed as writable.
PrivateTmp=true
PrivateDevices=true
ProtectHome=true
NoNewPrivileges=true
LimitNOFILE=infinity
ReadWriteDirectories=/data/prometheus
ProtectSystem=full

SyslogIdentifier=prometheus
Restart=always

[Install]
WantedBy=multi-user.target

4、启动prometheus

# Make systemd re-read unit files so the new prometheus.service is known.
systemctl daemon-reload
# Enable the service at boot and start it now.
systemctl enable prometheus && systemctl start prometheus
# Confirm that the prometheus process is listening (sample output follows).
netstat -lntp | grep prometheus
tcp6       0      0 :::9090                 :::*                    LISTEN      40632/prometheus

5、修改配置文件

# vim /usr/local/prometheus/prometheus.yml
# Defaults shared by every scrape job and rule group.
global:
  # Scrape targets every 15 seconds (the built-in default is 1 minute).
  scrape_interval: 15s
  # Evaluate rules every 15 seconds (the built-in default is 1 minute).
  evaluation_interval: 15s

# Alertmanager instance(s) that fired alerts are sent to.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'localhost:9093'

# Rule files are picked up by glob from the rules directory.
rule_files:
  - '/usr/local/prometheus/rules/*.yml'

scrape_configs:
  # Prometheus scraping its own metrics.
  - job_name: 'prometheus'
    static_configs:
      - targets:
          - 'localhost:9090'

  - job_name: 'cadvisor'
    static_configs:
      - targets:
          - '192.168.0.28:8080'

  # Metrics are exposed under a non-default path.
  - job_name: 'nacos_exproter'
    metrics_path: '/nacos/actuator/prometheus'
    static_configs:
      - targets:
          - '192.168.0.86:8888'

  - job_name: 'rabbitmq'
    static_configs:
      - targets:
          - '192.168.0.20:15692'
          - '192.168.0.20:15693'
          - '192.168.0.20:15694'

  - job_name: 'fastdfs'
    static_configs:
      - targets:
          - '192.168.0.176:9018'

  - job_name: 'nginx_VTS'
    static_configs:
      - targets:
          - '192.168.0.19:9913'

  # The listed redis:// URLs are not scraped directly: each one is copied into
  # the "target" URL parameter and the "instance" label, and the scrape itself
  # is sent to 192.168.0.155:9121.
  - job_name: 'redis_exporter'
    metrics_path: '/scrape'
    static_configs:
      - targets:
          - 'redis://192.168.0.155:6379'
          - 'redis://192.168.0.155:6380'
    relabel_configs:
      - source_labels: ['__address__']
        target_label: '__param_target'
      - source_labels: ['__param_target']
        target_label: 'instance'
      - target_label: '__address__'
        replacement: '192.168.0.155:9121'

  # TCP probe: same relabelling pattern, with the scrape sent to 127.0.0.1:9115
  # and the tcp_connect module selected.
  - job_name: 'blackbox_tcp'
    metrics_path: '/probe'
    params:
      module:
        - 'tcp_connect'
    static_configs:
      - targets:
          # NOTE(review): 256 is not a valid IPv4 octet, so this probe can never
          # succeed - replace with the real host address.
          - '192.168.0.256:8888'
    relabel_configs:
      - source_labels: ['__address__']
        target_label: '__param_target'
      - source_labels: ['__param_target']
        target_label: 'instance'
      - target_label: '__address__'
        replacement: '127.0.0.1:9115'

  # HTTP probe expecting a 2xx answer (http_2xx module).
  - job_name: 'blackbox_http'
    metrics_path: '/probe'
    params:
      module:
        - 'http_2xx'
    static_configs:
      - targets:
          - 'http://192.168.1.10:110'
    relabel_configs:
      - source_labels: ['__address__']
        target_label: '__param_target'
      - source_labels: ['__param_target']
        target_label: 'instance'
      - target_label: '__address__'
        replacement: '127.0.0.1:9115'

  - job_name: 'mysql_exproter'
    static_configs:
      - targets:
          - '192.168.0.118:9104'

6、配置告警规则

# The rules directory is not part of the release tarball, so create it before
# entering it (-p keeps the command safe to re-run).
mkdir -p /usr/local/prometheus/rules && cd /usr/local/prometheus/rules/

1、域名检测

vim blackbox_rules.yml
groups:
  - name: blackbox-exporter
    rules:
      # Fires when the per-instance sum of probe_http_duration_seconds stays
      # above 1 second for one minute.
      - alert: DomainAccessDelayExceeds1s
        # Regex matchers are fully anchored, so "blackbox" alone would match
        # neither the blackbox_http nor the blackbox_tcp scrape job; ".*" makes
        # the matcher cover every job whose name starts with "blackbox".
        expr: sum(probe_http_duration_seconds{job=~"blackbox.*"}) by (instance) > 1
        for: 1m
        labels:
          severity: warning
          type: blackbox
        annotations:
          summary: '域名探测，访问延迟超过 1 秒'
          # The text now states the same 1-second threshold the expression uses.
          description: '域名：{{ $labels.instance }} 探测延迟大于 1 秒，当前延迟为：{{ $value }}'

2、端口检测

vim black.yml
groups:
  # Reachability alert driven by the probe_success metric.
  - name: blackbox_network_stats
    rules:
      # Fires once probe_success has been 0 for a full minute.
      - alert: blackbox_network_stats
        expr: probe_success == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: '请尽快检测'
          summary: '接口/主机/端口 {{ $labels.instance }}  无法联通'

3、容器状态检测

vim container_sys.yml
groups:
  # Container health alerts built on the container_* metrics.
  - name: Container_rules
    rules:
      # A container has not been seen for more than 60 seconds.
      - alert: ContainerKilled
        expr: time() - container_last_seen > 60
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 容器挂啦 (instance {{ $labels.instance }})
          description: "一个容器挂啦\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # No container_last_seen series exists at all for 5 minutes.
      - alert: ContainerAbsent
        expr: absent(container_last_seen)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 容器不存在 (instance {{ $labels.instance }})
          description: "容器丢失5分钟\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # Per-container CPU usage above 80 (rate of CPU seconds * 100).
      - alert: ContainerCpuUsage
        expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 容器CPU使用率 (instance {{ $labels.instance }})
          description: "容器CPU使用率超过80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # Working-set memory above 80% of the configured limit; containers
      # without a limit (limit == 0) are filtered out of the denominator.
      - alert: ContainerMemoryUsage
        expr: >-
          (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name)
          / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 容器内存使用 (instance {{ $labels.instance }})
          description: "容器内存使用率超过80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # Inode usage above 80% per instance. Numerator and denominator now use
      # the same name!="" selector; previously only the numerator was filtered,
      # so the two sums covered different sets of series.
      - alert: ContainerVolumeUsage
        expr: >-
          (1 - (sum(container_fs_inodes_free{name!=""}) BY (instance)
          / sum(container_fs_inodes_total{name!=""}) BY (instance))) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 容器卷利用率 (instance {{ $labels.instance }})
          description: "容器卷使用率超过80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: ContainerVolumeIoUsage
        expr: (sum(container_fs_io_current{name!=""}) BY (instance, name) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 容器卷IO使用情况 (instance {{ $labels.instance }})
          description: "容器卷IO使用率超过80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # CPU throttling: the description now says "throttled" - the old text
      # claimed the container was being killed, which this metric does not show.
      - alert: ContainerHighThrottleRate
        expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 容器繁忙 (instance {{ $labels.instance }})
          description: "容器CPU正在被限流\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

4、主机状态检测

vim host_sys.yml
groups:
  # Host-level alerts built on the node_* metrics.
  # severity values are lowercase so they agree with the other rule files and
  # with the lowercase severities matched by the Alertmanager inhibit rule.
  - name: Host
    rules:
      # Memory in use (total minus free, buffers and cache) above 90%.
      - alert: HostMemory Usage
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 >  90
        for: 1m
        labels:
          name: Memory
          severity: warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: "宿主机内存使用率超过90%."
          value: "{{ $value }}"

      # Non-idle CPU share, averaged over cores, above 0.8.
      - alert: HostCPU Usage
        expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance,appname) > 0.8
        for: 1m
        labels:
          name: CPU
          severity: warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: "宿主机CPU使用率超过80%."
          value: "{{ $value }}"

      - alert: HostLoad
        expr: node_load5 > 20
        for: 1m
        labels:
          name: Load
          severity: warning
        annotations:
          summary: "{{ $labels.appname }} "
          description: " 主机负载5分钟超过20."
          value: "{{ $value }}"

      - alert: HostFilesystem Usage
        expr: (node_filesystem_size_bytes-node_filesystem_free_bytes)/node_filesystem_size_bytes*100>80
        for: 1m
        labels:
          name: Disk
          severity: warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: " 宿主机 [ {{ $labels.mountpoint }} ]分区使用超过80%."
          value: "{{ $value }}%"

      # NOTE(review): the two disk I/O rules only match a scrape job named
      # exactly "Host"; make sure the node exporter job uses that name.
      - alert: HostDiskio writes
        expr: irate(node_disk_writes_completed_total{job=~"Host"}[1m]) > 10
        for: 1m
        labels:
          name: Diskio
          severity: warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: " 宿主机 [{{ $labels.device }}]磁盘1分钟平均写入IO负载较高."
          value: "{{ $value }}iops"

      - alert: HostDiskio reads
        expr: irate(node_disk_reads_completed_total{job=~"Host"}[1m]) > 10
        for: 1m
        labels:
          name: Diskio
          severity: warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: " 宿主机 [{{ $labels.device }}]磁盘1分钟平均读取IO负载较高."
          value: "{{ $value }}iops"

      # Bytes per second divided by 1048576 is MB/s, so both the description
      # and the value annotation use MB/s (a stray "3" used to precede the unit).
      - alert: HostNetwork_receive
        expr: irate(node_network_receive_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576  > 10
        for: 1m
        labels:
          name: Network_receive
          severity: warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: " 宿主机 [{{ $labels.device }}] 网卡5分钟平均接收流量超过10MB/s."
          value: "{{ $value }}MB/s"

      - alert: hostNetwork_transmit
        expr: irate(node_network_transmit_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576  > 10
        for: 1m
        labels:
          name: Network_transmit
          severity: warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: " 宿主机 [{{ $labels.device }}] 网卡5分钟内平均发送流量超过10MB/s."
          value: "{{ $value }}MB/s"

5、jenkins状态检测

vim jenkins.yml
groups:
  # Jenkins alerts; every rule fires immediately (for: 0m).
  - name: jenkins-监控告警
    rules:
      - alert: JenkinsOffline
        expr: jenkins_node_offline_value > 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Jenkins 离线 (instance {{ $labels.instance }})
          description: "Jenkins 离线: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: JenkinsHealthcheck
        expr: jenkins_health_check_score < 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Jenkins 健康检查 (instance {{ $labels.instance }})
          description: "Jenkins 健康检查 score: {{$value}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: JenkinsBuildsHealthScore
        expr: default_jenkins_builds_health_score < 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Jenkins 构建健康评分 (instance {{ $labels.instance }})
          description: "健康检查失败 `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # More than 100 failed runs within the last hour. The metric is a
      # counter (name ends in _total), so increase() is used: unlike delta(),
      # it accounts for counter resets.
      - alert: JenkinsRunFailureTotal
        expr: increase(jenkins_runs_failure_total[1h]) > 100
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Jenkins运行失败总数 (instance {{ $labels.instance }})
          description: "作业运行失败: ({{$value}}) {{$labels.jenkins_job}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: Jenkinsbuildtestsfailing
        expr: default_jenkins_builds_last_build_tests_failing > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Jenkins构建测试失败 (instance {{ $labels.instance }})
          description: "上次生成测试失败: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: JenkinsLastBuildFailed
        expr: default_jenkins_builds_last_build_result_ordinal == 2
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Jenkins 上次构建失败了 (instance {{ $labels.instance }})
          description: "上次构建失败了: {{$labels.jenkins_job}}. Failed build for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

6、MySQL状态检测

vim mysql_rules.yml
groups:
  # MySQL alerts built on the mysql_* metrics.
  - name: MySQL-rules
    rules:
      # The instance reports itself as down.
      - alert: MysqlDown
        expr: mysql_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'MySQL关闭 (instance {{ $labels.instance }})'
          description: "MySQL实例已关闭 {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # Connected threads above 80% of max_connections.
      - alert: MysqlTooManyConnections(>80%)
        expr: >-
          max_over_time(mysql_global_status_threads_connected[1m])
          / mysql_global_variables_max_connections * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'MySQL连接太多 (> 80%) (instance {{ $labels.instance }})'
          description: "超过80%的MySQL连接在上使用 {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # Running threads above 60% of max_connections.
      - alert: MysqlHighThreadsRunning
        expr: >-
          max_over_time(mysql_global_status_threads_running[1m])
          / mysql_global_variables_max_connections * 100 > 60
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'MySQL高线程运行 (instance {{ $labels.instance }})'
          description: "超过60%的MySQL连接在上处于运行状态 {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # The three replication rules only apply to instances that report a
      # master server id greater than 0.
      - alert: MysqlSlaveIoThreadNotRunning
        expr: >-
          mysql_slave_status_master_server_id > 0 and ON (instance)
          mysql_slave_status_slave_io_running == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'MySQL从IO线程未运行 (instance {{ $labels.instance }})'
          description: "MySQL从IO线程未在运行 {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: MysqlSlaveSqlThreadNotRunning
        expr: >-
          mysql_slave_status_master_server_id > 0 and ON (instance)
          mysql_slave_status_slave_sql_running == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'MySQL从属SQL线程未运行 (instance {{ $labels.instance }})'
          description: "MySQL从属SQL线程未在运行 {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # Replication lag, minus any configured SQL delay, above 30 seconds.
      - alert: MysqlSlaveReplicationLag
        expr: >-
          mysql_slave_status_master_server_id > 0 and ON (instance)
          (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) > 30
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'MySQL从机复制延迟 (instance {{ $labels.instance }})'
          description: "MySQL复制延迟 {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # More than 3 new slow queries within 5 minutes.
      - alert: MysqlSlowQueries
        expr: increase(mysql_global_status_slow_queries[5m]) > 3
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'MySQL慢速查询 (instance {{ $labels.instance }})'
          description: "MySQL有一些新的慢速查询.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: MysqlInnodbLogWaits
        expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'MySQL InnoDB日志等待 (instance {{ $labels.instance }})'
          description: "MySQL innodb日志写入暂停\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # Uptime below 60 seconds, i.e. the server has just restarted.
      - alert: MysqlRestarted
        expr: mysql_global_status_uptime < 60
        for: 0m
        labels:
          severity: info
        annotations:
          summary: 'MySQL重启 (instance {{ $labels.instance }})'
          description: "MySQL刚刚重新启动，不到一分钟前 {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

7、nacos检测

vim nacos.yml
groups:
  - name: alters
    rules:
      # Fires when the Nacos scrape target has been down for one minute.
      - alert: Nacos Down
        # The job label must equal the job_name used in prometheus.yml, which
        # is 'nacos_exproter'; up{job="nacos"} never matched any series.
        expr: up{job="nacos_exproter"} == 0
        for: 1m
        labels:
          severity: emergency
          target: nacos
        annotations:
          summary: "{{ $labels.instance }} 已停止运行超过 1 分钟！"
          # Replaces the literal placeholder text "description".
          description: "Nacos 实例 {{ $labels.instance }} (job {{ $labels.job }}) 无法被 Prometheus 抓取"

8、nginx状态检测

vim nginx_rules.yml
groups:
  # Rule group name.
  - name: NginxAlert
    rules:
      # Fires when the upstream response time stays above 1000 ms for 10 s.
      - alert: ResponseTimeAlert
        expr: (nginx_upstream_responseMsec > 1000)
        # The condition has to hold for 10 seconds.
        for: 10s
        # Labels used for alert routing.
        labels:
          severity: critical
          service: nginx
        # Alert message content.
        annotations:
          summary: 'Nginx响应大于1000ms'
          description: 'Nginx {{ $labels.instance }}后端集群{{ $labels.upstream }} 中{{ $labels.backend }}的响应时间大于1000ms'

      # Share of 4xx responses above 5% of all requests.
      - alert: NginxHighHttp4xxErrorRate
        expr: >-
          sum(rate(nginx_http_requests_total{status=~"^4.."}[1m]))
          / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Nginx的HTTP 4xx错误率高 (instance {{ $labels.instance }})'
          description: "HTTP状态请求太多 4xx (> 5%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # Share of 5xx responses above 5% of all requests.
      - alert: NginxHighHttp5xxErrorRate
        expr: >-
          sum(rate(nginx_http_requests_total{status=~"^5.."}[1m]))
          / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})'
          description: "HTTP状态请求太多 5xx (> 5%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 99th percentile request latency above 3 seconds.
      - alert: NginxLatencyHigh
        expr: histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node)) > 3
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Nginx latency high (instance {{ $labels.instance }})'
          description: "Nginx p99延迟高于3秒\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

9、rabbitmq状态检测

vim rabbitmq_rules.yml
groups:
  # RabbitMQ cluster alerts.
  - name: RabbitmqNodeDown
    rules:
      # Fewer than 3 nodes report build info.
      - alert: RabbitmqNodeDown
        expr: sum(rabbitmq_build_info) < 3
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Rabbitmq节点关闭 (instance {{ $labels.instance }})
          description: "RabbitMQ群集中运行的节点少于3个\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # A stray apostrophe was removed from the end of the first description line.
      - alert: RabbitmqNodeNotDistributed
        expr: erlang_vm_dist_node_state < 3
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Rabbitmq节点未分发 (instance {{ $labels.instance }})
          description: "分配链路状态未“启动”\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # More than one distinct rabbitmq_version has been present for an hour.
      - alert: RabbitmqInstancesDifferentVersions
        expr: count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: 不同版本的Rabbitmq实例 (instance {{ $labels.instance }})
          description: "在同一集群中运行不同版本的Rabbitmq可能会导致失败.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: RabbitmqMemoryHigh
        expr: rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Rabbitmq内存高 (instance {{ $labels.instance }})
          description: "一个节点使用超过90%的分配RAM\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: RabbitmqFileDescriptorsUsage
        expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Rabbitmq文件描述符的用法 (instance {{ $labels.instance }})
          description: "一个节点使用超过90%的文件描述符\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: RabbitmqTooManyUnackMessages
        expr: sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Rabbitmq未确认消息太多 (instance {{ $labels.instance }})
          description: "太多未确认的消息\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: RabbitmqTooManyConnections
        expr: rabbitmq_connections > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Rabbitmq连接太多 (instance {{ $labels.instance }})
          description: "节点的总连接数太高\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: RabbitmqNoQueueConsumer
        expr: rabbitmq_queue_consumers < 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Rabbitmq无队列使用者 (instance {{ $labels.instance }})
          description: "一个队列的使用者少于1个\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: RabbitmqUnroutableMessages
        expr: >-
          increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0
          or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Rabbitmq不可终止消息 (instance {{ $labels.instance }})
          description: "队列中有无法发送的消息\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # Label matching is case-sensitive: the scrape job in prometheus.yml is
      # named 'rabbitmq', so the previous job='RabbitMQ' selector matched nothing.
      - alert: Rabbitmq-down
        expr: rabbitmq_up{job='rabbitmq'} != 1
        labels:
          status: High
          team: Rabbitmq_monitor
        annotations:
          description: "Instance: {{ $labels.instance }} is Down ! ! !"
          value: '{{ $value }}'
          summary: "主机节点已关闭"

      # Free disk space (in MB) is within 200 MB of the configured free limit.
      - alert: Rabbitmq disk free limit   status
        expr: >-
          rabbitmq_node_disk_free{job='rabbitmq'} / 1024 / 1024
          <= rabbitmq_node_disk_free_limit{job='rabbitmq'} / 1024 / 1024 + 200
        labels:
          status: High
          team: Rabbitmq_monitor
        annotations:
          description: "Instance: {{ $labels.instance }} Rabbitmq可用磁盘的容量过低 ! ! !"
          value: '{{ $value }} MB'
          summary: "Rabbitmq空闲磁盘太低"

10、redis状态检测

vim redis_down.yml
groups:
  # Redis alerts built on the redis_* metrics.
  # severity values are lowercase so they agree with the other rule files and
  # with the lowercase severities matched by the Alertmanager inhibit rule.
  - name: redisdown
    rules:
      - alert: RedisDown
        expr: redis_up == 0
        for: 5m
        labels:
          name: instance
          severity: critical
        annotations:
          summary: " {{ $labels.alias }}"
          description: " 服务停止运行 "
          value: "{{ $value }}"

      # Connected clients above 80% of the configured maxclients.
      - alert: Redis linked too many clients
        expr: redis_connected_clients / redis_config_maxclients * 100 > 80
        for: 1m
        labels:
          name: instance
          severity: warning
        annotations:
          summary: " {{ $labels.alias }}"
          description: " Redis连接数超过最大连接数的80%. "
          value: "{{ $value }}"

      # The last RDB save is more than 24 hours old.
      - alert: MissingBackup
        expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24
        for: 5m
        labels:
          severity: error
        annotations:
          summary: "Missing backup (instance {{ $labels.instance }})"
          description: "Redis已经24小时没有备份了\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

      - alert: OutOfMemory
        expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Out of memory (instance {{ $labels.instance }})"
          description: "Redis内存不足 (> 90%)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

      # The number of connected replicas dropped within the last minute.
      - alert: ReplicationBroken
        expr: delta(redis_connected_slaves[1m]) < 0
        for: 5m
        labels:
          severity: error
        annotations:
          summary: "Replication broken (instance {{ $labels.instance }})"
          description: "Redis实例丢失了一个从机\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

      - alert: TooManyConnections
        expr: redis_connected_clients > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Too many connections (instance {{ $labels.instance }})"
          description: "Redis实例的连接太多\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

      - alert: NotEnoughConnections
        expr: redis_connected_clients < 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Not enough connections (instance {{ $labels.instance }})"
          description: "Redis实例应该有更多的连接 (> 5)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

      - alert: RejectedConnections
        expr: increase(redis_rejected_connections_total[1m]) > 0
        for: 5m
        labels:
          severity: error
        annotations:
          summary: "Rejected connections (instance {{ $labels.instance }})"
          description: "与Redis的某些连接已被拒绝\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

11、服务状态检测

vim service_down.yml
groups:
  - name: Process-exporter
    rules:
      # A monitored process group has had zero processes for one minute.
      # severity is lowercase so it agrees with the other rule files and with
      # the lowercase severities matched by the Alertmanager inhibit rule.
      - alert: ProcessDown
        expr: namedprocess_namegroup_num_procs  == 0
        for: 1m
        labels:
          name: instance
          severity: critical
        annotations:
          summary: " {{ $labels.appname }}"
          description: " 进程停止运行 "
          value: "{{ $value }}"

      # Any scrape target has been down (up == 0) for one minute.
      - alert: 实例存活告警
        expr: up == 0
        for: 1m
        labels:
          user: prometheus
          severity: warning
        annotations:
          summary: "主机宕机 !!!"
          description: "该实例主机已经宕机超过一分钟了。"

7、检查配置文件是否正确

# Validate the configuration (and the rule files it references). Absolute paths
# are used because the previous step changed into the rules/ directory, where
# ./promtool and prometheus.yml do not exist.
/usr/local/prometheus/promtool check config /usr/local/prometheus/prometheus.yml

8、不重启重新加载配置文件

# Ask the running server to reload its configuration over HTTP instead of
# restarting it (the unit file starts Prometheus with --web.enable-lifecycle).
curl -X POST http://127.0.0.1:9090/-/reload

2、部署grafana

1、下载grafana二进制包

# Download the prebuilt linux-amd64 release. The GitHub "archive/refs/tags"
# tarball used before contains only source code and therefore no
# bin/grafana-server binary, which the systemd unit needs.
wget https://dl.grafana.com/oss/release/grafana-8.4.4.linux-amd64.tar.gz
tar -zxf grafana-8.4.4.linux-amd64.tar.gz
# The tarball unpacks to grafana-8.4.4; install it and keep a version-neutral symlink.
mv grafana-8.4.4  /usr/local/
ln -s /usr/local/grafana-8.4.4/ /usr/local/grafana

2、创建grafana用户及数据存放目录

# Create the grafana account without a home directory (-M) and with a non-login shell.
useradd -s /sbin/nologin -M grafana
# Create every directory the Grafana configuration points at. -p creates
# missing parents and does not fail when a directory already exists.
mkdir -p /data/grafana/data /data/grafana/log /data/grafana/plugins /data/grafana/conf/provisioning
# Hand the install tree and the data tree to the grafana account.
chown -R grafana:grafana /usr/local/grafana/
chown -R grafana:grafana /data/grafana/

3、修改配置文件

# vim /usr/local/grafana/conf/defaults.ini
# Point Grafana's data, log, plugin and provisioning paths at /data/grafana.
# NOTE(review): these keys presumably live in the [paths] section - confirm in the file.
data = /data/grafana/data
logs = /data/grafana/log
plugins = /data/grafana/plugins
provisioning = /data/grafana/conf/provisioning

4、使用systemd来管理grafana服务

# vim /etc/systemd/system/grafana-server.service
# systemd unit that runs Grafana from /usr/local/grafana as the grafana account.
[Unit]
Description=Grafana
After=network.target

[Service]
# With Type=notify systemd waits for the service's own readiness notification.
Type=notify
User=grafana
Group=grafana
# -homepath tells grafana-server where its installation directory is.
ExecStart=/usr/local/grafana/bin/grafana-server -homepath /usr/local/grafana
# Restart only after an unclean exit.
Restart=on-failure

[Install]
WantedBy=multi-user.target

5、启动并设置开机自启

# Make systemd re-read unit files, then start Grafana.
systemctl daemon-reload && systemctl start  grafana-server
# Check that the service came up.
systemctl status  grafana-server
# Start Grafana automatically at boot.
systemctl enable  grafana-server

3、部署AlertManager

1、下载AlertManager二进制包

# Download the Alertmanager 0.23.0 linux-amd64 release tarball
# (the URL is prefixed with github.do, presumably a GitHub download proxy).
wget https://github.do/https://github.com/prometheus/alertmanager/releases/download/v0.23.0/alertmanager-0.23.0.linux-amd64.tar.gz
# Unpack directly into /usr/local.
tar zxf alertmanager-0.23.0.linux-amd64.tar.gz -C /usr/local/
cd /usr/local/
# Rename the versioned directory to a fixed install path.
mv alertmanager-0.23.0.linux-amd64/ alertmanager

2、使用systemd来管理AlertManager服务

# vim /usr/lib/systemd/system/alertmanager.service
# systemd unit that runs Alertmanager from /usr/local/alertmanager.
[Unit]
Description=alertmanager
After=network.target

[Service]
Type=simple
# Restart only after an unclean exit.
Restart=on-failure
# One flag per line; each backslash is the last character on its line.
ExecStart=/usr/local/alertmanager/alertmanager \
  --config.file=/usr/local/alertmanager/alertmanager.yml \
  --storage.path=/var/lib/alertmanager

[Install]
WantedBy=multi-user.target

3、修改配置文件

# vim alertmanager.yml
global:
  # An alert that has not been refreshed for this long is treated as resolved.
  resolve_timeout: 5m
  http_config: {}

# Single route: alerts are grouped by alert name and sent to the DingTalk webhook.
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'dingding.webhook1'

receivers:
  - name: 'dingding.webhook1'
    webhook_configs:
      # send_resolved also delivers a notification when an alert clears.
      - send_resolved: true
        url: 'http://127.0.0.1:8060/dingtalk/webhook1/send'

# A firing critical alert mutes warning alerts that share the same alertname,
# dev and instance labels. source_matchers/target_matchers replace the
# deprecated source_match/target_match keys and are supported by the 0.23.0
# release installed above.
inhibit_rules:
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    equal: ['alertname', 'dev', 'instance']

# Notification templates are looked up inside the install directory
# (/usr/local/alertmanager); the previous glob pointed at /alertmanager.
templates:
  - '/usr/local/alertmanager/template/*.tmpl'

4、启动AlertManager

# Make systemd re-read unit files and enable Alertmanager at boot.
systemctl daemon-reload && systemctl enable alertmanager
# Start the service and show its status.
systemctl start alertmanager && systemctl status alertmanager

4、部署钉钉告警

1、下载二进制包

# Download the prometheus-webhook-dingtalk 2.0.0 linux-amd64 release tarball
# (the URL is prefixed with ghproxy.com, presumably a GitHub download proxy).
wget https://ghproxy.com/https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.0.0/prometheus-webhook-dingtalk-2.0.0.linux-amd64.tar.gz
# Unpack directly into /usr/local.
tar xf prometheus-webhook-dingtalk-2.0.0.linux-amd64.tar.gz -C /usr/local/
# Create a version-neutral symlink to the unpacked directory.
ln -sv /usr/local/prometheus-webhook-dingtalk-2.0.0.linux-amd64/ /usr/local/prometheus-webhook-dingtalk

2、修改配置文件

# vim /usr/local/prometheus-webhook-dingtalk/config.yml
# Request timeout.
timeout: 5s

# Custom template file; the relative path is resolved against the working
# directory of the service.
templates:
  - contrib/templates/legacy/template.tmpl

# Render messages with the customised "default.*" templates defined in the
# template file above. The previous "legacy.*" reference meant the customised
# alert layout in that file was never used.
default_message:
  title: '{{ template "default.title" . }}'
  text: '{{ template "default.content" . }}'

# One entry per DingTalk robot; the key is the name used in the webhook URL
# (/dingtalk/<name>/send).
targets:
  webhook1:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
    # Secret used to sign requests.
    secret: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
  webhook_mention_all:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
    secret: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
    mention:
      all: true
  webhook_mention_users:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
    mention:
      mobiles: ['139xxxxxxxx']

3、修改报警模板

# cd /usr/local/prometheus-webhook-dingtalk
# cd contrib/templates/legacy/
# vim template.tmpl 
{{/* __subject: "[STATUS:firing-count] <group label values> (<common label values not in the group labels>)". */}}
{{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}
{{/* __alertmanagerURL: link built from the external URL, filtered by receiver. */}}
{{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }}

{{/* __text_alert_list: for each alert, all labels, all annotations and the generator URL. */}}
{{ define "__text_alert_list" }}{{ range . }}
**Labels**
{{ range .Labels.SortedPairs }} - {{ .Name }}: {{ .Value | markdown | html }}
{{ end }}
**Annotations**
{{ range .Annotations.SortedPairs }} - {{ .Name }}: {{ .Value | markdown | html }}
{{ end }}
**Source:** [{{ .GeneratorURL }}]({{ .GeneratorURL }})
{{ end }}{{ end }}

{{/* default.__text_alert_list: firing alerts - severity, team, start time (Asia/Shanghai), annotations, and all labels except severity, summary and team. */}}
{{ define "default.__text_alert_list" }}{{ range . }}
---
**告警级别:** {{ .Labels.severity | upper }}

**运营团队:** {{ .Labels.team | upper }}

**触发时间:** {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}

**事件信息:** 
{{ range .Annotations.SortedPairs }} - {{ .Name }}: {{ .Value | markdown | html }}

{{ end }}

**事件标签:**
{{ range .Labels.SortedPairs }}{{ if and (ne (.Name) "severity") (ne (.Name) "summary") (ne (.Name) "team") }} - {{ .Name }}: {{ .Value | markdown | html }}
{{ end }}{{ end }}
{{ end }}
{{ end }}
{{/* default.__text_alertresovle_list: resolved alerts - same layout as above plus the end time. */}}
{{ define "default.__text_alertresovle_list" }}{{ range . }}
---
**告警级别:** {{ .Labels.severity | upper }}

**运营团队:** {{ .Labels.team | upper }}

**触发时间:** {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}

**结束时间:** {{ dateInZone "2006.01.02 15:04:05" (.EndsAt) "Asia/Shanghai" }}

**事件信息:**
{{ range .Annotations.SortedPairs }} - {{ .Name }}: {{ .Value | markdown | html }}

{{ end }}

**事件标签:**
{{ range .Labels.SortedPairs }}{{ if and (ne (.Name) "severity") (ne (.Name) "summary") (ne (.Name) "team") }} - {{ .Name }}: {{ .Value | markdown | html }}
{{ end }}{{ end }}
{{ end }}
{{ end }}

{{/* Default: title plus a heading linking to Alertmanager, followed by the firing list and the resolved list when each is non-empty. */}}
{{ define "default.title" }}{{ template "__subject" . }}{{ end }}
{{ define "default.content" }}#### \[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}\] **[{{ index .GroupLabels "alertname" }}]({{ template "__alertmanagerURL" . }})**
{{ if gt (len .Alerts.Firing) 0 -}}

{{ template "default.__text_alert_list" .Alerts.Firing }}

{{- end }}

{{ if gt (len .Alerts.Resolved) 0 -}}
{{ template "default.__text_alertresovle_list" .Alerts.Resolved }}

{{- end }}
{{- end }}

{{/* Legacy: same heading, followed only by the firing alerts rendered with __text_alert_list. */}}
{{ define "legacy.title" }}{{ template "__subject" . }}{{ end }}
{{ define "legacy.content" }}#### \[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}\] **[{{ index .GroupLabels "alertname" }}]({{ template "__alertmanagerURL" . }})**
{{ template "__text_alert_list" .Alerts.Firing }}
{{- end }}

{{/* Following names for compatibility: ding.link.* forward to the default.* templates. */}}
{{ define "ding.link.title" }}{{ template "default.title" . }}{{ end }}
{{ define "ding.link.content" }}{{ template "default.content" . }}{{ end }}

4、使用systemd来管理服务

# vim /lib/systemd/system/dingtalk.service
# systemd unit that runs prometheus-webhook-dingtalk from its install directory.
[Unit]
# The key was misspelled "Descripton"; systemd ignores unknown keys, so the
# unit had no description.
Description=dingtalk
Documentation=https://github.com/timonwong/prometheus-webhook-dingtalk/
After=network.target

[Service]
# Restart only after an unclean exit.
Restart=on-failure
# Relative template paths in config.yml are resolved against this directory.
WorkingDirectory=/usr/local/prometheus-webhook-dingtalk
ExecStart=/usr/local/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk --config.file=/usr/local/prometheus-webhook-dingtalk/config.yml

[Install]
WantedBy=multi-user.target

5、启动钉钉告警

# Make systemd re-read unit files and enable the dingtalk service at boot.
systemctl daemon-reload && systemctl enable dingtalk
# Start the service and show its status.
systemctl start dingtalk && systemctl status dingtalk

二、docker-compose部署

# Compose file format version.
version: '3.8'

# One bridge network shared by every service in this file.
networks:
  monitor:
    driver: bridge

services:
  # Host metrics exporter. The host's /proc, /sys and / are mounted read-only
  # and the exporter is pointed at them through the --path.* flags.
  node-exporter:
    image: prom/node-exporter
    container_name: node-exporter
    restart: unless-stopped
    volumes:
      - '/proc:/host/proc:ro'
      - '/sys:/host/sys:ro'
      - '/:/rootfs:ro'
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      # mount-points-exclude is the current name of the deprecated
      # --collector.filesystem.ignored-mount-points flag. "$$" is Compose's
      # escape for a literal "$" in the regular expression.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    # Port 9100 is exposed to other containers only, not published on the host.
    expose:
      - 9100
    networks:
      - monitor

  # Probe exporter; its configuration is read from ./blackbox on the host.
  blackbox-exporter:
    image: prom/blackbox-exporter
    container_name: blackbox
    restart: unless-stopped
    volumes:
      - './blackbox/:/config'
    command:
      - '--config.file=/config/blackbox.yml'
    # Port 9115 is exposed to other containers only, not published on the host.
    expose:
      - 9115
    networks:
      - monitor

  # Container metrics exporter; it needs the host root, /sys, /var/run and the
  # Docker data directory mounted into the container.
  # NOTE(review): confirm that the google/cadvisor image is still maintained
  # before relying on it.
  cadvisor:
    image: google/cadvisor
    container_name: cadvisor
    restart: unless-stopped
    volumes:
      - '/:/rootfs:ro'
      - '/var/run:/var/run:rw'
      - '/sys:/sys:ro'
      - '/var/lib/docker:/var/lib/docker:ro'
    # Port 8080 is exposed to other containers only, not published on the host.
    expose:
      - 8080
    networks:
      - monitor
    depends_on:
      - prometheus

  # Prometheus server, published on host port 9090. The configuration file and
  # the data directory both live on the host.
  prometheus:
    image: prom/prometheus
    container_name: prom
    restart: always
    depends_on:
      - blackbox-exporter
      - node-exporter
    networks:
      - monitor
    ports:
      - '9090:9090'
    volumes:
      - '/etc/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml'
      - '/data/prometheus/data:/prometheus'
    command:
      # Keep 60 days of data and allow configuration reloads over HTTP.
      - '--storage.tsdb.retention.time=60d'
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--web.enable-lifecycle'

  # Alertmanager, published on host port 9093, with its configuration file
  # mounted from the host.
  alertmanager:
    image: prom/alertmanager
    container_name: alert
    restart: always
    depends_on:
      - prometheus
    networks:
      - monitor
    ports:
      - '9093:9093'
    volumes:
      - '/etc/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml'

  # Grafana, published on host port 3000; image rendering is delegated to the
  # separate renderer service.
  grafana:
    image: grafana/grafana
    container_name: grafana
    restart: always
    depends_on:
      - prometheus
      - renderer
    networks:
      - monitor
    ports:
      - '3000:3000'
    environment:
      # NOTE(review): "admin" is a weak admin password - change it for real deployments.
      GF_SECURITY_ADMIN_PASSWORD: 'admin'
      # Where Grafana sends render requests, and the URL the renderer calls back.
      GF_RENDERING_SERVER_URL: 'http://renderer:8081/render'
      GF_RENDERING_CALLBACK_URL: 'http://grafana:3000/'
      GF_LOG_FILTERS: 'rendering:debug'
    volumes:
      - '/data/grafana/:/grafana'
      - '/data/grafana/data/:/var/lib/grafana'

  # Image renderer used by Grafana, published on host port 8081.
  renderer:
    image: grafana/grafana-image-renderer:latest
    container_name: renderer
    networks:
      - monitor
    ports:
      - '8081:8081'
    environment:
      ENABLE_METRICS: 'true'
      RENDERING_MODE: 'clustered'
      RENDERING_CLUSTERING_MODE: 'context'
      RENDERING_CLUSTERING_MAX_CONCURRENCY: '5'