一、二进制部署
1、部署prometheus
1、下载二进制包
wget https://github.do/https://github.com/prometheus/prometheus/releases/download/v2.33.4/prometheus-2.33.4.linux-amd64.tar.gz
tar zxf prometheus-2.33.4.linux-amd64.tar.gz
mv prometheus-2.33.4.linux-amd64 /usr/local/prometheus
2、创建prometheus用户及数据存放目录
useradd -M -s /sbin/nologin prometheus
mkdir -p /data/prometheus
chown -R prometheus:prometheus /usr/local/prometheus /data/prometheus
3、使用systemd来管理prometheus服务(vim /etc/systemd/system/prometheus.service)
# systemd unit for the Prometheus server installed under /usr/local/prometheus.
[Unit]
Description=Prometheus
After=network.target

[Service]
Type=simple
Environment="GOMAXPROCS=4"
# Run as the dedicated unprivileged account instead of root.
User=prometheus
Group=prometheus
# Prometheus reloads its configuration when it receives SIGHUP.
ExecReload=/bin/kill -HUP $MAINPID
# Retention is set with --storage.tsdb.retention.time; the older
# --storage.tsdb.retention spelling is deprecated.
ExecStart=/usr/local/prometheus/prometheus \
  --config.file=/usr/local/prometheus/prometheus.yml \
  --storage.tsdb.path=/data/prometheus \
  --storage.tsdb.retention.time=30d \
  --web.console.libraries=/usr/local/prometheus/console_libraries \
  --web.console.templates=/usr/local/prometheus/consoles \
  --web.external-url=http://192.168.1.125:8848/prometheus \
  --web.route-prefix="/" \
  --web.listen-address=0.0.0.0:9090 \
  --web.read-timeout=5m \
  --web.max-connections=10 \
  --query.max-concurrency=20 \
  --query.timeout=2m \
  --web.enable-lifecycle
# Sandboxing: private /tmp and /dev, no access to home directories,
# no privilege escalation, /usr /boot /etc mounted read-only.
PrivateTmp=true
PrivateDevices=true
ProtectHome=true
NoNewPrivileges=true
LimitNOFILE=infinity
# The TSDB directory must stay writable inside the sandbox.
ReadWriteDirectories=/data/prometheus
ProtectSystem=full
SyslogIdentifier=prometheus
Restart=always

[Install]
WantedBy=multi-user.target
4、启动prometheus
systemctl daemon-reload
systemctl enable prometheus && systemctl start prometheus
netstat -lntp | grep prometheus
tcp6 0 0 :::9090 :::* LISTEN 40632/prometheus
5、修改配置文件(vim /usr/local/prometheus/prometheus.yml)
global:
scrape_interval: 15s
evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets: [ 'localhost:9093' ]
rule_files:
- "/usr/local/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
- job_name: cadvisor
static_configs:
- targets:
- 192.168.0.28:8080
- job_name: 'nacos_exproter'
metrics_path: '/nacos/actuator/prometheus'
static_configs:
- targets:
- 192.168.0.86:8888
- job_name: 'rabbitmq'
static_configs:
- targets:
- 192.168.0.20:15692
- 192.168.0.20:15693
- 192.168.0.20:15694
- job_name: 'fastdfs'
static_configs:
- targets: ['192.168.0.176:9018']
- job_name: 'nginx_VTS'
static_configs:
- targets: ['192.168.0.19:9913']
- job_name: 'redis_exporter'
static_configs:
- targets:
- redis://192.168.0.155:6379
- redis://192.168.0.155:6380
metrics_path: /scrape
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 192.168.0.155:9121
# Blackbox TCP probe job: every listed target is handed to the exporter's
# /probe endpoint using the tcp_connect module.
- job_name: 'blackbox_tcp'
metrics_path: /probe
params:
module: [tcp_connect]
static_configs:
- targets:
# NOTE(review): 192.168.0.256 is not a valid IPv4 address (each octet must be
# 0-255) — replace it with the real host before using this job.
- 192.168.0.256:8888
relabel_configs:
# Pass the listed address to the exporter as the "target" URL parameter.
- source_labels: [__address__]
target_label: __param_target
# Keep the probed address as the instance label.
- source_labels: [__param_target]
target_label: instance
# Send the scrape request itself to 127.0.0.1:9115 instead of the target.
- target_label: __address__
replacement: 127.0.0.1:9115
- job_name: 'blackbox_http'
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
- targets:
- http://192.168.1.10:110
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9115
- job_name: 'mysql_exproter'
static_configs:
- targets:
- 192.168.0.118:9104
6、配置告警规则
cd /usr/local/prometheus/rules/
1、域名检测
vim blackbox_rules.yml
# Alert when the summed HTTP probe duration of an instance exceeds 1 second.
groups:
  - name: blackbox-exporter
    rules:
      - alert: DomainAccessDelayExceeds1s
        annotations:
          # Threshold text matches the expression and the summary (1 second).
          description: '域名:{{ $labels.instance }} 探测延迟大于 1 秒,当前延迟为:{{ $value }}'
          summary: '域名探测,访问延迟超过 1 秒'
        # Regex matchers are fully anchored, so "blackbox" alone never matches
        # the blackbox_tcp / blackbox_http scrape jobs; match the prefix.
        expr: sum(probe_http_duration_seconds{job=~"blackbox.*"}) by (instance) > 1
        for: 1m
        labels:
          severity: warning
          type: blackbox
2、端口检测
vim black.yml
groups:
- name: blackbox_network_stats
rules:
- alert: blackbox_network_stats
expr: probe_success == 0
for: 1m
labels:
severity: critical
annotations:
summary: "接口/主机/端口 {{ $labels.instance }} 无法联通"
description: "请尽快检测"
3、容器状态检测
vim container_sys.yml
groups:
- name: Container_rules
rules:
- alert: ContainerKilled
expr: time() - container_last_seen > 60
for: 0m
labels:
severity: warning
annotations:
summary: 容器挂啦 (instance {{ $labels.instance }})
description: "一个容器挂啦\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerAbsent
expr: absent(container_last_seen)
for: 5m
labels:
severity: warning
annotations:
summary: 容器不存在 (instance {{ $labels.instance }})
description: "容器丢失5分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerCpuUsage
expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
summary: 容器CPU使用率 (instance {{ $labels.instance }})
description: "容器CPU使用率超过80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerMemoryUsage
expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
summary: 容器内存使用 (instance {{ $labels.instance }})
description: "容器内存使用率超过80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerVolumeUsage
expr: (1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80
for: 2m
labels:
severity: warning
annotations:
summary: 容器卷利用率 (instance {{ $labels.instance }})
description: "容器卷使用率超过80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerVolumeIoUsage
expr: (sum(container_fs_io_current{name!=""}) BY (instance, name) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
summary: 容器卷IO使用情况 (instance {{ $labels.instance }})
description: "容器卷IO使用率超过80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when the per-second rate of CFS-throttled CPU time stays above 1
# for 2 minutes.
- alert: ContainerHighThrottleRate
  expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 容器繁忙 (instance {{ $labels.instance }})
    # The rule measures CPU throttling, so the text must not say the
    # container is being killed.
    description: "容器CPU正在被限流\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
4、主机状态检测
vim host_sys.yml
groups:
- name: Host
rules:
- alert: HostMemory Usage
expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 90
for: 1m
labels:
name: Memory
severity: Warning
annotations:
summary: " {{ $labels.appname }} "
description: "宿主机内存使用率超过90%."
value: "{{ $value }}"
- alert: HostCPU Usage
expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance,appname) > 0.8
for: 1m
labels:
name: CPU
severity: Warning
annotations:
summary: " {{ $labels.appname }} "
description: "宿主机CPU使用率超过80%."
value: "{{ $value }}"
- alert: HostLoad
expr: node_load5 > 20
for: 1m
labels:
name: Load
severity: Warning
annotations:
summary: "{{ $labels.appname }} "
description: " 主机负载5分钟超过20."
value: "{{ $value }}"
- alert: HostFilesystem Usage
expr: (node_filesystem_size_bytes-node_filesystem_free_bytes)/node_filesystem_size_bytes*100>80
for: 1m
labels:
name: Disk
severity: Warning
annotations:
summary: " {{ $labels.appname }} "
description: " 宿主机 [ {{ $labels.mountpoint }} ]分区使用超过80%."
value: "{{ $value }}%"
- alert: HostDiskio writes
expr: irate(node_disk_writes_completed_total{job=~"Host"}[1m]) > 10
for: 1m
labels:
name: Diskio
severity: Warning
annotations:
summary: " {{ $labels.appname }} "
description: " 宿主机 [{{ $labels.device }}]磁盘1分钟平均写入IO负载较高."
value: "{{ $value }}iops"
# Fires when a disk completes more than 10 reads per second for 1 minute.
- alert: HostDiskio reads
  # NOTE(review): job=~"Host" only matches a scrape job named exactly "Host";
  # confirm the node_exporter job name configured in prometheus.yml.
  expr: irate(node_disk_reads_completed_total{job=~"Host"}[1m]) > 10
  for: 1m
  labels:
    name: Diskio
    severity: Warning
  annotations:
    summary: " {{ $labels.appname }} "
    # Typo fixed: "宿机" -> "宿主机", matching the sibling write-IO rule.
    description: " 宿主机 [{{ $labels.device }}]磁盘1分钟平均读取IO负载较高."
    value: "{{ $value }}iops"
# Fires when a physical NIC receives more than 10 MB/s for 1 minute.
- alert: HostNetwork_receive
  # bytes/s divided by 1048576 yields megabytes per second, not megabits.
  expr: irate(node_network_receive_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576 > 10
  for: 1m
  labels:
    name: Network_receive
    severity: Warning
  annotations:
    summary: " {{ $labels.appname }} "
    description: " 宿主机 [{{ $labels.device }}] 网卡5分钟平均接收流量超过10MB/s."
    # Removed the stray "3" that was glued to the rendered value.
    value: "{{ $value }}MB/s"
# Fires when a physical NIC transmits more than 10 MB/s for 1 minute.
- alert: hostNetwork_transmit
  # bytes/s divided by 1048576 yields megabytes per second, not megabits.
  expr: irate(node_network_transmit_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576 > 10
  for: 1m
  labels:
    name: Network_transmit
    severity: Warning
  annotations:
    summary: " {{ $labels.appname }} "
    description: " 宿主机 [{{ $labels.device }}] 网卡5分钟内平均发送流量超过10MB/s."
    # Removed the stray "3" that was glued to the rendered value.
    value: "{{ $value }}MB/s"
5、jenkins状态检测
vim jenkins.yml
groups:
- name: jenkins-监控告警
rules:
- alert: JenkinsOffline
expr: jenkins_node_offline_value > 1
for: 0m
labels:
severity: critical
annotations:
summary: Jenkins 离线 (instance {{ $labels.instance }})
description: "Jenkins 离线: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JenkinsHealthcheck
expr: jenkins_health_check_score < 1
for: 0m
labels:
severity: critical
annotations:
summary: Jenkins 健康检查 (instance {{ $labels.instance }})
description: "Jenkins 健康检查 score: {{$value}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JenkinsBuildsHealthScore
expr: default_jenkins_builds_health_score < 1
for: 0m
labels:
severity: critical
annotations:
summary: Jenkins 构建健康评分 (instance {{ $labels.instance }})
description: "健康检查失败 `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JenkinsRunFailureTotal
expr: delta(jenkins_runs_failure_total[1h]) > 100
for: 0m
labels:
severity: warning
annotations:
summary: Jenkins运行失败总数 (instance {{ $labels.instance }})
description: "作业运行失败: ({{$value}}) {{$labels.jenkins_job}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Jenkinsbuildtestsfailing
expr: default_jenkins_builds_last_build_tests_failing > 0
for: 0m
labels:
severity: warning
annotations:
summary: Jenkins构建测试失败 (instance {{ $labels.instance }})
description: "上次生成测试失败: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JenkinsLastBuildFailed
expr: default_jenkins_builds_last_build_result_ordinal == 2
for: 0m
labels:
severity: warning
annotations:
summary: Jenkins 上次构建失败了 (instance {{ $labels.instance }})
description: "上次构建失败了: {{$labels.jenkins_job}}. Failed build for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
6、MySQL状态检测
vim mysql_rules.yml
groups:
- name: MySQL-rules
rules:
- alert: MysqlDown
expr: mysql_up == 0
for: 0m
labels:
severity: critical
annotations:
summary: MySQL关闭 (instance {{ $labels.instance }})
description: "MySQL实例已关闭 {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MysqlTooManyConnections(>80%)
expr: max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80
for: 2m
labels:
severity: warning
annotations:
summary: MySQL连接太多 (> 80%) (instance {{ $labels.instance }})
description: "超过80%的MySQL连接在上使用 {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MysqlHighThreadsRunning
expr: max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60
for: 2m
labels:
severity: warning
annotations:
summary: MySQL高线程运行 (instance {{ $labels.instance }})
description: "超过60%的MySQL连接在上处于运行状态 {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MysqlSlaveIoThreadNotRunning
expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0
for: 0m
labels:
severity: critical
annotations:
summary: MySQL从IO线程未运行 (instance {{ $labels.instance }})
description: "MySQL从IO线程未在运行 {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MysqlSlaveSqlThreadNotRunning
expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0
for: 0m
labels:
severity: critical
annotations:
summary: MySQL从属SQL线程未运行 (instance {{ $labels.instance }})
description: "MySQL从属SQL线程未在运行 {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MysqlSlaveReplicationLag
expr: mysql_slave_status_master_server_id > 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) > 30
for: 1m
labels:
severity: critical
annotations:
summary: MySQL从机复制延迟 (instance {{ $labels.instance }})
description: "MySQL复制延迟 {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MysqlSlowQueries
expr: increase(mysql_global_status_slow_queries[5m]) > 3
for: 2m
labels:
severity: warning
annotations:
summary: MySQL慢速查询 (instance {{ $labels.instance }})
description: "MySQL有一些新的慢速查询.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MysqlInnodbLogWaits
expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
for: 0m
labels:
severity: warning
annotations:
summary: MySQL InnoDB日志等待 (instance {{ $labels.instance }})
description: "MySQL innodb日志写入暂停\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MysqlRestarted
expr: mysql_global_status_uptime < 60
for: 0m
labels:
severity: info
annotations:
summary: MySQL重启 (instance {{ $labels.instance }})
description: "MySQL刚刚重新启动,不到一分钟前 {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
7、nacos检测
vim nacos.yml
# Alert when the Nacos scrape target has been unreachable for 1 minute.
groups:
  - name: alters
    rules:
      - alert: Nacos Down
        # The job label must equal the job_name in prometheus.yml, which is
        # spelled 'nacos_exproter' there; job="nacos" matches nothing.
        expr: up{job="nacos_exproter"} == 0
        for: 1m
        labels:
          severity: emergency
          target: nacos
        annotations:
          description: "description"
          summary: "{{ $labels.instance }} 已停止运行超过 1 分钟!"
8、nginx状态检测
vim nginx_rules.yml
groups:
- name: NginxAlert
rules:
- alert: ResponseTimeAlert
expr: (nginx_upstream_responseMsec > 1000)
for: 10s
labels:
severity: critical
service: nginx
annotations:
summary: "Nginx响应大于1000ms"
description: "Nginx {{ $labels.instance }}后端集群{{ $labels.upstream }} 中{{ $labels.backend }}的响应时间大于1000ms"
- alert: NginxHighHttp4xxErrorRate
expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
for: 1m
labels:
severity: critical
annotations:
summary: Nginx的HTTP 4xx错误率高 (instance {{ $labels.instance }})
description: "HTTP状态请求太多 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NginxHighHttp5xxErrorRate
expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
for: 1m
labels:
severity: critical
annotations:
summary: Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})
description: "HTTP状态请求太多 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when the p99 request latency per (host, node) stays above 3 seconds
# for 2 minutes.
- alert: NginxLatencyHigh
  # histogram_quantile needs the bucket boundary label "le" to survive the
  # aggregation; without it no quantile can be computed.
  expr: histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node, le)) > 3
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Nginx latency high (instance {{ $labels.instance }})
    description: "Nginx p99延迟高于3秒\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
9、rabbitmq状态检测
vim rabbitmq_rules.yml
groups:
- name: RabbitmqNodeDown
rules:
- alert: RabbitmqNodeDown
expr: sum(rabbitmq_build_info) < 3
for: 0m
labels:
severity: critical
annotations:
summary: Rabbitmq节点关闭 (instance {{ $labels.instance }})
description: "RabbitMQ群集中运行的节点少于3个\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires immediately when erlang_vm_dist_node_state drops below 3.
- alert: RabbitmqNodeNotDistributed
  expr: erlang_vm_dist_node_state < 3
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Rabbitmq节点未分发 (instance {{ $labels.instance }})
    # Removed a stray single quote that was rendered into the message.
    description: "分配链路状态未“启动”\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqInstancesDifferentVersions
expr: count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1
for: 1h
labels:
severity: warning
annotations:
summary: 不同版本的Rabbitmq实例 (instance {{ $labels.instance }})
description: "在同一集群中运行不同版本的Rabbitmq可能会导致失败.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqMemoryHigh
expr: rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90
for: 2m
labels:
severity: warning
annotations:
summary: Rabbitmq内存高 (instance {{ $labels.instance }})
description: "一个节点使用超过90%的分配RAM\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqFileDescriptorsUsage
expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90
for: 2m
labels:
severity: warning
annotations:
summary: Rabbitmq文件描述符的用法 (instance {{ $labels.instance }})
description: "一个节点使用超过90%的文件描述符\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqTooManyUnackMessages
expr: sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000
for: 1m
labels:
severity: warning
annotations:
summary: Rabbitmq未确认消息太多 (instance {{ $labels.instance }})
description: "太多未确认的消息\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqTooManyConnections
expr: rabbitmq_connections > 1000
for: 2m
labels:
severity: warning
annotations:
summary: Rabbitmq连接太多 (instance {{ $labels.instance }})
description: "节点的总连接数太高\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqNoQueueConsumer
expr: rabbitmq_queue_consumers < 1
for: 1m
labels:
severity: warning
annotations:
summary: Rabbitmq无队列使用者 (instance {{ $labels.instance }})
description: "一个队列的使用者少于1个\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqUnroutableMessages
expr: increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0
for: 2m
labels:
severity: warning
annotations:
summary: Rabbitmq不可终止消息 (instance {{ $labels.instance }})
description: "队列中有无法发送的消息\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires as soon as rabbitmq_up reports anything other than 1.
- alert: Rabbitmq-down
  # Label values are case-sensitive: the scrape job in prometheus.yml is
  # named 'rabbitmq', so job='RabbitMQ' never matched.
  # NOTE(review): rabbitmq_up is presumably exported by a standalone
  # rabbitmq_exporter — confirm the configured targets expose it.
  expr: rabbitmq_up{job='rabbitmq'} != 1
  labels:
    status: High
    team: Rabbitmq_monitor
  annotations:
    description: "Instance: {{ $labels.instance }} is Down ! ! !"
    value: '{{ $value }}'
    summary: "主机节点已关闭"
# Fires when free disk space (in MB) is within 200 MB of the configured
# disk-free limit.
- alert: Rabbitmq disk free limit status
  # Label values are case-sensitive: the scrape job in prometheus.yml is
  # named 'rabbitmq', so job='RabbitMQ' never matched.
  # NOTE(review): confirm the configured targets expose
  # rabbitmq_node_disk_free / rabbitmq_node_disk_free_limit.
  expr: rabbitmq_node_disk_free{job='rabbitmq'} / 1024 / 1024 <= rabbitmq_node_disk_free_limit{job='rabbitmq'} / 1024 / 1024 + 200
  labels:
    status: High
    team: Rabbitmq_monitor
  annotations:
    description: "Instance: {{ $labels.instance }} Rabbitmq可用磁盘的容量过低 ! ! !"
    value: '{{ $value }} MB'
    summary: "Rabbitmq空闲磁盘太低"
10、redis状态检测
vim redis_down.yml
groups:
- name: redisdown
rules:
- alert: RedisDown
expr: redis_up == 0
for: 5m
labels:
name: instance
severity: Critical
annotations:
summary: " {{ $labels.alias }}"
description: " 服务停止运行 "
value: "{{ $value }}"
- alert: Redis linked too many clients
expr: redis_connected_clients / redis_config_maxclients * 100 > 80
for: 1m
labels:
name: instance
severity: Warning
annotations:
summary: " {{ $labels.alias }}"
description: " Redis连接数超过最大连接数的80%. "
value: "{{ $value }}"
- alert: MissingBackup
expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24
for: 5m
labels:
severity: error
annotations:
summary: "Missing backup (instance {{ $labels.instance }})"
description: "Redis已经24小时没有备份了\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: OutOfMemory
expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90
for: 5m
labels:
severity: warning
annotations:
summary: "Out of memory (instance {{ $labels.instance }})"
description: "Redis内存不足 (> 90%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ReplicationBroken
expr: delta(redis_connected_slaves[1m]) < 0
for: 5m
labels:
severity: error
annotations:
summary: "Replication broken (instance {{ $labels.instance }})"
description: "Redis实例丢失了一个从机\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: TooManyConnections
expr: redis_connected_clients > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "Too many connections (instance {{ $labels.instance }})"
description: "Redis实例的连接太多\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: NotEnoughConnections
expr: redis_connected_clients < 5
for: 5m
labels:
severity: warning
annotations:
summary: "Not enough connections (instance {{ $labels.instance }})"
description: "Redis实例应该有更多的连接 (> 5)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RejectedConnections
expr: increase(redis_rejected_connections_total[1m]) > 0
for: 5m
labels:
severity: error
annotations:
summary: "Rejected connections (instance {{ $labels.instance }})"
description: "与Redis的某些连接已被拒绝\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
11、服务状态检测
vim service_down.yml
groups:
- name: Process-exporter
rules:
- alert: ProcessDown
expr: namedprocess_namegroup_num_procs == 0
for: 1m
labels:
name: instance
severity: Critical
annotations:
summary: " {{ $labels.appname }}"
description: " 进程停止运行 "
value: "{{ $value }}"
- alert: 实例存活告警
expr: up == 0
for: 1m
labels:
user: prometheus
severity: warning
annotations:
summary: "主机宕机 !!!"
description: "该实例主机已经宕机超过一分钟了。"
7、检查配置文件是否正确
./promtool check config prometheus.yml
8、不重启重新加载配置文件
curl -X POST http://127.0.0.1:9090/-/reload
2、部署grafana
1、下载grafana二进制包
# Download the prebuilt Linux release. The GitHub "archive/refs/tags" tarball
# is source code only and has no compiled bin/grafana-server, which the
# systemd unit's ExecStart requires.
wget https://dl.grafana.com/oss/release/grafana-8.4.4.linux-amd64.tar.gz
tar -zxf grafana-8.4.4.linux-amd64.tar.gz
# The release tarball unpacks into grafana-8.4.4/.
mv grafana-8.4.4 /usr/local/
# Version-independent path used by the unit file and the chown step.
ln -s /usr/local/grafana-8.4.4/ /usr/local/grafana
2、创建grafana用户及数据存放目录
# Dedicated no-login account without a home directory for the service.
useradd -s /sbin/nologin -M grafana
# -p: succeed even if /data is missing or /data/grafana already exists.
mkdir -p /data/grafana
# Trailing slash on /usr/local/grafana/ makes chown follow the symlink.
chown -R grafana:grafana /usr/local/grafana/ /data/grafana/
3、修改配置文件
data = /data/grafana/data
logs = /data/grafana/log
plugins = /data/grafana/plugins
provisioning = /data/grafana/conf/provisioning
4、使用systemd来管理grafana服务(vim /etc/systemd/system/grafana-server.service)
[Unit]
Description=Grafana
After=network.target
[Service]
User=grafana
Group=grafana
Type=notify
ExecStart=/usr/local/grafana/bin/grafana-server -homepath /usr/local/grafana
Restart=on-failure
[Install]
WantedBy=multi-user.target
5、启动并设置开机自启
systemctl daemon-reload && systemctl start grafana-server
systemctl status grafana-server
systemctl enable grafana-server
3、部署AlertManager
1、下载AlertManager二进制包
wget https://github.do/https://github.com/prometheus/alertmanager/releases/download/v0.23.0/alertmanager-0.23.0.linux-amd64.tar.gz
tar zxf alertmanager-0.23.0.linux-amd64.tar.gz -C /usr/local/
cd /usr/local/
mv alertmanager-0.23.0.linux-amd64/ alertmanager
2、使用systemd来管理AlertManager服务(vim /etc/systemd/system/alertmanager.service)
[Unit]
Description=alertmanager
After=network.target
[Service]
Type=simple
ExecStart=/usr/local/alertmanager/alertmanager --config.file /usr/local/alertmanager/alertmanager.yml --storage.path=/var/lib/alertmanager
Restart=on-failure
[Install]
WantedBy=multi-user.target
3、修改配置文件(vim /usr/local/alertmanager/alertmanager.yml)
global:
resolve_timeout: 5m
http_config: {}
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
receiver: 'dingding.webhook1'
receivers:
- name: 'dingding.webhook1'
webhook_configs:
- send_resolved: true
url: 'http://127.0.0.1:8060/dingtalk/webhook1/send'
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
templates:
- '/alertmanager/template/*.tmpl'
4、启动AlertManager
systemctl daemon-reload && systemctl enable alertmanager
systemctl start alertmanager && systemctl status alertmanager
4、部署钉钉告警
1、下载二进制包
wget https://ghproxy.com/https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.0.0/prometheus-webhook-dingtalk-2.0.0.linux-amd64.tar.gz
tar xf prometheus-webhook-dingtalk-2.0.0.linux-amd64.tar.gz -C /usr/local/
ln -sv /usr/local/prometheus-webhook-dingtalk-2.0.0.linux-amd64/ /usr/local/prometheus-webhook-dingtalk
2、修改配置文件(vim /usr/local/prometheus-webhook-dingtalk/config.yml)
timeout: 5s
templates:
- contrib/templates/legacy/template.tmpl
default_message:
title: '{{ template "legacy.title" . }}'
text: '{{ template "legacy.content" . }}'
targets:
webhook1:
url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
secret: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
webhook_mention_all:
url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
secret: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
mention:
all: true
webhook_mention_users:
url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
mention:
mobiles: ['139xxxxxxxx']
3、修改报警模板
{{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}
{{/* Link to the Alertmanager UI filtered by receiver. Every define must be closed with end, otherwise the following defines fail to parse. */}}
{{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }}
{{ define "__text_alert_list" }}{{ range . }}
**Labels**
{{ range .Labels.SortedPairs }} - {{ .Name }}: {{ .Value | markdown | html }}
{{ end }}
**Annotations**
{{ range .Annotations.SortedPairs }} - {{ .Name }}: {{ .Value | markdown | html }}
{{ end }}
**Source:** [{{ .GeneratorURL }}]({{ .GeneratorURL }})
{{ end }}{{ end }}
{{ define "default.__text_alert_list" }}{{ range . }}
---
**告警级别:** {{ .Labels.severity | upper }}
**运营团队:** {{ .Labels.team | upper }}
**触发时间:** {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}
**事件信息:**
{{ range .Annotations.SortedPairs }} - {{ .Name }}: {{ .Value | markdown | html }}
{{ end }}
**事件标签:**
{{ range .Labels.SortedPairs }}{{ if and (ne (.Name) "severity") (ne (.Name) "summary") (ne (.Name) "team") }} - {{ .Name }}: {{ .Value | markdown | html }}
{{ end }}{{ end }}
{{ end }}
{{ end }}
{{ define "default.__text_alertresovle_list" }}{{ range . }}
---
**告警级别:** {{ .Labels.severity | upper }}
**运营团队:** {{ .Labels.team | upper }}
**触发时间:** {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}
**结束时间:** {{ dateInZone "2006.01.02 15:04:05" (.EndsAt) "Asia/Shanghai" }}
**事件信息:**
{{ range .Annotations.SortedPairs }} - {{ .Name }}: {{ .Value | markdown | html }}
{{ end }}
**事件标签:**
{{ range .Labels.SortedPairs }}{{ if and (ne (.Name) "severity") (ne (.Name) "summary") (ne (.Name) "team") }} - {{ .Name }}: {{ .Value | markdown | html }}
{{ end }}{{ end }}
{{ end }}
{{ end }}
{{/* Default */}}
{{ define "default.title" }}{{ template "__subject" . }}{{ end }}
{{ define "default.content" }}
{{ if gt (len .Alerts.Firing) 0 -}}
{{ template "default.__text_alert_list" .Alerts.Firing }}
{{- end }}
{{ if gt (len .Alerts.Resolved) 0 -}}
{{ template "default.__text_alertresovle_list" .Alerts.Resolved }}
{{- end }}
{{- end }}
{{/* Legacy */}}
{{ define "legacy.title" }}{{ template "__subject" . }}{{ end }}
{{ define "legacy.content" }}
{{ template "__text_alert_list" .Alerts.Firing }}
{{- end }}
{{/* Following names for compatibility */}}
{{ define "ding.link.title" }}{{ template "default.title" . }}{{ end }}
{{ define "ding.link.content" }}{{ template "default.content" . }}{{ end }}
4、使用systemd来管理服务(vim /etc/systemd/system/dingtalk.service)
# systemd unit for prometheus-webhook-dingtalk.
[Unit]
# Key was misspelled "Descripton"; systemd ignores unknown keys, leaving the
# unit without a description.
Description=dingtalk
Documentation=https://github.com/timonwong/prometheus-webhook-dingtalk/
After=network.target

[Service]
Restart=on-failure
# The relative template path in config.yml is resolved from this directory.
WorkingDirectory=/usr/local/prometheus-webhook-dingtalk
ExecStart=/usr/local/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk --config.file=/usr/local/prometheus-webhook-dingtalk/config.yml

[Install]
WantedBy=multi-user.target
5、启动钉钉告警
systemctl daemon-reload && systemctl enable dingtalk
systemctl start dingtalk && systemctl status dingtalk
二、docker-compose部署
version: "3.8"
networks:
monitor:
driver: bridge
services:
# node_exporter with the host's /proc, /sys and / mounted read-only.
node-exporter:
  image: prom/node-exporter
  container_name: node-exporter
  volumes:
    - /proc:/host/proc:ro
    - /sys:/host/sys:ro
    - /:/rootfs:ro
  command:
    - '--path.procfs=/host/proc'
    - '--path.rootfs=/rootfs'
    - '--path.sysfs=/host/sys'
    # --collector.filesystem.ignored-mount-points is deprecated in favour of
    # mount-points-exclude. "$$" is Compose's escape for a literal "$".
    - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
  restart: unless-stopped
  # Reachable only from other containers on the monitor network.
  expose:
    - 9100
  networks:
    - monitor
blackbox-exporter:
image: prom/blackbox-exporter
expose:
- 9115
container_name: blackbox
restart: unless-stopped
volumes:
- "./blackbox/:/config"
command:
- "--config.file=/config/blackbox.yml"
networks:
- monitor
# cAdvisor container metrics, exposed on 8080 inside the monitor network.
cadvisor:
  # The Docker Hub image google/cadvisor is no longer updated; the project
  # publishes current releases as gcr.io/cadvisor/cadvisor.
  image: gcr.io/cadvisor/cadvisor
  container_name: cadvisor
  volumes:
    - /:/rootfs:ro
    - /var/run:/var/run:rw
    - /sys:/sys:ro
    - /var/lib/docker:/var/lib/docker:ro
  restart: unless-stopped
  expose:
    - 8080
  networks:
    - monitor
  depends_on:
    - prometheus
prometheus:
image: prom/prometheus
container_name: prom
restart: always
ports:
- "9090:9090"
volumes:
- /etc/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
- /data/prometheus/data:/prometheus
command:
- "--storage.tsdb.retention.time=60d"
- "--config.file=/etc/prometheus/prometheus.yml"
- "--web.enable-lifecycle"
networks:
- monitor
depends_on:
- blackbox-exporter
- node-exporter
alertmanager:
image: prom/alertmanager
restart: always
container_name: alert
volumes:
- /etc/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
ports:
- "9093:9093"
networks:
- monitor
depends_on:
- prometheus
grafana:
image: grafana/grafana
restart: always
container_name: grafana
ports:
- "3000:3000"
environment:
- "GF_SECURITY_ADMIN_PASSWORD=admin"
- "GF_RENDERING_SERVER_URL=http://renderer:8081/render"
- "GF_RENDERING_CALLBACK_URL=http://grafana:3000/"
- "GF_LOG_FILTERS=rendering:debug"
volumes:
- /data/grafana/:/grafana
- /data/grafana/data/:/var/lib/grafana
networks:
- monitor
depends_on:
- prometheus
- renderer
renderer:
image: grafana/grafana-image-renderer:latest
container_name: renderer
ports:
- "8081:8081"
environment:
- "ENABLE_METRICS=true"
- "RENDERING_MODE=clustered"
- "RENDERING_CLUSTERING_MODE=context"
- "RENDERING_CLUSTERING_MAX_CONCURRENCY=5"
networks:
- monitor
|