groups: - name: targets rules: - alert: monitor_service_down expr: up == 0 for: 30s labels: severity: critical annotations: summary: "Monitor service non-operational" description: "Service {{ $labels.instance }} is down." - name: host rules: - alert: high_cpu_load expr: node_load1 > 1.5 for: 30s labels: severity: warning annotations: summary: "Server under high load" description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}." - alert: high_memory_load expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85 for: 30s labels: severity: warning annotations: summary: "Server memory is almost full" description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}." - alert: high_storage_load expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85 for: 30s labels: severity: warning annotations: summary: "Server storage is almost full" description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}." - name: containers rules: - alert: jenkins_down expr: absent(container_memory_usage_bytes{name="jenkins"}) for: 30s labels: severity: critical annotations: summary: "Jenkins down" description: "Jenkins container is down for more than 30 seconds." - alert: jenkins_high_cpu expr: sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10 for: 30s labels: severity: warning annotations: summary: "Jenkins high CPU usage" description: "Jenkins CPU usage is {{ humanize $value}}%." - alert: jenkins_high_memory expr: sum(container_memory_usage_bytes{name="jenkins"}) > 1200000000 for: 30s labels: severity: warning annotations: summary: "Jenkins high memory usage" description: "Jenkins memory consumption is at {{ humanize $value}}."