# code/etcd-inmemory-monitoring/alertingrules.yaml
#
# 65 lines
# 2.0 KiB
# YAML

---
# PrometheusRule (prometheus-operator CRD) holding the etcd backup, image-file
# and disk-usage alerts for the cluster.
#
# All file metrics are selected with job="pantherbackupexporter". Every alert
# must hold for 5 minutes before it fires and carries severity "critical".
#
# The four file alerts look at the highest sample over the last 60 minutes
# (max_over_time ... [60m]) and additionally fire when the selected series has
# no samples at all in that window (absent_over_time), so that a missing
# metric does not silently evaluate to "no alert".
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: wamblee-cluster-monitoring
  namespace: monitoring
spec:
  groups:
    - name: wamblee-org
      partial_response_strategy: ""
      rules:
        # Newest backup timestamp seen in the last hour is more than 3600
        # behind time(), or the timestamp series is missing entirely.
        # Presumably file_time_seconds is a Unix timestamp in seconds, since
        # it is subtracted from time() -- TODO confirm against the exporter.
        - alert: etcdBackupAbsentOrTooOld
          annotations:
            description: 'etcd backup is too old or not present'
            summary: etcd cluster backup is too old or not present
          expr: |-
            time() -
            max(max_over_time(file_time_seconds{job="pantherbackupexporter",type="backup"}[60m]))
            > 3600
            or
            absent_over_time(file_time_seconds{job="pantherbackupexporter",type="backup"}[60m])
          for: 5m
          labels:
            severity: critical

        # Largest backup size seen in the last hour is below 10000000
        # (presumably bytes -- TODO confirm), or the size series is missing.
        - alert: etcdBackupFileTooSmall
          annotations:
            description: 'etcd backup is too small or not present'
            summary: etcd cluster backup is too small or not present
          expr: |-
            max(max_over_time(file_size{job="pantherbackupexporter",type="backup"}[60m]))
            <
            10000000
            or
            absent_over_time(file_size{job="pantherbackupexporter",type="backup"}[60m])
          for: 5m
          labels:
            severity: critical

        # Same age check as the backup alert, for the series with type="image".
        - alert: etcdImageFileTooOld
          annotations:
            description: 'etcd image file is too old or not present'
            summary: etcd image file is too old or not present
          expr: |-
            time() -
            max(max_over_time(file_time_seconds{job="pantherbackupexporter",type="image"}[60m]))
            > 3600
            or
            absent_over_time(file_time_seconds{job="pantherbackupexporter",type="image"}[60m])
          for: 5m
          labels:
            severity: critical

        # Same size check as the backup alert, for the series with type="image".
        # NOTE(review): the threshold here is 10, versus 10000000 for backups --
        # confirm that this difference is intended.
        - alert: etcdImageFileTooSmall
          annotations:
            description: 'etcd image file is too small or not present'
            summary: etcd cluster image file is too small or not present
          expr: |-
            max(max_over_time(file_size{job="pantherbackupexporter",type="image"}[60m]))
            <
            10
            or
            absent_over_time(file_size{job="pantherbackupexporter",type="image"}[60m])
          for: 5m
          labels:
            severity: critical

        # Free/size ratio of the filesystem mounted at /var/lib/etcd is below
        # 0.30, i.e. less than 30% of it is free.
        - alert: etcdDiskAlmostFull
          annotations:
            description: 'etcd high disk usage'
            summary: etcd high disk usage
          expr: |-
            node_filesystem_free_bytes{mountpoint="/var/lib/etcd"}/
            node_filesystem_size_bytes{mountpoint="/var/lib/etcd"} < 0.30
          for: 5m
          labels:
            severity: critical