# PrometheusRule custom resource (API group monitoring.coreos.com/v1) named
# "wamblee-cluster-monitoring", created in the "monitoring" namespace.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: wamblee-cluster-monitoring
  namespace: monitoring
spec:
  groups:
  # Rule group "wamblee-org"; the alerting rules are listed under `rules`.
  - name: wamblee-org
    # Explicitly set to the empty string, i.e. no strategy is selected here.
    # NOTE(review): presumably only meaningful when the group is evaluated by
    # Thanos Ruler - confirm whether this key is needed at all.
    partial_response_strategy: ""
    rules:
    # Fires (critical, after 5m) when the newest backup timestamp seen in the
    # last hour is more than 3600s in the past, or when no backup timestamp was
    # reported at all during the last hour.
    # NOTE(review): file_time_seconds is presumably the file's modification
    # time as a Unix timestamp - confirm against the exporter.
    - alert: etcdBackupAbsentOrTooOld
      annotations:
        description: 'etcd backup is too old or not present'
        summary: etcd cluster backup is too old or not present
      # Branch 1: age of the most recent timestamp sampled in the last 60m.
      # Branch 2: a comparison over a missing series returns nothing, so
      # without absent_over_time() the alert would stay silent (or resolve
      # itself) exactly when the exporter stops reporting.
      expr: |-
        (
          time() -
          max(max_over_time(file_time_seconds{job="pantherbackupexporter",type="backup"}[60m]))
          > 3600
        )
        or
        absent_over_time(file_time_seconds{job="pantherbackupexporter",type="backup"}[60m])
      for: 5m
      labels:
        severity: critical
    # Fires (critical, after 5m) when the largest backup size seen in the last
    # hour is below 10000000, or when no backup size was reported at all during
    # the last hour.
    # NOTE(review): file_size is presumably expressed in bytes - confirm
    # against the exporter.
    - alert: etcdBackupFileTooSmall
      annotations:
        description: 'etcd backup is too small or not present'
        summary: etcd cluster backup is too small or not present
      # Branch 1: size threshold on the maximum sampled in the last 60m.
      # Branch 2: a comparison over a missing series returns nothing, so
      # without absent_over_time() the "not present" case would never fire.
      expr: |-
        (
          max(max_over_time(file_size{job="pantherbackupexporter",type="backup"}[60m]))
          < 10000000
        )
        or
        absent_over_time(file_size{job="pantherbackupexporter",type="backup"}[60m])
      for: 5m
      labels:
        severity: critical
    # Fires (critical, after 5m) when the newest image-file timestamp seen in
    # the last hour is more than 3600s in the past, or when no image-file
    # timestamp was reported at all during the last hour.
    # NOTE(review): file_time_seconds is presumably the file's modification
    # time as a Unix timestamp - confirm against the exporter.
    - alert: etcdImageFileTooOld
      annotations:
        description: 'etcd image file is too old or not present'
        summary: etcd image file is too old or not present
      # Branch 1: age of the most recent timestamp sampled in the last 60m.
      # Branch 2: a comparison over a missing series returns nothing, so
      # without absent_over_time() the alert would stay silent (or resolve
      # itself) exactly when the exporter stops reporting.
      expr: |-
        (
          time() -
          max(max_over_time(file_time_seconds{job="pantherbackupexporter",type="image"}[60m]))
          > 3600
        )
        or
        absent_over_time(file_time_seconds{job="pantherbackupexporter",type="image"}[60m])
      for: 5m
      labels:
        severity: critical
    # Fires (critical, after 5m) when the largest image-file size seen in the
    # last hour is below 10, or when no image-file size was reported at all
    # during the last hour.
    # NOTE(review): the threshold is only 10 (the backup rule uses 10000000);
    # presumably the image file is a tiny marker file - confirm the value and
    # that file_size is in bytes.
    - alert: etcdImageFileTooSmall
      annotations:
        description: 'etcd image file is too small or not present'
        summary: etcd cluster image file is too small or not present
      # Branch 1: size threshold on the maximum sampled in the last 60m.
      # Branch 2: a comparison over a missing series returns nothing, so
      # without absent_over_time() the "not present" case would never fire.
      expr: |-
        (
          max(max_over_time(file_size{job="pantherbackupexporter",type="image"}[60m]))
          < 10
        )
        or
        absent_over_time(file_size{job="pantherbackupexporter",type="image"}[60m])
      for: 5m
      labels:
        severity: critical
    # Fires (critical) once the filesystem mounted at /var/lib/etcd has had
    # less than 30% of its size free for 5 minutes.
    - alert: etcdDiskAlmostFull
      # Ratio of free bytes to total bytes for the etcd data mount point.
      expr: |-
        node_filesystem_free_bytes{mountpoint="/var/lib/etcd"}
        / node_filesystem_size_bytes{mountpoint="/var/lib/etcd"}
        < 0.30
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: etcd high disk usage
        description: etcd high disk usage