etcd monitoring code examples.

This commit is contained in:
Erik Brakkee 2023-04-10 20:52:24 +02:00
parent 092c830f52
commit f86bb5f55a
9 changed files with 267 additions and 0 deletions

View File

@ -0,0 +1,6 @@
# Image for the file-metrics exporter: a Python 3.8 base with the Prometheus
# client library installed and exporter.py copied to the filesystem root.
FROM python:3.8
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir prometheus_client
COPY exporter.py /

View File

@ -0,0 +1,3 @@
https://github.com/prometheus/client_python
pip install prometheus-client

View File

@ -0,0 +1,64 @@
# Alerting rules for the etcd backup/image files reported by the file exporter
# (job "pantherbackupexporter") and for disk usage of /var/lib/etcd.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: wamblee-cluster-monitoring
  namespace: monitoring
spec:
  groups:
    - name: wamblee-org
      partial_response_strategy: ""
      rules:
        # Newest modification time seen in the last hour is more than
        # 3600 seconds in the past.
        - alert: etcdBackupAbsentOrTooOld
          expr: |-
            time() -
            max(max_over_time(file_time_seconds{job="pantherbackupexporter",type="backup"}[60m]))
            > 3600
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: etcd cluster backup is too old or not present
            description: 'etcd backup is too old or not present'

        # Largest backup size seen in the last hour is below 10000000 bytes.
        - alert: etcdBackupFileTooSmall
          expr: |-
            max(max_over_time(file_size{job="pantherbackupexporter",type="backup"}[60m]))
            <
            10000000
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: etcd cluster backup is too small or not present
            description: 'etcd backup is too small or not present'

        # Same age check as for the backup, applied to the "image" file.
        - alert: etcdImageFileTooOld
          expr: |-
            time() -
            max(max_over_time(file_time_seconds{job="pantherbackupexporter",type="image"}[60m]))
            > 3600
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: etcd image file is too old or not present
            description: 'etcd image file is too old or not present'

        # Largest image-file size seen in the last hour is below 10 bytes.
        - alert: etcdImageFileTooSmall
          expr: |-
            max(max_over_time(file_size{job="pantherbackupexporter",type="image"}[60m]))
            <
            10
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: etcd cluster image file is too small or not present
            description: 'etcd image file is too small or not present'

        # Less than 30% of the filesystem mounted at /var/lib/etcd is free.
        - alert: etcdDiskAlmostFull
          expr: |-
            node_filesystem_free_bytes{mountpoint="/var/lib/etcd"}/
            node_filesystem_size_bytes{mountpoint="/var/lib/etcd"} < 0.30
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: etcd high disk usage
            description: 'etcd high disk usage'

View File

@ -0,0 +1,47 @@
# Deployment of the file exporter. It is pinned to the node with hostname
# "panther" and reads the files to monitor from a hostPath volume.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: pantherbackupmonitoring
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app: pantherbackupmonitoring
  template:
    metadata:
      labels:
        app: pantherbackupmonitoring
        prometheus-scrapable: "true"
    spec:
      terminationGracePeriodSeconds: 0
      # Schedule only on the node named "panther" ...
      nodeSelector:
        kubernetes.io/hostname: panther
      # ... and tolerate the control-plane / master NoSchedule taints.
      tolerations:
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
        - key: node-role.kubernetes.io/master
          operator: Exists
          effect: NoSchedule
      volumes:
        - name: pantherbackup
          hostPath:
            path: /var/lib/wamblee/etcd
      containers:
        - name: exporter
          image: docker.example.com/filemonitor:1.0
          # Each trailing argument is <type label>:<path inside the container>.
          args:
            - python3
            - -u
            - /exporter.py
            - backup:/backup/etcd-snapshot-latest.db
            - image:/backup/etcdimage
          ports:
            - name: http
              containerPort: 8080
              protocol: TCP
          # The host directory is mounted read-only at /backup.
          volumeMounts:
            - name: pantherbackup
              mountPath: /backup
              readOnly: true

View File

@ -0,0 +1,19 @@
# Local test setup for the file exporter.
services:
  filemonitor:
    image: docker.example.com/filemonitor:1.0
    build:
      context: .
    command:
      - python3
      - /exporter.py
      # Some paths for testing. exporter.py requires every argument in the
      # form <label>:<filepath>; an argument without ':' makes it fail at
      # startup.
      - 'x:/data/x.txt'
      - 'y:/data/y.txt'
    ports:
      - "8080:8080"
    volumes:
      # for testing.
      - /home/user/downloads:/data

View File

@ -0,0 +1,81 @@
import time
import prometheus_client
from prometheus_client import start_http_server, Gauge, Counter
import sys
import argparse
import os
import stat
from http.server import HTTPServer
# Unregister the GC, platform and process collectors from the default registry
# (presumably so that the scrape output contains only the file gauges below).
for _default_collector in (
    prometheus_client.GC_COLLECTOR,
    prometheus_client.PLATFORM_COLLECTOR,
    prometheus_client.PROCESS_COLLECTOR,
):
    prometheus_client.REGISTRY.unregister(_default_collector)
del _default_collector

# Gauges describing each monitored file. Both carry the labels "path" (the
# file path) and "type" (the label given on the command line).
FILE_TIME = Gauge(
    "file_time_seconds",
    "File last modification time",
    labelnames=['path', 'type'],
)
FILE_SIZE = Gauge(
    "file_size",
    "File size in bytes",
    labelnames=['path', 'type'],
)
class Handler(prometheus_client.MetricsHandler):
    """Metrics HTTP handler that refreshes the file gauges on every GET."""

    def do_GET(self) -> None:
        """Re-stat all monitored files, then serve the metrics response.

        For every (path, label) entry in FILES, the FILE_TIME and FILE_SIZE
        gauges are first set to 0 and then, if the path resolves to a regular
        file, to its modification time and size. A path that cannot be
        stat'ed or is not a regular file therefore reports 0 for both gauges.
        """
        for path, label in FILES.items():
            time_gauge = FILE_TIME.labels(path, label)
            size_gauge = FILE_SIZE.labels(path, label)
            # Default to 0; overwritten below only for an existing regular file.
            time_gauge.set(0)
            size_gauge.set(0)
            try:
                # os.stat follows symlinks.
                stats = os.stat(path)
            except (OSError, ValueError):
                # Missing, inaccessible or invalid path: keep the 0 values.
                # Only stat failures are swallowed, so KeyboardInterrupt and
                # SystemExit are no longer hidden.
                continue
            if stat.S_ISREG(stats.st_mode):
                time_gauge.set(stats.st_mtime)
                size_gauge.set(stats.st_size)
        return super().do_GET()
# Map of monitored file path -> value of the "type" label. Filled from the
# command line in the __main__ block below and read by Handler.do_GET.
FILES = {}

if __name__ == '__main__':
    DEFAULT_PORT = 8080
    parser = argparse.ArgumentParser(
        prog=sys.argv[0],
        description=f"""Statistics on a (backup) file,
Usage: {sys.argv[0]} [-p|--port <port>] <label1>:<filepath1> .... <labeln>:<filepathn>
Listens on port {DEFAULT_PORT} by default. It exposes statistics
on the monitored files to prometheus. Current metrics are
{FILE_TIME._name}{{path="/path/to/file",type="<label>"}}: file modification time in seconds since 1970
{FILE_SIZE._name}{{path="/path/to/file",type="<label>"}}: file size in bytes
If a path does not exist or is not a regular file then the value 0 is returned.
The exporter follows symlinks.
The syntax of each file is of the form <label>:<file> where <label> is the value of
the type label in the prometheus export.
""",
        epilog="Have a lot of fun!",
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("files", nargs="*", help="Files to monitor")
    parser.add_argument("-p", "--port", type=int, default=DEFAULT_PORT, help="Port to listen on")
    args = parser.parse_args()

    for filespec in args.files:
        # Split on the first ':' only, so the file path may itself contain ':'.
        label, separator, fname = filespec.partition(":")
        if not separator:
            # Report a usage error instead of an uncaught ValueError traceback.
            parser.error(f"invalid file argument '{filespec}': expected <label>:<filepath>")
        FILES[fname] = label

    PORT = args.port
    print(f"Monitoring files {FILES}")
    # Start up the server to expose the metrics.
    print(f"Listening on port {PORT}")
    HTTPServer(('0.0.0.0', PORT), Handler).serve_forever()

View File

@ -0,0 +1,13 @@
# Kustomize entry point: bundles the exporter manifests and the alerting rules
# into the "monitoring" namespace.
namespace: monitoring

generatorOptions:
  disableNameSuffixHash: true

resources:
  - deployment.yaml
  - service.yaml
  - servicemonitor.yaml
  - alertingrules.yaml

View File

@ -0,0 +1,17 @@
---
# Service in front of the exporter pods (selected by the "app" label); it
# exposes port 8080 under the name "http".
apiVersion: v1
kind: Service
metadata:
  name: pantherbackupexporter
  namespace: monitoring
  labels:
    app: pantherbackupmonitoring
spec:
  selector:
    app: pantherbackupmonitoring
  ports:
    - name: http
      port: 8080
      targetPort: 8080

View File

@ -0,0 +1,17 @@
# ServiceMonitor selecting Services labelled app=pantherbackupmonitoring and
# scraping /metrics on their port named "http".
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: pantherbackupmonitoring
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app: pantherbackupmonitoring
  targetLabels:
    - app
  endpoints:
    - port: http
      scheme: http
      path: /metrics
      scrapeTimeout: 30s
      honorLabels: true