fix:
This commit is contained in:
@@ -0,0 +1,46 @@
|
||||
# Prometheus configuration for Meldestelle project
|
||||
# Basic configuration to enable service monitoring
|
||||
|
||||
global:
  # Default interval between scrapes of each target (jobs may override it).
  scrape_interval: 15s
  # Interval at which alerting/recording rules are evaluated.
  evaluation_interval: 15s
|
||||
|
||||
# Alertmanager configuration: fired alerts are sent to these instances.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"
|
||||
|
||||
# Rule files loaded by Prometheus.
rule_files:
  - "/etc/prometheus/rules/alerts.yaml"
|
||||
|
||||
scrape_configs:
  # Job 1: Prometheus monitors itself.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Job 2: API Gateway (Spring Boot Actuator).
  - job_name: 'api-gateway'
    metrics_path: '/actuator/prometheus'
    # Overrides the global scrape interval for this job.
    scrape_interval: 30s
    static_configs:
      - targets: ['api-gateway:8081']

  # Job 3: Postgres (CAUTION).
  # Scraping Postgres directly on port 5432 does not work.
  # Either keep this job commented out or add a 'postgres-exporter' container.
  # - job_name: 'postgres-exporter'
  #   static_configs:
  #     - targets: ['postgres-exporter:9187']

  # Job 4: Consul, for service discovery monitoring.
  - job_name: 'consul'
    metrics_path: '/v1/agent/metrics'
    # Sent as the query string ?format=prometheus on every scrape.
    params:
      format: ['prometheus']
    static_configs:
      - targets: ['consul:8500']
|
||||
|
||||
|
||||
@@ -0,0 +1,73 @@
|
||||
groups:
  - name: meldestelle_alerts
    rules:
      # 1. Memory: heap used as a percentage of heap max, above 85% for 5 minutes.
      - alert: HighMemoryUsage
        expr: (jvm_memory_used_bytes{area="heap"} / jvm_memory_max_bytes{area="heap"}) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage ({{ $value | humanize }}%)"
          description: "JVM Heap usage is above 85%.\n Instance: {{ $labels.instance }}"

      # 2. CPU: process CPU usage above 85% for 5 minutes.
      - alert: HighCpuUsage
        expr: process_cpu_usage * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage ({{ $value | humanize }}%)"
          description: "CPU usage is above 85%.\n Instance: {{ $labels.instance }}"

      # 3. Error rate: share of 5xx responses among all requests, per instance.
      - alert: HighErrorRate
        # Aggregated by instance so that {{ $labels.instance }} in the
        # description is populated (a bare sum() drops every label).
        # No explicit "requests > 0" guard is needed: with no traffic the
        # division yields NaN or no sample, and neither satisfies "> 5".
        expr: |
          (
            sum by (instance) (rate(http_server_requests_seconds_count{status=~"5.."}[5m]))
            /
            sum by (instance) (rate(http_server_requests_seconds_count[5m]))
          ) * 100 > 5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "High error rate ({{ $value | humanize }}%)"
          description: "More than 5% of requests resulted in 5xx errors.\n Instance: {{ $labels.instance }}"

      # 4. Service down: applies to every scrape job (no job-name filter).
      - alert: ServiceDown
        # Covers all jobs defined in prometheus.yml (api-gateway, consul, ...).
        # 'up == 0' means: the target is configured but could not be scraped.
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "Service instance {{ $labels.instance }} of job {{ $labels.job }} is not reachable."

      # 5. Slow response: uses rate() over the duration sum and request count.
      - alert: SlowResponseTime
        # Average duration per request within the 5-minute window,
        # computed separately for each time series.
        expr: rate(http_server_requests_seconds_sum[5m]) / rate(http_server_requests_seconds_count[5m]) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow response time ({{ $value | humanizeDuration }})"
          description: "Average response time is > 1s for the last 5 minutes.\n Instance: {{ $labels.instance }}\n Path: {{ $labels.uri }}"

      # 6. GC pause: uses rate() over the pause sum and pause count.
      - alert: HighGcPauseTime
        # Average duration per GC event within the 5-minute window.
        # (Not the GC time spent per second; that would be
        # rate(jvm_gc_pause_seconds_sum[5m]) on its own.)
        expr: rate(jvm_gc_pause_seconds_sum[5m]) / rate(jvm_gc_pause_seconds_count[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High GC pause time ({{ $value | humanizeDuration }})"
          description: "Average GC pause is > 0.5s.\n Instance: {{ $labels.instance }}"
|
||||
Reference in New Issue
Block a user