refactoring(infra-monitoring)

This commit is contained in:
2025-08-15 23:00:21 +02:00
parent 69ca3faf91
commit 689e3bf8e7
8 changed files with 1028 additions and 29 deletions
@@ -1,10 +1,10 @@
global:
resolve_timeout: 5m
# SMTP configuration for email alerts
smtp_smarthost: 'smtp.example.com:587'
smtp_from: 'alertmanager@meldestelle.at'
smtp_auth_username: 'alertmanager@meldestelle.at'
smtp_auth_password: 'password' # Use environment variable in production
# SMTP configuration for email alerts - use environment variables
smtp_smarthost: '${SMTP_SMARTHOST:-smtp.example.com:587}'
smtp_from: '${SMTP_FROM:-alertmanager@meldestelle.at}'
smtp_auth_username: '${SMTP_AUTH_USERNAME:-alertmanager@meldestelle.at}'
smtp_auth_password: '${SMTP_AUTH_PASSWORD}'
smtp_require_tls: true
# The root route on which each incoming alert enters.
@@ -55,8 +55,8 @@ receivers:
- name: 'slack-critical'
slack_configs:
- api_url: 'https://hooks.slack.com/services/REPLACE_WITH_YOUR_WEBHOOK_URL'
channel: '#alerts-critical'
- api_url: '${SLACK_WEBHOOK_URL_CRITICAL}'
channel: '${SLACK_CHANNEL_CRITICAL:-#alerts-critical}'
send_resolved: true
title: '{{ .CommonAnnotations.summary }}'
text: >-
@@ -69,8 +69,8 @@ receivers:
- name: 'slack-warnings'
slack_configs:
- api_url: 'https://hooks.slack.com/services/REPLACE_WITH_YOUR_WEBHOOK_URL'
channel: '#alerts-warnings'
- api_url: '${SLACK_WEBHOOK_URL_WARNINGS}'
channel: '${SLACK_CHANNEL_WARNINGS:-#alerts-warnings}'
send_resolved: true
title: '{{ .CommonAnnotations.summary }}'
text: >-
@@ -0,0 +1,389 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Meldestelle Application Overview Dashboard - Key metrics and health indicators",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "reqps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "rate(http_server_requests_seconds_count{application=\"meldestelle\"}[5m])",
"interval": "",
"legendFormat": "{{method}} {{uri}}",
"refId": "A"
}
],
"title": "HTTP Request Rate",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 1
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"values": false,
"calcs": [
"lastNotNull"
],
"fields": ""
},
"textMode": "auto"
},
"pluginVersion": "8.5.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "up{application=\"meldestelle\"}",
"interval": "",
"legendFormat": "{{instance}}",
"refId": "A"
}
],
"title": "Application Status",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "ms"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"id": 3,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.95, rate(http_server_requests_seconds_bucket{application=\"meldestelle\"}[5m])) * 1000",
"interval": "",
"legendFormat": "95th percentile",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.50, rate(http_server_requests_seconds_bucket{application=\"meldestelle\"}[5m])) * 1000",
"interval": "",
"legendFormat": "50th percentile",
"refId": "B"
}
],
"title": "HTTP Response Times",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 12,
"y": 8
},
"id": 4,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "rate(http_server_requests_seconds_count{application=\"meldestelle\",status=~\"[45].*\"}[5m]) / rate(http_server_requests_seconds_count{application=\"meldestelle\"}[5m]) * 100",
"interval": "",
"legendFormat": "Error Rate",
"refId": "A"
}
],
"title": "Error Rate",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 36,
"style": "dark",
"tags": [
"meldestelle",
"application",
"overview"
],
"templating": {
"list": []
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Meldestelle - Application Overview",
"uid": "meldestelle-app-overview",
"version": 1,
"weekStart": ""
}
@@ -0,0 +1,599 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Infrastructure Components Dashboard - Monitoring of PostgreSQL, Redis, Kafka, and other supporting services",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 1
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"values": false,
"calcs": [
"lastNotNull"
],
"fields": ""
},
"textMode": "auto"
},
"pluginVersion": "8.5.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "up{job=\"postgres\"}",
"interval": "",
"legendFormat": "PostgreSQL",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "up{job=\"redis\"}",
"interval": "",
"legendFormat": "Redis",
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "up{job=\"kafka\"}",
"interval": "",
"legendFormat": "Kafka",
"refId": "C"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "up{job=\"keycloak\"}",
"interval": "",
"legendFormat": "Keycloak",
"refId": "D"
}
],
"title": "Infrastructure Services Status",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 4
},
"id": 2,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"interval": "",
"legendFormat": "CPU Usage",
"refId": "A"
}
],
"title": "System CPU Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 4
},
"id": 3,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes",
"interval": "",
"legendFormat": "Memory Used",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "node_memory_MemTotal_bytes",
"interval": "",
"legendFormat": "Memory Total",
"refId": "B"
}
],
"title": "System Memory Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 12
},
"id": 4,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "pg_stat_database_numbackends{job=\"postgres\"}",
"interval": "",
"legendFormat": "{{datname}}",
"refId": "A"
}
],
"title": "PostgreSQL Active Connections",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 12
},
"id": 5,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "redis_connected_clients{job=\"redis\"}",
"interval": "",
"legendFormat": "Connected Clients",
"refId": "A"
}
],
"title": "Redis Connected Clients",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 12
},
"id": 6,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "kafka_server_brokertopicmetrics_messagesin_total{job=\"kafka\"}",
"interval": "",
"legendFormat": "{{topic}}",
"refId": "A"
}
],
"title": "Kafka Messages In",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 36,
"style": "dark",
"tags": [
"meldestelle",
"infrastructure",
"postgres",
"redis",
"kafka"
],
"templating": {
"list": []
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Meldestelle - Infrastructure Components",
"uid": "meldestelle-infrastructure",
"version": 1,
"weekStart": ""
}
+3 -16
View File
@@ -23,31 +23,18 @@ scrape_configs:
- targets: ["localhost:9090"]
# Scrape configuration for the Meldestelle application
- job_name: "meldestelle-server"
metrics_path: /metrics
- job_name: "meldestelle-api"
metrics_path: /actuator/prometheus
scrape_interval: 10s
basic_auth:
username: ${METRICS_USER:-metrics}
password: ${METRICS_PASSWORD:-metrics-password-change-in-production}
password: ${METRICS_PASSWORD:-metrics-password-dev}
static_configs:
- targets: ["server:8081"]
labels:
application: "meldestelle"
service: "api-gateway"
# JVM metrics for the Meldestelle application
- job_name: "meldestelle-jvm"
metrics_path: /metrics
scrape_interval: 10s
basic_auth:
username: ${METRICS_USER:-metrics}
password: ${METRICS_PASSWORD:-metrics-password-change-in-production}
static_configs:
- targets: ["server:8081"]
labels:
application: "meldestelle"
service: "jvm"
# Node exporter for host metrics (if added later)
# - job_name: "node-exporter"
# static_configs:
@@ -45,5 +45,27 @@ Das vollständige Monitoring-Setup besteht aus mehreren Teilen:
Diese Kombination aus Micrometer, Prometheus, Zipkin und Grafana bildet einen leistungsstarken, branchenüblichen "Observability Stack".
## Neue Funktionen und Optimierungen
### Sicherheitsverbesserungen
* **Umgebungsvariablen für Credentials**: Alle hardcodierten Passwörter und API-Schlüssel wurden durch Umgebungsvariablen ersetzt
* **Alertmanager-Konfiguration**: SMTP- und Slack-Einstellungen nutzen jetzt sichere Umgebungsvariablen
* **Prometheus-Authentifizierung**: Metriken-Endpunkte sind durch Benutzername/Passwort geschützt
### Performance-Optimierungen
* **Konfigurierbare Tracing-Sampling-Rate**: Standard 100% für Entwicklung, über `TRACING_SAMPLING_PROBABILITY` anpassbar für Produktion
* **Optimierte Prometheus-Konfiguration**: Korrigierte Metriken-Pfade und eliminierte doppelte Jobs
* **Verbesserte Speicher-Retention**: Produktion nutzt 30 Tage Retention und WAL-Komprimierung
### Erweiterte Dashboards
* **Application Overview Dashboard**: Zentrale Anwendungsmetriken (Request Rate, Response Times, Error Rate, Status)
* **Infrastructure Components Dashboard**: Überwachung von PostgreSQL, Redis, Kafka, System-Metriken
* **JVM Dashboard**: Bestehende JVM-Metriken für Java-Anwendungen
### Konfigurationsverbesserungen
* **Einheitliche Endpunkt-Pfade**: Verwendung von `/actuator/prometheus` für alle Services
* **Umgebungsspezifische Konfiguration**: Getrennte Einstellungen für Entwicklung und Produktion
* **Erweiterte ELK-Integration**: Vollständige Logging-Pipeline mit Elasticsearch und Logstash
---
**Letzte Aktualisierung**: 31. Juli 2025
**Letzte Aktualisierung**: 15. August 2025
@@ -12,8 +12,9 @@ management.endpoints.web.exposure.include=health,info,prometheus
# --- Micrometer Tracing ---
# Aktiviert das Tracing
management.tracing.enabled=true
# Definiert, dass Traces immer gesammelt werden sollen (1.0 = 100%)
management.tracing.sampling.probability=1.0
# Definiert Sampling-Rate für Traces (kann per Umgebungsvariable überschrieben werden)
# Entwicklung: 1.0 (100%), Produktion sollte niedriger sein (z.B. 0.1 = 10%)
management.tracing.sampling.probability=${TRACING_SAMPLING_PROBABILITY:1.0}
# --- Micrometer Observation (für Metriken UND Tracing) ---
# aktiviert die "Beobachtung" von HTTP Server Requests.
@@ -24,6 +24,7 @@ dependencies {
// Abhängigkeiten für den Zipkin-Server und seine UI.
implementation(libs.zipkin.server)
implementation(libs.zipkin.autoconfigure.ui)
// Stellt alle Test-Abhängigkeiten gebündelt bereit.
testImplementation(projects.platform.platformTesting)
@@ -19,6 +19,6 @@ class MonitoringServerApplicationTest {
@Test
fun `context loads successfully`() {
// Test ist bestanden, wenn der Kontext ohne Exception startet.
// Der Test ist bestanden, wenn der Kontext ohne Exception startet.
}
}