refactor(infra): Restrukturierung Config-Ordner & Einführung von Docker-Profilen
Umfangreiches Refactoring der Projektkonfiguration zur klaren Trennung von Build-, Runtime- und Applikations-Logik.

Änderungen im Detail:

- Struktur: Neuorganisation des `config/` Verzeichnisses in logische Bereiche:
  - `config/docker`: Reine Infrastruktur-Configs (Postgres, Redis, Nginx, Monitoring).
  - `config/quality`: Statische Code-Analyse (Detekt, Lint).
  - `config/app`: Gemeinsame Spring-Boot-Konfigurationen.
- Docker Compose:
  - Einführung von Profilen (`infra`, `backend`, `ops`, `gui`, `tools`) für gezieltes Starten von Teilbereichen.
  - Anpassung aller Volume-Pfade auf die neue Struktur.
- Spring Boot Config:
  - Zentralisierung gemeinsamer Einstellungen (Datasource, Redis, JPA) in `config/app/base-application.yml`.
  - Parametrisierung der Hosts für nahtlosen Wechsel zwischen Docker und Localhost.
  - Bereinigung der service-spezifischen `application.yaml` Dateien (z.B. Ping-Service).
- Cleanup: Entfernen redundanter "Ghost-Files" (`versions.toml`, `central.toml`, `config/.env`), um eine echte Single Source of Truth (SSoT) zu gewährleisten.
This commit is contained in:
@@ -0,0 +1,68 @@
|
||||
# ===================================================================
# Prometheus Configuration for Meldestelle
# Features: Consul Service Discovery, Spring Boot Actuator support
# ===================================================================

global:
  # How often targets are scraped and how often rules are evaluated.
  scrape_interval: 15s
  evaluation_interval: 15s

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # NOTE(review): the original note here says Alertmanager is not
            # part of the Docker Compose setup yet and that this entry stays
            # commented out until the container is added. The target below
            # is active, however -- confirm which of the two is intended.
            - "alertmanager:9093"

# Alert rule files loaded by this Prometheus instance.
rule_files:
  - "/etc/prometheus/rules/alerts.yaml"

scrape_configs:
  # 1. Prometheus self-monitoring
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "localhost:9090"

  # 2. Consul self-monitoring (agent metrics in Prometheus format)
  - job_name: "consul"
    metrics_path: "/v1/agent/metrics"
    params:
      format:
        - "prometheus"
    static_configs:
      - targets:
          - "consul:8500"

  # 3. Spring Boot services via Consul discovery.
  # Prometheus asks Consul for the registered services and scrapes each
  # discovered instance; no per-service static configuration is needed.
  - job_name: "consul-services"
    consul_sd_configs:
      - server: "consul:8500"
        services: []  # empty list = all services

    # Order matters: relabel rules are applied top to bottom.
    relabel_configs:
      # Skip the Consul service itself; it is covered by the 'consul' job.
      - source_labels: [__meta_consul_service]
        regex: consul
        action: drop

      # Scrape every remaining service on the Spring Boot Actuator path.
      # Optional: this could also be driven by Consul tags instead.
      - source_labels: [__meta_consul_service]
        target_label: __metrics_path__
        replacement: /actuator/prometheus

      # Expose the Consul service name as the 'application' label.
      - source_labels: [__meta_consul_service]
        target_label: application

      # Build the 'instance' label as <address>:<service port>.
      - source_labels: [__meta_consul_address, __meta_consul_service_port]
        separator: ":"
        target_label: instance

  # 4. Postgres exporter (static target; no Consul client in the image)
  - job_name: "postgres-exporter"
    static_configs:
      - targets:
          - "postgres-exporter:9187"
|
||||
@@ -0,0 +1,73 @@
|
||||
groups:
  - name: meldestelle_alerts
    rules:
      # 1. Memory: JVM heap usage as a percentage of the configured maximum.
      - alert: HighMemoryUsage
        expr: (jvm_memory_used_bytes{area="heap"} / jvm_memory_max_bytes{area="heap"}) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage ({{ $value | humanize }}%)"
          description: "JVM Heap usage is above 85%.\n Instance: {{ $labels.instance }}"

      # 2. CPU: process_cpu_usage is scaled by 100 to compare as a percentage.
      - alert: HighCpuUsage
        expr: process_cpu_usage * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage ({{ $value | humanize }}%)"
          description: "CPU usage is above 85%.\n Instance: {{ $labels.instance }}"

      # 3. Error rate: share of 5xx responses per instance over 5 minutes.
      - alert: HighErrorRate
        # Aggregating 'by (instance)' keeps the instance label on the result,
        # so '{{ $labels.instance }}' in the description is populated; a bare
        # sum() would drop every label and leave it empty.
        # No explicit "requests > 0" guard is needed: with zero traffic the
        # division yields NaN, and NaN never satisfies '> 5'.
        expr: |
          (
            sum by (instance) (rate(http_server_requests_seconds_count{status=~"5.."}[5m]))
            /
            sum by (instance) (rate(http_server_requests_seconds_count[5m]))
          ) * 100 > 5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "High error rate ({{ $value | humanize }}%)"
          description: "More than 5% of requests resulted in 5xx errors.\n Instance: {{ $labels.instance }}"

      # 4. Service down: applies to every scrape job, without a job filter.
      - alert: ServiceDown
        # 'up == 0' means the target is configured but the scrape failed.
        # There is no job matcher, so all jobs of the Prometheus config are
        # covered (e.g. prometheus, consul, consul-services,
        # postgres-exporter).
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "Service instance {{ $labels.instance }} of job {{ $labels.job }} is not reachable."

      # 5. Slow response: average request duration over the 5-minute window.
      - alert: SlowResponseTime
        # rate(sum) / rate(count) gives the mean duration per request. It is
        # evaluated per series, so the labels of the matching series stay
        # available to the annotations (assumes the metric carries a 'uri'
        # label -- TODO confirm).
        expr: rate(http_server_requests_seconds_sum[5m]) / rate(http_server_requests_seconds_count[5m]) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow response time ({{ $value | humanizeDuration }})"
          description: "Average response time is > 1s for the last 5 minutes.\n Instance: {{ $labels.instance }}\n Path: {{ $labels.uri }}"

      # 6. GC pause: average duration per GC pause event.
      - alert: HighGcPauseTime
        # rate(sum) / rate(count) is the mean length of a single GC pause in
        # the 5-minute window. It is NOT the share of wall-clock time spent
        # in GC; that would be rate(jvm_gc_pause_seconds_sum[5m]) on its own.
        expr: rate(jvm_gc_pause_seconds_sum[5m]) / rate(jvm_gc_pause_seconds_count[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High GC pause time ({{ $value | humanizeDuration }})"
          description: "Average GC pause is > 0.5s.\n Instance: {{ $labels.instance }}"
|
||||
Reference in New Issue
Block a user