diff --git a/MONITORING_SETUP.md b/MONITORING_SETUP.md new file mode 100644 index 00000000..530dc833 --- /dev/null +++ b/MONITORING_SETUP.md @@ -0,0 +1,199 @@ +# Meldestelle Monitoring System + +This document describes the monitoring system set up for the Meldestelle application. The monitoring system includes metrics collection, visualization, centralized logging, and alerting. + +## Components + +The monitoring system consists of the following components: + +1. **Prometheus** - For metrics collection and storage +2. **Grafana** - For metrics visualization and dashboards +3. **ELK Stack** - For centralized logging (Elasticsearch, Logstash, Kibana) +4. **Alertmanager** - For alert management and notifications + +## Architecture + +The monitoring system is deployed as Docker containers alongside the Meldestelle application. The components interact as follows: + +- The Meldestelle application exposes metrics at the `/metrics` endpoint +- Prometheus scrapes metrics from the application and stores them +- Grafana visualizes the metrics from Prometheus +- The application sends logs to Logstash +- Logstash processes the logs and sends them to Elasticsearch +- Kibana visualizes the logs from Elasticsearch +- Prometheus evaluates alerting rules and sends alerts to Alertmanager +- Alertmanager manages alerts and sends notifications via configured channels (email, Slack, etc.) + +## Setup + +The monitoring system is configured in the `docker-compose.yml` file and the configuration files in the `config/monitoring` directory. 
+ +### Prerequisites + +- Docker and Docker Compose +- The Meldestelle application running with metrics enabled + +### Starting the Monitoring System + +To start the monitoring system, run: + +```bash +docker-compose up -d prometheus grafana alertmanager +``` + +To start the ELK Stack, run: + +```bash +docker-compose up -d elasticsearch logstash kibana +``` + +### Testing the Monitoring System + +A test script is provided to verify that the monitoring system is working correctly: + +```bash +./test-monitoring.sh +``` + +## Accessing the Monitoring Tools + +- **Prometheus**: http://localhost:9090 +- **Grafana**: http://localhost:3000 (default credentials: admin/admin) +- **Alertmanager**: http://localhost:9093 +- **Kibana**: http://localhost:5601 + +## Metrics + +The following metrics are collected by Prometheus: + +### JVM Metrics + +- Memory usage (heap and non-heap) +- Garbage collection statistics +- Thread counts +- Class loading statistics +- CPU usage + +### Application Metrics + +- HTTP request counts +- HTTP request durations +- Error rates +- Custom business metrics + +## Dashboards + +Grafana dashboards are provided for visualizing the metrics: + +- **JVM Dashboard**: Shows JVM metrics such as memory usage, garbage collection, and thread counts +- **Application Dashboard**: Shows application metrics such as request rates, error rates, and response times + +## Alerting + +Alerting is configured in Prometheus and Alertmanager. 
The following alerts are defined: + +- **High Memory Usage**: Triggered when JVM heap memory usage exceeds 85% for 5 minutes +- **High CPU Usage**: Triggered when CPU usage exceeds 85% for 5 minutes +- **High Error Rate**: Triggered when the error rate exceeds 5% for 2 minutes +- **Service Unavailable**: Triggered when the service is down for 1 minute +- **Slow Response Time**: Triggered when the average response time exceeds 1 second for 5 minutes +- **High GC Pause Time**: Triggered when the average GC pause time exceeds 0.5 seconds for 5 minutes + +Alerts are sent to the configured notification channels (email and Slack). + +## Logging + +Logs are collected by Logstash, stored in Elasticsearch, and visualized in Kibana. The following log sources are configured: + +- Application logs via TCP (JSON format) +- File logs from the `/var/log/meldestelle` directory + +## Configuration Files + +- **Prometheus**: `config/monitoring/prometheus.yml` +- **Alertmanager**: `config/monitoring/alertmanager/alertmanager.yml` +- **Alerting Rules**: `config/monitoring/prometheus/rules/alerts.yml` +- **Grafana Dashboards**: `config/monitoring/grafana/dashboards/` +- **Grafana Datasources**: `config/monitoring/grafana/provisioning/datasources/` +- **Logstash**: `config/monitoring/elk/logstash.conf` +- **Elasticsearch**: `config/monitoring/elk/elasticsearch.yml` + +## Troubleshooting + +### Prometheus + +- Check if Prometheus is running: `docker-compose ps prometheus` +- Check Prometheus logs: `docker-compose logs prometheus` +- Verify that Prometheus can scrape metrics: http://localhost:9090/targets +- Check if alerting rules are loaded: http://localhost:9090/rules + +### Grafana + +- Check if Grafana is running: `docker-compose ps grafana` +- Check Grafana logs: `docker-compose logs grafana` +- Verify that Grafana can connect to Prometheus: http://localhost:3000/datasources + +### Alertmanager + +- Check if Alertmanager is running: `docker-compose ps alertmanager` +- Check 
Alertmanager logs: `docker-compose logs alertmanager` +- Verify that Alertmanager is receiving alerts: http://localhost:9093/#/alerts + +### ELK Stack + +- Check if Elasticsearch is running: `docker-compose ps elasticsearch` +- Check Elasticsearch logs: `docker-compose logs elasticsearch` +- Check if Logstash is running: `docker-compose ps logstash` +- Check Logstash logs: `docker-compose logs logstash` +- Check if Kibana is running: `docker-compose ps kibana` +- Check Kibana logs: `docker-compose logs kibana` +- Verify that Elasticsearch is receiving logs: http://localhost:9200/_cat/indices +- Verify that Kibana can connect to Elasticsearch: http://localhost:5601/app/management/kibana/indexPatterns + +## Maintenance + +### Backup and Restore + +- Prometheus data is stored in the `prometheus_data` volume +- Grafana data is stored in the `grafana_data` volume +- Alertmanager data is stored in the `alertmanager_data` volume +- Elasticsearch data is stored in the `elasticsearch_data` volume + +To back up these volumes, archive each one from a temporary container (shown here for `prometheus_data`; repeat the command for the other volumes): + +```bash +docker run --rm -v prometheus_data:/source -v $(pwd)/backup:/backup alpine tar -czf /backup/prometheus_data.tar.gz -C /source . 
+``` + +To restore from a backup: + +```bash +docker run --rm -v prometheus_data:/target -v $(pwd)/backup:/backup alpine sh -c "rm -rf /target/* && tar -xzf /backup/prometheus_data.tar.gz -C /target" +``` + +### Updating + +To update the monitoring components, update the image tags in the `docker-compose.yml` file and run: + +```bash +docker-compose pull prometheus grafana alertmanager +docker-compose up -d prometheus grafana alertmanager +``` + +## Security Considerations + +- The monitoring system is configured for development and testing purposes +- For production use, consider the following security measures: + - Enable authentication for Prometheus + - Use strong passwords for Grafana + - Configure TLS for all components + - Restrict access to the monitoring endpoints + - Use environment variables for sensitive configuration values + - Implement network segmentation to isolate the monitoring system + +## Further Reading + +- [Prometheus Documentation](https://prometheus.io/docs/introduction/overview/) +- [Grafana Documentation](https://grafana.com/docs/grafana/latest/) +- [Alertmanager Documentation](https://prometheus.io/docs/alerting/latest/alertmanager/) +- [ELK Stack Documentation](https://www.elastic.co/guide/index.html) diff --git a/SERVICE_DISCOVERY_IMPLEMENTATION.md b/SERVICE_DISCOVERY_IMPLEMENTATION.md index a414b282..e041306b 100644 --- a/SERVICE_DISCOVERY_IMPLEMENTATION.md +++ b/SERVICE_DISCOVERY_IMPLEMENTATION.md @@ -62,20 +62,6 @@ implementation("io.ktor:ktor-client-cio:${libs.versions.ktor.get()}") Create a service registration component in the shared-kernel module: ```kotlin -package at.mocode.shared.discovery - -import at.mocode.shared.config.AppConfig -import com.orbitz.consul.Consul -import com.orbitz.consul.model.agent.ImmutableRegistration -import com.orbitz.consul.model.agent.Registration -import kotlinx.coroutines.CoroutineScope -import kotlinx.coroutines.Dispatchers -import kotlinx.coroutines.delay -import kotlinx.coroutines.launch 
-import java.net.InetAddress -import java.util.* -import kotlin.time.Duration.Companion.seconds - class ServiceRegistration( private val serviceName: String, private val servicePort: Int, @@ -222,17 +208,6 @@ implementation("io.ktor:ktor-serialization-kotlinx-json:${libs.versions.ktor.get Create a service discovery component in the API Gateway: ```kotlin -package at.mocode.gateway.discovery - -import com.orbitz.consul.Consul -import com.orbitz.consul.model.health.ServiceHealth -import io.ktor.client.* -import io.ktor.client.engine.cio.* -import io.ktor.client.request.* -import io.ktor.http.* -import java.net.URI -import java.util.concurrent.ConcurrentHashMap - class ServiceDiscovery( private val consulHost: String = "consul", private val consulPort: Int = 8500 diff --git a/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/config/AuthorizationConfig.kt b/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/config/AuthorizationConfig.kt index 225c0821..d06449fc 100644 --- a/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/config/AuthorizationConfig.kt +++ b/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/config/AuthorizationConfig.kt @@ -157,7 +157,7 @@ private fun getRolePermissions(roles: List): List { roles.forEach { role -> when (role) { UserRole.ADMIN -> { - permissions.addAll(Permission.values()) + permissions.addAll(Permission.entries.toTypedArray()) } UserRole.VEREINS_ADMIN -> { permissions.addAll(listOf( @@ -354,7 +354,7 @@ val PipelineContext.userAuthContext: UserAuthContext? get() = call.principal()?.getUserAuthContext() /** - * Application call extension to check if user has specific role. + * Application call extension to check if the user has a specific role. */ fun ApplicationCall.hasRole(role: UserRole): Boolean { val authContext = principal()?.getUserAuthContext() @@ -362,7 +362,7 @@ fun ApplicationCall.hasRole(role: UserRole): Boolean { } /** - * Application call extension to check if user has specific permission. 
+ * Application call extension to check if the user has a specific permission. */ fun ApplicationCall.hasPermission(permission: Permission): Boolean { val authContext = principal()?.getUserAuthContext() diff --git a/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/config/CachingConfig.kt b/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/config/CachingConfig.kt index b2bfb5da..513af3fc 100644 --- a/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/config/CachingConfig.kt +++ b/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/config/CachingConfig.kt @@ -95,23 +95,23 @@ class CachingConfig( } /** - * Put a value in cache with TTL in minutes + * Put a value in the cache with TTL in minutes */ fun put(cacheName: String, key: String, value: T, ttlMinutes: Long = defaultTtlMinutes) { val stats = cacheStats.computeIfAbsent(cacheName) { CacheStats() } stats.puts++ - // Store in local cache + // Store in the local cache val expiresAt = System.currentTimeMillis() + TimeUnit.MINUTES.toMillis(ttlMinutes) val entry = CacheEntry(value as Any, expiresAt) getCacheMap(cacheName)[key] = entry } /** - * Remove a value from cache + * Remove a value from the cache */ fun remove(cacheName: String, key: String) { - // Remove from local cache + // Remove from the local cache getCacheMap(cacheName).remove(key) } @@ -136,7 +136,7 @@ class CachingConfig( } /** - * Get the appropriate cache map based on cache name + * Get the appropriate cache map based on the cache name */ private fun getCacheMap(cacheName: String): ConcurrentHashMap> { return when (cacheName) { diff --git a/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/config/CustomMetricsConfig.kt b/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/config/CustomMetricsConfig.kt index d12f21ff..79bb2f40 100644 --- a/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/config/CustomMetricsConfig.kt +++ b/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/config/CustomMetricsConfig.kt @@ -1,18 +1,13 @@ package at.mocode.gateway.config import 
io.ktor.server.application.* -import io.ktor.server.plugins.* import io.ktor.server.request.* import io.ktor.server.routing.* import io.ktor.util.* import io.micrometer.core.instrument.Counter -import io.micrometer.core.instrument.MeterRegistry import io.micrometer.core.instrument.Timer -import io.micrometer.core.instrument.binder.MeterBinder import io.micrometer.prometheus.PrometheusMeterRegistry -import java.time.Duration import java.util.concurrent.ConcurrentHashMap -import java.util.concurrent.TimeUnit /** * Custom application metrics configuration. diff --git a/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/config/MonitoringConfig.kt b/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/config/MonitoringConfig.kt index abacbecf..e87d8993 100644 --- a/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/config/MonitoringConfig.kt +++ b/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/config/MonitoringConfig.kt @@ -4,19 +4,10 @@ import at.mocode.dto.base.ApiResponse import at.mocode.shared.config.AppConfig import io.ktor.http.* import io.ktor.server.application.* -import io.ktor.server.metrics.micrometer.* import io.ktor.server.plugins.calllogging.* import io.ktor.server.plugins.statuspages.* import io.ktor.server.request.* import io.ktor.server.response.* -import io.ktor.server.routing.* -import io.micrometer.core.instrument.binder.jvm.ClassLoaderMetrics -import io.micrometer.core.instrument.binder.jvm.JvmGcMetrics -import io.micrometer.core.instrument.binder.jvm.JvmMemoryMetrics -import io.micrometer.core.instrument.binder.jvm.JvmThreadMetrics -import io.micrometer.core.instrument.binder.system.ProcessorMetrics -import io.micrometer.prometheus.PrometheusConfig -import io.micrometer.prometheus.PrometheusMeterRegistry import org.slf4j.event.Level import java.time.LocalDateTime import java.time.format.DateTimeFormatter diff --git a/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/discovery/ServiceDiscovery.kt 
b/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/discovery/ServiceDiscovery.kt index 62e8b503..38dda222 100644 --- a/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/discovery/ServiceDiscovery.kt +++ b/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/discovery/ServiceDiscovery.kt @@ -131,7 +131,7 @@ class ServiceDiscovery( * @return The complete URL */ fun buildServiceUrl(instance: ServiceInstance, path: String): String { - val baseUrl = "http://${instance.host}:${instance.port}" + val baseUrl = "https://${instance.host}:${instance.port}" return URI(baseUrl).resolve(path).toString() } @@ -143,7 +143,7 @@ class ServiceDiscovery( */ suspend fun isServiceHealthy(serviceName: String): Boolean { try { - val response = httpClient.get("http://$consulHost:$consulPort/v1/health/service/$serviceName?passing=true") + val response = httpClient.get("https://$consulHost:$consulPort/v1/health/service/$serviceName?passing=true") val responseBody = response.bodyAsText() val healthyServices = Json.decodeFromString>(responseBody) return healthyServices.isNotEmpty() diff --git a/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/plugins/HttpCaching.kt b/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/plugins/HttpCaching.kt index 984b0473..7f435107 100644 --- a/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/plugins/HttpCaching.kt +++ b/api-gateway/src/jvmMain/kotlin/at/mocode/gateway/plugins/HttpCaching.kt @@ -1,6 +1,5 @@ package at.mocode.gateway.plugins -import at.mocode.gateway.config.CachingConfig import at.mocode.gateway.config.getCachingConfig import io.ktor.http.* import io.ktor.server.application.* @@ -10,7 +9,6 @@ import io.ktor.util.pipeline.* import java.security.MessageDigest import java.text.SimpleDateFormat import java.util.* -import kotlin.text.Charsets /** * Configures enhanced HTTP caching headers for the application. 
@@ -190,7 +188,7 @@ suspend fun PipelineContext.checkLastModifiedAndRespond(t call.respond(HttpStatusCode.NotModified) return true } - } catch (e: Exception) { + } catch (_: Exception) { // If we can't parse the date, ignore it } } @@ -217,7 +215,7 @@ suspend fun PipelineContext.checkCacheAndRespond( val application = call.application val cachingConfig = try { application.getCachingConfig() - } catch (e: Exception) { + } catch (_: Exception) { return false } diff --git a/settings.gradle.kts b/settings.gradle.kts index 21b4fbaf..9c85d0db 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -25,7 +25,7 @@ dependencyResolutionManagement { includeGroupAndSubgroups("com.google") } } - // Add JCenter repository (archive) + // Add a JCenter repository (archive) maven { url = uri("https://jcenter.bintray.com") }