meldestelle/scripts/test/test-monitoring.sh
2025-07-25 23:16:16 +02:00

506 lines
17 KiB
Bash
Executable File

#!/bin/bash
# =============================================================================
# Enhanced Monitoring Setup Test Script
# =============================================================================
# This script provides comprehensive testing of the monitoring setup including
# Prometheus, Grafana, and Alertmanager with improved error handling, retry
# logic, cleanup options, and configuration validation.
# =============================================================================
# Load common utilities. The helpers used throughout this script but not
# defined in it (log_*, print_status, check_*, run_with_timeout, print_summary)
# are expected to come from this file.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=../utils/common.sh
if ! source "$SCRIPT_DIR/../utils/common.sh"; then
  # The logging helpers are unavailable at this point, so report with a plain
  # echo; diagnostics go to stderr so they never mix with captured output.
  echo "Error: Could not load common utilities from $SCRIPT_DIR/../utils/common.sh" >&2
  exit 1
fi
# =============================================================================
# Configuration
# =============================================================================
# Compose file can be overridden from the environment; services are fixed.
declare -r COMPOSE_FILE="${COMPOSE_FILE:-docker-compose.yml}"
declare -ra MONITORING_SERVICES=("prometheus" "grafana" "alertmanager")

# Timing and retry knobs (seconds / attempts)
declare -r STARTUP_TIMEOUT=120
declare -r HEALTH_CHECK_TIMEOUT=30
declare -r RETRY_COUNT=3
declare -r RETRY_DELAY=10

# Base URLs of the services under test
declare -r PROMETHEUS_URL="http://localhost:9090"
declare -r GRAFANA_URL="http://localhost:3000"
declare -r ALERTMANAGER_URL="http://localhost:9093"

# Configuration files whose presence (and basic content) is validated
declare -ra CONFIG_FILES=(
  "config/monitoring/prometheus.yml"
  "config/monitoring/grafana/provisioning/dashboards/dashboard.yml"
  "config/monitoring/grafana/provisioning/datasources/prometheus.yml"
)
# =============================================================================
# Cleanup Function
# =============================================================================
#######################################
# Stop (and optionally remove) the monitoring containers after a test run.
# Globals:
#   CLEANUP_SERVICES    (read) any value other than "true" skips cleanup
#                       (default: true)
#   REMOVE_CONTAINERS   (read) "true" additionally removes the containers
#                       (default: false)
#   COMPOSE_FILE        (read) compose file handed to docker-compose
#   MONITORING_SERVICES (read) services that are stopped / removed
#######################################
cleanup() {
  if [[ "${CLEANUP_SERVICES:-true}" == "true" ]]; then
    log_info "Cleaning up monitoring services..."

    # Derive the match pattern from MONITORING_SERVICES so the "is anything
    # running?" check cannot drift from the list that is actually stopped.
    # The join happens in a subshell so IFS is not changed for the caller.
    local service_pattern
    service_pattern=$(IFS='|'; printf '%s' "${MONITORING_SERVICES[*]}")

    # Stop monitoring services. An empty pattern would match every line and
    # lead to a `stop` without service arguments, so it is guarded explicitly.
    # grep -E is used because `\|` alternation is a GNU extension to basic
    # regular expressions.
    if [[ -n "$service_pattern" ]] \
      && docker-compose -f "$COMPOSE_FILE" ps | grep -Eq -- "$service_pattern"; then
      log_info "Stopping monitoring services..."
      # Best effort: a failure to stop must not abort the rest of the cleanup.
      docker-compose -f "$COMPOSE_FILE" stop "${MONITORING_SERVICES[@]}" >/dev/null 2>&1 || true
    fi

    # Remove containers if requested
    if [[ "${REMOVE_CONTAINERS:-false}" == "true" ]]; then
      log_info "Removing monitoring containers..."
      docker-compose -f "$COMPOSE_FILE" rm -f "${MONITORING_SERVICES[@]}" >/dev/null 2>&1 || true
    fi

    log_info "Cleanup completed"
  else
    log_info "Cleanup skipped (CLEANUP_SERVICES=false)"
  fi
}
# =============================================================================
# Configuration Validation Functions
# =============================================================================
#######################################
# Validate the compose file, the presence of the required monitoring
# services and the monitoring configuration files.
# Globals:
#   COMPOSE_FILE        (read) compose file to validate
#   MONITORING_SERVICES (read) services that must be defined
#   CONFIG_FILES        (read) configuration files to look for
# Returns:
#   0 when the compose file is valid and every required service is defined;
#   1 when the compose file is missing/invalid or a required service is
#   missing. Missing configuration files only produce warnings.
#######################################
validate_configuration() {
  local service config_file defined_services
  local missing_services=0

  log_section "Configuration Validation"

  # Check docker-compose file
  check_file "$COMPOSE_FILE" "Docker Compose file" || return 1

  # Validate docker-compose syntax
  log_info "Validating docker-compose syntax..."
  if docker-compose -f "$COMPOSE_FILE" config >/dev/null 2>&1; then
    print_status "OK" "Docker Compose file syntax is valid"
  else
    print_status "ERROR" "Docker Compose file has syntax errors"
    return 1
  fi

  # Check required services are defined. `config --services` prints one
  # service name per line, so an exact-line match is independent of YAML
  # indentation and cannot be fooled by same-named volumes or networks.
  # The list is fetched once instead of re-rendering the config per service.
  defined_services=$(docker-compose -f "$COMPOSE_FILE" config --services 2>/dev/null)
  for service in "${MONITORING_SERVICES[@]}"; do
    if grep -qxF -- "$service" <<<"$defined_services"; then
      print_status "OK" "Service '$service' is defined in docker-compose"
    else
      print_status "ERROR" "Service '$service' is not defined in docker-compose"
      missing_services=$((missing_services + 1))
    fi
  done

  # Check configuration files
  for config_file in "${CONFIG_FILES[@]}"; do
    if [[ -f "$config_file" ]]; then
      print_status "OK" "Configuration file exists: $config_file"
      # Validate specific configuration files. The Grafana arm must come
      # first: the Grafana datasource file is itself named prometheus.yml and
      # would otherwise be validated as a Prometheus server configuration.
      case "$config_file" in
        */grafana/*)
          validate_grafana_config "$config_file"
          ;;
        *prometheus.yml)
          validate_prometheus_config "$config_file"
          ;;
      esac
    else
      print_status "WARNING" "Configuration file missing: $config_file"
    fi
  done

  # Report every problem first, then fail if a required service is absent.
  if (( missing_services > 0 )); then
    return 1
  fi
  return 0
}
#######################################
# Run basic content checks on a Prometheus configuration file.
# Arguments:
#   $1 - path to the Prometheus configuration file
# Outputs:
#   Two print_status reports: required sections and application scrape target.
#######################################
validate_prometheus_config() {
  local prom_cfg=$1
  local verdict message

  # Required top-level sections: both "global:" and "scrape_configs:"
  verdict="WARNING"
  message="Prometheus configuration may be incomplete"
  if grep -q "global:" "$prom_cfg" && grep -q "scrape_configs:" "$prom_cfg"; then
    verdict="OK"
    message="Prometheus configuration has required sections"
  fi
  print_status "$verdict" "$message"

  # Application scrape target: any mention of "meldestelle" counts
  verdict="WARNING"
  message="Prometheus may not be configured to scrape application metrics"
  if grep -q "meldestelle" "$prom_cfg"; then
    verdict="OK"
    message="Prometheus configured to scrape Meldestelle application"
  fi
  print_status "$verdict" "$message"
}
#######################################
# Run basic content checks on a Grafana provisioning file.
# Only files whose path contains "datasources" are inspected; other Grafana
# files are accepted without any report.
# Arguments:
#   $1 - path to the Grafana provisioning file
#######################################
validate_grafana_config() {
  local grafana_cfg=$1

  case "$grafana_cfg" in
    *"datasources"*)
      # A datasource file is expected to mention the Prometheus datasource
      if ! grep -q "prometheus" "$grafana_cfg"; then
        print_status "WARNING" "Grafana datasource configuration may not include Prometheus"
      else
        print_status "OK" "Grafana datasource configuration includes Prometheus"
      fi
      ;;
  esac
}
# =============================================================================
# Service Management Functions
# =============================================================================
#######################################
# Bring up the monitoring stack in detached mode.
# Globals:
#   COMPOSE_FILE, MONITORING_SERVICES, STARTUP_TIMEOUT (read)
# Returns:
#   0 once the services were started and the initial wait has elapsed;
#   1 if Docker / docker-compose is unavailable or the start-up fails.
#######################################
start_monitoring_services() {
  log_section "Starting Monitoring Services"

  # Both Docker and docker-compose are prerequisites
  if ! check_docker; then
    return 1
  fi
  if ! check_docker_compose; then
    return 1
  fi

  # The start command is bounded by STARTUP_TIMEOUT via run_with_timeout
  local -a up_command=(docker-compose -f "$COMPOSE_FILE" up -d "${MONITORING_SERVICES[@]}")
  log_info "Starting monitoring stack: ${MONITORING_SERVICES[*]}"
  if ! run_with_timeout "$STARTUP_TIMEOUT" "Start monitoring services" "${up_command[@]}"; then
    print_status "ERROR" "Failed to start monitoring services"
    return 1
  fi
  print_status "OK" "Monitoring services started successfully"

  # Give the containers a moment to initialise before the health checks run
  log_info "Waiting for services to be ready..."
  sleep 15
  return 0
}
# =============================================================================
# Health Check Functions
# =============================================================================
#######################################
# Health-check Prometheus: liveness (mandatory), readiness, configuration
# endpoint and scrape targets (all advisory).
# Globals:
#   PROMETHEUS_URL, HEALTH_CHECK_TIMEOUT, RETRY_COUNT (read)
# Returns:
#   1 if the liveness probe fails, 0 otherwise.
#######################################
check_prometheus() {
  log_section "Prometheus Health Check"

  local liveness_endpoint="${PROMETHEUS_URL}/-/healthy"
  local readiness_endpoint="${PROMETHEUS_URL}/-/ready"
  local config_endpoint="${PROMETHEUS_URL}/api/v1/status/config"

  # Liveness is the only hard requirement; it is retried RETRY_COUNT times
  if ! check_http_endpoint "$liveness_endpoint" "Prometheus health" "$HEALTH_CHECK_TIMEOUT" "$RETRY_COUNT"; then
    print_status "ERROR" "Prometheus health check failed"
    return 1
  fi
  print_status "OK" "Prometheus is healthy"

  # Readiness: single attempt, warning only
  if ! check_http_endpoint "$readiness_endpoint" "Prometheus readiness" "$HEALTH_CHECK_TIMEOUT" 1; then
    print_status "WARNING" "Prometheus readiness check failed"
  else
    print_status "OK" "Prometheus is ready"
  fi

  # Configuration endpoint: single attempt with a short timeout, warning only
  if ! check_http_endpoint "$config_endpoint" "Prometheus configuration" 10 1; then
    print_status "WARNING" "Prometheus configuration endpoint not accessible"
  else
    print_status "OK" "Prometheus configuration is accessible"
  fi

  # Scrape targets are reported by a dedicated function
  check_prometheus_targets
  return 0
}
#######################################
# Inspect the Prometheus targets API and report whether application targets
# are known and how many targets are healthy. All findings are advisory.
# Globals:
#   PROMETHEUS_URL (read)
#######################################
check_prometheus_targets() {
  log_info "Checking Prometheus targets..."

  local endpoint="${PROMETHEUS_URL}/api/v1/targets"
  local payload
  # A failed request is folded into an empty payload
  payload=$(curl -sf "$endpoint" 2>/dev/null || echo "")

  if [[ -z "$payload" ]]; then
    print_status "WARNING" "Could not retrieve Prometheus targets"
    return
  fi

  # Application targets: any mention of "meldestelle" in the response
  if [[ "$payload" == *"meldestelle"* ]]; then
    print_status "OK" "Prometheus can discover application targets"
  else
    print_status "WARNING" "No application targets found in Prometheus"
    log_info "Make sure the application is running and exposing metrics"
  fi

  # Healthy targets: count the occurrences of "health":"up"
  local up_count
  up_count=$(grep -o '"health":"up"' <<<"$payload" | wc -l)
  if [[ "$up_count" -gt 0 ]]; then
    print_status "OK" "Found $up_count healthy targets"
  else
    print_status "WARNING" "No healthy targets found"
  fi
}
#######################################
# Health-check Grafana: health endpoint (mandatory) and the provisioned
# Prometheus datasource (advisory, needs admin credentials).
# Globals:
#   GRAFANA_URL, HEALTH_CHECK_TIMEOUT, RETRY_COUNT (read)
#   GF_SECURITY_ADMIN_USER / GF_SECURITY_ADMIN_PASSWORD (read, optional)
#     admin credentials for the datasource query; both default to "admin"
# Returns:
#   1 if the health endpoint cannot be reached, 0 otherwise.
#######################################
check_grafana() {
  log_section "Grafana Health Check"
  local health_url="${GRAFANA_URL}/api/health"
  local datasources_url="${GRAFANA_URL}/api/datasources"

  # Credentials are taken from the variable names Grafana itself uses for its
  # admin account, so a deployment with a non-default password can still be
  # checked. Without them the previous admin/admin default applies.
  local grafana_user="${GF_SECURITY_ADMIN_USER:-admin}"
  local grafana_password="${GF_SECURITY_ADMIN_PASSWORD:-admin}"

  # Check if Grafana is healthy
  if check_http_endpoint "$health_url" "Grafana health" "$HEALTH_CHECK_TIMEOUT" "$RETRY_COUNT"; then
    print_status "OK" "Grafana is healthy"

    # Parse health response; a failed request is folded into an empty string
    local health_response
    health_response=$(curl -sf "$health_url" 2>/dev/null || echo "")
    if [[ "$health_response" == *"ok"* ]]; then
      print_status "OK" "Grafana health status is OK"
    else
      print_status "WARNING" "Grafana health status unclear: $health_response"
    fi
  else
    print_status "ERROR" "Grafana health check failed"
    return 1
  fi

  # Check datasources (requires authentication, so this might fail)
  log_info "Checking Grafana datasources..."
  local datasources_response
  datasources_response=$(curl -sf -u "${grafana_user}:${grafana_password}" "$datasources_url" 2>/dev/null || echo "")
  if [[ -n "$datasources_response" ]] && [[ "$datasources_response" != "Unauthorized" ]]; then
    if echo "$datasources_response" | grep -q "prometheus"; then
      print_status "OK" "Grafana has Prometheus datasource configured"
    else
      print_status "WARNING" "Prometheus datasource not found in Grafana"
    fi
  else
    print_status "INFO" "Could not check Grafana datasources (authentication required)"
  fi
  return 0
}
#######################################
# Health-check Alertmanager: liveness (mandatory), readiness and status API
# (both advisory).
# Globals:
#   ALERTMANAGER_URL, HEALTH_CHECK_TIMEOUT, RETRY_COUNT (read)
# Returns:
#   1 if the liveness probe fails, 0 otherwise.
#######################################
check_alertmanager() {
  log_section "Alertmanager Health Check"
  local health_url="${ALERTMANAGER_URL}/-/healthy"
  local ready_url="${ALERTMANAGER_URL}/-/ready"
  # Use API v2: the v1 endpoints were removed from Alertmanager (they answer
  # 410 Gone since release 0.27), so /api/v1/status fails on current images.
  local status_url="${ALERTMANAGER_URL}/api/v2/status"

  # Check if Alertmanager is healthy
  if check_http_endpoint "$health_url" "Alertmanager health" "$HEALTH_CHECK_TIMEOUT" "$RETRY_COUNT"; then
    print_status "OK" "Alertmanager is healthy"
  else
    print_status "ERROR" "Alertmanager health check failed"
    return 1
  fi

  # Check if Alertmanager is ready (single attempt, warning only)
  if check_http_endpoint "$ready_url" "Alertmanager readiness" "$HEALTH_CHECK_TIMEOUT" 1; then
    print_status "OK" "Alertmanager is ready"
  else
    print_status "WARNING" "Alertmanager readiness check failed"
  fi

  # Check Alertmanager status (single attempt, short timeout, warning only)
  if check_http_endpoint "$status_url" "Alertmanager status" 10 1; then
    print_status "OK" "Alertmanager status endpoint is accessible"
  else
    print_status "WARNING" "Alertmanager status endpoint not accessible"
  fi
  return 0
}
# =============================================================================
# Integration Tests
# =============================================================================
#######################################
# Integration checks across the monitoring stack: the Prometheus query API
# (used by Grafana) and the loaded alerting rules. All findings are advisory.
# Globals:
#   PROMETHEUS_URL (read)
# Returns:
#   Always 0.
#######################################
test_monitoring_integration() {
  log_section "Monitoring Integration Tests"

  # --- Prometheus query API, the endpoint a Grafana datasource talks to ---
  log_info "Testing Prometheus-Grafana integration..."
  local query_endpoint="${PROMETHEUS_URL}/api/v1/query?query=up"
  if ! check_http_endpoint "$query_endpoint" "Prometheus query API" 10 1; then
    print_status "WARNING" "Prometheus query API may not be accessible for Grafana"
  else
    print_status "OK" "Prometheus query API is accessible for Grafana"
  fi

  # --- Alerting rules: look for project-specific rules in the rules API ---
  log_info "Checking alerting rules..."
  local rules_endpoint="${PROMETHEUS_URL}/api/v1/rules"
  local rules_payload
  rules_payload=$(curl -sf "$rules_endpoint" 2>/dev/null || echo "")

  if [[ -z "$rules_payload" ]]; then
    print_status "WARNING" "Could not retrieve alerting rules"
  elif [[ "$rules_payload" == *"meldestelle"* ]]; then
    print_status "OK" "Meldestelle alerting rules are loaded"
  else
    print_status "WARNING" "No Meldestelle-specific alerting rules found"
  fi

  return 0
}
# =============================================================================
# Performance and Load Tests
# =============================================================================
#######################################
# Print the current time in milliseconds since the epoch.
# Prefers bash's EPOCHREALTIME (bash 5+), then `date +%s%N` when it yields
# digits only, and finally falls back to whole seconds. `%N` is not supported
# by every date(1) implementation, so its output is validated before use.
#######################################
_monitoring_now_ms() {
  if [[ -n "${EPOCHREALTIME:-}" ]]; then
    # EPOCHREALTIME is seconds plus six fractional digits; dropping the
    # (locale-dependent) separator leaves microseconds.
    local micros="${EPOCHREALTIME//[^0-9]/}"
    printf '%s\n' "$(( micros / 1000 ))"
    return 0
  fi

  local nanos
  nanos=$(date +%s%N 2>/dev/null)
  if [[ "$nanos" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$(( nanos / 1000000 ))"
  else
    printf '%s\n' "$(( $(date +%s) * 1000 ))"
  fi
}

#######################################
# Time a single HTTP GET and report it against a threshold.
# Arguments:
#   $1 - subject used in the report (e.g. "Grafana response time")
#   $2 - URL to request
#   $3 - threshold in milliseconds below which the result counts as good
#######################################
_monitoring_report_latency() {
  local subject=$1
  local url=$2
  local threshold_ms=$3
  local started_ms finished_ms elapsed_ms
  local request_status=0

  started_ms=$(_monitoring_now_ms)
  curl -sf "$url" >/dev/null 2>&1 || request_status=$?
  finished_ms=$(_monitoring_now_ms)
  elapsed_ms=$(( finished_ms - started_ms ))

  if (( request_status != 0 )); then
    print_status "WARNING" "${subject} test failed"
  elif (( elapsed_ms < threshold_ms )); then
    print_status "OK" "${subject} is good (${elapsed_ms}ms)"
  else
    print_status "WARNING" "${subject} is slow (${elapsed_ms}ms)"
  fi
}

#######################################
# Measure the response time of a Prometheus query (good below 1000 ms) and
# of the Grafana health endpoint (good below 2000 ms). Advisory only.
# Globals:
#   PROMETHEUS_URL, GRAFANA_URL (read)
# Returns:
#   Always 0.
#######################################
test_monitoring_performance() {
  log_section "Monitoring Performance Tests"

  # Test Prometheus query performance
  log_info "Testing Prometheus query performance..."
  _monitoring_report_latency "Prometheus query performance" \
    "${PROMETHEUS_URL}/api/v1/query?query=up" 1000

  # Test Grafana response time
  log_info "Testing Grafana response time..."
  _monitoring_report_latency "Grafana response time" \
    "${GRAFANA_URL}/api/health" 2000

  return 0
}
# =============================================================================
# Main Execution
# =============================================================================
#######################################
# Print the command-line help (options and environment variables) to stdout.
#######################################
show_usage() {
  local -a usage_lines=(
    "Usage: $0 [OPTIONS]"
    ""
    "Options:"
    " --no-cleanup Don't stop services after testing"
    " --remove-containers Remove containers after testing"
    " --config-only Only validate configuration, don't start services"
    " --help Show this help message"
    ""
    "Environment Variables:"
    " COMPOSE_FILE Docker compose file to use (default: docker-compose.yml)"
    " CLEANUP_SERVICES Whether to cleanup services (default: true)"
    " REMOVE_CONTAINERS Whether to remove containers (default: false)"
  )
  # One array element per output line
  printf '%s\n' "${usage_lines[@]}"
}
#######################################
# Script entry point: parse options, validate the configuration and, unless
# running in config-only mode, start the monitoring stack and run all checks.
# Globals:
#   CLEANUP_SERVICES, REMOVE_CONTAINERS (exported by --no-cleanup /
#     --remove-containers)
#   CONFIG_ONLY (read) "true" behaves like --config-only
#   COMPOSE_FILE, PROMETHEUS_URL, GRAFANA_URL, ALERTMANAGER_URL (read)
# Arguments:
#   Command-line options, see show_usage.
# Returns:
#   Non-zero if any test step failed, otherwise the status of print_summary.
#   Exits 1 on an unknown option or failed configuration validation and 0
#   after --help or a successful config-only run.
#######################################
main() {
  local config_only="${CONFIG_ONLY:-false}"

  # Parse command line arguments
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --no-cleanup)
        export CLEANUP_SERVICES=false
        shift
        ;;
      --remove-containers)
        export REMOVE_CONTAINERS=true
        shift
        ;;
      --config-only)
        config_only=true
        shift
        ;;
      --help)
        show_usage
        exit 0
        ;;
      *)
        log_error "Unknown option: $1"
        show_usage
        exit 1
        ;;
    esac
  done

  log_section "Enhanced Monitoring Setup Test"
  log_info "Starting comprehensive monitoring tests..."
  log_info "Compose file: $COMPOSE_FILE"
  log_info "Test timestamp: $(date)"

  # Always validate configuration
  validate_configuration || exit 1

  # If config-only mode, exit after validation
  if [[ "$config_only" == "true" ]]; then
    log_info "Configuration validation completed (config-only mode)"
    print_summary "Monitoring Configuration Validation"
    exit 0
  fi

  # Make sure cleanup runs once services may have been started; without this
  # nothing in this file ever calls cleanup, so --no-cleanup and
  # --remove-containers would have no effect. An EXIT trap that is already
  # installed (for instance by the sourced utilities) is left untouched.
  if [[ -z "$(trap -p EXIT)" ]]; then
    trap cleanup EXIT
  fi

  # Run all tests. Every step runs even if an earlier one failed so the
  # summary is complete; failures are counted for the exit status.
  local -a test_results=()
  local failed_tests=0
  local step label runner result
  for step in \
    "Startup:start_monitoring_services" \
    "Prometheus:check_prometheus" \
    "Grafana:check_grafana" \
    "Alertmanager:check_alertmanager" \
    "Integration:test_monitoring_integration" \
    "Performance:test_monitoring_performance"; do
    label=${step%%:*}
    runner=${step#*:}
    if "$runner"; then
      test_results+=("${label}: PASS")
    else
      test_results+=("${label}: FAIL")
      failed_tests=$((failed_tests + 1))
    fi
  done

  # Print test results summary
  log_section "Test Results Summary"
  for result in "${test_results[@]}"; do
    if [[ "$result" == *"PASS" ]]; then
      log_success "$result"
    else
      log_error "$result"
    fi
  done

  # Print access information
  log_section "Monitoring Access Information"
  log_info "Prometheus: ${PROMETHEUS_URL}"
  log_info "Grafana: ${GRAFANA_URL} (default credentials: admin/admin)"
  log_info "Alertmanager: ${ALERTMANAGER_URL}"

  # Print final summary, then let failed steps decide the exit status
  local summary_status=0
  print_summary "Enhanced Monitoring Test" || summary_status=$?
  if (( failed_tests > 0 )); then
    return 1
  fi
  return "$summary_status"
}
# Run main function, forwarding every command-line argument unchanged
main "$@"