einige Ergänzungen
This commit is contained in:
Executable
+505
@@ -0,0 +1,505 @@
|
||||
#!/bin/bash
|
||||
|
||||
# =============================================================================
|
||||
# Enhanced Monitoring Setup Test Script
|
||||
# =============================================================================
|
||||
# This script provides comprehensive testing of the monitoring setup including
|
||||
# Prometheus, Grafana, and Alertmanager with improved error handling, retry
|
||||
# logic, cleanup options, and configuration validation.
|
||||
# =============================================================================
|
||||
|
||||
# Load common utilities
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
# shellcheck source=../utils/common.sh
|
||||
source "$SCRIPT_DIR/../utils/common.sh" || {
|
||||
echo "Error: Could not load common utilities from $SCRIPT_DIR/../utils/common.sh"
|
||||
exit 1
|
||||
}
|
||||
|
||||
# =============================================================================
|
||||
# Configuration
|
||||
# =============================================================================
|
||||
|
||||
readonly COMPOSE_FILE="${COMPOSE_FILE:-docker-compose.yml}"
|
||||
readonly MONITORING_SERVICES=("prometheus" "grafana" "alertmanager")
|
||||
readonly STARTUP_TIMEOUT=120
|
||||
readonly HEALTH_CHECK_TIMEOUT=30
|
||||
readonly RETRY_COUNT=3
|
||||
readonly RETRY_DELAY=10
|
||||
|
||||
# Service endpoints
|
||||
readonly PROMETHEUS_URL="http://localhost:9090"
|
||||
readonly GRAFANA_URL="http://localhost:3000"
|
||||
readonly ALERTMANAGER_URL="http://localhost:9093"
|
||||
|
||||
# Configuration files
|
||||
readonly CONFIG_FILES=(
|
||||
"config/monitoring/prometheus.yml"
|
||||
"config/monitoring/grafana/provisioning/dashboards/dashboard.yml"
|
||||
"config/monitoring/grafana/provisioning/datasources/prometheus.yml"
|
||||
)
|
||||
|
||||
# =============================================================================
|
||||
# Cleanup Function
|
||||
# =============================================================================
|
||||
|
||||
cleanup() {
|
||||
if [[ "${CLEANUP_SERVICES:-true}" == "true" ]]; then
|
||||
log_info "Cleaning up monitoring services..."
|
||||
|
||||
# Stop monitoring services
|
||||
if docker-compose -f "$COMPOSE_FILE" ps | grep -q "prometheus\|grafana\|alertmanager"; then
|
||||
log_info "Stopping monitoring services..."
|
||||
docker-compose -f "$COMPOSE_FILE" stop "${MONITORING_SERVICES[@]}" >/dev/null 2>&1 || true
|
||||
fi
|
||||
|
||||
# Remove containers if requested
|
||||
if [[ "${REMOVE_CONTAINERS:-false}" == "true" ]]; then
|
||||
log_info "Removing monitoring containers..."
|
||||
docker-compose -f "$COMPOSE_FILE" rm -f "${MONITORING_SERVICES[@]}" >/dev/null 2>&1 || true
|
||||
fi
|
||||
|
||||
log_info "Cleanup completed"
|
||||
else
|
||||
log_info "Cleanup skipped (CLEANUP_SERVICES=false)"
|
||||
fi
|
||||
}
|
||||
|
||||
# =============================================================================
|
||||
# Configuration Validation Functions
|
||||
# =============================================================================
|
||||
|
||||
validate_configuration() {
|
||||
log_section "Configuration Validation"
|
||||
|
||||
# Check docker-compose file
|
||||
check_file "$COMPOSE_FILE" "Docker Compose file" || return 1
|
||||
|
||||
# Validate docker-compose syntax
|
||||
log_info "Validating docker-compose syntax..."
|
||||
if docker-compose -f "$COMPOSE_FILE" config >/dev/null 2>&1; then
|
||||
print_status "OK" "Docker Compose file syntax is valid"
|
||||
else
|
||||
print_status "ERROR" "Docker Compose file has syntax errors"
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Check required services are defined
|
||||
for service in "${MONITORING_SERVICES[@]}"; do
|
||||
if docker-compose -f "$COMPOSE_FILE" config | grep -q "^ ${service}:"; then
|
||||
print_status "OK" "Service '$service' is defined in docker-compose"
|
||||
else
|
||||
print_status "ERROR" "Service '$service' is not defined in docker-compose"
|
||||
fi
|
||||
done
|
||||
|
||||
# Check configuration files
|
||||
for config_file in "${CONFIG_FILES[@]}"; do
|
||||
if [[ -f "$config_file" ]]; then
|
||||
print_status "OK" "Configuration file exists: $config_file"
|
||||
|
||||
# Validate specific configuration files
|
||||
case "$config_file" in
|
||||
*prometheus.yml)
|
||||
validate_prometheus_config "$config_file"
|
||||
;;
|
||||
*grafana*)
|
||||
validate_grafana_config "$config_file"
|
||||
;;
|
||||
esac
|
||||
else
|
||||
print_status "WARNING" "Configuration file missing: $config_file"
|
||||
fi
|
||||
done
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
validate_prometheus_config() {
|
||||
local config_file=$1
|
||||
|
||||
# Check for required sections
|
||||
if grep -q "global:" "$config_file" && grep -q "scrape_configs:" "$config_file"; then
|
||||
print_status "OK" "Prometheus configuration has required sections"
|
||||
else
|
||||
print_status "WARNING" "Prometheus configuration may be incomplete"
|
||||
fi
|
||||
|
||||
# Check for application scrape targets
|
||||
if grep -q "meldestelle" "$config_file"; then
|
||||
print_status "OK" "Prometheus configured to scrape Meldestelle application"
|
||||
else
|
||||
print_status "WARNING" "Prometheus may not be configured to scrape application metrics"
|
||||
fi
|
||||
}
|
||||
|
||||
validate_grafana_config() {
|
||||
local config_file=$1
|
||||
|
||||
# Basic validation for Grafana config files
|
||||
if [[ "$config_file" == *"datasources"* ]]; then
|
||||
if grep -q "prometheus" "$config_file"; then
|
||||
print_status "OK" "Grafana datasource configuration includes Prometheus"
|
||||
else
|
||||
print_status "WARNING" "Grafana datasource configuration may not include Prometheus"
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
# =============================================================================
|
||||
# Service Management Functions
|
||||
# =============================================================================
|
||||
|
||||
start_monitoring_services() {
|
||||
log_section "Starting Monitoring Services"
|
||||
|
||||
# Check Docker availability
|
||||
check_docker || return 1
|
||||
check_docker_compose || return 1
|
||||
|
||||
# Start services with timeout
|
||||
log_info "Starting monitoring stack: ${MONITORING_SERVICES[*]}"
|
||||
|
||||
if run_with_timeout "$STARTUP_TIMEOUT" "Start monitoring services" \
|
||||
docker-compose -f "$COMPOSE_FILE" up -d "${MONITORING_SERVICES[@]}"; then
|
||||
print_status "OK" "Monitoring services started successfully"
|
||||
else
|
||||
print_status "ERROR" "Failed to start monitoring services"
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Wait for services to be ready
|
||||
log_info "Waiting for services to be ready..."
|
||||
sleep 15 # Initial wait for containers to initialize
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
# =============================================================================
|
||||
# Health Check Functions
|
||||
# =============================================================================
|
||||
|
||||
check_prometheus() {
|
||||
log_section "Prometheus Health Check"
|
||||
|
||||
local health_url="${PROMETHEUS_URL}/-/healthy"
|
||||
local ready_url="${PROMETHEUS_URL}/-/ready"
|
||||
|
||||
# Check if Prometheus is healthy
|
||||
if check_http_endpoint "$health_url" "Prometheus health" "$HEALTH_CHECK_TIMEOUT" "$RETRY_COUNT"; then
|
||||
print_status "OK" "Prometheus is healthy"
|
||||
else
|
||||
print_status "ERROR" "Prometheus health check failed"
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Check if Prometheus is ready
|
||||
if check_http_endpoint "$ready_url" "Prometheus readiness" "$HEALTH_CHECK_TIMEOUT" 1; then
|
||||
print_status "OK" "Prometheus is ready"
|
||||
else
|
||||
print_status "WARNING" "Prometheus readiness check failed"
|
||||
fi
|
||||
|
||||
# Check Prometheus configuration
|
||||
local config_url="${PROMETHEUS_URL}/api/v1/status/config"
|
||||
if check_http_endpoint "$config_url" "Prometheus configuration" 10 1; then
|
||||
print_status "OK" "Prometheus configuration is accessible"
|
||||
else
|
||||
print_status "WARNING" "Prometheus configuration endpoint not accessible"
|
||||
fi
|
||||
|
||||
# Check targets
|
||||
check_prometheus_targets
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
check_prometheus_targets() {
|
||||
log_info "Checking Prometheus targets..."
|
||||
|
||||
local targets_url="${PROMETHEUS_URL}/api/v1/targets"
|
||||
local targets_response
|
||||
|
||||
targets_response=$(curl -sf "$targets_url" 2>/dev/null || echo "")
|
||||
|
||||
if [[ -n "$targets_response" ]]; then
|
||||
# Check for application targets
|
||||
if echo "$targets_response" | grep -q "meldestelle"; then
|
||||
print_status "OK" "Prometheus can discover application targets"
|
||||
else
|
||||
print_status "WARNING" "No application targets found in Prometheus"
|
||||
log_info "Make sure the application is running and exposing metrics"
|
||||
fi
|
||||
|
||||
# Check for healthy targets
|
||||
local healthy_targets
|
||||
healthy_targets=$(echo "$targets_response" | grep -o '"health":"up"' | wc -l)
|
||||
if [[ "$healthy_targets" -gt 0 ]]; then
|
||||
print_status "OK" "Found $healthy_targets healthy targets"
|
||||
else
|
||||
print_status "WARNING" "No healthy targets found"
|
||||
fi
|
||||
else
|
||||
print_status "WARNING" "Could not retrieve Prometheus targets"
|
||||
fi
|
||||
}
|
||||
|
||||
check_grafana() {
|
||||
log_section "Grafana Health Check"
|
||||
|
||||
local health_url="${GRAFANA_URL}/api/health"
|
||||
local datasources_url="${GRAFANA_URL}/api/datasources"
|
||||
|
||||
# Check if Grafana is healthy
|
||||
if check_http_endpoint "$health_url" "Grafana health" "$HEALTH_CHECK_TIMEOUT" "$RETRY_COUNT"; then
|
||||
print_status "OK" "Grafana is healthy"
|
||||
|
||||
# Parse health response
|
||||
local health_response
|
||||
health_response=$(curl -sf "$health_url" 2>/dev/null || echo "")
|
||||
if [[ "$health_response" == *"ok"* ]]; then
|
||||
print_status "OK" "Grafana health status is OK"
|
||||
else
|
||||
print_status "WARNING" "Grafana health status unclear: $health_response"
|
||||
fi
|
||||
else
|
||||
print_status "ERROR" "Grafana health check failed"
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Check datasources (requires authentication, so this might fail)
|
||||
log_info "Checking Grafana datasources..."
|
||||
local datasources_response
|
||||
datasources_response=$(curl -sf -u "admin:admin" "$datasources_url" 2>/dev/null || echo "")
|
||||
|
||||
if [[ -n "$datasources_response" ]] && [[ "$datasources_response" != "Unauthorized" ]]; then
|
||||
if echo "$datasources_response" | grep -q "prometheus"; then
|
||||
print_status "OK" "Grafana has Prometheus datasource configured"
|
||||
else
|
||||
print_status "WARNING" "Prometheus datasource not found in Grafana"
|
||||
fi
|
||||
else
|
||||
print_status "INFO" "Could not check Grafana datasources (authentication required)"
|
||||
fi
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
check_alertmanager() {
|
||||
log_section "Alertmanager Health Check"
|
||||
|
||||
local health_url="${ALERTMANAGER_URL}/-/healthy"
|
||||
local ready_url="${ALERTMANAGER_URL}/-/ready"
|
||||
local status_url="${ALERTMANAGER_URL}/api/v1/status"
|
||||
|
||||
# Check if Alertmanager is healthy
|
||||
if check_http_endpoint "$health_url" "Alertmanager health" "$HEALTH_CHECK_TIMEOUT" "$RETRY_COUNT"; then
|
||||
print_status "OK" "Alertmanager is healthy"
|
||||
else
|
||||
print_status "ERROR" "Alertmanager health check failed"
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Check if Alertmanager is ready
|
||||
if check_http_endpoint "$ready_url" "Alertmanager readiness" "$HEALTH_CHECK_TIMEOUT" 1; then
|
||||
print_status "OK" "Alertmanager is ready"
|
||||
else
|
||||
print_status "WARNING" "Alertmanager readiness check failed"
|
||||
fi
|
||||
|
||||
# Check Alertmanager status
|
||||
if check_http_endpoint "$status_url" "Alertmanager status" 10 1; then
|
||||
print_status "OK" "Alertmanager status endpoint is accessible"
|
||||
else
|
||||
print_status "WARNING" "Alertmanager status endpoint not accessible"
|
||||
fi
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
# =============================================================================
|
||||
# Integration Tests
|
||||
# =============================================================================
|
||||
|
||||
test_monitoring_integration() {
|
||||
log_section "Monitoring Integration Tests"
|
||||
|
||||
# Test Prometheus-Grafana integration
|
||||
log_info "Testing Prometheus-Grafana integration..."
|
||||
|
||||
# Check if Prometheus metrics are accessible from Grafana's perspective
|
||||
local prometheus_query_url="${PROMETHEUS_URL}/api/v1/query?query=up"
|
||||
if check_http_endpoint "$prometheus_query_url" "Prometheus query API" 10 1; then
|
||||
print_status "OK" "Prometheus query API is accessible for Grafana"
|
||||
else
|
||||
print_status "WARNING" "Prometheus query API may not be accessible for Grafana"
|
||||
fi
|
||||
|
||||
# Test alerting rules
|
||||
log_info "Checking alerting rules..."
|
||||
local rules_url="${PROMETHEUS_URL}/api/v1/rules"
|
||||
local rules_response
|
||||
rules_response=$(curl -sf "$rules_url" 2>/dev/null || echo "")
|
||||
|
||||
if [[ -n "$rules_response" ]]; then
|
||||
if echo "$rules_response" | grep -q "meldestelle"; then
|
||||
print_status "OK" "Meldestelle alerting rules are loaded"
|
||||
else
|
||||
print_status "WARNING" "No Meldestelle-specific alerting rules found"
|
||||
fi
|
||||
else
|
||||
print_status "WARNING" "Could not retrieve alerting rules"
|
||||
fi
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
# =============================================================================
|
||||
# Performance and Load Tests
|
||||
# =============================================================================
|
||||
|
||||
test_monitoring_performance() {
|
||||
log_section "Monitoring Performance Tests"
|
||||
|
||||
# Test Prometheus query performance
|
||||
log_info "Testing Prometheus query performance..."
|
||||
|
||||
local start_time
|
||||
local end_time
|
||||
local duration
|
||||
|
||||
start_time=$(date +%s%N)
|
||||
curl -sf "${PROMETHEUS_URL}/api/v1/query?query=up" >/dev/null 2>&1
|
||||
local query_result=$?
|
||||
end_time=$(date +%s%N)
|
||||
|
||||
duration=$(( (end_time - start_time) / 1000000 )) # Convert to milliseconds
|
||||
|
||||
if [[ $query_result -eq 0 ]]; then
|
||||
if [[ $duration -lt 1000 ]]; then
|
||||
print_status "OK" "Prometheus query performance is good (${duration}ms)"
|
||||
else
|
||||
print_status "WARNING" "Prometheus query performance is slow (${duration}ms)"
|
||||
fi
|
||||
else
|
||||
print_status "WARNING" "Prometheus query performance test failed"
|
||||
fi
|
||||
|
||||
# Test Grafana response time
|
||||
log_info "Testing Grafana response time..."
|
||||
|
||||
start_time=$(date +%s%N)
|
||||
curl -sf "${GRAFANA_URL}/api/health" >/dev/null 2>&1
|
||||
local grafana_result=$?
|
||||
end_time=$(date +%s%N)
|
||||
|
||||
duration=$(( (end_time - start_time) / 1000000 ))
|
||||
|
||||
if [[ $grafana_result -eq 0 ]]; then
|
||||
if [[ $duration -lt 2000 ]]; then
|
||||
print_status "OK" "Grafana response time is good (${duration}ms)"
|
||||
else
|
||||
print_status "WARNING" "Grafana response time is slow (${duration}ms)"
|
||||
fi
|
||||
else
|
||||
print_status "WARNING" "Grafana response time test failed"
|
||||
fi
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
# =============================================================================
|
||||
# Main Execution
|
||||
# =============================================================================
|
||||
|
||||
show_usage() {
|
||||
echo "Usage: $0 [OPTIONS]"
|
||||
echo ""
|
||||
echo "Options:"
|
||||
echo " --no-cleanup Don't stop services after testing"
|
||||
echo " --remove-containers Remove containers after testing"
|
||||
echo " --config-only Only validate configuration, don't start services"
|
||||
echo " --help Show this help message"
|
||||
echo ""
|
||||
echo "Environment Variables:"
|
||||
echo " COMPOSE_FILE Docker compose file to use (default: docker-compose.yml)"
|
||||
echo " CLEANUP_SERVICES Whether to cleanup services (default: true)"
|
||||
echo " REMOVE_CONTAINERS Whether to remove containers (default: false)"
|
||||
}
|
||||
|
||||
main() {
|
||||
# Parse command line arguments
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--no-cleanup)
|
||||
export CLEANUP_SERVICES=false
|
||||
shift
|
||||
;;
|
||||
--remove-containers)
|
||||
export REMOVE_CONTAINERS=true
|
||||
shift
|
||||
;;
|
||||
--config-only)
|
||||
local CONFIG_ONLY=true
|
||||
shift
|
||||
;;
|
||||
--help)
|
||||
show_usage
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
log_error "Unknown option: $1"
|
||||
show_usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
log_section "Enhanced Monitoring Setup Test"
|
||||
|
||||
log_info "Starting comprehensive monitoring tests..."
|
||||
log_info "Compose file: $COMPOSE_FILE"
|
||||
log_info "Test timestamp: $(date)"
|
||||
|
||||
# Always validate configuration
|
||||
validate_configuration || exit 1
|
||||
|
||||
# If config-only mode, exit after validation
|
||||
if [[ "${CONFIG_ONLY:-false}" == "true" ]]; then
|
||||
log_info "Configuration validation completed (config-only mode)"
|
||||
print_summary "Monitoring Configuration Validation"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Run all tests
|
||||
local test_results=()
|
||||
|
||||
start_monitoring_services && test_results+=("Startup: PASS") || test_results+=("Startup: FAIL")
|
||||
check_prometheus && test_results+=("Prometheus: PASS") || test_results+=("Prometheus: FAIL")
|
||||
check_grafana && test_results+=("Grafana: PASS") || test_results+=("Grafana: FAIL")
|
||||
check_alertmanager && test_results+=("Alertmanager: PASS") || test_results+=("Alertmanager: FAIL")
|
||||
test_monitoring_integration && test_results+=("Integration: PASS") || test_results+=("Integration: FAIL")
|
||||
test_monitoring_performance && test_results+=("Performance: PASS") || test_results+=("Performance: FAIL")
|
||||
|
||||
# Print test results summary
|
||||
log_section "Test Results Summary"
|
||||
for result in "${test_results[@]}"; do
|
||||
if [[ "$result" == *"PASS" ]]; then
|
||||
log_success "$result"
|
||||
else
|
||||
log_error "$result"
|
||||
fi
|
||||
done
|
||||
|
||||
# Print access information
|
||||
log_section "Monitoring Access Information"
|
||||
log_info "Prometheus: ${PROMETHEUS_URL}"
|
||||
log_info "Grafana: ${GRAFANA_URL} (default credentials: admin/admin)"
|
||||
log_info "Alertmanager: ${ALERTMANAGER_URL}"
|
||||
|
||||
# Print final summary
|
||||
print_summary "Enhanced Monitoring Test"
|
||||
}
|
||||
|
||||
# Run main function
|
||||
main "$@"
|
||||
Reference in New Issue
Block a user