#!/bin/bash
# Enterprise Incident Simulation Script
# Simulates various failure scenarios for testing observability stack
#
# Every simulation writes its progress through log(), which appends to
# $LOG_FILE and echoes a colorized copy to stdout. Run with "help", "-h" or
# "--help" for the list of scenarios and their arguments.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# The project root is two directory levels above the directory of this script.
PROJECT_ROOT="$(dirname "$(dirname "$SCRIPT_DIR")")"
LOG_FILE="$PROJECT_ROOT/observability-stack/logs/incident_simulation.log"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
# shellcheck disable=SC2034  # not referenced in this file; kept for compatibility
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

#######################################
# Append a timestamped entry to the log file and echo it to stdout.
# Globals:   LOG_FILE, BLUE, NC (read)
# Arguments: $1 - level (e.g. INFO, WARN, ERROR)
#            $2 - message text
# Outputs:   plain line appended to $LOG_FILE; colorized line on stdout
#######################################
log() {
  local level=$1
  local message=$2
  local timestamp
  # Declared separately so a failing date(1) is not masked by 'local'.
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  printf '[%s] %s %s\n' "$timestamp" "$level" "$message" >> "$LOG_FILE"
  # %b interprets the escape sequences of the color codes only; the message
  # itself is printed verbatim via %s.
  printf '%b[%s]%b %s %s\n' "$BLUE" "$timestamp" "$NC" "$level" "$message"
}

#######################################
# Validate that a value is a non-negative decimal integer.
# Leading zeros are rejected because shell arithmetic would read them as octal.
# Arguments: $1 - human-readable name of the value (used in the error message)
#            $2 - value to check
# Returns:   0 if valid; 1 (after logging an ERROR) otherwise
#######################################
require_uint() {
  local name=$1
  local value=$2

  if [[ ! "$value" =~ ^(0|[1-9][0-9]*)$ ]]; then
    log "ERROR" "Invalid ${name} '${value}': expected a non-negative integer"
    return 1
  fi
}

#######################################
# Simulate a CPU spike by running four busy-loop workers in parallel.
# Arguments: $1 - duration in seconds (default: 60)
# Returns:   1 if the duration is not a non-negative integer
#######################################
simulate_cpu_spike() {
  local duration=${1:-60}
  local -a pids=()
  local pid i

  require_uint "duration" "$duration" || return 1
  log "INFO" "Starting CPU spike simulation for ${duration} seconds"

  # Launch CPU-intensive processes
  for ((i = 1; i <= 4; i++)); do
    (
      # SECONDS counts seconds since shell start, so this is a deadline.
      local end_time=$((SECONDS + duration))
      local result j
      while ((SECONDS < end_time)); do
        # CPU-intensive calculation
        result=0
        for ((j = 1; j <= 100000; j++)); do
          result=$((result + j))
        done
      done
    ) &
    pids+=("$!")
  done

  # Wait for simulation to complete; a failing worker must not abort the run.
  for pid in "${pids[@]}"; do
    wait "$pid" 2>/dev/null || true
  done

  log "INFO" "CPU spike simulation completed"
}

#######################################
# Simulate a memory leak with a background process whose string variable
# grows by one character roughly every 0.1 seconds.
# Arguments: $1 - duration in seconds (default: 30)
# Returns:   1 if the duration is not a non-negative integer
#######################################
simulate_memory_leak() {
  local duration=${1:-30}
  local mem_pid

  require_uint "duration" "$duration" || return 1
  log "INFO" "Starting memory leak simulation for ${duration} seconds"

  # Create a process that gradually consumes memory
  (
    local data=""
    local end_time=$((SECONDS + duration))
    while ((SECONDS < end_time)); do
      # Gradually consume memory
      data="${data}X"
      sleep 0.1
    done
  ) &
  mem_pid=$!

  wait "$mem_pid" 2>/dev/null || true
  log "INFO" "Memory leak simulation completed"
}

#######################################
# Simulate disk space exhaustion by writing a 100MB file every two seconds,
# then removing every incident_test_file_*.tmp file in the target directory.
# Arguments: $1 - target directory (default: /tmp)
#            $2 - duration in seconds (default: 30)
# Returns:   1 if the duration is invalid or the directory does not exist
#######################################
simulate_disk_full() {
  local target_dir=${1:-"/tmp"}
  local duration=${2:-30}
  local disk_pid

  require_uint "duration" "$duration" || return 1
  if [[ ! -d "$target_dir" ]]; then
    log "ERROR" "Target directory '${target_dir}' does not exist"
    return 1
  fi

  log "INFO" "Starting disk space exhaustion simulation in ${target_dir} for ${duration} seconds"

  # Create large files to fill disk space
  (
    local end_time=$((SECONDS + duration))
    while ((SECONDS < end_time)); do
      # Create 100MB file. dd failures are ignored on purpose: running out of
      # space is the condition being simulated.
      dd if=/dev/zero of="${target_dir}/incident_test_file_$(date +%s).tmp" \
        bs=1M count=100 2>/dev/null || true
      sleep 2
    done
  ) &
  disk_pid=$!

  wait "$disk_pid" 2>/dev/null || true

  # Cleanup test files (best effort)
  rm -f -- "${target_dir}"/incident_test_file_*.tmp 2>/dev/null || true

  log "INFO" "Disk space exhaustion simulation completed and cleaned up"
}

#######################################
# Simulate network issues by adding a netem qdisc (100ms delay with 50ms
# variation, 10% loss) via 'sudo tc' and deleting the root qdisc afterwards.
# Arguments: $1 - network interface (default: lo)
#            $2 - duration in seconds (default: 20)
# Returns:   1 if the duration is not a non-negative integer
#######################################
simulate_network_issues() {
  local interface=${1:-"lo"}
  local duration=${2:-20}

  require_uint "duration" "$duration" || return 1
  log "INFO" "Starting network issues simulation on ${interface} for ${duration} seconds"

  # Add network delay and packet loss. Failure stays non-fatal, but it is
  # reported so the log does not claim a fault that was never injected.
  if ! sudo tc qdisc add dev "$interface" root netem delay 100ms 50ms loss 10% 2>/dev/null; then
    log "WARN" "Could not apply netem qdisc on ${interface}; network fault was not injected"
  fi

  sleep "$duration"

  # Remove network simulation (best effort; there may be nothing to delete)
  sudo tc qdisc del dev "$interface" root 2>/dev/null || true

  log "INFO" "Network issues simulation completed"
}

#######################################
# Simulate service crashes purely through log entries: one crash/restart
# followed by three further crash/recovery cycles.
# Arguments: $1 - service name (default: test-service)
#######################################
simulate_service_crash() {
  local service_name=${1:-"test-service"}
  local i

  log "INFO" "Starting service crash simulation for ${service_name}"

  # Simulate service going down
  log "ERROR" "Service ${service_name} crashed unexpectedly"
  sleep 5
  log "INFO" "Service ${service_name} restarted automatically"

  # Simulate multiple crashes
  for ((i = 1; i <= 3; i++)); do
    sleep 2
    log "ERROR" "Service ${service_name} crashed again (attempt $i)"
    sleep 1
    log "INFO" "Service ${service_name} recovered after crash $i"
  done
}

#######################################
# Simulate database issues through a sequence of log entries
# (pool pressure, exhaustion, query timeout, recovery).
# The four pauses between entries share the requested duration equally
# (integer division, at least one second each).
# Arguments: $1 - total duration in seconds (default: 25)
# Returns:   1 if the duration is not a non-negative integer
#######################################
simulate_database_issues() {
  local duration=${1:-25}
  local step

  require_uint "duration" "$duration" || return 1
  step=$((duration / 4))
  if ((step < 1)); then
    step=1
  fi

  log "INFO" "Starting database issues simulation for ${duration} seconds"

  # Simulate connection pool exhaustion
  log "WARN" "Database connection pool nearing capacity"
  sleep "$step"
  log "ERROR" "Database connection pool exhausted"
  sleep "$step"
  log "ERROR" "Database query timeout occurred"
  sleep "$step"
  log "WARN" "Database connections recovering"
  sleep "$step"
  log "INFO" "Database connections restored"

  log "INFO" "Database issues simulation completed"
}

#######################################
# Simulate application errors by logging randomly chosen error messages,
# pausing 1-3 seconds after each one.
# Arguments: $1 - number of errors to emit (default: 10)
# Returns:   1 if the count is not a non-negative integer
#######################################
simulate_application_errors() {
  local error_count=${1:-10}
  local i

  require_uint "error count" "$error_count" || return 1
  log "INFO" "Starting application error simulation (${error_count} errors)"

  # Arithmetic loop: a brace range such as {1..$n} is not expanded with
  # variables, because brace expansion runs before parameter expansion.
  for ((i = 1; i <= error_count; i++)); do
    case $((RANDOM % 4)) in
      0)
        log "ERROR" "NullPointerException in UserService.getUser($i)"
        ;;
      1)
        log "ERROR" "TimeoutException: Database query timed out for user ID: $i"
        ;;
      2)
        log "ERROR" "ValidationException: Invalid input data for request $i"
        ;;
      3)
        log "ERROR" "IOException: Failed to write to log file"
        ;;
    esac
    sleep $((RANDOM % 3 + 1))
  done

  log "INFO" "Application error simulation completed"
}

#######################################
# Run a multi-phase incident: the individual simulations are started in the
# background with staggered delays and then awaited together.
#######################################
run_comprehensive_scenario() {
  local -a pids=()
  local pid

  log "INFO" "Starting comprehensive incident scenario simulation"

  # Phase 1: Initial system stress
  log "INFO" "Phase 1: System stress simulation"
  simulate_cpu_spike 30 &
  pids+=("$!")

  simulate_memory_leak 20 &
  pids+=("$!")

  sleep 10

  # Phase 2: Service degradation
  log "INFO" "Phase 2: Service degradation simulation"
  simulate_service_crash "web-service" &
  pids+=("$!")

  sleep 5

  # Phase 3: Database issues
  log "INFO" "Phase 3: Database issues simulation"
  simulate_database_issues 15 &
  pids+=("$!")

  # Phase 4: Application errors
  log "INFO" "Phase 4: Application error burst"
  simulate_application_errors 15 &
  pids+=("$!")

  # Phase 5: Infrastructure issues
  log "INFO" "Phase 5: Infrastructure issues simulation"
  simulate_disk_full "/tmp" 10 &
  pids+=("$!")

  # Wait for all simulations to complete; individual failures are tolerated.
  for pid in "${pids[@]}"; do
    wait "$pid" 2>/dev/null || true
  done

  log "INFO" "Comprehensive incident scenario completed"
}

#######################################
# Print the command-line help text to stdout.
#######################################
show_usage() {
  echo "Enterprise Incident Simulation Script"
  echo "Usage: $0 [SCENARIO] [OPTIONS]"
  echo ""
  echo "SCENARIOS:"
  echo " cpu [DURATION] - Simulate CPU spike (default: 60s)"
  echo " memory [DURATION] - Simulate memory leak (default: 30s)"
  echo " disk [DIR] [DURATION] - Simulate disk space exhaustion (default: /tmp, 30s)"
  echo " network [INTERFACE] [DURATION] - Simulate network issues (default: lo, 20s)"
  echo " service [NAME] - Simulate service crashes (default: test-service)"
  echo " database [DURATION] - Simulate database issues (default: 25s)"
  echo " app-errors [COUNT] - Simulate application errors (default: 10)"
  echo " comprehensive - Run full incident scenario"
  echo " all - Run all individual scenarios sequentially"
  echo ""
  echo "EXAMPLES:"
  echo " $0 cpu 120 - CPU spike for 2 minutes"
  echo " $0 disk /var/log 45 - Disk full simulation in /var/log for 45 seconds"
  echo " $0 comprehensive - Full incident simulation"
  echo ""
}

#######################################
# Entry point: dispatch to the scenario named by the first argument.
# Globals:   LOG_FILE, RED, GREEN, NC (read)
# Arguments: $1 - scenario name (default: comprehensive)
#            $2, $3 - scenario-specific options (see show_usage)
# Returns:   exits 0 after help, 1 on an unknown scenario
#######################################
main() {
  local scenario=${1:-"comprehensive"}

  # Help is answered before anything touches the filesystem.
  case "$scenario" in
    "help" | "-h" | "--help")
      show_usage
      exit 0
      ;;
  esac

  # Create log directory if it doesn't exist
  mkdir -p "$(dirname "$LOG_FILE")"

  log "INFO" "Incident simulation script started"
  log "INFO" "Scenario: $scenario"

  case "$scenario" in
    "cpu")
      simulate_cpu_spike "${2:-60}"
      ;;
    "memory")
      simulate_memory_leak "${2:-30}"
      ;;
    "disk")
      simulate_disk_full "${2:-/tmp}" "${3:-30}"
      ;;
    "network")
      simulate_network_issues "${2:-lo}" "${3:-20}"
      ;;
    "service")
      simulate_service_crash "${2:-test-service}"
      ;;
    "database")
      simulate_database_issues "${2:-25}"
      ;;
    "app-errors")
      simulate_application_errors "${2:-10}"
      ;;
    "comprehensive")
      run_comprehensive_scenario
      ;;
    "all")
      log "INFO" "Running all scenarios sequentially"
      simulate_cpu_spike 30
      sleep 5
      simulate_memory_leak 20
      sleep 5
      simulate_disk_full "/tmp" 15
      sleep 5
      simulate_service_crash "test-service"
      sleep 5
      simulate_database_issues 15
      sleep 5
      simulate_application_errors 8
      sleep 5
      simulate_network_issues "lo" 10
      ;;
    *)
      printf '%b\n' "${RED}Error: Unknown scenario '$scenario'${NC}" >&2
      echo "" >&2
      show_usage >&2
      exit 1
      ;;
  esac

  log "INFO" "Incident simulation script completed successfully"
  printf '%b\n' "${GREEN}Simulation completed. Check logs at: $LOG_FILE${NC}"
}

# Run main function with all arguments
main "$@"