318 lines
9.3 KiB
Bash
318 lines
9.3 KiB
Bash
|
|
#!/bin/bash
#
# Enterprise Incident Simulation Script
#
# Simulates various failure scenarios for testing the observability stack.
# Simulation events are appended to LOG_FILE and echoed to the terminal.

# Strict mode: abort on unhandled errors (-e), on expansion of unset
# variables (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR

# The project root is two directory levels above the script directory.
PROJECT_ROOT="$(dirname "$(dirname "$SCRIPT_DIR")")"
readonly PROJECT_ROOT

# File that receives the plain-text simulation log.
readonly LOG_FILE="$PROJECT_ROOT/observability-stack/logs/incident_simulation.log"

# Colors for output. The escapes are stored unexpanded and are interpreted
# by the printing command (echo -e / printf %b) at output time.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color
|
||
|
|
|
||
|
|
#######################################
# Append a timestamped entry to the log file and echo it to the terminal.
# Globals:
#   LOG_FILE (read) - file the plain-text entry is appended to
#   BLUE, NC (read) - escape sequences wrapped around the terminal timestamp
# Arguments:
#   $1 - severity label, e.g. INFO, WARN, ERROR
#   $2 - message text, emitted verbatim
# Outputs:
#   "[timestamp] level message" appended to LOG_FILE and written to stdout
#   (the stdout copy has a colored timestamp)
#######################################
log() {
  local level=${1:-}
  local message=${2:-}
  # Declared separately so a failing `date` is not masked by `local`.
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  printf '[%s] %s %s\n' "$timestamp" "$level" "$message" >>"$LOG_FILE"
  # Only the color variables go through %b; level and message use %s so
  # backslashes in them (e.g. Windows paths) are printed literally and the
  # terminal line matches the line written to the file.
  printf '%b[%s]%b %s %s\n' "$BLUE" "$timestamp" "$NC" "$level" "$message"
}
|
||
|
|
|
||
|
|
#######################################
# Simulate a CPU spike by running busy-loop workers in parallel.
# Globals:
#   SECONDS (read) - bash's elapsed-seconds counter, used as the deadline clock
# Arguments:
#   $1 - duration in seconds (default: 60)
#   $2 - number of parallel busy-loop workers (default: 4)
# Outputs:
#   start and completion entries via log
# Returns:
#   0 once every worker has exited, 1 on invalid arguments
#######################################
simulate_cpu_spike() {
  local duration=${1:-60}
  local workers=${2:-4}
  local -a worker_pids=()
  local worker pid

  # Reject anything that is not a plain decimal number before it reaches
  # arithmetic expansion.
  if ! [[ "$duration" =~ ^[0-9]+$ && "$workers" =~ ^[1-9][0-9]*$ ]]; then
    log "ERROR" "CPU spike simulation needs an integer duration and a positive worker count (got '${duration}', '${workers}')"
    return 1
  fi

  log "INFO" "Starting CPU spike simulation for ${duration} seconds"

  # Launch CPU-intensive processes
  for ((worker = 1; worker <= workers; worker++)); do
    (
      # 10# forces base 10 so a value such as "08" is not parsed as octal.
      end_time=$((SECONDS + 10#$duration))
      while ((SECONDS < end_time)); do
        # CPU-intensive calculation
        result=0
        for j in {1..100000}; do
          result=$((result + j))
        done
      done
    ) &
    worker_pids+=("$!")
  done

  # Wait for simulation to complete; a worker's exit status is irrelevant,
  # so failures are deliberately ignored.
  for pid in "${worker_pids[@]}"; do
    wait "$pid" 2>/dev/null || true
  done

  log "INFO" "CPU spike simulation completed"
}
|
||
|
|
|
||
|
|
#######################################
# Simulate a memory leak with a background process whose memory use grows
# steadily until the deadline.
# Globals:
#   SECONDS (read) - bash's elapsed-seconds counter, used as the deadline clock
# Arguments:
#   $1 - duration in seconds (default: 30)
#   $2 - bytes retained per 0.1 s tick (default: 1048576, i.e. 1 MiB, which
#        caps growth at roughly 10 MiB per second)
# Outputs:
#   start and completion entries via log
# Returns:
#   0 once the leaking process has exited, 1 on invalid arguments
#######################################
simulate_memory_leak() {
  local duration=${1:-30}
  local chunk_bytes=${2:-1048576}
  local leak_pid

  if ! [[ "$duration" =~ ^[0-9]+$ && "$chunk_bytes" =~ ^[1-9][0-9]*$ ]]; then
    log "ERROR" "Memory leak simulation needs an integer duration and a positive chunk size (got '${duration}', '${chunk_bytes}')"
    return 1
  fi

  log "INFO" "Starting memory leak simulation for ${duration} seconds"

  # Create a process that gradually consumes memory. It runs in a subshell
  # so everything it holds is released when the subshell exits.
  (
    # Build the chunk once; each tick then retains another copy of it.
    # Retaining a single byte per tick would never be visible in metrics.
    printf -v chunk '%*s' "$chunk_bytes" ''
    data=""
    # 10# forces base 10 so a value such as "08" is not parsed as octal.
    end_time=$((SECONDS + 10#$duration))
    while ((SECONDS < end_time)); do
      data+=$chunk
      sleep 0.1
    done
  ) &
  leak_pid=$!

  # The worker's exit status is irrelevant (it may be killed under memory
  # pressure), so a failure here is deliberately ignored.
  wait "$leak_pid" 2>/dev/null || true
  log "INFO" "Memory leak simulation completed"
}
|
||
|
|
|
||
|
|
#######################################
# Simulate disk space exhaustion by writing a 100 MB file into a directory
# about every two seconds, then removing the generated files.
# Globals:
#   SECONDS (read) - bash's elapsed-seconds counter, used as the deadline clock
# Arguments:
#   $1 - directory to fill (default: /tmp)
#   $2 - duration in seconds (default: 30)
# Outputs:
#   start, completion and error entries via log
# Returns:
#   0 after cleanup; 1 when the duration is not an integer or the target is
#   not a writable directory
#######################################
simulate_disk_full() {
  local target_dir=${1:-"/tmp"}
  local duration=${2:-30}
  local writer_pid

  if ! [[ "$duration" =~ ^[0-9]+$ ]]; then
    log "ERROR" "Disk simulation needs an integer duration (got '${duration}')"
    return 1
  fi
  # Fail fast instead of idling for the whole duration while every write
  # silently fails.
  if [[ ! -d "$target_dir" || ! -w "$target_dir" ]]; then
    log "ERROR" "Disk simulation target ${target_dir} is not a writable directory"
    return 1
  fi

  log "INFO" "Starting disk space exhaustion simulation in ${target_dir} for ${duration} seconds"

  # Create large files to fill disk space
  (
    # 10# forces base 10 so a value such as "08" is not parsed as octal.
    end_time=$((SECONDS + 10#$duration))
    while ((SECONDS < end_time)); do
      # Create 100MB file. A failing write is the expected outcome once the
      # disk is full, so it is ignored on purpose.
      dd if=/dev/zero of="${target_dir}/incident_test_file_$(date +%s).tmp" bs=1M count=100 2>/dev/null || true
      sleep 2
    done
  ) &
  writer_pid=$!

  wait "$writer_pid" 2>/dev/null || true

  # Cleanup test files. ${target_dir:?} aborts rather than expanding to a
  # glob rooted at "/" should the variable ever be empty.
  rm -f -- "${target_dir:?}"/incident_test_file_*.tmp 2>/dev/null || true
  log "INFO" "Disk space exhaustion simulation completed and cleaned up"
}
|
||
|
|
|
||
|
|
#######################################
# Simulate network degradation (delay, jitter and packet loss) on an
# interface by installing a netem root qdisc for a fixed time.
# Arguments:
#   $1 - network interface (default: lo)
#   $2 - duration in seconds (default: 20)
# Outputs:
#   start, completion and warning entries via log
# Returns:
#   0 in every case; when netem cannot be applied the simulation is skipped
#   with a WARN entry
# NOTE: if the script is interrupted during the sleep, the qdisc stays in
# place and has to be removed manually.
#######################################
simulate_network_issues() {
  local interface=${1:-"lo"}
  local duration=${2:-20}

  log "INFO" "Starting network issues simulation on ${interface} for ${duration} seconds"

  # Add network delay and packet loss. tc's own stderr is hidden; the WARN
  # entry below reports the failure instead.
  if ! sudo tc qdisc add dev "$interface" root netem delay 100ms 50ms loss 10% 2>/dev/null; then
    # Nothing was installed, so nothing may be deleted: removing the root
    # qdisc here could tear down one this script did not create.
    log "WARN" "Could not apply netem to ${interface} (insufficient privileges, tc/netem unavailable, or a conflicting root qdisc); skipping network simulation"
    return 0
  fi

  sleep "$duration"

  # Remove network simulation
  if ! sudo tc qdisc del dev "$interface" root 2>/dev/null; then
    log "WARN" "Could not remove the netem qdisc from ${interface}; remove it manually with: tc qdisc del dev ${interface} root"
  fi
  log "INFO" "Network issues simulation completed"
}
|
||
|
|
|
||
|
|
#######################################
# Simulate a crashing service by emitting crash and recovery log entries.
# No real service is touched; only log entries are produced.
# Arguments:
#   $1 - service name used in the log entries (default: test-service)
#   $2 - number of repeat crashes after the initial one (default: 3)
# Outputs:
#   ERROR and INFO entries via log
# Returns:
#   0 on completion, 1 when the repeat count is not an integer
#######################################
simulate_service_crash() {
  local service_name=${1:-"test-service"}
  local repeat_count=${2:-3}
  local attempt

  if ! [[ "$repeat_count" =~ ^[0-9]+$ ]]; then
    log "ERROR" "Service crash simulation needs an integer repeat count (got '${repeat_count}')"
    return 1
  fi
  # Normalise to base 10 so a value such as "08" is not parsed as octal.
  repeat_count=$((10#$repeat_count))

  log "INFO" "Starting service crash simulation for ${service_name}"

  # Simulate service going down
  log "ERROR" "Service ${service_name} crashed unexpectedly"
  sleep 5
  log "INFO" "Service ${service_name} restarted automatically"

  # Simulate multiple crashes
  for ((attempt = 1; attempt <= repeat_count; attempt++)); do
    sleep 2
    log "ERROR" "Service ${service_name} crashed again (attempt $attempt)"
    sleep 1
    log "INFO" "Service ${service_name} recovered after crash $attempt"
  done
}
|
||
|
|
|
||
|
|
#######################################
# Simulate a database incident by logging a degradation/recovery sequence:
# pool nearing capacity, pool exhausted, query timeout, recovering, restored.
# No real database is touched; only log entries are produced.
# Arguments:
#   $1 - total duration in seconds, spread over the four pauses that follow
#        the first four events (default: 25)
# Outputs:
#   WARN, ERROR and INFO entries via log
# Returns:
#   0 on completion, 1 when the duration is not an integer
#######################################
simulate_database_issues() {
  local duration=${1:-25}
  local -a levels=("WARN" "ERROR" "ERROR" "WARN")
  local -a messages=(
    "Database connection pool nearing capacity"
    "Database connection pool exhausted"
    "Database query timeout occurred"
    "Database connections recovering"
  )
  local -r pause_count=4
  local base extra step pause

  if ! [[ "$duration" =~ ^[0-9]+$ ]]; then
    log "ERROR" "Database simulation needs an integer duration (got '${duration}')"
    return 1
  fi
  # Normalise to base 10 so a value such as "08" is not parsed as octal.
  duration=$((10#$duration))

  log "INFO" "Starting database issues simulation for ${duration} seconds"

  # Honour the requested duration: split it evenly over the pauses and hand
  # the remainder out one second at a time to the earliest pauses, so the
  # pauses always add up to exactly $duration.
  base=$((duration / pause_count))
  extra=$((duration % pause_count))

  # Simulate connection pool exhaustion
  for ((step = 0; step < pause_count; step++)); do
    log "${levels[step]}" "${messages[step]}"
    pause=$base
    if ((step < extra)); then
      pause=$((base + 1))
    fi
    sleep "$pause"
  done
  log "INFO" "Database connections restored"

  log "INFO" "Database issues simulation completed"
}
|
||
|
|
|
||
|
|
#######################################
# Simulate a burst of application errors by logging randomly chosen
# exception messages, pausing 1-3 seconds after each one.
# Globals:
#   RANDOM (read) - picks the error type and the pause length
# Arguments:
#   $1 - number of errors to emit (default: 10)
# Outputs:
#   one ERROR entry per error plus start/completion INFO entries via log
# Returns:
#   0 on completion, 1 when the count is not an integer
#######################################
simulate_application_errors() {
  local error_count=${1:-10}
  local i

  if ! [[ "$error_count" =~ ^[0-9]+$ ]]; then
    log "ERROR" "Application error simulation needs an integer count (got '${error_count}')"
    return 1
  fi
  # Normalise to base 10 so a value such as "08" is not parsed as octal.
  error_count=$((10#$error_count))

  log "INFO" "Starting application error simulation (${error_count} errors)"

  # A C-style loop is required here: brace expansion ({1..N}) happens before
  # variable expansion, so {1..error_count} would yield a single literal word.
  for ((i = 1; i <= error_count; i++)); do
    case $((RANDOM % 4)) in
      0)
        log "ERROR" "NullPointerException in UserService.getUser($i)"
        ;;
      1)
        log "ERROR" "TimeoutException: Database query timed out for user ID: $i"
        ;;
      2)
        log "ERROR" "ValidationException: Invalid input data for request $i"
        ;;
      3)
        log "ERROR" "IOException: Failed to write to log file"
        ;;
    esac
    sleep "$((RANDOM % 3 + 1))"
  done

  log "INFO" "Application error simulation completed"
}
|
||
|
|
|
||
|
|
#######################################
# Run a multi-phase incident: the individual simulations are started in the
# background with overlapping lifetimes, then all of them are awaited.
# Arguments:
#   none
# Outputs:
#   phase markers and completion entry via log; a WARN entry for every
#   background simulation that exits non-zero
# Returns:
#   0 after every background simulation has finished
#######################################
run_comprehensive_scenario() {
  local -a job_pids=()
  local -a job_names=()
  local idx

  log "INFO" "Starting comprehensive incident scenario simulation"

  # Phase 1: Initial system stress
  log "INFO" "Phase 1: System stress simulation"
  simulate_cpu_spike 30 &
  job_pids+=("$!")
  job_names+=("cpu spike")
  simulate_memory_leak 20 &
  job_pids+=("$!")
  job_names+=("memory leak")

  sleep 10

  # Phase 2: Service degradation
  log "INFO" "Phase 2: Service degradation simulation"
  simulate_service_crash "web-service" &
  job_pids+=("$!")
  job_names+=("service crash")

  sleep 5

  # Phases 3-5 are started back to back with no pause between them.

  # Phase 3: Database issues
  log "INFO" "Phase 3: Database issues simulation"
  simulate_database_issues 15 &
  job_pids+=("$!")
  job_names+=("database issues")

  # Phase 4: Application errors
  log "INFO" "Phase 4: Application error burst"
  simulate_application_errors 15 &
  job_pids+=("$!")
  job_names+=("application errors")

  # Phase 5: Infrastructure issues
  log "INFO" "Phase 5: Infrastructure issues simulation"
  simulate_disk_full "/tmp" 10 &
  job_pids+=("$!")
  job_names+=("disk exhaustion")

  # Wait for all simulations to complete. One failing simulation must not
  # abort the others, so a failure is reported and the loop carries on.
  for idx in "${!job_pids[@]}"; do
    if ! wait "${job_pids[idx]}" 2>/dev/null; then
      log "WARN" "Background simulation '${job_names[idx]}' exited with a non-zero status"
    fi
  done

  log "INFO" "Comprehensive incident scenario completed"
}
|
||
|
|
|
||
|
|
#######################################
# Print the command-line help text.
# Arguments:
#   none ($0 is interpolated as the script name)
# Outputs:
#   the usage text on stdout
#######################################
show_usage() {
  # Unquoted delimiter on purpose: $0 must expand inside the text.
  cat <<EOF
Enterprise Incident Simulation Script
Usage: $0 [SCENARIO] [OPTIONS]

SCENARIOS:
 cpu [DURATION] - Simulate CPU spike (default: 60s)
 memory [DURATION] - Simulate memory leak (default: 30s)
 disk [DIR] [DURATION] - Simulate disk space exhaustion (default: /tmp, 30s)
 network [INTERFACE] [DURATION] - Simulate network issues (default: lo, 20s)
 service [NAME] - Simulate service crashes (default: test-service)
 database [DURATION] - Simulate database issues (default: 25s)
 app-errors [COUNT] - Simulate application errors (default: 10)
 comprehensive - Run full incident scenario
 all - Run all individual scenarios sequentially

EXAMPLES:
 $0 cpu 120 - CPU spike for 2 minutes
 $0 disk /var/log 45 - Disk full simulation in /var/log for 45 seconds
 $0 comprehensive - Full incident simulation

EOF
}
|
||
|
|
|
||
|
|
#######################################
# Main execution: select and run the requested simulation scenario.
# Globals:
#   LOG_FILE (read)       - log file; its directory is created on demand
#   RED, GREEN, NC (read) - colors for the error and summary lines
# Arguments:
#   $1     - scenario name (default: comprehensive); help, -h and --help
#            print the usage text
#   $2, $3 - scenario-specific options, forwarded to the simulation
# Outputs:
#   progress via log; usage text on request; errors on stderr
# Returns:
#   exits 0 after help, exits 1 for an unknown scenario, otherwise returns
#   after the scenario has finished
#######################################
main() {
  local scenario=${1:-"comprehensive"}

  # Handle help and unknown scenarios before any side effect, so neither
  # creates the log directory nor writes log entries.
  case "$scenario" in
    cpu | memory | disk | network | service | database | app-errors | comprehensive | all) ;;
    help | -h | --help)
      show_usage
      exit 0
      ;;
    *)
      printf "%bError: Unknown scenario '%s'%b\n" "$RED" "$scenario" "$NC" >&2
      echo "" >&2
      show_usage >&2
      exit 1
      ;;
  esac

  # Create log directory if it doesn't exist
  mkdir -p "$(dirname "$LOG_FILE")"

  log "INFO" "Incident simulation script started"
  log "INFO" "Scenario: $scenario"

  case "$scenario" in
    cpu)
      simulate_cpu_spike "${2:-60}"
      ;;
    memory)
      simulate_memory_leak "${2:-30}"
      ;;
    disk)
      simulate_disk_full "${2:-/tmp}" "${3:-30}"
      ;;
    network)
      simulate_network_issues "${2:-lo}" "${3:-20}"
      ;;
    service)
      simulate_service_crash "${2:-test-service}"
      ;;
    database)
      simulate_database_issues "${2:-25}"
      ;;
    app-errors)
      simulate_application_errors "${2:-10}"
      ;;
    comprehensive)
      run_comprehensive_scenario
      ;;
    all)
      log "INFO" "Running all scenarios sequentially"
      simulate_cpu_spike 30
      sleep 5
      simulate_memory_leak 20
      sleep 5
      simulate_disk_full "/tmp" 15
      sleep 5
      simulate_service_crash "test-service"
      sleep 5
      simulate_database_issues 15
      sleep 5
      simulate_application_errors 8
      sleep 5
      simulate_network_issues "lo" 10
      ;;
  esac

  log "INFO" "Incident simulation script completed successfully"
  printf '%bSimulation completed. Check logs at: %s%b\n' "$GREEN" "$LOG_FILE" "$NC"
}
|
||
|
|
|
||
|
|
# Run main function with all arguments
|
||
|
|
# "$@" is quoted so that every command-line argument reaches main as its own
# word, even when it contains spaces.
main "$@"
|