#!/bin/bash
#
# Enterprise Infrastructure Failure Simulation Script
#
# Simulates various types of infrastructure failures for testing by injecting
# a fault into Docker Compose service containers, monitoring for a fixed
# duration, reverting the fault, and writing a plain-text report.
#
# Usage:
#   <script> [FAILURE_TYPE] [DURATION] [TARGET_NODES]
#
#   FAILURE_TYPE  network | disk | service | node | cpu | memory
#                 (default: network)
#   DURATION      positive integer number of seconds (default: 60)
#   TARGET_NODES  "all" or a whitespace-separated list of compose service
#                 names (default: all)
#
# Environment:
#   INTENSITY        low | medium | high | critical (default: medium).
#                    Currently only the disk failure reads it (fill size).
#   SIMULATION_MODE  when exactly "true", fault injection and monitoring are
#                    only logged; no container is modified.

set -euo pipefail

# Configuration
# NOTE(review): DOCKER_COMPOSE_FILE and INVENTORY_FILE are not referenced by
# any code in this script; they are kept in case other tooling expects them.
# shellcheck disable=SC2034
readonly DOCKER_COMPOSE_FILE="docker-compose.yml"
# shellcheck disable=SC2034
readonly INVENTORY_FILE="inventory/hosts.ini"
readonly LOG_FILE="logs/failure_simulation.log"
readonly REPORT_DIR="reports"

# State files: one line per affected container, "<container id>:<detail>".
# cleanup_failure reads them to undo whatever was injected.
# NOTE(review): these are fixed, predictable names in /tmp. They are kept
# unchanged so state left behind by an interrupted run is still found, but
# on a shared host a private state directory would be safer.
readonly NETWORK_STATE_FILE="/tmp/network_failure_state"
readonly DISK_STATE_FILE="/tmp/disk_failure_state"
readonly SERVICE_STATE_FILE="/tmp/service_failure_state"
readonly NODE_STATE_FILE="/tmp/node_failure_state"
readonly CPU_STATE_FILE="/tmp/cpu_failure_state"
readonly MEMORY_STATE_FILE="/tmp/memory_failure_state"

# Default values
FAILURE_TYPE="${1:-network}"
DURATION="${2:-60}"
TARGET_NODES="${3:-all}"
INTENSITY="${INTENSITY:-medium}"

# `docker compose ps` snapshot taken by main before the fault is injected.
PRE_FAILURE_STATUS=""
# Set to true once cleanup_failure has completed, so later calls are no-ops.
CLEANUP_DONE=false

#######################################
# Write a timestamped message to stdout and append it to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: message words, joined with spaces
#######################################
log() {
  printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG_FILE"
}

#######################################
# Log an error, revert any active failure and terminate with status 1.
# Arguments: $1 - error message
#######################################
error_exit() {
  log "ERROR: $1"
  # Cleanup any active failures
  cleanup_failure
  exit 1
}

#######################################
# Tell whether the script runs in dry-run mode.
# Globals:   SIMULATION_MODE (read, optional)
# Returns:   0 when SIMULATION_MODE is exactly "true", 1 otherwise
#######################################
is_simulation_mode() {
  [[ "${SIMULATION_MODE:-false}" == true ]]
}

#######################################
# Validate FAILURE_TYPE, DURATION and INTENSITY; exits via error_exit on
# the first invalid value.
# Globals:   FAILURE_TYPE, INTENSITY (read); DURATION (read, normalized)
#######################################
validate_inputs() {
  case "$FAILURE_TYPE" in
    network | disk | service | node | cpu | memory) ;;
    *)
      error_exit "Invalid failure type: $FAILURE_TYPE. Must be network, disk, service, node, cpu, or memory"
      ;;
  esac

  # 10# forces base 10: without it "08"/"09" are rejected as invalid octal
  # by shell arithmetic, even though they look like valid durations.
  if ! [[ "$DURATION" =~ ^[0-9]+$ ]] || ((10#$DURATION < 1)); then
    error_exit "Invalid duration: $DURATION. Must be a positive integer (seconds)"
  fi
  # Strip leading zeros so later $(( ... DURATION ... )) expressions work.
  DURATION=$((10#$DURATION))

  case "$INTENSITY" in
    low | medium | high | critical) ;;
    *)
      error_exit "Invalid intensity: $INTENSITY. Must be low, medium, high, or critical"
      ;;
  esac
}

#######################################
# Print the compose service names to target.
# An explicit TARGET_NODES value is echoed back unchanged. "all" expands to
# the fixed list "web db lb" in simulation mode, otherwise to the services
# reported by `docker compose ps --services`.
# Globals:   TARGET_NODES (read)
# Outputs:   service names on stdout
#######################################
get_target_containers() {
  if [[ "$TARGET_NODES" != all ]]; then
    printf '%s\n' "$TARGET_NODES"
    return
  fi

  if is_simulation_mode; then
    printf '%s\n' "web db lb"
    return
  fi

  # Best effort: an unavailable docker or empty listing yields no targets.
  docker compose ps --services | grep -v "^NAME$" || true
}

#######################################
# Print the container IDs backing the given compose services.
# Services that cannot be resolved are silently skipped (best effort).
# Arguments: compose service names
# Outputs:   container IDs on stdout, one per line
#######################################
container_ids_for() {
  local service ids
  for service; do
    ids=$(docker compose ps -q "$service" 2>/dev/null || true)
    if [[ -n "$ids" ]]; then
      printf '%s\n' "$ids"
    fi
  done
}

#######################################
# Print the PID of the first process inside a container whose `ps aux`
# line contains the given text; prints nothing when none is found.
# Arguments: $1 - container ID, $2 - literal text to look for
#######################################
find_stress_pid() {
  local cid=$1 pattern=$2
  docker exec "$cid" ps aux \
    | grep -F -- "$pattern" \
    | grep -v grep \
    | awk '{print $2}' \
    | head -1 || true
}

#######################################
# Network failure simulation: disconnect each target container from the
# network named by its HostConfig.NetworkMode.
# Globals:   NETWORK_STATE_FILE (appended)
#######################################
simulate_network_failure() {
  local containers cid network
  containers=$(get_target_containers)
  log "Simulating network failure on containers: $containers"

  if is_simulation_mode; then
    log "SIMULATION_MODE=true: skipping Docker network changes"
    return
  fi

  # Intentional word splitting: service names and IDs contain no whitespace.
  # shellcheck disable=SC2046,SC2086
  for cid in $(container_ids_for $containers); do
    network=$(docker inspect "$cid" --format '{{.HostConfig.NetworkMode}}' 2>/dev/null || true)
    if [[ -z "$network" ]]; then
      log "WARNING: could not determine network of container $cid; skipping"
      continue
    fi

    log "Disconnecting network for container $cid"
    # Record the network BEFORE disconnecting so an interruption between
    # the two steps can still be reverted by cleanup_failure.
    printf '%s:%s\n' "$cid" "$network" >> "$NETWORK_STATE_FILE"
    docker network disconnect "$network" "$cid" 2>/dev/null || true
  done
}

#######################################
# Disk failure simulation: write a zero-filled /tmp/disk_fill file inside
# each target container. Size depends on INTENSITY (50/100/500/1024 MiB).
# Globals:   INTENSITY (read), DISK_STATE_FILE (appended)
#######################################
simulate_disk_failure() {
  local containers cid
  local fill_size_mb=100
  containers=$(get_target_containers)
  log "Simulating disk space exhaustion on containers: $containers"

  if is_simulation_mode; then
    log "SIMULATION_MODE=true: skipping container disk writes"
    return
  fi

  case "$INTENSITY" in
    low) fill_size_mb=50 ;;
    medium) fill_size_mb=100 ;;
    high) fill_size_mb=500 ;;
    critical) fill_size_mb=1024 ;;
  esac

  # shellcheck disable=SC2046,SC2086
  for cid in $(container_ids_for $containers); do
    log "Filling disk space in container $cid"
    # Record first so a partially written fill file is still removed.
    printf '%s:disk_fill\n' "$cid" >> "$DISK_STATE_FILE"
    # Create a large file to consume disk space (best effort).
    docker exec "$cid" bash -c "dd if=/dev/zero of=/tmp/disk_fill bs=1M count=${fill_size_mb}" 2>/dev/null || true
  done
}

#######################################
# Service failure simulation: stop nginx, postgresql and haproxy via
# systemctl inside each target container (each stop is best effort).
# Globals:   SERVICE_STATE_FILE (appended)
#######################################
simulate_service_failure() {
  local containers container cid svc
  containers=$(get_target_containers)
  log "Simulating service failures on containers: $containers"

  if is_simulation_mode; then
    # shellcheck disable=SC2086
    for container in $containers; do
      log "SIMULATION_MODE=true: would stop services in $container"
    done
    return
  fi

  # shellcheck disable=SC2046,SC2086
  for cid in $(container_ids_for $containers); do
    log "Stopping services in container $cid"
    printf '%s:services\n' "$cid" >> "$SERVICE_STATE_FILE"
    # Stop common services
    for svc in nginx postgresql haproxy; do
      docker exec "$cid" systemctl stop "$svc" 2>/dev/null || true
    done
  done
}

#######################################
# Node failure simulation: pause each target container. A failing
# `docker pause` aborts the run (after cleanup).
# Globals:   NODE_STATE_FILE (appended)
#######################################
simulate_node_failure() {
  local containers cid
  containers=$(get_target_containers)
  log "Simulating complete node failures on containers: $containers"

  if is_simulation_mode; then
    log "SIMULATION_MODE=true: skipping container pause"
    return
  fi

  # shellcheck disable=SC2046,SC2086
  for cid in $(container_ids_for $containers); do
    log "Stopping container $cid (node failure)"
    printf '%s:paused\n' "$cid" >> "$NODE_STATE_FILE"
    docker pause "$cid" || error_exit "Failed to pause container $cid"
  done
}

#######################################
# CPU stress simulation: start a detached busy loop in each target
# container and record its PID for later termination.
# Globals:   CPU_STATE_FILE (appended)
#######################################
simulate_cpu_failure() {
  local containers cid pid
  containers=$(get_target_containers)
  log "Simulating CPU stress on containers: $containers"

  if is_simulation_mode; then
    log "SIMULATION_MODE=true: skipping CPU stress"
    return
  fi

  # shellcheck disable=SC2046,SC2086
  for cid in $(container_ids_for $containers); do
    log "Starting CPU stress in container $cid"
    # Start CPU stress process
    docker exec -d "$cid" bash -c "while true; do :; done" 2>/dev/null || true
    pid=$(find_stress_pid "$cid" "while true")
    if [[ -z "$pid" ]]; then
      log "WARNING: could not determine CPU stress PID in container $cid"
    fi
    printf '%s:cpu_stress:%s\n' "$cid" "$pid" >> "$CPU_STATE_FILE"
  done
}

#######################################
# Memory stress simulation: start a detached `tail /dev/zero` in each
# target container and record its PID for later termination.
# Globals:   MEMORY_STATE_FILE (appended)
#######################################
simulate_memory_failure() {
  local containers cid pid
  containers=$(get_target_containers)
  log "Simulating memory exhaustion on containers: $containers"

  if is_simulation_mode; then
    log "SIMULATION_MODE=true: skipping memory stress"
    return
  fi

  # shellcheck disable=SC2046,SC2086
  for cid in $(container_ids_for $containers); do
    log "Starting memory stress in container $cid"
    # Start memory stress process
    docker exec -d "$cid" bash -c "tail /dev/zero" 2>/dev/null || true
    pid=$(find_stress_pid "$cid" "tail /dev/zero")
    if [[ -z "$pid" ]]; then
      log "WARNING: could not determine memory stress PID in container $cid"
    fi
    printf '%s:memory_stress:%s\n' "$cid" "$pid" >> "$MEMORY_STATE_FILE"
  done
}

#######################################
# Dispatch to the simulation matching FAILURE_TYPE.
# Globals:   FAILURE_TYPE (read)
#######################################
inject_failure() {
  case "$FAILURE_TYPE" in
    network) simulate_network_failure ;;
    disk) simulate_disk_failure ;;
    service) simulate_service_failure ;;
    node) simulate_node_failure ;;
    cpu) simulate_cpu_failure ;;
    memory) simulate_memory_failure ;;
    *) error_exit "Invalid failure type: $FAILURE_TYPE. Must be network, disk, service, node, cpu, or memory" ;;
  esac
}

#######################################
# Kill the stress processes listed in a "<cid>:<label>:<pid>" state file,
# then delete the file. Entries without a PID are reported, not killed.
# Arguments: $1 - state file path
#######################################
kill_recorded_stress() {
  local state_file=$1 cid pid
  [[ -f "$state_file" ]] || return 0

  while IFS=: read -r cid _ pid; do
    if [[ -z "$pid" ]]; then
      log "WARNING: no PID recorded for stress process in container $cid; it may still be running"
      continue
    fi
    docker exec "$cid" kill -9 "$pid" 2>/dev/null || true
  done < "$state_file"
  rm -f "$state_file"
}

#######################################
# Revert every failure recorded in the state files. Each revert step is
# best effort. Runs at most once to completion per script run; it is
# called explicitly by main/error_exit and again by the EXIT trap.
# Globals:   *_STATE_FILE (read, removed), CLEANUP_DONE (read, written)
#######################################
cleanup_failure() {
  local cid network svc

  if [[ "$CLEANUP_DONE" == true ]]; then
    return 0
  fi

  log "Cleaning up failure simulation"

  # Restore network connections
  if [[ -f "$NETWORK_STATE_FILE" ]]; then
    while IFS=: read -r cid network; do
      docker network connect "$network" "$cid" 2>/dev/null || true
    done < "$NETWORK_STATE_FILE"
    rm -f "$NETWORK_STATE_FILE"
  fi

  # Clean up disk fill files
  if [[ -f "$DISK_STATE_FILE" ]]; then
    while IFS=: read -r cid _; do
      docker exec "$cid" rm -f /tmp/disk_fill 2>/dev/null || true
    done < "$DISK_STATE_FILE"
    rm -f "$DISK_STATE_FILE"
  fi

  # Restart services
  if [[ -f "$SERVICE_STATE_FILE" ]]; then
    while IFS=: read -r cid _; do
      for svc in nginx postgresql haproxy; do
        docker exec "$cid" systemctl start "$svc" 2>/dev/null || true
      done
    done < "$SERVICE_STATE_FILE"
    rm -f "$SERVICE_STATE_FILE"
  fi

  # Unpause containers
  if [[ -f "$NODE_STATE_FILE" ]]; then
    while IFS=: read -r cid _; do
      docker unpause "$cid" 2>/dev/null || true
    done < "$NODE_STATE_FILE"
    rm -f "$NODE_STATE_FILE"
  fi

  # Kill stress processes
  kill_recorded_stress "$CPU_STATE_FILE"
  kill_recorded_stress "$MEMORY_STATE_FILE"

  # Marked only at the very end so an interrupted cleanup is retried by the
  # EXIT trap.
  CLEANUP_DONE=true
}

#######################################
# Keep the failure active for DURATION seconds, logging container state
# and `docker stats` figures roughly every 10 seconds. Returns immediately
# in simulation mode.
# Globals:   DURATION (read)
#######################################
monitor_failure() {
  local end_time remaining compose_status stats
  end_time=$(($(date +%s) + DURATION))
  log "Monitoring failure for $DURATION seconds"

  if is_simulation_mode; then
    log "SIMULATION_MODE=true: validation simulated"
    return
  fi

  while (($(date +%s) < end_time)); do
    # Check container status. The output is captured first: piping into
    # `grep -q` under pipefail can fail spuriously when grep exits early.
    compose_status=$(docker compose ps || true)
    if ! grep -Eq 'Up|Paused' <<< "$compose_status"; then
      log "WARNING: All containers are down"
    fi

    # Log system metrics
    stats=$(docker stats --no-stream --format 'table {{.Container}}\t{{.CPUPerc}}\t{{.MemPerc}}' | tail -n +2 || true)
    log "System status: $stats"

    # Sleep at most the time left so short durations are not overshot.
    remaining=$((end_time - $(date +%s)))
    if ((remaining <= 0)); then
      break
    fi
    sleep "$((remaining < 10 ? remaining : 10))"
  done
}

#######################################
# Print the current `docker compose ps` output, or a placeholder line when
# the command fails.
#######################################
capture_compose_status() {
  docker compose ps 2>/dev/null || echo "Docker Compose not running"
}

#######################################
# Write a timestamped report file under REPORT_DIR.
# "Pre-failure Status" is the snapshot stored in PRE_FAILURE_STATUS;
# "Post-failure Status" is taken when the report is generated.
# Globals:   REPORT_DIR, FAILURE_TYPE, DURATION, TARGET_NODES, INTENSITY,
#            PRE_FAILURE_STATUS, LOG_FILE (read)
#######################################
generate_report() {
  local report_file post_failure_status
  report_file="$REPORT_DIR/failure_simulation_$(date +%Y%m%d_%H%M%S).txt"
  post_failure_status=$(capture_compose_status)

  cat > "$report_file" << EOF
Failure Simulation Report
========================
Timestamp: $(date)
Failure Type: $FAILURE_TYPE
Duration: $DURATION seconds
Target Nodes: $TARGET_NODES
Intensity: $INTENSITY

Pre-failure Status:
${PRE_FAILURE_STATUS:-Not captured}

Post-failure Status:
$post_failure_status

Log File: $LOG_FILE
EOF

  log "Failure simulation report generated: $report_file"
}

#######################################
# Main execution: validate, snapshot, inject, monitor, clean up, report.
#######################################
main() {
  log "Starting failure simulation: $FAILURE_TYPE for $DURATION seconds"

  validate_inputs

  # Snapshot the healthy state before anything is broken; taking it at
  # report time would only show the post-cleanup state twice.
  PRE_FAILURE_STATUS=$(capture_compose_status)

  # Inject failure
  inject_failure

  # Monitor during failure
  monitor_failure

  # Cleanup
  cleanup_failure

  # Generate report
  generate_report

  log "Failure simulation completed successfully"
}

# Trap for cleanup on script exit
trap cleanup_failure EXIT

# Initialize logging
mkdir -p "$(dirname "$LOG_FILE")" "$REPORT_DIR"

# Run main function
main "$@"