#!/bin/bash
#
# Enterprise Infrastructure Failure Simulation Script
#
# Simulates various types of infrastructure failures for testing by acting on
# the containers of the docker-compose project in the current directory.
#
# Usage:
#   <script> [FAILURE_TYPE] [DURATION] [TARGET_NODES]
#
#   FAILURE_TYPE  network | disk | service | node | cpu | memory
#                 (default: network)
#   DURATION      seconds to keep the failure active (default: 60)
#   TARGET_NODES  "all" or one or more compose service names (default: all)
#
# Environment:
#   INTENSITY     low | medium | high | critical (default: medium).
#                 Validated for every failure type, but only the disk
#                 failure currently varies with it.
#
# Side effects: appends to logs/failure_simulation.log, writes a report under
# reports/, and keeps per-failure state files under /tmp while a failure is
# active.

set -euo pipefail

# Configuration
# DOCKER_COMPOSE_FILE and INVENTORY_FILE are not referenced anywhere in this
# script; they are kept so the configuration block stays unchanged.
# shellcheck disable=SC2034
readonly DOCKER_COMPOSE_FILE="docker-compose.yml"
# shellcheck disable=SC2034
readonly INVENTORY_FILE="inventory/hosts.ini"
readonly LOG_FILE="logs/failure_simulation.log"
readonly REPORT_DIR="reports"

# State files record what was broken so cleanup_failure can undo it.
# The paths are fixed: cleanup_failure processes whatever state files exist,
# including ones left behind by an earlier, interrupted run.
readonly NETWORK_STATE_FILE="/tmp/network_failure_state"
readonly DISK_STATE_FILE="/tmp/disk_failure_state"
readonly SERVICE_STATE_FILE="/tmp/service_failure_state"
readonly NODE_STATE_FILE="/tmp/node_failure_state"
readonly CPU_STATE_FILE="/tmp/cpu_failure_state"
readonly MEMORY_STATE_FILE="/tmp/memory_failure_state"

# Services stopped by the "service" failure and restarted during cleanup.
readonly MANAGED_SERVICES=(nginx postgresql haproxy)

# Upper bound, in seconds, between two monitoring samples.
readonly MONITOR_INTERVAL=10

# Default values
FAILURE_TYPE="${1:-network}"
DURATION="${2:-60}"
TARGET_NODES="${3:-all}"
INTENSITY="${INTENSITY:-medium}"

# Set once cleanup_failure has run, so the EXIT trap does not repeat it.
CLEANUP_DONE=0

# Output of "docker-compose ps" taken before the failure is injected.
PRE_FAILURE_STATUS=""

#######################################
# Writes a timestamped message to stdout and appends it to LOG_FILE.
# Arguments: message words (joined with spaces)
#######################################
log() {
  printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG_FILE"
}

#######################################
# Logs an error, undoes any active failure and exits with status 1.
# Arguments: $1 - error description
#######################################
error_exit() {
  log "ERROR: $1"
  # Cleanup any active failures
  cleanup_failure
  exit 1
}

#######################################
# Validates FAILURE_TYPE, DURATION and INTENSITY; exits via error_exit on the
# first invalid value.
#######################################
validate_inputs() {
  case "$FAILURE_TYPE" in
    network | disk | service | node | cpu | memory) ;;
    *)
      error_exit "Invalid failure type: $FAILURE_TYPE. Must be network, disk, service, node, cpu, or memory"
      ;;
  esac

  # The regex check runs first so the numeric comparison never sees a
  # non-numeric value.
  if ! [[ "$DURATION" =~ ^[0-9]+$ ]] || [ "$DURATION" -lt 1 ]; then
    error_exit "Invalid duration: $DURATION. Must be a positive integer (seconds)"
  fi

  case "$INTENSITY" in
    low | medium | high | critical) ;;
    *)
      error_exit "Invalid intensity: $INTENSITY. Must be low, medium, high, or critical"
      ;;
  esac
}

#######################################
# Prints the compose service names to target, one or more words.
# "all" lists every service of the compose project; any other value of
# TARGET_NODES is printed as given.
#######################################
get_target_containers() {
  case "$TARGET_NODES" in
    all)
      # grep exits non-zero when nothing is left; that is not an error here.
      docker-compose ps --services | grep -v "^NAME$" || true
      ;;
    *)
      printf '%s\n' "$TARGET_NODES"
      ;;
  esac
}

#######################################
# Logs the simulation banner and runs an action on every container of every
# targeted service.
# Arguments:
#   $1 - description used in the log line ("Simulating <desc> on containers")
#   $2 - name of a function invoked as: <function> <container-id>
#######################################
run_on_targets() {
  local description=$1
  local action=$2
  local services service container_ids cid

  services=$(get_target_containers)
  log "Simulating $description on containers: $services"

  # Word splitting is intentional: both lists are whitespace-separated.
  # shellcheck disable=SC2086
  for service in $services; do
    container_ids=$(docker-compose ps -q "$service" 2>/dev/null || true)
    for cid in $container_ids; do
      [[ -n "$cid" ]] || continue
      "$action" "$cid"
    done
  done
}

#######################################
# Prints the PID (second column of "ps aux") of the first process in a
# container whose ps line contains the given text. Prints nothing when no
# process matches.
# Arguments: $1 - container id, $2 - literal text to look for
#######################################
find_container_pid() {
  docker exec "$1" ps aux \
    | awk -v needle="$2" 'index($0, needle) && !/grep/ { print $2; exit }'
}

#######################################
# Maps an intensity level to the number of megabytes the disk failure writes.
# Arguments: $1 - intensity (low | medium | high | critical)
# Outputs:   megabyte count on stdout (100 for any other value)
#######################################
disk_fill_megabytes() {
  case "$1" in
    low) printf '50\n' ;;
    high) printf '500\n' ;;
    critical) printf '1024\n' ;;
    *) printf '100\n' ;;
  esac
}

# Disconnects one container from the network named by its NetworkMode.
disconnect_container_network() {
  local cid=$1
  local network

  network=$(docker inspect "$cid" --format '{{.HostConfig.NetworkMode}}' 2>/dev/null || true)
  if [[ -z "$network" ]]; then
    log "WARNING: Could not determine network for container $cid; skipping"
    return 0
  fi

  log "Disconnecting network for container $cid"
  # Store original network for restoration before disconnecting, so an
  # interruption right after the disconnect can still be undone.
  printf '%s:%s\n' "$cid" "$network" >> "$NETWORK_STATE_FILE"
  docker network disconnect "$network" "$cid" 2>/dev/null || true
}

# Writes a zero-filled file into one container to consume disk space.
fill_container_disk() {
  local cid=$1
  local fill_mb

  fill_mb=$(disk_fill_megabytes "$INTENSITY")
  log "Filling disk space in container $cid"
  # Recorded first so a partially written file is still removed on cleanup.
  printf '%s:disk_fill\n' "$cid" >> "$DISK_STATE_FILE"
  # bs=1M with count=<megabytes> writes exactly the requested size.
  docker exec "$cid" dd if=/dev/zero of=/tmp/disk_fill bs=1M "count=${fill_mb}" 2>/dev/null || true
}

# Stops the managed services inside one container (best effort).
stop_container_services() {
  local cid=$1
  local service

  log "Stopping services in container $cid"
  printf '%s:services\n' "$cid" >> "$SERVICE_STATE_FILE"
  for service in "${MANAGED_SERVICES[@]}"; do
    docker exec "$cid" systemctl stop "$service" 2>/dev/null || true
  done
}

# Pauses one container; a failed pause aborts the simulation.
pause_container() {
  local cid=$1

  log "Stopping container $cid (node failure)"
  docker pause "$cid" || error_exit "Failed to pause container $cid"
  printf '%s:paused\n' "$cid" >> "$NODE_STATE_FILE"
}

# Starts a busy loop inside one container and records its PID.
start_cpu_stress() {
  local cid=$1
  local pid

  log "Starting CPU stress in container $cid"
  docker exec -d "$cid" bash -c "while true; do :; done" 2>/dev/null || true
  pid=$(find_container_pid "$cid" "while true") || true
  printf '%s:cpu_stress:%s\n' "$cid" "$pid" >> "$CPU_STATE_FILE"
}

# Starts "tail /dev/zero" inside one container and records its PID.
start_memory_stress() {
  local cid=$1
  local pid

  log "Starting memory stress in container $cid"
  docker exec -d "$cid" bash -c "tail /dev/zero" 2>/dev/null || true
  pid=$(find_container_pid "$cid" "tail /dev/zero") || true
  printf '%s:memory_stress:%s\n' "$cid" "$pid" >> "$MEMORY_STATE_FILE"
}

# Network failure simulation
simulate_network_failure() {
  run_on_targets "network failure" disconnect_container_network
}

# Disk failure simulation
simulate_disk_failure() {
  run_on_targets "disk space exhaustion" fill_container_disk
}

# Service failure simulation
simulate_service_failure() {
  run_on_targets "service failures" stop_container_services
}

# Node failure simulation
simulate_node_failure() {
  run_on_targets "complete node failures" pause_container
}

# CPU stress simulation
simulate_cpu_failure() {
  run_on_targets "CPU stress" start_cpu_stress
}

# Memory stress simulation
simulate_memory_failure() {
  run_on_targets "memory exhaustion" start_memory_stress
}

#######################################
# Dispatches to the simulation matching FAILURE_TYPE.
# Unknown types are ignored here; validate_inputs rejects them earlier.
#######################################
inject_failure() {
  case "$FAILURE_TYPE" in
    network) simulate_network_failure ;;
    disk) simulate_disk_failure ;;
    service) simulate_service_failure ;;
    node) simulate_node_failure ;;
    cpu) simulate_cpu_failure ;;
    memory) simulate_memory_failure ;;
  esac
}

#######################################
# Kills the processes recorded in a stress state file and removes the file.
# Arguments: $1 - state file with lines "<container-id>:<label>:<pid>"
#######################################
kill_recorded_processes() {
  local state_file=$1
  local cid pid

  [[ -f "$state_file" ]] || return 0
  while IFS=: read -r cid _ pid; do
    # The PID is empty when the process could not be found at injection time.
    [[ -n "$pid" ]] || continue
    docker exec "$cid" kill -9 "$pid" 2>/dev/null || true
  done < "$state_file"
  rm -f -- "$state_file"
}

#######################################
# Undoes every failure recorded in the state files, then removes them.
# Every step is best effort. Runs at most once per script invocation, so the
# explicit calls and the EXIT trap do not repeat the work or the log line.
#######################################
cleanup_failure() {
  if ((CLEANUP_DONE)); then
    return 0
  fi
  CLEANUP_DONE=1

  # Logging must not prevent the restoration below.
  log "Cleaning up failure simulation" || true

  local cid network service

  # Restore network connections
  if [[ -f "$NETWORK_STATE_FILE" ]]; then
    while IFS=: read -r cid network; do
      docker network connect "$network" "$cid" 2>/dev/null || true
    done < "$NETWORK_STATE_FILE"
    rm -f -- "$NETWORK_STATE_FILE"
  fi

  # Clean up disk fill files
  if [[ -f "$DISK_STATE_FILE" ]]; then
    while IFS=: read -r cid _; do
      docker exec "$cid" rm -f /tmp/disk_fill 2>/dev/null || true
    done < "$DISK_STATE_FILE"
    rm -f -- "$DISK_STATE_FILE"
  fi

  # Restart services
  if [[ -f "$SERVICE_STATE_FILE" ]]; then
    while IFS=: read -r cid _; do
      for service in "${MANAGED_SERVICES[@]}"; do
        docker exec "$cid" systemctl start "$service" 2>/dev/null || true
      done
    done < "$SERVICE_STATE_FILE"
    rm -f -- "$SERVICE_STATE_FILE"
  fi

  # Unpause containers
  if [[ -f "$NODE_STATE_FILE" ]]; then
    while IFS=: read -r cid _; do
      docker unpause "$cid" 2>/dev/null || true
    done < "$NODE_STATE_FILE"
    rm -f -- "$NODE_STATE_FILE"
  fi

  # Kill stress processes
  kill_recorded_processes "$CPU_STATE_FILE"
  kill_recorded_processes "$MEMORY_STATE_FILE"
}

#######################################
# Logs container status and resource usage until DURATION seconds have
# passed. Samples are at most MONITOR_INTERVAL seconds apart and the final
# sleep is shortened so the loop does not overrun the deadline.
#######################################
monitor_failure() {
  local end_time remaining pause compose_status stats

  end_time=$(($(date +%s) + DURATION))
  log "Monitoring failure for $DURATION seconds"

  while (($(date +%s) < end_time)); do
    # Check container status. The output is captured first: piping straight
    # into "grep -q" under pipefail can report a failure when grep exits
    # before docker-compose has finished writing.
    compose_status=$(docker-compose ps 2>/dev/null || true)
    if ! grep -q "Up\|Paused" <<< "$compose_status"; then
      log "WARNING: All containers are down"
    fi

    # Log system metrics
    stats=$(docker stats --no-stream \
      --format 'table {{.Container}}\t{{.CPUPerc}}\t{{.MemPerc}}' \
      | tail -n +2) || true
    log "System status: $stats"

    remaining=$((end_time - $(date +%s)))
    ((remaining > 0)) || break
    pause=$((remaining < MONITOR_INTERVAL ? remaining : MONITOR_INTERVAL))
    sleep "$pause"
  done
}

#######################################
# Writes a text report under REPORT_DIR.
# Globals: PRE_FAILURE_STATUS (read) - status captured before injection;
#          the post-failure status is queried when the report is written.
#######################################
generate_report() {
  local report_file post_failure_status

  report_file="${REPORT_DIR}/failure_simulation_$(date +%Y%m%d_%H%M%S).txt"
  post_failure_status=$(docker-compose ps || true)

  cat > "$report_file" << EOF
Failure Simulation Report
========================
Timestamp: $(date)
Failure Type: $FAILURE_TYPE
Duration: $DURATION seconds
Target Nodes: $TARGET_NODES
Intensity: $INTENSITY

Pre-failure Status:
$PRE_FAILURE_STATUS

Post-failure Status:
$post_failure_status

Log File: $LOG_FILE
EOF

  log "Failure simulation report generated: $report_file"
}

#######################################
# Main execution: validate, inject, monitor, clean up, report.
#######################################
main() {
  log "Starting failure simulation: $FAILURE_TYPE for $DURATION seconds"

  validate_inputs

  # Snapshot taken before anything is broken, for the report.
  PRE_FAILURE_STATUS=$(docker-compose ps || true)

  # Inject failure
  inject_failure

  # Monitor during failure
  monitor_failure

  # Cleanup
  cleanup_failure

  # Generate report
  generate_report

  log "Failure simulation completed successfully"
}

# Trap for cleanup on script exit
trap cleanup_failure EXIT

# Initialize logging
mkdir -p "$(dirname -- "$LOG_FILE")" "$REPORT_DIR"

# Run main function
main "$@"