7757020014
CI Pipeline / lint-ansible (push) Waiting to run
CI Pipeline / test-python (push) Waiting to run
CI Pipeline / validate-docker (push) Waiting to run
CI Pipeline / security-scan (push) Waiting to run
CI Pipeline / documentation (push) Waiting to run
CI Pipeline / integration-test (push) Blocked by required conditions
343 lines
9.8 KiB
Bash
343 lines
9.8 KiB
Bash
#!/bin/bash
#
# Enterprise Infrastructure Failure Simulation Script
# Simulates various types of infrastructure failures for testing
#
# Positional arguments (all optional):
#   $1  failure type: network, disk, service, node, cpu or memory
#       (default: network)
#   $2  duration of the failure in seconds (default: 60)
#   $3  target: "all" or a docker-compose service name (default: all)
# Environment:
#   INTENSITY  low, medium, high or critical (default: medium)

set -euo pipefail

# Configuration
# Fixed paths; marked readonly so no later code can repoint them by accident.
# NOTE(review): DOCKER_COMPOSE_FILE and INVENTORY_FILE are not referenced by
# the code visible in this file - confirm whether they are still needed.
readonly DOCKER_COMPOSE_FILE="docker-compose.yml"
readonly INVENTORY_FILE="inventory/hosts.ini"
readonly LOG_FILE="logs/failure_simulation.log"

# Default values
FAILURE_TYPE="${1:-network}"
DURATION="${2:-60}"
TARGET_NODES="${3:-all}"
INTENSITY="${INTENSITY:-medium}"
|
|
|
|
# Logging function
# Prefixes the message (all arguments joined by spaces) with a
# "YYYY-MM-DD HH:MM:SS - " timestamp, prints it on stdout and appends the
# same line to $LOG_FILE.
log() {
    printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG_FILE"
}
|
|
|
|
# Error handling
# Logs "ERROR: <message>", rolls back any injected failure, then terminates
# the script with exit status 1.
# Arguments: $1 - message to log
error_exit() {
    local message="$1"

    log "ERROR: ${message}"

    # Cleanup any active failures
    cleanup_failure

    exit 1
}
|
|
|
|
# Validate inputs
# Checks FAILURE_TYPE, DURATION and INTENSITY in that order and calls
# error_exit with a descriptive message on the first invalid value.
validate_inputs() {
    if [[ ! "$FAILURE_TYPE" =~ ^(network|disk|service|node|cpu|memory)$ ]]; then
        error_exit "Invalid failure type: $FAILURE_TYPE. Must be network, disk, service, node, cpu, or memory"
    fi

    # The numeric comparison uses `[` on purpose: it reads the digits as
    # decimal, so a value such as "08" is compared as 8.
    if ! [[ "$DURATION" =~ ^[0-9]+$ ]] || [ "$DURATION" -lt 1 ]; then
        error_exit "Invalid duration: $DURATION. Must be a positive integer (seconds)"
    fi

    if [[ ! "$INTENSITY" =~ ^(low|medium|high|critical)$ ]]; then
        error_exit "Invalid intensity: $INTENSITY. Must be low, medium, high, or critical"
    fi
}
|
|
|
|
# Get target containers
# Prints the compose service names to act on, based on TARGET_NODES:
#   all            - every service reported by `docker-compose ps --services`
#                    (a literal "NAME" line is filtered out; a failing or
#                    empty listing yields no output rather than an error)
#   anything else  - the value of TARGET_NODES itself, unchanged
get_target_containers() {
    if [[ "$TARGET_NODES" == "all" ]]; then
        docker-compose ps --services | grep -v "^NAME$" || true
        return
    fi

    echo "$TARGET_NODES"
}
|
|
|
|
# Network failure simulation
# Detaches every targeted container from all Docker networks it is attached
# to. Each (container, network) pair is appended to
# /tmp/network_failure_state as "<container-id>:<network>" so that
# cleanup_failure can reconnect it.
simulate_network_failure() {
    local containers container container_ids cid networks network

    containers=$(get_target_containers)
    log "Simulating network failure on containers: $containers"

    for container in $containers; do
        container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true)

        for cid in $container_ids; do
            # Enumerate the attached networks instead of relying on
            # HostConfig.NetworkMode, which names a single network only and
            # would leave multi-network containers reachable.
            networks=$(docker inspect "$cid" \
                --format '{{range $name, $cfg := .NetworkSettings.Networks}}{{println $name}}{{end}}' \
                2>/dev/null || true)

            if [[ -z "$networks" ]]; then
                log "WARNING: no networks found for container $cid"
                continue
            fi

            log "Disconnecting network for container $cid"

            for network in $networks; do
                # Record the network BEFORE disconnecting: if the script is
                # interrupted in between, cleanup can still restore it.
                # Reconnecting a network that never got detached is harmless.
                echo "$cid:$network" >> /tmp/network_failure_state

                # Best effort: e.g. the "host"/"none" pseudo-networks cannot
                # be disconnected and are skipped silently.
                docker network disconnect "$network" "$cid" 2>/dev/null || true
            done
        done
    done
}
|
|
|
|
# Disk failure simulation
# Writes a zero-filled file /tmp/disk_fill inside every targeted container.
# File size by INTENSITY: low 50 MiB, medium 100 MiB, high 500 MiB,
# critical 1024 MiB. Each container is recorded in /tmp/disk_failure_state
# as "<container-id>:disk_fill" for cleanup_failure.
simulate_disk_failure() {
    local containers container container_ids cid fill_mb

    # Size in MiB, used directly as the dd block count with bs=1M. Keeping it
    # a plain integer avoids unit-suffix arithmetic: multiplying by 1024
    # inflated the size a thousandfold and "1G" was not a valid number.
    case "$INTENSITY" in
        low) fill_mb=50 ;;
        high) fill_mb=500 ;;
        critical) fill_mb=1024 ;;
        *) fill_mb=100 ;;
    esac

    containers=$(get_target_containers)
    log "Simulating disk space exhaustion on containers: $containers"

    for container in $containers; do
        container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true)

        for cid in $container_ids; do
            log "Filling disk space in container $cid (${fill_mb} MiB)"

            # Record first so a partially written file is still removed if
            # the script is interrupted while dd is running.
            echo "$cid:disk_fill" >> /tmp/disk_failure_state

            # dd is executed directly, so the container does not need bash.
            # Best effort: a full disk or a missing dd must not abort the run.
            docker exec "$cid" dd if=/dev/zero of=/tmp/disk_fill bs=1M \
                count="$fill_mb" 2>/dev/null || true
        done
    done
}
|
|
|
|
# Service failure simulation
# Runs `systemctl stop` for nginx, postgresql and haproxy inside every
# targeted container and appends "<container-id>:services" to
# /tmp/service_failure_state. Errors from systemctl are suppressed
# (presumably because not every container runs every service).
simulate_service_failure() {
    local targets service_name ids id unit

    targets=$(get_target_containers) || true
    log "Simulating service failures on containers: $targets"

    for service_name in $targets; do
        ids=$(docker-compose ps -q "$service_name" 2>/dev/null || true)

        for id in $ids; do
            [ -n "$id" ] || continue

            log "Stopping services in container $id"

            # Stop common services
            for unit in nginx postgresql haproxy; do
                docker exec "$id" systemctl stop "$unit" 2>/dev/null || true
            done

            echo "$id:services" >> /tmp/service_failure_state
        done
    done
}
|
|
|
|
# Node failure simulation
# Freezes every targeted container with `docker pause` (the log line says
# "Stopping", but the container is paused, not stopped) and appends
# "<container-id>:paused" to /tmp/node_failure_state. Unlike the other
# simulations, a failing `docker pause` is not suppressed.
simulate_node_failure() {
    local targets service_name ids id

    targets=$(get_target_containers) || true
    log "Simulating complete node failures on containers: $targets"

    for service_name in $targets; do
        ids=$(docker-compose ps -q "$service_name" 2>/dev/null || true)

        for id in $ids; do
            [ -n "$id" ] || continue

            log "Stopping container $id (node failure)"
            docker pause "$id"
            echo "$id:paused" >> /tmp/node_failure_state
        done
    done
}
|
|
|
|
# CPU stress simulation
# Starts a detached busy loop inside every targeted container and appends
# "<container-id>:cpu_stress:<pid>" to /tmp/cpu_failure_state so that
# cleanup_failure can kill it. Containers in which the loop's PID cannot be
# determined are logged and not recorded.
simulate_cpu_failure() {
    local -r pid_file="/tmp/cpu_stress.pid"
    local containers container container_ids cid pid attempt

    containers=$(get_target_containers)
    log "Simulating CPU stress on containers: $containers"

    for container in $containers; do
        container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true)

        for cid in $container_ids; do
            log "Starting CPU stress in container $cid"

            # Drop a PID file left by an earlier run so it cannot be mistaken
            # for the process started below.
            docker exec "$cid" rm -f "$pid_file" 2>/dev/null || true

            # The loop writes its own PID before spinning. This replaces
            # scraping `ps | grep`, which needed ps in the image, could match
            # an unrelated process and raced with the detached start.
            docker exec -d "$cid" sh -c \
                "echo \$\$ > ${pid_file}; while :; do :; done" \
                2>/dev/null || true

            # The exec is detached, so give it a moment to write the file.
            pid=""
            for attempt in 1 2 3 4 5; do
                pid=$(docker exec "$cid" cat "$pid_file" 2>/dev/null || true)
                if [[ "$pid" =~ ^[0-9]+$ ]]; then
                    break
                fi
                pid=""
                sleep 1
            done

            if [[ -n "$pid" ]]; then
                echo "$cid:cpu_stress:$pid" >> /tmp/cpu_failure_state
            else
                log "WARNING: could not start CPU stress in container $cid"
            fi
        done
    done
}
|
|
|
|
# Memory stress simulation
# Starts a detached `tail /dev/zero` inside every targeted container and
# appends "<container-id>:memory_stress:<pid>" to /tmp/memory_failure_state
# so that cleanup_failure can kill it. Containers in which the PID cannot be
# determined are logged and not recorded.
simulate_memory_failure() {
    local -r pid_file="/tmp/memory_stress.pid"
    local containers container container_ids cid pid attempt

    containers=$(get_target_containers)
    log "Simulating memory exhaustion on containers: $containers"

    for container in $containers; do
        container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true)

        for cid in $container_ids; do
            log "Starting memory stress in container $cid"

            # Drop a PID file left by an earlier run so it cannot be mistaken
            # for the process started below.
            docker exec "$cid" rm -f "$pid_file" 2>/dev/null || true

            # The shell writes its PID and then exec's tail, so the recorded
            # PID is the PID of tail itself. This replaces scraping
            # `ps | grep`, which needed ps in the image and raced with the
            # detached start.
            docker exec -d "$cid" sh -c \
                "echo \$\$ > ${pid_file}; exec tail /dev/zero" \
                2>/dev/null || true

            # The exec is detached, so give it a moment to write the file.
            pid=""
            for attempt in 1 2 3 4 5; do
                pid=$(docker exec "$cid" cat "$pid_file" 2>/dev/null || true)
                if [[ "$pid" =~ ^[0-9]+$ ]]; then
                    break
                fi
                pid=""
                sleep 1
            done

            if [[ -n "$pid" ]]; then
                echo "$cid:memory_stress:$pid" >> /tmp/memory_failure_state
            else
                log "WARNING: could not start memory stress in container $cid"
            fi
        done
    done
}
|
|
|
|
# Inject failure
# Dispatches to simulate_<FAILURE_TYPE>_failure for the six known failure
# types; any other value is ignored and nothing is run.
inject_failure() {
    case "$FAILURE_TYPE" in
        network|disk|service|node|cpu|memory)
            "simulate_${FAILURE_TYPE}_failure"
            ;;
    esac
}
|
|
|
|
# Cleanup failure
# Reverts whatever the simulate_* functions recorded. For each failure kind,
# in the order network, disk, service, node, cpu, memory, the state file
# /tmp/<kind>_failure_state is processed line by line (if it exists) and
# then deleted. Every docker command is best effort: errors are suppressed.
cleanup_failure() {
    local kind state_file entry cid network pid unit

    log "Cleaning up failure simulation"

    for kind in network disk service node cpu memory; do
        state_file="/tmp/${kind}_failure_state"
        if [ ! -f "$state_file" ]; then
            continue
        fi

        while IFS= read -r entry; do
            case "$kind" in
                network)
                    # Restore network connections
                    IFS=: read -r cid network <<< "$entry"
                    docker network connect "$network" "$cid" 2>/dev/null || true
                    ;;
                disk)
                    # Clean up disk fill files
                    IFS=: read -r cid _ <<< "$entry"
                    docker exec "$cid" rm -f /tmp/disk_fill 2>/dev/null || true
                    ;;
                service)
                    # Restart services
                    IFS=: read -r cid _ <<< "$entry"
                    for unit in nginx postgresql haproxy; do
                        docker exec "$cid" systemctl start "$unit" 2>/dev/null || true
                    done
                    ;;
                node)
                    # Unpause containers
                    IFS=: read -r cid _ <<< "$entry"
                    docker unpause "$cid" 2>/dev/null || true
                    ;;
                cpu|memory)
                    # Kill stress processes; the PID is the third field
                    IFS=: read -r cid _ pid <<< "$entry"
                    docker exec "$cid" kill -9 "$pid" 2>/dev/null || true
                    ;;
            esac
        done < "$state_file"

        rm -f "$state_file"
    done
}
|
|
|
|
# Monitor failure
# Keeps the failure active for DURATION seconds. Every poll (at most 10
# seconds apart) it logs a warning when `docker-compose ps` shows no
# container as "Up" or "Paused", and logs a `docker stats` snapshot.
monitor_failure() {
    local -r poll_interval=10
    local now end_time remaining compose_status

    now=$(date +%s)
    # Force base 10: a zero-padded DURATION such as "08" passes input
    # validation but would otherwise be rejected as an invalid octal number.
    end_time=$(( now + 10#$DURATION ))

    log "Monitoring failure for $DURATION seconds"

    while now=$(date +%s); (( now < end_time )); do
        # Check container status. The listing is captured first instead of
        # piped into `grep -q`: under pipefail, grep exiting early can make
        # the pipeline fail and trigger a false "all down" warning.
        compose_status=$(docker-compose ps) || compose_status=""
        if ! grep -q "Up\|Paused" <<< "$compose_status"; then
            log "WARNING: All containers are down"
        fi

        # Log system metrics (header row of the table is dropped)
        log "System status: $(docker stats --no-stream --format 'table {{.Container}}\t{{.CPUPerc}}\t{{.MemPerc}}' | tail -n +2)"

        # Sleep no longer than the time left, so short durations (or the
        # final poll) do not overshoot the requested duration.
        remaining=$(( end_time - $(date +%s) ))
        if (( remaining <= 0 )); then
            break
        fi
        sleep "$(( remaining < poll_interval ? remaining : poll_interval ))"
    done
}
|
|
|
|
# Generate failure report
# Writes reports/failure_simulation_<YYYYmmdd_HHMMSS>.txt with the run
# parameters, the container status captured before the failure was injected
# (PRE_FAILURE_STATUS, set by main) and the container status at report time.
generate_report() {
    local report_file
    report_file="reports/failure_simulation_$(date +%Y%m%d_%H%M%S).txt"

    # The pre-failure section uses the snapshot taken by main. Sampling
    # `docker-compose ps` here would just repeat the post-failure status.
    cat > "$report_file" << EOF
Failure Simulation Report
========================

Timestamp: $(date)
Failure Type: $FAILURE_TYPE
Duration: $DURATION seconds
Target Nodes: $TARGET_NODES
Intensity: $INTENSITY

Pre-failure Status:
${PRE_FAILURE_STATUS:-(not captured)}

Post-failure Status:
$(docker-compose ps)

Log File: $LOG_FILE
EOF

    log "Failure simulation report generated: $report_file"
}

# Main execution
# Validates the inputs, snapshots the container status, injects the failure,
# monitors it for DURATION seconds, cleans up and writes the report.
main() {
    log "Starting failure simulation: $FAILURE_TYPE for $DURATION seconds"

    validate_inputs

    # Snapshot the healthy state for the report before anything is broken.
    # Best effort: an unavailable listing must not prevent the simulation.
    PRE_FAILURE_STATUS=$(docker-compose ps 2>&1 || true)

    # Inject failure
    inject_failure

    # Monitor during failure
    monitor_failure

    # Cleanup
    cleanup_failure

    # Generate report
    generate_report

    log "Failure simulation completed successfully"
}
|
|
|
|
# Initialize logging
# Done before the trap is installed: cleanup_failure logs into logs/, so the
# directory has to exist by the time the trap can fire.
mkdir -p logs reports

# Trap for cleanup on script exit
trap cleanup_failure EXIT

# Run main function
main "$@"