feat: Add comprehensive enterprise Linux infrastructure portfolio with Ansible, Python, and ELK stack
CI Pipeline / lint-ansible (push) Waiting to run
CI Pipeline / test-python (push) Waiting to run
CI Pipeline / validate-docker (push) Waiting to run
CI Pipeline / security-scan (push) Waiting to run
CI Pipeline / documentation (push) Waiting to run
CI Pipeline / integration-test (push) Blocked by required conditions
CI Pipeline / lint-ansible (push) Waiting to run
CI Pipeline / test-python (push) Waiting to run
CI Pipeline / validate-docker (push) Waiting to run
CI Pipeline / security-scan (push) Waiting to run
CI Pipeline / documentation (push) Waiting to run
CI Pipeline / integration-test (push) Blocked by required conditions
This commit is contained in:
@@ -0,0 +1,343 @@
|
||||
#!/bin/bash
#
# Enterprise Infrastructure Failure Simulation Script
# Simulates various types of infrastructure failures for testing
#
# Positional arguments (all optional):
#   $1  failure type        (default: network)
#   $2  duration in seconds (default: 60)
#   $3  target nodes        (default: all)
# Environment:
#   INTENSITY  failure intensity (default: medium)

set -euo pipefail

# Configuration
DOCKER_COMPOSE_FILE="docker-compose.yml"
INVENTORY_FILE="inventory/hosts.ini"
LOG_FILE="logs/failure_simulation.log"

# Default values: positional parameters win, otherwise the defaults apply.
FAILURE_TYPE=${1:-network}
DURATION=${2:-60}
TARGET_NODES=${3:-all}
# Keep a caller-supplied INTENSITY, fall back to "medium" when unset or empty.
: "${INTENSITY:=medium}"
|
||||
|
||||
# Logging function
# Joins all arguments into one message, prefixes it with a
# "YYYY-MM-DD HH:MM:SS" timestamp, prints it to stdout and appends the same
# line to $LOG_FILE.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '%s - %s\n' "$stamp" "$*" | tee -a "$LOG_FILE"
}
|
||||
|
||||
# Error handling
# Logs "ERROR: <message>", rolls back any failure that is still active and
# terminates the script with exit status 1.
# Arguments: $1 - error message
error_exit() {
  local reason=$1

  log "ERROR: $reason"

  # Cleanup any active failures
  cleanup_failure

  exit 1
}
|
||||
|
||||
# Validate inputs
# Checks FAILURE_TYPE, DURATION and INTENSITY; on the first invalid value it
# calls error_exit with a message describing the accepted values.
validate_inputs() {
  if [[ ! "$FAILURE_TYPE" =~ ^(network|disk|service|node|cpu|memory)$ ]]; then
    error_exit "Invalid failure type: $FAILURE_TYPE. Must be network, disk, service, node, cpu, or memory"
  fi

  # Duration must consist of digits only and be at least 1.
  if [[ ! "$DURATION" =~ ^[0-9]+$ ]] || [ "$DURATION" -lt 1 ]; then
    error_exit "Invalid duration: $DURATION. Must be a positive integer (seconds)"
  fi

  if [[ ! "$INTENSITY" =~ ^(low|medium|high|critical)$ ]]; then
    error_exit "Invalid intensity: $INTENSITY. Must be low, medium, high, or critical"
  fi
}
|
||||
|
||||
# Get target containers
# Prints the compose service names to operate on.
#   TARGET_NODES=all  -> every service reported by `docker-compose ps --services`
#                        (a literal "NAME" line is filtered out)
#   anything else     -> the value of TARGET_NODES itself
get_target_containers() {
  if [[ "$TARGET_NODES" == "all" ]]; then
    # `|| true`: an empty or failed listing must not abort the script.
    docker-compose ps --services | grep -v "^NAME$" || true
    return 0
  fi

  echo "$TARGET_NODES"
}
|
||||
|
||||
# Network failure simulation
#
# For every container of the targeted services, looks up the network named by
# its HostConfig.NetworkMode and disconnects the container from it.
# The "<cid>:<network>" pair is appended to /tmp/network_failure_state
# *before* the disconnect, so cleanup_failure can reconnect the container even
# if the script is interrupted between the two steps.
# Globals:  TARGET_NODES (read, via get_target_containers)
# Outputs:  progress and warnings via log
simulate_network_failure() {
  local containers container container_ids cid network

  containers=$(get_target_containers)
  log "Simulating network failure on containers: $containers"

  # Word splitting is intentional: names and ids arrive whitespace-separated.
  # shellcheck disable=SC2086
  for container in $containers; do
    container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true)

    for cid in $container_ids; do
      [[ -n "$cid" ]] || continue

      # Inspect once and reuse the value for both the state file and the
      # disconnect, so they can never disagree.
      network=$(docker inspect "$cid" --format '{{.HostConfig.NetworkMode}}' 2>/dev/null || true)
      if [[ -z "$network" ]]; then
        log "WARNING: could not determine network for container $cid, skipping"
        continue
      fi

      log "Disconnecting network for container $cid"

      # Store original network for restoration
      echo "$cid:$network" >> /tmp/network_failure_state

      # Disconnect from network (best effort: keep going on failure)
      if ! docker network disconnect "$network" "$cid" 2>/dev/null; then
        log "WARNING: failed to disconnect container $cid from $network"
      fi
    done
  done
}
|
||||
|
||||
# Disk failure simulation
#
# Writes a zero-filled file /tmp/disk_fill inside every container of the
# targeted services. The file size depends on INTENSITY:
#   low = 50 MiB, medium = 100 MiB, high = 500 MiB, critical = 1024 MiB
# Each touched container is recorded as "<cid>:disk_fill" in
# /tmp/disk_failure_state so cleanup_failure can remove the file again.
# Globals:  INTENSITY (read), TARGET_NODES (read, via get_target_containers)
simulate_disk_failure() {
  local containers container container_ids cid fill_mb

  # Size in MiB. Kept as a plain number so it can be handed to dd as a block
  # count directly (bs=1M), with no unit suffix to parse.
  case "$INTENSITY" in
    low) fill_mb=50 ;;
    high) fill_mb=500 ;;
    critical) fill_mb=1024 ;;
    *) fill_mb=100 ;; # medium, and the fallback for anything else
  esac

  containers=$(get_target_containers)
  log "Simulating disk space exhaustion on containers: $containers"

  # Word splitting is intentional: names and ids arrive whitespace-separated.
  # shellcheck disable=SC2086
  for container in $containers; do
    container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true)

    for cid in $container_ids; do
      [[ -n "$cid" ]] || continue

      log "Filling disk space in container $cid"

      # Record first so the file is cleaned up even if we are interrupted.
      echo "$cid:disk_fill" >> /tmp/disk_failure_state

      # Create a large file to consume disk space. dd is run directly, so the
      # container does not need bash. A dd failure is tolerated on purpose:
      # running out of space is the point of the exercise.
      docker exec "$cid" dd if=/dev/zero of=/tmp/disk_fill bs=1M "count=$fill_mb" 2>/dev/null || true
    done
  done
}
|
||||
|
||||
# Service failure simulation
# Runs `systemctl stop` for nginx, postgresql and haproxy inside every
# container of the targeted services (errors are ignored) and records each
# container as "<cid>:services" in /tmp/service_failure_state.
simulate_service_failure() {
  local targets
  targets=$(get_target_containers)
  log "Simulating service failures on containers: $targets"

  local svc ids id unit
  for svc in $targets; do
    ids=$(docker-compose ps -q "$svc" 2>/dev/null || true)

    for id in $ids; do
      [ -z "$id" ] && continue

      log "Stopping services in container $id"

      # Stop common services
      for unit in nginx postgresql haproxy; do
        docker exec "$id" systemctl stop "$unit" 2>/dev/null || true
      done

      echo "$id:services" >> /tmp/service_failure_state
    done
  done
}
|
||||
|
||||
# Node failure simulation
# Pauses (`docker pause`) every container of the targeted services and
# records each one as "<cid>:paused" in /tmp/node_failure_state.
simulate_node_failure() {
  local targets
  targets=$(get_target_containers)
  log "Simulating complete node failures on containers: $targets"

  local svc ids id
  for svc in $targets; do
    ids=$(docker-compose ps -q "$svc" 2>/dev/null || true)

    for id in $ids; do
      [ -z "$id" ] && continue

      log "Stopping container $id (node failure)"
      docker pause "$id"
      echo "$id:paused" >> /tmp/node_failure_state
    done
  done
}
|
||||
|
||||
# CPU stress simulation
# Starts a detached busy loop (`while true; do :; done`) via bash inside every
# container of the targeted services, then looks the loop's PID up with
# `ps aux` and records "<cid>:cpu_stress:<pid>" in /tmp/cpu_failure_state.
# The PID field is empty when the lookup finds nothing.
simulate_cpu_failure() {
  local targets
  targets=$(get_target_containers)
  log "Simulating CPU stress on containers: $targets"

  local svc ids id stress_pid
  for svc in $targets; do
    ids=$(docker-compose ps -q "$svc" 2>/dev/null || true)

    for id in $ids; do
      [ -z "$id" ] && continue

      log "Starting CPU stress in container $id"

      # Start CPU stress process
      docker exec -d "$id" bash -c "while true; do :; done" 2>/dev/null || true

      # First matching process wins; a failed lookup yields an empty PID.
      stress_pid=$(docker exec "$id" ps aux | grep "while true" | grep -v grep | awk '{print $2}' | head -1) || true
      echo "$id:cpu_stress:$stress_pid" >> /tmp/cpu_failure_state
    done
  done
}
|
||||
|
||||
# Memory stress simulation
# Starts a detached `tail /dev/zero` via bash inside every container of the
# targeted services, then looks its PID up with `ps aux` and records
# "<cid>:memory_stress:<pid>" in /tmp/memory_failure_state.
# The PID field is empty when the lookup finds nothing.
simulate_memory_failure() {
  local targets
  targets=$(get_target_containers)
  log "Simulating memory exhaustion on containers: $targets"

  local svc ids id stress_pid
  for svc in $targets; do
    ids=$(docker-compose ps -q "$svc" 2>/dev/null || true)

    for id in $ids; do
      [ -z "$id" ] && continue

      log "Starting memory stress in container $id"

      # Start memory stress process
      docker exec -d "$id" bash -c "tail /dev/zero" 2>/dev/null || true

      # First matching process wins; a failed lookup yields an empty PID.
      stress_pid=$(docker exec "$id" ps aux | grep "tail /dev/zero" | grep -v grep | awk '{print $2}' | head -1) || true
      echo "$id:memory_stress:$stress_pid" >> /tmp/memory_failure_state
    done
  done
}
|
||||
|
||||
# Inject failure
# Dispatches to simulate_<FAILURE_TYPE>_failure for the six known failure
# types; any other value is silently ignored.
inject_failure() {
  case "$FAILURE_TYPE" in
    network | disk | service | node | cpu | memory)
      "simulate_${FAILURE_TYPE}_failure"
      ;;
  esac
}
|
||||
|
||||
# Cleanup failure
#
# Every simulate_* function appends "<cid>:<details>" lines to its own state
# file under /tmp. The helpers below walk such a file, undo the failure for
# each recorded container and finally delete the file, so running the cleanup
# again is a no-op.

# Calls "<handler> <cid> <details>" for every line of a state file, then
# removes the file. Does nothing when the file does not exist.
# Arguments: $1 - state file, $2 - handler function name
_cleanup_each() {
  local state_file=$1 handler=$2 cid details

  [[ -f "$state_file" ]] || return 0

  # "details" receives everything after the first colon.
  while IFS=: read -r cid details; do
    [[ -n "$cid" ]] || continue
    "$handler" "$cid" "$details"
  done < "$state_file"

  rm -f -- "$state_file"
}

# Reconnects container $1 to network $2 (best effort).
_cleanup_network() {
  docker network connect "$2" "$1" < /dev/null 2> /dev/null || true
}

# Removes the disk fill file from container $1 (best effort).
_cleanup_disk() {
  docker exec "$1" rm -f /tmp/disk_fill < /dev/null 2> /dev/null || true
}

# Starts nginx, postgresql and haproxy again in container $1 (best effort).
_cleanup_services() {
  local unit
  for unit in nginx postgresql haproxy; do
    docker exec "$1" systemctl start "$unit" < /dev/null 2> /dev/null || true
  done
}

# Unpauses container $1 (best effort).
_cleanup_node() {
  docker unpause "$1" < /dev/null 2> /dev/null || true
}

# Kills the recorded stress process in container $1. $2 is "<label>:<pid>".
# The PID can be empty when the lookup at injection time found nothing; in
# that case no kill is attempted and a warning is logged instead.
_cleanup_stress() {
  local cid=$1 pid=${2#*:}

  if [[ ! "$pid" =~ ^[0-9]+$ ]]; then
    log "WARNING: no PID recorded for stress process in container $cid"
    return 0
  fi
  docker exec "$cid" kill -9 "$pid" < /dev/null 2> /dev/null || true
}

# Reverts every failure that has a state file under /tmp.
# Safe to call repeatedly (it is also installed as the EXIT trap).
cleanup_failure() {
  log "Cleaning up failure simulation"

  # Restore network connections
  _cleanup_each /tmp/network_failure_state _cleanup_network

  # Clean up disk fill files
  _cleanup_each /tmp/disk_failure_state _cleanup_disk

  # Restart services
  _cleanup_each /tmp/service_failure_state _cleanup_services

  # Unpause containers
  _cleanup_each /tmp/node_failure_state _cleanup_node

  # Kill stress processes
  _cleanup_each /tmp/cpu_failure_state _cleanup_stress
  _cleanup_each /tmp/memory_failure_state _cleanup_stress
}
|
||||
|
||||
# Monitor failure
#
# Polls the environment until DURATION seconds have passed. Each round logs a
# warning when `docker-compose ps` shows no container as "Up" or "Paused",
# logs the per-container CPU / memory percentages from `docker stats`, then
# sleeps for 10 seconds or for the remaining time, whichever is shorter, so
# the total wait never overshoots DURATION.
# Globals:  DURATION (read; decimal seconds, leading zeros tolerated)
monitor_failure() {
  local now end_time remaining compose_status

  now=$(date +%s)
  # 10# forces base 10 so a value such as "08" is not parsed as octal.
  end_time=$((now + 10#$DURATION))

  log "Monitoring failure for $DURATION seconds"

  while
    now=$(date +%s)
    ((now < end_time))
  do
    # Check container status. The listing is captured first: piping it
    # straight into `grep -q` can kill the producer with SIGPIPE, which
    # `pipefail` would then report as "nothing is running".
    compose_status=$(docker-compose ps 2> /dev/null || true)
    if ! grep -Eq 'Up|Paused' <<< "$compose_status"; then
      log "WARNING: All containers are down"
    fi

    # Log system metrics
    log "System status: $(docker stats --no-stream --format 'table {{.Container}}\t{{.CPUPerc}}\t{{.MemPerc}}' | tail -n +2)"

    remaining=$((end_time - $(date +%s)))
    ((remaining > 0)) || break
    sleep "$((remaining < 10 ? remaining : 10))"
  done
}
|
||||
|
||||
# Generate failure report
#
# Writes reports/failure_simulation_<timestamp>.txt with the run parameters,
# the container status from before the failure was injected and the status at
# report time.
# Globals:  FAILURE_TYPE, DURATION, TARGET_NODES, INTENSITY, LOG_FILE (read)
#           PRE_FAILURE_STATUS (read; set by main before the failure is
#           injected; reported as "(not captured)" when absent)
generate_report() {
  local report_file
  report_file="reports/failure_simulation_$(date +%Y%m%d_%H%M%S).txt"

  cat > "$report_file" << EOF
Failure Simulation Report
========================

Timestamp: $(date)
Failure Type: $FAILURE_TYPE
Duration: $DURATION seconds
Target Nodes: $TARGET_NODES
Intensity: $INTENSITY

Pre-failure Status:
${PRE_FAILURE_STATUS:-(not captured)}

Post-failure Status:
$(docker-compose ps)

Log File: $LOG_FILE
EOF

  log "Failure simulation report generated: $report_file"
}

# Main execution
#
# Validates the inputs, snapshots the container status, injects the failure,
# monitors it for DURATION seconds, reverts it and writes the report.
# Globals:  PRE_FAILURE_STATUS (written)
main() {
  log "Starting failure simulation: $FAILURE_TYPE for $DURATION seconds"

  validate_inputs

  # Snapshot the healthy state now. Taking it at report time (after cleanup)
  # would make the "pre" and "post" sections identical.
  PRE_FAILURE_STATUS=$(docker-compose ps 2>&1 || true)

  # Inject failure
  inject_failure

  # Monitor during failure
  monitor_failure

  # Cleanup
  cleanup_failure

  # Generate report
  generate_report

  log "Failure simulation completed successfully"
}
|
||||
|
||||
# Trap for cleanup on script exit: whatever way the script ends, any failure
# that is still active gets reverted.
trap 'cleanup_failure' EXIT

# Initialize logging (log and report directories must exist before first use)
mkdir -p logs reports

# Run main function
main "$@"
|
||||
@@ -0,0 +1,208 @@
|
||||
#!/bin/bash
#
# Enterprise Infrastructure Scaling Simulation Script
# Simulates scaling operations for infrastructure nodes
#
# Positional arguments (all optional):
#   $1  direction, "up" or "down" (default: up)
#   $2  number of nodes           (default: 1)
#   $3  node type                 (default: web)
# Environment:
#   SIMULATION_MODE  Ansible steps run only while this is "false" (default: false)

set -euo pipefail

# Configuration
DOCKER_COMPOSE_FILE="docker-compose.yml"
INVENTORY_FILE="inventory/hosts.ini"
LOG_FILE="logs/scaling_simulation.log"

# Default values: positional parameters win, otherwise the defaults apply.
DIRECTION=${1:-up}
COUNT=${2:-1}
NODE_TYPE=${3:-web}
# Keep a caller-supplied SIMULATION_MODE, fall back to "false" otherwise.
: "${SIMULATION_MODE:=false}"
|
||||
|
||||
# Logging function
# Joins all arguments into one message, prefixes it with a
# "YYYY-MM-DD HH:MM:SS" timestamp, prints it to stdout and appends the same
# line to $LOG_FILE.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '%s - %s\n' "$stamp" "$*" | tee -a "$LOG_FILE"
}
|
||||
|
||||
# Error handling
# Logs "ERROR: <message>" and terminates the script with exit status 1.
# Arguments: $1 - error message
error_exit() {
  local reason=$1

  log "ERROR: $reason"
  exit 1
}
|
||||
|
||||
# Validate inputs
# Checks DIRECTION, COUNT and NODE_TYPE; on the first invalid value it calls
# error_exit with a message describing the accepted values.
validate_inputs() {
  case "$DIRECTION" in
    up | down) ;;
    *) error_exit "Invalid direction: $DIRECTION. Must be 'up' or 'down'" ;;
  esac

  # Count must consist of digits only and be at least 1.
  if [[ ! "$COUNT" =~ ^[0-9]+$ ]] || [ "$COUNT" -lt 1 ]; then
    error_exit "Invalid count: $COUNT. Must be a positive integer"
  fi

  if [[ ! "$NODE_TYPE" =~ ^(web|db|lb|monitor)$ ]]; then
    error_exit "Invalid node type: $NODE_TYPE. Must be web, db, lb, or monitor"
  fi
}
|
||||
|
||||
# Get current node count
#
# Prints the number of `docker-compose ps <type>` lines that contain "Up".
# Always prints a number and returns 0: `grep -c` exits with status 1 when it
# counts zero matches, which is a valid answer here and must not look like a
# failure to callers running under `set -e` / `pipefail`.
# Arguments: $1 - node type (web, db, lb or monitor)
# Outputs:   the count on stdout; "0" for any other node type
get_current_count() {
  local type="$1"

  case "$type" in
    web | db | lb | monitor)
      docker-compose ps "$type" | grep -c "Up" || true
      ;;
    *)
      echo 0
      ;;
  esac
}
|
||||
|
||||
# Sets "replicas: N" for ONE service in a compose file.
#
# Only lines inside the block that starts at "<service>:" and ends at the next
# line with the same or smaller indentation are rewritten, so the replica
# counts of other services stay untouched. A copy of the previous file
# content is kept as "<file>.bak".
# Arguments: $1 - service name, $2 - replica count, $3 - compose file
# Returns:   non-zero when the file cannot be read or rewritten
_set_service_replicas() {
  local service=$1 replicas=$2 file=$3 tmp rc=0

  cp -p -- "$file" "${file}.bak" || return 1
  tmp=$(mktemp) || return 1

  awk -v svc="$service" -v n="$replicas" '
    function indent_of(s) { match(s, /^[ \t]*/); return RLENGTH }
    # Blank and comment-only lines never open or close a block.
    /^[ \t]*(#.*)?$/ { print; next }
    {
      ind = indent_of($0)
      if (in_svc && ind <= svc_indent) in_svc = 0
      if (!in_svc && $0 ~ ("^[ \t]*" svc ":[ \t]*(#.*)?$")) {
        in_svc = 1
        svc_indent = ind
      } else if (in_svc) {
        sub(/replicas:[ \t]*[0-9]+/, "replicas: " n)
      }
      print
    }
  ' "$file" > "$tmp" && cat -- "$tmp" > "$file" || rc=$?

  rm -f -- "$tmp"
  return "$rc"
}

# Scale up infrastructure
#
# Adds <count> nodes of <type> on top of the ones currently running: updates
# the replica count in the compose file, scales the service to the new total,
# waits 30 seconds, updates the inventory and, unless SIMULATION_MODE is set
# to something other than "false", runs the provisioning playbook.
# Arguments: $1 - node type, $2 - number of nodes to add
# Globals:   DOCKER_COMPOSE_FILE, INVENTORY_FILE, SIMULATION_MODE (read)
scale_up() {
  local type="$1"
  local count="$2"
  local current target

  current=$(get_current_count "$type") || true
  # --scale expects the desired TOTAL, not the increment.
  target=$((${current:-0} + count))

  log "Scaling up $count $type nodes"

  # Update docker-compose replica count
  _set_service_replicas "$type" "$target" "$DOCKER_COMPOSE_FILE"

  # Deploy new containers
  docker-compose up -d --scale "${type}=${target}"

  # Wait for containers to be ready
  log "Waiting for containers to be ready..."
  sleep 30

  # Update inventory
  update_inventory "$type" "$count" "add"

  # Run provisioning playbook on new nodes
  if [ "$SIMULATION_MODE" = false ]; then
    ansible-playbook -i "$INVENTORY_FILE" playbooks/provision.yml --limit "${type}*"
  fi

  log "Successfully scaled up $count $type nodes"
}

# Scale down infrastructure
#
# Removes <count> running nodes of <type>: optionally runs the decommission
# playbook for each node, stops and removes the container, lowers the replica
# count in the compose file and updates the inventory. Refuses to remove more
# nodes than are currently running.
# Arguments: $1 - node type, $2 - number of nodes to remove
# Globals:   DOCKER_COMPOSE_FILE, INVENTORY_FILE, SIMULATION_MODE (read)
scale_down() {
  local type="$1"
  local count="$2"
  local current_count nodes_to_remove node

  current_count=$(get_current_count "$type") || true
  current_count=${current_count:-0}
  if [ "$current_count" -lt "$count" ]; then
    error_exit "Cannot scale down $count nodes. Only $current_count $type nodes currently running"
  fi

  log "Scaling down $count $type nodes"

  # Select nodes to remove: the first <count> "Up" entries in the order
  # docker-compose lists them (NOTE(review): the original comment said
  # "oldest first", but nothing here sorts by age; confirm the intent).
  # awk consumes the whole listing, so the producer never sees SIGPIPE.
  nodes_to_remove=$(docker-compose ps "$type" \
    | awk -v n="$count" '/Up/ && picked < n { print $1; picked++ }') || true

  # Decommission nodes
  for node in $nodes_to_remove; do
    if [ "$SIMULATION_MODE" = false ]; then
      ansible-playbook -i "$INVENTORY_FILE" playbooks/decommission.yml --limit "$node"
    fi
    docker stop "$node"
    docker rm "$node"
  done

  # Update docker-compose replica count
  _set_service_replicas "$type" "$((current_count - count))" "$DOCKER_COMPOSE_FILE"

  # Update inventory
  update_inventory "$type" "$count" "remove"

  log "Successfully scaled down $count $type nodes"
}
|
||||
|
||||
# Update Ansible inventory
# Logs the requested inventory change; no inventory file is modified here.
# Arguments: $1 - node type, $2 - node count, $3 - action ("add" or "remove")
update_inventory() {
  local type="$1" count="$2" action="$3"

  log "Updating inventory for $action $count $type nodes"

  # This would be more complex in a real implementation
  # For simulation, we'll just log the action
  if [[ "$action" == "add" ]]; then
    log "Added $count $type nodes to inventory"
  elif [[ "$action" == "remove" ]]; then
    log "Removed $count $type nodes from inventory"
  fi
}
|
||||
|
||||
# Health check after scaling
#
# Fails via error_exit when `docker-compose ps` shows no container as "Up".
# While SIMULATION_MODE is "false" it also runs an Ansible ping against all
# inventory hosts and logs a warning (without failing) if that does not
# succeed.
# Globals:  SIMULATION_MODE, INVENTORY_FILE (read)
health_check() {
  local compose_status

  log "Running health checks after scaling"

  # Check container status. The listing is captured first: piping it
  # straight into `grep -q` lets grep exit on the first match, which can kill
  # the producer with SIGPIPE; under `pipefail` a healthy environment would
  # then be reported as failed.
  compose_status=$(docker-compose ps || true)
  if ! grep -q "Up" <<< "$compose_status"; then
    error_exit "Some containers failed to start"
  fi

  # Ansible ping check
  if [ "$SIMULATION_MODE" = false ]; then
    if ! ansible -i "$INVENTORY_FILE" all -m ping > /dev/null 2>&1; then
      log "WARNING: Some nodes failed Ansible ping check"
    fi
  fi

  log "Health checks completed"
}
|
||||
|
||||
# Generate scaling report
# Writes reports/scaling_report_<timestamp>.txt containing the run parameters,
# the output of `docker-compose ps` and the Ansible host list (or a fallback
# line when the inventory query fails), then logs the report path.
generate_report() {
  local report_file now compose_status inventory_status

  report_file="reports/scaling_report_$(date +%Y%m%d_%H%M%S).txt"

  # Collect the dynamic parts up front; failures must not abort the report.
  now=$(date)
  compose_status=$(docker-compose ps) || true
  inventory_status=$(ansible -i "$INVENTORY_FILE" --list-hosts all 2>/dev/null || echo "Ansible inventory check failed")

  cat > "$report_file" << EOF
Scaling Simulation Report
========================

Timestamp: $now
Direction: $DIRECTION
Node Type: $NODE_TYPE
Count: $COUNT
Simulation Mode: $SIMULATION_MODE

Current Status:
$compose_status

Inventory Status:
$inventory_status

Log File: $LOG_FILE
EOF

  log "Scaling report generated: $report_file"
}
|
||||
|
||||
# Main execution
# Validates the inputs, scales NODE_TYPE up or down by COUNT depending on
# DIRECTION, then runs the health check and writes the report.
main() {
  log "Starting scaling simulation: $DIRECTION $COUNT $NODE_TYPE nodes"

  validate_inputs

  if [[ "$DIRECTION" == "up" ]]; then
    scale_up "$NODE_TYPE" "$COUNT"
  elif [[ "$DIRECTION" == "down" ]]; then
    scale_down "$NODE_TYPE" "$COUNT"
  fi

  health_check
  generate_report

  log "Scaling simulation completed successfully"
}
|
||||
|
||||
# Initialize logging (log and report directories must exist before first use)
mkdir -p logs reports

# Run main function
main "$@"
|
||||
Reference in New Issue
Block a user