ci: configure and stabilize CI/CD pipeline
- fix runner configuration issues - correct workflow labels and execution environment - resolve dependency issues in pipeline (python deps) - improve reliability of automation runs
This commit is contained in:
@@ -0,0 +1,173 @@
|
||||
# Enterprise Infrastructure Simulator Makefile
|
||||
|
||||
# Every target in this Makefile is a command, not a file it produces. Declaring
# all of them phony keeps a stray file or directory with the same name (for
# example "lint", "validate" or "backup") from making the target look up to date.
.PHONY: help run demo up down patch destroy status logs logs-web logs-db clean test
.PHONY: scale-up-web scale-up-db scale-down-web scale-down-db
.PHONY: fail-network fail-disk fail-service fail-node
.PHONY: validate scenario-scaling scenario-disaster backup lint format
.PHONY: harden security-scan help-scaling help-failure
|
||||
|
||||
# Default target
|
||||
help: ## Show this help message
|
||||
@echo "Enterprise Infrastructure Simulator"
|
||||
@echo ""
|
||||
@echo "Available commands:"
|
||||
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf " %-15s %s\n", $$1, $$2}'
|
||||
|
||||
run: ## Run the default simulator workflow
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/provision.yml
|
||||
|
||||
demo: ## Run a failure-and-patch demonstration
|
||||
./scripts/simulate_failure.sh service 30 web
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/patch.yml
|
||||
|
||||
# Infrastructure management
|
||||
up: ## Start the infrastructure simulation
|
||||
@echo "Starting enterprise infrastructure simulation..."
|
||||
docker-compose up -d
|
||||
@echo "Waiting for containers to be ready..."
|
||||
@sleep 30
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/provision.yml
|
||||
@echo "Infrastructure simulation started successfully"
|
||||
|
||||
down: ## Stop the infrastructure simulation
|
||||
@echo "Stopping infrastructure simulation..."
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/decommission.yml || true
|
||||
docker-compose down
|
||||
@echo "Infrastructure simulation stopped"
|
||||
|
||||
patch: ## Apply security patches to all nodes
|
||||
@echo "Applying security patches..."
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/patch.yml
|
||||
@echo "Security patches applied"
|
||||
|
||||
destroy: ## Completely destroy the infrastructure
|
||||
@echo "Destroying infrastructure..."
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/decommission.yml || true
|
||||
docker-compose down -v --remove-orphans
|
||||
docker system prune -f
|
||||
rm -rf logs/* reports/*
|
||||
@echo "Infrastructure completely destroyed"
|
||||
|
||||
# Scaling operations
|
||||
# Adds web servers through scripts/simulate_scaling.sh. COUNT is optional and
# falls back to 1; the message prints the same effective count that is passed
# to the script, so it is never blank when COUNT is omitted.
scale-up-web: ## Scale up web servers (usage: make scale-up-web COUNT=2)
	@echo "Scaling up $(or $(COUNT),1) web servers..."
	./scripts/simulate_scaling.sh up $(or $(COUNT),1) web
|
||||
|
||||
# Adds database servers through scripts/simulate_scaling.sh. COUNT is optional
# and falls back to 1; the message prints the same effective count that is
# passed to the script, so it is never blank when COUNT is omitted.
scale-up-db: ## Scale up database servers (usage: make scale-up-db COUNT=1)
	@echo "Scaling up $(or $(COUNT),1) database servers..."
	./scripts/simulate_scaling.sh up $(or $(COUNT),1) db
|
||||
|
||||
# Removes web servers through scripts/simulate_scaling.sh. COUNT is optional
# and falls back to 1; the message prints the same effective count that is
# passed to the script, so it is never blank when COUNT is omitted.
scale-down-web: ## Scale down web servers (usage: make scale-down-web COUNT=1)
	@echo "Scaling down $(or $(COUNT),1) web servers..."
	./scripts/simulate_scaling.sh down $(or $(COUNT),1) web
|
||||
|
||||
# Removes database servers through scripts/simulate_scaling.sh. COUNT is
# optional and falls back to 1; the message prints the same effective count
# that is passed to the script, so it is never blank when COUNT is omitted.
scale-down-db: ## Scale down database servers (usage: make scale-down-db COUNT=1)
	@echo "Scaling down $(or $(COUNT),1) database servers..."
	./scripts/simulate_scaling.sh down $(or $(COUNT),1) db
|
||||
|
||||
# Failure simulation
|
||||
fail-network: ## Simulate network failure (usage: make fail-network DURATION=60)
|
||||
@echo "Simulating network failure for $(or $(DURATION),60) seconds..."
|
||||
./scripts/simulate_failure.sh network $(or $(DURATION),60)
|
||||
|
||||
fail-disk: ## Simulate disk space exhaustion (usage: make fail-disk DURATION=120)
|
||||
@echo "Simulating disk failure for $(or $(DURATION),120) seconds..."
|
||||
./scripts/simulate_failure.sh disk $(or $(DURATION),120)
|
||||
|
||||
fail-service: ## Simulate service failures (usage: make fail-service DURATION=30)
|
||||
@echo "Simulating service failure for $(or $(DURATION),30) seconds..."
|
||||
./scripts/simulate_failure.sh service $(or $(DURATION),30)
|
||||
|
||||
fail-node: ## Simulate complete node failure (usage: make fail-node DURATION=300)
|
||||
@echo "Simulating node failure for $(or $(DURATION),300) seconds..."
|
||||
./scripts/simulate_failure.sh node $(or $(DURATION),300)
|
||||
|
||||
# Monitoring and status
|
||||
status: ## Show infrastructure status
|
||||
@echo "=== Docker Containers ==="
|
||||
docker-compose ps
|
||||
@echo ""
|
||||
@echo "=== Ansible Inventory ==="
|
||||
ansible -i inventory/hosts.ini --list-hosts all || echo "Inventory check failed"
|
||||
@echo ""
|
||||
@echo "=== System Resources ==="
|
||||
docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemPerc}}\t{{.NetIO}}"
|
||||
|
||||
logs: ## Show infrastructure logs
|
||||
docker-compose logs -f --tail=100
|
||||
|
||||
logs-web: ## Show web server logs
|
||||
docker-compose logs -f web
|
||||
|
||||
logs-db: ## Show database logs
|
||||
docker-compose logs -f db
|
||||
|
||||
# Testing and validation
|
||||
test: ## Run infrastructure tests
|
||||
@echo "Running infrastructure tests..."
|
||||
ansible -i inventory/hosts.ini all -m ping
|
||||
ansible-playbook -i inventory/hosts.ini --syntax-check playbooks/*.yml
|
||||
@echo "Testing scaling scripts..."
|
||||
./scripts/simulate_scaling.sh up 0 web # Dry run
|
||||
./scripts/simulate_failure.sh network 1 # Quick test
|
||||
@echo "All tests passed"
|
||||
|
||||
validate: ## Validate infrastructure configuration
|
||||
@echo "Validating configuration..."
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/provision.yml --check
|
||||
docker-compose config
|
||||
@echo "Configuration validation complete"
|
||||
|
||||
# Scenarios
|
||||
scenario-scaling: ## Run scaling event scenario
|
||||
@echo "Running scaling event scenario..."
|
||||
ansible-playbook -i inventory/hosts.ini scenarios/scaling_event.yml
|
||||
|
||||
scenario-disaster: ## Run disaster recovery scenario
|
||||
@echo "Running disaster recovery scenario..."
|
||||
ansible-playbook -i inventory/hosts.ini scenarios/disaster_recovery.yml
|
||||
|
||||
# Maintenance
|
||||
clean: ## Clean up temporary files and logs
|
||||
@echo "Cleaning up temporary files..."
|
||||
rm -rf logs/*.log reports/*.txt
|
||||
docker system prune -f
|
||||
@echo "Cleanup complete"
|
||||
|
||||
# Creates a timestamped directory under backups/, runs the backup playbook and
# then archives /infrastructure inside the "ansible" compose service.
# `exec -T` disables pseudo-TTY allocation so the target also works from CI
# runners and other non-interactive shells, where the default TTY request fails.
# NOTE(review): the archive is written to /backups/infra_backup.tar.gz inside
# the container, not into the timestamped host directory created here - confirm
# the compose volume mapping is what ties the two together.
backup: ## Create infrastructure backup
	@echo "Creating infrastructure backup..."
	mkdir -p backups/$(shell date +%Y%m%d_%H%M%S)
	ansible-playbook -i inventory/hosts.ini playbooks/backup.yml
	docker-compose exec -T ansible tar -czf /backups/infra_backup.tar.gz /infrastructure
	@echo "Backup created"
|
||||
|
||||
# Development
|
||||
lint: ## Lint Ansible playbooks
|
||||
@echo "Linting Ansible playbooks..."
|
||||
ansible-lint playbooks/*.yml scenarios/*.yml
|
||||
@echo "Linting complete"
|
||||
|
||||
format: ## Format code and configuration
|
||||
@echo "Formatting code..."
|
||||
# Add formatting commands here
|
||||
@echo "Formatting complete"
|
||||
|
||||
# Security
|
||||
harden: ## Apply security hardening
|
||||
@echo "Applying security hardening..."
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml
|
||||
|
||||
security-scan: ## Run security scans
|
||||
@echo "Running security scans..."
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/security_scan.yml
|
||||
|
||||
# Help for specific targets
|
||||
help-scaling: ## Show scaling-related commands
|
||||
@echo "Scaling Commands:"
|
||||
@echo " make scale-up-web COUNT=2 - Add 2 web servers"
|
||||
@echo " make scale-up-db COUNT=1 - Add 1 database server"
|
||||
@echo " make scale-down-web COUNT=1 - Remove 1 web server"
|
||||
@echo " make scale-down-db COUNT=1 - Remove 1 database server"
|
||||
|
||||
help-failure: ## Show failure simulation commands
|
||||
@echo "Failure Simulation Commands:"
|
||||
@echo " make fail-network DURATION=60 - Network failure for 60s"
|
||||
@echo " make fail-disk DURATION=120 - Disk exhaustion for 120s"
|
||||
@echo " make fail-service DURATION=30 - Service failure for 30s"
|
||||
@echo " make fail-node DURATION=300 - Node failure for 300s"
|
||||
@@ -0,0 +1,74 @@
|
||||
# Enterprise Infrastructure Simulator
|
||||
|
||||
## Problem Statement
|
||||
|
||||
Infrastructure teams need a safe place to rehearse lifecycle operations before applying them to production fleets. Patch windows, hardening changes, scale events, and node failures all carry operational risk when they are tested only during real incidents.
|
||||
|
||||
## Solution Overview
|
||||
|
||||
This project models common Linux infrastructure operations with Ansible playbooks and shell-based simulations. It keeps the automation readable and auditable while producing example evidence that resembles a real change record.
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
```
|
||||
Operator -> Make/CLI -> Ansible Inventory -> Playbooks -> Linux Nodes
|
||||
| |
|
||||
v v
|
||||
Scenarios Reports/Logs
|
||||
```
|
||||
|
||||
Core components:
|
||||
|
||||
- `inventory/hosts.ini` defines managed node groups.
|
||||
- `playbooks/` contains provisioning, patching, hardening, and decommissioning workflows.
|
||||
- `scripts/` injects scaling and failure conditions.
|
||||
- `scenarios/` documents operational exercises.
|
||||
- `examples/` stores representative outputs for review.
|
||||
|
||||
## How to Run
|
||||
|
||||
```bash
|
||||
cd enterprise-infra-simulator
|
||||
|
||||
# Check host connectivity, validate playbook syntax, and smoke-test the simulation scripts.
|
||||
make test
|
||||
|
||||
# Provision the simulated estate.
|
||||
make run
|
||||
|
||||
# Apply security patches.
|
||||
make patch
|
||||
|
||||
# Apply host hardening.
|
||||
make harden
|
||||
|
||||
# Run the failure and patch demo.
|
||||
make demo
|
||||
```
|
||||
|
||||
Direct Ansible commands are also supported:
|
||||
|
||||
```bash
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/provision.yml
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/patch.yml
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml
|
||||
```
|
||||
|
||||
## Example Output
|
||||
|
||||
```text
|
||||
PLAY RECAP *********************************************************************
|
||||
web01 : ok=21 changed=7 unreachable=0 failed=0 skipped=3 rescued=0 ignored=1
|
||||
db01 : ok=18 changed=4 unreachable=0 failed=0 skipped=5 rescued=0 ignored=1
|
||||
lb01 : ok=16 changed=3 unreachable=0 failed=0 skipped=6 rescued=0 ignored=0
|
||||
|
||||
Patch status: SUCCESS
|
||||
Updates applied: 12
|
||||
Reboot required: false
|
||||
```
|
||||
|
||||
Additional sample evidence is available in [examples/patch-output.txt](examples/patch-output.txt) and [examples/failure-simulation.txt](examples/failure-simulation.txt).
|
||||
|
||||
## Real-World Use Case
|
||||
|
||||
A platform team can use this project to demonstrate how routine operating procedures are encoded, reviewed, and tested before production change windows. The same patterns apply to regulated Linux estates where patch evidence, hardening controls, and incident drills must be repeatable.
|
||||
@@ -0,0 +1,30 @@
|
||||
# Enterprise Infrastructure Simulator Architecture
|
||||
|
||||
## Components
|
||||
|
||||
- Operator interface: `make` targets and direct Ansible commands.
|
||||
- Inventory: static host groups in `inventory/hosts.ini`.
|
||||
- Automation: lifecycle playbooks in `playbooks/`.
|
||||
- Simulation scripts: controlled failure and scaling events in `scripts/`.
|
||||
- Evidence: logs, reports, scenario notes, and examples.
|
||||
|
||||
## Data Flow
|
||||
|
||||
```
|
||||
Operator
|
||||
-> Make target or shell script
|
||||
-> Ansible inventory
|
||||
-> lifecycle playbook
|
||||
-> managed Linux node
|
||||
-> log/report artifact
|
||||
```
|
||||
|
||||
Failure drills follow a parallel flow:
|
||||
|
||||
```
|
||||
Operator -> simulate_failure.sh -> target node/service -> health check -> patch/hardening playbook -> evidence
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
The project favors explicit playbooks over hidden orchestration so the operational intent is visible during review. In a production implementation, the same workflows would typically run from a CI runner or automation controller with credentials supplied by a secret manager.
|
||||
@@ -0,0 +1,8 @@
|
||||
2026-04-29 02:13:41 - Starting failure simulation: service 30 web
|
||||
2026-04-29 02:13:41 - Simulating service failures on containers: web
|
||||
2026-04-29 02:13:42 - Stopping services in container enterprise-web-1
|
||||
2026-04-29 02:13:44 - Health probe failed: http://web01/health returned 503
|
||||
2026-04-29 02:14:12 - Cleaning up failure simulation
|
||||
2026-04-29 02:14:13 - Restarted nginx in enterprise-web-1
|
||||
2026-04-29 02:14:18 - Health probe recovered: http://web01/health returned 200
|
||||
2026-04-29 02:14:18 - Failure simulation completed successfully
|
||||
@@ -0,0 +1,33 @@
|
||||
PLAY [Apply Security Patches and Updates] **************************************
|
||||
|
||||
TASK [Update package cache] *****************************************************
|
||||
changed: [web01]
|
||||
changed: [db01]
|
||||
ok: [lb01]
|
||||
|
||||
TASK [Check for available updates] **********************************************
|
||||
ok: [web01] => {"stdout": "9"}
|
||||
ok: [db01] => {"stdout": "4"}
|
||||
ok: [lb01] => {"stdout": "0"}
|
||||
|
||||
TASK [Apply security updates only] **********************************************
|
||||
changed: [web01]
|
||||
changed: [db01]
|
||||
ok: [lb01]
|
||||
|
||||
TASK [Verify critical services] *************************************************
|
||||
ok: [web01] => (item=systemd-journald)
|
||||
ok: [web01] => (item=cron)
|
||||
ok: [db01] => (item=systemd-journald)
|
||||
ok: [lb01] => (item=cron)
|
||||
|
||||
PLAY RECAP *********************************************************************
|
||||
web01 : ok=19 changed=6 unreachable=0 failed=0 skipped=2 rescued=0 ignored=1
|
||||
db01 : ok=18 changed=5 unreachable=0 failed=0 skipped=2 rescued=0 ignored=1
|
||||
lb01 : ok=15 changed=1 unreachable=0 failed=0 skipped=4 rescued=0 ignored=0
|
||||
|
||||
Patch report
|
||||
Status: SUCCESS
|
||||
Window: 02:00-04:00 UTC
|
||||
Reboot required: false
|
||||
Notification: infra-team@example.com
|
||||
@@ -0,0 +1,35 @@
|
||||
[webservers]
|
||||
web01 ansible_host=172.20.0.11 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
|
||||
web02 ansible_host=172.20.0.12 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
|
||||
web03 ansible_host=172.20.0.13 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
|
||||
|
||||
[databases]
|
||||
db01 ansible_host=172.20.0.21 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
|
||||
db02 ansible_host=172.20.0.22 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
|
||||
|
||||
[loadbalancers]
|
||||
lb01 ansible_host=172.20.0.31 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
|
||||
|
||||
[monitoring]
|
||||
mon01 ansible_host=172.20.0.41 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
|
||||
|
||||
[all:vars]
|
||||
ansible_python_interpreter=/usr/bin/python3
|
||||
ansible_ssh_common_args='-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
|
||||
ansible_connection=ssh
|
||||
|
||||
[webservers:vars]
|
||||
node_type=web
|
||||
environment=production
|
||||
|
||||
[databases:vars]
|
||||
node_type=database
|
||||
environment=production
|
||||
|
||||
[loadbalancers:vars]
|
||||
node_type=loadbalancer
|
||||
environment=production
|
||||
|
||||
[monitoring:vars]
|
||||
node_type=monitoring
|
||||
environment=production
|
||||
@@ -0,0 +1,181 @@
|
||||
---
|
||||
- name: Decommission Enterprise Infrastructure Nodes
|
||||
hosts: all
|
||||
become: true
|
||||
gather_facts: true
|
||||
vars:
|
||||
backup_data: true
|
||||
export_config: true
|
||||
graceful_shutdown: true
|
||||
cleanup_inventory: true
|
||||
|
||||
pre_tasks:
|
||||
- name: Check node health before decommissioning
|
||||
uri:
|
||||
url: http://localhost/health
|
||||
method: GET
|
||||
status_code: 200
|
||||
register: health_check
|
||||
ignore_errors: true
|
||||
when: "'webservers' in group_names"
|
||||
|
||||
- name: Create decommissioning backup directory
|
||||
file:
|
||||
path: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}"
|
||||
state: directory
|
||||
mode: '0755'
|
||||
|
||||
- name: Log decommissioning start
|
||||
lineinfile:
|
||||
path: "/var/log/decommission.log"
|
||||
line: "{{ ansible_date_time.iso8601 }} - Starting decommissioning of {{ inventory_hostname }}"
|
||||
create: yes
|
||||
|
||||
tasks:
|
||||
- name: Stop application services gracefully
|
||||
service:
|
||||
name: "{{ item }}"
|
||||
state: stopped
|
||||
loop: "{{ application_services | default(['nginx', 'postgresql', 'haproxy']) }}"
|
||||
ignore_errors: true
|
||||
when: graceful_shutdown
|
||||
|
||||
- name: Wait for connections to drain
  pause:
    seconds: 30
  # Drain only on hosts that front client traffic. The group test must be a
  # real parenthesised expression: wrapped in double quotes it is a non-empty
  # string literal, which is always truthy, so every host would match.
  when: graceful_shutdown and ('webservers' in group_names or 'loadbalancers' in group_names)
|
||||
|
||||
- name: Export configuration files
|
||||
block:
|
||||
- name: Create config export directory
|
||||
file:
|
||||
path: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/config"
|
||||
state: directory
|
||||
|
||||
- name: Archive system configuration
|
||||
archive:
|
||||
path:
|
||||
- /etc/
|
||||
- /opt/application/
|
||||
dest: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/config/system_config.tar.gz"
|
||||
format: gz
|
||||
|
||||
- name: Export service configurations
|
||||
command: >
|
||||
tar -czf /var/backups/decommission-{{ ansible_date_time.iso8601 }}/config/services.tar.gz
|
||||
/etc/nginx /etc/postgresql /etc/haproxy
|
||||
ignore_errors: true
|
||||
when: export_config
|
||||
|
||||
- name: Backup application data
|
||||
block:
|
||||
- name: Create data backup directory
|
||||
file:
|
||||
path: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/data"
|
||||
state: directory
|
||||
|
||||
- name: Backup database data
  # Dumps every database on hosts in the "databases" group into the
  # decommission backup directory. The redirection needs a real shell: the
  # command module would hand ">" and the file name to pg_dumpall as literal
  # arguments and no dump file would be written.
  shell: >
    pg_dumpall -U postgres > /var/backups/decommission-{{ ansible_date_time.iso8601 }}/data/database_backup.sql
  # Best effort: a failed dump must not abort the rest of the decommission run.
  ignore_errors: true
  when: "'databases' in group_names"
|
||||
|
||||
- name: Backup application files
|
||||
archive:
|
||||
path: "/var/www/html"
|
||||
dest: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/data/application_data.tar.gz"
|
||||
format: gz
|
||||
ignore_errors: true
|
||||
when: "'webservers' in group_names"
|
||||
|
||||
- name: Backup monitoring data
|
||||
archive:
|
||||
path: "/var/lib/prometheus"
|
||||
dest: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/data/monitoring_data.tar.gz"
|
||||
format: gz
|
||||
ignore_errors: true
|
||||
when: "'monitoring' in group_names"
|
||||
when: backup_data
|
||||
|
||||
- name: Remove from load balancer
|
||||
include_tasks: tasks/remove_from_lb.yml
|
||||
when: "'webservers' in group_names or 'databases' in group_names"
|
||||
|
||||
- name: Update monitoring alerts
|
||||
include_tasks: tasks/update_monitoring.yml
|
||||
when: "'monitoring' not in group_names"
|
||||
|
||||
- name: Clean up application directories
|
||||
file:
|
||||
path: "{{ item }}"
|
||||
state: absent
|
||||
loop:
|
||||
- /opt/application
|
||||
- /var/www/html
|
||||
- /var/lib/postgresql
|
||||
- /var/lib/prometheus
|
||||
ignore_errors: true
|
||||
|
||||
- name: Remove application packages
|
||||
apt:
|
||||
name: "{{ item }}"
|
||||
state: absent
|
||||
purge: yes
|
||||
loop: "{{ application_packages | default(['nginx', 'postgresql', 'haproxy', 'prometheus']) }}"
|
||||
when: ansible_os_family == "Debian"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Clean up system logs
|
||||
command: >
|
||||
find /var/log -name "*.log" -type f -exec truncate -s 0 {} \;
|
||||
ignore_errors: true
|
||||
|
||||
- name: Remove SSH keys and known hosts
|
||||
file:
|
||||
path: "{{ item }}"
|
||||
state: absent
|
||||
loop:
|
||||
- /root/.ssh/authorized_keys
|
||||
- /root/.ssh/known_hosts
|
||||
- /home/infra-admin/.ssh/authorized_keys
|
||||
ignore_errors: true
|
||||
|
||||
- name: Generate decommissioning report
|
||||
template:
|
||||
src: templates/decommission_report.j2
|
||||
dest: "/var/log/decommission_report_{{ ansible_date_time.iso8601 }}.log"
|
||||
vars:
|
||||
decommission_status: "SUCCESS"
|
||||
backup_location: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}"
|
||||
|
||||
post_tasks:
|
||||
- name: Send decommissioning notification
|
||||
mail:
|
||||
to: "{{ decommission_notification_email | default('infra-team@company.com') }}"
|
||||
subject: "Node Decommissioned - {{ inventory_hostname }}"
|
||||
body: |
|
||||
Node {{ inventory_hostname }} has been successfully decommissioned.
|
||||
|
||||
Backup location: /var/backups/decommission-{{ ansible_date_time.iso8601 }}
|
||||
Services stopped: {{ application_services | default(['nginx', 'postgresql', 'haproxy']) | join(', ') }}
|
||||
Configuration exported: {{ export_config }}
|
||||
Data backed up: {{ backup_data }}
|
||||
|
||||
See /var/log/decommission_report_{{ ansible_date_time.iso8601 }}.log for details
|
||||
when: decommission_notification_email is defined
|
||||
ignore_errors: true
|
||||
|
||||
- name: Update dynamic inventory
|
||||
include_tasks: tasks/update_inventory.yml
|
||||
when: cleanup_inventory
|
||||
|
||||
- name: Final log entry
|
||||
lineinfile:
|
||||
path: "/var/log/decommission.log"
|
||||
line: "{{ ansible_date_time.iso8601 }} - Decommissioning completed for {{ inventory_hostname }}"
|
||||
|
||||
- name: Shutdown node
|
||||
command: shutdown -h now
|
||||
async: 10
|
||||
poll: 0
|
||||
when: auto_shutdown | default(false) | bool
|
||||
@@ -0,0 +1,210 @@
|
||||
---
|
||||
- name: Harden Enterprise Infrastructure Nodes
|
||||
hosts: all
|
||||
become: true
|
||||
gather_facts: true
|
||||
vars:
|
||||
cis_level: 1
|
||||
disable_root_login: true
|
||||
secure_ssh_config: true
|
||||
firewall_policy: deny
|
||||
auditd_enabled: true
|
||||
selinux_mode: enforcing
|
||||
apparmor_enabled: true
|
||||
|
||||
tasks:
|
||||
- name: Include CIS hardening tasks
|
||||
include_tasks: tasks/cis_hardening.yml
|
||||
|
||||
- name: Configure SSH hardening
|
||||
block:
|
||||
- name: Disable root SSH login
|
||||
lineinfile:
|
||||
path: /etc/ssh/sshd_config
|
||||
regexp: '^PermitRootLogin'
|
||||
line: 'PermitRootLogin no'
|
||||
when: disable_root_login
|
||||
|
||||
- name: Disable password authentication
|
||||
lineinfile:
|
||||
path: /etc/ssh/sshd_config
|
||||
regexp: '^PasswordAuthentication'
|
||||
line: 'PasswordAuthentication no'
|
||||
|
||||
- name: Set MaxAuthTries
|
||||
lineinfile:
|
||||
path: /etc/ssh/sshd_config
|
||||
regexp: '^MaxAuthTries'
|
||||
line: 'MaxAuthTries 3'
|
||||
|
||||
- name: Disable empty passwords
|
||||
lineinfile:
|
||||
path: /etc/ssh/sshd_config
|
||||
regexp: '^PermitEmptyPasswords'
|
||||
line: 'PermitEmptyPasswords no'
|
||||
|
||||
- name: Set ClientAliveInterval
|
||||
lineinfile:
|
||||
path: /etc/ssh/sshd_config
|
||||
regexp: '^ClientAliveInterval'
|
||||
line: 'ClientAliveInterval 300'
|
||||
|
||||
- name: Set ClientAliveCountMax
|
||||
lineinfile:
|
||||
path: /etc/ssh/sshd_config
|
||||
regexp: '^ClientAliveCountMax'
|
||||
line: 'ClientAliveCountMax 2'
|
||||
|
||||
notify: restart sshd
|
||||
|
||||
- name: Configure firewall
  # The ufw module manages one rule per invocation and has no "rules" list
  # parameter, so the SSH allowances are applied in a loop.
  block:
    # Allow rules go in BEFORE the firewall is enabled with its default
    # policy, so the SSH session Ansible is using is not cut off.
    - name: Allow SSH from private address ranges
      ufw:
        rule: allow
        port: '22'
        proto: tcp
        from_ip: "{{ item }}"
      loop:
        - 10.0.0.0/8
        - 172.16.0.0/12
        - 192.168.0.0/16

    - name: Enable firewall with the default policy
      ufw:
        state: enabled
        policy: "{{ firewall_policy }}"
|
||||
|
||||
- name: Disable unnecessary services
|
||||
service:
|
||||
name: "{{ item }}"
|
||||
state: stopped
|
||||
enabled: no
|
||||
loop:
|
||||
- cups
|
||||
- avahi-daemon
|
||||
- bluetooth
|
||||
- nfs-server
|
||||
- rpcbind
|
||||
ignore_errors: true
|
||||
|
||||
- name: Remove unnecessary packages
|
||||
apt:
|
||||
name: "{{ item }}"
|
||||
state: absent
|
||||
purge: yes
|
||||
loop:
|
||||
- telnet
|
||||
- rsh-client
|
||||
- talk
|
||||
- ntalk
|
||||
when: ansible_os_family == "Debian"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Configure auditd
|
||||
block:
|
||||
- name: Install auditd
|
||||
apt:
|
||||
name: auditd
|
||||
state: present
|
||||
when: ansible_os_family == "Debian"
|
||||
|
||||
- name: Configure audit rules
|
||||
template:
|
||||
src: templates/audit.rules.j2
|
||||
dest: /etc/audit/rules.d/hardening.rules
|
||||
|
||||
- name: Enable auditd service
|
||||
service:
|
||||
name: auditd
|
||||
state: started
|
||||
enabled: yes
|
||||
when: auditd_enabled
|
||||
|
||||
- name: Configure AppArmor
|
||||
block:
|
||||
- name: Install apparmor
|
||||
apt:
|
||||
name: apparmor
|
||||
state: present
|
||||
when: ansible_os_family == "Debian"
|
||||
|
||||
- name: Enable apparmor service
|
||||
service:
|
||||
name: apparmor
|
||||
state: started
|
||||
enabled: yes
|
||||
when: apparmor_enabled and ansible_os_family == "Debian"
|
||||
|
||||
- name: Configure sysctl hardening
|
||||
sysctl:
|
||||
name: "{{ item.key }}"
|
||||
value: "{{ item.value }}"
|
||||
state: present
|
||||
reload: yes
|
||||
loop:
|
||||
- { key: 'net.ipv4.ip_forward', value: '0' }
|
||||
- { key: 'net.ipv4.conf.all.send_redirects', value: '0' }
|
||||
- { key: 'net.ipv4.conf.default.send_redirects', value: '0' }
|
||||
- { key: 'net.ipv4.tcp_syncookies', value: '1' }
|
||||
- { key: 'net.ipv4.icmp_echo_ignore_broadcasts', value: '1' }
|
||||
|
||||
- name: Set secure file permissions
  # Account databases are world-readable by design, but the shadow files hold
  # password hashes and must not be readable by other users. Each entry
  # therefore carries its own mode instead of sharing a single 0644.
  # Owner and group stay root:root for every file.
  file:
    path: "{{ item.path }}"
    mode: "{{ item.mode }}"
    owner: root
    group: root
  loop:
    - { path: /etc/passwd, mode: '0644' }
    - { path: /etc/group, mode: '0644' }
    - { path: /etc/shadow, mode: '0640' }
    - { path: /etc/gshadow, mode: '0640' }
|
||||
|
||||
- name: Lock inactive user accounts
|
||||
command: usermod -L "{{ item }}"
|
||||
loop: "{{ inactive_users | default([]) }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Configure password policies
|
||||
pam_limits:
|
||||
domain: '*'
|
||||
limit_type: hard
|
||||
limit_item: nofile
|
||||
value: 1024
|
||||
|
||||
- name: Generate hardening report
|
||||
template:
|
||||
src: templates/hardening_report.j2
|
||||
dest: "/var/log/hardening_report_{{ ansible_date_time.iso8601 }}.log"
|
||||
|
||||
handlers:
|
||||
- name: restart sshd
|
||||
service:
|
||||
name: ssh
|
||||
state: restarted
|
||||
|
||||
- name: restart auditd
|
||||
service:
|
||||
name: auditd
|
||||
state: restarted
|
||||
when: auditd_enabled
|
||||
|
||||
post_tasks:
|
||||
- name: Run CIS compliance check
  # Writes a small compliance summary to /tmp/cis_check.log and echoes it so
  # the result can be registered. It uses a literal block with the shell
  # module so each statement stays on its own line; folded into a single line,
  # the "#" comment would swallow everything after it. The score line is
  # double-quoted so $score and $total are expanded instead of printed as-is.
  shell: |
    score=0
    total=0
    echo 'CIS Compliance Check Results:' > /tmp/cis_check.log
    # Add CIS checks here
    echo "Overall Score: $score/$total" >> /tmp/cis_check.log
    cat /tmp/cis_check.log
  register: cis_check
  # Reporting only; never flag the host as changed.
  changed_when: false
|
||||
|
||||
- name: Archive CIS results
|
||||
copy:
|
||||
content: "{{ cis_check.stdout }}"
|
||||
dest: "/var/log/cis_compliance_{{ ansible_date_time.iso8601 }}.log"
|
||||
@@ -0,0 +1,139 @@
|
||||
---
|
||||
- name: Apply Security Patches and Updates
|
||||
hosts: all
|
||||
become: true
|
||||
gather_facts: true
|
||||
vars:
|
||||
patch_window_start: "02:00"
|
||||
patch_window_end: "04:00"
|
||||
reboot_required: false
|
||||
security_only: true
|
||||
|
||||
pre_tasks:
|
||||
- name: Check patch window
|
||||
assert:
|
||||
that: ansible_date_time.hour|int >= patch_window_start.split(':')[0]|int and ansible_date_time.hour|int < patch_window_end.split(':')[0]|int
|
||||
fail_msg: "Current time {{ ansible_date_time.hour }}:{{ ansible_date_time.minute }} is outside patch window {{ patch_window_start }}-{{ patch_window_end }}"
|
||||
when: enforce_patch_window | default(true) | bool
|
||||
|
||||
- name: Create patch backup
|
||||
file:
|
||||
path: "/var/backups/pre-patch-{{ ansible_date_time.iso8601 }}"
|
||||
state: directory
|
||||
|
||||
- name: Backup package list
|
||||
command: dpkg --get-selections
|
||||
register: package_backup
|
||||
changed_when: false
|
||||
|
||||
- name: Save package backup
|
||||
copy:
|
||||
content: "{{ package_backup.stdout }}"
|
||||
dest: "/var/backups/pre-patch-{{ ansible_date_time.iso8601 }}/packages.list"
|
||||
|
||||
tasks:
|
||||
- name: Update package cache
|
||||
apt:
|
||||
update_cache: yes
|
||||
cache_valid_time: 300
|
||||
when: ansible_os_family == "Debian"
|
||||
|
||||
- name: Check for available updates
  # Counts upgradable packages (the "Listing..." header line is filtered out).
  # Pipes and redirections only work through the shell module; the command
  # module would pass "2>/dev/null", "|", "grep" ... to apt as arguments.
  shell: apt list --upgradable 2>/dev/null | grep -v "Listing..." | wc -l
  register: updates_available
  # Read-only query; never report a change.
  changed_when: false
  when: ansible_os_family == "Debian"
|
||||
|
||||
- name: Apply security updates only
|
||||
apt:
|
||||
upgrade: dist
|
||||
update_cache: yes
|
||||
when: security_only and ansible_os_family == "Debian"
|
||||
|
||||
- name: Apply all updates
|
||||
apt:
|
||||
upgrade: dist
|
||||
update_cache: yes
|
||||
when: not security_only and ansible_os_family == "Debian"
|
||||
|
||||
- name: Check if reboot required
|
||||
stat:
|
||||
path: /var/run/reboot-required
|
||||
register: reboot_required_file
|
||||
when: ansible_os_family == "Debian"
|
||||
|
||||
- name: Set reboot flag
|
||||
set_fact:
|
||||
reboot_required: "{{ reboot_required_file.stat.exists | default(false) }}"
|
||||
|
||||
- name: Restart services after patching
|
||||
service:
|
||||
name: "{{ item }}"
|
||||
state: restarted
|
||||
loop:
|
||||
- sshd
|
||||
- fail2ban
|
||||
- unattended-upgrades
|
||||
ignore_errors: true
|
||||
|
||||
- name: Update monitoring agent
|
||||
include_role:
|
||||
name: monitoring_agent_update
|
||||
when: "'monitoring' in group_names"
|
||||
|
||||
- name: Verify critical services
|
||||
service:
|
||||
name: "{{ item }}"
|
||||
state: started
|
||||
loop:
|
||||
- systemd-journald
|
||||
- systemd-logind
|
||||
- cron
|
||||
ignore_errors: true
|
||||
|
||||
- name: Run post-patch health checks
|
||||
uri:
|
||||
url: http://localhost/health
|
||||
method: GET
|
||||
status_code: 200
|
||||
register: health_result
|
||||
ignore_errors: true
|
||||
when: "'webservers' in group_names"
|
||||
|
||||
post_tasks:
|
||||
- name: Generate patch report
  # Renders the per-host patch report. The health check only runs on
  # webservers; on every other host the registered result is "skipped" and has
  # no "status" key, so the status expression treats a skipped check as success
  # and defaults a missing status to 0 rather than failing on an undefined
  # variable.
  template:
    src: templates/patch_report.j2
    dest: "/var/log/patch_report_{{ ansible_date_time.iso8601 }}.log"
  vars:
    patch_status: "{{ 'SUCCESS' if (health_result is skipped or (health_result.status | default(0)) == 200) else 'WARNING' }}"
    updates_applied: "{{ updates_available.stdout | default('0') }}"
    reboot_needed: "{{ reboot_required }}"
|
||||
|
||||
- name: Send patch notification
  # E-mails a short patch summary when patch_notification_email is set.
  # The update count is read from the registered updates_available result; it
  # is not available here under any task-local variable name from other tasks.
  # The health line tolerates hosts where the health check was skipped
  # (non-webservers), whose result has no "status" key.
  mail:
    to: "{{ patch_notification_email | default('infra-team@company.com') }}"
    subject: "Patch Report - {{ inventory_hostname }}"
    body: |
      Patch completed for {{ inventory_hostname }}

      Updates applied: {{ updates_available.stdout | default('0') }}
      Reboot required: {{ reboot_required }}
      Health check: {{ 'PASSED' if (health_result.status | default(0)) == 200 else ('SKIPPED' if health_result is skipped else 'FAILED') }}

      See /var/log/patch_report_{{ ansible_date_time.iso8601 }}.log for details
  when: patch_notification_email is defined
  # Notification is best effort; a mail failure must not fail the patch run.
  ignore_errors: true
|
||||
|
||||
- name: Schedule reboot if required
|
||||
command: shutdown -r +5 "Rebooting for security patches"
|
||||
when: reboot_required and auto_reboot | default(false) | bool
|
||||
async: 600
|
||||
poll: 0
|
||||
|
||||
handlers:
|
||||
- name: restart monitoring
|
||||
service:
|
||||
name: "{{ monitoring_service | default('prometheus-node-exporter') }}"
|
||||
state: restarted
|
||||
when: "'monitoring' in group_names"
|
||||
@@ -0,0 +1,158 @@
|
||||
---
# Baseline provisioning for every simulated node: packages, admin account,
# timezone, SSH hardening, firewall, fail2ban, unattended upgrades,
# application directories, and the role(s) matching the host's inventory
# groups. Ends with a best-effort service and health verification.
- name: Provision Enterprise Infrastructure Nodes
  hosts: all
  become: true
  gather_facts: true
  vars:
    node_timezone: "UTC"
    admin_user: "infra-admin"
    # Initial password for admin_user. Override it (for example from Ansible
    # Vault); the default keeps the simulator's original throwaway value.
    admin_password: "infra-admin-password"
    ssh_port: 22
    # Debian-family systems ship the OpenSSH service as "ssh".
    ssh_service: "{{ 'ssh' if ansible_os_family == 'Debian' else 'sshd' }}"
    packages:
      - curl
      - wget
      - vim
      - htop
      - net-tools
      - iptables
      - ufw  # needed by the firewall tasks below
      - fail2ban
      - unattended-upgrades

  tasks:
    - name: Update package cache
      apt:
        update_cache: yes
        cache_valid_time: 3600
      when: ansible_os_family == "Debian"

    - name: Install base packages
      apt:
        name: "{{ packages }}"
        state: present
      when: ansible_os_family == "Debian"

    - name: Create admin user
      user:
        name: "{{ admin_user }}"
        groups: sudo
        append: yes
        create_home: yes
        shell: /bin/bash
        password: "{{ admin_password | password_hash('sha512') }}"
        # password_hash picks a new random salt on each run; only set the
        # password when the account is created so re-runs stay idempotent.
        update_password: on_create
      no_log: true  # keep the password hash out of logs

    - name: Configure timezone
      timezone:
        name: "{{ node_timezone }}"

    # The SSH daemon is restarted through a handler, i.e. only when one of
    # these edits actually changed sshd_config.
    - name: Configure SSH
      block:
        - name: Disable root SSH login
          lineinfile:
            path: /etc/ssh/sshd_config
            regexp: '^PermitRootLogin'
            line: 'PermitRootLogin no'
          notify: restart ssh

        - name: Set SSH port
          lineinfile:
            path: /etc/ssh/sshd_config
            regexp: '^Port'
            line: "Port {{ ssh_port }}"
          notify: restart ssh

        - name: Disable password authentication
          lineinfile:
            path: /etc/ssh/sshd_config
            regexp: '^PasswordAuthentication'
            line: 'PasswordAuthentication no'
          notify: restart ssh

    # The ufw module handles one rule per invocation (it has no "rules" list
    # option), so the allow rules are applied in a loop. They are added before
    # the firewall is enabled so the SSH port is never locked out.
    - name: Allow required inbound ports
      ufw:
        rule: allow
        port: "{{ item }}"
        proto: tcp
      loop:
        - "{{ ssh_port }}"
        - '80'
        - '443'

    - name: Configure firewall
      ufw:
        state: enabled
        policy: deny

    - name: Configure fail2ban
      template:
        src: templates/jail.local.j2
        dest: /etc/fail2ban/jail.local
      notify: restart fail2ban

    - name: Enable unattended upgrades
      lineinfile:
        path: /etc/apt/apt.conf.d/20auto-upgrades
        regexp: '^APT::Periodic::Unattended-Upgrade'
        line: 'APT::Periodic::Unattended-Upgrade "1";'
      when: ansible_os_family == "Debian"

    - name: Create application directories
      file:
        path: "{{ item }}"
        state: directory
        owner: "{{ admin_user }}"
        group: "{{ admin_user }}"
        mode: '0755'
      loop:
        - /opt/application
        - /var/log/application
        - /etc/application

    - name: Deploy monitoring agent
      include_role:
        name: monitoring_agent
      when: "'monitoring' in group_names"

    - name: Deploy web server
      include_role:
        name: nginx
      when: "'webservers' in group_names"

    - name: Deploy database server
      include_role:
        name: postgresql
      when: "'databases' in group_names"

    - name: Deploy load balancer
      include_role:
        name: haproxy
      when: "'loadbalancers' in group_names"

    # NOTE(review): delegated to localhost, so every host writes its report
    # under /var/log on the control node - confirm this is intended.
    - name: Generate provisioning report
      template:
        src: templates/provisioning_report.j2
        dest: "/var/log/provisioning_report_{{ ansible_date_time.iso8601 }}.log"
      delegate_to: localhost

  handlers:
    - name: restart fail2ban
      service:
        name: fail2ban
        state: restarted

    - name: restart ssh
      service:
        name: "{{ ssh_service }}"
        state: restarted

  post_tasks:
    # services_to_verify is optional; with no list supplied nothing is checked.
    - name: Verify services
      service:
        name: "{{ item }}"
        state: started
        enabled: yes
      loop: "{{ services_to_verify | default([]) }}"
      ignore_errors: true

    - name: Run health checks
      uri:
        url: http://localhost/health
        method: GET
      register: health_check
      ignore_errors: true
      when: "'webservers' in group_names"
|
||||
@@ -0,0 +1,21 @@
|
||||
# Scenario: Simulate Failure and Patch
|
||||
|
||||
## Description
|
||||
|
||||
Validate that a service-level failure can be detected, recovered, and followed by a controlled patch workflow. This mirrors a maintenance window where a degraded node is stabilized before package updates are applied.
|
||||
|
||||
## Commands
|
||||
|
||||
```bash
|
||||
cd enterprise-infra-simulator
|
||||
./scripts/simulate_failure.sh service 30 web
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/patch.yml
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml --check
|
||||
```
|
||||
|
||||
## Expected Result
|
||||
|
||||
- The simulation records a temporary service failure.
|
||||
- The service is restored after cleanup.
|
||||
- The patch playbook completes without unreachable hosts.
|
||||
- Hardening check mode reports no destructive changes.
|
||||
@@ -0,0 +1,116 @@
|
||||
---
# Scaling-event scenario: sample the load on each host, scale the matching
# tier up or down through the scale_up / scale_down roles, wait out a
# cooldown, verify web health, then notify and write a report.
- name: Enterprise Scaling Event Scenario
  hosts: all
  become: yes
  gather_facts: yes
  vars:
    # NOTE(review): compared directly with the 5-minute load average below;
    # a load average is not a percentage - confirm the intended unit.
    scaling_threshold: 80
    cooldown_period: 300
    max_scale_up: 5
    min_instances: 2

  pre_tasks:
    - name: Log scenario start
      lineinfile:
        path: "/var/log/scaling_scenario.log"
        line: "{{ ansible_date_time.iso8601 }} - Starting scaling event scenario"
        create: yes

    - name: Check current load
      command: uptime
      register: system_load
      changed_when: false

    # uptime ends with "load average: <1m>, <5m>, <15m>". Only the text after
    # that label is split, because the uptime and user-count fields earlier in
    # the line contain commas as well.
    - name: Parse load average
      set_fact:
        load_1min: "{{ system_load.stdout.split('load average:')[-1].split(',')[0] | trim | float }}"
        load_5min: "{{ system_load.stdout.split('load average:')[-1].split(',')[1] | trim | float }}"
        load_15min: "{{ system_load.stdout.split('load average:')[-1].split(',')[2] | trim | float }}"

  tasks:
    # Facts created by set_fact may come back as strings, so the load is cast
    # to float again before it is compared with a number.
    - name: Evaluate scaling conditions
      set_fact:
        scale_up_needed: "{{ (load_5min | float) > scaling_threshold }}"
        scale_down_needed: "{{ (load_5min | float) < (scaling_threshold * 0.3) }}"

    # Conditions are written as a list (implicit AND). Embedding the group
    # test as a quoted string inside a larger expression would make it a
    # non-empty string literal, which is always true.
    - name: Scale up web servers
      include_role:
        name: scale_up
        tasks_from: web_servers
      vars:
        scale_count: "{{ [max_scale_up, ((load_5min | float) / 10) | int] | min }}"
      when:
        - scale_up_needed | bool
        - "'webservers' in group_names"

    - name: Scale up database servers
      include_role:
        name: scale_up
        tasks_from: database_servers
      vars:
        scale_count: "{{ [2, ((load_5min | float) / 20) | int] | min }}"
      when:
        - scale_up_needed | bool
        - "'databases' in group_names"

    - name: Update load balancer configuration
      include_role:
        name: load_balancer
        tasks_from: update_backends
      when: scale_up_needed | bool

    # The host index is the first run of digits in the inventory name; hosts
    # without digits count as index 0 and are therefore never scaled down.
    - name: Scale down web servers
      include_role:
        name: scale_down
        tasks_from: web_servers
      vars:
        scale_count: "{{ [(inventory_hostname | regex_search('[0-9]+') | default('0', true) | int) - min_instances, 1] | max }}"
      when:
        - scale_down_needed | bool
        - "'webservers' in group_names"
        - (inventory_hostname | regex_search('[0-9]+') | default('0', true) | int) > min_instances

    - name: Wait for cooldown period
      pause:
        seconds: "{{ cooldown_period }}"
      when: (scale_up_needed | bool) or (scale_down_needed | bool)

    - name: Verify scaling results
      uri:
        url: http://localhost/health
        method: GET
        status_code: 200
      register: health_check
      until: (health_check.status | default(0)) == 200
      retries: 5
      delay: 10
      when: "'webservers' in group_names"

    - name: Update monitoring thresholds
      include_role:
        name: monitoring
        tasks_from: update_alerts
      vars:
        new_threshold: "{{ scaling_threshold + 10 }}"

    # health_check is skipped on non-webservers and then has no 'status' key,
    # so the status is defaulted before it is compared.
    - name: Send scaling notification
      mail:
        to: "{{ scaling_notification_email | default('infra-team@company.com') }}"
        subject: "Infrastructure Scaling Event - {{ inventory_hostname }}"
        body: |
          Scaling event completed on {{ inventory_hostname }}

          Load averages: {{ load_1min }}, {{ load_5min }}, {{ load_15min }}
          Action taken: {{ 'Scale Up' if (scale_up_needed | bool) else 'Scale Down' if (scale_down_needed | bool) else 'No Action' }}
          Health check: {{ 'PASSED' if (health_check.status | default(0)) == 200 else 'FAILED' }}

          See /var/log/scaling_scenario.log for details
      when: scaling_notification_email is defined
      ignore_errors: yes

  post_tasks:
    - name: Generate scaling scenario report
      template:
        src: templates/scaling_scenario_report.j2
        dest: "/var/log/scaling_scenario_report_{{ ansible_date_time.iso8601 }}.log"
      vars:
        scenario_outcome: "{{ 'SUCCESS' if (health_check.status | default(0)) == 200 else 'WARNING' }}"
        load_metrics: "{{ load_1min }}, {{ load_5min }}, {{ load_15min }}"

    - name: Log scenario completion
      lineinfile:
        path: "/var/log/scaling_scenario.log"
        line: "{{ ansible_date_time.iso8601 }} - Scaling event scenario completed"
|
||||
@@ -0,0 +1,343 @@
|
||||
#!/bin/bash
#
# Enterprise Infrastructure Failure Simulation Script
# Simulates various types of infrastructure failures for testing
#
# Usage: simulate_failure.sh [failure_type] [duration_seconds] [target_nodes]
#   failure_type      defaults to "network"
#   duration_seconds  defaults to 60
#   target_nodes      defaults to "all"
# The INTENSITY environment variable defaults to "medium".

set -o errexit -o nounset -o pipefail

# Configuration (paths are relative to the current working directory)
DOCKER_COMPOSE_FILE="docker-compose.yml"
INVENTORY_FILE="inventory/hosts.ini"
LOG_FILE="logs/failure_simulation.log"

# Positional arguments with their defaults
FAILURE_TYPE="${1:-network}"
DURATION="${2:-60}"
TARGET_NODES="${3:-all}"

# Taken from the environment when set and non-empty
: "${INTENSITY:=medium}"
|
||||
|
||||
# Print a timestamped message to stdout and append it to $LOG_FILE.
# Arguments: all arguments are joined into one message.
log() {
    local stamp
    stamp="$(date '+%Y-%m-%d %H:%M:%S')"
    echo "${stamp} - $*" | tee -a "$LOG_FILE"
}
|
||||
|
||||
# Log a fatal error, roll back any injected failure, and abort.
# Arguments: $1 - message, logged with an "ERROR: " prefix.
# Exits the script with status 1.
error_exit() {
    local message="$1"

    log "ERROR: ${message}"
    # Undo anything that was already injected before leaving
    cleanup_failure
    exit 1
}
|
||||
|
||||
# Check FAILURE_TYPE, DURATION and INTENSITY; call error_exit on the first
# value that is not acceptable.
validate_inputs() {
    local known_types='^(network|disk|service|node|cpu|memory)$'
    local known_levels='^(low|medium|high|critical)$'

    if ! [[ "$FAILURE_TYPE" =~ $known_types ]]; then
        error_exit "Invalid failure type: $FAILURE_TYPE. Must be network, disk, service, node, cpu, or memory"
    fi

    # Digits only, and at least 1
    if ! [[ "$DURATION" =~ ^[0-9]+$ ]] || [ "$DURATION" -lt 1 ]; then
        error_exit "Invalid duration: $DURATION. Must be a positive integer (seconds)"
    fi

    if ! [[ "$INTENSITY" =~ $known_levels ]]; then
        error_exit "Invalid intensity: $INTENSITY. Must be low, medium, high, or critical"
    fi
}
|
||||
|
||||
# Print the compose service name(s) selected by TARGET_NODES.
# "all" lists every service reported by docker-compose (one per line, with a
# literal "NAME" line filtered out); any other value is printed unchanged.
get_target_containers() {
    if [ "$TARGET_NODES" = "all" ]; then
        docker-compose ps --services | grep -v "^NAME$" || true
        return
    fi

    # web, db, lb, monitor and custom names are all passed through as given
    echo "$TARGET_NODES"
}
|
||||
|
||||
# Network failure simulation.
# Disconnects every targeted container from the network named by its
# HostConfig.NetworkMode and records "<container id>:<network>" in
# /tmp/network_failure_state so cleanup_failure can reconnect it.
simulate_network_failure() {
    local containers container container_ids cid network

    containers=$(get_target_containers) || true
    log "Simulating network failure on containers: $containers"

    for container in $containers; do
        container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true)

        for cid in $container_ids; do
            [ -n "$cid" ] || continue
            log "Disconnecting network for container $cid"

            # Look the network up once, before touching the container
            network=$(docker inspect "$cid" --format '{{.HostConfig.NetworkMode}}' 2>/dev/null || true)
            if [ -z "$network" ]; then
                log "WARNING: could not determine network for container $cid; skipping"
                continue
            fi

            # Record the state first so an interruption between the two steps
            # can never leave a disconnected container without a restore entry
            echo "$cid:$network" >> /tmp/network_failure_state

            # Best effort: a failed disconnect must not abort the simulation
            docker network disconnect "$network" "$cid" 2>/dev/null || true
        done
    done
}
|
||||
|
||||
# Disk failure simulation.
# Writes a zero-filled file /tmp/disk_fill inside every targeted container and
# records "<container id>:disk_fill" in /tmp/disk_failure_state for cleanup.
# File size by INTENSITY: low 50 MiB, medium 100 MiB, high 500 MiB,
# critical 1024 MiB (anything else falls back to 100 MiB).
simulate_disk_failure() {
    local containers container container_ids cid fill_mb

    containers=$(get_target_containers) || true
    log "Simulating disk space exhaustion on containers: $containers"

    # dd runs with bs=1M, so the block count is simply the size in MiB.
    # The size does not depend on the container, so resolve it once.
    case "$INTENSITY" in
        low)      fill_mb=50 ;;
        high)     fill_mb=500 ;;
        critical) fill_mb=1024 ;;
        *)        fill_mb=100 ;;
    esac

    for container in $containers; do
        container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true)

        for cid in $container_ids; do
            [ -n "$cid" ] || continue
            log "Filling disk space in container $cid"

            # Best effort: a full disk or a missing dd must not abort the run
            docker exec "$cid" bash -c "dd if=/dev/zero of=/tmp/disk_fill bs=1M count=${fill_mb}" 2>/dev/null || true
            echo "$cid:disk_fill" >> /tmp/disk_failure_state
        done
    done
}
|
||||
|
||||
# Service failure simulation.
# Stops nginx, postgresql and haproxy (whichever exist) inside every targeted
# container and records "<container id>:services" in /tmp/service_failure_state.
simulate_service_failure() {
    local targets service ids id unit

    targets=$(get_target_containers) || true
    log "Simulating service failures on containers: $targets"

    for service in $targets; do
        ids=$(docker-compose ps -q "$service" 2>/dev/null || true)

        for id in $ids; do
            [ -n "$id" ] || continue
            log "Stopping services in container $id"

            # Stop common services; errors (unit not present) are ignored
            for unit in nginx postgresql haproxy; do
                docker exec "$id" systemctl stop "$unit" 2>/dev/null || true
            done

            echo "$id:services" >> /tmp/service_failure_state
        done
    done
}
|
||||
|
||||
# Node failure simulation.
# Pauses (docker pause) every targeted container and records
# "<container id>:paused" in /tmp/node_failure_state for cleanup.
simulate_node_failure() {
    local targets service ids id

    targets=$(get_target_containers) || true
    log "Simulating complete node failures on containers: $targets"

    for service in $targets; do
        ids=$(docker-compose ps -q "$service" 2>/dev/null || true)

        for id in $ids; do
            [ -n "$id" ] || continue
            log "Stopping container $id (node failure)"
            # Not guarded with "|| true": a failed pause aborts the script
            docker pause "$id"
            echo "$id:paused" >> /tmp/node_failure_state
        done
    done
}
|
||||
|
||||
# CPU stress simulation.
# Starts a detached busy loop inside every targeted container and records
# "<container id>:cpu_stress:<pid>" in /tmp/cpu_failure_state, where <pid> is
# the first matching process found by ps (empty when none is found).
simulate_cpu_failure() {
    local targets service ids id stress_pid

    targets=$(get_target_containers) || true
    log "Simulating CPU stress on containers: $targets"

    for service in $targets; do
        ids=$(docker-compose ps -q "$service" 2>/dev/null || true)

        for id in $ids; do
            [ -n "$id" ] || continue
            log "Starting CPU stress in container $id"

            # Start CPU stress process
            docker exec -d "$id" bash -c "while true; do :; done" 2>/dev/null || true

            # A failed lookup leaves the pid field empty instead of aborting
            stress_pid=$(docker exec "$id" ps aux | grep "while true" | grep -v grep | awk '{print $2}' | head -1) || true
            echo "$id:cpu_stress:$stress_pid" >> /tmp/cpu_failure_state
        done
    done
}
|
||||
|
||||
# Memory stress simulation.
# Starts a detached "tail /dev/zero" inside every targeted container and
# records "<container id>:memory_stress:<pid>" in /tmp/memory_failure_state,
# where <pid> is the first matching process found by ps (empty when none).
simulate_memory_failure() {
    local targets service ids id stress_pid

    targets=$(get_target_containers) || true
    log "Simulating memory exhaustion on containers: $targets"

    for service in $targets; do
        ids=$(docker-compose ps -q "$service" 2>/dev/null || true)

        for id in $ids; do
            [ -n "$id" ] || continue
            log "Starting memory stress in container $id"

            # Start memory stress process
            docker exec -d "$id" bash -c "tail /dev/zero" 2>/dev/null || true

            # A failed lookup leaves the pid field empty instead of aborting
            stress_pid=$(docker exec "$id" ps aux | grep "tail /dev/zero" | grep -v grep | awk '{print $2}' | head -1) || true
            echo "$id:memory_stress:$stress_pid" >> /tmp/memory_failure_state
        done
    done
}
|
||||
|
||||
# Run the simulate_<type>_failure function that matches FAILURE_TYPE.
# Values outside the known list are ignored (nothing is run).
inject_failure() {
    case "$FAILURE_TYPE" in
        network|disk|service|node|cpu|memory)
            "simulate_${FAILURE_TYPE}_failure"
            ;;
    esac
}
|
||||
|
||||
# Undo every injected failure that left a state file under /tmp.
# Each state file holds colon-separated records whose first field is a
# container id; a file is deleted once its records have been replayed, so the
# function can be called repeatedly. Individual restore errors are ignored.
cleanup_failure() {
    local state_file container_id network_name stress_pid unit

    log "Cleaning up failure simulation"

    # Restore network connections ("<id>:<network>")
    state_file=/tmp/network_failure_state
    if [ -f "$state_file" ]; then
        while IFS=: read -r container_id network_name; do
            docker network connect "$network_name" "$container_id" 2>/dev/null || true
        done < "$state_file"
        rm -f "$state_file"
    fi

    # Clean up disk fill files
    state_file=/tmp/disk_failure_state
    if [ -f "$state_file" ]; then
        while IFS=: read -r container_id _; do
            docker exec "$container_id" rm -f /tmp/disk_fill 2>/dev/null || true
        done < "$state_file"
        rm -f "$state_file"
    fi

    # Restart services
    state_file=/tmp/service_failure_state
    if [ -f "$state_file" ]; then
        while IFS=: read -r container_id _; do
            for unit in nginx postgresql haproxy; do
                docker exec "$container_id" systemctl start "$unit" 2>/dev/null || true
            done
        done < "$state_file"
        rm -f "$state_file"
    fi

    # Unpause containers
    state_file=/tmp/node_failure_state
    if [ -f "$state_file" ]; then
        while IFS=: read -r container_id _; do
            docker unpause "$container_id" 2>/dev/null || true
        done < "$state_file"
        rm -f "$state_file"
    fi

    # Kill stress processes ("<id>:<kind>:<pid>"), CPU first, then memory
    for state_file in /tmp/cpu_failure_state /tmp/memory_failure_state; do
        if [ -f "$state_file" ]; then
            while IFS=: read -r container_id _ stress_pid; do
                docker exec "$container_id" kill -9 "$stress_pid" 2>/dev/null || true
            done < "$state_file"
            rm -f "$state_file"
        fi
    done
}
|
||||
|
||||
# Monitor the environment while the failure is active.
# For DURATION seconds, roughly every 10 seconds: warn when docker-compose
# reports no container as Up or Paused, and log a docker stats snapshot.
monitor_failure() {
    local end_time remaining compose_status

    end_time=$(( $(date +%s) + DURATION ))

    log "Monitoring failure for $DURATION seconds"

    while [ "$(date +%s)" -lt "$end_time" ]; do
        # Capture the listing before searching it: with pipefail set, piping
        # straight into "grep -q" can report failure when grep exits early
        compose_status=$(docker-compose ps || true)
        if ! grep -q "Up\|Paused" <<<"$compose_status"; then
            log "WARNING: All containers are down"
        fi

        # Log system metrics
        log "System status: $(docker stats --no-stream --format 'table {{.Container}}\t{{.CPUPerc}}\t{{.MemPerc}}' | tail -n +2)"

        # Never sleep past the end of the window, so short durations are
        # honoured instead of being rounded up to a full 10 seconds
        remaining=$(( end_time - $(date +%s) ))
        [ "$remaining" -gt 0 ] || break
        sleep "$(( remaining < 10 ? remaining : 10 ))"
    done
}
|
||||
|
||||
# Generate failure report.
# Writes reports/failure_simulation_<timestamp>.txt with the run parameters,
# the container status captured before the failure was injected (taken from
# PRE_FAILURE_STATUS, "not captured" when it is unset) and the status at
# report time.
generate_report() {
    local report_file
    report_file="reports/failure_simulation_$(date +%Y%m%d_%H%M%S).txt"

    cat > "$report_file" << EOF
Failure Simulation Report
========================

Timestamp: $(date)
Failure Type: $FAILURE_TYPE
Duration: $DURATION seconds
Target Nodes: $TARGET_NODES
Intensity: $INTENSITY

Pre-failure Status:
${PRE_FAILURE_STATUS:-not captured}

Post-failure Status:
$(docker-compose ps)

Log File: $LOG_FILE
EOF

    log "Failure simulation report generated: $report_file"
}

# Main execution: validate, snapshot, inject, monitor, clean up, report.
main() {
    log "Starting failure simulation: $FAILURE_TYPE for $DURATION seconds"

    validate_inputs

    # Snapshot the container status now; once the failure has been injected
    # the pre-failure state can no longer be observed
    PRE_FAILURE_STATUS="$(docker-compose ps 2>&1 || true)"

    # Inject failure
    inject_failure

    # Monitor during failure
    monitor_failure

    # Cleanup
    cleanup_failure

    # Generate report
    generate_report

    log "Failure simulation completed successfully"
}
|
||||
|
||||
# Always roll back injected failures when the script exits, whatever the reason
trap 'cleanup_failure' EXIT

# Output directories used by log() and generate_report()
mkdir -p logs reports

# Entry point: run the simulation with the command-line arguments
main "$@"
|
||||
@@ -0,0 +1,208 @@
|
||||
#!/bin/bash
#
# Enterprise Infrastructure Scaling Simulation Script
# Simulates scaling operations for infrastructure nodes
#
# Usage: <script> [direction] [count] [node_type]
#   direction  defaults to "up"
#   count      defaults to 1
#   node_type  defaults to "web"
# The SIMULATION_MODE environment variable defaults to "false".

set -o errexit -o nounset -o pipefail

# Configuration (paths are relative to the current working directory)
DOCKER_COMPOSE_FILE="docker-compose.yml"
INVENTORY_FILE="inventory/hosts.ini"
LOG_FILE="logs/scaling_simulation.log"

# Positional arguments with their defaults
DIRECTION="${1:-up}"
COUNT="${2:-1}"
NODE_TYPE="${3:-web}"

# Taken from the environment when set and non-empty
: "${SIMULATION_MODE:=false}"
|
||||
|
||||
# Print a timestamped message to stdout and append it to $LOG_FILE.
# Arguments: all arguments are joined into one message.
log() {
    local stamp
    stamp="$(date '+%Y-%m-%d %H:%M:%S')"
    echo "${stamp} - $*" | tee -a "$LOG_FILE"
}
|
||||
|
||||
# Log a fatal error and abort.
# Arguments: $1 - message, logged with an "ERROR: " prefix.
# Exits the script with status 1.
error_exit() {
    local message="$1"

    log "ERROR: ${message}"
    exit 1
}
|
||||
|
||||
# Check DIRECTION, COUNT and NODE_TYPE; call error_exit on the first value
# that is not acceptable.
validate_inputs() {
    case "$DIRECTION" in
        up|down) ;;
        *) error_exit "Invalid direction: $DIRECTION. Must be 'up' or 'down'" ;;
    esac

    # Digits only, and at least 1
    if ! [[ "$COUNT" =~ ^[0-9]+$ ]] || [ "$COUNT" -lt 1 ]; then
        error_exit "Invalid count: $COUNT. Must be a positive integer"
    fi

    if ! [[ "$NODE_TYPE" =~ ^(web|db|lb|monitor)$ ]]; then
        error_exit "Invalid node type: $NODE_TYPE. Must be web, db, lb, or monitor"
    fi
}
|
||||
|
||||
# Print the number of "docker-compose ps <service>" lines that contain "Up".
# Arguments: $1 - compose service name (web, db, lb, monitor, ...)
# Always returns 0: "grep -c" prints 0 but exits non-zero when nothing
# matches, which under pipefail would otherwise look like a failure.
get_current_count() {
    local type="$1"

    docker-compose ps "$type" | grep -c "Up" || true
}
|
||||
|
||||
# Scale up infrastructure.
# Adds $2 instances of service $1 on top of the ones currently running, then
# updates the inventory and (unless SIMULATION_MODE is set to something other
# than "false") provisions the nodes with Ansible.
# Arguments: $1 - service/node type, $2 - number of instances to add.
scale_up() {
    local type="$1"
    local count="$2"
    local current target

    # An empty or failed count is treated as zero running instances
    current=$(get_current_count "$type" || true)
    current=${current:-0}
    target=$(( current + count ))

    log "Scaling up $count $type nodes"

    # Update docker-compose replica count
    # NOTE(review): this pattern rewrites every "replicas:" line in the file,
    # not only the one belonging to $type - confirm that is acceptable.
    sed -i.bak "s/replicas: [0-9]\+/replicas: ${target}/" "$DOCKER_COMPOSE_FILE"

    # Deploy new containers. --scale takes the desired TOTAL number of
    # instances, so pass current + count rather than the increment alone.
    docker-compose up -d --scale "${type}=${target}"

    # Wait for containers to be ready
    log "Waiting for containers to be ready..."
    sleep 30

    # Update inventory
    update_inventory "$type" "$count" "add"

    # Run provisioning playbook on new nodes
    if [ "$SIMULATION_MODE" = false ]; then
        ansible-playbook -i "$INVENTORY_FILE" playbooks/provision.yml --limit "${type}*"
    fi

    log "Successfully scaled up $count $type nodes"
}
|
||||
|
||||
# Scale down infrastructure.
# Removes $2 running instances of service $1: the first $2 "Up" entries listed
# by docker-compose are decommissioned (skipped when SIMULATION_MODE is not
# "false"), stopped and removed; the compose file and inventory are updated.
# Aborts through error_exit when fewer than $2 instances are running.
# Arguments: $1 - service/node type, $2 - number of instances to remove.
scale_down() {
    local type="$1"
    local count="$2"
    local running victims victim

    running=$(get_current_count "$type") || true
    if [ "$running" -lt "$count" ]; then
        error_exit "Cannot scale down $count nodes. Only $running $type nodes currently running"
    fi

    log "Scaling down $count $type nodes"

    # Select nodes to remove: first column of the first $count "Up" lines
    victims=$(docker-compose ps "$type" | grep "Up" | head -n "$count" | awk '{print $1}') || true

    # Decommission nodes
    for victim in $victims; do
        if [ "$SIMULATION_MODE" = false ]; then
            ansible-playbook -i "$INVENTORY_FILE" playbooks/decommission.yml --limit "$victim"
        fi
        docker stop "$victim"
        docker rm "$victim"
    done

    # Update docker-compose replica count
    sed -i.bak "s/replicas: [0-9]\+/replicas: $(( running - count ))/" "$DOCKER_COMPOSE_FILE"

    # Update inventory
    update_inventory "$type" "$count" "remove"

    log "Successfully scaled down $count $type nodes"
}
|
||||
|
||||
# Update Ansible inventory (simulation only: the change is logged, the
# inventory file itself is not modified).
# Arguments: $1 - node type, $2 - node count, $3 - action ("add" or "remove";
# other actions only produce the first log line).
update_inventory() {
    local type="$1"
    local count="$2"
    local action="$3"

    log "Updating inventory for $action $count $type nodes"

    # This would be more complex in a real implementation
    # For simulation, we'll just log the action
    if [ "$action" = "add" ]; then
        log "Added $count $type nodes to inventory"
    elif [ "$action" = "remove" ]; then
        log "Removed $count $type nodes from inventory"
    fi
}
|
||||
|
||||
# Health check after scaling.
# Aborts through error_exit when docker-compose lists no container as "Up";
# unless SIMULATION_MODE is set to something other than "false", also runs an
# Ansible ping against all hosts and logs a warning when it fails.
health_check() {
    local compose_status

    log "Running health checks after scaling"

    # Check container status. The listing is captured before it is searched:
    # with pipefail set, piping straight into "grep -q" can report failure
    # when grep exits on its first match.
    # NOTE: this only detects the case where nothing at all is running.
    compose_status=$(docker-compose ps || true)
    if ! grep -q "Up" <<<"$compose_status"; then
        error_exit "Some containers failed to start"
    fi

    # Ansible ping check
    if [ "$SIMULATION_MODE" = false ]; then
        if ! ansible -i "$INVENTORY_FILE" all -m ping >/dev/null 2>&1; then
            log "WARNING: Some nodes failed Ansible ping check"
        fi
    fi

    log "Health checks completed"
}
|
||||
|
||||
# Generate scaling report.
# Writes reports/scaling_report_<timestamp>.txt with the run parameters, the
# current docker-compose status and the Ansible host list (or a fallback line
# when the inventory query fails).
generate_report() {
    local report_file timestamp compose_status inventory_status

    report_file="reports/scaling_report_$(date +%Y%m%d_%H%M%S).txt"

    # Collect the dynamic sections first; a failing docker-compose call simply
    # leaves its section empty
    timestamp=$(date)
    compose_status=$(docker-compose ps) || true
    inventory_status=$(ansible -i "$INVENTORY_FILE" --list-hosts all 2>/dev/null || echo "Ansible inventory check failed")

    cat > "$report_file" << EOF
Scaling Simulation Report
========================

Timestamp: $timestamp
Direction: $DIRECTION
Node Type: $NODE_TYPE
Count: $COUNT
Simulation Mode: $SIMULATION_MODE

Current Status:
$compose_status

Inventory Status:
$inventory_status

Log File: $LOG_FILE
EOF

    log "Scaling report generated: $report_file"
}
|
||||
|
||||
# Main execution: validate the inputs, scale in the requested direction, then
# run the health check and write the report.
main() {
    log "Starting scaling simulation: $DIRECTION $COUNT $NODE_TYPE nodes"

    validate_inputs

    if [ "$DIRECTION" = "up" ]; then
        scale_up "$NODE_TYPE" "$COUNT"
    elif [ "$DIRECTION" = "down" ]; then
        scale_down "$NODE_TYPE" "$COUNT"
    fi

    health_check
    generate_report

    log "Scaling simulation completed successfully"
}
|
||||
|
||||
# Output directories used by log() and generate_report()
mkdir -p logs reports

# Entry point: run the simulation with the command-line arguments
main "$@"
|
||||
Reference in New Issue
Block a user