ci: configure and stabilize CI/CD pipeline

- fix runner configuration issues
- correct workflow labels and execution environment
- resolve dependency issues in pipeline (python deps)
- improve reliability of automation runs
This commit is contained in:
Mateusz Suski
2026-04-29 23:14:14 +00:00
parent 2313efac88
commit fcf305bd70
45 changed files with 6016 additions and 0 deletions
+173
View File
@@ -0,0 +1,173 @@
# Enterprise Infrastructure Simulator Makefile

# Every target in this file is a command, not a file it produces. Declaring all
# of them phony keeps a same-named file or directory in the working tree from
# silently turning the target into "nothing to be done".
.PHONY: help run demo up down patch destroy status logs logs-web logs-db \
        clean test validate backup lint format harden security-scan \
        scale-up-web scale-up-db scale-down-web scale-down-db \
        fail-network fail-disk fail-service fail-node \
        scenario-scaling scenario-disaster help-scaling help-failure

# Default target
# Lists every rule that carries a trailing "## description" comment, sorted by
# target name. `$$` is how a literal `$` reaches grep/awk from a recipe.
help: ## Show this help message
	@echo "Enterprise Infrastructure Simulator"
	@echo ""
	@echo "Available commands:"
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf " %-15s %s\n", $$1, $$2}'
# Provision every host listed in the static inventory.
run: ## Run the default simulator workflow
	ansible-playbook -i inventory/hosts.ini playbooks/provision.yml

# Inject a 30-second service failure on the web tier, then run the patch
# playbook. The second step only runs if the simulation exits successfully.
demo: ## Run a failure-and-patch demonstration
	./scripts/simulate_failure.sh service 30 web
	ansible-playbook -i inventory/hosts.ini playbooks/patch.yml
# Infrastructure management

# Start the compose stack, wait a fixed 30 seconds, then provision the nodes.
up: ## Start the infrastructure simulation
	@echo "Starting enterprise infrastructure simulation..."
	docker-compose up -d
	@echo "Waiting for containers to be ready..."
	@sleep 30
	ansible-playbook -i inventory/hosts.ini playbooks/provision.yml
	@echo "Infrastructure simulation started successfully"

# Decommissioning is best-effort (`|| true`): the stack is stopped even when
# the playbook fails, e.g. because the nodes are already unreachable.
down: ## Stop the infrastructure simulation
	@echo "Stopping infrastructure simulation..."
	ansible-playbook -i inventory/hosts.ini playbooks/decommission.yml || true
	docker-compose down
	@echo "Infrastructure simulation stopped"

patch: ## Apply security patches to all nodes
	@echo "Applying security patches..."
	ansible-playbook -i inventory/hosts.ini playbooks/patch.yml
	@echo "Security patches applied"

# Like `down`, but also removes compose volumes and orphan containers, prunes
# unused Docker resources and empties logs/ and reports/.
# NOTE(review): `docker system prune -f` is not scoped to this project; it
# prunes unused Docker resources host-wide. Confirm that is intended.
destroy: ## Completely destroy the infrastructure
	@echo "Destroying infrastructure..."
	ansible-playbook -i inventory/hosts.ini playbooks/decommission.yml || true
	docker-compose down -v --remove-orphans
	docker system prune -f
	rm -rf logs/* reports/*
	@echo "Infrastructure completely destroyed"
# Scaling operations
#
# COUNT is optional and defaults to 1. The banner and the script invocation
# both go through $(or $(COUNT),1) so the printed count always matches the
# count that is actually passed to the script, including when COUNT is unset.
scale-up-web: ## Scale up web servers (usage: make scale-up-web COUNT=2)
	@echo "Scaling up $(or $(COUNT),1) web servers..."
	./scripts/simulate_scaling.sh up $(or $(COUNT),1) web

scale-up-db: ## Scale up database servers (usage: make scale-up-db COUNT=1)
	@echo "Scaling up $(or $(COUNT),1) database servers..."
	./scripts/simulate_scaling.sh up $(or $(COUNT),1) db

scale-down-web: ## Scale down web servers (usage: make scale-down-web COUNT=1)
	@echo "Scaling down $(or $(COUNT),1) web servers..."
	./scripts/simulate_scaling.sh down $(or $(COUNT),1) web

scale-down-db: ## Scale down database servers (usage: make scale-down-db COUNT=1)
	@echo "Scaling down $(or $(COUNT),1) database servers..."
	./scripts/simulate_scaling.sh down $(or $(COUNT),1) db
# Failure simulation
#
# Each target resolves its effective duration once, through a target-specific
# variable: the DURATION given on the command line, or a per-target default.
fail-network: duration = $(or $(DURATION),60)
fail-network: ## Simulate network failure (usage: make fail-network DURATION=60)
	@echo "Simulating network failure for $(duration) seconds..."
	./scripts/simulate_failure.sh network $(duration)

fail-disk: duration = $(or $(DURATION),120)
fail-disk: ## Simulate disk space exhaustion (usage: make fail-disk DURATION=120)
	@echo "Simulating disk failure for $(duration) seconds..."
	./scripts/simulate_failure.sh disk $(duration)

fail-service: duration = $(or $(DURATION),30)
fail-service: ## Simulate service failures (usage: make fail-service DURATION=30)
	@echo "Simulating service failure for $(duration) seconds..."
	./scripts/simulate_failure.sh service $(duration)

fail-node: duration = $(or $(DURATION),300)
fail-node: ## Simulate complete node failure (usage: make fail-node DURATION=300)
	@echo "Simulating node failure for $(duration) seconds..."
	./scripts/simulate_failure.sh node $(duration)
# Monitoring and status

# Three read-only views: compose containers, the hosts Ansible resolves from
# the inventory, and a one-shot resource snapshot. A failing inventory lookup
# is reported but does not abort the target.
status: ## Show infrastructure status
	@echo "=== Docker Containers ==="
	docker-compose ps
	@echo ""
	@echo "=== Ansible Inventory ==="
	ansible -i inventory/hosts.ini --list-hosts all || echo "Inventory check failed"
	@echo ""
	@echo "=== System Resources ==="
	docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemPerc}}\t{{.NetIO}}"

# The log targets follow (-f) and therefore block until interrupted.
logs: ## Show infrastructure logs
	docker-compose logs -f --tail=100

logs-web: ## Show web server logs
	docker-compose logs -f web

logs-db: ## Show database logs
	docker-compose logs -f db
# Testing and validation

# Needs reachable hosts: it pings every inventory host, syntax-checks all
# playbooks, then invokes both simulation scripts with minimal arguments.
# Any failing step aborts the target, so "All tests passed" is only printed
# when every command exited 0.
test: ## Run infrastructure tests
	@echo "Running infrastructure tests..."
	ansible -i inventory/hosts.ini all -m ping
	ansible-playbook -i inventory/hosts.ini --syntax-check playbooks/*.yml
	@echo "Testing scaling scripts..."
	./scripts/simulate_scaling.sh up 0 web # Dry run
	./scripts/simulate_failure.sh network 1 # Quick test
	@echo "All tests passed"

# Runs the provisioning playbook in --check mode and asks compose to render
# its configuration.
validate: ## Validate infrastructure configuration
	@echo "Validating configuration..."
	ansible-playbook -i inventory/hosts.ini playbooks/provision.yml --check
	docker-compose config
	@echo "Configuration validation complete"
# Scenarios
# Each scenario is an Ansible playbook under scenarios/ run against the same
# static inventory as the lifecycle playbooks.

scenario-scaling: ## Run scaling event scenario
	@echo "Running scaling event scenario..."
	ansible-playbook -i inventory/hosts.ini scenarios/scaling_event.yml

scenario-disaster: ## Run disaster recovery scenario
	@echo "Running disaster recovery scenario..."
	ansible-playbook -i inventory/hosts.ini scenarios/disaster_recovery.yml
# Maintenance

# Removes *.log under logs/ and *.txt under reports/, then prunes unused
# Docker resources.
# NOTE(review): `docker system prune -f` is host-wide, not project-scoped.
clean: ## Clean up temporary files and logs
	@echo "Cleaning up temporary files..."
	rm -rf logs/*.log reports/*.txt
	docker system prune -f
	@echo "Cleanup complete"

# Creates a timestamped directory under backups/, runs the backup playbook,
# then archives /infrastructure inside the `ansible` service container.
# `exec -T` disables pseudo-TTY allocation: docker-compose allocates one by
# default, which makes `exec` fail when make runs without a terminal (CI).
# NOTE(review): the archive goes to the fixed path /backups/infra_backup.tar.gz
# inside the container, not into the timestamped directory created here, so
# each run overwrites the previous archive. Confirm the intended layout.
backup: ## Create infrastructure backup
	@echo "Creating infrastructure backup..."
	mkdir -p backups/$(shell date +%Y%m%d_%H%M%S)
	ansible-playbook -i inventory/hosts.ini playbooks/backup.yml
	docker-compose exec -T ansible tar -czf /backups/infra_backup.tar.gz /infrastructure
	@echo "Backup created"
# Development

# Lints both the lifecycle playbooks and the scenario playbooks.
lint: ## Lint Ansible playbooks
	@echo "Linting Ansible playbooks..."
	ansible-lint playbooks/*.yml scenarios/*.yml
	@echo "Linting complete"

# Placeholder: no formatter is wired in yet, so this only prints its banners.
format: ## Format code and configuration
	@echo "Formatting code..."
	# Add formatting commands here
	@echo "Formatting complete"

# Security

harden: ## Apply security hardening
	@echo "Applying security hardening..."
	ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml

security-scan: ## Run security scans
	@echo "Running security scans..."
	ansible-playbook -i inventory/hosts.ini playbooks/security_scan.yml
# Help for specific targets
# Static cheat sheets; the values shown are examples, not enforced defaults.

help-scaling: ## Show scaling-related commands
	@echo "Scaling Commands:"
	@echo " make scale-up-web COUNT=2 - Add 2 web servers"
	@echo " make scale-up-db COUNT=1 - Add 1 database server"
	@echo " make scale-down-web COUNT=1 - Remove 1 web server"
	@echo " make scale-down-db COUNT=1 - Remove 1 database server"

help-failure: ## Show failure simulation commands
	@echo "Failure Simulation Commands:"
	@echo " make fail-network DURATION=60 - Network failure for 60s"
	@echo " make fail-disk DURATION=120 - Disk exhaustion for 120s"
	@echo " make fail-service DURATION=30 - Service failure for 30s"
	@echo " make fail-node DURATION=300 - Node failure for 300s"
+74
View File
@@ -0,0 +1,74 @@
# Enterprise Infrastructure Simulator
## Problem Statement
Infrastructure teams need a safe place to rehearse lifecycle operations before applying them to production fleets. Patch windows, hardening changes, scale events, and node failures all carry operational risk when they are tested only during real incidents.
## Solution Overview
This project models common Linux infrastructure operations with Ansible playbooks and shell-based simulations. It keeps the automation readable and auditable while producing example evidence that resembles a real change record.
## Architecture Overview
```
Operator -> Make/CLI -> Ansible Inventory -> Playbooks -> Linux Nodes
| |
v v
Scenarios Reports/Logs
```
Core components:
- `inventory/hosts.ini` defines managed node groups.
- `playbooks/` contains provisioning, patching, hardening, and decommissioning workflows.
- `scripts/` injects scaling and failure conditions.
- `scenarios/` documents operational exercises.
- `examples/` stores representative outputs for review.
## How to Run
```bash
cd enterprise-infra-simulator
# Ping every host, syntax-check the playbooks, and exercise the simulation scripts.
make test
# Provision the simulated estate.
make run
# Apply security patches.
make patch
# Apply host hardening.
make harden
# Run the failure and patch demo.
make demo
```
Direct Ansible commands are also supported:
```bash
ansible-playbook -i inventory/hosts.ini playbooks/provision.yml
ansible-playbook -i inventory/hosts.ini playbooks/patch.yml
ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml
```
## Example Output
```text
PLAY RECAP *********************************************************************
web01 : ok=21 changed=7 unreachable=0 failed=0 skipped=3 rescued=0 ignored=1
db01 : ok=18 changed=4 unreachable=0 failed=0 skipped=5 rescued=0 ignored=1
lb01 : ok=16 changed=3 unreachable=0 failed=0 skipped=6 rescued=0 ignored=0
Patch status: SUCCESS
Updates applied: 12
Reboot required: false
```
Additional sample evidence is available in [examples/patch-output.txt](examples/patch-output.txt) and [examples/failure-simulation.txt](examples/failure-simulation.txt).
## Real-World Use Case
A platform team can use this project to demonstrate how routine operating procedures are encoded, reviewed, and tested before production change windows. The same patterns apply to regulated Linux estates where patch evidence, hardening controls, and incident drills must be repeatable.
@@ -0,0 +1,30 @@
# Enterprise Infrastructure Simulator Architecture
## Components
- Operator interface: `make` targets and direct Ansible commands.
- Inventory: static host groups in `inventory/hosts.ini`.
- Automation: lifecycle playbooks in `playbooks/`.
- Simulation scripts: controlled failure and scaling events in `scripts/`.
- Evidence: logs, reports, scenario notes, and examples.
## Data Flow
```
Operator
-> Make target or shell script
-> Ansible inventory
-> lifecycle playbook
-> managed Linux node
-> log/report artifact
```
Failure drills follow a parallel flow:
```
Operator -> simulate_failure.sh -> target node/service -> health check -> patch/hardening playbook -> evidence
```
## Notes
The project favors explicit playbooks over hidden orchestration so the operational intent is visible during review. In a production implementation, the same workflows would typically run from a CI runner or automation controller with credentials supplied by a secret manager.
@@ -0,0 +1,8 @@
2026-04-29 02:13:41 - Starting failure simulation: service 30 web
2026-04-29 02:13:41 - Simulating service failures on containers: web
2026-04-29 02:13:42 - Stopping services in container enterprise-web-1
2026-04-29 02:13:44 - Health probe failed: http://web01/health returned 503
2026-04-29 02:14:12 - Cleaning up failure simulation
2026-04-29 02:14:13 - Restarted nginx in enterprise-web-1
2026-04-29 02:14:18 - Health probe recovered: http://web01/health returned 200
2026-04-29 02:14:18 - Failure simulation completed successfully
@@ -0,0 +1,33 @@
PLAY [Apply Security Patches and Updates] **************************************
TASK [Update package cache] *****************************************************
changed: [web01]
changed: [db01]
ok: [lb01]
TASK [Check for available updates] **********************************************
ok: [web01] => {"stdout": "9"}
ok: [db01] => {"stdout": "4"}
ok: [lb01] => {"stdout": "0"}
TASK [Apply security updates only] **********************************************
changed: [web01]
changed: [db01]
ok: [lb01]
TASK [Verify critical services] *************************************************
ok: [web01] => (item=systemd-journald)
ok: [web01] => (item=cron)
ok: [db01] => (item=systemd-journald)
ok: [lb01] => (item=cron)
PLAY RECAP *********************************************************************
web01 : ok=19 changed=6 unreachable=0 failed=0 skipped=2 rescued=0 ignored=1
db01 : ok=18 changed=5 unreachable=0 failed=0 skipped=2 rescued=0 ignored=1
lb01 : ok=15 changed=1 unreachable=0 failed=0 skipped=4 rescued=0 ignored=0
Patch report
Status: SUCCESS
Window: 02:00-04:00 UTC
Reboot required: false
Notification: infra-team@example.com
@@ -0,0 +1,35 @@
# Static Ansible inventory for the simulated estate.
# Every host is reached over SSH as root with the key at /root/.ssh/id_rsa.

# Web tier
[webservers]
web01 ansible_host=172.20.0.11 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
web02 ansible_host=172.20.0.12 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
web03 ansible_host=172.20.0.13 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
# Database tier
[databases]
db01 ansible_host=172.20.0.21 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
db02 ansible_host=172.20.0.22 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
# Load balancer tier
[loadbalancers]
lb01 ansible_host=172.20.0.31 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
# Monitoring tier
[monitoring]
mon01 ansible_host=172.20.0.41 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
# Connection settings shared by all hosts.
# StrictHostKeyChecking=no with UserKnownHostsFile=/dev/null turns off SSH
# host-key verification and never records host keys; do not copy these
# settings to a real environment.
[all:vars]
ansible_python_interpreter=/usr/bin/python3
ansible_ssh_common_args='-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
ansible_connection=ssh
# Per-group variables. node_type labels the tier of each group.
# NOTE(review): `environment` is also the name of an Ansible play/task keyword,
# so using it as a variable name may trigger a reserved-name warning; confirm
# whether templates depend on it before renaming.
[webservers:vars]
node_type=web
environment=production
[databases:vars]
node_type=database
environment=production
[loadbalancers:vars]
node_type=loadbalancer
environment=production
[monitoring:vars]
node_type=monitoring
environment=production
@@ -0,0 +1,181 @@
---
# Decommissions every inventory host: stops services, exports configuration,
# backs up tier-specific data, removes application files, packages and SSH
# credentials, writes a report, and optionally powers the node off.
#
# Toggles (play vars, overridable with -e):
#   graceful_shutdown  stop services and wait for connections to drain
#   export_config      archive configuration before removal
#   backup_data        back up tier-specific data before removal
#   cleanup_inventory  run tasks/update_inventory.yml afterwards
#   auto_shutdown      (default false) halt the node at the end
- name: Decommission Enterprise Infrastructure Nodes
  hosts: all
  become: true
  gather_facts: true

  vars:
    backup_data: true
    export_config: true
    graceful_shutdown: true
    cleanup_inventory: true
    # Per-run backup root. ansible_date_time is a gathered fact, so every task
    # in the run resolves this to the same directory.
    decommission_backup_dir: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}"

  pre_tasks:
    # Informational only: the result is registered but a failing probe does
    # not stop the decommission.
    - name: Check node health before decommissioning
      uri:
        url: http://localhost/health
        method: GET
        status_code: 200
      register: health_check
      ignore_errors: true
      when: "'webservers' in group_names"

    - name: Create decommissioning backup directory
      file:
        path: "{{ decommission_backup_dir }}"
        state: directory
        mode: '0755'

    - name: Log decommissioning start
      lineinfile:
        path: "/var/log/decommission.log"
        line: "{{ ansible_date_time.iso8601 }} - Starting decommissioning of {{ inventory_hostname }}"
        create: yes

  tasks:
    # Not every service exists on every tier, hence ignore_errors.
    - name: Stop application services gracefully
      service:
        name: "{{ item }}"
        state: stopped
      loop: "{{ application_services | default(['nginx', 'postgresql', 'haproxy']) }}"
      ignore_errors: true
      when: graceful_shutdown

    # The group test must be a real expression. Wrapped in quotes it is a
    # non-empty string literal, which is always truthy, so the pause would run
    # on every host regardless of tier.
    - name: Wait for connections to drain
      pause:
        seconds: 30
      when: graceful_shutdown and ('webservers' in group_names or 'loadbalancers' in group_names)

    - name: Export configuration files
      block:
        - name: Create config export directory
          file:
            path: "{{ decommission_backup_dir }}/config"
            state: directory

        - name: Archive system configuration
          archive:
            path:
              - /etc/
              - /opt/application/
            dest: "{{ decommission_backup_dir }}/config/system_config.tar.gz"
            format: gz

        # Best-effort: only the directories for the services installed on this
        # tier exist, so tar exits non-zero on the others.
        - name: Export service configurations
          command: >
            tar -czf {{ decommission_backup_dir }}/config/services.tar.gz
            /etc/nginx /etc/postgresql /etc/haproxy
          ignore_errors: true
      when: export_config

    - name: Backup application data
      block:
        - name: Create data backup directory
          file:
            path: "{{ decommission_backup_dir }}/data"
            state: directory

        # `shell`, not `command`: the dump is written through `>` redirection,
        # which `command` would hand to pg_dumpall as literal arguments.
        - name: Backup database data
          shell: >
            pg_dumpall -U postgres > "{{ decommission_backup_dir }}/data/database_backup.sql"
          ignore_errors: true
          when: "'databases' in group_names"

        - name: Backup application files
          archive:
            path: "/var/www/html"
            dest: "{{ decommission_backup_dir }}/data/application_data.tar.gz"
            format: gz
          ignore_errors: true
          when: "'webservers' in group_names"

        - name: Backup monitoring data
          archive:
            path: "/var/lib/prometheus"
            dest: "{{ decommission_backup_dir }}/data/monitoring_data.tar.gz"
            format: gz
          ignore_errors: true
          when: "'monitoring' in group_names"
      when: backup_data

    - name: Remove from load balancer
      include_tasks: tasks/remove_from_lb.yml
      when: "'webservers' in group_names or 'databases' in group_names"

    - name: Update monitoring alerts
      include_tasks: tasks/update_monitoring.yml
      when: "'monitoring' not in group_names"

    - name: Clean up application directories
      file:
        path: "{{ item }}"
        state: absent
      loop:
        - /opt/application
        - /var/www/html
        - /var/lib/postgresql
        - /var/lib/prometheus
      ignore_errors: true

    - name: Remove application packages
      apt:
        name: "{{ item }}"
        state: absent
        purge: yes
      loop: "{{ application_packages | default(['nginx', 'postgresql', 'haproxy', 'prometheus']) }}"
      when: ansible_os_family == "Debian"
      ignore_errors: true

    # Truncates rather than deletes, so open file handles stay valid.
    # NOTE(review): this also empties /var/log/decommission.log, discarding
    # the start entry written in pre_tasks.
    - name: Clean up system logs
      command: >
        find /var/log -name "*.log" -type f -exec truncate -s 0 {} \;
      ignore_errors: true

    # NOTE(review): the inventory connects as root with key authentication;
    # once authorized_keys is gone, any new SSH connection for the remaining
    # tasks will be refused. Confirm the ordering is intended.
    - name: Remove SSH keys and known hosts
      file:
        path: "{{ item }}"
        state: absent
      loop:
        - /root/.ssh/authorized_keys
        - /root/.ssh/known_hosts
        - /home/infra-admin/.ssh/authorized_keys
      ignore_errors: true

    - name: Generate decommissioning report
      template:
        src: templates/decommission_report.j2
        dest: "/var/log/decommission_report_{{ ansible_date_time.iso8601 }}.log"
      vars:
        decommission_status: "SUCCESS"
        backup_location: "{{ decommission_backup_dir }}"

  post_tasks:
    # Only sent when decommission_notification_email is supplied.
    - name: Send decommissioning notification
      mail:
        to: "{{ decommission_notification_email | default('infra-team@company.com') }}"
        subject: "Node Decommissioned - {{ inventory_hostname }}"
        body: |
          Node {{ inventory_hostname }} has been successfully decommissioned.
          Backup location: {{ decommission_backup_dir }}
          Services stopped: {{ application_services | default(['nginx', 'postgresql', 'haproxy']) | join(', ') }}
          Configuration exported: {{ export_config }}
          Data backed up: {{ backup_data }}
          See /var/log/decommission_report_{{ ansible_date_time.iso8601 }}.log for details
      when: decommission_notification_email is defined
      ignore_errors: true

    - name: Update dynamic inventory
      include_tasks: tasks/update_inventory.yml
      when: cleanup_inventory

    - name: Final log entry
      lineinfile:
        path: "/var/log/decommission.log"
        line: "{{ ansible_date_time.iso8601 }} - Decommissioning completed for {{ inventory_hostname }}"

    # Fire-and-forget (async + poll 0) so the play does not fail when the
    # connection drops as the node halts.
    - name: Shutdown node
      command: shutdown -h now
      async: 10
      poll: 0
      when: auto_shutdown | default(false) | bool
@@ -0,0 +1,210 @@
---
# Applies baseline hardening to every inventory host: SSH daemon settings,
# a default-deny firewall with SSH allowed from private ranges, removal of
# unneeded services/packages, auditd, AppArmor, sysctl network settings and
# ownership/mode of the account databases. Ends with a compliance-check stub.
- name: Harden Enterprise Infrastructure Nodes
  hosts: all
  become: true
  gather_facts: true

  vars:
    cis_level: 1
    disable_root_login: true
    secure_ssh_config: true
    firewall_policy: deny
    auditd_enabled: true
    selinux_mode: enforcing
    apparmor_enabled: true

  tasks:
    - name: Include CIS hardening tasks
      include_tasks: tasks/cis_hardening.yml

    # `notify` on the block is inherited by each task in it, so sshd is
    # restarted once at handler time if any setting changed.
    - name: Configure SSH hardening
      block:
        - name: Disable root SSH login
          lineinfile:
            path: /etc/ssh/sshd_config
            regexp: '^PermitRootLogin'
            line: 'PermitRootLogin no'
          when: disable_root_login

        - name: Disable password authentication
          lineinfile:
            path: /etc/ssh/sshd_config
            regexp: '^PasswordAuthentication'
            line: 'PasswordAuthentication no'

        - name: Set MaxAuthTries
          lineinfile:
            path: /etc/ssh/sshd_config
            regexp: '^MaxAuthTries'
            line: 'MaxAuthTries 3'

        - name: Disable empty passwords
          lineinfile:
            path: /etc/ssh/sshd_config
            regexp: '^PermitEmptyPasswords'
            line: 'PermitEmptyPasswords no'

        - name: Set ClientAliveInterval
          lineinfile:
            path: /etc/ssh/sshd_config
            regexp: '^ClientAliveInterval'
            line: 'ClientAliveInterval 300'

        - name: Set ClientAliveCountMax
          lineinfile:
            path: /etc/ssh/sshd_config
            regexp: '^ClientAliveCountMax'
            line: 'ClientAliveCountMax 2'
      notify: restart sshd

    # The ufw module manages one rule per invocation and has no `rules`
    # parameter, so the allow rules are applied in a loop. They are added
    # before the firewall is enabled so that switching to the default policy
    # cannot cut off the SSH session running this play.
    - name: Allow SSH from private address ranges
      ufw:
        rule: allow
        port: '22'
        proto: tcp
        from_ip: "{{ item }}"
      loop:
        - 10.0.0.0/8
        - 172.16.0.0/12
        - 192.168.0.0/16

    - name: Configure firewall
      ufw:
        state: enabled
        policy: "{{ firewall_policy }}"

    # Services that are not installed make the module fail; that is expected.
    - name: Disable unnecessary services
      service:
        name: "{{ item }}"
        state: stopped
        enabled: no
      loop:
        - cups
        - avahi-daemon
        - bluetooth
        - nfs-server
        - rpcbind
      ignore_errors: true

    - name: Remove unnecessary packages
      apt:
        name: "{{ item }}"
        state: absent
        purge: yes
      loop:
        - telnet
        - rsh-client
        - talk
        - ntalk
      when: ansible_os_family == "Debian"
      ignore_errors: true

    - name: Configure auditd
      block:
        - name: Install auditd
          apt:
            name: auditd
            state: present
          when: ansible_os_family == "Debian"

        # Notifies the (previously never-triggered) auditd handler so changed
        # rules are actually loaded.
        - name: Configure audit rules
          template:
            src: templates/audit.rules.j2
            dest: /etc/audit/rules.d/hardening.rules
          notify: restart auditd

        - name: Enable auditd service
          service:
            name: auditd
            state: started
            enabled: yes
      when: auditd_enabled

    - name: Configure AppArmor
      block:
        - name: Install apparmor
          apt:
            name: apparmor
            state: present
          when: ansible_os_family == "Debian"

        - name: Enable apparmor service
          service:
            name: apparmor
            state: started
            enabled: yes
      when: apparmor_enabled and ansible_os_family == "Debian"

    - name: Configure sysctl hardening
      sysctl:
        name: "{{ item.key }}"
        value: "{{ item.value }}"
        state: present
        reload: yes
      loop:
        - { key: 'net.ipv4.ip_forward', value: '0' }
        - { key: 'net.ipv4.conf.all.send_redirects', value: '0' }
        - { key: 'net.ipv4.conf.default.send_redirects', value: '0' }
        - { key: 'net.ipv4.tcp_syncookies', value: '1' }
        - { key: 'net.ipv4.icmp_echo_ignore_broadcasts', value: '1' }

    # passwd and group must stay world-readable for name lookups.
    - name: Set secure file permissions
      file:
        path: "{{ item }}"
        mode: '0644'
        owner: root
        group: root
      loop:
        - /etc/passwd
        - /etc/group

    # The shadow files hold password hashes and must never be world-readable.
    # Debian-family systems expect group `shadow` to keep read access for
    # setgid helpers; elsewhere the group is root.
    - name: Restrict shadow file permissions
      file:
        path: "{{ item }}"
        mode: '0640'
        owner: root
        group: "{{ 'shadow' if ansible_os_family == 'Debian' else 'root' }}"
      loop:
        - /etc/shadow
        - /etc/gshadow

    - name: Lock inactive user accounts
      command: usermod -L "{{ item }}"
      loop: "{{ inactive_users | default([]) }}"
      ignore_errors: true

    # NOTE(review): despite its name this task sets a hard open-files limit
    # (nofile) through pam_limits; it does not configure password policy.
    - name: Configure password policies
      pam_limits:
        domain: '*'
        limit_type: hard
        limit_item: nofile
        value: 1024

    - name: Generate hardening report
      template:
        src: templates/hardening_report.j2
        dest: "/var/log/hardening_report_{{ ansible_date_time.iso8601 }}.log"

  handlers:
    - name: restart sshd
      service:
        name: ssh
        state: restarted

    - name: restart auditd
      service:
        name: auditd
        state: restarted
      when: auditd_enabled

  post_tasks:
    # `shell` with a literal block (|) keeps each statement on its own line.
    # A folded scalar (>) joins the lines, so the `#` placeholder comment would
    # swallow every command after it. The score line uses double quotes so
    # $score and $total are expanded.
    - name: Run CIS compliance check
      shell: |
        score=0
        total=0
        echo 'CIS Compliance Check Results:' > /tmp/cis_check.log
        # Add CIS checks here
        echo "Overall Score: $score/$total" >> /tmp/cis_check.log
        cat /tmp/cis_check.log
      register: cis_check
      changed_when: false

    - name: Archive CIS results
      copy:
        content: "{{ cis_check.stdout }}"
        dest: "/var/log/cis_compliance_{{ ansible_date_time.iso8601 }}.log"
@@ -0,0 +1,139 @@
---
# Applies package updates to every inventory host inside a maintenance
# window, records the pre-patch package list, restarts and verifies services,
# probes web health, writes a report and optionally schedules a reboot.
#
# Optional extra vars:
#   enforce_patch_window      (default true) fail outside the window
#   patch_notification_email  send the mail report to this address
#   auto_reboot               (default false) reboot when one is required
- name: Apply Security Patches and Updates
  hosts: all
  become: true
  gather_facts: true

  vars:
    patch_window_start: "02:00"
    patch_window_end: "04:00"
    reboot_required: false
    security_only: true

  pre_tasks:
    # Compares whole hours only, using the managed host's clock
    # (ansible_date_time is a gathered fact).
    - name: Check patch window
      assert:
        that:
          - ansible_date_time.hour | int >= patch_window_start.split(':')[0] | int
          - ansible_date_time.hour | int < patch_window_end.split(':')[0] | int
        fail_msg: "Current time {{ ansible_date_time.hour }}:{{ ansible_date_time.minute }} is outside patch window {{ patch_window_start }}-{{ patch_window_end }}"
      when: enforce_patch_window | default(true) | bool

    - name: Create patch backup
      file:
        path: "/var/backups/pre-patch-{{ ansible_date_time.iso8601 }}"
        state: directory

    - name: Backup package list
      command: dpkg --get-selections
      register: package_backup
      changed_when: false

    - name: Save package backup
      copy:
        content: "{{ package_backup.stdout }}"
        dest: "/var/backups/pre-patch-{{ ansible_date_time.iso8601 }}/packages.list"

  tasks:
    - name: Update package cache
      apt:
        update_cache: yes
        cache_valid_time: 300
      when: ansible_os_family == "Debian"

    # `shell`, not `command`: the pipeline and the stderr redirection need a
    # shell. With `command` they would be passed to apt as literal arguments.
    - name: Check for available updates
      shell: apt list --upgradable 2>/dev/null | grep -v "Listing..." | wc -l
      register: updates_available
      changed_when: false
      when: ansible_os_family == "Debian"

    # NOTE(review): this task and "Apply all updates" run the same dist
    # upgrade, so security_only currently does not narrow what is installed.
    # Confirm the intended mechanism for security-only patching.
    - name: Apply security updates only
      apt:
        upgrade: dist
        update_cache: yes
      when: security_only and ansible_os_family == "Debian"

    - name: Apply all updates
      apt:
        upgrade: dist
        update_cache: yes
      when: not security_only and ansible_os_family == "Debian"

    - name: Check if reboot required
      stat:
        path: /var/run/reboot-required
      register: reboot_required_file
      when: ansible_os_family == "Debian"

    # The stat result has no `stat` key when the task above was skipped
    # (non-Debian hosts), so default the whole sub-dict before reading it.
    - name: Set reboot flag
      set_fact:
        reboot_required: "{{ (reboot_required_file.stat | default({})).exists | default(false) }}"

    - name: Restart services after patching
      service:
        name: "{{ item }}"
        state: restarted
      loop:
        - sshd
        - fail2ban
        - unattended-upgrades
      ignore_errors: true

    - name: Update monitoring agent
      include_role:
        name: monitoring_agent_update
      when: "'monitoring' in group_names"

    - name: Verify critical services
      service:
        name: "{{ item }}"
        state: started
      loop:
        - systemd-journald
        - systemd-logind
        - cron
      ignore_errors: true

    # Only web hosts expose /health; on other tiers the task is skipped and
    # health_result carries no `status`.
    - name: Run post-patch health checks
      uri:
        url: http://localhost/health
        method: GET
        status_code: 200
      register: health_result
      ignore_errors: true
      when: "'webservers' in group_names"

  post_tasks:
    # A skipped health check (non-web host) counts as success; otherwise the
    # report would fail on an undefined `status`.
    - name: Generate patch report
      template:
        src: templates/patch_report.j2
        dest: "/var/log/patch_report_{{ ansible_date_time.iso8601 }}.log"
      vars:
        patch_status: "{{ 'SUCCESS' if (health_result is skipped or (health_result.status | default(0)) == 200) else 'WARNING' }}"
        updates_applied: "{{ updates_available.stdout | default('0') }}"
        reboot_needed: "{{ reboot_required }}"

    # The update count is read from the registered result directly: task vars
    # of the report task above are not visible here.
    - name: Send patch notification
      mail:
        to: "{{ patch_notification_email | default('infra-team@company.com') }}"
        subject: "Patch Report - {{ inventory_hostname }}"
        body: |
          Patch completed for {{ inventory_hostname }}
          Updates applied: {{ updates_available.stdout | default('0') }}
          Reboot required: {{ reboot_required }}
          Health check: {{ 'SKIPPED' if health_result is skipped else ('PASSED' if (health_result.status | default(0)) == 200 else 'FAILED') }}
          See /var/log/patch_report_{{ ansible_date_time.iso8601 }}.log for details
      when: patch_notification_email is defined
      ignore_errors: true

    - name: Schedule reboot if required
      command: shutdown -r +5 "Rebooting for security patches"
      when: (reboot_required | bool) and (auto_reboot | default(false) | bool)
      async: 600
      poll: 0

  handlers:
    # NOTE(review): nothing in this play notifies this handler.
    - name: restart monitoring
      service:
        name: "{{ monitoring_service | default('prometheus-node-exporter') }}"
        state: restarted
      when: "'monitoring' in group_names"
@@ -0,0 +1,158 @@
---
# Provisions every inventory host: base packages, an admin user, timezone,
# SSH daemon settings, a default-deny firewall, fail2ban, unattended upgrades,
# application directories and the tier-specific role for the host's group.
- name: Provision Enterprise Infrastructure Nodes
  hosts: all
  become: true
  gather_facts: true

  vars:
    node_timezone: "UTC"
    admin_user: "infra-admin"
    # Simulator default only. Override with -e admin_password=... (ideally
    # from a vault) for anything that is not a throwaway environment.
    admin_password: "infra-admin-password"
    ssh_port: 22
    packages:
      - curl
      - wget
      - vim
      - htop
      - net-tools
      - iptables
      - fail2ban
      - unattended-upgrades

  tasks:
    - name: Update package cache
      apt:
        update_cache: yes
        cache_valid_time: 3600
      when: ansible_os_family == "Debian"

    - name: Install base packages
      apt:
        name: "{{ packages }}"
        state: present
      when: ansible_os_family == "Debian"

    # password_hash without a fixed salt yields a different hash on every
    # run; update_password: on_create stops each run from resetting the
    # password and reporting "changed".
    - name: Create admin user
      user:
        name: "{{ admin_user }}"
        groups: sudo
        append: yes
        create_home: yes
        shell: /bin/bash
        password: "{{ admin_password | password_hash('sha512') }}"
        update_password: on_create

    - name: Configure timezone
      timezone:
        name: "{{ node_timezone }}"

    # NOTE(review): the inventory connects as root. After this block root
    # logins and password logins are both refused, and no key is installed
    # for the admin user here. Confirm how later runs are meant to connect.
    - name: Configure SSH
      block:
        - name: Disable root SSH login
          lineinfile:
            path: /etc/ssh/sshd_config
            regexp: '^PermitRootLogin'
            line: 'PermitRootLogin no'

        - name: Set SSH port
          lineinfile:
            path: /etc/ssh/sshd_config
            regexp: '^Port'
            line: "Port {{ ssh_port }}"

        - name: Disable password authentication
          lineinfile:
            path: /etc/ssh/sshd_config
            regexp: '^PasswordAuthentication'
            line: 'PasswordAuthentication no'

        - name: Restart SSH service
          service:
            name: sshd
            state: restarted

    # The ufw module manages one rule per invocation and has no `rules`
    # parameter. The allow rules go in first so enabling the default-deny
    # policy cannot cut off the SSH session running this play.
    - name: Allow SSH, HTTP and HTTPS through the firewall
      ufw:
        rule: allow
        port: "{{ item }}"
        proto: tcp
      loop:
        - "{{ ssh_port }}"
        - '80'
        - '443'

    - name: Configure firewall
      ufw:
        state: enabled
        policy: deny

    - name: Configure fail2ban
      template:
        src: templates/jail.local.j2
        dest: /etc/fail2ban/jail.local
      notify: restart fail2ban

    - name: Enable unattended upgrades
      lineinfile:
        path: /etc/apt/apt.conf.d/20auto-upgrades
        regexp: '^APT::Periodic::Unattended-Upgrade'
        line: 'APT::Periodic::Unattended-Upgrade "1";'
      when: ansible_os_family == "Debian"

    - name: Create application directories
      file:
        path: "{{ item }}"
        state: directory
        owner: "{{ admin_user }}"
        group: "{{ admin_user }}"
        mode: '0755'
      loop:
        - /opt/application
        - /var/log/application
        - /etc/application

    # Exactly one tier role applies per host, selected by inventory group.
    - name: Deploy monitoring agent
      include_role:
        name: monitoring_agent
      when: "'monitoring' in group_names"

    - name: Deploy web server
      include_role:
        name: nginx
      when: "'webservers' in group_names"

    - name: Deploy database server
      include_role:
        name: postgresql
      when: "'databases' in group_names"

    - name: Deploy load balancer
      include_role:
        name: haproxy
      when: "'loadbalancers' in group_names"

    # Delegated: the report is rendered on the control node, once per host.
    # NOTE(review): the file name does not include the hostname, so hosts
    # sharing a timestamp overwrite each other's report.
    - name: Generate provisioning report
      template:
        src: templates/provisioning_report.j2
        dest: "/var/log/provisioning_report_{{ ansible_date_time.iso8601 }}.log"
      delegate_to: localhost

  handlers:
    - name: restart fail2ban
      service:
        name: fail2ban
        state: restarted

  post_tasks:
    # services_to_verify is optional; with the default empty list this is a
    # no-op.
    - name: Verify services
      service:
        name: "{{ item }}"
        state: started
        enabled: yes
      loop: "{{ services_to_verify | default([]) }}"
      ignore_errors: true

    - name: Run health checks
      uri:
        url: http://localhost/health
        method: GET
      register: health_check
      ignore_errors: true
      when: "'webservers' in group_names"
@@ -0,0 +1,21 @@
# Scenario: Simulate Failure and Patch
## Description
Validate that a service-level failure can be detected, recovered, and followed by a controlled patch workflow. This mirrors a maintenance window where a degraded node is stabilized before package updates are applied.
## Commands
```bash
cd enterprise-infra-simulator
./scripts/simulate_failure.sh service 30 web
ansible-playbook -i inventory/hosts.ini playbooks/patch.yml
ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml --check
```
## Expected Result
- The simulation records a temporary service failure.
- The service is restored after cleanup.
- The patch playbook completes without unreachable hosts.
- Hardening check mode reports no destructive changes.
@@ -0,0 +1,116 @@
---
- name: Enterprise Scaling Event Scenario
hosts: all
become: yes
gather_facts: yes
vars:
scaling_threshold: 80
cooldown_period: 300
max_scale_up: 5
min_instances: 2
pre_tasks:
- name: Log scenario start
lineinfile:
path: "/var/log/scaling_scenario.log"
line: "{{ ansible_date_time.iso8601 }} - Starting scaling event scenario"
create: yes
- name: Check current load
command: uptime
register: system_load
changed_when: false
- name: Parse load average
set_fact:
load_1min: "{{ system_load.stdout.split(',')[0].split()[-1] | float }}"
load_5min: "{{ system_load.stdout.split(',')[1] | float }}"
load_15min: "{{ system_load.stdout.split(',')[2] | float }}"
tasks:
- name: Evaluate scaling conditions
set_fact:
scale_up_needed: "{{ load_5min > scaling_threshold }}"
scale_down_needed: "{{ load_5min < (scaling_threshold * 0.3) }}"
- name: Scale up web servers
include_role:
name: scale_up
tasks_from: web_servers
vars:
scale_count: "{{ [max_scale_up, (load_5min / 10) | int] | min }}"
when: scale_up_needed and "'webservers' in group_names"
- name: Scale up database servers
include_role:
name: scale_up
tasks_from: database_servers
vars:
scale_count: "{{ [2, (load_5min / 20) | int] | min }}"
when: scale_up_needed and "'databases' in group_names"
- name: Update load balancer configuration
include_role:
name: load_balancer
tasks_from: update_backends
when: scale_up_needed
- name: Scale down web servers
include_role:
name: scale_down
tasks_from: web_servers
vars:
scale_count: "{{ [(inventory_hostname | regex_findall('[0-9]+') | first | int) - min_instances, 1] | max }}"
when: scale_down_needed and "'webservers' in group_names" and (inventory_hostname | regex_findall('[0-9]+') | first | int) > min_instances
- name: Wait for cooldown period
pause:
seconds: "{{ cooldown_period }}"
when: scale_up_needed or scale_down_needed
- name: Verify scaling results
uri:
url: http://localhost/health
method: GET
status_code: 200
register: health_check
until: health_check.status == 200
retries: 5
delay: 10
when: "'webservers' in group_names"
- name: Update monitoring thresholds
include_role:
name: monitoring
tasks_from: update_alerts
vars:
new_threshold: "{{ scaling_threshold + 10 }}"
- name: Send scaling notification
mail:
to: "{{ scaling_notification_email | default('infra-team@company.com') }}"
subject: "Infrastructure Scaling Event - {{ inventory_hostname }}"
body: |
Scaling event completed on {{ inventory_hostname }}
Load averages: {{ load_1min }}, {{ load_5min }}, {{ load_15min }}
Action taken: {{ 'Scale Up' if scale_up_needed else 'Scale Down' if scale_down_needed else 'No Action' }}
Health check: {{ 'PASSED' if health_check.status == 200 else 'FAILED' }}
See /var/log/scaling_scenario.log for details
when: scaling_notification_email is defined
ignore_errors: yes
post_tasks:
# Renders templates/scaling_scenario_report.j2 to a timestamped file under
# /var/log, passing the outcome and the three load averages to the template.
- name: Generate scaling scenario report
  template:
    src: templates/scaling_scenario_report.j2
    dest: "/var/log/scaling_scenario_report_{{ ansible_date_time.iso8601 }}.log"
  vars:
    # health_check.status does not exist on hosts where the health check was
    # skipped (non-webservers); default(0) keeps the task from failing there
    # and reports WARNING, since no passing check was observed.
    scenario_outcome: "{{ 'SUCCESS' if (health_check.status | default(0)) == 200 else 'WARNING' }}"
    load_metrics: "{{ load_1min }}, {{ load_5min }}, {{ load_15min }}"
# Ensures a timestamped completion line is present in the scenario log.
# NOTE(review): `create` is not set, and lineinfile does not create a missing
# file by default - confirm the log exists by the time this runs.
- name: Log scenario completion
  lineinfile:
    line: "{{ ansible_date_time.iso8601 }} - Scaling event scenario completed"
    path: "/var/log/scaling_scenario.log"
@@ -0,0 +1,343 @@
#!/bin/bash
# Enterprise Infrastructure Failure Simulation Script
# Simulates various types of infrastructure failures for testing
# Abort on command failure (-e), on use of an unset variable (-u), and when
# any stage of a pipeline fails (pipefail).
set -euo pipefail
# Configuration
# NOTE(review): DOCKER_COMPOSE_FILE and INVENTORY_FILE are not referenced by
# any function in this script - confirm whether they can be removed.
DOCKER_COMPOSE_FILE="docker-compose.yml"
INVENTORY_FILE="inventory/hosts.ini"
# Every log() call appends to this file (relative to the working directory).
LOG_FILE="logs/failure_simulation.log"
# Default values
# Positional arguments: $1 failure type, $2 duration in seconds, $3 target
# nodes. INTENSITY is taken from the environment, not from an argument.
FAILURE_TYPE="${1:-network}"
DURATION="${2:-60}"
TARGET_NODES="${3:-all}"
INTENSITY="${INTENSITY:-medium}"
# Logging function
# Prints "<YYYY-MM-DD HH:MM:SS> - <message>" to stdout and appends the same
# line to $LOG_FILE. All arguments are joined with spaces to form the message.
log() {
    printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG_FILE"
}
# Error handling
# Logs the given message with an "ERROR: " prefix, reverts any injected
# failures, then terminates the script with exit status 1.
#   $1 - message to log
error_exit() {
    local reason="$1"
    log "ERROR: ${reason}"
    # Undo anything already injected before leaving
    cleanup_failure
    exit 1
}
# Validate inputs
# Checks FAILURE_TYPE, DURATION and INTENSITY; the first invalid value ends the
# script through error_exit.
validate_inputs() {
    if [[ ! "$FAILURE_TYPE" =~ ^(network|disk|service|node|cpu|memory)$ ]]; then
        error_exit "Invalid failure type: $FAILURE_TYPE. Must be network, disk, service, node, cpu, or memory"
    fi
    # Digits only first, then at least 1
    if ! [[ "$DURATION" =~ ^[0-9]+$ ]]; then
        error_exit "Invalid duration: $DURATION. Must be a positive integer (seconds)"
    fi
    if [ "$DURATION" -lt 1 ]; then
        error_exit "Invalid duration: $DURATION. Must be a positive integer (seconds)"
    fi
    if [[ ! "$INTENSITY" =~ ^(low|medium|high|critical)$ ]]; then
        error_exit "Invalid intensity: $INTENSITY. Must be low, medium, high, or critical"
    fi
}
# Get target containers
# Prints the compose service names to act on. "all" lists every service that
# docker-compose reports (an empty result is not an error); any other value of
# TARGET_NODES is printed back unchanged and used as the service name.
get_target_containers() {
    if [ "$TARGET_NODES" = "all" ]; then
        docker-compose ps --services | grep -v "^NAME$" || true
    else
        echo "$TARGET_NODES"
    fi
}
# Network failure simulation
# Disconnects every targeted container from the network named in its
# HostConfig.NetworkMode. A "<cid>:<network>" line is appended to
# /tmp/network_failure_state for each container so cleanup_failure can
# reconnect it.
simulate_network_failure() {
    local containers container container_ids cid network
    containers=$(get_target_containers)
    log "Simulating network failure on containers: $containers"
    for container in $containers; do
        container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true)
        for cid in $container_ids; do
            [ -n "$cid" ] || continue
            # Look the network up once and reuse it for both the state record
            # and the disconnect.
            network=$(docker inspect "$cid" --format '{{.HostConfig.NetworkMode}}' 2>/dev/null || true)
            if [ -z "$network" ]; then
                log "WARNING: could not determine network for container $cid; skipping"
                continue
            fi
            log "Disconnecting network for container $cid"
            # Store original network for restoration *before* disconnecting, so
            # an interruption right after the disconnect can still be undone.
            echo "$cid:$network" >> /tmp/network_failure_state
            docker network disconnect "$network" "$cid" 2>/dev/null || true
        done
    done
}
# Disk failure simulation
# Writes a zero-filled file /tmp/disk_fill inside every targeted container.
# Size by INTENSITY: low 50 MiB, medium 100 MiB (also the fallback), high
# 500 MiB, critical 1024 MiB. A "<cid>:disk_fill" line is appended to
# /tmp/disk_failure_state for each container so cleanup_failure can delete
# the file.
simulate_disk_failure() {
    local containers container container_ids cid
    containers=$(get_target_containers)
    log "Simulating disk space exhaustion on containers: $containers"

    # Size in MiB, used directly as the dd block count with bs=1M. The earlier
    # code multiplied the number by 1024 (so "100M" wrote ~100 GiB) and could
    # not parse the "1G" value at all.
    local fill_mb
    case "$INTENSITY" in
        low) fill_mb=50 ;;
        high) fill_mb=500 ;;
        critical) fill_mb=1024 ;;
        *) fill_mb=100 ;;
    esac

    for container in $containers; do
        container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true)
        for cid in $container_ids; do
            [ -n "$cid" ] || continue
            log "Filling disk space in container $cid"
            # Record first: if the script is interrupted while dd is running,
            # cleanup still knows to remove the partial file.
            echo "$cid:disk_fill" >> /tmp/disk_failure_state
            docker exec "$cid" bash -c "dd if=/dev/zero of=/tmp/disk_fill bs=1M count=${fill_mb}" 2>/dev/null || true
        done
    done
}
# Service failure simulation
# Asks systemctl to stop nginx, postgresql and haproxy inside every targeted
# container; a service that cannot be stopped is ignored. A "<cid>:services"
# line is appended to /tmp/service_failure_state for each container.
simulate_service_failure() {
    local containers=$(get_target_containers)
    log "Simulating service failures on containers: $containers"
    local service cid unit
    for service in $containers; do
        local ids=$(docker-compose ps -q "$service" 2>/dev/null || true)
        for cid in $ids; do
            if [ -z "$cid" ]; then
                continue
            fi
            log "Stopping services in container $cid"
            # Stop common services
            for unit in nginx postgresql haproxy; do
                docker exec "$cid" systemctl stop "$unit" 2>/dev/null || true
            done
            echo "$cid:services" >> /tmp/service_failure_state
        done
    done
}
# Node failure simulation
# Pauses (docker pause) every targeted container and appends a "<cid>:paused"
# line to /tmp/node_failure_state. A failing `docker pause` is not ignored, so
# under `set -e` it ends the script.
simulate_node_failure() {
    local containers=$(get_target_containers)
    log "Simulating complete node failures on containers: $containers"
    local service cid
    for service in $containers; do
        local ids=$(docker-compose ps -q "$service" 2>/dev/null || true)
        for cid in $ids; do
            [ -n "$cid" ] || continue
            log "Stopping container $cid (node failure)"
            docker pause "$cid"
            echo "$cid:paused" >> /tmp/node_failure_state
        done
    done
}
# CPU stress simulation
# Starts a detached busy loop inside every targeted container and appends
# "<cid>:cpu_stress:<pid>" to /tmp/cpu_failure_state; cleanup_failure uses the
# recorded PID to kill the loop.
simulate_cpu_failure() {
    local containers container container_ids cid pid
    containers=$(get_target_containers)
    log "Simulating CPU stress on containers: $containers"
    for container in $containers; do
        container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true)
        for cid in $container_ids; do
            [ -n "$cid" ] || continue
            log "Starting CPU stress in container $cid"
            # Start CPU stress process
            docker exec -d "$cid" bash -c "while true; do :; done" 2>/dev/null || true
            # Find the PID of the loop just started. The lookup is allowed to
            # fail (ps missing, process not visible yet) without aborting.
            pid=$(docker exec "$cid" ps aux | grep "while true" | grep -v grep | awk '{print $2}' | head -1) || true
            if [ -z "$pid" ]; then
                # An empty PID used to be written silently; say so, because
                # cleanup cannot kill a process it has no PID for.
                log "WARNING: could not determine CPU stress PID in container $cid; it will not be killed during cleanup"
            fi
            echo "$cid:cpu_stress:$pid" >> /tmp/cpu_failure_state
        done
    done
}
# Memory stress simulation
# Starts a detached `tail /dev/zero` inside every targeted container and
# appends "<cid>:memory_stress:<pid>" to /tmp/memory_failure_state;
# cleanup_failure uses the recorded PID to kill the process.
simulate_memory_failure() {
    local containers container container_ids cid pid
    containers=$(get_target_containers)
    log "Simulating memory exhaustion on containers: $containers"
    for container in $containers; do
        container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true)
        for cid in $container_ids; do
            [ -n "$cid" ] || continue
            log "Starting memory stress in container $cid"
            # Start memory stress process
            docker exec -d "$cid" bash -c "tail /dev/zero" 2>/dev/null || true
            # Find the PID of the process just started. The lookup is allowed
            # to fail (ps missing, process already gone) without aborting.
            pid=$(docker exec "$cid" ps aux | grep "tail /dev/zero" | grep -v grep | awk '{print $2}' | head -1) || true
            if [ -z "$pid" ]; then
                # An empty PID used to be written silently; say so, because
                # cleanup cannot kill a process it has no PID for.
                log "WARNING: could not determine memory stress PID in container $cid; it will not be killed during cleanup"
            fi
            echo "$cid:memory_stress:$pid" >> /tmp/memory_failure_state
        done
    done
}
# Inject failure
# Calls the simulate_<type>_failure function matching FAILURE_TYPE. Any value
# outside the six known types does nothing.
inject_failure() {
    case "$FAILURE_TYPE" in
        network|disk|service|node|cpu|memory)
            "simulate_${FAILURE_TYPE}_failure"
            ;;
    esac
}
# Cleanup failure
# Reverts every injected failure that left a state file under /tmp, in this
# order: network, disk, services, paused containers, CPU stress, memory
# stress. Each state file is deleted once processed, and every docker command
# is best-effort (errors are ignored).
cleanup_failure() {
    log "Cleaning up failure simulation"
    local state cid network pid unit

    # Restore network connections
    state=/tmp/network_failure_state
    if [ -f "$state" ]; then
        while IFS=: read -r cid network; do
            docker network connect "$network" "$cid" 2>/dev/null || true
        done < "$state"
        rm -f "$state"
    fi

    # Clean up disk fill files
    state=/tmp/disk_failure_state
    if [ -f "$state" ]; then
        while IFS=: read -r cid _; do
            docker exec "$cid" rm -f /tmp/disk_fill 2>/dev/null || true
        done < "$state"
        rm -f "$state"
    fi

    # Restart services
    state=/tmp/service_failure_state
    if [ -f "$state" ]; then
        while IFS=: read -r cid _; do
            for unit in nginx postgresql haproxy; do
                docker exec "$cid" systemctl start "$unit" 2>/dev/null || true
            done
        done < "$state"
        rm -f "$state"
    fi

    # Unpause containers
    state=/tmp/node_failure_state
    if [ -f "$state" ]; then
        while IFS=: read -r cid _; do
            docker unpause "$cid" 2>/dev/null || true
        done < "$state"
        rm -f "$state"
    fi

    # Kill stress processes (CPU first, then memory); the PID is the third
    # colon-separated field of each state line
    for state in /tmp/cpu_failure_state /tmp/memory_failure_state; do
        if [ -f "$state" ]; then
            while IFS=: read -r cid _ pid; do
                docker exec "$cid" kill -9 "$pid" 2>/dev/null || true
            done < "$state"
            rm -f "$state"
        fi
    done
}
# Monitor failure
# Keeps the failure active for DURATION seconds. Roughly every 10 seconds it
# logs a warning if no container is reported as Up or Paused, and logs the
# per-container CPU/memory figures from `docker stats`.
monitor_failure() {
    local poll_interval=10
    local end_time=$(( $(date +%s) + DURATION ))
    local compose_status remaining
    log "Monitoring failure for $DURATION seconds"
    while [ "$(date +%s)" -lt "$end_time" ]; do
        # Check container status. The output is captured before grepping:
        # with pipefail, `docker-compose ps | grep -q` can report failure when
        # grep exits early on a match, which raised a false warning.
        compose_status=$(docker-compose ps) || true
        if ! grep -q "Up\|Paused" <<< "$compose_status"; then
            log "WARNING: All containers are down"
        fi
        # Log system metrics
        log "System status: $(docker stats --no-stream --format 'table {{.Container}}\t{{.CPUPerc}}\t{{.MemPerc}}' | tail -n +2)"
        # Never sleep past the deadline; a fixed 10 s sleep overshot any
        # DURATION that is not a multiple of 10.
        remaining=$(( end_time - $(date +%s) ))
        if [ "$remaining" -le 0 ]; then
            break
        fi
        if [ "$remaining" -gt "$poll_interval" ]; then
            remaining=$poll_interval
        fi
        sleep "$remaining"
    done
}
# Generate failure report
# Writes reports/failure_simulation_<timestamp>.txt with the run parameters
# and the container status before and after the simulation. The "before"
# status comes from PRE_FAILURE_STATUS, which main() captures ahead of the
# injection; the "after" status is queried when the report is written.
generate_report() {
    local report_file="reports/failure_simulation_$(date +%Y%m%d_%H%M%S).txt"
    # Previously both sections ran `docker-compose ps` here, after cleanup, so
    # the "pre-failure" section was just a copy of the post-failure one.
    local pre_status="${PRE_FAILURE_STATUS:-(not captured)}"
    cat > "$report_file" << EOF
Failure Simulation Report
========================
Timestamp: $(date)
Failure Type: $FAILURE_TYPE
Duration: $DURATION seconds
Target Nodes: $TARGET_NODES
Intensity: $INTENSITY
Pre-failure Status:
$pre_status
Post-failure Status:
$(docker-compose ps)
Log File: $LOG_FILE
EOF
    log "Failure simulation report generated: $report_file"
}
# Main execution
# Validates the inputs, snapshots the container status, injects the failure,
# monitors it for DURATION seconds, reverts it and writes the report.
main() {
    log "Starting failure simulation: $FAILURE_TYPE for $DURATION seconds"
    validate_inputs
    # Snapshot for the report, taken before anything is broken. A failing
    # `docker-compose ps` must not abort the run, hence `|| true`.
    PRE_FAILURE_STATUS=$(docker-compose ps 2>&1 || true)
    # Inject failure
    inject_failure
    # Monitor during failure
    monitor_failure
    # Cleanup
    cleanup_failure
    # Generate report
    generate_report
    log "Failure simulation completed successfully"
}
# Trap for cleanup on script exit
# Runs on every exit path, including after main's own cleanup_failure call and
# after error_exit, so cleanup may execute more than once. Each pass removes
# the state files it processed, leaving a later pass nothing to undo.
trap cleanup_failure EXIT
# Initialize logging
# Creates the directories that LOG_FILE and the report file are written into.
mkdir -p logs reports
# Run main function
# main does not read its arguments; the positional parameters were already
# copied into FAILURE_TYPE, DURATION and TARGET_NODES at the top of the script.
main "$@"
@@ -0,0 +1,208 @@
#!/bin/bash
# Enterprise Infrastructure Scaling Simulation Script
# Simulates scaling operations for infrastructure nodes
# Abort on command failure (-e), on use of an unset variable (-u), and when
# any stage of a pipeline fails (pipefail).
set -euo pipefail
# Configuration
# DOCKER_COMPOSE_FILE is edited in place by scale_up/scale_down (replicas
# count); INVENTORY_FILE is passed to every ansible/ansible-playbook call.
DOCKER_COMPOSE_FILE="docker-compose.yml"
INVENTORY_FILE="inventory/hosts.ini"
# Every log() call appends to this file (relative to the working directory).
LOG_FILE="logs/scaling_simulation.log"
# Default values
# Positional arguments: $1 direction (up/down), $2 node count, $3 node type.
# SIMULATION_MODE comes from the environment; the Ansible steps run only when
# it is exactly "false".
DIRECTION="${1:-up}"
COUNT="${2:-1}"
NODE_TYPE="${3:-web}"
SIMULATION_MODE="${SIMULATION_MODE:-false}"
# Logging function
# Prints "<YYYY-MM-DD HH:MM:SS> - <message>" to stdout and appends the same
# line to $LOG_FILE. All arguments are joined with spaces to form the message.
log() {
    printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG_FILE"
}
# Error handling
# Logs the given message with an "ERROR: " prefix and terminates the script
# with exit status 1.
#   $1 - message to log
error_exit() {
    local reason="$1"
    log "ERROR: ${reason}"
    exit 1
}
# Validate inputs
# Checks DIRECTION, COUNT and NODE_TYPE; the first invalid value ends the
# script through error_exit.
validate_inputs() {
    case "$DIRECTION" in
        up|down) ;;
        *) error_exit "Invalid direction: $DIRECTION. Must be 'up' or 'down'" ;;
    esac
    # Digits only first, then at least 1
    if ! [[ "$COUNT" =~ ^[0-9]+$ ]]; then
        error_exit "Invalid count: $COUNT. Must be a positive integer"
    fi
    if [ "$COUNT" -lt 1 ]; then
        error_exit "Invalid count: $COUNT. Must be a positive integer"
    fi
    if [[ ! "$NODE_TYPE" =~ ^(web|db|lb|monitor)$ ]]; then
        error_exit "Invalid node type: $NODE_TYPE. Must be web, db, lb, or monitor"
    fi
}
# Get current node count
# Prints how many lines of `docker-compose ps <type>` contain "Up".
#   $1 - node type: web, db, lb or monitor (anything else prints nothing)
# Always returns 0 for a known type, including when the count is 0.
get_current_count() {
    local type="$1"
    case "$type" in
        web|db|lb|monitor)
            # grep -c prints the count but exits 1 when it is zero; with
            # `set -e` and pipefail that made a legitimate "0" look like a
            # failure to callers, so the status is neutralised here.
            docker-compose ps "$type" | grep -c "Up" || true
            ;;
    esac
}
# Scale up infrastructure
# Adds <count> nodes of the given type: rewrites the replica count in the
# compose file, scales the service, waits 30 seconds, records the change via
# update_inventory and (unless SIMULATION_MODE is set) runs the provisioning
# playbook.
#   $1 - node type
#   $2 - number of nodes to add
scale_up() {
    local type="$1"
    local count="$2"
    local current target
    # An empty or failed lookup is treated as zero running nodes.
    current=$(get_current_count "$type") || true
    current=${current:-0}
    target=$(( current + count ))
    log "Scaling up $count $type nodes"
    # Update docker-compose replica count
    # NOTE(review): this pattern rewrites every "replicas:" line in the file,
    # not only the one belonging to $type - confirm that is acceptable.
    sed -i.bak "s/replicas: [0-9]\+/replicas: ${target}/" "$DOCKER_COMPOSE_FILE"
    # Deploy new containers. --scale takes the desired TOTAL for the service,
    # not an increment; passing $count alone could shrink a service that
    # already had more than $count containers.
    docker-compose up -d --scale "${type}=${target}"
    # Wait for containers to be ready
    log "Waiting for containers to be ready..."
    sleep 30
    # Update inventory
    update_inventory "$type" "$count" "add"
    # Run provisioning playbook on new nodes
    if [ "$SIMULATION_MODE" = false ]; then
        ansible-playbook -i "$INVENTORY_FILE" playbooks/provision.yml --limit "${type}*"
    fi
    log "Successfully scaled up $count $type nodes"
}
# Scale down infrastructure
# Removes <count> running nodes of the given type: optionally decommissions
# each one with Ansible, stops and removes its container, lowers the replica
# count in the compose file and records the change via update_inventory.
# Aborts through error_exit when fewer than <count> nodes are running.
#   $1 - node type
#   $2 - number of nodes to remove
scale_down() {
    local type="$1"
    local count="$2"
    local running=$(get_current_count "$type")
    if [ "$running" -lt "$count" ]; then
        error_exit "Cannot scale down $count nodes. Only $running $type nodes currently running"
    fi
    log "Scaling down $count $type nodes"
    # Take the first <count> "Up" rows of the listing; column 1 is used as the
    # container name
    local doomed_nodes=$(docker-compose ps "$type" | grep "Up" | head -n "$count" | awk '{print $1}')
    local node
    # Decommission nodes
    for node in $doomed_nodes; do
        if [ "$SIMULATION_MODE" = false ]; then
            ansible-playbook -i "$INVENTORY_FILE" playbooks/decommission.yml --limit "$node"
        fi
        docker stop "$node"
        docker rm "$node"
    done
    # Update docker-compose replica count
    local remaining=$(( running - count ))
    sed -i.bak "s/replicas: [0-9]\+/replicas: ${remaining}/" "$DOCKER_COMPOSE_FILE"
    # Update inventory
    update_inventory "$type" "$count" "remove"
    log "Successfully scaled down $count $type nodes"
}
# Update Ansible inventory
# Simulation stand-in: nothing is written to the inventory file, the requested
# change is only logged.
#   $1 - node type
#   $2 - number of nodes
#   $3 - action: "add" or "remove" (other values log only the first line)
update_inventory() {
    local type="$1" count="$2" action="$3"
    log "Updating inventory for $action $count $type nodes"
    if [ "$action" = "add" ]; then
        log "Added $count $type nodes to inventory"
    elif [ "$action" = "remove" ]; then
        log "Removed $count $type nodes from inventory"
    fi
}
# Health check after scaling
# Aborts through error_exit when `docker-compose ps` shows no container as
# "Up". Unless SIMULATION_MODE is set, it also pings all inventory hosts with
# Ansible; a failed ping is only logged as a warning.
health_check() {
    log "Running health checks after scaling"
    # Check container status. The output is captured before grepping: with
    # pipefail, `docker-compose ps | grep -q` can report failure when grep
    # exits early on a match, which would abort a healthy run.
    local compose_status
    compose_status=$(docker-compose ps) || true
    # Note: this passes as soon as at least one container is Up.
    if ! grep -q "Up" <<< "$compose_status"; then
        error_exit "Some containers failed to start"
    fi
    # Ansible ping check
    if [ "$SIMULATION_MODE" = false ]; then
        if ! ansible -i "$INVENTORY_FILE" all -m ping >/dev/null 2>&1; then
            log "WARNING: Some nodes failed Ansible ping check"
        fi
    fi
    log "Health checks completed"
}
# Generate scaling report
# Writes reports/scaling_report_<timestamp>.txt with the run parameters, the
# current `docker-compose ps` output and the Ansible host list (or a fallback
# line when the inventory query fails), then logs the report path.
generate_report() {
    local report_file="reports/scaling_report_$(date +%Y%m%d_%H%M%S).txt"
    # One argument per report line
    printf '%s\n' \
        "Scaling Simulation Report" \
        "========================" \
        "Timestamp: $(date)" \
        "Direction: $DIRECTION" \
        "Node Type: $NODE_TYPE" \
        "Count: $COUNT" \
        "Simulation Mode: $SIMULATION_MODE" \
        "Current Status:" \
        "$(docker-compose ps)" \
        "Inventory Status:" \
        "$(ansible -i "$INVENTORY_FILE" --list-hosts all 2>/dev/null || echo "Ansible inventory check failed")" \
        "Log File: $LOG_FILE" > "$report_file"
    log "Scaling report generated: $report_file"
}
# Main execution
# Validates the inputs, scales the requested node type up or down, runs the
# health checks and writes the report.
main() {
    log "Starting scaling simulation: $DIRECTION $COUNT $NODE_TYPE nodes"
    validate_inputs
    if [[ "$DIRECTION" == "up" ]]; then
        scale_up "$NODE_TYPE" "$COUNT"
    elif [[ "$DIRECTION" == "down" ]]; then
        scale_down "$NODE_TYPE" "$COUNT"
    fi
    health_check
    generate_report
    log "Scaling simulation completed successfully"
}
# Initialize logging
# Creates the directories that LOG_FILE and the report file are written into.
mkdir -p logs reports
# Run main function
# main does not read its arguments; the positional parameters were already
# copied into DIRECTION, COUNT and NODE_TYPE at the top of the script.
main "$@"