Initial CV-aligned infrastructure portfolio

Rework portfolio around Linux operations, Zabbix monitoring, migration validation, and ELK/Grafana log observability. Add AAP-style LVM resize workflow, Zabbix server/proxy/agent automation assets, Linux/AIX monitoring templates, and updated validation CI.
2026-05-04 17:37:24 +00:00
commit 35e6b139fc
114 changed files with 6422 additions and 0 deletions
@@ -0,0 +1,14 @@
+---
+# ansible-lint settings for this project.
+
+# Rule IDs listed here are not reported by the linter.
+skip_list:
+  - "role-name"
+  - "name[casing]"
+  - "line-too-long"
+
+# Paths the linter does not scan.
+exclude_paths:
+  - ".git"
+  - ".gitea"
+  - "molecule/"
+  - "molecule/default/tests/"
+  - "scenarios/"
@@ -0,0 +1,95 @@
+# Linux Operations Automation Makefile
+#
+# Every target below is a command rather than a file, so all are phony.
+# The help target prints the text that follows the double-hash marker on each
+# rule line, so keep that marker on any new rule.
+
+# Command prefixes shared by several recipes.
+inventory := inventory/hosts.ini
+playbook_cmd := ansible-playbook -i $(inventory)
+scaling_sim := SIMULATION_MODE=true bash ./scripts/simulate_scaling.sh
+failure_sim := SIMULATION_MODE=true bash ./scripts/simulate_failure.sh
+
+.PHONY: help test run demo patch harden decommission lvm-check
+.PHONY: up down status logs validate clean lint
+.PHONY: scale-up-web scale-up-db scale-down-web scale-down-db
+.PHONY: fail-network fail-disk fail-service fail-node
+.PHONY: scenario-scaling help-scaling help-failure
+
+# First rule in the file, therefore the default goal.
+help: ## Show this help message
+	@echo "Linux Operations Automation"
+	@echo ""
+	@echo "Available commands:"
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "  %-18s %s\n", $$1, $$2}'
+
+# --- Validation -------------------------------------------------------------
+
+test: ## Run offline validation checks
+	$(playbook_cmd) --syntax-check playbooks/*.yml
+	ansible-lint
+
+lint: ## Lint Ansible content
+	ansible-lint
+
+validate: ## Run all offline validation checks
+	$(MAKE) test
+	docker compose config --quiet
+
+lvm-check: ## Validate the AAP-style LVM resize workflow
+	$(playbook_cmd) --syntax-check playbooks/lvm_resize.yml
+
+scenario-scaling: ## Run scaling event syntax validation
+	$(playbook_cmd) --syntax-check scenarios/scaling_event.yml
+
+# --- Playbook runs against the configured inventory -------------------------
+
+run: ## Run provisioning against the configured inventory
+	$(playbook_cmd) playbooks/provision.yml
+
+patch: ## Apply patching workflow against the configured inventory
+	$(playbook_cmd) playbooks/patch.yml
+
+harden: ## Apply hardening workflow against the configured inventory
+	$(playbook_cmd) playbooks/hardening.yml
+
+decommission: ## Run decommissioning workflow against the configured inventory
+	$(playbook_cmd) playbooks/decommission.yml
+
+# --- Optional local container scaffold --------------------------------------
+
+up: ## Start the optional local container scaffold
+	docker compose up -d
+
+down: ## Stop the optional local container scaffold
+	docker compose down
+
+# The inventory listing is best-effort: a failure prints a message instead of
+# failing the target.
+status: ## Show local scaffold status and inventory hosts
+	docker compose ps
+	ansible -i $(inventory) --list-hosts all || echo "Inventory check failed"
+
+logs: ## Show local scaffold logs
+	docker compose logs -f --tail=100
+
+clean: ## Clean up generated local logs and reports
+	rm -f logs/*.log reports/*.txt
+
+# --- Simulations (always run with SIMULATION_MODE=true) ---------------------
+
+demo: ## Run a safe local demonstration without requiring live SSH hosts
+	$(failure_sim) service 5 web
+
+# COUNT falls back to 1 when it is not given on the command line.
+scale-up-web: ## Scale up web servers in simulation mode (usage: make scale-up-web COUNT=2)
+	$(scaling_sim) up $(or $(COUNT),1) web
+
+scale-up-db: ## Scale up database servers in simulation mode (usage: make scale-up-db COUNT=1)
+	$(scaling_sim) up $(or $(COUNT),1) db
+
+scale-down-web: ## Scale down web servers in simulation mode (usage: make scale-down-web COUNT=1)
+	$(scaling_sim) down $(or $(COUNT),1) web
+
+scale-down-db: ## Scale down database servers in simulation mode (usage: make scale-down-db COUNT=1)
+	$(scaling_sim) down $(or $(COUNT),1) db
+
+# DURATION falls back to a per-target default when it is not given.
+fail-network: ## Simulate network failure safely (usage: make fail-network DURATION=60)
+	$(failure_sim) network $(or $(DURATION),60)
+
+fail-disk: ## Simulate disk pressure safely (usage: make fail-disk DURATION=120)
+	$(failure_sim) disk $(or $(DURATION),120)
+
+fail-service: ## Simulate service failures safely (usage: make fail-service DURATION=30)
+	$(failure_sim) service $(or $(DURATION),30)
+
+fail-node: ## Simulate node failure safely (usage: make fail-node DURATION=300)
+	$(failure_sim) node $(or $(DURATION),300)
+
+# --- Focused help -----------------------------------------------------------
+
+help-scaling: ## Show scaling-related commands
+	@echo "Scaling Commands:"
+	@echo "  make scale-up-web COUNT=2"
+	@echo "  make scale-up-db COUNT=1"
+	@echo "  make scale-down-web COUNT=1"
+	@echo "  make scale-down-db COUNT=1"
+
+help-failure: ## Show failure simulation commands
+	@echo "Failure Simulation Commands:"
+	@echo "  make fail-network DURATION=60"
+	@echo "  make fail-disk DURATION=120"
+	@echo "  make fail-service DURATION=30"
+	@echo "  make fail-node DURATION=300"
@@ -0,0 +1,92 @@
+# Linux Operations Automation
+
+## Problem
+
+Linux infrastructure work often starts as ticket-driven operations: deploy a server, patch it, harden SSH, check a failed service, expand a filesystem, and leave evidence that the change was safe. These tasks need automation that is readable, repeatable, and cautious enough for production-style environments.
+
+## CV Relevance
+
+This project maps directly to Linux/Unix operations, server deployment, patching, troubleshooting, and storage/LVM work from enterprise infrastructure environments. The LVM resize workflow is written in an AAP-style shape: explicit survey variables, dry-run defaults, pre-checks, resize actions, and before/after evidence.
+
+## What This Project Demonstrates
+
+- Ansible playbooks for common Linux node lifecycle operations.
+- Role-based task organization with clear defaults and handlers.
+- LVM filesystem expansion workflow suitable for Ansible Automation Platform job templates.
+- Safe simulation scripts for failure, service, and scaling exercises.
+- Reviewer-friendly evidence in `examples/` without relying on a live enterprise lab.
+
+## Architecture
+
+```text
+Operator -> Make targets -> Ansible inventory -> Playbooks/Roles -> Linux nodes
+                         -> Simulation scripts -> Example evidence
+                         -> AAP-style LVM workflow -> Before/after report
+```
+
+Core components:
+
+- `inventory/hosts.ini` defines realistic host groups.
+- `playbooks/` contains provision, patch, harden, and decommission workflows.
+- `playbooks/lvm_resize.yml` contains the storage expansion workflow.
+- `roles/` contains the implemented Ansible roles.
+- `scripts/` provides safe simulation helpers.
+- `docker-compose.yml` is a lightweight local scaffold, not a production lab.
+
+## Quickstart
+
+```bash
+cd professional-infra/linux-operations-automation
+make test
+make demo
+```
+
+`make test` runs offline syntax and lint checks. `make demo` runs a safe simulation with `SIMULATION_MODE=true` and does not require reachable SSH hosts.
+
+To run playbooks against real or lab hosts, update `inventory/hosts.ini` and run:
+
+```bash
+make run
+make patch
+make harden
+make decommission
+```
+
+Review the LVM workflow:
+
+```bash
+ansible-playbook -i inventory/hosts.ini playbooks/lvm_resize.yml --syntax-check
+cat docs/aap_lvm_resize_workflow.md
+```
+
+## Validation
+
+```bash
+make test
+docker compose config --quiet
+```
+
+The optional compose scaffold can be started with:
+
+```bash
+make up
+make down
+```
+
+## Example Output
+
+Sample evidence is available in [examples/patch-output.txt](examples/patch-output.txt), [examples/failure-simulation.txt](examples/failure-simulation.txt), and [examples/lvm-resize-output.txt](examples/lvm-resize-output.txt).
+
+## Interview Talking Points
+
+- How to make LVM resize automation safe with dry-run defaults and explicit approval.
+- Why before/after evidence matters for storage and filesystem changes.
+- How Ansible roles keep Linux baseline operations repeatable.
+- Where AAP surveys and job templates reduce ticket handling errors.
+
+## Roadmap
+
+- Add complete service roles for application deployment examples.
+- Add backup, security scan, and disaster recovery playbooks.
+- Add a richer local lab with SSH-ready containers.
+- Add cloud or Kubernetes deployment variants.
@@ -0,0 +1,43 @@
+# Vault Configuration Guide
+
+## Overview
+
+The current portfolio demo does not require Ansible Vault for `make test` or `make demo`. Secrets are intentionally kept out of the main validation path so reviewers can run the project offline.
+
+Use Vault only when extending the simulator to manage real hosts or credentials.
+
+## Recommended Pattern
+
+1. Start from the example file:
+
+```bash
+cp group_vars/vault.example.yml group_vars/vault.yml
+```
+
+2. Replace placeholder values locally.
+
+3. Encrypt the file before using it with real systems:
+
+```bash
+ansible-vault encrypt group_vars/vault.yml
+```
+
+4. Do not commit real secret values. Keep `group_vars/vault.example.yml` as the committed reference.
+
+## Running With Vault
+
+```bash
+ansible-playbook -i inventory/hosts.ini playbooks/provision.yml --ask-vault-pass
+```
+
+or:
+
+```bash
+ansible-playbook -i inventory/hosts.ini playbooks/provision.yml --vault-password-file ~/.vault_pass.txt
+```
+
+## Notes
+
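+- The delivered playbooks do not import a vault file by default, so any variable that is expected to come from Vault (for example `vault_grafana_password`, which the monitoring group variables reference) stays undefined until you supply it.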
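+- `group_vars/vault.yml` is not loaded automatically, because the inventory has no `vault` group. Load it explicitly with `vars_files`, and add that only in an environment-specific branch or private overlay.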
+- Prefer a secret manager or automation controller for production use.
@@ -0,0 +1,5 @@
+[defaults]
+roles_path = ./roles
+inventory = ./inventory/hosts.ini
+host_key_checking = False
+retry_files_enabled = False
@@ -0,0 +1,28 @@
+services:
+  web:
+    image: debian:12-slim
+    command: ["sleep", "infinity"]
+    networks:
+      infra_sim:
+        ipv4_address: 172.20.0.11
+
+  db:
+    image: debian:12-slim
+    command: ["sleep", "infinity"]
+    networks:
+      infra_sim:
+        ipv4_address: 172.20.0.21
+
+  lb:
+    image: debian:12-slim
+    command: ["sleep", "infinity"]
+    networks:
+      infra_sim:
+        ipv4_address: 172.20.0.31
+
+networks:
+  infra_sim:
+    driver: bridge
+    ipam:
+      config:
+        - subnet: 172.20.0.0/24
@@ -0,0 +1,45 @@
+# AAP-Style LVM Resize Workflow
+
+## Purpose
+
+This workflow shows how a routine storage ticket can be converted into a controlled Ansible Automation Platform job. It is intentionally conservative: dry-run is the default, required variables are explicit, and every run produces before/after evidence.
+
+## Suggested Job Template
+
+- Name: `Linux - LVM Filesystem Resize`
+- Inventory: Linux production or pre-production inventory
+- Playbook: `playbooks/lvm_resize.yml`
+- Credentials: privileged Linux automation credential
+- Privilege escalation: enabled
+- Default extra vars:
+
+```yaml
+lvm_dry_run: true
+lvm_resize_filesystem: true
+```
+
+## Suggested Survey Variables
+
+| Variable | Example | Required | Notes |
+| --- | --- | --- | --- |
+| `lvm_vg_name` | `vg_app` | yes | Target volume group. |
+| `lvm_lv_name` | `lv_data` | yes | Target logical volume. |
+| `lvm_mountpoint` | `/data` | yes | Filesystem mountpoint to validate before/after. |
+| `lvm_size_request` | `+20G` | yes | Passed to `lvextend -L`; use explicit growth syntax for tickets. |
+| `lvm_dry_run` | `true` | yes | Start with `true`; switch to `false` after evidence review. |
+
+## Safety Notes
+
+- Run with `lvm_dry_run=true` first and attach output to the ticket.
+- Confirm backup/snapshot status before actual resize.
+- Confirm filesystem type; this workflow supports XFS and ext filesystems.
+- Keep requested size aligned with the ticket approval.
+- Use maintenance windows for critical systems.
+
+## Evidence Captured
+
+- `lsblk --fs`
+- `pvs`, `vgs`, `lvs`
+- `df -hT <mountpoint>` before and after
+- target LV path and filesystem type
+- dry-run flag and requested size
@@ -0,0 +1,30 @@
+# Linux Operations Automation Architecture
+
+## Components
+
+- Operator interface: `make` targets and direct Ansible commands.
+- Inventory: static host groups in `inventory/hosts.ini`.
+- Automation: lifecycle playbooks in `playbooks/`.
+- Simulation scripts: controlled failure and scaling events in `scripts/`.
+- Evidence: logs, reports, scenario notes, and examples.
+
+## Data Flow
+
+```
+Operator
+  -> Make target or shell script
+  -> Ansible inventory
+  -> lifecycle playbook
+  -> managed Linux node
+  -> log/report artifact
+```
+
+Failure drills follow a parallel flow:
+
+```
+Operator -> simulate_failure.sh -> target node/service -> health check -> patch/hardening playbook -> evidence
+```
+
+## Notes
+
+The project favors explicit playbooks over hidden orchestration so the operational intent is visible during review. In a production implementation, the same workflows would typically run from a CI runner or automation controller with credentials supplied by a secret manager.
@@ -0,0 +1,8 @@
+2026-04-29 02:13:41 - Starting failure simulation: service 30 web
+2026-04-29 02:13:41 - Simulating service failures on containers: web
+2026-04-29 02:13:42 - Stopping services in container enterprise-web-1
+2026-04-29 02:13:44 - Health probe failed: http://web01/health returned 503
+2026-04-29 02:14:12 - Cleaning up failure simulation
+2026-04-29 02:14:13 - Restarted nginx in enterprise-web-1
+2026-04-29 02:14:18 - Health probe recovered: http://web01/health returned 200
+2026-04-29 02:14:18 - Failure simulation completed successfully
@@ -0,0 +1,19 @@
+TASK [Report LVM resize evidence] **********************************************
+ok: [app01] => {
+  "msg": {
+    "host": "app01",
+    "dry_run": true,
+    "target": "/dev/vg_app/lv_data",
+    "mountpoint": "/data",
+    "requested_size": "+20G",
+    "filesystem_type": "xfs",
+    "before_df": [
+      "Filesystem                 Type  Size  Used Avail Use% Mounted on",
+      "/dev/mapper/vg_app-lv_data xfs   100G   83G   17G  84% /data"
+    ],
+    "after_df": [
+      "Filesystem                 Type  Size  Used Avail Use% Mounted on",
+      "/dev/mapper/vg_app-lv_data xfs   100G   83G   17G  84% /data"
+    ]
+  }
+}
@@ -0,0 +1,33 @@
+PLAY [Apply Security Patches and Updates] **************************************
+
+TASK [Update package cache] *****************************************************
+changed: [web01]
+changed: [db01]
+ok: [lb01]
+
+TASK [Check for available updates] **********************************************
+ok: [web01] => {"stdout": "9"}
+ok: [db01] => {"stdout": "4"}
+ok: [lb01] => {"stdout": "0"}
+
+TASK [Apply security updates only] **********************************************
+changed: [web01]
+changed: [db01]
+ok: [lb01]
+
+TASK [Verify critical services] *************************************************
+ok: [web01] => (item=systemd-journald)
+ok: [web01] => (item=cron)
+ok: [db01] => (item=systemd-journald)
+ok: [lb01] => (item=cron)
+
+PLAY RECAP *********************************************************************
+web01 : ok=19 changed=6 unreachable=0 failed=0 skipped=2 rescued=0 ignored=1
+db01  : ok=18 changed=5 unreachable=0 failed=0 skipped=2 rescued=0 ignored=1
+lb01  : ok=15 changed=1 unreachable=0 failed=0 skipped=4 rescued=0 ignored=0
+
+Patch report
+Status: SUCCESS
+Window: 02:00-04:00 UTC
+Reboot required: false
+Notification: infra-team@example.com
@@ -0,0 +1,20 @@
+---
+# Group variables for all hosts
+
+# SSH Configuration
+ssh_config:
+  port: 22
+  max_auth_tries: 3
+  alive_interval: 300
+
+# Firewall defaults
+firewall_enabled: true
+firewall_default_policy: deny
+
+# Patching defaults
+patch_enabled: true
+enforce_patch_window: true
+
+# Services monitoring
+enable_monitoring: false
+enable_health_checks: true
@@ -0,0 +1,9 @@
+---
+# Database servers group configuration
+db_type: postgresql
+db_port: 5432
+db_backup_enabled: true
+db_backup_path: /var/backups/database
+
+# Database user (use vault for production)
+db_admin_user: postgres
@@ -0,0 +1,10 @@
+---
+# Load balancers group configuration
+lb_type: haproxy
+lb_port: 443
+lb_stats_port: 8404
+lb_stats_enabled: true
+
+# Frontend configuration
+frontend_host: "0.0.0.0"
+frontend_port: 80
@@ -0,0 +1,10 @@
+---
+# Monitoring servers group configuration
+monitoring_type: prometheus
+monitoring_port: 9090
+monitoring_retention: 30d
+monitoring_scrape_interval: 15s
+
+# Grafana configuration
+grafana_port: 3000
+grafana_admin_password: "{{ vault_grafana_password }}"
@@ -0,0 +1,8 @@
+---
+# Example variables for secret values.
+# Copy these keys into an Ansible Vault encrypted file when real secrets are needed.
+
+admin_password: "replace-with-vault-managed-value"
+db_root_password: "replace-with-vault-managed-value"
+grafana_admin_password: "replace-with-vault-managed-value"
+ssh_key_passphrase: "replace-with-vault-managed-value"
+
+# The monitoring group variables set grafana_admin_password to
+# "{{ vault_grafana_password }}", so a vault file has to define this key for
+# that reference to resolve.
+vault_grafana_password: "replace-with-vault-managed-value"
@@ -0,0 +1,11 @@
+---
+# Webservers group configuration
+webserver_type: nginx
+http_port: 80
+https_port: 443
+health_check_path: /health
+
+# Application configuration
+app_name: "{{ group_names[0] | default('app') }}"
+app_user: "{{ admin_user }}"
+app_group: "{{ admin_user }}"
@@ -0,0 +1,35 @@
+# Static inventory. Connection settings that are the same for every host live
+# in [all:vars]; host lines carry only the per-host address.
+
+[webservers]
+web01 ansible_host=172.20.0.11
+web02 ansible_host=172.20.0.12
+web03 ansible_host=172.20.0.13
+
+[databases]
+db01 ansible_host=172.20.0.21
+db02 ansible_host=172.20.0.22
+
+[loadbalancers]
+lb01 ansible_host=172.20.0.31
+
+[monitoring]
+mon01 ansible_host=172.20.0.41
+
+[all:vars]
+ansible_connection=ssh
+ansible_user=root
+ansible_ssh_private_key_file=/root/.ssh/id_rsa
+ansible_python_interpreter=/usr/bin/python3
+# Host key verification is switched off for every connection.
+ansible_ssh_common_args='-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
+
+# NOTE(review): "environment" is also a play/task keyword, and Ansible may warn
+# about a variable using a reserved name. Confirm nothing consumes it before
+# renaming it.
+[webservers:vars]
+node_type=web
+environment=production
+
+[databases:vars]
+node_type=database
+environment=production
+
+[loadbalancers:vars]
+node_type=loadbalancer
+environment=production
+
+[monitoring:vars]
+node_type=monitoring
+environment=production
@@ -0,0 +1,24 @@
+---
+# Molecule converge: applies the roles under test to every instance.
+
+- name: Converge
+  hosts: all
+  gather_facts: true
+  become: true
+
+  pre_tasks:
+    # Refresh the package index on Debian-family instances; an index newer than
+    # one hour is reused.
+    - name: Update apt cache
+      when: ansible_os_family == "Debian"
+      ansible.builtin.apt:
+        update_cache: true
+        cache_valid_time: 3600
+
+  roles:
+    - base_provision
+    - hardening
+    - patching
+
+  post_tasks:
+    - name: Print Ansible facts
+      ansible.builtin.debug:
+        var: ansible_facts
@@ -0,0 +1,15 @@
+---
+# Molecule destroy: removes the container of each platform listed in the
+# scenario configuration.
+
+- name: Destroy
+  hosts: localhost
+  gather_facts: false
+
+  tasks:
+    - name: Destroy molecule containers
+      loop: "{{ molecule_yml.platforms | map(attribute='name') | list }}"
+      docker_container:
+        name: "{{ item }}"
+        state: absent
+        force_kill: true
+      register: destroy_result
+      # Removal is best-effort: an error on one container does not fail the play.
+      ignore_errors: true
@@ -0,0 +1,31 @@
+---
+# Molecule configuration for Ansible role testing
+
+driver:
+  name: docker
+
+platforms:
+  - name: ubuntu-22.04
+    image: geerlingguy/docker-ubuntu2204-ansible:latest
+    pre_build_image: true
+    privileged: true
+    volumes:
+      - /sys/fs/cgroup:/sys/fs/cgroup:rw
+
+provisioner:
+  name: ansible
+  config_options:
+    defaults:
+      gathering: smart
+      fact_caching: jsonfile
+      fact_caching_connection: /tmp/ansible_facts
+      fact_caching_timeout: 3600
+      deprecation_warnings: false
+
+verifier:
+  name: ansible
+  directory: molecule/default/tests
+
+lint: |
+  yamllint .
+  ansible-lint
@@ -0,0 +1,32 @@
+---
+# Molecule verify playbook - runs read-only checks against converged instances.
+#
+# Only the sshd_config check is a hard failure. The package, firewall and admin
+# user checks are best-effort, so their outcome is reported in the final
+# summary instead of failing the run.
+
+- name: Verify
+  hosts: all
+  gather_facts: false
+  tasks:
+    - name: Check if base OS packages are installed
+      ansible.builtin.shell:
+        cmd: "dpkg -l | grep -E '(curl|wget|vim|htop)'"
+      register: package_check
+      # Read-only check: never report a change.
+      changed_when: false
+      # grep exits 1 when nothing matches; only other exit codes are errors.
+      failed_when: package_check.rc not in [0, 1]
+
+    - name: Check SSH configuration
+      ansible.builtin.stat:
+        path: /etc/ssh/sshd_config
+      register: ssh_config_stat
+      failed_when: not ssh_config_stat.stat.exists
+
+    - name: Check firewall status
+      ansible.builtin.shell:
+        # Anchor on the status line: a bare "active" pattern also matches
+        # "Status: inactive".
+        cmd: "ufw status | grep -q '^Status: active'"
+      register: firewall_check
+      changed_when: false
+      failed_when: false
+
+    - name: Verify admin user exists
+      ansible.builtin.getent:
+        database: passwd
+        key: infra-admin
+      register: admin_user_check
+      failed_when: false
+
+    - name: Print verification results
+      ansible.builtin.debug:
+        msg:
+          - "Role verification completed"
+          - "Base packages found: {{ package_check.rc == 0 }}"
+          - "sshd_config present: {{ ssh_config_stat.stat.exists }}"
+          - "Firewall active: {{ (firewall_check.rc | default(1)) == 0 }}"
+          # getent returns facts only when the lookup succeeded.
+          - "Admin user present: {{ 'ansible_facts' in admin_user_check }}"
@@ -0,0 +1,34 @@
+---
+# Decommission workflow: asks the operator for confirmation, runs the
+# decommission role on every targeted host, then prints a summary per host.
+- name: Decommission Enterprise Infrastructure Nodes
+  hosts: all
+  become: true
+  gather_facts: true
+
+  pre_tasks:
+    # pause prompts once for the whole play rather than once per host, so the
+    # prompt lists every host in the play instead of a single hostname.
+    - name: Confirm decommissioning
+      ansible.builtin.pause:
+        prompt: |
+          WARNING: This will decommission {{ ansible_play_hosts | join(', ') }}
+          Backup Data: {{ backup_data }}
+          Export Config: {{ export_config }}
+
+          Press ENTER to continue or Ctrl+C to cancel
+
+    - name: Display decommissioning information
+      ansible.builtin.debug:
+        msg: |
+          Decommissioning {{ inventory_hostname }}
+          Auto Shutdown: {{ auto_shutdown }}
+          Backup Enabled: {{ backup_data }}
+
+  roles:
+    - role: decommission
+      tags: ['decommission', 'cleanup']
+
+  post_tasks:
+    - name: Display decommissioning summary
+      ansible.builtin.debug:
+        msg: |
+          Decommissioning completed!
+          Host: {{ inventory_hostname }}
+          Backup Location: /var/backups/decommission-{{ ansible_date_time.iso8601 }}/
@@ -0,0 +1,124 @@
+---
+# Hardening workflow for Debian-family hosts.
+#
+# Order of execution: prerequisite checks, the hardening role, then the extra
+# hardening tasks in post_tasks (auditd, AppArmor, sysctl, file permissions,
+# account locking, limits, report). The success summary is printed last.
+- name: Harden Enterprise Infrastructure Nodes
+  hosts: all
+  become: true
+  gather_facts: true
+
+  pre_tasks:
+    - name: Validate hardening prerequisites
+      ansible.builtin.assert:
+        that:
+          - ansible_os_family == "Debian"
+          # Cast so a value passed as a string (e.g. via -e) is still accepted.
+          - cis_level | int in [1, 2]
+        fail_msg: "Invalid hardening configuration"
+
+    - name: Display hardening information
+      ansible.builtin.debug:
+        msg: |
+          Hardening {{ inventory_hostname }}
+          CIS Level: {{ cis_level }}
+          Disable Root Login: {{ disable_root_login }}
+
+  roles:
+    - role: hardening
+      tags: ['hardening', 'security']
+
+  post_tasks:
+    - name: Configure auditd
+      when: auditd_enabled | bool
+      block:
+        - name: Install auditd
+          ansible.builtin.apt:
+            name: auditd
+            state: present
+          when: ansible_os_family == "Debian"
+
+        - name: Configure audit rules
+          ansible.builtin.template:
+            src: templates/audit.rules.j2
+            dest: /etc/audit/rules.d/hardening.rules
+            mode: '0644'
+          # Changed rules are only picked up once auditd is restarted.
+          notify: restart auditd
+
+        - name: Enable auditd service
+          ansible.builtin.service:
+            name: auditd
+            state: started
+            enabled: true
+
+    - name: Configure AppArmor
+      when:
+        - apparmor_enabled | bool
+        - ansible_os_family == "Debian"
+      block:
+        - name: Install apparmor
+          ansible.builtin.apt:
+            name: apparmor
+            state: present
+
+        - name: Enable apparmor service
+          ansible.builtin.service:
+            name: apparmor
+            state: started
+            enabled: true
+
+    - name: Configure sysctl hardening
+      ansible.posix.sysctl:
+        name: "{{ item.key }}"
+        value: "{{ item.value }}"
+        state: present
+        reload: true
+      loop:
+        - { key: 'net.ipv4.ip_forward', value: '0' }
+        - { key: 'net.ipv4.conf.all.send_redirects', value: '0' }
+        - { key: 'net.ipv4.conf.default.send_redirects', value: '0' }
+        - { key: 'net.ipv4.tcp_syncookies', value: '1' }
+        - { key: 'net.ipv4.icmp_echo_ignore_broadcasts', value: '1' }
+
+    # passwd and group must stay world-readable. shadow and gshadow hold
+    # password hashes and must not be; 0640 with group shadow is the Debian
+    # default for these two files.
+    - name: Set secure file permissions
+      ansible.builtin.file:
+        path: "{{ item.path }}"
+        mode: "{{ item.mode }}"
+        owner: root
+        group: "{{ item.group }}"
+      loop:
+        - { path: /etc/passwd, mode: '0644', group: root }
+        - { path: /etc/group, mode: '0644', group: root }
+        - { path: /etc/shadow, mode: '0640', group: shadow }
+        - { path: /etc/gshadow, mode: '0640', group: shadow }
+      loop_control:
+        label: "{{ item.path }}"
+
+    # NOTE(review): reported as "ok" even when an account gets locked, because
+    # changed_when is forced to false.
+    - name: Lock inactive user accounts
+      ansible.builtin.command: usermod -L "{{ item }}"
+      loop: "{{ inactive_users | default([]) }}"
+      changed_when: false
+
+    # Sets a hard open-file limit for all users; it is a resource limit, not a
+    # password policy.
+    - name: Configure open file limits
+      community.general.pam_limits:
+        domain: '*'
+        limit_type: hard
+        limit_item: nofile
+        value: 1024
+
+    - name: Generate hardening report
+      ansible.builtin.template:
+        src: templates/hardening_report.j2
+        dest: "/var/log/hardening_report_{{ ansible_date_time.iso8601 }}.log"
+        mode: '0644'
+
+    # Printed last so it is not shown before the tasks above have run.
+    - name: Display hardening summary
+      ansible.builtin.debug:
+        msg: |
+          Hardening completed successfully!
+          Host: {{ inventory_hostname }}
+      when: ansible_os_family == "Debian"
+
+  handlers:
+    - name: restart sshd
+      ansible.builtin.service:
+        name: ssh
+        state: restarted
+
+    - name: restart auditd
+      ansible.builtin.service:
+        name: auditd
+        state: restarted
+      when: auditd_enabled | bool
@@ -0,0 +1,149 @@
+---
+# AAP-style LVM resize workflow.
+#
+# Inputs (play vars, overridable with extra vars or survey answers):
+#   lvm_vg_name, lvm_lv_name  - target volume group and logical volume
+#   lvm_mountpoint            - mountpoint of the filesystem on that volume
+#   lvm_size_request          - value passed to "lvextend -L"
+#   lvm_dry_run               - when true, nothing is modified
+#   lvm_resize_filesystem     - when true, grow the filesystem after the volume
+#
+# All checks, including filesystem detection, run before any change is made,
+# so a bad mountpoint or an unsupported filesystem stops the run while the
+# logical volume is still untouched.
+- name: AAP-style LVM filesystem resize workflow
+  hosts: all
+  become: true
+  gather_facts: true
+
+  vars:
+    lvm_dry_run: true
+    lvm_vg_name: ""
+    lvm_lv_name: ""
+    lvm_mountpoint: ""
+    lvm_size_request: "+10G"
+    lvm_resize_filesystem: true
+
+  pre_tasks:
+    - name: Validate required survey variables
+      ansible.builtin.assert:
+        that:
+          # Cast to string first so a numeric answer does not break "length".
+          - lvm_vg_name | string | length > 0
+          - lvm_lv_name | string | length > 0
+          - lvm_mountpoint | string | length > 0
+          - lvm_size_request | string | length > 0
+        fail_msg: "Required variables: lvm_vg_name, lvm_lv_name, lvm_mountpoint, lvm_size_request"
+
+  tasks:
+    # ---- Evidence before the change (read-only) ----
+    - name: Capture block device layout before resize
+      ansible.builtin.command:
+        argv:
+          - lsblk
+          - --fs
+      register: lvm_lsblk_before
+      changed_when: false
+
+    - name: Capture physical volumes before resize
+      ansible.builtin.command:
+        argv:
+          - pvs
+          - --noheadings
+          - --units
+          - g
+      register: lvm_pvs_before
+      changed_when: false
+
+    - name: Capture volume groups before resize
+      ansible.builtin.command:
+        argv:
+          - vgs
+          - --noheadings
+          - --units
+          - g
+      register: lvm_vgs_before
+      changed_when: false
+
+    - name: Capture logical volumes before resize
+      ansible.builtin.command:
+        argv:
+          - lvs
+          - --noheadings
+          - --units
+          - g
+      register: lvm_lvs_before
+      changed_when: false
+
+    - name: Capture filesystem usage before resize
+      ansible.builtin.command:
+        argv:
+          - df
+          - -hT
+          - "{{ lvm_mountpoint }}"
+      register: lvm_df_before
+      changed_when: false
+
+    # ---- Pre-checks (read-only) ----
+    - name: Validate target logical volume exists
+      ansible.builtin.command:
+        argv:
+          - lvs
+          - "/dev/{{ lvm_vg_name }}/{{ lvm_lv_name }}"
+      register: lvm_target_check
+      changed_when: false
+
+    # Runs before lvextend so that a failure here leaves the volume unchanged.
+    - name: Detect filesystem type
+      ansible.builtin.command:
+        argv:
+          - findmnt
+          - -n
+          - -o
+          - FSTYPE
+          - "{{ lvm_mountpoint }}"
+      register: lvm_fstype
+      changed_when: false
+
+    # Without this check an unsupported filesystem would have its volume
+    # extended while both resize tasks below are skipped.
+    - name: Validate filesystem type is supported
+      ansible.builtin.assert:
+        that:
+          - lvm_fstype.stdout in ["xfs", "ext2", "ext3", "ext4"]
+        fail_msg: >-
+          Filesystem type '{{ lvm_fstype.stdout }}' on {{ lvm_mountpoint }} is
+          not supported by this workflow (supported: xfs, ext2, ext3, ext4)
+      when: lvm_resize_filesystem | bool
+
+    # ---- Change ----
+    - name: Show dry-run resize command
+      ansible.builtin.debug:
+        msg: "DRY RUN: would run lvextend -L {{ lvm_size_request }} /dev/{{ lvm_vg_name }}/{{ lvm_lv_name }}"
+      when: lvm_dry_run | bool
+
+    - name: Extend logical volume
+      ansible.builtin.command:
+        argv:
+          - lvextend
+          - -L
+          - "{{ lvm_size_request }}"
+          - "/dev/{{ lvm_vg_name }}/{{ lvm_lv_name }}"
+      register: lvm_lvextend_result
+      changed_when: true
+      when: not (lvm_dry_run | bool)
+
+    # xfs_growfs takes the mountpoint; resize2fs takes the device.
+    - name: Resize XFS filesystem
+      ansible.builtin.command:
+        argv:
+          - xfs_growfs
+          - "{{ lvm_mountpoint }}"
+      changed_when: true
+      when:
+        - not (lvm_dry_run | bool)
+        - lvm_resize_filesystem | bool
+        - lvm_fstype.stdout == "xfs"
+
+    - name: Resize ext filesystem
+      ansible.builtin.command:
+        argv:
+          - resize2fs
+          - "/dev/{{ lvm_vg_name }}/{{ lvm_lv_name }}"
+      changed_when: true
+      when:
+        - not (lvm_dry_run | bool)
+        - lvm_resize_filesystem | bool
+        - lvm_fstype.stdout in ["ext2", "ext3", "ext4"]
+
+    # ---- Evidence after the change ----
+    - name: Capture filesystem usage after resize
+      ansible.builtin.command:
+        argv:
+          - df
+          - -hT
+          - "{{ lvm_mountpoint }}"
+      register: lvm_df_after
+      changed_when: false
+
+    - name: Report LVM resize evidence
+      ansible.builtin.debug:
+        msg:
+          host: "{{ inventory_hostname }}"
+          dry_run: "{{ lvm_dry_run }}"
+          target: "/dev/{{ lvm_vg_name }}/{{ lvm_lv_name }}"
+          mountpoint: "{{ lvm_mountpoint }}"
+          requested_size: "{{ lvm_size_request }}"
+          filesystem_type: "{{ lvm_fstype.stdout | default('unknown') }}"
+          before_df: "{{ lvm_df_before.stdout_lines }}"
+          after_df: "{{ lvm_df_after.stdout_lines }}"
@@ -0,0 +1,31 @@
+---
+# Patching workflow: checks the OS family, prints the patch window settings,
+# runs the patching role, then reports whether a reboot is required.
+- name: Apply Security Patches and Updates
+  hosts: all
+  gather_facts: true
+  become: true
+
+  pre_tasks:
+    - name: Validate patch prerequisites
+      ansible.builtin.assert:
+        fail_msg: "Patching supported only on Debian-based systems"
+        that:
+          - ansible_os_family == "Debian"
+
+    - name: Display patch information
+      ansible.builtin.debug:
+        msg: |
+          Patching {{ inventory_hostname }}
+          Patch Window: {{ patch_window_start }} - {{ patch_window_end }}
+          Security Only: {{ patch_security_only }}
+
+  roles:
+    - role: patching
+      tags:
+        - patch
+        - updates
+
+  post_tasks:
+    # reboot_required falls back to false when it has not been set.
+    - name: Display patching summary
+      ansible.builtin.debug:
+        msg: |
+          Patching completed!
+          Host: {{ inventory_hostname }}
+          Reboot Required: {{ reboot_required | default(false) }}
@@ -0,0 +1,33 @@
+---
+# Provisioning workflow: checks the controller's Ansible version, prints host
+# details, runs the base_provision role, then prints a per-host summary.
+- name: Provision Enterprise Infrastructure Nodes
+  hosts: all
+  become: true
+  gather_facts: true
+
+  pre_tasks:
+    - name: Validate Ansible version
+      ansible.builtin.assert:
+        that:
+          # Compare the full version string. Checking major and minor
+          # separately would wrongly reject a release such as 3.0.
+          - ansible_version.full is version('2.9', '>=')
+        fail_msg: "Ansible 2.9+ is required"
+
+    - name: Display provisioning information
+      ansible.builtin.debug:
+        msg: |
+          Provisioning {{ inventory_hostname }}
+          OS: {{ ansible_os_family }}
+          Python: {{ ansible_python_version }}
+
+  roles:
+    - role: base_provision
+      tags: ['provision', 'base']
+
+  post_tasks:
+    # The summary uses the distribution name and version facts. The address
+    # falls back to "n/a" on hosts that report no default IPv4 interface.
+    - name: Generate provisioning summary
+      ansible.builtin.debug:
+        msg: |
+          Provisioning completed successfully!
+          Host: {{ inventory_hostname }}
+          IP: {{ (ansible_default_ipv4 | default({})).address | default('n/a') }}
+          OS: {{ ansible_distribution }} {{ ansible_distribution_version }}
@@ -0,0 +1,48 @@
+# Base Provision Role
+
+Provision basic infrastructure on enterprise nodes with security hardening.
+
+## Features
+
+- **Idempotent**: All tasks use proper idempotency markers (`changed_when`, `failed_when`)
+- **Handlers**: SSH and fail2ban restarts use handlers instead of direct service calls
+- **Variables**: All configuration in `defaults/main.yml` - no hardcoding
+- **Validation**: Pre-flight checks for system requirements
+- **Firewall**: UFW firewall configuration with configurable rules
+- **SSH Security**: Root login disabled, password auth disabled, key-based auth only
+
+## Role Variables
+
+See `defaults/main.yml` for all available variables.
+
+### Key Variables
+
+- `node_timezone`: System timezone (default: UTC)
+- `admin_user`: Admin username for infrastructure access
+- `ssh_port`: SSH service port (default: 22)
+- `base_packages`: List of base packages to install
+- `firewall_enabled`: Enable UFW firewall (default: true)
+- `firewall_allowed_tcp_ports`: Allowed TCP ports for firewall
+
+## Secret Variables
+
+This portfolio demo does not require secrets for offline validation. If you extend it with real passwords or keys, copy `group_vars/vault.example.yml` into an encrypted Ansible Vault file and keep real values out of normal git history.
+
+## Usage
+
+```yaml
+- role: base_provision
+  vars:
+    node_timezone: "Europe/Warsaw"
+    firewall_enabled: true
+```
+
+## Handlers
+
+- `restart sshd`: Restarts SSH service (triggered by config changes)
+- `restart fail2ban`: Restarts fail2ban service (triggered by config changes)
+
+## Tags
+
+- `provision`: All provisioning tasks
+- `base`: Base provision role tasks
@@ -0,0 +1,44 @@
+---
+# Base provisioning configuration
+# Defaults for the base_provision role; override per host or group as needed.
+node_timezone: "UTC"
+# Account the role manages and uses as owner of the application directories below.
+admin_user: "infra-admin"
+# Written to sshd_config as "Port", opened in UFW, and used by the fail2ban sshd jail.
+ssh_port: 22
+# When true, the role sets "PermitRootLogin no" in sshd_config.
+ssh_disabled_root_login: true
+# When true, the role sets "PasswordAuthentication no" in sshd_config.
+ssh_disable_password_auth: true
+
+# Packages to install
+base_packages:
+  - curl
+  - wget
+  - vim
+  - htop
+  - net-tools
+  - iptables
+  - fail2ban
+  - unattended-upgrades
+
+# Firewall rules
+# firewall_enabled gates every UFW task in the role.
+firewall_enabled: true
+# Passed to UFW as the default policy when the firewall is enabled.
+firewall_default_policy: deny
+# TCP ports opened in addition to ssh_port; an entry equal to ssh_port is
+# skipped by the loop because SSH has its own rule.
+firewall_allowed_tcp_ports:
+  - 22
+  - 80
+  - 443
+
+# Application directories
+# Each entry is created as a directory with the given owner, group and mode.
+app_directories:
+  - path: /opt/application
+    owner: "{{ admin_user }}"
+    group: "{{ admin_user }}"
+    mode: '0755'
+  - path: /var/log/application
+    owner: "{{ admin_user }}"
+    group: "{{ admin_user }}"
+    mode: '0755'
+  - path: /etc/application
+    owner: root
+    group: root
+    mode: '0755'
+
+# Service verification
+# Services the role ensures are started and enabled at the end of the run;
+# an empty list skips that step.
+services_to_verify: []
@@ -0,0 +1,11 @@
+---
+# Notified by the sshd_config tasks of this role.
+- name: restart sshd
+  ansible.builtin.service:
+    state: restarted
+    name: sshd
+
+# Notified when jail.local is re-rendered; also keeps fail2ban enabled at boot.
+- name: restart fail2ban
+  ansible.builtin.service:
+    enabled: true
+    state: restarted
+    name: fail2ban
@@ -0,0 +1,138 @@
+---
+# Pre-flight gate: the role only supports Debian-family hosts with Python 3.6+.
+- name: Validate system requirements
+  ansible.builtin.assert:
+    fail_msg: "Unsupported system - requires Debian and Python 3.6+"
+    that:
+      - ansible_os_family == "Debian"
+      - ansible_python_version is version('3.6', '>=')
+
+# Refresh the apt index at most once per hour; never reported as a change.
+- name: Update package cache
+  ansible.builtin.apt:
+    cache_valid_time: 3600
+    update_cache: true
+  changed_when: false
+
+- name: Install base packages
+  ansible.builtin.apt:
+    update_cache: true
+    state: present
+    name: "{{ base_packages }}"
+
+# Look the account up without failing on a miss: with fail_key disabled,
+# getent stores a null entry for an unknown key. The earlier combination of
+# `failed_when: false` and `when: admin_check.failed` could never be true
+# (failed_when forces the result's `failed` flag to false), so the admin
+# user was never created.
+- name: Check if admin user exists
+  ansible.builtin.getent:
+    database: passwd
+    key: "{{ admin_user }}"
+    fail_key: false
+  register: admin_check
+  changed_when: false
+
+# Only create the account when the lookup above found nothing, so an
+# existing account is left untouched.
+- name: Create admin user
+  ansible.builtin.user:
+    name: "{{ admin_user }}"
+    groups: sudo
+    append: true
+    create_home: true
+    shell: /bin/bash
+  when: ansible_facts.getent_passwd[admin_user] is none
+
+- name: Configure timezone
+  community.general.timezone:
+    name: "{{ node_timezone }}"
+
+# Each edit is validated with `sshd -t` against the candidate file before it
+# replaces /etc/ssh/sshd_config, so a bad value (for example a malformed
+# ssh_port) cannot leave the host with an sshd that refuses to restart.
+- name: Configure SSH security
+  block:
+    - name: Disable root SSH login
+      ansible.builtin.lineinfile:
+        path: /etc/ssh/sshd_config
+        regexp: '^PermitRootLogin'
+        line: 'PermitRootLogin no'
+        state: present
+        validate: /usr/sbin/sshd -t -f %s
+      when: ssh_disabled_root_login | bool
+      notify: restart sshd
+
+    - name: Set SSH port
+      ansible.builtin.lineinfile:
+        path: /etc/ssh/sshd_config
+        regexp: '^Port'
+        line: "Port {{ ssh_port }}"
+        state: present
+        validate: /usr/sbin/sshd -t -f %s
+      notify: restart sshd
+
+    - name: Disable password authentication
+      ansible.builtin.lineinfile:
+        path: /etc/ssh/sshd_config
+        regexp: '^PasswordAuthentication'
+        line: 'PasswordAuthentication no'
+        state: present
+        validate: /usr/sbin/sshd -t -f %s
+      when: ssh_disable_password_auth | bool
+      notify: restart sshd
+
+# Allow rules are added before UFW is switched on, so enabling the
+# (default: deny) policy cannot cut off the SSH access Ansible relies on.
+# The firewall_enabled gate is applied once at block level.
+- name: Configure firewall
+  when: firewall_enabled | bool
+  block:
+    - name: Allow SSH access
+      community.general.ufw:
+        rule: allow
+        port: "{{ ssh_port }}"
+        proto: tcp
+
+    # ssh_port already has its own rule above; compare as strings so an
+    # override passed as text still matches the integer list entry.
+    - name: Allow HTTP/HTTPS
+      community.general.ufw:
+        rule: allow
+        port: "{{ item }}"
+        proto: tcp
+      loop: "{{ firewall_allowed_tcp_ports }}"
+      when: item | string != ssh_port | string
+
+    - name: Enable UFW firewall
+      community.general.ufw:
+        state: enabled
+        policy: "{{ firewall_default_policy }}"
+
+# Render the jail configuration; the previous file is kept as a backup and
+# fail2ban is restarted through the handler when the content changes.
+- name: Configure fail2ban
+  ansible.builtin.template:
+    src: jail.local.j2
+    dest: /etc/fail2ban/jail.local
+    backup: true
+    mode: '0644'
+  notify: restart fail2ban
+
+# lineinfile fails on a missing file unless `create` is set, so the apt
+# config fragment is created when it does not exist yet.
+- name: Enable unattended upgrades
+  ansible.builtin.lineinfile:
+    path: /etc/apt/apt.conf.d/20auto-upgrades
+    regexp: '^APT::Periodic::Unattended-Upgrade'
+    line: 'APT::Periodic::Unattended-Upgrade "1";'
+    state: present
+    create: true
+    mode: '0644'
+
+# Create every configured application directory with its own owner, group and mode.
+- name: Create application directories
+  ansible.builtin.file:
+    state: directory
+    path: "{{ item.path }}"
+    mode: "{{ item.mode }}"
+    owner: "{{ item.owner }}"
+    group: "{{ item.group }}"
+  loop: "{{ app_directories }}"
+
+# Informational only: this task configures nothing.
+- name: Record role-specific service intent
+  ansible.builtin.debug:
+    msg: "Would configure {{ node_type | default('generic') }} service components in a full lab deployment"
+
+# Best-effort: a service that cannot be started does not fail the play.
+- name: Verify services are running
+  when: services_to_verify | length > 0
+  ansible.builtin.service:
+    enabled: true
+    state: started
+    name: "{{ item }}"
+  loop: "{{ services_to_verify }}"
+  failed_when: false
+
+# Non-fatal probe of the local health endpoint, webservers only.
+- name: Run health checks
+  when: "'webservers' in group_names"
+  ansible.builtin.uri:
+    method: GET
+    url: http://localhost/health
+    status_code: 200
+  register: health_check
+  ignore_errors: true
+  failed_when: false
@@ -0,0 +1,14 @@
+# fail2ban configuration
+[DEFAULT]
+bantime = 3600
+findtime = 600
+maxretry = 5
+
+[sshd]
+enabled = true
+port = {{ ssh_port }}
+logpath = /var/log/auth.log
+maxretry = 3
+
+[recidive]
+enabled = true
@@ -0,0 +1,62 @@
+# Decommission Role
+
+Gracefully decommission enterprise infrastructure nodes with comprehensive backup and cleanup.
+
+## Features
+
+- **Confirmation Prompt**: Interactive confirmation before decommissioning
+- **Graceful Shutdown**: Stop services gracefully with connection drain time
+- **Comprehensive Backup**: Archive configurations and data before cleanup
+- **Selective Cleanup**: Only remove items that were deployed
+- **Logging**: Detailed decommissioning logs for audit trail
+- **Notifications**: Optional email notifications on completion
+
+## Role Variables
+
+See `defaults/main.yml` for all available variables.
+
+### Key Variables
+
+- `backup_data`: Backup application data (default: true)
+- `export_config`: Export system configuration (default: true)
+- `graceful_shutdown`: Graceful service shutdown (default: true)
+- `auto_shutdown`: Auto shutdown after decommissioning (default: false)
+- `application_services`: Services to stop
+- `application_packages`: Packages to remove
+- `decommission_notification_email`: Email for notifications (optional)
+
+## Usage
+
+```yaml
+- role: decommission
+  vars:
+    backup_data: true
+    export_config: true
+    auto_shutdown: false
+    decommission_notification_email: "ops@company.com"
+```
+
+## Backup Locations
+
+- Configuration: `/var/backups/decommission-<timestamp>/config/`
+- Data: `/var/backups/decommission-<timestamp>/data/`
+- Report: `/var/log/decommission_report_<timestamp>.log`
+
+## Supported Groups
+
+- `webservers`: Backs up /var/www/html
+- `databases`: Backs up PostgreSQL data
+- `monitoring`: Backs up Prometheus data
+- `loadbalancers`: Loadbalancer cleanup
+
+## Safety Features
+
+- Interactive confirmation before execution
+- Connection drain time before shutdown (30 seconds)
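+- Failures in best-effort steps (for example stopping services or removing packages) are ignored (`failed_when: false`) and don't stop the process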
+- Comprehensive audit log
+
+## Tags
+
+- `decommission`: All decommissioning tasks
+- `cleanup`: Cleanup-related tasks
@@ -0,0 +1,34 @@
+---
+# Decommissioning configuration
+# Defaults for the decommission role.
+# Archive each entry of data_paths before cleanup.
+backup_data: true
+# Archive config_paths into system_config.tar.gz.
+export_config: true
+# Stop application_services (and pause on webservers/loadbalancers) first.
+graceful_shutdown: true
+# NOTE(review): not referenced by this role's tasks/main.yml — confirm where it is consumed.
+cleanup_inventory: true
+# When true, the final task of the role powers the node down.
+auto_shutdown: false
+# Passed as the delay of that final task.
+shutdown_delay: 10
+
+# Services to stop gracefully
+application_services:
+  - nginx
+  - postgresql
+  - haproxy
+
+# Packages to remove
+application_packages:
+  - nginx
+  - postgresql
+  - haproxy
+  - prometheus
+
+# Directories to archive
+config_paths:
+  - /etc/
+  - /opt/application/
+
+data_paths:
+  - /var/www/html
+  - /var/lib/postgresql
+  - /var/lib/prometheus
+
+# Notification settings
+# Recipient of the completion mail; null means no recipient is configured.
+decommission_notification_email: null
@@ -0,0 +1,177 @@
+---
+# Sanity-check the inputs the destructive steps below rely on. The previous
+# condition (`backup_data or not backup_data`) was always true and
+# validated nothing.
+- name: Validate decommissioning requirements
+  ansible.builtin.assert:
+    that:
+      - application_services is iterable and application_services is not string
+      - application_packages is iterable and application_packages is not string
+      - config_paths is iterable and config_paths is not string
+      - data_paths is iterable and data_paths is not string
+      - shutdown_delay | int >= 0
+    fail_msg: "Invalid decommissioning configuration"
+
+# Preparation: probe the node, create the backup directory and open the
+# audit log before anything is stopped or removed.
+- name: Pre-decommissioning checks
+  block:
+    # Non-fatal probe of the local health endpoint on webservers; the result
+    # is registered but no later task in this file reads it.
+    - name: Check node health
+      ansible.builtin.uri:
+        url: http://localhost/health
+        method: GET
+        status_code: 200
+      register: health_check
+      failed_when: false
+      ignore_errors: true
+      when: "'webservers' in group_names"
+
+    # The directory name embeds ansible_date_time.iso8601; the same
+    # expression is reused by the backup, report and mail tasks below.
+    - name: Create decommissioning backup directory
+      ansible.builtin.file:
+        path: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}"
+        state: directory
+        mode: '0755'
+
+    # `state: touch` with explicit times reports a change on every run.
+    - name: Initialize decommissioning log
+      ansible.builtin.file:
+        path: "/var/log/decommission.log"
+        state: touch
+        mode: '0644'
+        modification_time: now
+        access_time: now
+
+    - name: Log decommissioning start
+      ansible.builtin.lineinfile:
+        path: "/var/log/decommission.log"
+        line: "{{ ansible_date_time.iso8601 }} - Starting decommissioning of {{ inventory_hostname }}"
+        state: present
+
+# Stop the application services; failures (e.g. a service that is not
+# installed on this node) are ignored.
+- name: Graceful application shutdown
+  block:
+    - name: Stop application services
+      ansible.builtin.service:
+        name: "{{ item }}"
+        state: stopped
+      loop: "{{ application_services }}"
+      failed_when: false
+      when: graceful_shutdown
+
+    # NOTE(review): this pause runs after the services were already stopped
+    # above — confirm whether the drain was meant to happen first.
+    - name: Wait for connections to drain
+      ansible.builtin.pause:
+        seconds: 30
+      when: graceful_shutdown and ("webservers" in group_names or "loadbalancers" in group_names)
+
+# Backups run immediately before the destructive cleanup, so a failing
+# archive must stop the play instead of being ignored. Data paths that do
+# not exist on this node (e.g. PostgreSQL data on a web server) are skipped
+# explicitly rather than by swallowing every error. Archives and their
+# directories are root-only because they contain a copy of /etc.
+- name: Export and backup data
+  block:
+    - name: Create config export directory
+      ansible.builtin.file:
+        path: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/config"
+        state: directory
+        mode: '0700'
+
+    - name: Archive system configuration
+      community.general.archive:
+        path: "{{ config_paths }}"
+        dest: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/config/system_config.tar.gz"
+        format: gz
+        mode: '0600'
+      when: export_config | bool
+
+    - name: Create data backup directory
+      ansible.builtin.file:
+        path: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/data"
+        state: directory
+        mode: '0700'
+      when: backup_data | bool
+
+    - name: Check which data paths exist
+      ansible.builtin.stat:
+        path: "{{ item }}"
+      loop: "{{ data_paths }}"
+      register: data_path_stats
+      when: backup_data | bool
+
+    # One archive per existing data path; the file name is the path with
+    # every "/" replaced by "_".
+    - name: Backup individual data paths
+      community.general.archive:
+        path: "{{ item.item }}"
+        dest: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/data/{{ item.item | regex_replace('/', '_') }}.tar.gz"
+        format: gz
+        mode: '0600'
+      loop: "{{ data_path_stats.results | default([]) }}"
+      loop_control:
+        label: "{{ item.item }}"
+      when:
+        - backup_data | bool
+        - item.stat is defined
+        - item.stat.exists
+
+- name: Update monitoring and load balancing
+  block:
+    - name: Remove from load balancer
+      ansible.builtin.debug:
+        msg: "Would remove {{ inventory_hostname }} from load balancer"
+      when: "'webservers' in group_names or 'databases' in group_names"
+
+    - name: Update monitoring alerts
+      ansible.builtin.debug:
+        msg: "Would update monitoring alerts for {{ inventory_hostname }}"
+      when: "'monitoring' not in group_names"
+
+# Best-effort removal of application data, packages, logs and SSH access.
+# Every step ignores failures so a partially deployed node can still be
+# cleaned.
+- name: Clean up application
+  block:
+    - name: Remove application directories
+      ansible.builtin.file:
+        path: "{{ item }}"
+        state: absent
+      loop:
+        - /opt/application
+        - /var/www/html
+        - /var/lib/postgresql
+        - /var/lib/prometheus
+      failed_when: false
+
+    - name: Remove application packages
+      ansible.builtin.apt:
+        name: "{{ item }}"
+        state: absent
+        purge: true
+      loop: "{{ application_packages }}"
+      failed_when: false
+
+    # Runs find directly (no shell): the old script began with
+    # `set -o pipefail`, which /bin/sh may reject, and had no pipeline to
+    # protect anyway. The decommission logs are excluded so the audit trail
+    # started earlier in this role is not truncated.
+    - name: Clean system logs
+      ansible.builtin.command:
+        cmd: >-
+          find /var/log -type f -name "*.log" -size +0
+          ! -name "decommission*.log"
+          -exec truncate -s 0 {} +
+      changed_when: false
+      failed_when: false
+
+    # The admin account name follows admin_user when that variable is in
+    # scope and falls back to the previously hard-coded value otherwise.
+    - name: Remove SSH credentials
+      ansible.builtin.file:
+        path: "{{ item }}"
+        state: absent
+      loop:
+        - /root/.ssh/authorized_keys
+        - /root/.ssh/known_hosts
+        - "/home/{{ admin_user | default('infra-admin') }}/.ssh/authorized_keys"
+      failed_when: false
+
+# Render the per-run report next to the audit log.
+- name: Generate decommissioning report
+  ansible.builtin.template:
+    src: decommission_report.j2
+    dest: "/var/log/decommission_report_{{ ansible_date_time.iso8601 }}.log"
+    mode: '0644'
+  vars:
+    backup_location: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}"
+
+# Only send mail when a recipient is actually configured. The role default
+# is null, which still counts as "defined", so the old `is defined` test let
+# the task run with no recipient. Delivery problems stay non-fatal.
+- name: Send decommissioning notification
+  community.general.mail:
+    host: localhost
+    port: 25
+    to: "{{ decommission_notification_email }}"
+    subject: "Node Decommissioned - {{ inventory_hostname }}"
+    body: |
+      Node {{ inventory_hostname }} has been successfully decommissioned.
+
+      Backup location: /var/backups/decommission-{{ ansible_date_time.iso8601 }}/
+      Services stopped: {{ application_services | join(', ') }}
+      Configuration exported: {{ export_config }}
+      Data backed up: {{ backup_data }}
+
+      See /var/log/decommission_report_{{ ansible_date_time.iso8601 }}.log for details
+  when: decommission_notification_email | default('', true) | length > 0
+  failed_when: false
+
+# Close the audit log and, when requested, power the node down.
+- name: Finalize decommissioning
+  block:
+    - name: Log decommissioning completion
+      ansible.builtin.lineinfile:
+        path: "/var/log/decommission.log"
+        line: "{{ ansible_date_time.iso8601 }} - Decommissioning completed for {{ inventory_hostname }}"
+        state: present
+
+    # Uses the shutdown module: the reboot module would bring the node back
+    # up and has no `delay` option. shutdown_delay is given in seconds.
+    # No async/poll: the module schedules the shutdown and returns.
+    - name: Perform system shutdown
+      community.general.shutdown:
+        msg: "System scheduled for shutdown after decommissioning"
+        delay: "{{ shutdown_delay }}"
+      when: auto_shutdown | bool
@@ -0,0 +1,13 @@
+Decommissioning Report
+======================
+Generated: {{ ansible_date_time.iso8601 }}
+Host: {{ inventory_hostname }}
+
+Status: COMPLETED
+Backup Location: {{ backup_location }}
+
+Configuration Exported: {{ export_config }}
+Data Backed Up: {{ backup_data }}
+Services Stopped: {{ application_services | join(', ') }}
+
+Log Location: /var/log/decommission.log
@@ -0,0 +1,58 @@
+# Hardening Role
+
+Apply security hardening to enterprise infrastructure nodes following CIS benchmarks.
+
+## Features
+
+- **CIS Levels**: Accepts `cis_level` 1 or 2; the CIS-specific task file (`cis_hardening.yml`) is currently a stub that only prints the selected level
+- **SSH Hardening**: Disable root login, password auth, set auth limits
+- **Firewall Configuration**: UFW with configurable rules
+- **Service Cleanup**: Disable unnecessary services and remove insecure packages
+- **Handlers**: SSH restarts via handlers
+
+## Role Variables
+
+See `defaults/main.yml` for all available variables.
+
+### Key Variables
+
+- `cis_level`: CIS hardening level (1 or 2)
+- `disable_root_login`: Disable root SSH login (default: true)
+- `secure_ssh_config`: Apply SSH security hardening (default: true)
+- `firewall_policy`: Firewall default policy (default: deny)
+- `ssh_max_auth_tries`: Maximum SSH authentication attempts (default: 3)
+- `ssh_client_alive_interval`: SSH client alive interval in seconds (default: 300)
+- `ssh_allowed_networks`: Networks allowed SSH access from
+
+### SSH Allowed Networks
+
+Default trusted networks:
+- 10.0.0.0/8 (Private Class A)
+- 172.16.0.0/12 (Private Class B)
+- 192.168.0.0/16 (Private Class C)
+
+## Usage
+
+```yaml
+- role: hardening
+  vars:
+    cis_level: 1
+    disable_root_login: true
+    ssh_allowed_networks:
+      - 10.0.0.0/8
+      - 203.0.113.0/24
+```
+
+## SSH Configuration Changes
+
+- Root login disabled
+- Password authentication disabled
+- Maximum auth tries: 3
+- Empty passwords prohibited
+- Client alive interval: 300 seconds
+- Client alive count max: 2
+
+## Tags
+
+- `hardening`: All hardening tasks
+- `security`: Security-related tasks
@@ -0,0 +1,35 @@
+---
+# Hardening configuration
+# Defaults for the hardening role.
+# Must be 1 or 2; the role asserts this before doing anything else.
+cis_level: 1
+# When true, the role sets "PermitRootLogin no" in sshd_config.
+disable_root_login: true
+# When true, the role sets "PasswordAuthentication no" in sshd_config.
+secure_ssh_config: true
+# Default policy handed to UFW when the firewall is enabled.
+firewall_policy: deny
+# NOTE(review): the next three settings are not referenced by this role's
+# tasks/main.yml or cis_hardening.yml — confirm where they are consumed.
+auditd_enabled: true
+selinux_mode: enforcing
+apparmor_enabled: true
+
+# SSH Hardening
+# Written to sshd_config as MaxAuthTries, ClientAliveInterval and
+# ClientAliveCountMax respectively.
+ssh_max_auth_tries: 3
+ssh_client_alive_interval: 300
+ssh_client_alive_count_max: 2
+
+# Firewall rules for SSH (trusted networks)
+# One UFW allow rule is created per network.
+ssh_allowed_networks:
+  - 10.0.0.0/8
+  - 172.16.0.0/12
+  - 192.168.0.0/16
+
+# Services to disable
+# Stopped and disabled; failures (e.g. service not installed) are ignored.
+unnecessary_services:
+  - cups
+  - avahi-daemon
+  - bluetooth
+  - nfs-server
+  - rpcbind
+
+# Packages to remove
+# Purged one by one; failures are ignored.
+unnecessary_packages:
+  - telnet
+  - rsh-client
+  - talk
+  - ntalk
@@ -0,0 +1,5 @@
+---
+- name: restart sshd
+  ansible.builtin.service:
+    name: sshd
+    state: restarted
@@ -0,0 +1,7 @@
+---
+# CIS Hardening Level 1 tasks (stub for future expansion)
+# https://www.cisecurity.org/cis-benchmarks/
+
+- name: Check CIS status
+  ansible.builtin.debug:
+    msg: "CIS Hardening Level {{ cis_level }} would be applied here"
@@ -0,0 +1,95 @@
+---
+# cis_level is cast to an integer in both places so a value supplied as text
+# (for example via `-e cis_level=2`) passes the membership test and does not
+# break the numeric comparison.
+- name: Validate hardening requirements
+  ansible.builtin.assert:
+    that:
+      - ansible_os_family == "Debian"
+      - cis_level | int in [1, 2]
+    fail_msg: "Unsupported configuration for hardening"
+
+- name: Apply CIS hardening tasks
+  ansible.builtin.include_tasks: cis_hardening.yml
+  when: cis_level | int >= 1
+
+# All sshd_config options go through one looped lineinfile task instead of
+# six copies. Each edit is validated with `sshd -t` against the candidate
+# file before it replaces the live config, so a bad value cannot leave the
+# host with an sshd that refuses to restart.
+# `enabled` carries the per-option switch: PermitRootLogin follows
+# disable_root_login, PasswordAuthentication follows secure_ssh_config, and
+# the remaining options are always applied.
+- name: Configure SSH hardening
+  ansible.builtin.lineinfile:
+    path: /etc/ssh/sshd_config
+    regexp: "^{{ item.option }}"
+    line: "{{ item.option }} {{ item.value }}"
+    state: present
+    validate: /usr/sbin/sshd -t -f %s
+  loop:
+    - option: PermitRootLogin
+      value: 'no'
+      enabled: "{{ disable_root_login }}"
+    - option: PasswordAuthentication
+      value: 'no'
+      enabled: "{{ secure_ssh_config }}"
+    - option: MaxAuthTries
+      value: "{{ ssh_max_auth_tries }}"
+      enabled: true
+    - option: PermitEmptyPasswords
+      value: 'no'
+      enabled: true
+    - option: ClientAliveInterval
+      value: "{{ ssh_client_alive_interval }}"
+      enabled: true
+    - option: ClientAliveCountMax
+      value: "{{ ssh_client_alive_count_max }}"
+      enabled: true
+  loop_control:
+    label: "{{ item.option }}"
+  when: item.enabled | bool
+  notify: restart sshd
+
+# The SSH allow rules are created before UFW is enabled so that switching
+# on a deny policy cannot cut off the connection Ansible is using. The port
+# follows ssh_port when that variable is in scope and falls back to 22.
+- name: Configure firewall rules
+  block:
+    - name: Allow SSH from trusted networks
+      community.general.ufw:
+        rule: allow
+        port: "{{ ssh_port | default(22) | string }}"
+        proto: tcp
+        from: "{{ item }}"
+      loop: "{{ ssh_allowed_networks }}"
+
+    - name: Enable firewall
+      community.general.ufw:
+        state: enabled
+        policy: "{{ firewall_policy }}"
+      when: firewall_policy is defined
+
+# Best-effort: a service that is not installed on the host is ignored.
+- name: Disable unnecessary services
+  ansible.builtin.service:
+    enabled: false
+    state: stopped
+    name: "{{ item }}"
+  loop: "{{ unnecessary_services }}"
+  failed_when: false
+
+# Packages are purged one at a time; a failure for one does not affect the rest.
+- name: Remove unnecessary packages
+  ansible.builtin.apt:
+    purge: true
+    state: absent
+    name: "{{ item }}"
+  loop: "{{ unnecessary_packages }}"
+  failed_when: false
@@ -0,0 +1,45 @@
+# Patching Role
+
+Apply security patches and OS updates to enterprise infrastructure nodes.
+
+## Features
+
+- **Idempotent**: Properly checks for changes with `changed_when`
+- **Patch Window**: Optional enforcement of patch time windows
+- **Pre-patch Backup**: Backs up package list before patching
+- **Smart Reboot**: Automatically detects if reboot is required
+- **Service Restart**: Restarts only necessary services after patching
+- **Health Checks**: Verifies services and runs health endpoint checks
+
+## Role Variables
+
+See `defaults/main.yml` for all available variables.
+
+### Key Variables
+
+- `patch_window_start`: Patch window start time (default: 02:00)
+- `patch_window_end`: Patch window end time (default: 04:00)
+- `enforce_patch_window`: Enforce patch time window (default: true)
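+- `patch_security_only`: When true the role runs `apt` with `upgrade: dist`; when false it runs `upgrade: full` (default: true). Neither mode restricts the upgrade to security repositories.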
+- `backup_before_patch`: Create backup before patching (default: true)
+- `reboot_if_required`: Auto-reboot if required (default: false)
+- `services_to_restart`: Services to restart after patching
+- `critical_services`: Critical services to verify after patching
+
+## Usage
+
+```yaml
+- role: patching
+  vars:
+    patch_security_only: true
+    enforce_patch_window: false
+    reboot_if_required: true
+```
+
+## Report
+
+Patch report is generated at: `/var/log/patch_report_<timestamp>.log`
+
+## Backup Location
+
+Pre-patch backups saved to: `/var/backups/pre-patch-<timestamp>/`
@@ -0,0 +1,20 @@
+---
+# Patching configuration
+# Defaults for the patching role.
+# Window bounds in 24-hour HH:MM form, checked against the host's gathered
+# clock facts when enforce_patch_window is true.
+patch_window_start: "02:00"
+patch_window_end: "04:00"
+enforce_patch_window: true
+# Selects apt `upgrade: dist` (true) or `upgrade: full` (false).
+patch_security_only: true
+# Save the dpkg selection list under /var/backups/pre-patch-<timestamp>/ first.
+backup_before_patch: true
+# Reboot automatically when /var/run/reboot-required exists after patching.
+reboot_if_required: false
+# Handed to the reboot task as its timeout value.
+reboot_timeout: 300
+
+# Services to restart after patching
+# Restarted by the role's handler when an upgrade task reports a change.
+services_to_restart:
+  - sshd
+  - fail2ban
+
+# Services to verify after patching
+# Ensured started and enabled after the upgrade; failures are ignored.
+critical_services:
+  - systemd-journald
+  - systemd-logind
+  - cron
@@ -0,0 +1,6 @@
+---
+- name: restart patching services
+  ansible.builtin.service:
+    name: "{{ item }}"
+    state: restarted
+  loop: "{{ services_to_restart }}"
@@ -0,0 +1,105 @@
+---
+# The window check works in minutes since midnight so the minute part of
+# patch_window_start/patch_window_end is honoured (the old check compared
+# hours only). A window whose start is later than its end is treated as
+# crossing midnight, e.g. 22:00-02:00. A bound without ":MM" counts as ":00".
+- name: Validate patch window
+  when: enforce_patch_window | bool
+  block:
+    - name: Check current time against patch window
+      vars:
+        pw_now: "{{ (ansible_date_time.hour | int) * 60 + (ansible_date_time.minute | int) }}"
+        pw_start: "{{ (patch_window_start.split(':')[0] | int) * 60 + (patch_window_start.split(':')[1] | default(0) | int) }}"
+        pw_end: "{{ (patch_window_end.split(':')[0] | int) * 60 + (patch_window_end.split(':')[1] | default(0) | int) }}"
+      ansible.builtin.assert:
+        that:
+          - >-
+            (pw_start | int <= pw_end | int
+             and pw_start | int <= pw_now | int
+             and pw_now | int < pw_end | int)
+            or
+            (pw_start | int > pw_end | int
+             and (pw_now | int >= pw_start | int or pw_now | int < pw_end | int))
+        fail_msg: |
+          Current time {{ ansible_date_time.hour }}:{{ ansible_date_time.minute }} is outside patch window {{ patch_window_start }}-{{ patch_window_end }}
+
+# Save the dpkg selection list so the pre-patch package state can be
+# reconstructed later.
+- name: Create pre-patch backup
+  when: backup_before_patch | bool
+  block:
+    - name: Create backup directory
+      ansible.builtin.file:
+        path: "/var/backups/pre-patch-{{ ansible_date_time.iso8601 }}"
+        state: directory
+        mode: '0755'
+
+    # The shell is only needed for the redirect. `set -o pipefail` was
+    # dropped: there is no pipeline here, and the default /bin/sh may reject
+    # the option, which would fail this task.
+    - name: Capture current package list
+      ansible.builtin.shell: |
+        dpkg --get-selections > "/var/backups/pre-patch-{{ ansible_date_time.iso8601 }}/packages.list"
+      changed_when: false
+
+# Refresh the index first so the count below reflects current repository
+# state instead of a stale cache.
+- name: Update package cache
+  ansible.builtin.apt:
+    update_cache: true
+    cache_valid_time: 300
+  changed_when: false
+
+# Counts upgradable packages. The pipeline needs pipefail, so the task runs
+# under bash explicitly; the default /bin/sh may not support that option.
+# A non-zero exit (grep finds nothing when no updates exist) is tolerated.
+- name: Check for available updates
+  ansible.builtin.shell: |
+    set -o pipefail
+    apt list --upgradable 2>/dev/null | grep -v "Listing..." | wc -l
+  args:
+    executable: /bin/bash
+  register: updates_available_count
+  changed_when: false
+  failed_when: false
+
+- name: Check if reboot required before patching
+  ansible.builtin.stat:
+    path: /var/run/reboot-required
+  register: reboot_required_before
+  changed_when: false
+
+# One task covers both modes: patch_security_only selects `upgrade: dist`,
+# otherwise `upgrade: full` is used, exactly as the two former tasks did.
+# With two tasks registering the same variable, the skipped one overwrote
+# apt_update_result, so the patch report showed "No" updates whenever the
+# security-only path had run.
+- name: Apply package updates
+  ansible.builtin.apt:
+    upgrade: "{{ (patch_security_only | bool) | ternary('dist', 'full') }}"
+    update_cache: true
+  register: apt_update_result
+  notify: restart patching services
+
+# Debian-family hosts create this marker file when a reboot is pending.
+- name: Check if reboot required after patching
+  ansible.builtin.stat:
+    path: /var/run/reboot-required
+  changed_when: false
+  register: reboot_required_after
+
+# Best-effort: a critical service that cannot be started does not fail the play.
+- name: Verify critical services are running
+  ansible.builtin.service:
+    enabled: true
+    state: started
+    name: "{{ item }}"
+  loop: "{{ critical_services }}"
+  failed_when: false
+
+# Non-fatal probe of the local health endpoint, webservers only.
+- name: Run post-patch health checks
+  when: "'webservers' in group_names"
+  ansible.builtin.uri:
+    method: GET
+    url: http://localhost/health
+    status_code: 200
+  register: health_check
+  ignore_errors: true
+  failed_when: false
+
+# Expose the post-patch marker check as a fact for the play's summary task.
+- name: Set reboot required flag
+  ansible.builtin.set_fact:
+    reboot_required: "{{ reboot_required_after.stat.exists | default(false) }}"
+
+# The reboot module's option is `reboot_timeout`; `timeout` is not one of
+# its options and is rejected by the module.
+- name: Perform system reboot if required
+  ansible.builtin.reboot:
+    msg: "Rebooting after security patches"
+    reboot_timeout: "{{ reboot_timeout }}"
+  when:
+    - reboot_required | bool
+    - reboot_if_required | bool
+
+# updates_applied_count carries "Yes"/"No" (whether the upgrade task
+# reported a change), not a number.
+- name: Generate patching report
+  ansible.builtin.template:
+    src: patch_report.j2
+    dest: /var/log/patch_report_{{ ansible_date_time.iso8601 }}.log
+    mode: '0644'
+  vars:
+    updates_applied_count: "{{ (apt_update_result.changed | default(false)) | ternary('Yes', 'No') }}"
+    reboot_required_flag: "{{ reboot_required }}"
@@ -0,0 +1,10 @@
+Patching Report
+===============
+Generated: {{ ansible_date_time.iso8601 }}
+Host: {{ inventory_hostname }}
+
+Updates Applied: {{ updates_applied_count }}
+Reboot Required: {{ reboot_required_flag }}
+Services Restarted: {{ services_to_restart | join(', ') }}
+
+Backup Location: /var/backups/pre-patch-{{ ansible_date_time.iso8601 }}/
@@ -0,0 +1,21 @@
+# Scenario: Simulate Failure and Patch
+
+## Description
+
+Validate that a service-level failure can be detected, recovered, and followed by a controlled patch workflow. This mirrors a maintenance window where a degraded node is stabilized before package updates are applied.
+
+## Commands
+
+```bash
+cd professional-infra/linux-operations-automation
+./scripts/simulate_failure.sh service 30 web
+ansible-playbook -i inventory/hosts.ini playbooks/patch.yml
+ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml --check
+```
+
+## Expected Result
+
+- The simulation records a temporary service failure.
+- The service is restored after cleanup.
+- The patch playbook completes without unreachable hosts.
+- Hardening check mode reports no destructive changes.
@@ -0,0 +1,116 @@
+---
+# Scaling scenario: reads the load averages, decides whether to scale up or
+# down, delegates the work to the scale_up / scale_down / load_balancer /
+# monitoring roles, then reports the outcome.
+#
+# Fixes relative to the first draft:
+# - the load averages are taken from the "load average:" part of `uptime`
+#   (splitting the whole line on commas picked up the uptime fields);
+# - load facts are cast to float before comparison or arithmetic, since
+#   set_fact stores templated values as text;
+# - group membership is a real condition; wrapped in quotes inside the
+#   expression it was a non-empty string literal and therefore always true;
+# - health_check.status is read with a default so hosts that skipped the
+#   health check do not fail on an undefined attribute.
+- name: Enterprise Scaling Event Scenario
+  hosts: all
+  become: true
+  gather_facts: true
+  vars:
+    scaling_threshold: 80
+    cooldown_period: 300
+    max_scale_up: 5
+    min_instances: 2
+
+  pre_tasks:
+    - name: Log scenario start
+      ansible.builtin.lineinfile:
+        path: "/var/log/scaling_scenario.log"
+        line: "{{ ansible_date_time.iso8601 }} - Starting scaling event scenario"
+        create: true
+        mode: '0644'
+
+    - name: Check current load
+      ansible.builtin.command: uptime
+      register: system_load
+      changed_when: false
+
+    # Keep only the text after "load average" and pick the three decimal
+    # numbers; a decimal comma (locale dependent) is normalised to a dot.
+    - name: Parse load average
+      vars:
+        load_values: "{{ system_load.stdout.split('load average')[-1] | regex_findall('[0-9]+[.,][0-9]+') }}"
+      ansible.builtin.set_fact:
+        load_1min: "{{ load_values[0] | replace(',', '.') | float }}"
+        load_5min: "{{ load_values[1] | replace(',', '.') | float }}"
+        load_15min: "{{ load_values[2] | replace(',', '.') | float }}"
+
+  tasks:
+    - name: Evaluate scaling conditions
+      ansible.builtin.set_fact:
+        scale_up_needed: "{{ (load_5min | float) > scaling_threshold }}"
+        scale_down_needed: "{{ (load_5min | float) < (scaling_threshold * 0.3) }}"
+
+    - name: Scale up web servers
+      ansible.builtin.include_role:
+        name: scale_up
+        tasks_from: web_servers
+      vars:
+        scale_count: "{{ [max_scale_up, ((load_5min | float) / 10) | int] | min }}"
+      when:
+        - scale_up_needed | bool
+        - "'webservers' in group_names"
+
+    - name: Scale up database servers
+      ansible.builtin.include_role:
+        name: scale_up
+        tasks_from: database_servers
+      vars:
+        scale_count: "{{ [2, ((load_5min | float) / 20) | int] | min }}"
+      when:
+        - scale_up_needed | bool
+        - "'databases' in group_names"
+
+    - name: Update load balancer configuration
+      ansible.builtin.include_role:
+        name: load_balancer
+        tasks_from: update_backends
+      when: scale_up_needed | bool
+
+    # host_index is the first number in the inventory hostname (0 when the
+    # name contains none); only hosts numbered above min_instances scale down.
+    - name: Scale down web servers
+      ansible.builtin.include_role:
+        name: scale_down
+        tasks_from: web_servers
+      vars:
+        host_index: "{{ inventory_hostname | regex_findall('[0-9]+') | first | default(0) | int }}"
+        scale_count: "{{ [(host_index | int) - min_instances, 1] | max }}"
+      when:
+        - scale_down_needed | bool
+        - "'webservers' in group_names"
+        - (host_index | int) > min_instances
+
+    - name: Wait for cooldown period
+      ansible.builtin.pause:
+        seconds: "{{ cooldown_period }}"
+      when: (scale_up_needed | bool) or (scale_down_needed | bool)
+
+    - name: Verify scaling results
+      ansible.builtin.uri:
+        url: http://localhost/health
+        method: GET
+        status_code: 200
+      register: health_check
+      until: health_check.status == 200
+      retries: 5
+      delay: 10
+      when: "'webservers' in group_names"
+
+    - name: Update monitoring thresholds
+      ansible.builtin.include_role:
+        name: monitoring
+        tasks_from: update_alerts
+      vars:
+        new_threshold: "{{ scaling_threshold + 10 }}"
+
+    # Best-effort mail, sent only when scaling_notification_email is set.
+    - name: Send scaling notification
+      community.general.mail:
+        to: "{{ scaling_notification_email | default('infra-team@company.com') }}"
+        subject: "Infrastructure Scaling Event - {{ inventory_hostname }}"
+        body: |
+          Scaling event completed on {{ inventory_hostname }}
+
+          Load averages: {{ load_1min }}, {{ load_5min }}, {{ load_15min }}
+          Action taken: {{ 'Scale Up' if (scale_up_needed | bool) else 'Scale Down' if (scale_down_needed | bool) else 'No Action' }}
+          Health check: {{ 'PASSED' if (health_check.status | default(0)) == 200 else 'FAILED' }}
+
+          See /var/log/scaling_scenario.log for details
+      when: scaling_notification_email is defined
+      ignore_errors: true
+
+  post_tasks:
+    - name: Generate scaling scenario report
+      ansible.builtin.template:
+        src: templates/scaling_scenario_report.j2
+        dest: "/var/log/scaling_scenario_report_{{ ansible_date_time.iso8601 }}.log"
+        mode: '0644'
+      vars:
+        scenario_outcome: "{{ 'SUCCESS' if (health_check.status | default(0)) == 200 else 'WARNING' }}"
+        load_metrics: "{{ load_1min }}, {{ load_5min }}, {{ load_15min }}"
+
+    - name: Log scenario completion
+      ansible.builtin.lineinfile:
+        path: "/var/log/scaling_scenario.log"
+        line: "{{ ansible_date_time.iso8601 }} - Scaling event scenario completed"
@@ -0,0 +1,388 @@
+#!/bin/bash
+
+# Enterprise Infrastructure Failure Simulation Script
+# Simulates various types of infrastructure failures for testing
+
+set -euo pipefail
+
+# Configuration
+DOCKER_COMPOSE_FILE="docker-compose.yml"
+INVENTORY_FILE="inventory/hosts.ini"
+LOG_FILE="logs/failure_simulation.log"
+
+# Default values
+FAILURE_TYPE="${1:-network}"
+DURATION="${2:-60}"
+TARGET_NODES="${3:-all}"
+INTENSITY="${INTENSITY:-medium}"
+
+# Print a timestamped message to stdout and append the same line to $LOG_FILE
+log() {
+    local message="$*"
+
+    printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$message" | tee -a "$LOG_FILE"
+}
+
+# Log a fatal error, roll back any injected failures, then exit with status 1
+error_exit() {
+    local reason="$1"
+
+    log "ERROR: ${reason}"
+    # Undo whatever has already been injected before terminating
+    cleanup_failure
+    exit 1
+}
+
+# Validate inputs
+validate_inputs() {
+    case "$FAILURE_TYPE" in
+        network|disk|service|node|cpu|memory) ;;
+        *) error_exit "Invalid failure type: $FAILURE_TYPE. Must be network, disk, service, node, cpu, or memory" ;;
+    esac
+
+    if ! [[ "$DURATION" =~ ^[0-9]+$ ]] || [ "$DURATION" -lt 1 ]; then
+        error_exit "Invalid duration: $DURATION. Must be a positive integer (seconds)"
+    fi
+
+    case "$INTENSITY" in
+        low|medium|high|critical) ;;
+        *) error_exit "Invalid intensity: $INTENSITY. Must be low, medium, high, or critical" ;;
+    esac
+}
+
+# Print the Compose service names the simulation should act on.
+# Any TARGET_NODES value other than "all" is echoed back unchanged; "all" expands
+# to a fixed list in simulation mode and to the Compose service list otherwise.
+get_target_containers() {
+    if [ "$TARGET_NODES" != "all" ]; then
+        echo "$TARGET_NODES"
+        return
+    fi
+
+    if [ "${SIMULATION_MODE:-false}" = true ]; then
+        echo "web db lb"
+        return
+    fi
+
+    # Drop a possible header line; an empty result is not an error
+    docker compose ps --services | grep -v "^NAME$" || true
+}
+
+# Network failure simulation
+# Detaches every target container from the network named in its
+# HostConfig.NetworkMode. One "<container id>:<network>" line per container is
+# appended to /tmp/network_failure_state so cleanup_failure can reconnect it.
+simulate_network_failure() {
+    local containers container container_ids cid network
+
+    containers=$(get_target_containers)
+    log "Simulating network failure on containers: $containers"
+
+    if [ "${SIMULATION_MODE:-false}" = true ]; then
+        log "SIMULATION_MODE=true: skipping Docker network changes"
+        return
+    fi
+
+    for container in $containers; do
+        container_ids=$(docker compose ps -q "$container" 2>/dev/null || true)
+
+        for cid in $container_ids; do
+            # Look the network up once; without a name there is nothing to
+            # disconnect from and nothing that could be restored later.
+            network=$(docker inspect "$cid" --format '{{.HostConfig.NetworkMode}}' 2>/dev/null || true)
+            if [ -z "$network" ]; then
+                log "WARNING: could not determine network for container $cid, skipping"
+                continue
+            fi
+
+            log "Disconnecting network for container $cid"
+
+            # Record the restore information BEFORE disconnecting, so an
+            # interrupted run can still be cleaned up.
+            echo "$cid:$network" >> /tmp/network_failure_state
+
+            # Disconnect from network
+            docker network disconnect "$network" "$cid" 2>/dev/null || true
+        done
+    done
+}
+
+# Disk failure simulation
+simulate_disk_failure() {
+    local containers=$(get_target_containers)
+    log "Simulating disk space exhaustion on containers: $containers"
+
+    if [ "${SIMULATION_MODE:-false}" = true ]; then
+        log "SIMULATION_MODE=true: skipping container disk writes"
+        return
+    fi
+
+    for container in $containers; do
+        local container_ids=$(docker compose ps -q "$container" 2>/dev/null || true)
+
+        for cid in $container_ids; do
+            if [ -n "$cid" ]; then
+                log "Filling disk space in container $cid"
+
+                # Create a large file to consume disk space
+                local fill_size_mb=100
+                case "$INTENSITY" in
+                    low) fill_size_mb=50 ;;
+                    medium) fill_size_mb=100 ;;
+                    high) fill_size_mb=500 ;;
+                    critical) fill_size_mb=1024 ;;
+                esac
+
+                docker exec "$cid" bash -c "dd if=/dev/zero of=/tmp/disk_fill bs=1M count=${fill_size_mb}" 2>/dev/null || true
+                echo "$cid:disk_fill" >> /tmp/disk_failure_state
+            fi
+        done
+    done
+}
+
+# Service failure simulation
+simulate_service_failure() {
+    local containers=$(get_target_containers)
+    log "Simulating service failures on containers: $containers"
+
+    if [ "${SIMULATION_MODE:-false}" = true ]; then
+        for container in $containers; do
+            log "SIMULATION_MODE=true: would stop services in $container"
+        done
+        return
+    fi
+
+    for container in $containers; do
+        local container_ids=$(docker compose ps -q "$container" 2>/dev/null || true)
+
+        for cid in $container_ids; do
+            if [ -n "$cid" ]; then
+                log "Stopping services in container $cid"
+
+                # Stop common services
+                docker exec "$cid" systemctl stop nginx 2>/dev/null || true
+                docker exec "$cid" systemctl stop postgresql 2>/dev/null || true
+                docker exec "$cid" systemctl stop haproxy 2>/dev/null || true
+
+                echo "$cid:services" >> /tmp/service_failure_state
+            fi
+        done
+    done
+}
+
+# Node failure simulation
+# Pauses each target container and records it in /tmp/node_failure_state so
+# cleanup_failure can unpause it.
+simulate_node_failure() {
+    local targets target ids id
+
+    targets=$(get_target_containers)
+    log "Simulating complete node failures on containers: $targets"
+
+    if [ "${SIMULATION_MODE:-false}" = true ]; then
+        log "SIMULATION_MODE=true: skipping container pause"
+        return
+    fi
+
+    for target in $targets; do
+        ids=$(docker compose ps -q "$target" 2>/dev/null || true)
+
+        for id in $ids; do
+            [ -n "$id" ] || continue
+
+            log "Stopping container $id (node failure)"
+            # Not guarded: a failing pause aborts the script (set -e)
+            docker pause "$id"
+            echo "$id:paused" >> /tmp/node_failure_state
+        done
+    done
+}
+
+# CPU stress simulation
+# Starts a detached busy loop in each target container and records
+# "<container id>:cpu_stress:<pid>" in /tmp/cpu_failure_state.
+simulate_cpu_failure() {
+    local targets target ids id stress_pid
+
+    targets=$(get_target_containers)
+    log "Simulating CPU stress on containers: $targets"
+
+    if [ "${SIMULATION_MODE:-false}" = true ]; then
+        log "SIMULATION_MODE=true: skipping CPU stress"
+        return
+    fi
+
+    for target in $targets; do
+        ids=$(docker compose ps -q "$target" 2>/dev/null || true)
+
+        for id in $ids; do
+            [ -n "$id" ] || continue
+
+            log "Starting CPU stress in container $id"
+
+            # Start CPU stress process
+            docker exec -d "$id" bash -c "while true; do :; done" 2>/dev/null || true
+
+            # Find the loop's PID via ps; it stays empty if the lookup fails
+            stress_pid=$(docker exec "$id" ps aux | grep "while true" | grep -v grep | awk '{print $2}' | head -1) || true
+            echo "$id:cpu_stress:$stress_pid" >> /tmp/cpu_failure_state
+        done
+    done
+}
+
+# Memory stress simulation
+# Starts a detached `tail /dev/zero` in each target container and records
+# "<container id>:memory_stress:<pid>" in /tmp/memory_failure_state.
+simulate_memory_failure() {
+    local targets target ids id stress_pid
+
+    targets=$(get_target_containers)
+    log "Simulating memory exhaustion on containers: $targets"
+
+    if [ "${SIMULATION_MODE:-false}" = true ]; then
+        log "SIMULATION_MODE=true: skipping memory stress"
+        return
+    fi
+
+    for target in $targets; do
+        ids=$(docker compose ps -q "$target" 2>/dev/null || true)
+
+        for id in $ids; do
+            [ -n "$id" ] || continue
+
+            log "Starting memory stress in container $id"
+
+            # Start memory stress process
+            docker exec -d "$id" bash -c "tail /dev/zero" 2>/dev/null || true
+
+            # Find the process PID via ps; it stays empty if the lookup fails
+            stress_pid=$(docker exec "$id" ps aux | grep "tail /dev/zero" | grep -v grep | awk '{print $2}' | head -1) || true
+            echo "$id:memory_stress:$stress_pid" >> /tmp/memory_failure_state
+        done
+    done
+}
+
+# Inject failure
+inject_failure() {
+    case "$FAILURE_TYPE" in
+        network) simulate_network_failure ;;
+        disk) simulate_disk_failure ;;
+        service) simulate_service_failure ;;
+        node) simulate_node_failure ;;
+        cpu) simulate_cpu_failure ;;
+        memory) simulate_memory_failure ;;
+    esac
+}
+
+# Undo every injected failure that left a state file under /tmp.
+# Each state file is replayed line by line and then deleted; every Docker call is
+# best effort, so a container that has vanished does not stop the cleanup.
+cleanup_failure() {
+    local net_state=/tmp/network_failure_state
+    local disk_state=/tmp/disk_failure_state
+    local svc_state=/tmp/service_failure_state
+    local node_state=/tmp/node_failure_state
+    local stress_state target_id net_name stress_pid svc
+
+    log "Cleaning up failure simulation"
+
+    # Re-attach containers to the network recorded at injection time
+    if [[ -f "$net_state" ]]; then
+        while IFS=: read -r target_id net_name; do
+            docker network connect "$net_name" "$target_id" 2>/dev/null || true
+        done < "$net_state"
+        rm -f "$net_state"
+    fi
+
+    # Remove the disk filler files
+    if [[ -f "$disk_state" ]]; then
+        while IFS=: read -r target_id _; do
+            docker exec "$target_id" rm -f /tmp/disk_fill 2>/dev/null || true
+        done < "$disk_state"
+        rm -f "$disk_state"
+    fi
+
+    # Bring the stopped services back
+    if [[ -f "$svc_state" ]]; then
+        while IFS=: read -r target_id _; do
+            for svc in nginx postgresql haproxy; do
+                docker exec "$target_id" systemctl start "$svc" 2>/dev/null || true
+            done
+        done < "$svc_state"
+        rm -f "$svc_state"
+    fi
+
+    # Resume paused containers
+    if [[ -f "$node_state" ]]; then
+        while IFS=: read -r target_id _; do
+            docker unpause "$target_id" 2>/dev/null || true
+        done < "$node_state"
+        rm -f "$node_state"
+    fi
+
+    # Kill the CPU and memory stress processes (third field is the PID)
+    for stress_state in /tmp/cpu_failure_state /tmp/memory_failure_state; do
+        if [[ -f "$stress_state" ]]; then
+            while IFS=: read -r target_id _ stress_pid; do
+                docker exec "$target_id" kill -9 "$stress_pid" 2>/dev/null || true
+            done < "$stress_state"
+            rm -f "$stress_state"
+        fi
+    done
+}
+
+# Monitor failure
+monitor_failure() {
+    local end_time=$(( $(date +%s) + DURATION ))
+
+    log "Monitoring failure for $DURATION seconds"
+
+    while [ $(date +%s) -lt $end_time ]; do
+        # Check container status
+    if [ "${SIMULATION_MODE:-false}" = true ]; then
+        log "SIMULATION_MODE=true: validation simulated"
+        return
+    fi
+
+    if ! docker compose ps | grep -q "Up\|Paused"; then
+            log "WARNING: All containers are down"
+        fi
+
+        # Log system metrics
+        log "System status: $(docker stats --no-stream --format 'table {{.Container}}\t{{.CPUPerc}}\t{{.MemPerc}}' | tail -n +2)"
+
+        sleep 10
+    done
+}
+
+# Generate failure report
+generate_report() {
+    local report_file="reports/failure_simulation_$(date +%Y%m%d_%H%M%S).txt"
+
+    cat > "$report_file" << EOF
+Failure Simulation Report
+========================
+
+Timestamp: $(date)
+Failure Type: $FAILURE_TYPE
+Duration: $DURATION seconds
+Target Nodes: $TARGET_NODES
+Intensity: $INTENSITY
+
+Pre-failure Status:
+$(docker compose ps 2>/dev/null || echo "Docker Compose not running")
+
+Post-failure Status:
+$(docker compose ps 2>/dev/null || echo "Docker Compose not running")
+
+Log File: $LOG_FILE
+EOF
+
+    log "Failure simulation report generated: $report_file"
+}
+
+# Main execution
+main() {
+    log "Starting failure simulation: $FAILURE_TYPE for $DURATION seconds"
+
+    validate_inputs
+
+    # Inject failure
+    inject_failure
+
+    # Monitor during failure
+    monitor_failure
+
+    # Cleanup
+    cleanup_failure
+
+    # Generate report
+    generate_report
+
+    log "Failure simulation completed successfully"
+}
+
+# Trap for cleanup on script exit
+trap cleanup_failure EXIT
+
+# Initialize logging
+mkdir -p logs reports
+
+# Run main function
+main "$@"
@@ -0,0 +1,229 @@
+#!/bin/bash
+
+# Enterprise Infrastructure Scaling Simulation Script
+# Simulates scaling operations for infrastructure nodes
+
+set -euo pipefail
+
+# Configuration
+DOCKER_COMPOSE_FILE="docker-compose.yml"
+INVENTORY_FILE="inventory/hosts.ini"
+LOG_FILE="logs/scaling_simulation.log"
+
+# Default values
+DIRECTION="${1:-up}"
+COUNT="${2:-1}"
+NODE_TYPE="${3:-web}"
+SIMULATION_MODE="${SIMULATION_MODE:-false}"
+
+# Emit "<timestamp> - <message>" on stdout and append it to $LOG_FILE
+log() {
+    local now
+    now="$(date '+%Y-%m-%d %H:%M:%S')" || true
+
+    printf '%s - %s\n' "$now" "$*" | tee -a "$LOG_FILE"
+}
+
+# Log a fatal error and terminate the script with status 1
+error_exit() {
+    local reason="$1"
+
+    log "ERROR: ${reason}"
+    exit 1
+}
+
+# Validate inputs
+validate_inputs() {
+    if [[ "$DIRECTION" != "up" && "$DIRECTION" != "down" ]]; then
+        error_exit "Invalid direction: $DIRECTION. Must be 'up' or 'down'"
+    fi
+
+    if ! [[ "$COUNT" =~ ^[0-9]+$ ]] || [ "$COUNT" -lt 1 ]; then
+        error_exit "Invalid count: $COUNT. Must be a positive integer"
+    fi
+
+    case "$NODE_TYPE" in
+        web|db|lb|monitor) ;;
+        *) error_exit "Invalid node type: $NODE_TYPE. Must be web, db, lb, or monitor" ;;
+    esac
+}
+
+# Get current node count
+get_current_count() {
+    local type="$1"
+    if [ "$SIMULATION_MODE" = true ]; then
+        case "$type" in
+            web) echo 3 ;;
+            db) echo 2 ;;
+            lb|monitor) echo 1 ;;
+        esac
+        return
+    fi
+
+    case "$type" in
+        web) docker compose ps web | grep -c "Up" ;;
+        db) docker compose ps db | grep -c "Up" ;;
+        lb) docker compose ps lb | grep -c "Up" ;;
+        monitor) docker compose ps monitor | grep -c "Up" ;;
+    esac
+}
+
+# Scale up infrastructure
+scale_up() {
+    local type="$1"
+    local count="$2"
+
+    log "Scaling up $count $type nodes"
+
+    if [ "$SIMULATION_MODE" = true ]; then
+        log "SIMULATION_MODE=true: skipping Docker Compose mutation and Ansible provisioning"
+        update_inventory "$type" "$count" "add"
+        log "Successfully simulated scale up of $count $type nodes"
+        return
+    fi
+
+    docker compose -f "$DOCKER_COMPOSE_FILE" up -d --scale "${type}=${count}"
+
+    # Wait for containers to be ready
+    log "Waiting for containers to be ready..."
+    sleep 30
+
+    # Update inventory
+    update_inventory "$type" "$count" "add"
+
+    # Run provisioning playbook on new nodes
+    if [ "$SIMULATION_MODE" = false ]; then
+        ansible-playbook -i "$INVENTORY_FILE" playbooks/provision.yml --limit "${type}*"
+    fi
+
+    log "Successfully scaled up $count $type nodes"
+}
+
+# Scale down infrastructure
+# Removes $2 nodes of type $1 after checking that at least that many are
+# running. In simulation mode only the inventory update is logged; otherwise the
+# first matching "Up" containers are decommissioned, stopped and removed.
+scale_down() {
+    local node_kind="$1"
+    local remove_count="$2"
+    local running victims victim
+
+    running=$(get_current_count "$node_kind") || true
+    if [ "$running" -lt "$remove_count" ]; then
+        error_exit "Cannot scale down $remove_count nodes. Only $running $node_kind nodes currently running"
+    fi
+
+    log "Scaling down $remove_count $node_kind nodes"
+
+    if [ "$SIMULATION_MODE" = true ]; then
+        log "SIMULATION_MODE=true: skipping Docker Compose mutation and Ansible decommissioning"
+        update_inventory "$node_kind" "$remove_count" "remove"
+        log "Successfully simulated scale down of $remove_count $node_kind nodes"
+        return
+    fi
+
+    # Take the first $remove_count "Up" entries listed by Compose
+    victims=$(docker compose ps "$node_kind" | grep "Up" | head -n "$remove_count" | awk '{print $1}') || true
+
+    # Decommission, stop and delete each selected container
+    for victim in $victims; do
+        if [ "$SIMULATION_MODE" = false ]; then
+            ansible-playbook -i "$INVENTORY_FILE" playbooks/decommission.yml --limit "$victim"
+        fi
+        docker stop "$victim"
+        docker rm "$victim"
+    done
+
+    # Update inventory
+    update_inventory "$node_kind" "$remove_count" "remove"
+
+    log "Successfully scaled down $remove_count $node_kind nodes"
+}
+
+# Update Ansible inventory
+update_inventory() {
+    local type="$1"
+    local count="$2"
+    local action="$3"
+
+    log "Updating inventory for $action $count $type nodes"
+
+    # This would be more complex in a real implementation
+    # For simulation, we'll just log the action
+    case "$action" in
+        add)
+            log "Added $count $type nodes to inventory"
+            ;;
+        remove)
+            log "Removed $count $type nodes from inventory"
+            ;;
+    esac
+}
+
+# Health check after scaling
+health_check() {
+    log "Running health checks after scaling"
+
+    # Check container status
+    if [ "$SIMULATION_MODE" = true ]; then
+        log "SIMULATION_MODE=true: health checks simulated"
+        return
+    fi
+
+    if ! docker compose ps | grep -q "Up"; then
+        error_exit "Some containers failed to start"
+    fi
+
+    # Ansible ping check
+    if [ "$SIMULATION_MODE" = false ]; then
+        if ! ansible -i "$INVENTORY_FILE" all -m ping >/dev/null 2>&1; then
+            log "WARNING: Some nodes failed Ansible ping check"
+        fi
+    fi
+
+    log "Health checks completed"
+}
+
+# Generate scaling report
+# Writes reports/scaling_report_<timestamp>.txt with the run parameters, the
+# current Compose status and the Ansible host list.
+generate_report() {
+    local out_path="reports/scaling_report_$(date +%Y%m%d_%H%M%S).txt"
+    local generated_at compose_state inventory_state
+
+    # Gather every dynamic value first, then render the report in one go
+    generated_at=$(date) || true
+    compose_state=$(docker compose ps 2>/dev/null || echo "Docker Compose not running")
+    inventory_state=$(ansible -i "$INVENTORY_FILE" --list-hosts all 2>/dev/null || echo "Ansible inventory check failed")
+
+    cat > "$out_path" << EOF
+Scaling Simulation Report
+========================
+
+Timestamp: $generated_at
+Direction: $DIRECTION
+Node Type: $NODE_TYPE
+Count: $COUNT
+Simulation Mode: $SIMULATION_MODE
+
+Current Status:
+$compose_state
+
+Inventory Status:
+$inventory_state
+
+Log File: $LOG_FILE
+EOF
+
+    log "Scaling report generated: $out_path"
+}
+
+# Main execution
+main() {
+    log "Starting scaling simulation: $DIRECTION $COUNT $NODE_TYPE nodes"
+
+    validate_inputs
+
+    case "$DIRECTION" in
+        up)
+            scale_up "$NODE_TYPE" "$COUNT"
+            ;;
+        down)
+            scale_down "$NODE_TYPE" "$COUNT"
+            ;;
+    esac
+
+    health_check
+    generate_report
+
+    log "Scaling simulation completed successfully"
+}
+
+# Initialize logging
+mkdir -p logs reports
+
+# Run main function
+main "$@"