Initial CV-aligned infrastructure portfolio
ci / validate (push) Failing after 1m8s

Rework portfolio around Linux operations, Zabbix monitoring, migration validation, and ELK/Grafana log observability.

Add AAP-style LVM resize workflow, Zabbix server/proxy/agent automation assets, Linux/AIX monitoring templates, and updated validation CI.
This commit is contained in:
Mateusz Suski
2026-05-04 17:37:24 +00:00
commit 35e6b139fc
114 changed files with 6422 additions and 0 deletions
@@ -0,0 +1,14 @@
---
# ansible-lint configuration.
# skip_list silences individual rules; exclude_paths removes whole trees from linting.
skip_list:
  - 'role-name'
  - 'name[casing]'
  # Line-length findings are reported under the yaml rule with this tag.
  - 'yaml[line-length]'
exclude_paths:
  - .git
  - .gitea
  # Excluding molecule/ also covers molecule/default/tests/.
  - molecule/
  - scenarios/
@@ -0,0 +1,95 @@
# Linux Operations Automation Makefile
#
# Every target here is a command rather than a file, so all are declared phony.
.PHONY: help test run demo patch harden decommission lvm-check
.PHONY: up down status logs validate clean lint
.PHONY: scale-up-web scale-up-db scale-down-web scale-down-db
.PHONY: fail-network fail-disk fail-service fail-node
.PHONY: scenario-scaling help-scaling help-failure

# Lists every target that carries a trailing "## description" comment.
help: ## Show this help message
	@echo "Linux Operations Automation"
	@echo ""
	@echo "Available commands:"
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) \
		| sort \
		| awk 'BEGIN {FS = ":.*?## "}; {printf " %-18s %s\n", $$1, $$2}'
# Inventory file shared by the Ansible targets below.
inventory := inventory/hosts.ini
# Common ansible-playbook prefix so each recipe only names what differs.
playbook_cmd := ansible-playbook -i $(inventory)

test: ## Run offline validation checks
	$(playbook_cmd) --syntax-check playbooks/*.yml
	ansible-lint

run: ## Run provisioning against the configured inventory
	$(playbook_cmd) playbooks/provision.yml

demo: ## Run a safe local demonstration without requiring live SSH hosts
	SIMULATION_MODE=true bash ./scripts/simulate_failure.sh service 5 web

patch: ## Apply patching workflow against the configured inventory
	$(playbook_cmd) playbooks/patch.yml

harden: ## Apply hardening workflow against the configured inventory
	$(playbook_cmd) playbooks/hardening.yml

decommission: ## Run decommissioning workflow against the configured inventory
	$(playbook_cmd) playbooks/decommission.yml

lvm-check: ## Validate the AAP-style LVM resize workflow
	$(playbook_cmd) --syntax-check playbooks/lvm_resize.yml

up: ## Start the optional local container scaffold
	docker compose up -d

down: ## Stop the optional local container scaffold
	docker compose down

# The inventory listing is best-effort: a failure prints a notice instead of failing the target.
status: ## Show local scaffold status and inventory hosts
	docker compose ps
	ansible -i $(inventory) --list-hosts all || echo "Inventory check failed"

logs: ## Show local scaffold logs
	docker compose logs -f --tail=100

# $(MAKE) keeps flags such as -n propagated to the nested run.
validate: ## Run all offline validation checks
	$(MAKE) test
	docker compose config --quiet

clean: ## Clean up generated local logs and reports
	$(RM) logs/*.log reports/*.txt

lint: ## Lint Ansible content
	ansible-lint
# Simulator entry points; every recipe below runs them with SIMULATION_MODE=true.
scaling_sim := SIMULATION_MODE=true bash ./scripts/simulate_scaling.sh
failure_sim := SIMULATION_MODE=true bash ./scripts/simulate_failure.sh

# COUNT and DURATION fall back to the value after the comma when unset or empty.
scale-up-web: ## Scale up web servers in simulation mode (usage: make scale-up-web COUNT=2)
	$(scaling_sim) up $(or $(COUNT),1) web

scale-up-db: ## Scale up database servers in simulation mode (usage: make scale-up-db COUNT=1)
	$(scaling_sim) up $(or $(COUNT),1) db

scale-down-web: ## Scale down web servers in simulation mode (usage: make scale-down-web COUNT=1)
	$(scaling_sim) down $(or $(COUNT),1) web

scale-down-db: ## Scale down database servers in simulation mode (usage: make scale-down-db COUNT=1)
	$(scaling_sim) down $(or $(COUNT),1) db

fail-network: ## Simulate network failure safely (usage: make fail-network DURATION=60)
	$(failure_sim) network $(or $(DURATION),60)

fail-disk: ## Simulate disk pressure safely (usage: make fail-disk DURATION=120)
	$(failure_sim) disk $(or $(DURATION),120)

fail-service: ## Simulate service failures safely (usage: make fail-service DURATION=30)
	$(failure_sim) service $(or $(DURATION),30)

fail-node: ## Simulate node failure safely (usage: make fail-node DURATION=300)
	$(failure_sim) node $(or $(DURATION),300)

scenario-scaling: ## Run scaling event syntax validation
	ansible-playbook -i inventory/hosts.ini --syntax-check scenarios/scaling_event.yml
# Usage cheat sheets; printf emits one line per argument.
help-scaling: ## Show scaling-related commands
	@printf '%s\n' \
		"Scaling Commands:" \
		" make scale-up-web COUNT=2" \
		" make scale-up-db COUNT=1" \
		" make scale-down-web COUNT=1" \
		" make scale-down-db COUNT=1"

help-failure: ## Show failure simulation commands
	@printf '%s\n' \
		"Failure Simulation Commands:" \
		" make fail-network DURATION=60" \
		" make fail-disk DURATION=120" \
		" make fail-service DURATION=30" \
		" make fail-node DURATION=300"
@@ -0,0 +1,92 @@
# Linux Operations Automation
## Problem
Linux infrastructure work often starts as ticket-driven operations: deploy a server, patch it, harden SSH, check a failed service, expand a filesystem, and leave evidence that the change was safe. These tasks need automation that is readable, repeatable, and cautious enough for production-style environments.
## CV Relevance
This project maps directly to Linux/Unix operations, server deployment, patching, troubleshooting, and storage/LVM work from enterprise infrastructure environments. The LVM resize workflow is written in the style of an Ansible Automation Platform (AAP) job: explicit survey variables, dry-run defaults, pre-checks, resize actions, and before/after evidence.
## What This Project Demonstrates
- Ansible playbooks for common Linux node lifecycle operations.
- Role-based task organization with clear defaults and handlers.
- LVM filesystem expansion workflow suitable for Ansible Automation Platform job templates.
- Safe simulation scripts for failure, service, and scaling exercises.
- Reviewer-friendly evidence in `examples/` without relying on a live enterprise lab.
## Architecture
```text
Operator -> Make targets -> Ansible inventory -> Playbooks/Roles -> Linux nodes
-> Simulation scripts -> Example evidence
-> AAP-style LVM workflow -> Before/after report
```
Core components:
- `inventory/hosts.ini` defines realistic host groups.
- `playbooks/` contains provision, patch, harden, and decommission workflows.
- `playbooks/lvm_resize.yml` contains the storage expansion workflow.
- `roles/` contains the implemented Ansible roles.
- `scripts/` provides safe simulation helpers.
- `docker-compose.yml` is a lightweight local scaffold, not a production lab.
## Quickstart
```bash
cd professional-infra/linux-operations-automation
make test
make demo
```
`make test` runs offline syntax and lint checks. `make demo` runs a safe simulation with `SIMULATION_MODE=true` and does not require reachable SSH hosts.
To run playbooks against real or lab hosts, update `inventory/hosts.ini` and run:
```bash
make run
make patch
make harden
make decommission
```
Review the LVM workflow:
```bash
ansible-playbook -i inventory/hosts.ini playbooks/lvm_resize.yml --syntax-check
cat docs/aap_lvm_resize_workflow.md
```
## Validation
```bash
make test
docker compose config --quiet
```
The optional compose scaffold can be started with:
```bash
make up
make down
```
## Example Output
Sample evidence is available in [examples/patch-output.txt](examples/patch-output.txt), [examples/failure-simulation.txt](examples/failure-simulation.txt), and [examples/lvm-resize-output.txt](examples/lvm-resize-output.txt).
## Interview Talking Points
- How to make LVM resize automation safe with dry-run defaults and explicit approval.
- Why before/after evidence matters for storage and filesystem changes.
- How Ansible roles keep Linux baseline operations repeatable.
- Where AAP surveys and job templates reduce ticket handling errors.
## Roadmap
- Add complete service roles for application deployment examples.
- Add backup, security scan, and disaster recovery playbooks.
- Add a richer local lab with SSH-ready containers.
- Add cloud or Kubernetes deployment variants.
@@ -0,0 +1,43 @@
# Vault Configuration Guide
## Overview
The current portfolio demo does not require Ansible Vault for `make test` or `make demo`. Secrets are intentionally kept out of the main validation path so reviewers can run the project offline.
Use Vault only when extending the simulator to manage real hosts or credentials.
## Recommended Pattern
1. Start from the example file:
```bash
cp group_vars/vault.example.yml group_vars/vault.yml
```
2. Replace placeholder values locally.
3. Encrypt the file before using it with real systems:
```bash
ansible-vault encrypt group_vars/vault.yml
```
4. Do not commit real secret values. Keep `group_vars/vault.example.yml` as the committed reference.
## Running With Vault
```bash
ansible-playbook -i inventory/hosts.ini playbooks/provision.yml --ask-vault-pass
```
or:
```bash
ansible-playbook -i inventory/hosts.ini playbooks/provision.yml --vault-password-file ~/.vault_pass.txt
```
## Notes
- The delivered playbooks do not import a vault file by default.
- Add `vars_files` only in an environment-specific branch or private overlay.
- Prefer a secret manager or automation controller for production use.
@@ -0,0 +1,5 @@
[defaults]
# Inventory and roles are resolved relative to the directory holding this file.
inventory = ./inventory/hosts.ini
roles_path = ./roles
# Do not write *.retry files after failed runs.
retry_files_enabled = False
# NOTE(review): SSH host key verification is switched off here; presumably meant
# for a disposable lab only — confirm before pointing this at real hosts.
host_key_checking = False
@@ -0,0 +1,28 @@
# Shared definition for the idle Debian containers; each service only adds
# its own fixed address on the infra_sim network.
x-node: &node
  image: debian:12-slim
  command: ["sleep", "infinity"]

services:
  web:
    <<: *node
    networks:
      infra_sim:
        ipv4_address: 172.20.0.11

  db:
    <<: *node
    networks:
      infra_sim:
        ipv4_address: 172.20.0.21

  lb:
    <<: *node
    networks:
      infra_sim:
        ipv4_address: 172.20.0.31

networks:
  infra_sim:
    driver: bridge
    ipam:
      config:
        - subnet: 172.20.0.0/24
@@ -0,0 +1,45 @@
# AAP-Style LVM Resize Workflow
## Purpose
This workflow shows how a routine storage ticket can be converted into a controlled Ansible Automation Platform job. It is intentionally conservative: dry-run is the default, required variables are explicit, and every run produces before/after evidence.
## Suggested Job Template
- Name: `Linux - LVM Filesystem Resize`
- Inventory: Linux production or pre-production inventory
- Playbook: `playbooks/lvm_resize.yml`
- Credentials: privileged Linux automation credential
- Privilege escalation: enabled
- Default extra vars:
```yaml
lvm_dry_run: true
lvm_resize_filesystem: true
```
## Suggested Survey Variables
| Variable | Example | Required | Notes |
| --- | --- | --- | --- |
| `lvm_vg_name` | `vg_app` | yes | Target volume group. |
| `lvm_lv_name` | `lv_data` | yes | Target logical volume. |
| `lvm_mountpoint` | `/data` | yes | Filesystem mountpoint to validate before/after. |
| `lvm_size_request` | `+20G` | yes | Passed to `lvextend -L`; use explicit growth syntax for tickets. |
| `lvm_dry_run` | `true` | yes | Start with `true`; switch to `false` after evidence review. |
## Safety Notes
- Run with `lvm_dry_run=true` first and attach output to the ticket.
- Confirm backup/snapshot status before actual resize.
- Confirm filesystem type; this workflow supports XFS and ext filesystems.
- Keep requested size aligned with the ticket approval.
- Use maintenance windows for critical systems.
## Evidence Captured
- `lsblk --fs`
- `pvs`, `vgs`, `lvs`
- `df -hT <mountpoint>` before and after
- target LV path and filesystem type
- dry-run flag and requested size
@@ -0,0 +1,30 @@
# Linux Operations Automation Architecture
## Components
- Operator interface: `make` targets and direct Ansible commands.
- Inventory: static host groups in `inventory/hosts.ini`.
- Automation: lifecycle playbooks in `playbooks/`.
- Simulation scripts: controlled failure and scaling events in `scripts/`.
- Evidence: logs, reports, scenario notes, and examples.
## Data Flow
```
Operator
-> Make target or shell script
-> Ansible inventory
-> lifecycle playbook
-> managed Linux node
-> log/report artifact
```
Failure drills follow a parallel flow:
```
Operator -> simulate_failure.sh -> target node/service -> health check -> patch/hardening playbook -> evidence
```
## Notes
The project favors explicit playbooks over hidden orchestration so the operational intent is visible during review. In a production implementation, the same workflows would typically run from a CI runner or automation controller with credentials supplied by a secret manager.
@@ -0,0 +1,8 @@
2026-04-29 02:13:41 - Starting failure simulation: service 30 web
2026-04-29 02:13:41 - Simulating service failures on containers: web
2026-04-29 02:13:42 - Stopping services in container enterprise-web-1
2026-04-29 02:13:44 - Health probe failed: http://web01/health returned 503
2026-04-29 02:14:12 - Cleaning up failure simulation
2026-04-29 02:14:13 - Restarted nginx in enterprise-web-1
2026-04-29 02:14:18 - Health probe recovered: http://web01/health returned 200
2026-04-29 02:14:18 - Failure simulation completed successfully
@@ -0,0 +1,19 @@
TASK [Report LVM resize evidence] **********************************************
ok: [app01] => {
"msg": {
"host": "app01",
"dry_run": true,
"target": "/dev/vg_app/lv_data",
"mountpoint": "/data",
"requested_size": "+20G",
"filesystem_type": "xfs",
"before_df": [
"Filesystem Type Size Used Avail Use% Mounted on",
"/dev/mapper/vg_app-lv_data xfs 100G 83G 17G 84% /data"
],
"after_df": [
"Filesystem Type Size Used Avail Use% Mounted on",
"/dev/mapper/vg_app-lv_data xfs 100G 83G 17G 84% /data"
]
}
}
@@ -0,0 +1,33 @@
PLAY [Apply Security Patches and Updates] **************************************
TASK [Update package cache] *****************************************************
changed: [web01]
changed: [db01]
ok: [lb01]
TASK [Check for available updates] **********************************************
ok: [web01] => {"stdout": "9"}
ok: [db01] => {"stdout": "4"}
ok: [lb01] => {"stdout": "0"}
TASK [Apply security updates only] **********************************************
changed: [web01]
changed: [db01]
ok: [lb01]
TASK [Verify critical services] *************************************************
ok: [web01] => (item=systemd-journald)
ok: [web01] => (item=cron)
ok: [db01] => (item=systemd-journald)
ok: [lb01] => (item=cron)
PLAY RECAP *********************************************************************
web01 : ok=19 changed=6 unreachable=0 failed=0 skipped=2 rescued=0 ignored=1
db01 : ok=18 changed=5 unreachable=0 failed=0 skipped=2 rescued=0 ignored=1
lb01 : ok=15 changed=1 unreachable=0 failed=0 skipped=4 rescued=0 ignored=0
Patch report
Status: SUCCESS
Window: 02:00-04:00 UTC
Reboot required: false
Notification: infra-team@example.com
@@ -0,0 +1,20 @@
---
# Variables shared by every host in the inventory.

# SSH settings
ssh_config: {port: 22, max_auth_tries: 3, alive_interval: 300}

# Firewall
firewall_enabled: true
firewall_default_policy: deny

# Patching
patch_enabled: true
enforce_patch_window: true

# Monitoring and health checks
enable_monitoring: false
enable_health_checks: true
@@ -0,0 +1,9 @@
---
# Variables for the database server group.
db_type: postgresql
db_port: 5432

# Backups
db_backup_enabled: true
db_backup_path: /var/backups/database

# Administrative account name only; keep any credential in Vault for production.
db_admin_user: postgres
@@ -0,0 +1,10 @@
---
# Variables for the load balancer group.
lb_type: haproxy
lb_port: 443

# Statistics endpoint
lb_stats_enabled: true
lb_stats_port: 8404

# Frontend listener
frontend_host: "0.0.0.0"
frontend_port: 80
@@ -0,0 +1,10 @@
---
# Variables for the monitoring server group.
monitoring_type: prometheus
monitoring_port: 9090
monitoring_retention: "30d"
monitoring_scrape_interval: "15s"

# Grafana
grafana_port: 3000
# Resolved from vault_grafana_password, which is not defined in this file and
# must be supplied from a vault or extra vars before this value is templated.
grafana_admin_password: "{{ vault_grafana_password }}"
@@ -0,0 +1,8 @@
---
# Example variables for secret values (placeholders only; never commit real values).
# Copy these keys into an Ansible Vault encrypted file when real secrets are needed.
admin_password: "replace-with-vault-managed-value"
db_root_password: "replace-with-vault-managed-value"
# The monitoring group variables set grafana_admin_password from
# vault_grafana_password, so the secret is provided under the vault_-prefixed
# name rather than the name it is consumed as.
vault_grafana_password: "replace-with-vault-managed-value"
ssh_key_passphrase: "replace-with-vault-managed-value"
@@ -0,0 +1,11 @@
---
# Variables for the web server group.
webserver_type: nginx
http_port: 80
https_port: 443
health_check_path: "/health"

# Application settings
# app_name takes the first entry of the host's group list, or 'app' if there is none.
app_name: "{{ group_names[0] | default('app') }}"
# Application files are owned by the admin account; admin_user is defined outside this file.
app_user: "{{ admin_user }}"
app_group: "{{ admin_user }}"
@@ -0,0 +1,35 @@
# Static inventory. Every host connects as root with the same private key file.
# Web tier
[webservers]
web01 ansible_host=172.20.0.11 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
web02 ansible_host=172.20.0.12 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
web03 ansible_host=172.20.0.13 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
# Database tier
[databases]
db01 ansible_host=172.20.0.21 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
db02 ansible_host=172.20.0.22 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
# Load balancer tier
[loadbalancers]
lb01 ansible_host=172.20.0.31 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
# Monitoring tier
[monitoring]
mon01 ansible_host=172.20.0.41 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
# Connection settings applied to all hosts.
# The SSH arguments below disable host key verification and discard known_hosts entries.
# NOTE(review): presumably intended for a throwaway lab network — confirm before use against real hosts.
[all:vars]
ansible_python_interpreter=/usr/bin/python3
ansible_ssh_common_args='-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
ansible_connection=ssh
# Per-group role markers.
# NOTE(review): `environment` is also a playbook keyword, so Ansible may warn about a
# reserved variable name — consider renaming after checking which roles read it.
[webservers:vars]
node_type=web
environment=production
[databases:vars]
node_type=database
environment=production
[loadbalancers:vars]
node_type=loadbalancer
environment=production
[monitoring:vars]
node_type=monitoring
environment=production
@@ -0,0 +1,24 @@
---
# Molecule converge playbook: applies the roles under test to every platform.
- name: Converge
  hosts: all
  gather_facts: true
  become: true

  pre_tasks:
    # Refresh the package index at most once per hour on Debian-family platforms.
    - name: Update apt cache
      when: ansible_os_family == "Debian"
      ansible.builtin.apt:
        update_cache: true
        cache_valid_time: 3600

  roles:
    - base_provision
    - hardening
    - patching

  post_tasks:
    - name: Print Ansible facts
      ansible.builtin.debug:
        var: ansible_facts
@@ -0,0 +1,15 @@
---
# Molecule destroy playbook: removes the container of every platform in the scenario.
- name: Destroy
  hosts: localhost
  gather_facts: false

  tasks:
    # Removal errors are ignored so one failure does not stop the remaining removals.
    - name: Destroy molecule containers
      community.docker.docker_container:
        name: "{{ item }}"
        state: absent
        force_kill: true
      loop: "{{ molecule_yml.platforms | map(attribute='name') | list }}"
      register: destroy_result
      ignore_errors: true
@@ -0,0 +1,31 @@
---
# Molecule scenario configuration for Ansible role testing.
driver:
  name: docker

platforms:
  - name: ubuntu-22.04
    image: geerlingguy/docker-ubuntu2204-ansible:latest
    pre_build_image: true
    privileged: true
    volumes: ["/sys/fs/cgroup:/sys/fs/cgroup:rw"]

provisioner:
  name: ansible
  config_options:
    defaults:
      deprecation_warnings: false
      gathering: smart
      # Facts are cached as JSON files for an hour.
      fact_caching: jsonfile
      fact_caching_connection: /tmp/ansible_facts
      fact_caching_timeout: 3600

verifier:
  name: ansible
  directory: molecule/default/tests

# NOTE(review): confirm the installed Molecule version still honours a
# top-level lint script.
lint: |
  yamllint .
  ansible-lint
@@ -0,0 +1,32 @@
---
# Molecule verify playbook: read-only probes run against the converged platforms.
# The shell probes are marked changed_when: false because they only inspect state.
- name: Verify
  hosts: all
  gather_facts: false

  tasks:
    # grep exits 1 when nothing matches; only other exit codes count as failure.
    - name: Check if base OS packages are installed
      ansible.builtin.shell: dpkg -l | grep -E '(curl|wget|vim|htop)'
      register: package_check
      changed_when: false
      failed_when: package_check.rc not in [0, 1]

    - name: Check SSH configuration
      ansible.builtin.stat:
        path: /etc/ssh/sshd_config
      register: ssh_config_stat
      failed_when: not ssh_config_stat.stat.exists

    # Informational only: the result is recorded but never fails the run.
    - name: Check firewall status
      ansible.builtin.shell: ufw status | grep -q active
      register: firewall_check
      changed_when: false
      failed_when: false

    # Informational only: a missing account does not fail the run.
    - name: Verify admin user exists
      ansible.builtin.getent:
        database: passwd
        key: infra-admin
      failed_when: false

    - name: Print verification results
      ansible.builtin.debug:
        msg: "Role verification completed"
@@ -0,0 +1,34 @@
---
# Decommission playbook: asks for confirmation, shows the effective options,
# runs the decommission role, then prints a summary.
- name: Decommission Enterprise Infrastructure Nodes
  hosts: all
  become: true
  gather_facts: true

  pre_tasks:
    # pause prompts once for the whole play, not once per host, so the prompt
    # lists every host in the play instead of only the first one.
    - name: Confirm decommissioning
      ansible.builtin.pause:
        prompt: |
          WARNING: This will decommission {{ ansible_play_hosts | join(', ') }}
          Backup Data: {{ backup_data }}
          Export Config: {{ export_config }}
          Press ENTER to continue or Ctrl+C to cancel

    - name: Display decommissioning information
      ansible.builtin.debug:
        msg: |
          Decommissioning {{ inventory_hostname }}
          Auto Shutdown: {{ auto_shutdown }}
          Backup Enabled: {{ backup_data }}

  roles:
    - role: decommission
      tags: ['decommission', 'cleanup']

  post_tasks:
    - name: Display decommissioning summary
      ansible.builtin.debug:
        msg: |
          Decommissioning completed!
          Host: {{ inventory_hostname }}
          Backup Location: /var/backups/decommission-{{ ansible_date_time.iso8601 }}/
@@ -0,0 +1,124 @@
---
# Hardening playbook: validates prerequisites, applies the hardening role, then
# runs extra post-tasks (auditd, AppArmor, sysctl, file permissions, report).
- name: Harden Enterprise Infrastructure Nodes
  hosts: all
  become: true
  gather_facts: true

  pre_tasks:
    - name: Validate hardening prerequisites
      ansible.builtin.assert:
        that:
          - ansible_os_family == "Debian"
          - cis_level in [1, 2]
        fail_msg: "Invalid hardening configuration"

    - name: Display hardening information
      ansible.builtin.debug:
        msg: |
          Hardening {{ inventory_hostname }}
          CIS Level: {{ cis_level }}
          Disable Root Login: {{ disable_root_login }}

  roles:
    - role: hardening
      tags: ['hardening', 'security']

  post_tasks:
    - name: Display hardening summary
      ansible.builtin.debug:
        msg: |
          Hardening completed successfully!
          Host: {{ inventory_hostname }}
      when: ansible_os_family == "Debian"

    - name: Configure auditd
      when: auditd_enabled
      block:
        - name: Install auditd
          ansible.builtin.apt:
            name: auditd
            state: present
          when: ansible_os_family == "Debian"

        - name: Configure audit rules
          ansible.builtin.template:
            src: templates/audit.rules.j2
            dest: /etc/audit/rules.d/hardening.rules
            mode: '0644'

        - name: Enable auditd service
          ansible.builtin.service:
            name: auditd
            state: started
            enabled: true

    - name: Configure AppArmor
      when: apparmor_enabled and ansible_os_family == "Debian"
      block:
        - name: Install apparmor
          ansible.builtin.apt:
            name: apparmor
            state: present
          when: ansible_os_family == "Debian"

        - name: Enable apparmor service
          ansible.builtin.service:
            name: apparmor
            state: started
            enabled: true

    - name: Configure sysctl hardening
      ansible.posix.sysctl:
        name: "{{ item.key }}"
        value: "{{ item.value }}"
        state: present
        reload: true
      loop:
        - { key: 'net.ipv4.ip_forward', value: '0' }
        - { key: 'net.ipv4.conf.all.send_redirects', value: '0' }
        - { key: 'net.ipv4.conf.default.send_redirects', value: '0' }
        - { key: 'net.ipv4.tcp_syncookies', value: '1' }
        - { key: 'net.ipv4.icmp_echo_ignore_broadcasts', value: '1' }

    # The account databases stay world-readable; the shadow files hold password
    # hashes and must not be, so they get 0640 with the shadow group (the play
    # asserts a Debian-family host above).
    - name: Set secure file permissions
      ansible.builtin.file:
        path: "{{ item.path }}"
        mode: "{{ item.mode }}"
        owner: root
        group: "{{ item.group }}"
      loop:
        - { path: /etc/passwd, mode: '0644', group: root }
        - { path: /etc/group, mode: '0644', group: root }
        - { path: /etc/shadow, mode: '0640', group: shadow }
        - { path: /etc/gshadow, mode: '0640', group: shadow }

    # Runs only for names listed in inactive_users; does nothing when the list is unset.
    - name: Lock inactive user accounts
      ansible.builtin.command: usermod -L "{{ item }}"
      loop: "{{ inactive_users | default([]) }}"
      changed_when: false

    # Despite the task name, this sets a hard open-files (nofile) limit for all domains.
    - name: Configure password policies
      community.general.pam_limits:
        domain: '*'
        limit_type: hard
        limit_item: nofile
        value: 1024

    - name: Generate hardening report
      ansible.builtin.template:
        src: templates/hardening_report.j2
        dest: "/var/log/hardening_report_{{ ansible_date_time.iso8601 }}.log"
        mode: '0644'

  handlers:
    - name: restart sshd
      ansible.builtin.service:
        name: ssh
        state: restarted

    - name: restart auditd
      ansible.builtin.service:
        name: auditd
        state: restarted
      when: auditd_enabled
@@ -0,0 +1,149 @@
---
# LVM filesystem resize workflow.
#
# Inputs (play vars, overridable via extra vars / survey):
#   lvm_vg_name, lvm_lv_name  - target volume group and logical volume (required)
#   lvm_mountpoint            - mountpoint of the filesystem on that LV (required)
#   lvm_size_request          - value passed to `lvextend -L`
#   lvm_dry_run               - when true, only report what would be run
#   lvm_resize_filesystem     - when true, grow the filesystem after the LV
#
# Every validation runs before any change: the LV must exist, the mountpoint
# must be backed by that LV, and the filesystem must be one this workflow can
# grow. Otherwise the LV could be extended with no matching filesystem resize.
- name: AAP-style LVM filesystem resize workflow
  hosts: all
  become: true
  gather_facts: true

  vars:
    lvm_dry_run: true
    lvm_vg_name: ""
    lvm_lv_name: ""
    lvm_mountpoint: ""
    lvm_size_request: "+10G"
    lvm_resize_filesystem: true

  pre_tasks:
    - name: Validate required survey variables
      ansible.builtin.assert:
        that:
          - lvm_vg_name | length > 0
          - lvm_lv_name | length > 0
          - lvm_mountpoint | length > 0
          - lvm_size_request | length > 0
        fail_msg: "Required variables: lvm_vg_name, lvm_lv_name, lvm_mountpoint, lvm_size_request"

  tasks:
    - name: Capture block device layout before resize
      ansible.builtin.command:
        argv:
          - lsblk
          - --fs
      register: lvm_lsblk_before
      changed_when: false

    - name: Capture physical volumes before resize
      ansible.builtin.command:
        argv:
          - pvs
          - --noheadings
          - --units
          - g
      register: lvm_pvs_before
      changed_when: false

    - name: Capture volume groups before resize
      ansible.builtin.command:
        argv:
          - vgs
          - --noheadings
          - --units
          - g
      register: lvm_vgs_before
      changed_when: false

    - name: Capture logical volumes before resize
      ansible.builtin.command:
        argv:
          - lvs
          - --noheadings
          - --units
          - g
      register: lvm_lvs_before
      changed_when: false

    - name: Capture filesystem usage before resize
      ansible.builtin.command:
        argv:
          - df
          - -hT
          - "{{ lvm_mountpoint }}"
      register: lvm_df_before
      changed_when: false

    # lvs exits non-zero when the LV does not exist, which fails the task.
    # Its output is the LV's kernel device number as "major:minor".
    - name: Validate target logical volume exists
      ansible.builtin.command:
        argv:
          - lvs
          - --noheadings
          - --separator
          - ":"
          - -o
          - lv_kernel_major,lv_kernel_minor
          - "/dev/{{ lvm_vg_name }}/{{ lvm_lv_name }}"
      register: lvm_target_check
      changed_when: false

    - name: Identify device backing the mountpoint
      ansible.builtin.command:
        argv:
          - findmnt
          - -n
          - -o
          - MAJ:MIN
          - "{{ lvm_mountpoint }}"
      register: lvm_mount_device
      changed_when: false

    # Guards against extending one LV and then growing a filesystem that
    # lives on a different device.
    - name: Validate mountpoint is backed by the target logical volume
      ansible.builtin.assert:
        that:
          - (lvm_mount_device.stdout | trim) == (lvm_target_check.stdout | trim)
        fail_msg: >-
          {{ lvm_mountpoint }} is not mounted from
          /dev/{{ lvm_vg_name }}/{{ lvm_lv_name }}; refusing to resize

    - name: Detect filesystem type
      ansible.builtin.command:
        argv:
          - findmnt
          - -n
          - -o
          - FSTYPE
          - "{{ lvm_mountpoint }}"
      register: lvm_fstype
      changed_when: false

    # Checked before lvextend so an unsupported filesystem stops the run
    # while nothing has been modified yet.
    - name: Validate filesystem type is supported for resize
      ansible.builtin.assert:
        that:
          - lvm_fstype.stdout in ["xfs", "ext2", "ext3", "ext4"]
        fail_msg: >-
          Unsupported filesystem '{{ lvm_fstype.stdout }}' on {{ lvm_mountpoint }}
          (supported: xfs, ext2, ext3, ext4). Set lvm_resize_filesystem=false
          to extend only the logical volume.
      when: lvm_resize_filesystem | bool

    - name: Show dry-run resize command
      ansible.builtin.debug:
        msg: "DRY RUN: would run lvextend -L {{ lvm_size_request }} /dev/{{ lvm_vg_name }}/{{ lvm_lv_name }}"
      when: lvm_dry_run | bool

    - name: Extend logical volume
      ansible.builtin.command:
        argv:
          - lvextend
          - -L
          - "{{ lvm_size_request }}"
          - "/dev/{{ lvm_vg_name }}/{{ lvm_lv_name }}"
      register: lvm_lvextend_result
      changed_when: true
      when: not (lvm_dry_run | bool)

    # xfs_growfs is given the mountpoint; resize2fs is given the LV device path.
    - name: Resize XFS filesystem
      ansible.builtin.command:
        argv:
          - xfs_growfs
          - "{{ lvm_mountpoint }}"
      changed_when: true
      when:
        - not (lvm_dry_run | bool)
        - lvm_resize_filesystem | bool
        - lvm_fstype.stdout == "xfs"

    - name: Resize ext filesystem
      ansible.builtin.command:
        argv:
          - resize2fs
          - "/dev/{{ lvm_vg_name }}/{{ lvm_lv_name }}"
      changed_when: true
      when:
        - not (lvm_dry_run | bool)
        - lvm_resize_filesystem | bool
        - lvm_fstype.stdout in ["ext2", "ext3", "ext4"]

    - name: Capture filesystem usage after resize
      ansible.builtin.command:
        argv:
          - df
          - -hT
          - "{{ lvm_mountpoint }}"
      register: lvm_df_after
      changed_when: false

    - name: Report LVM resize evidence
      ansible.builtin.debug:
        msg:
          host: "{{ inventory_hostname }}"
          dry_run: "{{ lvm_dry_run }}"
          target: "/dev/{{ lvm_vg_name }}/{{ lvm_lv_name }}"
          mountpoint: "{{ lvm_mountpoint }}"
          requested_size: "{{ lvm_size_request }}"
          filesystem_type: "{{ lvm_fstype.stdout | default('unknown') }}"
          before_df: "{{ lvm_df_before.stdout_lines }}"
          after_df: "{{ lvm_df_after.stdout_lines }}"
@@ -0,0 +1,31 @@
---
# Patching playbook: refuses non-Debian hosts, shows the patch settings,
# applies the patching role, then prints a per-host summary.
- name: Apply Security Patches and Updates
  hosts: all
  gather_facts: true
  become: true

  pre_tasks:
    - name: Validate patch prerequisites
      ansible.builtin.assert:
        fail_msg: "Patching supported only on Debian-based systems"
        that:
          - ansible_os_family == "Debian"

    - name: Display patch information
      ansible.builtin.debug:
        msg: |
          Patching {{ inventory_hostname }}
          Patch Window: {{ patch_window_start }} - {{ patch_window_end }}
          Security Only: {{ patch_security_only }}

  roles:
    - role: patching
      tags: [patch, updates]

  post_tasks:
    # reboot_required falls back to false when nothing has set it.
    - name: Display patching summary
      ansible.builtin.debug:
        msg: |
          Patching completed!
          Host: {{ inventory_hostname }}
          Reboot Required: {{ reboot_required | default(false) }}
@@ -0,0 +1,33 @@
---
# Provisioning playbook: checks the controller's Ansible version, shows host
# details, applies the base_provision role, then prints a summary.
- name: Provision Enterprise Infrastructure Nodes
  hosts: all
  become: true
  gather_facts: true

  pre_tasks:
    # Compare the full version string; checking major and minor separately
    # would reject any release whose minor number is below 9 (e.g. 3.0).
    - name: Validate Ansible version
      ansible.builtin.assert:
        that:
          - ansible_version.full is version('2.9', '>=')
        fail_msg: "Ansible 2.9+ is required"

    - name: Display provisioning information
      ansible.builtin.debug:
        msg: |
          Provisioning {{ inventory_hostname }}
          OS: {{ ansible_os_family }}
          Python: {{ ansible_python_version }}

  roles:
    - role: base_provision
      tags: ['provision', 'base']

  post_tasks:
    # The OS line uses the distribution facts; the address falls back to
    # 'unknown' on hosts without a default IPv4 route.
    - name: Generate provisioning summary
      ansible.builtin.debug:
        msg: |
          Provisioning completed successfully!
          Host: {{ inventory_hostname }}
          IP: {{ ansible_default_ipv4.address | default('unknown') }}
          OS: {{ ansible_distribution }} {{ ansible_distribution_version }}
@@ -0,0 +1,48 @@
# Base Provision Role
Provision basic infrastructure on enterprise nodes with security hardening.
## Features
- **Idempotent**: All tasks use proper idempotency markers (`changed_when`, `failed_when`)
- **Handlers**: SSH and fail2ban restarts use handlers instead of direct service calls
- **Variables**: All configuration in `defaults/main.yml` - no hardcoding
- **Validation**: Pre-flight checks for system requirements
- **Firewall**: UFW firewall configuration with configurable rules
- **SSH Security**: Root login disabled, password auth disabled, key-based auth only
## Role Variables
See `defaults/main.yml` for all available variables.
### Key Variables
- `node_timezone`: System timezone (default: UTC)
- `admin_user`: Admin username for infrastructure access
- `ssh_port`: SSH service port (default: 22)
- `base_packages`: List of base packages to install
- `firewall_enabled`: Enable UFW firewall (default: true)
- `firewall_allowed_tcp_ports`: Allowed TCP ports for firewall
## Secret Variables
This portfolio demo does not require secrets for offline validation. If you extend it with real passwords or keys, copy `group_vars/vault.example.yml` into an encrypted Ansible Vault file and keep real values out of normal git history.
## Usage
```yaml
- role: base_provision
vars:
node_timezone: "Europe/Warsaw"
firewall_enabled: true
```
## Handlers
- `restart sshd`: Restarts SSH service (triggered by config changes)
- `restart fail2ban`: Restarts fail2ban service (triggered by config changes)
## Tags
- `provision`: All provisioning tasks
- `base`: Base provision role tasks
@@ -0,0 +1,44 @@
---
# Default variables for base provisioning.
node_timezone: "UTC"
admin_user: "infra-admin"
ssh_port: 22
ssh_disabled_root_login: true
ssh_disable_password_auth: true

# Packages to install
base_packages:
  - curl
  - wget
  - vim
  - htop
  - net-tools
  - iptables
  # Needed by the firewall tasks, which manage rules through UFW.
  - ufw
  - fail2ban
  - unattended-upgrades

# Firewall rules
firewall_enabled: true
firewall_default_policy: deny
firewall_allowed_tcp_ports:
  - 22
  - 80
  - 443

# Application directories
app_directories:
  - path: /opt/application
    owner: "{{ admin_user }}"
    group: "{{ admin_user }}"
    mode: '0755'
  - path: /var/log/application
    owner: "{{ admin_user }}"
    group: "{{ admin_user }}"
    mode: '0755'
  - path: /etc/application
    owner: root
    group: root
    mode: '0755'

# Service verification (empty list: nothing is checked)
services_to_verify: []
@@ -0,0 +1,11 @@
---
# Handlers for base provisioning; notified by the SSH and fail2ban tasks.

# The handler keeps the name "restart sshd" for existing notifiers, but the
# service it restarts is "ssh", the unit name on Debian-family systems.
- name: restart sshd
  ansible.builtin.service:
    name: ssh
    state: restarted

- name: restart fail2ban
  ansible.builtin.service:
    name: fail2ban
    state: restarted
    enabled: true
@@ -0,0 +1,138 @@
---
# Base provisioning tasks: validate the platform, install packages, create the
# admin account, configure SSH, firewall, fail2ban and unattended upgrades,
# create application directories, then run non-fatal service/health checks.

- name: Validate system requirements
  ansible.builtin.assert:
    that:
      - ansible_os_family == "Debian"
      - ansible_python_version is version('3.6', '>=')
    fail_msg: "Unsupported system - requires Debian and Python 3.6+"

- name: Update package cache
  ansible.builtin.apt:
    update_cache: true
    cache_valid_time: 3600
  changed_when: false

- name: Install base packages
  ansible.builtin.apt:
    name: "{{ base_packages }}"
    state: present
    update_cache: true

# The user module is idempotent, so no prior existence check is needed. A
# getent probe with failed_when: false never reports failed, which would
# leave a task conditioned on that result permanently skipped.
- name: Create admin user
  ansible.builtin.user:
    name: "{{ admin_user }}"
    groups: sudo
    append: true
    create_home: true
    shell: /bin/bash

- name: Configure timezone
  community.general.timezone:
    name: "{{ node_timezone }}"

- name: Configure SSH security
  block:
    - name: Disable root SSH login
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^PermitRootLogin'
        line: 'PermitRootLogin no'
        state: present
      when: ssh_disabled_root_login
      notify: restart sshd

    - name: Set SSH port
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^Port'
        line: "Port {{ ssh_port }}"
        state: present
      notify: restart sshd

    - name: Disable password authentication
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^PasswordAuthentication'
        line: 'PasswordAuthentication no'
        state: present
      when: ssh_disable_password_auth
      notify: restart sshd

# Allow rules are added before the firewall is switched on, so enabling the
# default policy cannot lock out the SSH session running this play.
- name: Configure firewall
  when: firewall_enabled
  block:
    - name: Allow SSH access
      community.general.ufw:
        rule: allow
        port: "{{ ssh_port }}"
        proto: tcp

    - name: Allow HTTP/HTTPS
      community.general.ufw:
        rule: allow
        port: "{{ item }}"
        proto: tcp
      loop: "{{ firewall_allowed_tcp_ports }}"
      # The SSH port already has its own rule above.
      when: item not in [ssh_port]

    - name: Enable UFW firewall
      community.general.ufw:
        state: enabled
        policy: "{{ firewall_default_policy }}"

- name: Configure fail2ban
  ansible.builtin.template:
    src: jail.local.j2
    dest: /etc/fail2ban/jail.local
    backup: true
    mode: '0644'
  notify: restart fail2ban

# create: true covers hosts where the package did not ship this file.
- name: Enable unattended upgrades
  ansible.builtin.lineinfile:
    path: /etc/apt/apt.conf.d/20auto-upgrades
    regexp: '^APT::Periodic::Unattended-Upgrade'
    line: 'APT::Periodic::Unattended-Upgrade "1";'
    state: present
    create: true
    mode: '0644'

- name: Create application directories
  ansible.builtin.file:
    path: "{{ item.path }}"
    state: directory
    owner: "{{ item.owner }}"
    group: "{{ item.group }}"
    mode: "{{ item.mode }}"
  loop: "{{ app_directories }}"

- name: Record role-specific service intent
  ansible.builtin.debug:
    msg: "Would configure {{ node_type | default('generic') }} service components in a full lab deployment"

# Best-effort: a service that cannot be started does not fail the play.
- name: Verify services are running
  ansible.builtin.service:
    name: "{{ item }}"
    state: started
    enabled: true
  loop: "{{ services_to_verify }}"
  when: services_to_verify | length > 0
  failed_when: false

# Best-effort probe on web servers only; the result is registered, never fatal.
- name: Run health checks
  ansible.builtin.uri:
    url: http://localhost/health
    method: GET
    status_code: 200
  register: health_check
  failed_when: false
  ignore_errors: true
  when: "'webservers' in group_names"
@@ -0,0 +1,14 @@
# fail2ban configuration
[DEFAULT]
bantime = 3600
findtime = 600
maxretry = 5
[sshd]
enabled = true
port = {{ ssh_port }}
logpath = /var/log/auth.log
maxretry = 3
[recidive]
enabled = true
@@ -0,0 +1,62 @@
# Decommission Role
Gracefully decommission enterprise infrastructure nodes with comprehensive backup and cleanup.
## Features
- **Confirmation Prompt**: Interactive confirmation before decommissioning
- **Graceful Shutdown**: Stop services gracefully with connection drain time
- **Comprehensive Backup**: Archive configurations and data before cleanup
- **Selective Cleanup**: Only remove items that were deployed
- **Logging**: Detailed decommissioning logs for audit trail
- **Notifications**: Optional email notifications on completion
## Role Variables
See `defaults/main.yml` for all available variables.
### Key Variables
- `backup_data`: Backup application data (default: true)
- `export_config`: Export system configuration (default: true)
- `graceful_shutdown`: Graceful service shutdown (default: true)
- `auto_shutdown`: Auto shutdown after decommissioning (default: false)
- `application_services`: Services to stop
- `application_packages`: Packages to remove
- `decommission_notification_email`: Email for notifications (optional)
## Usage
```yaml
- role: decommission
vars:
backup_data: true
export_config: true
auto_shutdown: false
decommission_notification_email: "ops@company.com"
```
## Backup Locations
- Configuration: `/var/backups/decommission-<timestamp>/config/`
- Data: `/var/backups/decommission-<timestamp>/data/`
- Report: `/var/log/decommission_report_<timestamp>.log`
## Supported Groups
- `webservers`: Backs up /var/www/html
- `databases`: Backs up PostgreSQL data
- `monitoring`: Backs up Prometheus data
- `loadbalancers`: Loadbalancer cleanup
## Safety Features
- Interactive confirmation before execution
- Connection drain time before shutdown (30 seconds)
- Errors are logged but don't stop the process
- Comprehensive audit log
## Tags
- `decommission`: All decommissioning tasks
- `cleanup`: Cleanup-related tasks
@@ -0,0 +1,34 @@
---
# Decommissioning configuration
backup_data: true
export_config: true
graceful_shutdown: true
cleanup_inventory: true
auto_shutdown: false
shutdown_delay: 10
# Services to stop gracefully
application_services:
- nginx
- postgresql
- haproxy
# Packages to remove
application_packages:
- nginx
- postgresql
- haproxy
- prometheus
# Directories to archive
config_paths:
- /etc/
- /opt/application/
data_paths:
- /var/www/html
- /var/lib/postgresql
- /var/lib/prometheus
# Notification settings
decommission_notification_email: null
@@ -0,0 +1,177 @@
---
# Fail early when the variables the later tasks loop over or cast are unusable.
# (The previous check, `backup_data or not backup_data`, was always true.)
- name: Validate decommissioning requirements
  ansible.builtin.assert:
    that:
      - application_services is iterable and application_services is not string and application_services is not mapping
      - application_packages is iterable and application_packages is not string and application_packages is not mapping
      - config_paths is iterable and config_paths is not string and config_paths is not mapping
      - data_paths is iterable and data_paths is not string and data_paths is not mapping
      - shutdown_delay | int >= 0
    fail_msg: "Invalid decommissioning configuration"
- name: Pre-decommissioning checks
block:
- name: Check node health
ansible.builtin.uri:
url: http://localhost/health
method: GET
status_code: 200
register: health_check
failed_when: false
ignore_errors: true
when: "'webservers' in group_names"
- name: Create decommissioning backup directory
ansible.builtin.file:
path: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}"
state: directory
mode: '0755'
- name: Initialize decommissioning log
ansible.builtin.file:
path: "/var/log/decommission.log"
state: touch
mode: '0644'
modification_time: now
access_time: now
- name: Log decommissioning start
ansible.builtin.lineinfile:
path: "/var/log/decommission.log"
line: "{{ ansible_date_time.iso8601 }} - Starting decommissioning of {{ inventory_hostname }}"
state: present
# Stop every service in application_services (best effort: a service that is
# missing or fails to stop does not fail the play), then pause 30 seconds on
# web servers and load balancers. Both tasks are skipped unless
# graceful_shutdown is true.
# NOTE(review): the drain pause runs after the services have already been
# stopped, so there may be nothing left to drain - confirm the intended order.
- name: Graceful application shutdown
block:
- name: Stop application services
ansible.builtin.service:
name: "{{ item }}"
state: stopped
loop: "{{ application_services }}"
failed_when: false
when: graceful_shutdown
- name: Wait for connections to drain
ansible.builtin.pause:
seconds: 30
when: graceful_shutdown and ("webservers" in group_names or "loadbalancers" in group_names)
# Archive configuration and data under
# /var/backups/decommission-<timestamp>/ before anything is removed.
# Archive errors are NOT ignored: a failed backup must stop the play before
# the cleanup tasks delete the same directories.
- name: Export and backup data
  block:
    - name: Create config export directory
      ansible.builtin.file:
        path: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/config"
        state: directory
        mode: '0755'
      when: export_config

    - name: Archive system configuration
      community.general.archive:
        path: "{{ config_paths }}"
        dest: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/config/system_config.tar.gz"
        format: gz
        # The archive contains /etc by default; keep it readable by the owner only.
        mode: '0600'
      when: export_config

    - name: Create data backup directory
      ansible.builtin.file:
        path: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/data"
        state: directory
        mode: '0755'
      when: backup_data

    - name: Backup individual data paths
      community.general.archive:
        path: "{{ item }}"
        # One archive per data path; slashes become underscores in the file name.
        dest: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/data/{{ item | regex_replace('/', '_') }}.tar.gz"
        format: gz
        mode: '0600'
      loop: "{{ data_paths }}"
      when: backup_data
- name: Update monitoring and load balancing
block:
- name: Remove from load balancer
ansible.builtin.debug:
msg: "Would remove {{ inventory_hostname }} from load balancer"
when: "'webservers' in group_names or 'databases' in group_names"
- name: Update monitoring alerts
ansible.builtin.debug:
msg: "Would update monitoring alerts for {{ inventory_hostname }}"
when: "'monitoring' not in group_names"
# Remove application data, packages, logs and SSH access from the node.
# Every task is best effort (failed_when: false) so one missing item does not
# abort the rest of the cleanup.
- name: Clean up application
  block:
    - name: Remove application directories
      ansible.builtin.file:
        path: "{{ item }}"
        state: absent
      loop:
        - /opt/application
        - /var/www/html
        - /var/lib/postgresql
        - /var/lib/prometheus
      failed_when: false

    - name: Remove application packages
      ansible.builtin.apt:
        name: "{{ item }}"
        state: absent
        purge: true
      loop: "{{ application_packages }}"
      failed_when: false

    # Truncate non-empty *.log files, but keep the decommission log and report:
    # they are the audit trail of this very run. No shell is needed, which also
    # avoids `set -o pipefail` being rejected by a /bin/sh that lacks it.
    - name: Clean system logs
      ansible.builtin.command:
        argv:
          - find
          - /var/log
          - -type
          - f
          - -name
          - "*.log"
          - "!"
          - -name
          - "decommission*.log"
          - -size
          - "+0"
          - -exec
          - truncate
          - -s
          - "0"
          - "{}"
          - "+"
      changed_when: false
      failed_when: false

    - name: Remove SSH credentials
      ansible.builtin.file:
        path: "{{ item }}"
        state: absent
      loop:
        - /root/.ssh/authorized_keys
        - /root/.ssh/known_hosts
        - /home/infra-admin/.ssh/authorized_keys
      failed_when: false
- name: Generate decommissioning report
ansible.builtin.template:
src: decommission_report.j2
dest: "/var/log/decommission_report_{{ ansible_date_time.iso8601 }}.log"
mode: '0644'
vars:
backup_location: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}"
# Mail a short summary through the local MTA. Sending is best effort and is
# skipped when no recipient is configured.
- name: Send decommissioning notification
  community.general.mail:
    host: localhost
    port: 25
    to: "{{ decommission_notification_email }}"
    subject: "Node Decommissioned - {{ inventory_hostname }}"
    body: |
      Node {{ inventory_hostname }} has been successfully decommissioned.
      Backup location: /var/backups/decommission-{{ ansible_date_time.iso8601 }}/
      Services stopped: {{ application_services | join(', ') }}
      Configuration exported: {{ export_config }}
      Data backed up: {{ backup_data }}
      See /var/log/decommission_report_{{ ansible_date_time.iso8601 }}.log for details
  # The role default is `null`, which still counts as "defined"; treat null and
  # the empty string as "no recipient".
  when: decommission_notification_email | default('', true) | length > 0
  failed_when: false
# Record completion in the audit log and, when auto_shutdown is set, power the
# node off after shutdown_delay seconds.
- name: Finalize decommissioning
  block:
    - name: Log decommissioning completion
      ansible.builtin.lineinfile:
        path: "/var/log/decommission.log"
        # ansible_date_time is frozen at fact gathering, so it would repeat the
        # start time; now() gives the time this task actually runs.
        line: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }} - Decommissioning completed for {{ inventory_hostname }}"
        state: present

    # A decommissioned node must stay down, so use the shutdown module rather
    # than reboot (reboot also has no `delay` option).
    - name: Perform system shutdown
      community.general.shutdown:
        msg: "System scheduled for shutdown after decommissioning"
        delay: "{{ shutdown_delay | int }}"
      when: auto_shutdown | bool
@@ -0,0 +1,13 @@
Decommissioning Report
======================
Generated: {{ ansible_date_time.iso8601 }}
Host: {{ inventory_hostname }}
Status: COMPLETED
Backup Location: {{ backup_location }}
Configuration Exported: {{ export_config }}
Data Backed Up: {{ backup_data }}
Services Stopped: {{ application_services | join(', ') }}
Log Location: /var/log/decommission.log
@@ -0,0 +1,58 @@
# Hardening Role
Apply security hardening to enterprise infrastructure nodes following CIS benchmarks.
## Features
- **CIS Compliance**: Support for CIS hardening levels 1 and 2
- **SSH Hardening**: Disable root login, password auth, set auth limits
- **Firewall Configuration**: UFW with configurable rules
- **Service Cleanup**: Disable unnecessary services and remove insecure packages
- **Handlers**: SSH restarts via handlers
## Role Variables
See `defaults/main.yml` for all available variables.
### Key Variables
- `cis_level`: CIS hardening level (1 or 2)
- `disable_root_login`: Disable root SSH login (default: true)
- `secure_ssh_config`: Apply SSH security hardening (default: true)
- `firewall_policy`: Firewall default policy (default: deny)
- `ssh_max_auth_tries`: Maximum SSH authentication attempts (default: 3)
- `ssh_client_alive_interval`: SSH client alive interval in seconds (default: 300)
- `ssh_allowed_networks`: Networks from which SSH access is allowed
### SSH Allowed Networks
Default trusted networks:
- 10.0.0.0/8 (Private Class A)
- 172.16.0.0/12 (Private Class B)
- 192.168.0.0/16 (Private Class C)
## Usage
```yaml
- role: hardening
vars:
cis_level: 1
disable_root_login: true
ssh_allowed_networks:
- 10.0.0.0/8
- 203.0.113.0/24
```
## SSH Configuration Changes
- Root login disabled
- Password authentication disabled
- Maximum auth tries: 3
- Empty passwords prohibited
- Client alive interval: 300 seconds
- Client alive count max: 2
## Tags
- `hardening`: All hardening tasks
- `security`: Security-related tasks
@@ -0,0 +1,35 @@
---
# Hardening configuration
cis_level: 1
disable_root_login: true
secure_ssh_config: true
firewall_policy: deny
auditd_enabled: true
selinux_mode: enforcing
apparmor_enabled: true
# SSH Hardening
ssh_max_auth_tries: 3
ssh_client_alive_interval: 300
ssh_client_alive_count_max: 2
# Firewall rules for SSH (trusted networks)
ssh_allowed_networks:
- 10.0.0.0/8
- 172.16.0.0/12
- 192.168.0.0/16
# Services to disable
unnecessary_services:
- cups
- avahi-daemon
- bluetooth
- nfs-server
- rpcbind
# Packages to remove
unnecessary_packages:
- telnet
- rsh-client
- talk
- ntalk
@@ -0,0 +1,5 @@
---
- name: restart sshd
ansible.builtin.service:
name: sshd
state: restarted
@@ -0,0 +1,7 @@
---
# CIS Hardening Level 1 tasks (stub for future expansion)
# https://www.cisecurity.org/cis-benchmarks/
- name: Check CIS status
ansible.builtin.debug:
msg: "CIS Hardening Level {{ cis_level }} would be applied here"
@@ -0,0 +1,95 @@
---
# The role only supports Debian-family hosts and CIS levels 1 and 2.
# cis_level is cast to int so a value passed as a string (for example with
# `-e cis_level=2`) is accepted instead of failing the comparison.
- name: Validate hardening requirements
  ansible.builtin.assert:
    that:
      - ansible_os_family == "Debian"
      - cis_level | int in [1, 2]
    fail_msg: "Unsupported configuration for hardening"

- name: Apply CIS hardening tasks
  ansible.builtin.include_tasks: cis_hardening.yml
  when: cis_level | int >= 1
# Enforce the SSH daemon settings of this role in /etc/ssh/sshd_config.
# Each edit is checked with `sshd -t` before it replaces the live file, so a
# syntactically broken configuration is never written and sshd is never
# restarted into a state that locks administrators out.
- name: Configure SSH hardening
  block:
    - name: Disable root SSH login
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^PermitRootLogin'
        line: 'PermitRootLogin no'
        state: present
        validate: /usr/sbin/sshd -t -f %s
      when: disable_root_login
      notify: restart sshd

    - name: Disable password authentication
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^PasswordAuthentication'
        line: 'PasswordAuthentication no'
        state: present
        validate: /usr/sbin/sshd -t -f %s
      when: secure_ssh_config
      notify: restart sshd

    - name: Set MaxAuthTries
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^MaxAuthTries'
        line: "MaxAuthTries {{ ssh_max_auth_tries }}"
        state: present
        validate: /usr/sbin/sshd -t -f %s
      notify: restart sshd

    - name: Disable empty passwords
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^PermitEmptyPasswords'
        line: 'PermitEmptyPasswords no'
        state: present
        validate: /usr/sbin/sshd -t -f %s
      notify: restart sshd

    - name: Set ClientAliveInterval
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^ClientAliveInterval'
        line: "ClientAliveInterval {{ ssh_client_alive_interval }}"
        state: present
        validate: /usr/sbin/sshd -t -f %s
      notify: restart sshd

    - name: Set ClientAliveCountMax
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^ClientAliveCountMax'
        line: "ClientAliveCountMax {{ ssh_client_alive_count_max }}"
        state: present
        validate: /usr/sbin/sshd -t -f %s
      notify: restart sshd
# Allow SSH from the trusted networks first and enable UFW afterwards, so the
# default policy can never take effect before the management access rule exists.
- name: Configure firewall rules
  block:
    - name: Allow SSH from trusted networks
      community.general.ufw:
        rule: allow
        # Follow ssh_port when it is set (the provisioning role may move sshd
        # off port 22); fall back to the standard port otherwise.
        port: "{{ ssh_port | default(22) | string }}"
        proto: tcp
        from: "{{ item }}"
      loop: "{{ ssh_allowed_networks }}"

    - name: Enable firewall
      community.general.ufw:
        state: enabled
        policy: "{{ firewall_policy }}"
      when: firewall_policy is defined
- name: Disable unnecessary services
ansible.builtin.service:
name: "{{ item }}"
state: stopped
enabled: false
loop: "{{ unnecessary_services }}"
failed_when: false
- name: Remove unnecessary packages
ansible.builtin.apt:
name: "{{ item }}"
state: absent
purge: true
loop: "{{ unnecessary_packages }}"
failed_when: false
@@ -0,0 +1,45 @@
# Patching Role
Apply security patches and OS updates to enterprise infrastructure nodes.
## Features
- **Idempotent**: Properly checks for changes with `changed_when`
- **Patch Window**: Optional enforcement of patch time windows
- **Pre-patch Backup**: Backs up package list before patching
- **Smart Reboot**: Automatically detects if reboot is required
- **Service Restart**: Restarts only necessary services after patching
- **Health Checks**: Verifies services and runs health endpoint checks
## Role Variables
See `defaults/main.yml` for all available variables.
### Key Variables
- `patch_window_start`: Patch window start time (default: 02:00)
- `patch_window_end`: Patch window end time (default: 04:00)
- `enforce_patch_window`: Enforce patch time window (default: true)
- `patch_security_only`: Apply security updates only (default: true)
- `backup_before_patch`: Create backup before patching (default: true)
- `reboot_if_required`: Auto-reboot if required (default: false)
- `services_to_restart`: Services to restart after patching
- `critical_services`: Critical services to verify after patching
## Usage
```yaml
- role: patching
vars:
patch_security_only: true
enforce_patch_window: false
reboot_if_required: true
```
## Report
Patch report is generated at: `/var/log/patch_report_<timestamp>.log`
## Backup Location
Pre-patch backups saved to: `/var/backups/pre-patch-<timestamp>/`
@@ -0,0 +1,20 @@
---
# Patching configuration
patch_window_start: "02:00"
patch_window_end: "04:00"
enforce_patch_window: true
patch_security_only: true
backup_before_patch: true
reboot_if_required: false
reboot_timeout: 300
# Services to restart after patching
services_to_restart:
- sshd
- fail2ban
# Services to verify after patching
critical_services:
- systemd-journald
- systemd-logind
- cron
@@ -0,0 +1,6 @@
---
- name: restart patching services
ansible.builtin.service:
name: "{{ item }}"
state: restarted
loop: "{{ services_to_restart }}"
@@ -0,0 +1,105 @@
---
# Refuse to patch outside [patch_window_start, patch_window_end).
# Times are compared in minutes since midnight so the minutes part of the
# window is honoured, and a window whose end is earlier than its start
# (for example 22:00-02:00) is treated as crossing midnight.
- name: Validate patch window
  when: enforce_patch_window | bool
  vars:
    _now_minutes: "{{ (ansible_date_time.hour | int) * 60 + (ansible_date_time.minute | int) }}"
    _start_minutes: "{{ (patch_window_start.split(':')[0] | int) * 60 + (patch_window_start.split(':')[1] | default(0) | int) }}"
    _end_minutes: "{{ (patch_window_end.split(':')[0] | int) * 60 + (patch_window_end.split(':')[1] | default(0) | int) }}"
  block:
    - name: Check current time against patch window
      ansible.builtin.assert:
        that:
          - >-
            ((_now_minutes | int) >= (_start_minutes | int) and (_now_minutes | int) < (_end_minutes | int))
            if ((_start_minutes | int) <= (_end_minutes | int))
            else ((_now_minutes | int) >= (_start_minutes | int) or (_now_minutes | int) < (_end_minutes | int))
        fail_msg: |
          Current time {{ ansible_date_time.hour }}:{{ ansible_date_time.minute }} is outside patch window {{ patch_window_start }}-{{ patch_window_end }}
# Save the dpkg selection list under /var/backups/pre-patch-<timestamp>/ so the
# package set from before patching can be inspected later.
- name: Create pre-patch backup
  when: backup_before_patch | bool
  block:
    - name: Create backup directory
      ansible.builtin.file:
        path: "/var/backups/pre-patch-{{ ansible_date_time.iso8601 }}"
        state: directory
        mode: '0755'

    # A shell is only needed for the redirection. There is no pipeline, so
    # `set -o pipefail` is dropped: the default /bin/sh is not guaranteed to
    # support it and would fail the task on such hosts.
    - name: Capture current package list
      ansible.builtin.shell: >-
        dpkg --get-selections >
        "/var/backups/pre-patch-{{ ansible_date_time.iso8601 }}/packages.list"
      changed_when: false
# Refresh the package index first so the count below reflects what is
# currently available rather than a stale cache.
- name: Update package cache
  ansible.builtin.apt:
    update_cache: true
    cache_valid_time: 300
  changed_when: false

# Count upgradable packages (informational; never fails the play).
# pipefail is a bash feature, so the task runs under /bin/bash explicitly.
- name: Check for available updates
  ansible.builtin.shell: |
    set -o pipefail
    apt list --upgradable 2>/dev/null | grep -v "Listing..." | wc -l
  args:
    executable: /bin/bash
  register: updates_available_count
  changed_when: false
  failed_when: false
- name: Check if reboot required before patching
ansible.builtin.stat:
path: /var/run/reboot-required
register: reboot_required_before
changed_when: false
# Upgrade packages and keep the result in apt_update_result for the report.
# This is a single task on purpose: with two mutually exclusive tasks that
# register the same variable, the skipped one overwrites the result of the one
# that ran, and the report then always shows "No" updates applied.
# The upgrade mode per setting is unchanged: 'dist' when patch_security_only
# is true, 'full' otherwise.
# NOTE(review): `upgrade: dist` upgrades every package, not only security
# updates - confirm whether patch_security_only needs a real security-only
# mechanism.
- name: Apply package updates
  ansible.builtin.apt:
    upgrade: "{{ (patch_security_only | bool) | ternary('dist', 'full') }}"
    update_cache: true
  register: apt_update_result
  notify: restart patching services
- name: Check if reboot required after patching
ansible.builtin.stat:
path: /var/run/reboot-required
register: reboot_required_after
changed_when: false
- name: Verify critical services are running
ansible.builtin.service:
name: "{{ item }}"
state: started
enabled: true
loop: "{{ critical_services }}"
failed_when: false
- name: Run post-patch health checks
ansible.builtin.uri:
url: http://localhost/health
method: GET
status_code: 200
register: health_check
failed_when: false
ignore_errors: true
when: "'webservers' in group_names"
- name: Set reboot required flag
ansible.builtin.set_fact:
reboot_required: "{{ reboot_required_after.stat.exists | default(false) }}"
# Reboot only when the host asks for it (reboot_required) and the operator
# allowed automatic reboots (reboot_if_required). The module option that
# limits how long to wait for the host to come back is `reboot_timeout`.
- name: Perform system reboot if required
  ansible.builtin.reboot:
    msg: "Rebooting after security patches"
    reboot_timeout: "{{ reboot_timeout | int }}"
  when: (reboot_required | bool) and (reboot_if_required | bool)
- name: Generate patching report
ansible.builtin.template:
src: patch_report.j2
dest: /var/log/patch_report_{{ ansible_date_time.iso8601 }}.log
mode: '0644'
vars:
updates_applied_count: "{{ apt_update_result.changed | ternary('Yes', 'No') }}"
reboot_required_flag: "{{ reboot_required }}"
@@ -0,0 +1,10 @@
Patching Report
===============
Generated: {{ ansible_date_time.iso8601 }}
Host: {{ inventory_hostname }}
Updates Applied: {{ updates_applied_count }}
Reboot Required: {{ reboot_required_flag }}
Services Restarted: {{ services_to_restart | join(', ') }}
Backup Location: /var/backups/pre-patch-{{ ansible_date_time.iso8601 }}/
@@ -0,0 +1,21 @@
# Scenario: Simulate Failure and Patch
## Description
Validate that a service-level failure can be detected, recovered, and followed by a controlled patch workflow. This mirrors a maintenance window where a degraded node is stabilized before package updates are applied.
## Commands
```bash
cd professional-infra/linux-operations-automation
./scripts/simulate_failure.sh service 30 web
ansible-playbook -i inventory/hosts.ini playbooks/patch.yml
ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml --check
```
## Expected Result
- The simulation records a temporary service failure.
- The service is restored after cleanup.
- The patch playbook completes without unreachable hosts.
- Hardening check mode reports no destructive changes.
@@ -0,0 +1,116 @@
---
# Scenario play: read the load averages of every host, decide whether to scale
# up or down, call the scaling roles, then verify, notify and report.
# NOTE(review): scaling_threshold is compared directly with the 5-minute load
# average; confirm that 80 is meant as a load value and not a percentage.
- name: Enterprise Scaling Event Scenario
  hosts: all
  become: true
  gather_facts: true

  vars:
    scaling_threshold: 80
    cooldown_period: 300
    max_scale_up: 5
    min_instances: 2
    # Numeric part of the host name (web3 -> 3); 0 when the name has no digits.
    host_index: "{{ inventory_hostname | regex_findall('[0-9]+') | first | default(0) | int }}"

  pre_tasks:
    - name: Log scenario start
      ansible.builtin.lineinfile:
        path: "/var/log/scaling_scenario.log"
        line: "{{ ansible_date_time.iso8601 }} - Starting scaling event scenario"
        create: true
        mode: '0644'

    # /proc/loadavg starts with the 1, 5 and 15 minute averages separated by
    # spaces. Splitting `uptime` output on commas picked up the uptime and
    # user-count fields instead of the load values.
    - name: Check current load
      ansible.builtin.command: cat /proc/loadavg
      register: system_load
      changed_when: false

    - name: Parse load average
      ansible.builtin.set_fact:
        load_1min: "{{ system_load.stdout.split()[0] | float }}"
        load_5min: "{{ system_load.stdout.split()[1] | float }}"
        load_15min: "{{ system_load.stdout.split()[2] | float }}"

  tasks:
    # Facts may come back as strings, so cast before comparing with numbers.
    - name: Evaluate scaling conditions
      ansible.builtin.set_fact:
        scale_up_needed: "{{ (load_5min | float) > scaling_threshold }}"
        scale_down_needed: "{{ (load_5min | float) < (scaling_threshold * 0.3) }}"

    # Group membership is a real condition here. Written as a quoted string
    # inside the expression it was a non-empty literal and therefore always true.
    - name: Scale up web servers
      ansible.builtin.include_role:
        name: scale_up
        tasks_from: web_servers
      vars:
        scale_count: "{{ [max_scale_up, ((load_5min | float) / 10) | int] | min }}"
      when:
        - scale_up_needed | bool
        - "'webservers' in group_names"

    - name: Scale up database servers
      ansible.builtin.include_role:
        name: scale_up
        tasks_from: database_servers
      vars:
        scale_count: "{{ [2, ((load_5min | float) / 20) | int] | min }}"
      when:
        - scale_up_needed | bool
        - "'databases' in group_names"

    - name: Update load balancer configuration
      ansible.builtin.include_role:
        name: load_balancer
        tasks_from: update_backends
      when: scale_up_needed | bool

    - name: Scale down web servers
      ansible.builtin.include_role:
        name: scale_down
        tasks_from: web_servers
      vars:
        scale_count: "{{ [(host_index | int) - min_instances, 1] | max }}"
      when:
        - scale_down_needed | bool
        - "'webservers' in group_names"
        - (host_index | int) > min_instances

    - name: Wait for cooldown period
      ansible.builtin.pause:
        seconds: "{{ cooldown_period }}"
      when: (scale_up_needed | bool) or (scale_down_needed | bool)

    - name: Verify scaling results
      ansible.builtin.uri:
        url: http://localhost/health
        method: GET
        status_code: 200
      register: health_check
      until: health_check.status == 200
      retries: 5
      delay: 10
      when: "'webservers' in group_names"

    - name: Update monitoring thresholds
      ansible.builtin.include_role:
        name: monitoring
        tasks_from: update_alerts
      vars:
        new_threshold: "{{ scaling_threshold + 10 }}"

    # health_check has no `status` on hosts where the check was skipped, hence
    # the default(0) wherever it is read below.
    - name: Send scaling notification
      community.general.mail:
        to: "{{ scaling_notification_email | default('infra-team@company.com') }}"
        subject: "Infrastructure Scaling Event - {{ inventory_hostname }}"
        body: |
          Scaling event completed on {{ inventory_hostname }}
          Load averages: {{ load_1min }}, {{ load_5min }}, {{ load_15min }}
          Action taken: {{ 'Scale Up' if (scale_up_needed | bool) else 'Scale Down' if (scale_down_needed | bool) else 'No Action' }}
          Health check: {{ 'PASSED' if (health_check.status | default(0)) == 200 else 'FAILED' }}
          See /var/log/scaling_scenario.log for details
      when: scaling_notification_email is defined
      ignore_errors: true

  post_tasks:
    - name: Generate scaling scenario report
      ansible.builtin.template:
        src: templates/scaling_scenario_report.j2
        dest: "/var/log/scaling_scenario_report_{{ ansible_date_time.iso8601 }}.log"
        mode: '0644'
      vars:
        scenario_outcome: "{{ 'SUCCESS' if (health_check.status | default(0)) == 200 else 'WARNING' }}"
        load_metrics: "{{ load_1min }}, {{ load_5min }}, {{ load_15min }}"

    - name: Log scenario completion
      ansible.builtin.lineinfile:
        path: "/var/log/scaling_scenario.log"
        line: "{{ ansible_date_time.iso8601 }} - Scaling event scenario completed"
@@ -0,0 +1,388 @@
#!/bin/bash
# Enterprise Infrastructure Failure Simulation Script
# Simulates various types of infrastructure failures for testing
set -euo pipefail
# Configuration
DOCKER_COMPOSE_FILE="docker-compose.yml"
INVENTORY_FILE="inventory/hosts.ini"
LOG_FILE="logs/failure_simulation.log"
# Default values
FAILURE_TYPE="${1:-network}"
DURATION="${2:-60}"
TARGET_NODES="${3:-all}"
INTENSITY="${INTENSITY:-medium}"
# Logging function
# Print a timestamped message on stdout and append the same line to $LOG_FILE.
log() {
    local stamp
    stamp="$(date '+%Y-%m-%d %H:%M:%S')"
    echo "${stamp} - $*" | tee -a "$LOG_FILE"
}
# Error handling
# Log "ERROR: <message>", undo any injected failure, and exit with status 1.
#   $1 - message to log
# cleanup_failure is called here explicitly and runs a second time through the
# EXIT trap installed at the bottom of the script.
error_exit() {
log "ERROR: $1"
# Cleanup any active failures
cleanup_failure
exit 1
}
# Validate inputs
# Check FAILURE_TYPE, DURATION and INTENSITY; any invalid value ends the script
# through error_exit.
validate_inputs() {
    # Failure type must name one of the implemented simulations.
    if [[ ! "$FAILURE_TYPE" =~ ^(network|disk|service|node|cpu|memory)$ ]]; then
        error_exit "Invalid failure type: $FAILURE_TYPE. Must be network, disk, service, node, cpu, or memory"
    fi

    # Duration must consist of digits only and be at least 1.
    case "$DURATION" in
        ''|*[!0-9]*)
            error_exit "Invalid duration: $DURATION. Must be a positive integer (seconds)"
            ;;
    esac
    if [ "$DURATION" -lt 1 ]; then
        error_exit "Invalid duration: $DURATION. Must be a positive integer (seconds)"
    fi

    # Intensity must be one of the four known levels.
    if [[ ! "$INTENSITY" =~ ^(low|medium|high|critical)$ ]]; then
        error_exit "Invalid intensity: $INTENSITY. Must be low, medium, high, or critical"
    fi
}
# Get target containers
# Print the service names to act on.
#   TARGET_NODES=all  -> "web db lb" in simulation mode, otherwise the services
#                        reported by `docker compose ps --services`
#   anything else     -> TARGET_NODES itself, unchanged
get_target_containers() {
    local simulated="${SIMULATION_MODE:-false}"

    if [ "$TARGET_NODES" != "all" ]; then
        # web, db, lb, monitor and custom names are all passed through as-is.
        echo "$TARGET_NODES"
    elif [ "$simulated" = true ]; then
        echo "web db lb"
    else
        docker compose ps --services | grep -v "^NAME$" || true
    fi
}
# Network failure simulation
# Disconnect every targeted container from the network named in its
# HostConfig.NetworkMode and record "<container id>:<network>" in
# /tmp/network_failure_state so cleanup_failure can reconnect it.
# Does nothing besides logging when SIMULATION_MODE=true.
simulate_network_failure() {
    local containers container container_ids cid network
    containers=$(get_target_containers)
    log "Simulating network failure on containers: $containers"
    if [ "${SIMULATION_MODE:-false}" = true ]; then
        log "SIMULATION_MODE=true: skipping Docker network changes"
        return
    fi
    for container in $containers; do
        container_ids=$(docker compose ps -q "$container" 2>/dev/null || true)
        for cid in $container_ids; do
            [ -n "$cid" ] || continue
            # Look the network up once and reuse it for both the state record
            # and the disconnect.
            network=$(docker inspect "$cid" --format '{{.HostConfig.NetworkMode}}' 2>/dev/null || true)
            if [ -z "$network" ]; then
                log "WARNING: could not determine network for container $cid, skipping"
                continue
            fi
            log "Disconnecting network for container $cid"
            # Record the restore information BEFORE disconnecting, so an
            # interruption between the two steps still leaves cleanup able to
            # reconnect the container.
            echo "$cid:$network" >> /tmp/network_failure_state
            docker network disconnect "$network" "$cid" 2>/dev/null || true
        done
    done
}
# Disk failure simulation
# Write a zero-filled /tmp/disk_fill file inside every targeted container and
# record "<container id>:disk_fill" in /tmp/disk_failure_state for cleanup.
# File size by INTENSITY: low 50 MB, high 500 MB, critical 1024 MB, otherwise 100 MB.
simulate_disk_failure() {
    local targets=$(get_target_containers)
    log "Simulating disk space exhaustion on containers: $targets"
    if [ "${SIMULATION_MODE:-false}" = true ]; then
        log "SIMULATION_MODE=true: skipping container disk writes"
        return
    fi

    # The size depends only on INTENSITY, so work it out once up front.
    local fill_mb
    case "$INTENSITY" in
        low)      fill_mb=50 ;;
        high)     fill_mb=500 ;;
        critical) fill_mb=1024 ;;
        *)        fill_mb=100 ;;
    esac

    local service id
    for service in $targets; do
        local ids=$(docker compose ps -q "$service" 2>/dev/null || true)
        for id in $ids; do
            [ -n "$id" ] || continue
            log "Filling disk space in container $id"
            # Create a large file to consume disk space
            docker exec "$id" bash -c "dd if=/dev/zero of=/tmp/disk_fill bs=1M count=${fill_mb}" 2>/dev/null || true
            echo "$id:disk_fill" >> /tmp/disk_failure_state
        done
    done
}
# Service failure simulation
# Stop nginx, postgresql and haproxy (each best effort) inside every targeted
# container and record "<container id>:services" in /tmp/service_failure_state.
# In simulation mode only the intended action is logged per target.
simulate_service_failure() {
    local targets=$(get_target_containers)
    log "Simulating service failures on containers: $targets"

    local service id unit
    if [ "${SIMULATION_MODE:-false}" = true ]; then
        for service in $targets; do
            log "SIMULATION_MODE=true: would stop services in $service"
        done
        return
    fi

    for service in $targets; do
        local ids=$(docker compose ps -q "$service" 2>/dev/null || true)
        for id in $ids; do
            [ -n "$id" ] || continue
            log "Stopping services in container $id"
            # Stop common services
            for unit in nginx postgresql haproxy; do
                docker exec "$id" systemctl stop "$unit" 2>/dev/null || true
            done
            echo "$id:services" >> /tmp/service_failure_state
        done
    done
}
# Node failure simulation
# Simulate a dead node by pausing every targeted container and recording
# "<container id>:paused" in /tmp/node_failure_state for cleanup_failure.
# Only logs in simulation mode.
simulate_node_failure() {
local containers=$(get_target_containers)
log "Simulating complete node failures on containers: $containers"
if [ "${SIMULATION_MODE:-false}" = true ]; then
log "SIMULATION_MODE=true: skipping container pause"
return
fi
for container in $containers; do
local container_ids=$(docker compose ps -q "$container" 2>/dev/null || true)
for cid in $container_ids; do
if [ -n "$cid" ]; then
# The message says "Stopping", but the container is paused, not stopped.
log "Stopping container $cid (node failure)"
# Unlike the other simulations this call has no `|| true`; under `set -e`
# a failing pause ends the script and the EXIT trap performs the cleanup.
docker pause "$cid"
echo "$cid:paused" >> /tmp/node_failure_state
fi
done
done
}
# CPU stress simulation
# Start a detached busy loop inside every targeted container and record
# "<container id>:cpu_stress:<pid>" in /tmp/cpu_failure_state so
# cleanup_failure can kill it. Only logs in simulation mode.
simulate_cpu_failure() {
local containers=$(get_target_containers)
log "Simulating CPU stress on containers: $containers"
if [ "${SIMULATION_MODE:-false}" = true ]; then
log "SIMULATION_MODE=true: skipping CPU stress"
return
fi
for container in $containers; do
local container_ids=$(docker compose ps -q "$container" 2>/dev/null || true)
for cid in $container_ids; do
if [ -n "$cid" ]; then
log "Starting CPU stress in container $cid"
# Start CPU stress process
docker exec -d "$cid" bash -c "while true; do :; done" 2>/dev/null || true
# The PID is found by grepping `ps aux` inside the container for the loop
# text; the first match is stored.
# NOTE(review): if `ps` is unavailable or nothing matches, an empty PID is
# recorded and cleanup cannot kill the loop - confirm this is acceptable.
echo "$cid:cpu_stress:$(docker exec "$cid" ps aux | grep "while true" | grep -v grep | awk '{print $2}' | head -1)" >> /tmp/cpu_failure_state
fi
done
done
}
# Memory stress simulation
# Start a detached `tail /dev/zero` inside every targeted container and record
# "<container id>:memory_stress:<pid>" in /tmp/memory_failure_state so
# cleanup_failure can kill it. Only logs in simulation mode.
simulate_memory_failure() {
local containers=$(get_target_containers)
log "Simulating memory exhaustion on containers: $containers"
if [ "${SIMULATION_MODE:-false}" = true ]; then
log "SIMULATION_MODE=true: skipping memory stress"
return
fi
for container in $containers; do
local container_ids=$(docker compose ps -q "$container" 2>/dev/null || true)
for cid in $container_ids; do
if [ -n "$cid" ]; then
log "Starting memory stress in container $cid"
# Start memory stress process
docker exec -d "$cid" bash -c "tail /dev/zero" 2>/dev/null || true
# The PID is found by grepping `ps aux` inside the container; the first
# match is stored.
# NOTE(review): an empty PID is recorded when nothing matches, in which case
# cleanup cannot kill the process - confirm this is acceptable.
echo "$cid:memory_stress:$(docker exec "$cid" ps aux | grep "tail /dev/zero" | grep -v grep | awk '{print $2}' | head -1)" >> /tmp/memory_failure_state
fi
done
done
}
# Inject failure
# Dispatch to the simulate_*_failure function matching FAILURE_TYPE.
# An unknown type matches no branch and does nothing; validate_inputs is
# called before this function in main and rejects unknown types.
inject_failure() {
case "$FAILURE_TYPE" in
network) simulate_network_failure ;;
disk) simulate_disk_failure ;;
service) simulate_service_failure ;;
node) simulate_node_failure ;;
cpu) simulate_cpu_failure ;;
memory) simulate_memory_failure ;;
esac
}
# Cleanup failure
# Undo every failure recorded in the /tmp/*_failure_state files.
# Each state file holds one colon-separated record per affected container; the
# file is processed line by line and removed afterwards, so calling this
# function again finds nothing left to do. Every docker call is best effort
# (`|| true`), so one container that has gone away does not stop the remaining
# cleanup.
cleanup_failure() {
log "Cleaning up failure simulation"
# Restore network connections
# Records are "<container id>:<network>".
if [ -f /tmp/network_failure_state ]; then
while IFS=: read -r cid network; do
docker network connect "$network" "$cid" 2>/dev/null || true
done < /tmp/network_failure_state
rm -f /tmp/network_failure_state
fi
# Clean up disk fill files
if [ -f /tmp/disk_failure_state ]; then
while IFS=: read -r cid _; do
docker exec "$cid" rm -f /tmp/disk_fill 2>/dev/null || true
done < /tmp/disk_failure_state
rm -f /tmp/disk_failure_state
fi
# Restart services
# All three services are started in every recorded container, whichever of
# them was actually running before.
if [ -f /tmp/service_failure_state ]; then
while IFS=: read -r cid _; do
docker exec "$cid" systemctl start nginx 2>/dev/null || true
docker exec "$cid" systemctl start postgresql 2>/dev/null || true
docker exec "$cid" systemctl start haproxy 2>/dev/null || true
done < /tmp/service_failure_state
rm -f /tmp/service_failure_state
fi
# Unpause containers
if [ -f /tmp/node_failure_state ]; then
while IFS=: read -r cid _; do
docker unpause "$cid" 2>/dev/null || true
done < /tmp/node_failure_state
rm -f /tmp/node_failure_state
fi
# Kill stress processes
# Records are "<container id>:<kind>:<pid>"; the PID belongs to the process
# namespace of the container, hence the kill through `docker exec`.
if [ -f /tmp/cpu_failure_state ]; then
while IFS=: read -r cid _ pid; do
docker exec "$cid" kill -9 "$pid" 2>/dev/null || true
done < /tmp/cpu_failure_state
rm -f /tmp/cpu_failure_state
fi
if [ -f /tmp/memory_failure_state ]; then
while IFS=: read -r cid _ pid; do
docker exec "$cid" kill -9 "$pid" 2>/dev/null || true
done < /tmp/memory_failure_state
rm -f /tmp/memory_failure_state
fi
}
# Monitor failure
# Observe the stack while the failure is active.
# Logs the container state and `docker stats` about every 10 seconds until
# DURATION seconds have passed. In simulation mode nothing is polled: the
# function logs that validation was simulated and returns immediately.
monitor_failure() {
    local interval=10
    local end_time now remaining ps_output
    end_time=$(( $(date +%s) + DURATION ))
    log "Monitoring failure for $DURATION seconds"

    if [ "${SIMULATION_MODE:-false}" = true ]; then
        log "SIMULATION_MODE=true: validation simulated"
        return
    fi

    while :; do
        now=$(date +%s)
        remaining=$(( end_time - now ))
        [ "$remaining" -gt 0 ] || break

        # Capture the output first: `grep -q` on a live pipe can close it early
        # and, with pipefail, make the pipeline look failed although a match
        # was found.
        ps_output=$(docker compose ps || true)
        if ! grep -q "Up\|Paused" <<< "$ps_output"; then
            log "WARNING: All containers are down"
        fi

        # Log system metrics
        log "System status: $(docker stats --no-stream --format 'table {{.Container}}\t{{.CPUPerc}}\t{{.MemPerc}}' | tail -n +2)"

        # Never sleep past the end of the window (matters for DURATION < 10).
        sleep "$(( remaining < interval ? remaining : interval ))"
    done
}
# Generate failure report
# `docker compose ps` output captured by main() before the failure is injected.
PRE_FAILURE_STATUS=""

# Write reports/failure_simulation_<timestamp>.txt summarising the run.
# "Pre-failure Status" is the snapshot taken by main() before injection;
# "Post-failure Status" is the state at report time, i.e. after cleanup.
generate_report() {
    local report_file post_status
    report_file="reports/failure_simulation_$(date +%Y%m%d_%H%M%S).txt"
    post_status=$(docker compose ps 2>/dev/null || echo "Docker Compose not running")
    cat > "$report_file" << EOF
Failure Simulation Report
========================
Timestamp: $(date)
Failure Type: $FAILURE_TYPE
Duration: $DURATION seconds
Target Nodes: $TARGET_NODES
Intensity: $INTENSITY
Pre-failure Status:
${PRE_FAILURE_STATUS:-Not captured}
Post-failure Status:
$post_status
Log File: $LOG_FILE
EOF
    log "Failure simulation report generated: $report_file"
}

# Main execution
# Validate the inputs, snapshot the stack, inject the failure, monitor it for
# DURATION seconds, undo it, and write the report.
main() {
    log "Starting failure simulation: $FAILURE_TYPE for $DURATION seconds"
    validate_inputs

    # Take the "before" picture while the stack is still healthy; taking it at
    # report time would only repeat the post-failure state.
    PRE_FAILURE_STATUS=$(docker compose ps 2>/dev/null || echo "Docker Compose not running")

    # Inject failure
    inject_failure
    # Monitor during failure
    monitor_failure
    # Cleanup
    cleanup_failure
    # Everything has been restored; drop the EXIT trap so the cleanup does not
    # run (and log) a second time when the script ends.
    trap - EXIT
    # Generate report
    generate_report
    log "Failure simulation completed successfully"
}
# Trap for cleanup on script exit
trap cleanup_failure EXIT
# Initialize logging
mkdir -p logs reports
# Run main function
main "$@"
@@ -0,0 +1,229 @@
#!/bin/bash
# Enterprise Infrastructure Scaling Simulation Script
# Simulates scaling operations for infrastructure nodes
#
# Positional arguments (all optional):
#   $1  direction  - "up" or "down"              (default: up)
#   $2  count      - number of nodes to add/remove (default: 1)
#   $3  node type  - web, db, lb or monitor       (default: web)
# Environment:
#   SIMULATION_MODE - when "true", Docker and Ansible mutations are skipped
#                     and only logged (default: false)

# Strict mode: abort on command failure (-e), on use of an unset variable (-u),
# and treat a pipeline as failed if any stage fails (pipefail).
set -euo pipefail
# Configuration
# Compose file passed to `docker compose -f` when scaling up.
DOCKER_COMPOSE_FILE="docker-compose.yml"
# Inventory handed to every ansible / ansible-playbook invocation.
INVENTORY_FILE="inventory/hosts.ini"
# File that log() appends every message to.
LOG_FILE="logs/scaling_simulation.log"
# Default values
DIRECTION="${1:-up}"
COUNT="${2:-1}"
NODE_TYPE="${3:-web}"
# Keep a value supplied through the environment; otherwise run for real.
SIMULATION_MODE="${SIMULATION_MODE:-false}"
# Logging function
# Prints "<YYYY-MM-DD HH:MM:SS> - <message>" to stdout and appends the same
# line to LOG_FILE. All arguments are joined into a single message.
log() {
    printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG_FILE"
}
# Error handling
# Logs the given message with an "ERROR: " prefix and terminates the script
# with exit status 1.
#   $1 - human-readable description of the failure
error_exit() {
    local message="ERROR: $1"
    log "$message"
    exit 1
}
# Validate inputs
# Checks the DIRECTION, COUNT and NODE_TYPE globals and aborts through
# error_exit on the first value that is not acceptable.
validate_inputs() {
    # Direction must be one of the two supported operations.
    case "$DIRECTION" in
        up|down) ;;
        *) error_exit "Invalid direction: $DIRECTION. Must be 'up' or 'down'" ;;
    esac

    # Count must consist of digits only and be at least 1.
    if [[ ! "$COUNT" =~ ^[0-9]+$ ]]; then
        error_exit "Invalid count: $COUNT. Must be a positive integer"
    elif [ "$COUNT" -lt 1 ]; then
        error_exit "Invalid count: $COUNT. Must be a positive integer"
    fi

    # Node type must be one of the known service names.
    if [[ "$NODE_TYPE" != web && "$NODE_TYPE" != db && "$NODE_TYPE" != lb && "$NODE_TYPE" != monitor ]]; then
        error_exit "Invalid node type: $NODE_TYPE. Must be web, db, lb, or monitor"
    fi
}
# Get current node count
# Prints the number of running nodes of the given type on stdout.
#   $1 - node type: web, db, lb or monitor (any other value prints nothing)
# In simulation mode fixed counts are reported (web=3, db=2, lb/monitor=1);
# otherwise the "Up" lines of `docker compose ps <type>` are counted.
# Always returns 0, including when the count is zero.
get_current_count() {
    local type="$1"

    # Unknown types produce no output, matching the supported-type list used
    # by validate_inputs.
    case "$type" in
        web|db|lb|monitor) ;;
        *) return 0 ;;
    esac

    if [ "$SIMULATION_MODE" = true ]; then
        case "$type" in
            web) echo 3 ;;
            db) echo 2 ;;
            lb|monitor) echo 1 ;;
        esac
        return 0
    fi

    # grep -c prints the count but exits 1 when that count is 0; under
    # `set -euo pipefail` that would make "no nodes running" look like a
    # failure, so the status is deliberately neutralised.
    docker compose ps "$type" | grep -c "Up" || true
}
# Scale up infrastructure
# Adds nodes of one type to the running stack and provisions them.
#   $1 - node type (Compose service name)
#   $2 - number of nodes to ADD to what is currently running
# In simulation mode nothing is mutated; the action is only logged.
scale_up() {
    local type="$1"
    local count="$2"
    local current target

    log "Scaling up $count $type nodes"

    if [ "$SIMULATION_MODE" = true ]; then
        log "SIMULATION_MODE=true: skipping Docker Compose mutation and Ansible provisioning"
        update_inventory "$type" "$count" "add"
        log "Successfully simulated scale up of $count $type nodes"
        return
    fi

    # `--scale SERVICE=NUM` sets the TOTAL number of replicas, not an
    # increment. Passing the raw count would shrink a service that already
    # runs more than $count containers, so compute the new total first.
    # `|| true`: the lookup may exit non-zero when nothing is running yet.
    current=$(get_current_count "$type" || true)
    # 10# forces base 10 so a zero-padded count such as "08" is not read as octal.
    target=$(( 10#${current:-0} + 10#$count ))

    docker compose -f "$DOCKER_COMPOSE_FILE" up -d --scale "${type}=${target}"

    # Wait for containers to be ready
    log "Waiting for containers to be ready..."
    sleep 30

    # Update inventory
    update_inventory "$type" "$count" "add"

    # Run provisioning playbook on the nodes of this type. The simulation
    # branch above has already returned, so no further mode check is needed.
    ansible-playbook -i "$INVENTORY_FILE" playbooks/provision.yml --limit "${type}*"

    log "Successfully scaled up $count $type nodes"
}
# Scale down infrastructure
# Removes nodes of one type from the running stack.
#   $1 - node type (Compose service name)
#   $2 - number of nodes to remove
# Aborts through error_exit when fewer nodes are running than requested.
# In simulation mode nothing is mutated; the action is only logged.
scale_down() {
    local type="$1" count="$2"
    # Declared and assigned in one statement on purpose: `local` hides the
    # exit status of the lookup, so a non-zero status does not trip `set -e`.
    local running=$(get_current_count "$type")

    if [ "$running" -lt "$count" ]; then
        error_exit "Cannot scale down $count nodes. Only $running $type nodes currently running"
    fi

    log "Scaling down $count $type nodes"

    if [ "$SIMULATION_MODE" = true ]; then
        log "SIMULATION_MODE=true: skipping Docker Compose mutation and Ansible decommissioning"
        update_inventory "$type" "$count" "remove"
        log "Successfully simulated scale down of $count $type nodes"
        return
    fi

    # Take the first $count "Up" entries of the service listing; column 1 is
    # used as the container name.
    local victims=$(docker compose ps "$type" | grep "Up" | head -n "$count" | awk '{print $1}')

    # Decommission each selected node, then stop and delete its container.
    for node in $victims; do
        if [ "$SIMULATION_MODE" = false ]; then
            ansible-playbook -i "$INVENTORY_FILE" playbooks/decommission.yml --limit "$node"
        fi
        docker stop "$node"
        docker rm "$node"
    done

    update_inventory "$type" "$count" "remove"
    log "Successfully scaled down $count $type nodes"
}
# Update Ansible inventory
# Placeholder: records the inventory change in the log only; the inventory
# file itself is not modified.
#   $1 - node type
#   $2 - number of nodes
#   $3 - action: "add" or "remove" (anything else logs only the first line)
update_inventory() {
    local node_type="$1"
    local node_count="$2"
    local operation="$3"

    log "Updating inventory for $operation $node_count $node_type nodes"

    # A real implementation would edit the inventory here.
    if [ "$operation" = "add" ]; then
        log "Added $node_count $node_type nodes to inventory"
    elif [ "$operation" = "remove" ]; then
        log "Removed $node_count $node_type nodes from inventory"
    fi
}
# Health check after scaling
# Verifies the stack after a scaling operation:
#   1. aborts through error_exit when no container is listed as "Up";
#   2. logs a warning (non-fatal) when the Ansible ping of all hosts fails.
# In simulation mode both checks are skipped and only logged.
health_check() {
    local ps_output

    log "Running health checks after scaling"

    if [ "$SIMULATION_MODE" = true ]; then
        log "SIMULATION_MODE=true: health checks simulated"
        return
    fi

    # Capture the listing before matching. With `set -o pipefail`, piping
    # straight into `grep -q` can report failure even on a match, because grep
    # exits at the first hit and the producer may then die from SIGPIPE.
    # A failing `docker compose ps` is treated as "nothing is up".
    ps_output=$(docker compose ps) || ps_output=""
    # NOTE(review): this only detects the case where NO container is Up, while
    # the message says "Some" - confirm which condition is intended.
    if ! grep -q "Up" <<< "$ps_output"; then
        error_exit "Some containers failed to start"
    fi

    # Ansible ping check. The simulation branch has already returned, so no
    # further mode check is needed here.
    if ! ansible -i "$INVENTORY_FILE" all -m ping >/dev/null 2>&1; then
        log "WARNING: Some nodes failed Ansible ping check"
    fi

    log "Health checks completed"
}
# Generate scaling report
# Writes a timestamped text report under reports/ containing the run
# parameters, the current `docker compose ps` listing and the Ansible host
# list. Each external query falls back to a fixed message when it fails.
generate_report() {
    local file_stamp generated_at compose_state inventory_state report_file

    file_stamp=$(date +%Y%m%d_%H%M%S)
    report_file="reports/scaling_report_${file_stamp}.txt"

    # Collect every dynamic section first, then render the template.
    generated_at=$(date)
    compose_state=$(docker compose ps 2>/dev/null || echo "Docker Compose not running")
    inventory_state=$(ansible -i "$INVENTORY_FILE" --list-hosts all 2>/dev/null || echo "Ansible inventory check failed")

    cat > "$report_file" << EOF
Scaling Simulation Report
========================
Timestamp: $generated_at
Direction: $DIRECTION
Node Type: $NODE_TYPE
Count: $COUNT
Simulation Mode: $SIMULATION_MODE
Current Status:
$compose_state
Inventory Status:
$inventory_state
Log File: $LOG_FILE
EOF

    log "Scaling report generated: $report_file"
}
# Main execution
# Drives one scaling run: validate the inputs, scale in the requested
# direction, run the health checks and write the report.
main() {
    log "Starting scaling simulation: $DIRECTION $COUNT $NODE_TYPE nodes"
    validate_inputs

    # Dispatch on the requested direction.
    if [ "$DIRECTION" = "up" ]; then
        scale_up "$NODE_TYPE" "$COUNT"
    elif [ "$DIRECTION" = "down" ]; then
        scale_down "$NODE_TYPE" "$COUNT"
    fi

    health_check
    generate_report
    log "Scaling simulation completed successfully"
}
# Create the output directories before anything is logged: log() appends to
# a file under logs/ and the report is written under reports/.
mkdir -p logs reports
# Run the simulation, forwarding the script's command-line arguments.
main "$@"