Rework portfolio around Linux operations, Zabbix monitoring, migration validation, and ELK/Grafana log observability. Add AAP-style LVM resize workflow, Zabbix server/proxy/agent automation assets, Linux/AIX monitoring templates, and updated validation CI.
This commit is contained in:
@@ -0,0 +1,14 @@
|
||||
---
# ansible-lint settings for this repository.

# Paths that ansible-lint does not scan.
exclude_paths:
  - ".git"
  - ".gitea"
  - "molecule/"
  - "molecule/default/tests/"
  - "scenarios/"

# Rule IDs that are skipped.
skip_list:
  - "role-name"
  - "name[casing]"
  - "line-too-long"
|
||||
@@ -0,0 +1,95 @@
|
||||
# Linux Operations Automation Makefile
#
# Command wrappers around Ansible, docker compose and the simulation scripts.
# No target produces a file, so all of them are declared phony.
#
# Command-line overrides:
#   INVENTORY  inventory handed to ansible / ansible-playbook
#   COUNT      count argument for the scale-* targets (falls back to 1)
#   DURATION   duration argument for the fail-* targets (per-target fallback)

# Used to be hard-coded in every recipe. `?=` keeps a value supplied through
# the environment or `make <target> INVENTORY=...`, so the same targets can be
# pointed at a lab or staging inventory without editing this file.
INVENTORY ?= inventory/hosts.ini

.PHONY: help test run demo patch harden decommission lvm-check up down status logs validate clean lint scale-up-web scale-up-db scale-down-web scale-down-db fail-network fail-disk fail-service fail-node scenario-scaling help-scaling help-failure

# First target in the file, so a bare `make` prints the help text.
# The listing is generated from the `## ` comment on each target line.
help: ## Show this help message
	@echo "Linux Operations Automation"
	@echo ""
	@echo "Available commands:"
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf " %-18s %s\n", $$1, $$2}'

test: ## Run offline validation checks
	ansible-playbook -i "$(INVENTORY)" --syntax-check playbooks/*.yml
	ansible-lint

run: ## Run provisioning against the configured inventory
	ansible-playbook -i "$(INVENTORY)" playbooks/provision.yml

demo: ## Run a safe local demonstration without requiring live SSH hosts
	SIMULATION_MODE=true bash ./scripts/simulate_failure.sh service 5 web

patch: ## Apply patching workflow against the configured inventory
	ansible-playbook -i "$(INVENTORY)" playbooks/patch.yml

harden: ## Apply hardening workflow against the configured inventory
	ansible-playbook -i "$(INVENTORY)" playbooks/hardening.yml

decommission: ## Run decommissioning workflow against the configured inventory
	ansible-playbook -i "$(INVENTORY)" playbooks/decommission.yml

lvm-check: ## Validate the AAP-style LVM resize workflow
	ansible-playbook -i "$(INVENTORY)" --syntax-check playbooks/lvm_resize.yml

up: ## Start the optional local container scaffold
	docker compose up -d

down: ## Stop the optional local container scaffold
	docker compose down

# The inventory listing is best effort: a failure prints a message instead of
# failing the target.
status: ## Show local scaffold status and inventory hosts
	docker compose ps
	ansible -i "$(INVENTORY)" --list-hosts all || echo "Inventory check failed"

logs: ## Show local scaffold logs
	docker compose logs -f --tail=100

# $(MAKE) (not a literal `make`) so flags and command-line variables such as
# INVENTORY are passed down to the sub-make.
validate: ## Run all offline validation checks
	$(MAKE) test
	docker compose config --quiet

clean: ## Clean up generated local logs and reports
	rm -f logs/*.log reports/*.txt

lint: ## Lint Ansible content
	ansible-lint

# scale-* targets: $(or $(COUNT),1) falls back to 1 when COUNT is unset or empty.
scale-up-web: ## Scale up web servers in simulation mode (usage: make scale-up-web COUNT=2)
	SIMULATION_MODE=true bash ./scripts/simulate_scaling.sh up $(or $(COUNT),1) web

scale-up-db: ## Scale up database servers in simulation mode (usage: make scale-up-db COUNT=1)
	SIMULATION_MODE=true bash ./scripts/simulate_scaling.sh up $(or $(COUNT),1) db

scale-down-web: ## Scale down web servers in simulation mode (usage: make scale-down-web COUNT=1)
	SIMULATION_MODE=true bash ./scripts/simulate_scaling.sh down $(or $(COUNT),1) web

scale-down-db: ## Scale down database servers in simulation mode (usage: make scale-down-db COUNT=1)
	SIMULATION_MODE=true bash ./scripts/simulate_scaling.sh down $(or $(COUNT),1) db

# fail-* targets: each one has its own DURATION fallback.
fail-network: ## Simulate network failure safely (usage: make fail-network DURATION=60)
	SIMULATION_MODE=true bash ./scripts/simulate_failure.sh network $(or $(DURATION),60)

fail-disk: ## Simulate disk pressure safely (usage: make fail-disk DURATION=120)
	SIMULATION_MODE=true bash ./scripts/simulate_failure.sh disk $(or $(DURATION),120)

fail-service: ## Simulate service failures safely (usage: make fail-service DURATION=30)
	SIMULATION_MODE=true bash ./scripts/simulate_failure.sh service $(or $(DURATION),30)

fail-node: ## Simulate node failure safely (usage: make fail-node DURATION=300)
	SIMULATION_MODE=true bash ./scripts/simulate_failure.sh node $(or $(DURATION),300)

scenario-scaling: ## Run scaling event syntax validation
	ansible-playbook -i "$(INVENTORY)" --syntax-check scenarios/scaling_event.yml

help-scaling: ## Show scaling-related commands
	@echo "Scaling Commands:"
	@echo " make scale-up-web COUNT=2"
	@echo " make scale-up-db COUNT=1"
	@echo " make scale-down-web COUNT=1"
	@echo " make scale-down-db COUNT=1"

help-failure: ## Show failure simulation commands
	@echo "Failure Simulation Commands:"
	@echo " make fail-network DURATION=60"
	@echo " make fail-disk DURATION=120"
	@echo " make fail-service DURATION=30"
	@echo " make fail-node DURATION=300"
|
||||
@@ -0,0 +1,92 @@
|
||||
# Linux Operations Automation
|
||||
|
||||
## Problem
|
||||
|
||||
Linux infrastructure work often starts as ticket-driven operations: deploy a server, patch it, harden SSH, check a failed service, expand a filesystem, and leave evidence that the change was safe. These tasks need automation that is readable, repeatable, and cautious enough for production-style environments.
|
||||
|
||||
## CV Relevance
|
||||
|
||||
This project maps directly to Linux/Unix operations, server deployment, patching, troubleshooting, and storage/LVM work from enterprise infrastructure environments. The LVM resize workflow is written in an AAP-style shape: explicit survey variables, dry-run defaults, pre-checks, resize actions, and before/after evidence.
|
||||
|
||||
## What This Project Demonstrates
|
||||
|
||||
- Ansible playbooks for common Linux node lifecycle operations.
|
||||
- Role-based task organization with clear defaults and handlers.
|
||||
- LVM filesystem expansion workflow suitable for Ansible Automation Platform job templates.
|
||||
- Safe simulation scripts for failure, service, and scaling exercises.
|
||||
- Reviewer-friendly evidence in `examples/` without relying on a live enterprise lab.
|
||||
|
||||
## Architecture
|
||||
|
||||
```text
|
||||
Operator -> Make targets -> Ansible inventory -> Playbooks/Roles -> Linux nodes
|
||||
-> Simulation scripts -> Example evidence
|
||||
-> AAP-style LVM workflow -> Before/after report
|
||||
```
|
||||
|
||||
Core components:
|
||||
|
||||
- `inventory/hosts.ini` defines realistic host groups.
|
||||
- `playbooks/` contains provision, patch, harden, and decommission workflows.
|
||||
- `playbooks/lvm_resize.yml` contains the storage expansion workflow.
|
||||
- `roles/` contains the implemented Ansible roles.
|
||||
- `scripts/` provides safe simulation helpers.
|
||||
- `docker-compose.yml` is a lightweight local scaffold, not a production lab.
|
||||
|
||||
## Quickstart
|
||||
|
||||
```bash
|
||||
cd professional-infra/linux-operations-automation
|
||||
make test
|
||||
make demo
|
||||
```
|
||||
|
||||
`make test` runs offline syntax and lint checks. `make demo` runs a safe simulation with `SIMULATION_MODE=true` and does not require reachable SSH hosts.
|
||||
|
||||
To run playbooks against real or lab hosts, update `inventory/hosts.ini` and run:
|
||||
|
||||
```bash
|
||||
make run
|
||||
make patch
|
||||
make harden
|
||||
make decommission
|
||||
```
|
||||
|
||||
Review the LVM workflow:
|
||||
|
||||
```bash
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/lvm_resize.yml --syntax-check
|
||||
cat docs/aap_lvm_resize_workflow.md
|
||||
```
|
||||
|
||||
## Validation
|
||||
|
||||
```bash
|
||||
make test
|
||||
docker compose config --quiet
|
||||
```
|
||||
|
||||
The optional compose scaffold can be started with:
|
||||
|
||||
```bash
|
||||
make up
|
||||
make down
|
||||
```
|
||||
|
||||
## Example Output
|
||||
|
||||
Sample evidence is available in [examples/patch-output.txt](examples/patch-output.txt), [examples/failure-simulation.txt](examples/failure-simulation.txt), and [examples/lvm-resize-output.txt](examples/lvm-resize-output.txt).
|
||||
|
||||
## Interview Talking Points
|
||||
|
||||
- How to make LVM resize automation safe with dry-run defaults and explicit approval.
|
||||
- Why before/after evidence matters for storage and filesystem changes.
|
||||
- How Ansible roles keep Linux baseline operations repeatable.
|
||||
- Where AAP surveys and job templates reduce ticket handling errors.
|
||||
|
||||
## Roadmap
|
||||
|
||||
- Add complete service roles for application deployment examples.
|
||||
- Add backup, security scan, and disaster recovery playbooks.
|
||||
- Add a richer local lab with SSH-ready containers.
|
||||
- Add cloud or Kubernetes deployment variants.
|
||||
@@ -0,0 +1,43 @@
|
||||
# Vault Configuration Guide
|
||||
|
||||
## Overview
|
||||
|
||||
The current portfolio demo does not require Ansible Vault for `make test` or `make demo`. Secrets are intentionally kept out of the main validation path so reviewers can run the project offline.
|
||||
|
||||
Use Vault only when extending this project to manage real hosts or credentials.
|
||||
|
||||
## Recommended Pattern
|
||||
|
||||
1. Start from the example file:
|
||||
|
||||
```bash
|
||||
cp group_vars/vault.example.yml group_vars/vault.yml
|
||||
```
|
||||
|
||||
2. Replace placeholder values locally.
|
||||
|
||||
3. Encrypt the file before using it with real systems:
|
||||
|
||||
```bash
|
||||
ansible-vault encrypt group_vars/vault.yml
|
||||
```
|
||||
|
||||
4. Do not commit real secret values. Keep `group_vars/vault.example.yml` as the committed reference.
|
||||
|
||||
## Running With Vault
|
||||
|
||||
```bash
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/provision.yml --ask-vault-pass
|
||||
```
|
||||
|
||||
or:
|
||||
|
||||
```bash
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/provision.yml --vault-password-file ~/.vault_pass.txt
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- The delivered playbooks do not import a vault file by default.
|
||||
- Add `vars_files` only in an environment-specific branch or private overlay.
|
||||
- Prefer a secret manager or automation controller for production use.
|
||||
@@ -0,0 +1,5 @@
|
||||
# Project-local Ansible settings.
[defaults]
# Where Ansible looks for the inventory and for roles.
inventory = ./inventory/hosts.ini
roles_path = ./roles
# Do not write .retry files.
retry_files_enabled = False
# SSH host key checking is switched off.
host_key_checking = False
|
||||
@@ -0,0 +1,28 @@
|
||||
# Local scaffold: three idle Debian containers with fixed addresses on one
# bridge network.
networks:
  infra_sim:
    driver: bridge
    ipam:
      config:
        - subnet: 172.20.0.0/24

services:
  # Each service only runs `sleep infinity`, which keeps the container alive.
  web:
    image: debian:12-slim
    command:
      - "sleep"
      - "infinity"
    networks:
      infra_sim:
        ipv4_address: 172.20.0.11

  db:
    image: debian:12-slim
    command:
      - "sleep"
      - "infinity"
    networks:
      infra_sim:
        ipv4_address: 172.20.0.21

  lb:
    image: debian:12-slim
    command:
      - "sleep"
      - "infinity"
    networks:
      infra_sim:
        ipv4_address: 172.20.0.31
|
||||
@@ -0,0 +1,45 @@
|
||||
# AAP-Style LVM Resize Workflow
|
||||
|
||||
## Purpose
|
||||
|
||||
This workflow shows how a routine storage ticket can be converted into a controlled Ansible Automation Platform job. It is intentionally conservative: dry-run is the default, required variables are explicit, and every run produces before/after evidence.
|
||||
|
||||
## Suggested Job Template
|
||||
|
||||
- Name: `Linux - LVM Filesystem Resize`
|
||||
- Inventory: Linux production or pre-production inventory
|
||||
- Playbook: `playbooks/lvm_resize.yml`
|
||||
- Credentials: privileged Linux automation credential
|
||||
- Privilege escalation: enabled
|
||||
- Default extra vars:
|
||||
|
||||
```yaml
|
||||
lvm_dry_run: true
|
||||
lvm_resize_filesystem: true
|
||||
```
|
||||
|
||||
## Suggested Survey Variables
|
||||
|
||||
| Variable | Example | Required | Notes |
|
||||
| --- | --- | --- | --- |
|
||||
| `lvm_vg_name` | `vg_app` | yes | Target volume group. |
|
||||
| `lvm_lv_name` | `lv_data` | yes | Target logical volume. |
|
||||
| `lvm_mountpoint` | `/data` | yes | Filesystem mountpoint to validate before/after. |
|
||||
| `lvm_size_request` | `+20G` | yes | Passed to `lvextend -L`; use explicit growth syntax for tickets. |
|
||||
| `lvm_dry_run` | `true` | yes | Start with `true`; switch to `false` after evidence review. |
|
||||
|
||||
## Safety Notes
|
||||
|
||||
- Run with `lvm_dry_run=true` first and attach output to the ticket.
|
||||
- Confirm backup/snapshot status before actual resize.
|
||||
- Confirm filesystem type; this workflow supports XFS and ext filesystems.
|
||||
- Keep requested size aligned with the ticket approval.
|
||||
- Use maintenance windows for critical systems.
|
||||
|
||||
## Evidence Captured
|
||||
|
||||
- `lsblk --fs`
|
||||
- `pvs`, `vgs`, `lvs`
|
||||
- `df -hT <mountpoint>` before and after
|
||||
- target LV path and filesystem type
|
||||
- dry-run flag and requested size
|
||||
@@ -0,0 +1,30 @@
|
||||
# Linux Operations Automation Architecture
|
||||
|
||||
## Components
|
||||
|
||||
- Operator interface: `make` targets and direct Ansible commands.
|
||||
- Inventory: static host groups in `inventory/hosts.ini`.
|
||||
- Automation: lifecycle playbooks in `playbooks/`.
|
||||
- Simulation scripts: controlled failure and scaling events in `scripts/`.
|
||||
- Evidence: logs, reports, scenario notes, and examples.
|
||||
|
||||
## Data Flow
|
||||
|
||||
```
|
||||
Operator
|
||||
-> Make target or shell script
|
||||
-> Ansible inventory
|
||||
-> lifecycle playbook
|
||||
-> managed Linux node
|
||||
-> log/report artifact
|
||||
```
|
||||
|
||||
Failure drills follow a parallel flow:
|
||||
|
||||
```
|
||||
Operator -> simulate_failure.sh -> target node/service -> health check -> patch/hardening playbook -> evidence
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
The project favors explicit playbooks over hidden orchestration so the operational intent is visible during review. In a production implementation, the same workflows would typically run from a CI runner or automation controller with credentials supplied by a secret manager.
|
||||
@@ -0,0 +1,8 @@
|
||||
2026-04-29 02:13:41 - Starting failure simulation: service 30 web
|
||||
2026-04-29 02:13:41 - Simulating service failures on containers: web
|
||||
2026-04-29 02:13:42 - Stopping services in container enterprise-web-1
|
||||
2026-04-29 02:13:44 - Health probe failed: http://web01/health returned 503
|
||||
2026-04-29 02:14:12 - Cleaning up failure simulation
|
||||
2026-04-29 02:14:13 - Restarted nginx in enterprise-web-1
|
||||
2026-04-29 02:14:18 - Health probe recovered: http://web01/health returned 200
|
||||
2026-04-29 02:14:18 - Failure simulation completed successfully
|
||||
@@ -0,0 +1,19 @@
|
||||
TASK [Report LVM resize evidence] **********************************************
|
||||
ok: [app01] => {
|
||||
"msg": {
|
||||
"host": "app01",
|
||||
"dry_run": true,
|
||||
"target": "/dev/vg_app/lv_data",
|
||||
"mountpoint": "/data",
|
||||
"requested_size": "+20G",
|
||||
"filesystem_type": "xfs",
|
||||
"before_df": [
|
||||
"Filesystem Type Size Used Avail Use% Mounted on",
|
||||
"/dev/mapper/vg_app-lv_data xfs 100G 83G 17G 84% /data"
|
||||
],
|
||||
"after_df": [
|
||||
"Filesystem Type Size Used Avail Use% Mounted on",
|
||||
"/dev/mapper/vg_app-lv_data xfs 100G 83G 17G 84% /data"
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,33 @@
|
||||
PLAY [Apply Security Patches and Updates] **************************************
|
||||
|
||||
TASK [Update package cache] *****************************************************
|
||||
changed: [web01]
|
||||
changed: [db01]
|
||||
ok: [lb01]
|
||||
|
||||
TASK [Check for available updates] **********************************************
|
||||
ok: [web01] => {"stdout": "9"}
|
||||
ok: [db01] => {"stdout": "4"}
|
||||
ok: [lb01] => {"stdout": "0"}
|
||||
|
||||
TASK [Apply security updates only] **********************************************
|
||||
changed: [web01]
|
||||
changed: [db01]
|
||||
ok: [lb01]
|
||||
|
||||
TASK [Verify critical services] *************************************************
|
||||
ok: [web01] => (item=systemd-journald)
|
||||
ok: [web01] => (item=cron)
|
||||
ok: [db01] => (item=systemd-journald)
|
||||
ok: [lb01] => (item=cron)
|
||||
|
||||
PLAY RECAP *********************************************************************
|
||||
web01 : ok=19 changed=6 unreachable=0 failed=0 skipped=2 rescued=0 ignored=1
|
||||
db01 : ok=18 changed=5 unreachable=0 failed=0 skipped=2 rescued=0 ignored=1
|
||||
lb01 : ok=15 changed=1 unreachable=0 failed=0 skipped=4 rescued=0 ignored=0
|
||||
|
||||
Patch report
|
||||
Status: SUCCESS
|
||||
Window: 02:00-04:00 UTC
|
||||
Reboot required: false
|
||||
Notification: infra-team@example.com
|
||||
@@ -0,0 +1,20 @@
|
||||
---
# Variables shared by every host in the inventory.

# Monitoring and health-check toggles
enable_health_checks: true
enable_monitoring: false

# Patching toggles
enforce_patch_window: true
patch_enabled: true

# Firewall toggles and default policy
firewall_default_policy: deny
firewall_enabled: true

# SSH daemon settings
ssh_config: { port: 22, max_auth_tries: 3, alive_interval: 300 }
|
||||
@@ -0,0 +1,9 @@
|
||||
---
# Variables for hosts in the database group.

# Engine and listener
db_type: postgresql
db_port: 5432

# Administrative account name only; no secret is stored in this file
# (use vault for production).
db_admin_user: postgres

# Backups
db_backup_path: /var/backups/database
db_backup_enabled: true
|
||||
@@ -0,0 +1,10 @@
|
||||
---
# Variables for hosts in the load balancer group.

# Frontend bind address and port
frontend_port: 80
frontend_host: "0.0.0.0"

# Balancer type and ports
lb_type: haproxy
lb_port: 443

# Statistics endpoint
lb_stats_enabled: true
lb_stats_port: 8404
|
||||
@@ -0,0 +1,10 @@
|
||||
---
# Variables for hosts in the monitoring group.

# Grafana: the admin password is read from `vault_grafana_password`, which must
# be supplied from outside this file.
grafana_admin_password: "{{ vault_grafana_password }}"
grafana_port: 3000

# Prometheus
monitoring_type: prometheus
monitoring_port: 9090
monitoring_scrape_interval: "15s"
monitoring_retention: "30d"
|
||||
@@ -0,0 +1,8 @@
|
||||
---
# Example variables for secret values.
# Copy these keys into an Ansible Vault encrypted file when real secrets are needed.
# Every value below is a placeholder, not a usable credential.

admin_password: "replace-with-vault-managed-value"
db_root_password: "replace-with-vault-managed-value"
# Named `vault_grafana_password` because group_vars/monitoring.yml sets
# grafana_admin_password: "{{ vault_grafana_password }}". Defining
# `grafana_admin_password` here instead would leave that reference undefined.
vault_grafana_password: "replace-with-vault-managed-value"
ssh_key_passphrase: "replace-with-vault-managed-value"
|
||||
@@ -0,0 +1,11 @@
|
||||
---
# Variables for hosts in the webservers group.

# Application identity: user and group both follow `admin_user`, which is not
# defined in this file.
app_user: "{{ admin_user }}"
app_group: "{{ admin_user }}"
# First group the host belongs to, or 'app' when that lookup yields nothing.
app_name: "{{ group_names[0] | default('app') }}"

# Web server type and listeners
webserver_type: nginx
https_port: 443
http_port: 80
health_check_path: /health
|
||||
@@ -0,0 +1,35 @@
|
||||
# Static inventory. Each group's variables sit directly below its host list.
# NOTE(review): `environment` is also a play/task keyword; Ansible may warn
# about a variable using a reserved name. Confirm before renaming, since roles
# outside this file may read it.

[webservers]
web01 ansible_host=172.20.0.11 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
web02 ansible_host=172.20.0.12 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
web03 ansible_host=172.20.0.13 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa

[webservers:vars]
environment=production
node_type=web

[databases]
db01 ansible_host=172.20.0.21 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
db02 ansible_host=172.20.0.22 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa

[databases:vars]
environment=production
node_type=database

[loadbalancers]
lb01 ansible_host=172.20.0.31 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa

[loadbalancers:vars]
environment=production
node_type=loadbalancer

[monitoring]
mon01 ansible_host=172.20.0.41 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa

[monitoring:vars]
environment=production
node_type=monitoring

# Connection settings applied to every host; the SSH options switch off host
# key checking and do not record host keys.
[all:vars]
ansible_connection=ssh
ansible_python_interpreter=/usr/bin/python3
ansible_ssh_common_args='-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
|
||||
@@ -0,0 +1,24 @@
|
||||
---
# Molecule converge step: applies the three roles to every test platform.

- name: Converge
  hosts: all
  gather_facts: true
  become: true

  pre_tasks:
    # Refresh the package index on Debian-family hosts before the roles run;
    # a cache younger than 3600 seconds is reused.
    - name: Update apt cache
      when: ansible_os_family == "Debian"
      ansible.builtin.apt:
        cache_valid_time: 3600
        update_cache: true

  roles:
    - role: base_provision
    - role: hardening
    - role: patching

  post_tasks:
    # Dumps the full fact dictionary after the roles have run.
    - name: Print Ansible facts
      ansible.builtin.debug:
        var: ansible_facts
|
||||
@@ -0,0 +1,15 @@
|
||||
---
# Molecule destroy step: removes the container of every configured platform.

- name: Destroy
  hosts: localhost
  gather_facts: false

  tasks:
    # One removal per platform name from the Molecule configuration. Errors are
    # ignored, so this step does not fail when a removal fails.
    - name: Destroy molecule containers
      loop: "{{ molecule_yml.platforms | map(attribute='name') | list }}"
      docker_container:
        name: "{{ item }}"
        force_kill: true
        state: absent
      ignore_errors: true
      register: destroy_result
|
||||
@@ -0,0 +1,31 @@
|
||||
---
# Molecule scenario definition for testing the Ansible roles.

driver:
  name: docker

# Single privileged Ubuntu 22.04 container with the host cgroup tree mounted.
platforms:
  - name: ubuntu-22.04
    image: geerlingguy/docker-ubuntu2204-ansible:latest
    pre_build_image: true
    privileged: true
    volumes:
      - "/sys/fs/cgroup:/sys/fs/cgroup:rw"

# Ansible provisioner with smart gathering and a JSON-file fact cache.
provisioner:
  name: ansible
  config_options:
    defaults:
      deprecation_warnings: false
      gathering: smart
      fact_caching: jsonfile
      fact_caching_timeout: 3600
      fact_caching_connection: /tmp/ansible_facts

verifier:
  name: ansible
  directory: molecule/default/tests

# NOTE(review): newer Molecule releases may no longer accept a `lint` section;
# confirm against the Molecule version pinned for CI.
lint: |
  yamllint .
  ansible-lint
|
||||
@@ -0,0 +1,32 @@
|
||||
---
# Molecule verify step: smoke checks run after converge.
# Only the SSH configuration check can fail the run. The package check accepts
# both "found" (rc 0) and "not found" (rc 1), and the firewall and admin-user
# checks never fail.

- name: Verify
  hosts: all
  gather_facts: false

  tasks:
    - name: Check if base OS packages are installed
      ansible.builtin.shell:
        cmd: dpkg -l | grep -E '(curl|wget|vim|htop)'
      failed_when: package_check.rc not in [0, 1]
      register: package_check

    - name: Check SSH configuration
      ansible.builtin.stat:
        path: /etc/ssh/sshd_config
      failed_when: not ssh_config_stat.stat.exists
      register: ssh_config_stat

    - name: Check firewall status
      ansible.builtin.shell:
        cmd: ufw status | grep -q active
      failed_when: false
      register: firewall_check

    - name: Verify admin user exists
      ansible.builtin.getent:
        key: infra-admin
        database: passwd
      failed_when: false

    - name: Print verification results
      ansible.builtin.debug:
        msg: "Role verification completed"
|
||||
@@ -0,0 +1,34 @@
|
||||
---
# Decommission workflow: operator confirmation, then the decommission role,
# then a summary.
# `backup_data`, `export_config` and `auto_shutdown` are not defined in this
# file; presumably they come from the role's defaults or the inventory.
- name: Decommission Enterprise Infrastructure Nodes
  hosts: all
  gather_facts: true
  become: true

  pre_tasks:
    # Waits for the operator before anything destructive runs.
    - name: Confirm decommissioning
      ansible.builtin.pause:
        prompt: |
          WARNING: This will decommission {{ inventory_hostname }}
          Backup Data: {{ backup_data }}
          Export Config: {{ export_config }}

          Press ENTER to continue or Ctrl+C to cancel

    - name: Display decommissioning information
      ansible.builtin.debug:
        msg: |
          Decommissioning {{ inventory_hostname }}
          Auto Shutdown: {{ auto_shutdown }}
          Backup Enabled: {{ backup_data }}

  roles:
    - role: decommission
      tags:
        - decommission
        - cleanup

  post_tasks:
    # The backup path printed here is built from the fact timestamp in this task.
    - name: Display decommissioning summary
      ansible.builtin.debug:
        msg: |
          Decommissioning completed!
          Host: {{ inventory_hostname }}
          Backup Location: /var/backups/decommission-{{ ansible_date_time.iso8601 }}/
|
||||
@@ -0,0 +1,124 @@
|
||||
---
# Hardening workflow for Debian-family hosts.
#
# Order: prerequisite assert -> hardening role -> post tasks (auditd, AppArmor,
# sysctl, account-file permissions, account locking, limits, report) -> handlers.
# `cis_level`, `disable_root_login`, `auditd_enabled` and `apparmor_enabled`
# are not defined in this file; presumably they come from the role's defaults
# or the inventory.
- name: Harden Enterprise Infrastructure Nodes
  hosts: all
  become: true
  gather_facts: true

  pre_tasks:
    - name: Validate hardening prerequisites
      ansible.builtin.assert:
        that:
          - ansible_os_family == "Debian"
          - cis_level in [1, 2]
        fail_msg: "Invalid hardening configuration"

    - name: Display hardening information
      ansible.builtin.debug:
        msg: |
          Hardening {{ inventory_hostname }}
          CIS Level: {{ cis_level }}
          Disable Root Login: {{ disable_root_login }}

  roles:
    - role: hardening
      tags: ['hardening', 'security']

  post_tasks:
    - name: Display hardening summary
      ansible.builtin.debug:
        msg: |
          Hardening completed successfully!
          Host: {{ inventory_hostname }}
      when: ansible_os_family == "Debian"

    - name: Configure auditd
      when: auditd_enabled
      block:
        - name: Install auditd
          ansible.builtin.apt:
            name: auditd
            state: present
          when: ansible_os_family == "Debian"

        # Changed rules are only picked up when auditd reloads them, so notify
        # the restart handler. Without it, a daemon that is already running
        # keeps enforcing the previous rule set.
        - name: Configure audit rules
          ansible.builtin.template:
            src: templates/audit.rules.j2
            dest: /etc/audit/rules.d/hardening.rules
            mode: '0644'
          notify: restart auditd

        - name: Enable auditd service
          ansible.builtin.service:
            name: auditd
            state: started
            enabled: true

    - name: Configure AppArmor
      when: apparmor_enabled and ansible_os_family == "Debian"
      block:
        - name: Install apparmor
          ansible.builtin.apt:
            name: apparmor
            state: present
          when: ansible_os_family == "Debian"

        - name: Enable apparmor service
          ansible.builtin.service:
            name: apparmor
            state: started
            enabled: true

    # Disables IP forwarding and sending of redirects; enables SYN cookies and
    # ignoring of broadcast ICMP echo requests.
    - name: Configure sysctl hardening
      ansible.posix.sysctl:
        name: "{{ item.key }}"
        value: "{{ item.value }}"
        state: present
        reload: true
      loop:
        - { key: 'net.ipv4.ip_forward', value: '0' }
        - { key: 'net.ipv4.conf.all.send_redirects', value: '0' }
        - { key: 'net.ipv4.conf.default.send_redirects', value: '0' }
        - { key: 'net.ipv4.tcp_syncookies', value: '1' }
        - { key: 'net.ipv4.icmp_echo_ignore_broadcasts', value: '1' }

    # passwd and group stay world-readable because name lookups need them.
    # shadow and gshadow hold password hashes, so they must not be
    # world-readable: 0640 root:shadow instead of the previous 0644 root:root,
    # which exposed the hashes to every local user. The play asserts a
    # Debian-family host above, where the `shadow` group owns these files.
    - name: Set secure file permissions
      ansible.builtin.file:
        path: "{{ item.path }}"
        mode: "{{ item.mode }}"
        owner: root
        group: "{{ item.group }}"
      loop:
        - { path: /etc/passwd, mode: '0644', group: root }
        - { path: /etc/group, mode: '0644', group: root }
        - { path: /etc/shadow, mode: '0640', group: shadow }
        - { path: /etc/gshadow, mode: '0640', group: shadow }
      loop_control:
        label: "{{ item.path }}"

    # No-op unless `inactive_users` is provided; always reported as unchanged.
    - name: Lock inactive user accounts
      ansible.builtin.command: usermod -L "{{ item }}"
      loop: "{{ inactive_users | default([]) }}"
      changed_when: false

    # NOTE(review): despite the task name, this sets a hard open-file limit
    # (nofile=1024) for all domains; it does not configure password rules.
    - name: Configure password policies
      community.general.pam_limits:
        domain: '*'
        limit_type: hard
        limit_item: nofile
        value: 1024

    # The timestamp in the file name means every run writes a new report file.
    - name: Generate hardening report
      ansible.builtin.template:
        src: templates/hardening_report.j2
        dest: "/var/log/hardening_report_{{ ansible_date_time.iso8601 }}.log"
        mode: '0644'

  handlers:
    - name: restart sshd
      ansible.builtin.service:
        name: ssh
        state: restarted

    - name: restart auditd
      ansible.builtin.service:
        name: auditd
        state: restarted
      when: auditd_enabled
|
||||
@@ -0,0 +1,149 @@
|
||||
---
# AAP-style LVM resize workflow.
#
# Flow: validate survey variables -> capture "before" evidence -> verify the
# target LV and the filesystem type -> (dry run: print the command | real run:
# lvextend + filesystem grow) -> capture "after" evidence -> report.
#
# Survey variables:
#   lvm_vg_name, lvm_lv_name  target volume group / logical volume (required)
#   lvm_mountpoint            mountpoint of the filesystem on that LV (required)
#   lvm_size_request          value passed to `lvextend -L` (default "+10G")
#   lvm_dry_run               when true (default) nothing is modified
#   lvm_resize_filesystem     when true (default) the filesystem is grown too
#
# NOTE(review): nothing here verifies that lvm_mountpoint is actually backed by
# /dev/<vg>/<lv>; the operator must supply a matching pair.
- name: AAP-style LVM filesystem resize workflow
  hosts: all
  become: true
  gather_facts: true

  vars:
    lvm_dry_run: true
    lvm_vg_name: ""
    lvm_lv_name: ""
    lvm_mountpoint: ""
    lvm_size_request: "+10G"
    lvm_resize_filesystem: true

  pre_tasks:
    - name: Validate required survey variables
      ansible.builtin.assert:
        that:
          - lvm_vg_name | length > 0
          - lvm_lv_name | length > 0
          - lvm_mountpoint | length > 0
          - lvm_size_request | length > 0
        fail_msg: "Required variables: lvm_vg_name, lvm_lv_name, lvm_mountpoint, lvm_size_request"

  tasks:
    # ---- Read-only "before" evidence -------------------------------------
    - name: Capture block device layout before resize
      ansible.builtin.command:
        argv:
          - lsblk
          - --fs
      register: lvm_lsblk_before
      changed_when: false

    - name: Capture physical volumes before resize
      ansible.builtin.command:
        argv:
          - pvs
          - --noheadings
          - --units
          - g
      register: lvm_pvs_before
      changed_when: false

    - name: Capture volume groups before resize
      ansible.builtin.command:
        argv:
          - vgs
          - --noheadings
          - --units
          - g
      register: lvm_vgs_before
      changed_when: false

    - name: Capture logical volumes before resize
      ansible.builtin.command:
        argv:
          - lvs
          - --noheadings
          - --units
          - g
      register: lvm_lvs_before
      changed_when: false

    - name: Capture filesystem usage before resize
      ansible.builtin.command:
        argv:
          - df
          - -hT
          - "{{ lvm_mountpoint }}"
      register: lvm_df_before
      changed_when: false

    # ---- Pre-checks: all of these run before anything is modified --------
    # A non-zero exit from lvs fails the play here.
    - name: Validate target logical volume exists
      ansible.builtin.command:
        argv:
          - lvs
          - "/dev/{{ lvm_vg_name }}/{{ lvm_lv_name }}"
      register: lvm_target_check
      changed_when: false

    # Runs before lvextend on purpose. If findmnt cannot resolve the
    # mountpoint, the play stops while the LV is still untouched; detecting the
    # type only after the extend could leave an extended LV whose filesystem
    # was never grown.
    - name: Detect filesystem type
      ansible.builtin.command:
        argv:
          - findmnt
          - -n
          - -o
          - FSTYPE
          - "{{ lvm_mountpoint }}"
      register: lvm_fstype
      changed_when: false

    # Only the xfs and ext2/3/4 grow tasks exist below. Any other type would
    # previously have extended the LV and silently skipped the filesystem
    # resize. Evaluated in dry-run too, so the problem shows up in the
    # evidence attached to the ticket.
    - name: Validate filesystem type is supported for resize
      ansible.builtin.assert:
        that:
          - lvm_fstype.stdout in ["xfs", "ext2", "ext3", "ext4"]
        fail_msg: "Unsupported filesystem type '{{ lvm_fstype.stdout }}' on {{ lvm_mountpoint }}; supported: xfs, ext2, ext3, ext4"
      when: lvm_resize_filesystem | bool

    # ---- Resize ----------------------------------------------------------
    - name: Show dry-run resize command
      ansible.builtin.debug:
        msg: "DRY RUN: would run lvextend -L {{ lvm_size_request }} /dev/{{ lvm_vg_name }}/{{ lvm_lv_name }}"
      when: lvm_dry_run | bool

    - name: Extend logical volume
      ansible.builtin.command:
        argv:
          - lvextend
          - -L
          - "{{ lvm_size_request }}"
          - "/dev/{{ lvm_vg_name }}/{{ lvm_lv_name }}"
      register: lvm_lvextend_result
      changed_when: true
      when: not (lvm_dry_run | bool)

    # xfs_growfs is given the mountpoint.
    - name: Resize XFS filesystem
      ansible.builtin.command:
        argv:
          - xfs_growfs
          - "{{ lvm_mountpoint }}"
      changed_when: true
      when:
        - not (lvm_dry_run | bool)
        - lvm_resize_filesystem | bool
        - lvm_fstype.stdout == "xfs"

    # resize2fs is given the LV device path.
    - name: Resize ext filesystem
      ansible.builtin.command:
        argv:
          - resize2fs
          - "/dev/{{ lvm_vg_name }}/{{ lvm_lv_name }}"
      changed_when: true
      when:
        - not (lvm_dry_run | bool)
        - lvm_resize_filesystem | bool
        - lvm_fstype.stdout in ["ext2", "ext3", "ext4"]

    # ---- "After" evidence and report --------------------------------------
    - name: Capture filesystem usage after resize
      ansible.builtin.command:
        argv:
          - df
          - -hT
          - "{{ lvm_mountpoint }}"
      register: lvm_df_after
      changed_when: false

    # The lsblk/pvs/vgs/lvs captures were registered above but never surfaced.
    # They are included here so one task result holds the complete evidence.
    - name: Report LVM resize evidence
      ansible.builtin.debug:
        msg:
          host: "{{ inventory_hostname }}"
          dry_run: "{{ lvm_dry_run }}"
          target: "/dev/{{ lvm_vg_name }}/{{ lvm_lv_name }}"
          mountpoint: "{{ lvm_mountpoint }}"
          requested_size: "{{ lvm_size_request }}"
          filesystem_type: "{{ lvm_fstype.stdout | default('unknown') }}"
          before_df: "{{ lvm_df_before.stdout_lines }}"
          after_df: "{{ lvm_df_after.stdout_lines }}"
          before_lsblk: "{{ lvm_lsblk_before.stdout_lines }}"
          before_pvs: "{{ lvm_pvs_before.stdout_lines }}"
          before_vgs: "{{ lvm_vgs_before.stdout_lines }}"
          before_lvs: "{{ lvm_lvs_before.stdout_lines }}"
|
||||
@@ -0,0 +1,31 @@
|
||||
---
|
||||
- name: Apply Security Patches and Updates
|
||||
hosts: all
|
||||
become: true
|
||||
gather_facts: true
|
||||
|
||||
pre_tasks:
|
||||
- name: Validate patch prerequisites
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- ansible_os_family == "Debian"
|
||||
fail_msg: "Patching supported only on Debian-based systems"
|
||||
|
||||
- name: Display patch information
|
||||
ansible.builtin.debug:
|
||||
msg: |
|
||||
Patching {{ inventory_hostname }}
|
||||
Patch Window: {{ patch_window_start }} - {{ patch_window_end }}
|
||||
Security Only: {{ patch_security_only }}
|
||||
|
||||
roles:
|
||||
- role: patching
|
||||
tags: ['patch', 'updates']
|
||||
|
||||
post_tasks:
|
||||
- name: Display patching summary
|
||||
ansible.builtin.debug:
|
||||
msg: |
|
||||
Patching completed!
|
||||
Host: {{ inventory_hostname }}
|
||||
Reboot Required: {{ reboot_required | default(false) }}
|
||||
@@ -0,0 +1,33 @@
|
||||
---
|
||||
# Provisioning play: applies the base_provision role to every inventory host
# and prints a short summary before and after the role runs.
- name: Provision Enterprise Infrastructure Nodes
  hosts: all
  become: true
  gather_facts: true

  pre_tasks:
    - name: Validate Ansible version
      ansible.builtin.assert:
        that:
          # Compare the full version string; testing major and minor
          # separately would reject any release whose minor is below 9
          # (for example 3.0).
          - ansible_version.full is version('2.9', '>=')
        fail_msg: "Ansible 2.9+ is required"

    - name: Display provisioning information
      ansible.builtin.debug:
        msg: |
          Provisioning {{ inventory_hostname }}
          OS: {{ ansible_os_family }}
          Python: {{ ansible_python_version }}

  roles:
    - role: base_provision
      tags: ['provision', 'base']

  post_tasks:
    - name: Generate provisioning summary
      ansible.builtin.debug:
        msg: |
          Provisioning completed successfully!
          Host: {{ inventory_hostname }}
          IP: {{ ansible_default_ipv4.address | default('unknown') }}
          OS: {{ ansible_os_family }} {{ ansible_distribution_version }}
|
||||
@@ -0,0 +1,48 @@
|
||||
# Base Provision Role
|
||||
|
||||
Provision basic infrastructure on enterprise nodes with security hardening.
|
||||
|
||||
## Features
|
||||
|
||||
- **Idempotent**: All tasks use proper idempotency markers (`changed_when`, `failed_when`)
|
||||
- **Handlers**: SSH and fail2ban restarts use handlers instead of direct service calls
|
||||
- **Variables**: All configuration in `defaults/main.yml` - no hardcoding
|
||||
- **Validation**: Pre-flight checks for system requirements
|
||||
- **Firewall**: UFW firewall configuration with configurable rules
|
||||
- **SSH Security**: Root login disabled, password auth disabled, key-based auth only
|
||||
|
||||
## Role Variables
|
||||
|
||||
See `defaults/main.yml` for all available variables.
|
||||
|
||||
### Key Variables
|
||||
|
||||
- `node_timezone`: System timezone (default: UTC)
|
||||
- `admin_user`: Admin username for infrastructure access
|
||||
- `ssh_port`: SSH service port (default: 22)
|
||||
- `base_packages`: List of base packages to install
|
||||
- `firewall_enabled`: Enable UFW firewall (default: true)
|
||||
- `firewall_allowed_tcp_ports`: Allowed TCP ports for firewall
|
||||
|
||||
## Secret Variables
|
||||
|
||||
This portfolio demo does not require secrets for offline validation. If you extend it with real passwords or keys, copy `group_vars/vault.example.yml` into an encrypted Ansible Vault file and keep real values out of normal git history.
|
||||
|
||||
## Usage
|
||||
|
||||
```yaml
|
||||
- role: base_provision
|
||||
vars:
|
||||
node_timezone: "Europe/Warsaw"
|
||||
firewall_enabled: true
|
||||
```
|
||||
|
||||
## Handlers
|
||||
|
||||
- `restart sshd`: Restarts SSH service (triggered by config changes)
|
||||
- `restart fail2ban`: Restarts fail2ban service (triggered by config changes)
|
||||
|
||||
## Tags
|
||||
|
||||
- `provision`: All provisioning tasks
|
||||
- `base`: Base provision role tasks
|
||||
@@ -0,0 +1,44 @@
|
||||
---
|
||||
# Base provisioning configuration
|
||||
node_timezone: "UTC"
|
||||
admin_user: "infra-admin"
|
||||
ssh_port: 22
|
||||
ssh_disabled_root_login: true
|
||||
ssh_disable_password_auth: true
|
||||
|
||||
# Packages to install
|
||||
base_packages:
|
||||
- curl
|
||||
- wget
|
||||
- vim
|
||||
- htop
|
||||
- net-tools
|
||||
- iptables
|
||||
- fail2ban
|
||||
- unattended-upgrades
|
||||
|
||||
# Firewall rules
|
||||
firewall_enabled: true
|
||||
firewall_default_policy: deny
|
||||
firewall_allowed_tcp_ports:
|
||||
- 22
|
||||
- 80
|
||||
- 443
|
||||
|
||||
# Application directories
|
||||
app_directories:
|
||||
- path: /opt/application
|
||||
owner: "{{ admin_user }}"
|
||||
group: "{{ admin_user }}"
|
||||
mode: '0755'
|
||||
- path: /var/log/application
|
||||
owner: "{{ admin_user }}"
|
||||
group: "{{ admin_user }}"
|
||||
mode: '0755'
|
||||
- path: /etc/application
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
|
||||
# Service verification
|
||||
services_to_verify: []
|
||||
@@ -0,0 +1,11 @@
|
||||
---
|
||||
- name: restart sshd
|
||||
ansible.builtin.service:
|
||||
name: sshd
|
||||
state: restarted
|
||||
|
||||
- name: restart fail2ban
|
||||
ansible.builtin.service:
|
||||
name: fail2ban
|
||||
state: restarted
|
||||
enabled: true
|
||||
@@ -0,0 +1,138 @@
|
||||
---
|
||||
- name: Validate system requirements
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- ansible_os_family == "Debian"
|
||||
- ansible_python_version is version('3.6', '>=')
|
||||
fail_msg: "Unsupported system - requires Debian and Python 3.6+"
|
||||
|
||||
- name: Update package cache
|
||||
ansible.builtin.apt:
|
||||
update_cache: true
|
||||
cache_valid_time: 3600
|
||||
changed_when: false
|
||||
|
||||
- name: Install base packages
|
||||
ansible.builtin.apt:
|
||||
name: "{{ base_packages }}"
|
||||
state: present
|
||||
update_cache: true
|
||||
|
||||
# Look up the admin account. With fail_key disabled the module does not fail
# on a missing key; it records a null entry for it in getent_passwd instead.
- name: Check if admin user exists
  ansible.builtin.getent:
    database: passwd
    key: "{{ admin_user }}"
    fail_key: false
  register: admin_check
  changed_when: false

# Gate on the lookup result itself. A condition on admin_check.failed can
# never be true once failures are suppressed, which would skip creation.
- name: Create admin user
  ansible.builtin.user:
    name: "{{ admin_user }}"
    groups: sudo
    append: true
    create_home: true
    shell: /bin/bash
  when: ansible_facts.getent_passwd[admin_user] is none
|
||||
|
||||
- name: Configure timezone
|
||||
community.general.timezone:
|
||||
name: "{{ node_timezone }}"
|
||||
|
||||
# SSH daemon settings for the base role. Each edit is validated with sshd's
# own config test against the candidate file, so a broken sshd_config is never
# written and the "restart sshd" handler cannot take SSH down.
- name: Configure SSH security
  block:
    - name: Disable root SSH login
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^PermitRootLogin'
        line: 'PermitRootLogin no'
        state: present
        validate: '/usr/sbin/sshd -t -f %s'
      when: ssh_disabled_root_login | bool
      notify: restart sshd

    - name: Set SSH port
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^Port'
        line: "Port {{ ssh_port }}"
        state: present
        validate: '/usr/sbin/sshd -t -f %s'
      notify: restart sshd

    - name: Disable password authentication
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^PasswordAuthentication'
        line: 'PasswordAuthentication no'
        state: present
        validate: '/usr/sbin/sshd -t -f %s'
      when: ssh_disable_password_auth | bool
      notify: restart sshd
|
||||
|
||||
# UFW setup. The allow rules are created first and the firewall is enabled
# last, so the default policy never becomes active without an SSH rule.
# The whole block is skipped when firewall_enabled is false.
- name: Configure firewall
  when: firewall_enabled | bool
  block:
    - name: Allow SSH access
      community.general.ufw:
        rule: allow
        port: "{{ ssh_port }}"
        proto: tcp

    - name: Allow HTTP/HTTPS
      community.general.ufw:
        rule: allow
        port: "{{ item }}"
        proto: tcp
      loop: "{{ firewall_allowed_tcp_ports }}"
      # The SSH port already has its own rule above; compare as strings so an
      # ssh_port supplied as "22" still matches the integer 22 in the list.
      when: item | string != ssh_port | string

    - name: Enable UFW firewall
      community.general.ufw:
        state: enabled
        policy: "{{ firewall_default_policy }}"
|
||||
|
||||
- name: Configure fail2ban
|
||||
ansible.builtin.template:
|
||||
src: jail.local.j2
|
||||
dest: /etc/fail2ban/jail.local
|
||||
backup: true
|
||||
mode: '0644'
|
||||
notify: restart fail2ban
|
||||
|
||||
# Turn on the periodic unattended-upgrade run. The file is created when it is
# missing; without create the task fails on hosts that lack it.
- name: Enable unattended upgrades
  ansible.builtin.lineinfile:
    path: /etc/apt/apt.conf.d/20auto-upgrades
    regexp: '^APT::Periodic::Unattended-Upgrade'
    line: 'APT::Periodic::Unattended-Upgrade "1";'
    state: present
    create: true
    owner: root
    group: root
    mode: '0644'
|
||||
|
||||
- name: Create application directories
|
||||
ansible.builtin.file:
|
||||
path: "{{ item.path }}"
|
||||
state: directory
|
||||
owner: "{{ item.owner }}"
|
||||
group: "{{ item.group }}"
|
||||
mode: "{{ item.mode }}"
|
||||
loop: "{{ app_directories }}"
|
||||
|
||||
- name: Record role-specific service intent
|
||||
ansible.builtin.debug:
|
||||
msg: "Would configure {{ node_type | default('generic') }} service components in a full lab deployment"
|
||||
|
||||
- name: Verify services are running
|
||||
ansible.builtin.service:
|
||||
name: "{{ item }}"
|
||||
state: started
|
||||
enabled: true
|
||||
loop: "{{ services_to_verify }}"
|
||||
when: services_to_verify | length > 0
|
||||
failed_when: false
|
||||
|
||||
- name: Run health checks
|
||||
ansible.builtin.uri:
|
||||
url: http://localhost/health
|
||||
method: GET
|
||||
status_code: 200
|
||||
register: health_check
|
||||
failed_when: false
|
||||
ignore_errors: true
|
||||
when: "'webservers' in group_names"
|
||||
+14
@@ -0,0 +1,14 @@
|
||||
# fail2ban configuration
|
||||
[DEFAULT]
|
||||
bantime = 3600
|
||||
findtime = 600
|
||||
maxretry = 5
|
||||
|
||||
[sshd]
|
||||
enabled = true
|
||||
port = {{ ssh_port }}
|
||||
logpath = /var/log/auth.log
|
||||
maxretry = 3
|
||||
|
||||
[recidive]
|
||||
enabled = true
|
||||
@@ -0,0 +1,62 @@
|
||||
# Decommission Role
|
||||
|
||||
Gracefully decommission enterprise infrastructure nodes with comprehensive backup and cleanup.
|
||||
|
||||
## Features
|
||||
|
||||
- **Confirmation Prompt**: Interactive confirmation before decommissioning
|
||||
- **Graceful Shutdown**: Stop services gracefully with connection drain time
|
||||
- **Comprehensive Backup**: Archive configurations and data before cleanup
|
||||
- **Selective Cleanup**: Only remove items that were deployed
|
||||
- **Logging**: Detailed decommissioning logs for audit trail
|
||||
- **Notifications**: Optional email notifications on completion
|
||||
|
||||
## Role Variables
|
||||
|
||||
See `defaults/main.yml` for all available variables.
|
||||
|
||||
### Key Variables
|
||||
|
||||
- `backup_data`: Backup application data (default: true)
|
||||
- `export_config`: Export system configuration (default: true)
|
||||
- `graceful_shutdown`: Graceful service shutdown (default: true)
|
||||
- `auto_shutdown`: Auto shutdown after decommissioning (default: false)
|
||||
- `application_services`: Services to stop
|
||||
- `application_packages`: Packages to remove
|
||||
- `decommission_notification_email`: Email for notifications (optional)
|
||||
|
||||
## Usage
|
||||
|
||||
```yaml
|
||||
- role: decommission
|
||||
vars:
|
||||
backup_data: true
|
||||
export_config: true
|
||||
auto_shutdown: false
|
||||
decommission_notification_email: "ops@company.com"
|
||||
```
|
||||
|
||||
## Backup Locations
|
||||
|
||||
- Configuration: `/var/backups/decommission-<timestamp>/config/`
|
||||
- Data: `/var/backups/decommission-<timestamp>/data/`
|
||||
- Report: `/var/log/decommission_report_<timestamp>.log`
|
||||
|
||||
## Supported Groups
|
||||
|
||||
- `webservers`: Backs up /var/www/html
|
||||
- `databases`: Backs up PostgreSQL data
|
||||
- `monitoring`: Backs up Prometheus data
|
||||
- `loadbalancers`: Loadbalancer cleanup
|
||||
|
||||
## Safety Features
|
||||
|
||||
- Interactive confirmation before execution
|
||||
- Connection drain time before shutdown (30 seconds)
|
||||
- Errors are logged but don't stop the process
|
||||
- Comprehensive audit log
|
||||
|
||||
## Tags
|
||||
|
||||
- `decommission`: All decommissioning tasks
|
||||
- `cleanup`: Cleanup-related tasks
|
||||
@@ -0,0 +1,34 @@
|
||||
---
|
||||
# Decommissioning configuration
|
||||
backup_data: true
|
||||
export_config: true
|
||||
graceful_shutdown: true
|
||||
cleanup_inventory: true
|
||||
auto_shutdown: false
|
||||
shutdown_delay: 10
|
||||
|
||||
# Services to stop gracefully
|
||||
application_services:
|
||||
- nginx
|
||||
- postgresql
|
||||
- haproxy
|
||||
|
||||
# Packages to remove
|
||||
application_packages:
|
||||
- nginx
|
||||
- postgresql
|
||||
- haproxy
|
||||
- prometheus
|
||||
|
||||
# Directories to archive
|
||||
config_paths:
|
||||
- /etc/
|
||||
- /opt/application/
|
||||
|
||||
data_paths:
|
||||
- /var/www/html
|
||||
- /var/lib/postgresql
|
||||
- /var/lib/prometheus
|
||||
|
||||
# Notification settings
|
||||
decommission_notification_email: null
|
||||
@@ -0,0 +1,177 @@
|
||||
---
|
||||
# Sanity-check the role inputs that later tasks loop over or pass to modules.
- name: Validate decommissioning requirements
  ansible.builtin.assert:
    that:
      # Loop inputs must be real lists, not a bare string or a mapping.
      - application_services is iterable
      - application_services is not string
      - application_services is not mapping
      - application_packages is iterable
      - application_packages is not string
      - application_packages is not mapping
      - config_paths is iterable
      - config_paths is not string
      - data_paths is iterable
      - data_paths is not string
      # shutdown_delay is used as a number of seconds.
      - shutdown_delay | int >= 0
    fail_msg: "Invalid decommissioning configuration"
|
||||
|
||||
- name: Pre-decommissioning checks
|
||||
block:
|
||||
- name: Check node health
|
||||
ansible.builtin.uri:
|
||||
url: http://localhost/health
|
||||
method: GET
|
||||
status_code: 200
|
||||
register: health_check
|
||||
failed_when: false
|
||||
ignore_errors: true
|
||||
when: "'webservers' in group_names"
|
||||
|
||||
- name: Create decommissioning backup directory
|
||||
ansible.builtin.file:
|
||||
path: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}"
|
||||
state: directory
|
||||
mode: '0755'
|
||||
|
||||
- name: Initialize decommissioning log
|
||||
ansible.builtin.file:
|
||||
path: "/var/log/decommission.log"
|
||||
state: touch
|
||||
mode: '0644'
|
||||
modification_time: now
|
||||
access_time: now
|
||||
|
||||
- name: Log decommissioning start
|
||||
ansible.builtin.lineinfile:
|
||||
path: "/var/log/decommission.log"
|
||||
line: "{{ ansible_date_time.iso8601 }} - Starting decommissioning of {{ inventory_hostname }}"
|
||||
state: present
|
||||
|
||||
- name: Graceful application shutdown
|
||||
block:
|
||||
- name: Stop application services
|
||||
ansible.builtin.service:
|
||||
name: "{{ item }}"
|
||||
state: stopped
|
||||
loop: "{{ application_services }}"
|
||||
failed_when: false
|
||||
when: graceful_shutdown
|
||||
|
||||
- name: Wait for connections to drain
|
||||
ansible.builtin.pause:
|
||||
seconds: 30
|
||||
when: graceful_shutdown and ("webservers" in group_names or "loadbalancers" in group_names)
|
||||
|
||||
# Backups taken before the cleanup tasks later in this file remove the same
# directories. Archive errors are therefore allowed to fail the play instead
# of being ignored, and data paths that do not exist on this host are skipped
# explicitly rather than by suppressing every failure.
# Directories and archives are root-only because config_paths includes /etc.
- name: Export and backup data
  block:
    - name: Create config export directory
      ansible.builtin.file:
        path: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/config"
        state: directory
        mode: '0700'

    - name: Archive system configuration
      community.general.archive:
        path: "{{ config_paths }}"
        dest: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/config/system_config.tar.gz"
        format: gz
        mode: '0600'
      when: export_config | bool

    - name: Create data backup directory
      ansible.builtin.file:
        path: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/data"
        state: directory
        mode: '0700'
      when: backup_data | bool

    - name: Check which data paths exist
      ansible.builtin.stat:
        path: "{{ item }}"
      loop: "{{ data_paths }}"
      register: decommission_data_path_stats
      when: backup_data | bool

    - name: Backup individual data paths
      community.general.archive:
        path: "{{ item.item }}"
        dest: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/data/{{ item.item | regex_replace('/', '_') }}.tar.gz"
        format: gz
        mode: '0600'
      loop: "{{ decommission_data_path_stats.results }}"
      loop_control:
        label: "{{ item.item }}"
      when:
        - backup_data | bool
        # Skipped stat results carry no "stat" key, hence the default.
        - item.stat.exists | default(false)
|
||||
|
||||
- name: Update monitoring and load balancing
|
||||
block:
|
||||
- name: Remove from load balancer
|
||||
ansible.builtin.debug:
|
||||
msg: "Would remove {{ inventory_hostname }} from load balancer"
|
||||
when: "'webservers' in group_names or 'databases' in group_names"
|
||||
|
||||
- name: Update monitoring alerts
|
||||
ansible.builtin.debug:
|
||||
msg: "Would update monitoring alerts for {{ inventory_hostname }}"
|
||||
when: "'monitoring' not in group_names"
|
||||
|
||||
- name: Clean up application
|
||||
block:
|
||||
- name: Remove application directories
|
||||
ansible.builtin.file:
|
||||
path: "{{ item }}"
|
||||
state: absent
|
||||
loop:
|
||||
- /opt/application
|
||||
- /var/www/html
|
||||
- /var/lib/postgresql
|
||||
- /var/lib/prometheus
|
||||
failed_when: false
|
||||
|
||||
- name: Remove application packages
|
||||
ansible.builtin.apt:
|
||||
name: "{{ item }}"
|
||||
state: absent
|
||||
purge: true
|
||||
loop: "{{ application_packages }}"
|
||||
failed_when: false
|
||||
|
||||
# Truncate non-empty *.log files under /var/log (best effort).
# Runs find directly instead of through a shell: there is no pipeline here,
# and "set -o pipefail" is a bash option while the shell module defaults to
# /bin/sh. The decommission log and report files written by this role are
# excluded so the audit trail is kept.
- name: Clean system logs
  ansible.builtin.command:
    argv:
      - find
      - /var/log
      - -name
      - "*.log"
      - "!"
      - -name
      - "decommission*.log"
      - -type
      - f
      - -size
      - "+0"
      - -exec
      - truncate
      - -s
      - "0"
      - "{}"
      - ";"
  changed_when: false
  failed_when: false
|
||||
|
||||
- name: Remove SSH credentials
|
||||
ansible.builtin.file:
|
||||
path: "{{ item }}"
|
||||
state: absent
|
||||
loop:
|
||||
- /root/.ssh/authorized_keys
|
||||
- /root/.ssh/known_hosts
|
||||
- /home/infra-admin/.ssh/authorized_keys
|
||||
failed_when: false
|
||||
|
||||
- name: Generate decommissioning report
|
||||
ansible.builtin.template:
|
||||
src: decommission_report.j2
|
||||
dest: "/var/log/decommission_report_{{ ansible_date_time.iso8601 }}.log"
|
||||
mode: '0644'
|
||||
vars:
|
||||
backup_location: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}"
|
||||
|
||||
# Optional e-mail notification (best effort; a mail failure does not fail the
# play). The role default for the address is null, which still counts as
# "defined", so the condition checks for a non-empty value instead.
- name: Send decommissioning notification
  community.general.mail:
    host: localhost
    port: 25
    to: "{{ decommission_notification_email }}"
    subject: "Node Decommissioned - {{ inventory_hostname }}"
    body: |
      Node {{ inventory_hostname }} has been successfully decommissioned.

      Backup location: /var/backups/decommission-{{ ansible_date_time.iso8601 }}/
      Services stopped: {{ application_services | join(', ') }}
      Configuration exported: {{ export_config }}
      Data backed up: {{ backup_data }}

      See /var/log/decommission_report_{{ ansible_date_time.iso8601 }}.log for details
  when: decommission_notification_email | default('', true) | string | trim | length > 0
  failed_when: false
|
||||
|
||||
# Final audit-log entry and, when auto_shutdown is set, power-off of the node.
- name: Finalize decommissioning
  block:
    - name: Log decommissioning completion
      ansible.builtin.lineinfile:
        path: "/var/log/decommission.log"
        line: "{{ ansible_date_time.iso8601 }} - Decommissioning completed for {{ inventory_hostname }}"
        state: present

    # Uses the shutdown module: the reboot module would bring the node back
    # up and has no "delay" option. shutdown_delay is in seconds.
    - name: Perform system shutdown
      community.general.shutdown:
        msg: "System scheduled for shutdown after decommissioning"
        delay: "{{ shutdown_delay | int }}"
      when: auto_shutdown | bool
|
||||
+13
@@ -0,0 +1,13 @@
|
||||
Decommissioning Report
|
||||
======================
|
||||
Generated: {{ ansible_date_time.iso8601 }}
|
||||
Host: {{ inventory_hostname }}
|
||||
|
||||
Status: COMPLETED
|
||||
Backup Location: {{ backup_location }}
|
||||
|
||||
Configuration Exported: {{ export_config }}
|
||||
Data Backed Up: {{ backup_data }}
|
||||
Services Stopped: {{ application_services | join(', ') }}
|
||||
|
||||
Log Location: /var/log/decommission.log
|
||||
@@ -0,0 +1,58 @@
|
||||
# Hardening Role
|
||||
|
||||
Apply security hardening to enterprise infrastructure nodes following CIS benchmarks.
|
||||
|
||||
## Features
|
||||
|
||||
- **CIS Compliance**: Support for CIS hardening levels 1 and 2
|
||||
- **SSH Hardening**: Disable root login, password auth, set auth limits
|
||||
- **Firewall Configuration**: UFW with configurable rules
|
||||
- **Service Cleanup**: Disable unnecessary services and remove insecure packages
|
||||
- **Handlers**: SSH restarts via handlers
|
||||
|
||||
## Role Variables
|
||||
|
||||
See `defaults/main.yml` for all available variables.
|
||||
|
||||
### Key Variables
|
||||
|
||||
- `cis_level`: CIS hardening level (1 or 2)
|
||||
- `disable_root_login`: Disable root SSH login (default: true)
|
||||
- `secure_ssh_config`: Apply SSH security hardening (default: true)
|
||||
- `firewall_policy`: Firewall default policy (default: deny)
|
||||
- `ssh_max_auth_tries`: Maximum SSH authentication attempts (default: 3)
|
||||
- `ssh_client_alive_interval`: SSH client alive interval in seconds (default: 300)
|
||||
- `ssh_allowed_networks`: Networks from which SSH access is allowed
|
||||
|
||||
### SSH Allowed Networks
|
||||
|
||||
Default trusted networks:
|
||||
- 10.0.0.0/8 (Private Class A)
|
||||
- 172.16.0.0/12 (Private Class B)
|
||||
- 192.168.0.0/16 (Private Class C)
|
||||
|
||||
## Usage
|
||||
|
||||
```yaml
|
||||
- role: hardening
|
||||
vars:
|
||||
cis_level: 1
|
||||
disable_root_login: true
|
||||
ssh_allowed_networks:
|
||||
- 10.0.0.0/8
|
||||
- 203.0.113.0/24
|
||||
```
|
||||
|
||||
## SSH Configuration Changes
|
||||
|
||||
- Root login disabled
|
||||
- Password authentication disabled
|
||||
- Maximum auth tries: 3
|
||||
- Empty passwords prohibited
|
||||
- Client alive interval: 300 seconds
|
||||
- Client alive count max: 2
|
||||
|
||||
## Tags
|
||||
|
||||
- `hardening`: All hardening tasks
|
||||
- `security`: Security-related tasks
|
||||
@@ -0,0 +1,35 @@
|
||||
---
|
||||
# Hardening configuration
|
||||
cis_level: 1
|
||||
disable_root_login: true
|
||||
secure_ssh_config: true
|
||||
firewall_policy: deny
|
||||
auditd_enabled: true
|
||||
selinux_mode: enforcing
|
||||
apparmor_enabled: true
|
||||
|
||||
# SSH Hardening
|
||||
ssh_max_auth_tries: 3
|
||||
ssh_client_alive_interval: 300
|
||||
ssh_client_alive_count_max: 2
|
||||
|
||||
# Firewall rules for SSH (trusted networks)
|
||||
ssh_allowed_networks:
|
||||
- 10.0.0.0/8
|
||||
- 172.16.0.0/12
|
||||
- 192.168.0.0/16
|
||||
|
||||
# Services to disable
|
||||
unnecessary_services:
|
||||
- cups
|
||||
- avahi-daemon
|
||||
- bluetooth
|
||||
- nfs-server
|
||||
- rpcbind
|
||||
|
||||
# Packages to remove
|
||||
unnecessary_packages:
|
||||
- telnet
|
||||
- rsh-client
|
||||
- talk
|
||||
- ntalk
|
||||
@@ -0,0 +1,5 @@
|
||||
---
|
||||
- name: restart sshd
|
||||
ansible.builtin.service:
|
||||
name: sshd
|
||||
state: restarted
|
||||
@@ -0,0 +1,7 @@
|
||||
---
|
||||
# CIS Hardening Level 1 tasks (stub for future expansion)
|
||||
# https://www.cisecurity.org/cis-benchmarks/
|
||||
|
||||
- name: Check CIS status
|
||||
ansible.builtin.debug:
|
||||
msg: "CIS Hardening Level {{ cis_level }} would be applied here"
|
||||
@@ -0,0 +1,95 @@
|
||||
---
|
||||
# Pre-flight checks for the hardening role, followed by the CIS task include.
# cis_level is cast to int in both places so a value passed as a string
# (for example via -e cis_level=2) is handled the same as the integer default.
- name: Validate hardening requirements
  ansible.builtin.assert:
    that:
      - ansible_os_family == "Debian"
      - cis_level | int in [1, 2]
    fail_msg: "Unsupported configuration for hardening"

- name: Apply CIS hardening tasks
  ansible.builtin.include_tasks: cis_hardening.yml
  when: cis_level | int >= 1
|
||||
|
||||
# sshd_config hardening. Every edit is validated with sshd's config test
# against the candidate file before it replaces sshd_config, so the
# "restart sshd" handler never restarts the daemon onto a broken config.
- name: Configure SSH hardening
  block:
    - name: Disable root SSH login
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^PermitRootLogin'
        line: 'PermitRootLogin no'
        state: present
        validate: '/usr/sbin/sshd -t -f %s'
      when: disable_root_login | bool
      notify: restart sshd

    - name: Disable password authentication
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^PasswordAuthentication'
        line: 'PasswordAuthentication no'
        state: present
        validate: '/usr/sbin/sshd -t -f %s'
      when: secure_ssh_config | bool
      notify: restart sshd

    - name: Set MaxAuthTries
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^MaxAuthTries'
        line: "MaxAuthTries {{ ssh_max_auth_tries }}"
        state: present
        validate: '/usr/sbin/sshd -t -f %s'
      notify: restart sshd

    - name: Disable empty passwords
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^PermitEmptyPasswords'
        line: 'PermitEmptyPasswords no'
        state: present
        validate: '/usr/sbin/sshd -t -f %s'
      notify: restart sshd

    - name: Set ClientAliveInterval
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^ClientAliveInterval'
        line: "ClientAliveInterval {{ ssh_client_alive_interval }}"
        state: present
        validate: '/usr/sbin/sshd -t -f %s'
      notify: restart sshd

    - name: Set ClientAliveCountMax
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^ClientAliveCountMax'
        line: "ClientAliveCountMax {{ ssh_client_alive_count_max }}"
        state: present
        validate: '/usr/sbin/sshd -t -f %s'
      notify: restart sshd
|
||||
|
||||
# UFW rules for the hardening role. The SSH allow rules are added before the
# firewall is enabled so the default policy is never active without them.
- name: Configure firewall rules
  block:
    - name: Allow SSH from trusted networks
      community.general.ufw:
        rule: allow
        # Follows ssh_port when it is set; falls back to 22 otherwise.
        port: "{{ ssh_port | default(22) | string }}"
        proto: tcp
        from: "{{ item }}"
      loop: "{{ ssh_allowed_networks }}"

    - name: Enable firewall
      community.general.ufw:
        state: enabled
        policy: "{{ firewall_policy }}"
      when: firewall_policy is defined
|
||||
|
||||
- name: Disable unnecessary services
|
||||
ansible.builtin.service:
|
||||
name: "{{ item }}"
|
||||
state: stopped
|
||||
enabled: false
|
||||
loop: "{{ unnecessary_services }}"
|
||||
failed_when: false
|
||||
|
||||
- name: Remove unnecessary packages
|
||||
ansible.builtin.apt:
|
||||
name: "{{ item }}"
|
||||
state: absent
|
||||
purge: true
|
||||
loop: "{{ unnecessary_packages }}"
|
||||
failed_when: false
|
||||
@@ -0,0 +1,45 @@
|
||||
# Patching Role
|
||||
|
||||
Apply security patches and OS updates to enterprise infrastructure nodes.
|
||||
|
||||
## Features
|
||||
|
||||
- **Idempotent**: Properly checks for changes with `changed_when`
|
||||
- **Patch Window**: Optional enforcement of patch time windows
|
||||
- **Pre-patch Backup**: Backs up package list before patching
|
||||
- **Smart Reboot**: Automatically detects if reboot is required
|
||||
- **Service Restart**: Restarts only necessary services after patching
|
||||
- **Health Checks**: Verifies services and runs health endpoint checks
|
||||
|
||||
## Role Variables
|
||||
|
||||
See `defaults/main.yml` for all available variables.
|
||||
|
||||
### Key Variables
|
||||
|
||||
- `patch_window_start`: Patch window start time (default: 02:00)
|
||||
- `patch_window_end`: Patch window end time (default: 04:00)
|
||||
- `enforce_patch_window`: Enforce patch time window (default: true)
|
||||
- `patch_security_only`: Apply security updates only (default: true)
|
||||
- `backup_before_patch`: Create backup before patching (default: true)
|
||||
- `reboot_if_required`: Auto-reboot if required (default: false)
|
||||
- `services_to_restart`: Services to restart after patching
|
||||
- `critical_services`: Critical services to verify after patching
|
||||
|
||||
## Usage
|
||||
|
||||
```yaml
|
||||
- role: patching
|
||||
vars:
|
||||
patch_security_only: true
|
||||
enforce_patch_window: false
|
||||
reboot_if_required: true
|
||||
```
|
||||
|
||||
## Report
|
||||
|
||||
Patch report is generated at: `/var/log/patch_report_<timestamp>.log`
|
||||
|
||||
## Backup Location
|
||||
|
||||
Pre-patch backups saved to: `/var/backups/pre-patch-<timestamp>/`
|
||||
@@ -0,0 +1,20 @@
|
||||
---
|
||||
# Patching configuration
|
||||
patch_window_start: "02:00"
|
||||
patch_window_end: "04:00"
|
||||
enforce_patch_window: true
|
||||
patch_security_only: true
|
||||
backup_before_patch: true
|
||||
reboot_if_required: false
|
||||
reboot_timeout: 300
|
||||
|
||||
# Services to restart after patching
|
||||
services_to_restart:
|
||||
- sshd
|
||||
- fail2ban
|
||||
|
||||
# Services to verify after patching
|
||||
critical_services:
|
||||
- systemd-journald
|
||||
- systemd-logind
|
||||
- cron
|
||||
@@ -0,0 +1,6 @@
|
||||
---
|
||||
- name: restart patching services
|
||||
ansible.builtin.service:
|
||||
name: "{{ item }}"
|
||||
state: restarted
|
||||
loop: "{{ services_to_restart }}"
|
||||
@@ -0,0 +1,105 @@
|
||||
---
|
||||
# Enforce the configured patch window. Times are compared as minutes since
# midnight so the minute part of HH:MM is honoured, and a window whose start
# is later than its end (for example 22:00-02:00) is treated as crossing
# midnight. ansible_date_time reflects the moment facts were gathered.
- name: Validate patch window
  when: enforce_patch_window | bool
  vars:
    patch_now_minutes: "{{ (ansible_date_time.hour | int) * 60 + (ansible_date_time.minute | int) }}"
    patch_start_minutes: "{{ (patch_window_start.split(':')[0] | int) * 60 + (patch_window_start.split(':')[1] | default(0) | int) }}"
    patch_end_minutes: "{{ (patch_window_end.split(':')[0] | int) * 60 + (patch_window_end.split(':')[1] | default(0) | int) }}"
  block:
    - name: Check current time against patch window
      ansible.builtin.assert:
        that:
          - >-
            (patch_start_minutes | int <= patch_end_minutes | int
            and patch_now_minutes | int >= patch_start_minutes | int
            and patch_now_minutes | int < patch_end_minutes | int)
            or
            (patch_start_minutes | int > patch_end_minutes | int
            and (patch_now_minutes | int >= patch_start_minutes | int
            or patch_now_minutes | int < patch_end_minutes | int))
        fail_msg: |
          Current time {{ ansible_date_time.hour }}:{{ ansible_date_time.minute }} is outside patch window {{ patch_window_start }}-{{ patch_window_end }}
|
||||
|
||||
# Record the installed package selections before any upgrade is applied.
- name: Create pre-patch backup
  when: backup_before_patch | bool
  block:
    - name: Create backup directory
      ansible.builtin.file:
        path: "/var/backups/pre-patch-{{ ansible_date_time.iso8601 }}"
        state: directory
        mode: '0755'

    # A shell is needed only for the redirect. There is no pipeline, so
    # "set -o pipefail" is omitted: it is a bash option and the shell module
    # runs /bin/sh unless told otherwise.
    - name: Capture current package list
      ansible.builtin.shell:
        cmd: dpkg --get-selections > "/var/backups/pre-patch-{{ ansible_date_time.iso8601 }}/packages.list"
      changed_when: false
|
||||
|
||||
# Count upgradable packages (informational; never fails the play).
# pipefail is a bash option, so the task names bash explicitly instead of
# relying on the shell module's /bin/sh default.
- name: Check for available updates
  ansible.builtin.shell: |
    set -o pipefail
    apt list --upgradable 2>/dev/null | grep -v "Listing..." | wc -l
  args:
    executable: /bin/bash
  register: updates_available_count
  changed_when: false
  failed_when: false
|
||||
|
||||
- name: Update package cache
|
||||
ansible.builtin.apt:
|
||||
update_cache: true
|
||||
cache_valid_time: 300
|
||||
changed_when: false
|
||||
|
||||
- name: Check if reboot required before patching
|
||||
ansible.builtin.stat:
|
||||
path: /var/run/reboot-required
|
||||
register: reboot_required_before
|
||||
changed_when: false
|
||||
|
||||
# Apply updates in a single task so apt_update_result always holds the result
# of the upgrade that actually ran. Two tasks registering the same variable
# would let the skipped one overwrite it.
# The upgrade modes are unchanged: dist when patch_security_only is true,
# full otherwise.
# NOTE(review): neither mode restricts the upgrade to security origins, so
# patch_security_only does not yet limit what gets installed - confirm the
# intended behaviour.
- name: Apply package updates
  ansible.builtin.apt:
    upgrade: "{{ patch_security_only | bool | ternary('dist', 'full') }}"
    update_cache: true
  register: apt_update_result
  notify: restart patching services
|
||||
|
||||
- name: Check if reboot required after patching
|
||||
ansible.builtin.stat:
|
||||
path: /var/run/reboot-required
|
||||
register: reboot_required_after
|
||||
changed_when: false
|
||||
|
||||
- name: Verify critical services are running
|
||||
ansible.builtin.service:
|
||||
name: "{{ item }}"
|
||||
state: started
|
||||
enabled: true
|
||||
loop: "{{ critical_services }}"
|
||||
failed_when: false
|
||||
|
||||
- name: Run post-patch health checks
|
||||
ansible.builtin.uri:
|
||||
url: http://localhost/health
|
||||
method: GET
|
||||
status_code: 200
|
||||
register: health_check
|
||||
failed_when: false
|
||||
ignore_errors: true
|
||||
when: "'webservers' in group_names"
|
||||
|
||||
- name: Set reboot required flag
|
||||
ansible.builtin.set_fact:
|
||||
reboot_required: "{{ reboot_required_after.stat.exists | default(false) }}"
|
||||
|
||||
- name: Perform system reboot if required
|
||||
ansible.builtin.reboot:
|
||||
msg: "Rebooting after security patches"
|
||||
timeout: "{{ reboot_timeout }}"
|
||||
when: reboot_required and reboot_if_required | bool
|
||||
|
||||
- name: Generate patching report
|
||||
ansible.builtin.template:
|
||||
src: patch_report.j2
|
||||
dest: /var/log/patch_report_{{ ansible_date_time.iso8601 }}.log
|
||||
mode: '0644'
|
||||
vars:
|
||||
updates_applied_count: "{{ apt_update_result.changed | ternary('Yes', 'No') }}"
|
||||
reboot_required_flag: "{{ reboot_required }}"
|
||||
+10
@@ -0,0 +1,10 @@
|
||||
Patching Report
|
||||
===============
|
||||
Generated: {{ ansible_date_time.iso8601 }}
|
||||
Host: {{ inventory_hostname }}
|
||||
|
||||
Updates Applied: {{ updates_applied_count }}
|
||||
Reboot Required: {{ reboot_required_flag }}
|
||||
Services Restarted: {{ services_to_restart | join(', ') }}
|
||||
|
||||
Backup Location: /var/backups/pre-patch-{{ ansible_date_time.iso8601 }}/
|
||||
@@ -0,0 +1,21 @@
|
||||
# Scenario: Simulate Failure and Patch
|
||||
|
||||
## Description
|
||||
|
||||
Validate that a service-level failure can be detected, recovered, and followed by a controlled patch workflow. This mirrors a maintenance window where a degraded node is stabilized before package updates are applied.
|
||||
|
||||
## Commands
|
||||
|
||||
```bash
|
||||
cd professional-infra/linux-operations-automation
|
||||
./scripts/simulate_failure.sh service 30 web
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/patch.yml
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml --check
|
||||
```
|
||||
|
||||
## Expected Result
|
||||
|
||||
- The simulation records a temporary service failure.
|
||||
- The service is restored after cleanup.
|
||||
- The patch playbook completes without unreachable hosts.
|
||||
- Hardening check mode reports no destructive changes.
|
||||
@@ -0,0 +1,116 @@
|
||||
---
# Scaling event scenario.
#
# Reads the host load averages, decides whether a scale-up or scale-down is
# needed against scaling_threshold, delegates the actual scaling to the
# scale_up / scale_down / load_balancer / monitoring roles, then verifies web
# health and writes a report.
- name: Enterprise Scaling Event Scenario
  hosts: all
  become: true
  gather_facts: true
  vars:
    scaling_threshold: 80
    cooldown_period: 300
    max_scale_up: 5
    min_instances: 2

  pre_tasks:
    - name: Log scenario start
      ansible.builtin.lineinfile:
        path: "/var/log/scaling_scenario.log"
        line: "{{ ansible_date_time.iso8601 }} - Starting scaling event scenario"
        create: true
        mode: '0644'

    - name: Check current load
      ansible.builtin.command: uptime
      register: system_load
      changed_when: false

    # Only the text after "load average" holds the three load figures; the
    # leading comma-separated fields of uptime are the clock and the uptime.
    # A decimal comma (locale dependent) is normalised to a dot.
    - name: Parse load average
      ansible.builtin.set_fact:
        load_1min: "{{ (load_values[0] | default('0')) | replace(',', '.') | float }}"
        load_5min: "{{ (load_values[1] | default('0')) | replace(',', '.') | float }}"
        load_15min: "{{ (load_values[2] | default('0')) | replace(',', '.') | float }}"
      vars:
        load_values: "{{ system_load.stdout.split('load average')[-1] | regex_findall('[0-9]+[.,][0-9]+') }}"

  tasks:
    # Facts set from templates may come back as strings, so every numeric
    # use below casts explicitly before comparing or dividing.
    - name: Evaluate scaling conditions
      ansible.builtin.set_fact:
        scale_up_needed: "{{ (load_5min | float) > (scaling_threshold | float) }}"
        scale_down_needed: "{{ (load_5min | float) < ((scaling_threshold | float) * 0.3) }}"

    # The group test is its own list item so it is evaluated as an
    # expression rather than treated as a constant string.
    - name: Scale up web servers
      ansible.builtin.include_role:
        name: scale_up
        tasks_from: web_servers
      vars:
        scale_count: "{{ [max_scale_up | int, ((load_5min | float) / 10) | int] | min }}"
      when:
        - scale_up_needed | bool
        - "'webservers' in group_names"

    - name: Scale up database servers
      ansible.builtin.include_role:
        name: scale_up
        tasks_from: database_servers
      vars:
        scale_count: "{{ [2, ((load_5min | float) / 20) | int] | min }}"
      when:
        - scale_up_needed | bool
        - "'databases' in group_names"

    - name: Update load balancer configuration
      ansible.builtin.include_role:
        name: load_balancer
        tasks_from: update_backends
      when: scale_up_needed | bool

    # host_index is the first run of digits in the inventory hostname;
    # hosts without digits get 0 and are therefore never scaled down.
    - name: Scale down web servers
      ansible.builtin.include_role:
        name: scale_down
        tasks_from: web_servers
      vars:
        host_index: "{{ inventory_hostname | regex_findall('[0-9]+') | first | default(0) | int }}"
        scale_count: "{{ [(host_index | int) - (min_instances | int), 1] | max }}"
      when:
        - scale_down_needed | bool
        - "'webservers' in group_names"
        - (host_index | int) > (min_instances | int)

    - name: Wait for cooldown period
      ansible.builtin.pause:
        seconds: "{{ cooldown_period }}"
      when: (scale_up_needed | bool) or (scale_down_needed | bool)

    - name: Verify scaling results
      ansible.builtin.uri:
        url: http://localhost/health
        method: GET
        status_code: 200
      register: health_check
      until: (health_check.status | default(0)) == 200
      retries: 5
      delay: 10
      when: "'webservers' in group_names"

    - name: Update monitoring thresholds
      ansible.builtin.include_role:
        name: monitoring
        tasks_from: update_alerts
      vars:
        new_threshold: "{{ (scaling_threshold | int) + 10 }}"

    # health_check has no status on hosts where the check was skipped
    # (non-webservers), hence the default(0) guards.
    - name: Send scaling notification
      community.general.mail:
        to: "{{ scaling_notification_email | default('infra-team@company.com') }}"
        subject: "Infrastructure Scaling Event - {{ inventory_hostname }}"
        body: |
          Scaling event completed on {{ inventory_hostname }}

          Load averages: {{ load_1min }}, {{ load_5min }}, {{ load_15min }}
          Action taken: {{ 'Scale Up' if scale_up_needed else 'Scale Down' if scale_down_needed else 'No Action' }}
          Health check: {{ 'PASSED' if (health_check.status | default(0)) == 200 else 'FAILED' }}

          See /var/log/scaling_scenario.log for details
      when: scaling_notification_email is defined
      ignore_errors: true

  post_tasks:
    - name: Generate scaling scenario report
      ansible.builtin.template:
        src: templates/scaling_scenario_report.j2
        dest: "/var/log/scaling_scenario_report_{{ ansible_date_time.iso8601 }}.log"
        mode: '0644'
      vars:
        scenario_outcome: "{{ 'SUCCESS' if (health_check.status | default(0)) == 200 else 'WARNING' }}"
        load_metrics: "{{ load_1min }}, {{ load_5min }}, {{ load_15min }}"

    - name: Log scenario completion
      ansible.builtin.lineinfile:
        path: "/var/log/scaling_scenario.log"
        line: "{{ ansible_date_time.iso8601 }} - Scaling event scenario completed"
|
||||
@@ -0,0 +1,388 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Enterprise Infrastructure Failure Simulation Script
|
||||
# Simulates various types of infrastructure failures for testing
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Configuration
|
||||
DOCKER_COMPOSE_FILE="docker-compose.yml"
|
||||
INVENTORY_FILE="inventory/hosts.ini"
|
||||
LOG_FILE="logs/failure_simulation.log"
|
||||
|
||||
# Default values
|
||||
FAILURE_TYPE="${1:-network}"
|
||||
DURATION="${2:-60}"
|
||||
TARGET_NODES="${3:-all}"
|
||||
INTENSITY="${INTENSITY:-medium}"
|
||||
|
||||
# Print a timestamped message to stdout and append the same line to $LOG_FILE.
log() {
    local stamp
    stamp="$(date '+%Y-%m-%d %H:%M:%S')"
    printf '%s - %s\n' "$stamp" "$*" | tee -a "$LOG_FILE"
}
|
||||
|
||||
# Log an error, undo any injected failure, and terminate with status 1.
#   $1 - error message
error_exit() {
    local message="$1"

    log "ERROR: $message"
    # Roll back whatever was injected before leaving
    cleanup_failure
    exit 1
}
|
||||
|
||||
# Check FAILURE_TYPE, DURATION and INTENSITY; abort via error_exit on the
# first value that is not acceptable.
validate_inputs() {
    if [[ ! "$FAILURE_TYPE" =~ ^(network|disk|service|node|cpu|memory)$ ]]; then
        error_exit "Invalid failure type: $FAILURE_TYPE. Must be network, disk, service, node, cpu, or memory"
    fi

    # Digits only, and at least one second
    if [[ ! "$DURATION" =~ ^[0-9]+$ ]] || [ "$DURATION" -lt 1 ]; then
        error_exit "Invalid duration: $DURATION. Must be a positive integer (seconds)"
    fi

    if [[ ! "$INTENSITY" =~ ^(low|medium|high|critical)$ ]]; then
        error_exit "Invalid intensity: $INTENSITY. Must be low, medium, high, or critical"
    fi
}
|
||||
|
||||
# Print the service names the failure should be applied to.
# Any TARGET_NODES value other than "all" is echoed back unchanged; "all"
# expands to a fixed list in simulation mode, otherwise to the services
# reported by docker compose.
get_target_containers() {
    if [ "$TARGET_NODES" != "all" ]; then
        echo "$TARGET_NODES"
        return
    fi

    if [ "${SIMULATION_MODE:-false}" = true ]; then
        echo "web db lb"
        return
    fi

    docker compose ps --services | grep -v "^NAME$" || true
}
|
||||
|
||||
# Network failure simulation.
# Disconnects every targeted container from the network named by its
# HostConfig.NetworkMode and records "<cid>:<network>" in
# /tmp/network_failure_state so cleanup_failure can reconnect it.
# In SIMULATION_MODE nothing is changed; the intent is only logged.
simulate_network_failure() {
    local containers container container_ids cid network

    containers="$(get_target_containers || true)"
    log "Simulating network failure on containers: $containers"

    if [ "${SIMULATION_MODE:-false}" = true ]; then
        log "SIMULATION_MODE=true: skipping Docker network changes"
        return
    fi

    for container in $containers; do
        container_ids="$(docker compose ps -q "$container" 2>/dev/null || true)"

        for cid in $container_ids; do
            [ -n "$cid" ] || continue

            # Resolve the network once; it is needed for both the
            # disconnect and the restore record.
            network="$(docker inspect "$cid" --format '{{.HostConfig.NetworkMode}}' 2>/dev/null || true)"
            if [ -z "$network" ]; then
                log "WARNING: could not determine network for container $cid; skipping"
                continue
            fi

            log "Disconnecting network for container $cid"

            # Record the restore information BEFORE disconnecting, so an
            # interruption right after the disconnect is still recoverable.
            echo "$cid:$network" >> /tmp/network_failure_state

            docker network disconnect "$network" "$cid" 2>/dev/null || true
        done
    done
}
|
||||
|
||||
# Disk failure simulation.
# Writes /tmp/disk_fill inside every targeted container; the size depends on
# INTENSITY (low 50 MB, medium 100 MB, high 500 MB, critical 1024 MB).
# Each container is recorded in /tmp/disk_failure_state for cleanup_failure.
# In SIMULATION_MODE nothing is written.
simulate_disk_failure() {
    local containers container container_ids cid fill_size_mb

    containers="$(get_target_containers || true)"
    log "Simulating disk space exhaustion on containers: $containers"

    if [ "${SIMULATION_MODE:-false}" = true ]; then
        log "SIMULATION_MODE=true: skipping container disk writes"
        return
    fi

    # The fill size only depends on INTENSITY, so compute it once.
    case "$INTENSITY" in
        low)      fill_size_mb=50 ;;
        high)     fill_size_mb=500 ;;
        critical) fill_size_mb=1024 ;;
        *)        fill_size_mb=100 ;;
    esac

    for container in $containers; do
        container_ids="$(docker compose ps -q "$container" 2>/dev/null || true)"

        for cid in $container_ids; do
            [ -n "$cid" ] || continue

            log "Filling disk space in container $cid"

            # Record first: if the write is interrupted half-way the
            # partial file is still removed during cleanup.
            echo "$cid:disk_fill" >> /tmp/disk_failure_state

            # dd is invoked directly, so the container does not need bash.
            if ! docker exec "$cid" dd if=/dev/zero of=/tmp/disk_fill bs=1M count="$fill_size_mb" >/dev/null 2>&1; then
                log "WARNING: could not write ${fill_size_mb}MB fill file in container $cid"
            fi
        done
    done
}
|
||||
|
||||
# Service failure simulation.
# Stops nginx, postgresql and haproxy (best effort) in every targeted
# container and records the container in /tmp/service_failure_state.
# In SIMULATION_MODE only the intended action is logged per target.
simulate_service_failure() {
    local targets target ids id svc

    targets="$(get_target_containers || true)"
    log "Simulating service failures on containers: $targets"

    if [ "${SIMULATION_MODE:-false}" = true ]; then
        for target in $targets; do
            log "SIMULATION_MODE=true: would stop services in $target"
        done
        return
    fi

    for target in $targets; do
        ids="$(docker compose ps -q "$target" 2>/dev/null || true)"

        for id in $ids; do
            if [ -z "$id" ]; then
                continue
            fi

            log "Stopping services in container $id"

            # A service that is absent in this container is simply skipped
            for svc in nginx postgresql haproxy; do
                docker exec "$id" systemctl stop "$svc" 2>/dev/null || true
            done

            echo "$id:services" >> /tmp/service_failure_state
        done
    done
}
|
||||
|
||||
# Node failure simulation.
# Pauses every targeted container with "docker pause" and records it in
# /tmp/node_failure_state so cleanup_failure can unpause it.
# In SIMULATION_MODE no container is touched.
simulate_node_failure() {
    local targets target ids id

    targets="$(get_target_containers || true)"
    log "Simulating complete node failures on containers: $targets"

    if [ "${SIMULATION_MODE:-false}" = true ]; then
        log "SIMULATION_MODE=true: skipping container pause"
        return
    fi

    for target in $targets; do
        ids="$(docker compose ps -q "$target" 2>/dev/null || true)"

        for id in $ids; do
            if [ -z "$id" ]; then
                continue
            fi

            log "Stopping container $id (node failure)"
            # A failing pause is not tolerated here: under "set -e" it
            # ends the script.
            docker pause "$id"
            echo "$id:paused" >> /tmp/node_failure_state
        done
    done
}
|
||||
|
||||
# CPU stress simulation.
# Starts a detached busy loop in every targeted container and records
# "<cid>:cpu_stress:<pid>" in /tmp/cpu_failure_state so cleanup_failure can
# kill it. In SIMULATION_MODE nothing is started.
simulate_cpu_failure() {
    local containers container container_ids cid pid attempt

    containers="$(get_target_containers || true)"
    log "Simulating CPU stress on containers: $containers"

    if [ "${SIMULATION_MODE:-false}" = true ]; then
        log "SIMULATION_MODE=true: skipping CPU stress"
        return
    fi

    for container in $containers; do
        container_ids="$(docker compose ps -q "$container" 2>/dev/null || true)"

        for cid in $container_ids; do
            [ -n "$cid" ] || continue

            log "Starting CPU stress in container $cid"

            # Start CPU stress process
            docker exec -d "$cid" bash -c "while true; do :; done" 2>/dev/null || true

            # The exec is detached, so the process may not be visible yet:
            # retry the PID lookup a few times before giving up.
            pid=""
            for attempt in 1 2 3; do
                pid="$(docker exec "$cid" ps aux 2>/dev/null | grep "while true" | grep -v grep | awk '{print $2}' | head -1 || true)"
                if [ -n "$pid" ]; then
                    break
                fi
                sleep 1
            done

            # Only record entries cleanup can act on; an empty PID would
            # make the later kill a silent no-op.
            if [ -n "$pid" ]; then
                echo "$cid:cpu_stress:$pid" >> /tmp/cpu_failure_state
            else
                log "WARNING: could not determine CPU stress PID in container $cid; no cleanup entry recorded"
            fi
        done
    done
}
|
||||
|
||||
# Memory stress simulation.
# Starts a detached "tail /dev/zero" in every targeted container and records
# "<cid>:memory_stress:<pid>" in /tmp/memory_failure_state so cleanup_failure
# can kill it. In SIMULATION_MODE nothing is started.
simulate_memory_failure() {
    local containers container container_ids cid pid attempt

    containers="$(get_target_containers || true)"
    log "Simulating memory exhaustion on containers: $containers"

    if [ "${SIMULATION_MODE:-false}" = true ]; then
        log "SIMULATION_MODE=true: skipping memory stress"
        return
    fi

    for container in $containers; do
        container_ids="$(docker compose ps -q "$container" 2>/dev/null || true)"

        for cid in $container_ids; do
            [ -n "$cid" ] || continue

            log "Starting memory stress in container $cid"

            # Start memory stress process
            docker exec -d "$cid" bash -c "tail /dev/zero" 2>/dev/null || true

            # The exec is detached, so the process may not be visible yet:
            # retry the PID lookup a few times before giving up.
            pid=""
            for attempt in 1 2 3; do
                pid="$(docker exec "$cid" ps aux 2>/dev/null | grep "tail /dev/zero" | grep -v grep | awk '{print $2}' | head -1 || true)"
                if [ -n "$pid" ]; then
                    break
                fi
                sleep 1
            done

            # Only record entries cleanup can act on; an empty PID would
            # make the later kill a silent no-op.
            if [ -n "$pid" ]; then
                echo "$cid:memory_stress:$pid" >> /tmp/memory_failure_state
            else
                log "WARNING: could not determine memory stress PID in container $cid; no cleanup entry recorded"
            fi
        done
    done
}
|
||||
|
||||
# Run the simulate_<type>_failure function matching FAILURE_TYPE.
# Any other value is ignored.
inject_failure() {
    case "$FAILURE_TYPE" in
        network|disk|service|node|cpu|memory)
            "simulate_${FAILURE_TYPE}_failure"
            ;;
    esac
}
|
||||
|
||||
# Undo every injected failure that left a state file in /tmp.
# Each state file holds one "<cid>:<detail>[:<pid>]" line per affected
# container; the file is removed once its entries have been processed.
# All docker calls are best effort.
cleanup_failure() {
    local target_id net_name stress_pid state_file svc

    log "Cleaning up failure simulation"

    # Reattach containers to the network they were disconnected from
    if [[ -f /tmp/network_failure_state ]]; then
        while IFS=: read -r target_id net_name; do
            docker network connect "$net_name" "$target_id" 2>/dev/null || true
        done < /tmp/network_failure_state
        rm -f /tmp/network_failure_state
    fi

    # Delete the disk fill files
    if [[ -f /tmp/disk_failure_state ]]; then
        while IFS=: read -r target_id _; do
            docker exec "$target_id" rm -f /tmp/disk_fill 2>/dev/null || true
        done < /tmp/disk_failure_state
        rm -f /tmp/disk_failure_state
    fi

    # Bring the stopped services back
    if [[ -f /tmp/service_failure_state ]]; then
        while IFS=: read -r target_id _; do
            for svc in nginx postgresql haproxy; do
                docker exec "$target_id" systemctl start "$svc" 2>/dev/null || true
            done
        done < /tmp/service_failure_state
        rm -f /tmp/service_failure_state
    fi

    # Resume paused containers
    if [[ -f /tmp/node_failure_state ]]; then
        while IFS=: read -r target_id _; do
            docker unpause "$target_id" 2>/dev/null || true
        done < /tmp/node_failure_state
        rm -f /tmp/node_failure_state
    fi

    # Kill CPU and memory stress processes by their recorded PID
    for state_file in /tmp/cpu_failure_state /tmp/memory_failure_state; do
        if [[ -f "$state_file" ]]; then
            while IFS=: read -r target_id _ stress_pid; do
                docker exec "$target_id" kill -9 "$stress_pid" 2>/dev/null || true
            done < "$state_file"
            rm -f "$state_file"
        fi
    done
}
|
||||
|
||||
# Monitor the injected failure for DURATION seconds.
# Every cycle logs a warning when no container is Up or Paused and logs a
# "docker stats" snapshot. In SIMULATION_MODE it returns immediately after
# logging that validation is simulated.
monitor_failure() {
    local poll_interval=10
    local end_time now remaining compose_status

    # 10# forces base 10 so a zero-padded duration such as "08" (accepted
    # by validate_inputs) is not rejected as an invalid octal number.
    end_time=$(( $(date +%s) + 10#$DURATION ))

    log "Monitoring failure for $DURATION seconds"

    if [ "${SIMULATION_MODE:-false}" = true ]; then
        log "SIMULATION_MODE=true: validation simulated"
        return
    fi

    while :; do
        now=$(date +%s)
        remaining=$(( end_time - now ))
        if [ "$remaining" -le 0 ]; then
            break
        fi

        # Capture the output first: piping straight into "grep -q" can make
        # the producer fail on a closed pipe, which "pipefail" would then
        # report as "no container running".
        compose_status="$(docker compose ps 2>/dev/null || true)"
        if ! grep -q "Up\|Paused" <<< "$compose_status"; then
            log "WARNING: All containers are down"
        fi

        # Log system metrics
        log "System status: $(docker stats --no-stream --format 'table {{.Container}}\t{{.CPUPerc}}\t{{.MemPerc}}' | tail -n +2)"

        # Never sleep past the requested end time.
        if [ "$remaining" -lt "$poll_interval" ]; then
            sleep "$remaining"
        else
            sleep "$poll_interval"
        fi
    done
}
|
||||
|
||||
# Write a plain-text summary of the run to reports/failure_simulation_<ts>.txt.
# Both status sections are produced by "docker compose ps" at the moment the
# report is written.
generate_report() {
    local report_file generated_at status_before status_after

    report_file="reports/failure_simulation_$(date +%Y%m%d_%H%M%S).txt"
    generated_at="$(date)"
    status_before="$(docker compose ps 2>/dev/null || echo "Docker Compose not running")"
    status_after="$(docker compose ps 2>/dev/null || echo "Docker Compose not running")"

    cat > "$report_file" << EOF
Failure Simulation Report
========================

Timestamp: $generated_at
Failure Type: $FAILURE_TYPE
Duration: $DURATION seconds
Target Nodes: $TARGET_NODES
Intensity: $INTENSITY

Pre-failure Status:
$status_before

Post-failure Status:
$status_after

Log File: $LOG_FILE
EOF

    log "Failure simulation report generated: $report_file"
}
|
||||
|
||||
# Entry point: validate the inputs, then inject, monitor, clean up and report.
main() {
    local phase

    log "Starting failure simulation: $FAILURE_TYPE for $DURATION seconds"

    validate_inputs

    # The phases run strictly in this order
    for phase in inject_failure monitor_failure cleanup_failure generate_report; do
        "$phase"
    done

    log "Failure simulation completed successfully"
}
|
||||
|
||||
# Trap for cleanup on script exit
|
||||
trap cleanup_failure EXIT
|
||||
|
||||
# Initialize logging
|
||||
mkdir -p logs reports
|
||||
|
||||
# Run main function
|
||||
main "$@"
|
||||
@@ -0,0 +1,229 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Enterprise Infrastructure Scaling Simulation Script
|
||||
# Simulates scaling operations for infrastructure nodes
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Configuration
|
||||
DOCKER_COMPOSE_FILE="docker-compose.yml"
|
||||
INVENTORY_FILE="inventory/hosts.ini"
|
||||
LOG_FILE="logs/scaling_simulation.log"
|
||||
|
||||
# Default values
|
||||
DIRECTION="${1:-up}"
|
||||
COUNT="${2:-1}"
|
||||
NODE_TYPE="${3:-web}"
|
||||
SIMULATION_MODE="${SIMULATION_MODE:-false}"
|
||||
|
||||
# Print a timestamped message to stdout and append the same line to $LOG_FILE.
log() {
    local stamp
    stamp="$(date '+%Y-%m-%d %H:%M:%S')"
    printf '%s - %s\n' "$stamp" "$*" | tee -a "$LOG_FILE"
}
|
||||
|
||||
# Log an error message and terminate the script with status 1.
#   $1 - error message
error_exit() {
    local message="$1"

    log "ERROR: $message"
    exit 1
}
|
||||
|
||||
# Check DIRECTION, COUNT and NODE_TYPE; abort via error_exit on the first
# value that is not acceptable.
validate_inputs() {
    case "$DIRECTION" in
        up|down) ;;
        *) error_exit "Invalid direction: $DIRECTION. Must be 'up' or 'down'" ;;
    esac

    # Digits only, and at least one node
    if [[ ! "$COUNT" =~ ^[0-9]+$ ]] || [ "$COUNT" -lt 1 ]; then
        error_exit "Invalid count: $COUNT. Must be a positive integer"
    fi

    if [[ ! "$NODE_TYPE" =~ ^(web|db|lb|monitor)$ ]]; then
        error_exit "Invalid node type: $NODE_TYPE. Must be web, db, lb, or monitor"
    fi
}
|
||||
|
||||
# Print the number of nodes of the given type.
#   $1 - node type (web, db, lb or monitor)
# In SIMULATION_MODE fixed counts are reported (web 3, db 2, lb/monitor 1);
# otherwise the "Up" lines of "docker compose ps <type>" are counted.
# Always prints an integer and returns 0, so callers running under
# "set -e" / "pipefail" can use the output directly.
get_current_count() {
    local type="$1"

    if [ "$SIMULATION_MODE" = true ]; then
        case "$type" in
            web) echo 3 ;;
            db) echo 2 ;;
            lb|monitor) echo 1 ;;
            *) echo 0 ;;
        esac
        return
    fi

    case "$type" in
        web|db|lb|monitor)
            # grep -c prints 0 but exits 1 when nothing matches; the
            # "|| true" keeps that from being treated as a failure.
            docker compose ps "$type" | grep -c "Up" || true
            ;;
        *)
            echo 0
            ;;
    esac
}
|
||||
|
||||
# Scale up infrastructure.
#   $1 - node type
#   $2 - number of nodes to ADD
# In SIMULATION_MODE only the inventory update is logged. Otherwise the
# service is scaled with docker compose, the script waits 30 seconds, the
# inventory is updated and the provisioning playbook is run for the type.
scale_up() {
    local type="$1"
    local count="$2"
    local current_count target_count

    log "Scaling up $count $type nodes"

    if [ "$SIMULATION_MODE" = true ]; then
        log "SIMULATION_MODE=true: skipping Docker Compose mutation and Ansible provisioning"
        update_inventory "$type" "$count" "add"
        log "Successfully simulated scale up of $count $type nodes"
        return
    fi

    # "--scale SERVICE=NUM" sets the TOTAL number of instances, so the
    # requested increment has to be added to what is already running;
    # passing the increment alone could shrink the service instead.
    current_count="$(get_current_count "$type" || true)"
    if ! [[ "$current_count" =~ ^[0-9]+$ ]]; then
        current_count=0
    fi
    target_count=$(( 10#$current_count + 10#$count ))
    log "Scaling $type from $current_count to $target_count instances"

    docker compose -f "$DOCKER_COMPOSE_FILE" up -d --scale "${type}=${target_count}"

    # Wait for containers to be ready
    log "Waiting for containers to be ready..."
    sleep 30

    # Update inventory
    update_inventory "$type" "$count" "add"

    # Run provisioning playbook on new nodes
    if [ "$SIMULATION_MODE" = false ]; then
        ansible-playbook -i "$INVENTORY_FILE" playbooks/provision.yml --limit "${type}*"
    fi

    log "Successfully scaled up $count $type nodes"
}
|
||||
|
||||
# Scale down infrastructure.
#   $1 - node type
#   $2 - number of nodes to remove
# Refuses to remove more nodes than are currently counted. In
# SIMULATION_MODE only the inventory update is logged; otherwise the first
# <count> "Up" entries of "docker compose ps <type>" are decommissioned,
# stopped and removed.
scale_down() {
    local type="$1"
    local count="$2"
    local running victims victim

    running="$(get_current_count "$type" || true)"
    if [ "$running" -lt "$count" ]; then
        error_exit "Cannot scale down $count nodes. Only $running $type nodes currently running"
    fi

    log "Scaling down $count $type nodes"

    if [ "$SIMULATION_MODE" = true ]; then
        log "SIMULATION_MODE=true: skipping Docker Compose mutation and Ansible decommissioning"
        update_inventory "$type" "$count" "remove"
        log "Successfully simulated scale down of $count $type nodes"
        return
    fi

    # First column of the listing, limited to the requested number of nodes
    victims="$(docker compose ps "$type" | grep "Up" | head -n "$count" | awk '{print $1}' || true)"

    for victim in $victims; do
        if [ "$SIMULATION_MODE" = false ]; then
            ansible-playbook -i "$INVENTORY_FILE" playbooks/decommission.yml --limit "$victim"
        fi
        docker stop "$victim"
        docker rm "$victim"
    done

    update_inventory "$type" "$count" "remove"

    log "Successfully scaled down $count $type nodes"
}
|
||||
|
||||
# Record an inventory change in the log.
#   $1 - node type
#   $2 - node count
#   $3 - action: "add" or "remove" (anything else only logs the first line)
# No inventory file is modified here; the change is only logged.
update_inventory() {
    local type="$1" count="$2" action="$3"

    log "Updating inventory for $action $count $type nodes"

    if [ "$action" = "add" ]; then
        log "Added $count $type nodes to inventory"
    elif [ "$action" = "remove" ]; then
        log "Removed $count $type nodes from inventory"
    fi
}
|
||||
|
||||
# Health check after scaling.
# In SIMULATION_MODE the checks are only logged. Otherwise the script aborts
# when "docker compose ps" shows no container as Up, and logs a warning when
# the Ansible ping against the inventory fails.
health_check() {
    local compose_status

    log "Running health checks after scaling"

    if [ "$SIMULATION_MODE" = true ]; then
        log "SIMULATION_MODE=true: health checks simulated"
        return
    fi

    # Capture the output first: piping straight into "grep -q" can make the
    # producer fail on a closed pipe, which "pipefail" would then report as
    # a failed check even though containers are running.
    compose_status="$(docker compose ps 2>/dev/null || true)"
    if ! grep -q "Up" <<< "$compose_status"; then
        error_exit "Some containers failed to start"
    fi

    # Ansible ping check
    if [ "$SIMULATION_MODE" = false ]; then
        if ! ansible -i "$INVENTORY_FILE" all -m ping >/dev/null 2>&1; then
            log "WARNING: Some nodes failed Ansible ping check"
        fi
    fi

    log "Health checks completed"
}
|
||||
|
||||
# Write a plain-text summary of the run to reports/scaling_report_<ts>.txt,
# including the docker compose status and the Ansible host list (each
# replaced by a fallback message when its command fails).
generate_report() {
    local report_file generated_at compose_status inventory_status

    report_file="reports/scaling_report_$(date +%Y%m%d_%H%M%S).txt"
    generated_at="$(date)"
    compose_status="$(docker compose ps 2>/dev/null || echo "Docker Compose not running")"
    inventory_status="$(ansible -i "$INVENTORY_FILE" --list-hosts all 2>/dev/null || echo "Ansible inventory check failed")"

    cat > "$report_file" << EOF
Scaling Simulation Report
========================

Timestamp: $generated_at
Direction: $DIRECTION
Node Type: $NODE_TYPE
Count: $COUNT
Simulation Mode: $SIMULATION_MODE

Current Status:
$compose_status

Inventory Status:
$inventory_status

Log File: $LOG_FILE
EOF

    log "Scaling report generated: $report_file"
}
|
||||
|
||||
# Entry point: validate the inputs, scale in the requested direction, then
# run the health check and write the report.
main() {
    log "Starting scaling simulation: $DIRECTION $COUNT $NODE_TYPE nodes"

    validate_inputs

    if [ "$DIRECTION" = "up" ]; then
        scale_up "$NODE_TYPE" "$COUNT"
    elif [ "$DIRECTION" = "down" ]; then
        scale_down "$NODE_TYPE" "$COUNT"
    fi

    health_check
    generate_report

    log "Scaling simulation completed successfully"
}
|
||||
|
||||
# Initialize logging
|
||||
mkdir -p logs reports
|
||||
|
||||
# Run main function
|
||||
main "$@"
|
||||
Reference in New Issue
Block a user