diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 2a410be..867160a 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -1,118 +1,31 @@ -name: CI Pipeline +name: ci on: push: - branches: [ main, develop ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] jobs: - lint-ansible: + validate: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: Install Ansible Lint - run: pip install ansible-lint - - name: Lint Ansible Playbooks - run: | - cd enterprise-infra-simulator - ansible-lint playbooks/*.yml - - name: Check Ansible Syntax - run: | - cd enterprise-infra-simulator - ansible-playbook --syntax-check playbooks/*.yml + - uses: actions/checkout@v4 - test-python: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' - - name: Install Dependencies - run: | - cd migration-validation-framework - pip install -r requirements.txt - - name: Run Python Tests - run: | - cd migration-validation-framework - python -m pytest tests/ -v --cov=. --cov-report=xml - - name: Lint Python Code - run: | - pip install flake8 black isort - cd migration-validation-framework - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - black --check . - isort --check-only . 
+ - name: Python syntax check + run: | + python3 -m py_compile \ + migration-validation-framework/cli.py \ + migration-validation-framework/collectors/*.py \ + migration-validation-framework/validators/*.py \ + migration-validation-framework/reports/*.py - validate-docker: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Validate Docker Compose - run: | - cd observability-stack - docker-compose config - - name: Check Docker Images - run: | - cd observability-stack - docker-compose pull --quiet + - name: Ansible syntax check + run: | + python3 -m pip install --user ansible + ansible-playbook -i enterprise-infra-simulator/inventory/hosts.ini \ + --syntax-check enterprise-infra-simulator/playbooks/*.yml - security-scan: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@master - with: - scan-type: 'fs' - scan-ref: '.' - format: 'sarif' - output: 'trivy-results.sarif' - - name: Upload Trivy scan results to GitHub Security tab - uses: github/codeql-action/upload-sarif@v2 - if: always() - with: - sarif_file: 'trivy-results.sarif' - - documentation: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Check Documentation - run: | - # Check for broken links in README files - find . -name "README.md" -exec markdown-link-check {} \; - # Validate YAML files - find . 
-name "*.yml" -o -name "*.yaml" | xargs -I {} yamllint {} - - integration-test: - runs-on: ubuntu-latest - needs: [lint-ansible, test-python, validate-docker] - steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' - - name: Install Dependencies - run: | - pip install ansible docker-compose - - name: Run Integration Tests - run: | - # Start infrastructure simulator - cd enterprise-infra-simulator - make up - sleep 30 - # Run basic validation - ansible -i inventory/hosts.ini all -m ping - # Test migration framework - cd ../migration-validation-framework - python cli.py --help - # Validate observability stack - cd ../observability-stack - docker-compose config - # Cleanup - cd ../enterprise-infra-simulator - make destroy \ No newline at end of file + - name: Docker compose validation + run: | + docker compose -f observability-stack/docker-compose.yml config diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..edb9ae6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +__pycache__/ +*.pyc +*.log +.env +.DS_Store diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a8f652..7af936c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ - **Comprehensive Ansible automation suite**: - `provision.yml`: Node provisioning with security hardening, package installation, and service configuration - `patch.yml`: Automated patching with rollback capabilities and notification system - - `harden.yml`: Security hardening following CIS benchmarks (firewall, SSH, user management) + - `hardening.yml`: Security hardening following CIS benchmarks (firewall, SSH, user management) - `decommission.yml`: Graceful node decommissioning with cleanup and notification - **Operational scripts**: - `simulate_scaling.sh`: Infrastructure scaling simulation @@ -120,4 +120,4 @@ - Disaster recovery procedures --- -*Portfolio created to demonstrate enterprise-level Linux infrastructure engineering capabilities across 
the full technology stack.* \ No newline at end of file +*Portfolio created to demonstrate enterprise-level Linux infrastructure engineering capabilities across the full technology stack.* diff --git a/README.md b/README.md index 3d54a49..faf8ccb 100644 --- a/README.md +++ b/README.md @@ -1,59 +1,19 @@ -# Enterprise Infrastructure Portfolio +# Infrastructure Engineering Portfolio -This mono-repository showcases enterprise-level Linux infrastructure engineering capabilities through three comprehensive projects demonstrating DevOps and Platform Engineering best practices. +This repository contains independent infrastructure projects focused on automation, migration assurance, and observability. The projects are intentionally small enough to run locally, but structured around the operating patterns used in enterprise platform teams: repeatable workflows, clear evidence artifacts, and operational documentation. -## Projects Overview +## Projects -### 1. Enterprise Infrastructure Simulator -A container-based simulation of enterprise Linux infrastructure with Ansible automation for provisioning, patching, hardening, and decommissioning operations. Includes failure simulation and scaling scenarios. +- [Enterprise Infrastructure Simulator](enterprise-infra-simulator/) - Ansible-driven lifecycle operations for provisioning, patching, hardening, decommissioning, and failure simulation across Linux nodes. +- [Migration Validation Framework](migration-validation-framework/) - Python CLI for collecting before/after system snapshots and producing structured migration comparison results. +- [Observability Stack](observability-stack/) - Docker Compose based logging and dashboard stack with alert rules, sample logs, and incident simulation. 
-**Key Features:** -- Multi-node Linux simulation using Docker containers -- Ansible playbooks for infrastructure lifecycle management -- Automated scaling and failure injection scripts -- Enterprise-grade inventory and scenario management +## Skills Demonstrated -### 2. Migration Validation Framework -A Python CLI tool for validating system migrations by collecting, comparing, and reporting on system state changes. Generates comprehensive before/after snapshots and HTML reports. +- Infrastructure automation with Ansible +- Operational scenario design and incident simulation +- Migration validation, drift detection, and JSON reporting +- Docker Compose service validation +- Repository hygiene, CI checks, and professional project documentation -**Key Features:** -- Automated system data collection (mounts, services, disk usage) -- JSON snapshot generation and comparison -- HTML report generation with change visualization -- CLI interface for enterprise migration workflows - -### 3. Observability Stack -A complete monitoring and logging stack using ELK (Elasticsearch, Logstash, Kibana) and Grafana for comprehensive infrastructure observability. - -**Key Features:** -- Docker Compose deployment of full observability stack -- Sample log ingestion and processing pipelines -- Alerting rules and incident simulation scenarios -- Real-time dashboards and monitoring capabilities - -## Architecture - -See [docs/architecture.md](docs/architecture.md) for detailed architecture overview. - -## Runbooks - -Operational procedures and troubleshooting guides available in [docs/runbooks.md](docs/runbooks.md). - -## Getting Started - -Each project contains its own README.md with setup and usage instructions. - -## CI/CD - -Automated testing and linting via Gitea workflows in [.gitea/workflows/](.gitea/workflows/). 
- -## Prerequisites - -- Docker and Docker Compose -- Ansible -- Python 3.8+ -- Make - -## License - -Enterprise Internal Use Only \ No newline at end of file +Each project remains independent and includes its own README, architecture notes, examples, and runnable scenarios. diff --git a/docs/runbooks.md b/docs/runbooks.md index 29a5e32..c9684b1 100644 --- a/docs/runbooks.md +++ b/docs/runbooks.md @@ -53,7 +53,7 @@ make up ```bash cd enterprise-infra-simulator -ansible-playbook -i inventory/hosts.ini playbooks/harden.yml +ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml ``` **Hardening Steps:** @@ -106,7 +106,7 @@ make destroy ```bash cd migration-validation-framework -python cli.py snapshot --env production --label pre-migration +python3 cli.py collect --output before.json --systems web01,db01 ``` **Data Collected:** @@ -118,8 +118,8 @@ python cli.py snapshot --env production --label pre-migration ### Post-Migration Validation ```bash -python cli.py snapshot --env production --label post-migration -python cli.py compare pre-migration post-migration +python3 cli.py collect --output after.json --systems web01,db01 +python3 cli.py compare before.json after.json --output diff.json ``` **Validation Checks:** @@ -131,7 +131,7 @@ python cli.py compare pre-migration post-migration ### Report Generation ```bash -python cli.py report --comparison-id --format html +python3 cli.py report --comparison diff.json --format html ``` **Report Contents:** @@ -293,7 +293,7 @@ docker-compose exec elasticsearch curl -X PUT "localhost:9200/_snapshot/backup" #### Migration Data Backup ```bash cd migration-validation-framework -python cli.py backup --destination /backup/location +tar -czf /backup/location/migration-validation-framework.tgz -C .. migration-validation-framework ``` ## Emergency Procedures @@ -326,4 +326,4 @@ cd observability-stack && docker-compose up -d - **Daily:** Log rotation and cleanup - **Weekly:** Security patching and updates - **Monthly:** Performance optimization 
and capacity planning -- **Quarterly:** Architecture review and modernization \ No newline at end of file +- **Quarterly:** Architecture review and modernization diff --git a/enterprise-infra-simulator/Makefile b/enterprise-infra-simulator/Makefile index 95be5f8..a0b6064 100644 --- a/enterprise-infra-simulator/Makefile +++ b/enterprise-infra-simulator/Makefile @@ -1,6 +1,6 @@ # Enterprise Infrastructure Simulator Makefile -.PHONY: help up down patch destroy status logs clean test +.PHONY: help run demo up down patch destroy status logs clean test # Default target help: ## Show this help message @@ -9,6 +9,13 @@ help: ## Show this help message @echo "Available commands:" @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf " %-15s %s\n", $$1, $$2}' +run: ## Run the default simulator workflow + ansible-playbook -i inventory/hosts.ini playbooks/provision.yml + +demo: ## Run a failure-and-patch demonstration + ./scripts/simulate_failure.sh service 30 web + ansible-playbook -i inventory/hosts.ini playbooks/patch.yml + # Infrastructure management up: ## Start the infrastructure simulation @echo "Starting enterprise infrastructure simulation..." @@ -144,7 +151,7 @@ format: ## Format code and configuration # Security harden: ## Apply security hardening @echo "Applying security hardening..." - ansible-playbook -i inventory/hosts.ini playbooks/harden.yml + ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml security-scan: ## Run security scans @echo "Running security scans..." 
@@ -163,4 +170,4 @@ help-failure: ## Show failure simulation commands @echo " make fail-network DURATION=60 - Network failure for 60s" @echo " make fail-disk DURATION=120 - Disk exhaustion for 120s" @echo " make fail-service DURATION=30 - Service failure for 30s" - @echo " make fail-node DURATION=300 - Node failure for 300s" \ No newline at end of file + @echo " make fail-node DURATION=300 - Node failure for 300s" diff --git a/enterprise-infra-simulator/README.md b/enterprise-infra-simulator/README.md index a709f84..f586bb9 100644 --- a/enterprise-infra-simulator/README.md +++ b/enterprise-infra-simulator/README.md @@ -1,268 +1,74 @@ # Enterprise Infrastructure Simulator -A container-based simulation environment for enterprise Linux infrastructure operations. This project provides Ansible automation for provisioning, patching, hardening, and decommissioning of simulated Linux nodes, along with scripts for scaling and failure simulation. +## Problem Statement -## Overview +Infrastructure teams need a safe place to rehearse lifecycle operations before applying them to production fleets. Patch windows, hardening changes, scale events, and node failures all carry operational risk when they are tested only during real incidents. -The Enterprise Infrastructure Simulator creates a realistic environment for testing and demonstrating infrastructure automation at scale. It uses Docker containers to simulate multiple Linux nodes and provides comprehensive Ansible playbooks for enterprise operations. +## Solution Overview -## Architecture +This project models common Linux infrastructure operations with Ansible playbooks and shell-based simulations. It keeps the automation readable and auditable while producing example evidence that resembles a real change record. 
-- **Container Simulation:** Docker-based Linux nodes with realistic configurations -- **Ansible Automation:** Modular playbooks for infrastructure lifecycle management -- **Dynamic Inventory:** Automated host discovery and grouping -- **Simulation Scripts:** Automated scaling and failure injection -- **Scenario Management:** Pre-defined operational scenarios +## Architecture Overview -## Quick Start +``` +Operator -> Make/CLI -> Ansible Inventory -> Playbooks -> Linux Nodes + | | + v v + Scenarios Reports/Logs +``` -### Prerequisites +Core components: -- Docker and Docker Compose -- Ansible 2.9+ -- Make +- `inventory/hosts.ini` defines managed node groups. +- `playbooks/` contains provisioning, patching, hardening, and decommissioning workflows. +- `scripts/` injects scaling and failure conditions. +- `scenarios/` documents operational exercises. +- `examples/` stores representative outputs for review. -### Setup +## How to Run ```bash -# Clone and navigate to project cd enterprise-infra-simulator -# Start the infrastructure -make up +# Validate playbook syntax. +make test -# Verify deployment -ansible -i inventory/hosts.ini all -m ping -``` +# Provision the simulated estate. +make run -## Available Operations - -### Infrastructure Management - -```bash -# Provision new nodes -ansible-playbook -i inventory/hosts.ini playbooks/provision.yml - -# Apply security patches +# Apply security patches. make patch -# Harden systems -ansible-playbook -i inventory/hosts.ini playbooks/harden.yml +# Apply host hardening. +make harden -# Decommission nodes -ansible-playbook -i inventory/hosts.ini playbooks/decommission.yml - -# Destroy infrastructure -make destroy +# Run the failure and patch demo. 
+make demo ``` -### Simulation Operations +Direct Ansible commands are also supported: ```bash -# Scale up infrastructure -./scripts/simulate_scaling.sh up 5 - -# Simulate network failure -./scripts/simulate_failure.sh --type network --duration 300 - -# Run operational scenario -ansible-playbook -i inventory/hosts.ini scenarios/scaling_event.yml +ansible-playbook -i inventory/hosts.ini playbooks/provision.yml +ansible-playbook -i inventory/hosts.ini playbooks/patch.yml +ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml ``` -## Project Structure +## Example Output -``` -enterprise-infra-simulator/ -├── inventory/ # Ansible inventory files -│ └── hosts.ini # Dynamic host inventory -├── playbooks/ # Ansible automation playbooks -│ ├── provision.yml # Node provisioning -│ ├── patch.yml # Security patching -│ ├── harden.yml # Security hardening -│ └── decommission.yml # Node decommissioning -├── scripts/ # Simulation and utility scripts -│ ├── simulate_scaling.sh # Infrastructure scaling -│ └── simulate_failure.sh # Failure injection -├── scenarios/ # Operational scenarios -│ └── scaling_event.yml # Scaling scenario -├── docker-compose.yml # Container orchestration -├── Makefile # Build automation -└── README.md +```text +PLAY RECAP ********************************************************************* +web01 : ok=21 changed=7 unreachable=0 failed=0 skipped=3 rescued=0 ignored=1 +db01 : ok=18 changed=4 unreachable=0 failed=0 skipped=5 rescued=0 ignored=1 +lb01 : ok=16 changed=3 unreachable=0 failed=0 skipped=6 rescued=0 ignored=0 + +Patch status: SUCCESS +Updates applied: 12 +Reboot required: false ``` -## Inventory Management +Additional sample evidence is available in [examples/patch-output.txt](examples/patch-output.txt) and [examples/failure-simulation.txt](examples/failure-simulation.txt). 
-The simulator uses dynamic inventory with the following groups: +## Real-World Use Case -- `webservers`: Web application servers -- `databases`: Database servers -- `loadbalancers`: Load balancing infrastructure -- `monitoring`: Monitoring and logging servers - -## Playbooks - -### Provision Playbook -- Creates Docker containers with base Linux configurations -- Installs required packages and services -- Configures basic networking and security -- Registers nodes in inventory - -### Patch Playbook -- Updates system packages -- Applies security patches -- Restarts services as needed -- Generates patch reports - -### Harden Playbook -- Implements CIS security benchmarks -- Configures firewall rules -- Hardens SSH configuration -- Disables unnecessary services - -### Decommission Playbook -- Gracefully stops services -- Exports configuration and data -- Removes containers -- Cleans up inventory - -## Simulation Scripts - -### Scaling Simulation -```bash -./scripts/simulate_scaling.sh [up|down] [count] [type] -``` - -Parameters: -- `direction`: up/down -- `count`: Number of nodes to add/remove -- `type`: Node type (web/db/lb/monitor) - -### Failure Simulation -```bash -./scripts/simulate_failure.sh --type [failure_type] --duration [seconds] -``` - -Failure Types: -- `network`: Network connectivity issues -- `disk`: Disk space exhaustion -- `service`: Service failures -- `node`: Complete node outages - -## Scenarios - -Pre-defined operational scenarios for testing: - -- **Scaling Event:** Automated scaling during traffic spikes -- **Disaster Recovery:** Node failure and recovery procedures -- **Maintenance Window:** Scheduled patching and updates -- **Security Incident:** Breach simulation and response - -## Configuration - -### Environment Variables - -```bash -# Number of initial nodes -INFRA_NODE_COUNT=3 - -# Node types to deploy -INFRA_NODE_TYPES=web,db,lb - -# Simulation parameters -SIMULATION_DURATION=3600 -SIMULATION_INTENSITY=medium -``` - -### Docker 
Configuration - -Container resources and networking are configured in `docker-compose.yml`: - -```yaml -services: - infra-node: - image: ubuntu:20.04 - deploy: - replicas: 3 - resources: - limits: - memory: 512M - cpus: '0.5' -``` - -## Monitoring and Logging - -- Ansible execution logs: `ansible.log` -- Container logs: `docker logs ` -- Simulation logs: `logs/simulation.log` - -## Troubleshooting - -### Common Issues - -**Ansible Connection Failures:** -```bash -# Check container status -docker ps | grep infra-sim - -# Verify SSH connectivity -ansible -i inventory/hosts.ini all -m ping -``` - -**Container Resource Issues:** -```bash -# Check Docker resources -docker system df - -# Clean up containers -docker system prune -``` - -**Simulation Script Errors:** -```bash -# Check script permissions -chmod +x scripts/*.sh - -# Verify dependencies -./scripts/simulate_failure.sh --help -``` - -## Development - -### Adding New Playbooks - -1. Create playbook in `playbooks/` directory -2. Follow Ansible best practices -3. Test with `--check` mode -4. Update documentation - -### Custom Scenarios - -1. Define scenario in `scenarios/` directory -2. Include required variables -3. Test with dry-run -4. Document operational procedures - -## Security Considerations - -- Containers run with limited privileges -- SSH keys are generated per deployment -- Firewall rules are applied automatically -- Security scanning integrated in CI/CD - -## Performance Optimization - -- Container resource limits prevent resource exhaustion -- Ansible parallel execution for faster operations -- Efficient failure simulation without full outages -- Optimized Docker layer caching - -## Contributing - -1. Follow existing code structure and naming conventions -2. Add comprehensive documentation -3. Include tests for new functionality -4. 
Update runbooks for operational changes - -## License - -Enterprise Internal Use Only \ No newline at end of file +A platform team can use this project to demonstrate how routine operating procedures are encoded, reviewed, and tested before production change windows. The same patterns apply to regulated Linux estates where patch evidence, hardening controls, and incident drills must be repeatable. diff --git a/enterprise-infra-simulator/docs/architecture.md b/enterprise-infra-simulator/docs/architecture.md new file mode 100644 index 0000000..3eeb8d9 --- /dev/null +++ b/enterprise-infra-simulator/docs/architecture.md @@ -0,0 +1,30 @@ +# Enterprise Infrastructure Simulator Architecture + +## Components + +- Operator interface: `make` targets and direct Ansible commands. +- Inventory: static host groups in `inventory/hosts.ini`. +- Automation: lifecycle playbooks in `playbooks/`. +- Simulation scripts: controlled failure and scaling events in `scripts/`. +- Evidence: logs, reports, scenario notes, and examples. + +## Data Flow + +``` +Operator + -> Make target or shell script + -> Ansible inventory + -> lifecycle playbook + -> managed Linux node + -> log/report artifact +``` + +Failure drills follow a parallel flow: + +``` +Operator -> simulate_failure.sh -> target node/service -> health check -> patch/hardening playbook -> evidence +``` + +## Notes + +The project favors explicit playbooks over hidden orchestration so the operational intent is visible during review. In a production implementation, the same workflows would typically run from a CI runner or automation controller with credentials supplied by a secret manager. 
diff --git a/enterprise-infra-simulator/examples/failure-simulation.txt b/enterprise-infra-simulator/examples/failure-simulation.txt new file mode 100644 index 0000000..78a4c70 --- /dev/null +++ b/enterprise-infra-simulator/examples/failure-simulation.txt @@ -0,0 +1,8 @@ +2026-04-29 02:13:41 - Starting failure simulation: service 30 web +2026-04-29 02:13:41 - Simulating service failures on containers: web +2026-04-29 02:13:42 - Stopping services in container enterprise-web-1 +2026-04-29 02:13:44 - Health probe failed: http://web01/health returned 503 +2026-04-29 02:14:12 - Cleaning up failure simulation +2026-04-29 02:14:13 - Restarted nginx in enterprise-web-1 +2026-04-29 02:14:18 - Health probe recovered: http://web01/health returned 200 +2026-04-29 02:14:18 - Failure simulation completed successfully diff --git a/enterprise-infra-simulator/examples/patch-output.txt b/enterprise-infra-simulator/examples/patch-output.txt new file mode 100644 index 0000000..481d500 --- /dev/null +++ b/enterprise-infra-simulator/examples/patch-output.txt @@ -0,0 +1,33 @@ +PLAY [Apply Security Patches and Updates] ************************************** + +TASK [Update package cache] ***************************************************** +changed: [web01] +changed: [db01] +ok: [lb01] + +TASK [Check for available updates] ********************************************** +ok: [web01] => {"stdout": "9"} +ok: [db01] => {"stdout": "4"} +ok: [lb01] => {"stdout": "0"} + +TASK [Apply security updates only] ********************************************** +changed: [web01] +changed: [db01] +ok: [lb01] + +TASK [Verify critical services] ************************************************* +ok: [web01] => (item=systemd-journald) +ok: [web01] => (item=cron) +ok: [db01] => (item=systemd-journald) +ok: [lb01] => (item=cron) + +PLAY RECAP ********************************************************************* +web01 : ok=19 changed=6 unreachable=0 failed=0 skipped=2 rescued=0 ignored=1 +db01 : ok=18 changed=5 
unreachable=0 failed=0 skipped=2 rescued=0 ignored=1 +lb01 : ok=15 changed=1 unreachable=0 failed=0 skipped=4 rescued=0 ignored=0 + +Patch report +Status: SUCCESS +Window: 02:00-04:00 UTC +Reboot required: false +Notification: infra-team@example.com diff --git a/enterprise-infra-simulator/playbooks/harden.yml b/enterprise-infra-simulator/playbooks/hardening.yml similarity index 100% rename from enterprise-infra-simulator/playbooks/harden.yml rename to enterprise-infra-simulator/playbooks/hardening.yml diff --git a/enterprise-infra-simulator/scenarios/failure_patch.md b/enterprise-infra-simulator/scenarios/failure_patch.md new file mode 100644 index 0000000..ff2b1cb --- /dev/null +++ b/enterprise-infra-simulator/scenarios/failure_patch.md @@ -0,0 +1,21 @@ +# Scenario: Simulate Failure and Patch + +## Description + +Validate that a service-level failure can be detected, recovered, and followed by a controlled patch workflow. This mirrors a maintenance window where a degraded node is stabilized before package updates are applied. + +## Commands + +```bash +cd enterprise-infra-simulator +./scripts/simulate_failure.sh service 30 web +ansible-playbook -i inventory/hosts.ini playbooks/patch.yml +ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml --check +``` + +## Expected Result + +- The simulation records a temporary service failure. +- The service is restored after cleanup. +- The patch playbook completes without unreachable hosts. +- Hardening check mode reports no destructive changes. 
diff --git a/migration-validation-framework/Makefile b/migration-validation-framework/Makefile new file mode 100644 index 0000000..c4c8ee5 --- /dev/null +++ b/migration-validation-framework/Makefile @@ -0,0 +1,10 @@ +.PHONY: run test demo + +run: + python3 cli.py --help + +test: + python3 -m py_compile cli.py collectors/*.py validators/*.py reports/*.py + +demo: + python3 cli.py compare examples/before.json examples/after.json --output /tmp/migration-diff.json diff --git a/migration-validation-framework/README.md b/migration-validation-framework/README.md index ef6a563..044438a 100644 --- a/migration-validation-framework/README.md +++ b/migration-validation-framework/README.md @@ -1,389 +1,56 @@ # Migration Validation Framework -A comprehensive Python CLI tool for validating system migrations through data collection, snapshot comparison, and automated reporting. Designed for enterprise migration workflows where system consistency and data integrity are critical. +## Problem Statement -## Overview +Infrastructure migrations often fail in small, expensive ways: a mount option changes, a service is disabled, or disk usage moves past an operational threshold. Teams need structured evidence that the migrated host still matches the expected operating profile. -The Migration Validation Framework provides a systematic approach to validating system migrations by: +## Solution Overview -- Collecting comprehensive system data before and after migration -- Generating structured JSON snapshots for comparison -- Performing intelligent diff analysis between snapshots -- Generating detailed HTML reports with change visualization -- Providing CLI interface for integration into migration pipelines +This project provides a Python CLI that collects system state into JSON snapshots and compares before/after files. The output is designed for change records, migration gates, and post-cutover validation. 
-## Architecture +## Architecture Overview ``` -┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ -│ CLI Interface │ │ Data │ │ Validation │ -│ (cli.py) │◄──►│ Collectors │◄──►│ Engine │ -│ │ │ │ │ │ -│ - Command │ │ - mounts.py │ │ - compare.py │ -│ parsing │ │ - services.py │ │ - diff.py │ -│ - Workflow │ │ - disk_usage.py │ │ - validate.py │ -│ orchestration │ │ - network.py │ │ │ -└─────────────────┘ └─────────────────┘ └─────────────────┘ - │ │ │ - ▼ ▼ ▼ -┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ -│ JSON │ │ Comparison │ │ HTML │ -│ Snapshots │ │ Results │ │ Reports │ -│ │ │ │ │ │ -│ - Pre-migration │ │ - Differences │ │ - Summary │ -│ - Post-migration│ │ - Risk levels │ │ - Details │ -│ - Metadata │ │ - Validation │ │ - Charts │ -└─────────────────┘ └─────────────────┘ └─────────────────┘ +Operator -> CLI -> Collectors -> JSON Snapshot -> Comparator -> Diff/Report ``` -## Quick Start +Core components: -### Prerequisites +- `cli.py` provides collect, compare, snapshot, list, and report commands. +- `collectors/` gathers mounts, services, and disk usage. +- `validators/compare.py` identifies drift and validation failures. +- `reports/` contains report generation helpers. +- `examples/` contains realistic before/after evidence. -- Python 3.8+ -- SSH access to target systems -- Appropriate permissions for data collection - -### Installation +## How to Run ```bash cd migration-validation-framework -pip install -r requirements.txt +python3 cli.py collect --output before.json --systems web01,db01 +python3 cli.py collect --output after.json --systems web01,db01 +python3 cli.py compare before.json after.json --output diff.json +python3 cli.py compare examples/before.json examples/after.json --output /tmp/migration-diff.json ``` -### Basic Usage +Legacy snapshot IDs are still supported: ```bash -# Create pre-migration snapshot -python cli.py snapshot --env production --label pre-migration --systems web01,db01 - -# Perform migration... 
- -# Create post-migration snapshot -python cli.py snapshot --env production --label post-migration --systems web01,db01 - -# Compare snapshots -python cli.py compare pre-migration post-migration --output comparison_001 - -# Generate HTML report -python cli.py report --comparison comparison_001 --format html --output migration_report.html +python3 cli.py snapshot --env prod --label pre --systems web01,db01 +python3 cli.py compare prod-pre-20260429_020000 prod-post-20260429_030000 --output change-0429 ``` -## Project Structure +## Example Output -``` -migration-validation-framework/ -├── cli.py # Main CLI interface -├── collectors/ # Data collection modules -│ ├── mounts.py # Filesystem mount collection -│ ├── services.py # System services collection -│ ├── disk_usage.py # Disk usage statistics -│ ├── network.py # Network configuration -│ └── processes.py # Running processes -├── validators/ # Validation and comparison logic -│ ├── compare.py # Snapshot comparison engine -│ ├── diff.py # Difference calculation -│ └── validate.py # Validation rules -├── reports/ # Report generation -│ ├── html_report.py # HTML report generator -│ ├── json_report.py # JSON report generator -│ └── summary.py # Summary calculations -├── config/ # Configuration files -│ ├── collectors.yaml # Collector configurations -│ └── validators.yaml # Validation rules -├── tests/ # Unit and integration tests -├── logs/ # Application logs -└── snapshots/ # Stored snapshots +```text +Comparison completed: diff.json (FAIL) +Overall risk: high +Total changes: 4 +Failed checks: critical_services_running +Recommendation: restore sshd before production cutover ``` -## Data Collectors +Sample inputs and output are available in [examples/before.json](examples/before.json), [examples/after.json](examples/after.json), and [examples/diff.json](examples/diff.json). 
-### Mounts Collector (`collectors/mounts.py`) -Collects filesystem mount information including: -- Mount points and devices -- Filesystem types -- Mount options -- Capacity and usage statistics +## Real-World Use Case -### Services Collector (`collectors/services.py`) -Gathers system service status: -- Running services -- Service states (active, inactive, failed) -- Startup configuration -- Dependencies - -### Disk Usage Collector (`collectors/disk_usage.py`) -Analyzes disk space utilization: -- Directory size statistics -- File system usage -- Inode usage -- Largest files and directories - -### Network Collector (`collectors/network.py`) -Captures network configuration: -- Interface configurations -- Routing tables -- DNS settings -- Firewall rules - -### Processes Collector (`collectors/processes.py`) -Documents running processes: -- Process lists with PIDs -- Memory and CPU usage -- Process owners -- Command lines - -## Validation Engine - -### Comparison Logic (`validators/compare.py`) -Performs intelligent comparison of snapshots: -- Structural differences detection -- Semantic change analysis -- Risk level assessment -- Change categorization - -### Difference Calculator (`validators/diff.py`) -Calculates detailed differences: -- Added/removed/modified items -- Quantitative changes -- Configuration drift detection -- Anomaly identification - -### Validation Rules (`validators/validate.py`) -Applies validation rules: -- Critical change detection -- Compliance checking -- Threshold validation -- Custom rule engine - -## Reporting - -### HTML Reports (`reports/html_report.py`) -Generates comprehensive HTML reports featuring: -- Executive summary dashboard -- Detailed change logs -- Risk assessment visualizations -- Interactive charts and graphs -- Export capabilities - -### JSON Reports (`reports/json_report.py`) -Provides structured JSON output for: -- API integration -- Automated processing -- Audit trails -- Compliance reporting - -## CLI Interface - -### 
Commands - -```bash -# Snapshot management -python cli.py snapshot --env --label