From 77570200146729ff05d02ff10a658c0dbeae8c71 Mon Sep 17 00:00:00 2001 From: Mateusz Suski Date: Wed, 29 Apr 2026 23:14:14 +0000 Subject: [PATCH] feat: Add comprehensive enterprise Linux infrastructure portfolio with Ansible, Python, and ELK stack --- .gitea/workflows/ci.yml | 118 ++++ README.md | 59 ++ docs/architecture.md | 147 +++++ docs/runbooks.md | 329 ++++++++++ enterprise-infra-simulator/Makefile | 166 +++++ enterprise-infra-simulator/README.md | 268 ++++++++ .../inventory/hosts.ini | 35 + .../playbooks/decommission.yml | 181 ++++++ .../playbooks/harden.yml | 210 ++++++ .../playbooks/patch.yml | 139 ++++ .../playbooks/provision.yml | 158 +++++ .../scenarios/scaling_event.yml | 116 ++++ .../scripts/simulate_failure.sh | 343 ++++++++++ .../scripts/simulate_scaling.sh | 208 ++++++ migration-validation-framework/README.md | 389 +++++++++++ .../__pycache__/cli.cpython-312.pyc | Bin 0 -> 14427 bytes migration-validation-framework/cli.py | 270 ++++++++ .../__pycache__/disk_usage.cpython-312.pyc | Bin 0 -> 8139 bytes .../__pycache__/mounts.cpython-312.pyc | Bin 0 -> 7522 bytes .../__pycache__/services.cpython-312.pyc | Bin 0 -> 8103 bytes .../collectors/disk_usage.py | 207 ++++++ .../collectors/mounts.py | 173 +++++ .../collectors/services.py | 223 +++++++ .../__pycache__/html_report.cpython-312.pyc | Bin 0 -> 23093 bytes .../reports/html_report.py | 608 ++++++++++++++++++ .../__pycache__/compare.cpython-312.pyc | Bin 0 -> 21716 bytes .../validators/compare.py | 491 ++++++++++++++ observability-stack/README.md | 461 +++++++++++++ observability-stack/alerting/alert_rules.yml | 326 ++++++++++ observability-stack/docker-compose.yml | 122 ++++ .../logs/incident_simulation.log | 2 + observability-stack/logs/sample.log | 98 +++ .../scenarios/incident_simulation.sh | 318 +++++++++ 33 files changed, 6165 insertions(+) create mode 100644 .gitea/workflows/ci.yml create mode 100644 docs/architecture.md create mode 100644 docs/runbooks.md create mode 100644 
enterprise-infra-simulator/Makefile create mode 100644 enterprise-infra-simulator/README.md create mode 100644 enterprise-infra-simulator/inventory/hosts.ini create mode 100644 enterprise-infra-simulator/playbooks/decommission.yml create mode 100644 enterprise-infra-simulator/playbooks/harden.yml create mode 100644 enterprise-infra-simulator/playbooks/patch.yml create mode 100644 enterprise-infra-simulator/playbooks/provision.yml create mode 100644 enterprise-infra-simulator/scenarios/scaling_event.yml create mode 100644 enterprise-infra-simulator/scripts/simulate_failure.sh create mode 100644 enterprise-infra-simulator/scripts/simulate_scaling.sh create mode 100644 migration-validation-framework/README.md create mode 100644 migration-validation-framework/__pycache__/cli.cpython-312.pyc create mode 100644 migration-validation-framework/cli.py create mode 100644 migration-validation-framework/collectors/__pycache__/disk_usage.cpython-312.pyc create mode 100644 migration-validation-framework/collectors/__pycache__/mounts.cpython-312.pyc create mode 100644 migration-validation-framework/collectors/__pycache__/services.cpython-312.pyc create mode 100644 migration-validation-framework/collectors/disk_usage.py create mode 100644 migration-validation-framework/collectors/mounts.py create mode 100644 migration-validation-framework/collectors/services.py create mode 100644 migration-validation-framework/reports/__pycache__/html_report.cpython-312.pyc create mode 100644 migration-validation-framework/reports/html_report.py create mode 100644 migration-validation-framework/validators/__pycache__/compare.cpython-312.pyc create mode 100644 migration-validation-framework/validators/compare.py create mode 100644 observability-stack/README.md create mode 100644 observability-stack/alerting/alert_rules.yml create mode 100644 observability-stack/docker-compose.yml create mode 100644 observability-stack/logs/incident_simulation.log create mode 100644 
observability-stack/logs/sample.log create mode 100755 observability-stack/scenarios/incident_simulation.sh diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml new file mode 100644 index 0000000..2a410be --- /dev/null +++ b/.gitea/workflows/ci.yml @@ -0,0 +1,118 @@ +name: CI Pipeline + +on: + push: + branches: [ main, develop ] + pull_request: + branches: [ main ] + +jobs: + lint-ansible: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Install Ansible Lint + run: pip install ansible-lint + - name: Lint Ansible Playbooks + run: | + cd enterprise-infra-simulator + ansible-lint playbooks/*.yml + - name: Check Ansible Syntax + run: | + cd enterprise-infra-simulator + ansible-playbook --syntax-check playbooks/*.yml + + test-python: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.8' + - name: Install Dependencies + run: | + cd migration-validation-framework + pip install -r requirements.txt + - name: Run Python Tests + run: | + cd migration-validation-framework + python -m pytest tests/ -v --cov=. --cov-report=xml + - name: Lint Python Code + run: | + pip install flake8 black isort + cd migration-validation-framework + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + black --check . + isort --check-only . + + validate-docker: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Validate Docker Compose + run: | + cd observability-stack + docker-compose config + - name: Check Docker Images + run: | + cd observability-stack + docker-compose pull --quiet + + security-scan: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + with: + scan-type: 'fs' + scan-ref: '.' 
+ format: 'sarif' + output: 'trivy-results.sarif' + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + if: always() + with: + sarif_file: 'trivy-results.sarif' + + documentation: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Check Documentation + run: | + # Check for broken links in README files + find . -name "README.md" -exec markdown-link-check {} \; + # Validate YAML files + find . -name "*.yml" -o -name "*.yaml" | xargs -I {} yamllint {} + + integration-test: + runs-on: ubuntu-latest + needs: [lint-ansible, test-python, validate-docker] + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.8' + - name: Install Dependencies + run: | + pip install ansible docker-compose + - name: Run Integration Tests + run: | + # Start infrastructure simulator + cd enterprise-infra-simulator + make up + sleep 30 + # Run basic validation + ansible -i inventory/hosts.ini all -m ping + # Test migration framework + cd ../migration-validation-framework + python cli.py --help + # Validate observability stack + cd ../observability-stack + docker-compose config + # Cleanup + cd ../enterprise-infra-simulator + make destroy \ No newline at end of file diff --git a/README.md b/README.md index e69de29..3d54a49 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,59 @@ +# Enterprise Infrastructure Portfolio + +This mono-repository showcases enterprise-level Linux infrastructure engineering capabilities through three comprehensive projects demonstrating DevOps and Platform Engineering best practices. + +## Projects Overview + +### 1. Enterprise Infrastructure Simulator +A container-based simulation of enterprise Linux infrastructure with Ansible automation for provisioning, patching, hardening, and decommissioning operations. Includes failure simulation and scaling scenarios. 
+ +**Key Features:** +- Multi-node Linux simulation using Docker containers +- Ansible playbooks for infrastructure lifecycle management +- Automated scaling and failure injection scripts +- Enterprise-grade inventory and scenario management + +### 2. Migration Validation Framework +A Python CLI tool for validating system migrations by collecting, comparing, and reporting on system state changes. Generates comprehensive before/after snapshots and HTML reports. + +**Key Features:** +- Automated system data collection (mounts, services, disk usage) +- JSON snapshot generation and comparison +- HTML report generation with change visualization +- CLI interface for enterprise migration workflows + +### 3. Observability Stack +A complete monitoring and logging stack using ELK (Elasticsearch, Logstash, Kibana) and Grafana for comprehensive infrastructure observability. + +**Key Features:** +- Docker Compose deployment of full observability stack +- Sample log ingestion and processing pipelines +- Alerting rules and incident simulation scenarios +- Real-time dashboards and monitoring capabilities + +## Architecture + +See [docs/architecture.md](docs/architecture.md) for detailed architecture overview. + +## Runbooks + +Operational procedures and troubleshooting guides available in [docs/runbooks.md](docs/runbooks.md). + +## Getting Started + +Each project contains its own README.md with setup and usage instructions. + +## CI/CD + +Automated testing and linting via Gitea workflows in [.gitea/workflows/](.gitea/workflows/). 
+ +## Prerequisites + +- Docker and Docker Compose +- Ansible +- Python 3.8+ +- Make + +## License + +Enterprise Internal Use Only \ No newline at end of file diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..102f201 --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,147 @@ +# Architecture Overview + +## Enterprise Infrastructure Portfolio Architecture + +This document provides a high-level overview of the architecture and design principles implemented across the three main projects in this portfolio. + +## Overall Architecture + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Enterprise Portfolio │ +├─────────────────────────────────────────────────────────────┤ +│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────┐ │ +│ │ Infra Simulator│ │Migration │ │Observability│ │ +│ │ (Ansible/Docker│ │Validation │ │Stack │ │ +│ │ Container Sim) │ │(Python CLI) │ │(ELK/Grafana)│ │ +│ └─────────────────┘ └─────────────────┘ └─────────────┘ │ +├─────────────────────────────────────────────────────────────┤ +│ Infrastructure Simulation │ Validation Framework │ Monitoring │ +└─────────────────────────────────────────────────────────────┘ +``` + +## Project Architectures + +### 1. 
Enterprise Infrastructure Simulator + +**Architecture Pattern:** Container-based Infrastructure Simulation + +``` +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ Ansible │ │ Docker │ │ Simulation │ +│ Controller │◄──►│ Containers │◄──►│ Scripts │ +│ │ │ (Linux Nodes) │ │ │ +└─────────────────┘ └─────────────────┘ └─────────────────┘ + │ │ │ + ▼ ▼ ▼ +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ Inventory │ │ Playbooks │ │ Scenarios │ +│ Management │ │ (Provision/ │ │ (Scaling/ │ +│ │ │ Patch/ │ │ Failures) │ +│ │ │ Harden/ │ │ │ +│ │ │ Decommission)│ │ │ +└─────────────────┘ └─────────────────┘ └─────────────────┘ +``` + +**Key Components:** +- **Ansible Controller:** Central orchestration for infrastructure operations +- **Docker Containers:** Simulated Linux nodes with realistic configurations +- **Simulation Scripts:** Automated scaling and failure injection +- **Inventory System:** Dynamic host management and grouping +- **Playbook Library:** Modular automation for different lifecycle phases + +### 2. Migration Validation Framework + +**Architecture Pattern:** Data Collection and Comparison Pipeline + +``` +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ CLI Interface │ │ Data │ │ Validation │ +│ (cli.py) │◄──►│ Collectors │◄──►│ Engine │ +└─────────────────┘ └─────────────────┘ └─────────────────┘ + │ │ │ + ▼ ▼ ▼ +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ JSON │ │ Comparison │ │ HTML │ +│ Snapshots │ │ Logic │ │ Reports │ +│ (Before/After)│ │ │ │ │ +└─────────────────┘ └─────────────────┘ └─────────────────┘ +``` + +**Key Components:** +- **CLI Interface:** Command-line tool for migration workflow orchestration +- **Data Collectors:** Specialized modules for system data extraction +- **Validation Engine:** Snapshot comparison and difference analysis +- **Report Generator:** HTML output with change visualization +- **JSON Storage:** Structured data persistence for before/after states + +### 3. 
Observability Stack + +**Architecture Pattern:** Distributed Monitoring and Logging + +``` +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ Logstash │ │ Elasticsearch │ │ Kibana │ +│ (Ingestion) │◄──►│ (Storage) │◄──►│ (Visualization)│ +└─────────────────┘ └─────────────────┘ └─────────────────┘ + ▲ ▲ ▲ + │ │ │ +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ Sample Logs │ │ Alert Rules │ │ Grafana │ +│ (Data Sources)│ │ (Conditions) │ │ (Dashboards) │ +└─────────────────┘ └─────────────────┘ └─────────────────┘ +``` + +**Key Components:** +- **Logstash Pipelines:** Data ingestion and transformation +- **Elasticsearch Cluster:** Distributed search and analytics +- **Kibana Dashboards:** Real-time visualization and exploration +- **Grafana Integration:** Advanced metrics and alerting +- **Alerting Engine:** Automated incident detection and notification + +## Design Principles + +### Infrastructure as Code +- All infrastructure defined in code (Ansible, Docker Compose, Python) +- Version-controlled configurations and automation +- Reproducible environments and deployments + +### Modular Architecture +- Separated concerns across projects and components +- Reusable modules and playbooks +- Clear interfaces between systems + +### Enterprise Standards +- Realistic naming conventions and structures +- Production-quality error handling and logging +- Security hardening and compliance considerations + +### Observability First +- Comprehensive logging and monitoring +- Automated alerting and incident response +- Performance metrics and health checks + +## Technology Stack + +- **Containerization:** Docker, Docker Compose +- **Configuration Management:** Ansible +- **Programming Language:** Python 3.8+ +- **Monitoring Stack:** ELK Stack (Elasticsearch, Logstash, Kibana) +- **Visualization:** Grafana +- **CI/CD:** Gitea Actions +- **Documentation:** Markdown + +## Security Considerations + +- Container security scanning integration +- Ansible 
vault for secrets management +- Network segmentation in Docker Compose +- Least privilege access principles +- Audit logging and compliance reporting + +## Scalability and Performance + +- Horizontal scaling through container orchestration +- Efficient data collection and processing +- Optimized Elasticsearch indexing +- Resource-aware automation scripts \ No newline at end of file diff --git a/docs/runbooks.md b/docs/runbooks.md new file mode 100644 index 0000000..29a5e32 --- /dev/null +++ b/docs/runbooks.md @@ -0,0 +1,329 @@ +# Runbooks and Operational Procedures + +This document contains operational runbooks for deploying, managing, and troubleshooting the Enterprise Infrastructure Portfolio projects. + +## Table of Contents + +1. [Infrastructure Simulator Operations](#infrastructure-simulator-operations) +2. [Migration Validation Procedures](#migration-validation-procedures) +3. [Observability Stack Management](#observability-stack-management) +4. [Troubleshooting Guide](#troubleshooting-guide) + +## Infrastructure Simulator Operations + +### Starting the Infrastructure + +```bash +cd enterprise-infra-simulator +make up +``` + +**Expected Outcome:** +- Docker containers for simulated Linux nodes are created +- Ansible inventory is populated +- Basic services are running on all nodes + +**Verification:** +```bash +docker ps | grep infra-sim +ansible -i inventory/hosts.ini all -m ping +``` + +### Patching Operations + +```bash +cd enterprise-infra-simulator +make patch +``` + +**Procedure:** +1. Backup current container states +2. Apply security patches via Ansible +3. Validate service availability +4. 
Generate patch report + +**Rollback:** +```bash +docker-compose down +docker-compose up --scale node=0 +make up +``` + +### Hardening Operations + +```bash +cd enterprise-infra-simulator +ansible-playbook -i inventory/hosts.ini playbooks/harden.yml +``` + +**Hardening Steps:** +- Disable unnecessary services +- Configure firewall rules +- Set secure SSH configurations +- Apply CIS benchmarks + +### Scaling Operations + +```bash +cd enterprise-infra-simulator +./scripts/simulate_scaling.sh up 3 +``` + +**Scaling Parameters:** +- Direction: up/down +- Count: number of nodes to add/remove +- Type: web/app/db + +### Failure Simulation + +```bash +cd enterprise-infra-simulator +./scripts/simulate_failure.sh --type network --duration 300 +``` + +**Failure Types:** +- network: Network partition +- disk: Disk space exhaustion +- service: Service crashes +- node: Complete node failure + +### Decommissioning + +```bash +cd enterprise-infra-simulator +make destroy +``` + +**Decommission Steps:** +1. Graceful service shutdown +2. Data backup and export +3. Configuration cleanup +4. 
Container removal + + ## Migration Validation Procedures + + ### Pre-Migration Snapshot + + ```bash + cd migration-validation-framework + python cli.py snapshot --env production --label pre-migration + ``` + + **Data Collected:** + - Mount points and filesystem usage + - Running services and their states + - Disk usage statistics + - Network configurations + + ### Post-Migration Validation + + ```bash + python cli.py snapshot --env production --label post-migration + python cli.py compare pre-migration post-migration + ``` + + **Validation Checks:** + - Service availability verification + - Filesystem integrity + - Configuration consistency + - Performance metrics comparison + + ### Report Generation + + ```bash + python cli.py report --comparison-id <id> --format html + ``` + + **Report Contents:** + - Executive summary + - Detailed change log + - Risk assessment + - Recommendations + + ## Observability Stack Management + + ### Starting the Stack + + ```bash + cd observability-stack + docker-compose up -d + ``` + + **Service Startup Order:** + 1. Elasticsearch + 2. Logstash + 3. Kibana + 4. 
Grafana + +### Log Ingestion Testing + +```bash +# Send sample logs +curl -X POST "localhost:8080" -H "Content-Type: application/json" -d @logs/sample.log +``` + +### Alert Configuration + +```bash +# Load alert rules +curl -X POST "localhost:3000/api/alerts" -H "Authorization: Bearer " -d @alerting/alert_rules.json +``` + +### Incident Simulation + +```bash +cd observability-stack +./scenarios/incident_simulation.sh --type disk-full --severity critical +``` + +**Incident Types:** +- disk-full: Simulate disk space exhaustion +- service-down: Service failure simulation +- high-cpu: CPU utilization spike +- network-latency: Network performance degradation + +## Troubleshooting Guide + +### Common Issues + +#### Ansible Connection Failures + +**Symptoms:** +- `UNREACHABLE` errors in Ansible output +- SSH connection timeouts + +**Resolution:** +```bash +# Check container status +docker ps | grep infra-sim + +# Verify SSH keys +ansible -i inventory/hosts.ini all -m ping --private-key ~/.ssh/id_rsa + +# Restart containers +make destroy && make up +``` + +#### Elasticsearch Cluster Issues + +**Symptoms:** +- Kibana shows "No living connections" +- Logstash pipeline failures + +**Resolution:** +```bash +# Check cluster health +curl -X GET "localhost:9200/_cluster/health?pretty" + +# Restart services +docker-compose restart elasticsearch logstash kibana +``` + +#### Python Import Errors + +**Symptoms:** +- ModuleNotFoundError in migration framework +- Collector failures + +**Resolution:** +```bash +# Install dependencies +pip install -r requirements.txt + +# Check Python path +python -c "import sys; print(sys.path)" +``` + +#### Docker Resource Constraints + +**Symptoms:** +- Container startup failures +- Out of memory errors + +**Resolution:** +```bash +# Check Docker resources +docker system df + +# Clean up unused resources +docker system prune -a + +# Increase Docker memory limit +# Edit /etc/docker/daemon.json +{ + "memory": "4g", + "cpu-count": 2 +} +``` + +### Log 
Locations + +- **Ansible:** `enterprise-infra-simulator/ansible.log` +- **Docker:** `docker logs ` +- **Elasticsearch:** `observability-stack/logs/elasticsearch.log` +- **Migration Framework:** `migration-validation-framework/logs/validation.log` + +### Performance Monitoring + +```bash +# Infrastructure monitoring +ansible -i inventory/hosts.ini all -m shell -a "top -b -n1 | head -20" + +# Elasticsearch metrics +curl -X GET "localhost:9200/_cluster/stats?pretty" + +# Python performance +python -m cProfile cli.py snapshot +``` + +### Backup and Recovery + +#### Infrastructure Backup +```bash +cd enterprise-infra-simulator +docker-compose exec ansible ansible-playbook /playbooks/backup.yml +``` + +#### Data Backup +```bash +cd observability-stack +docker-compose exec elasticsearch curl -X PUT "localhost:9200/_snapshot/backup" -H "Content-Type: application/json" -d @backup_config.json +``` + +#### Migration Data Backup +```bash +cd migration-validation-framework +python cli.py backup --destination /backup/location +``` + +## Emergency Procedures + +### Complete System Reset + +```bash +# Stop all services +docker-compose down -v +cd enterprise-infra-simulator && make destroy + +# Clean up volumes +docker volume prune -f + +# Restart from clean state +cd enterprise-infra-simulator && make up +cd observability-stack && docker-compose up -d +``` + +### Incident Response + +1. **Assess Impact:** Check monitoring dashboards +2. **Isolate Issue:** Use failure simulation scripts to reproduce +3. **Implement Fix:** Apply appropriate runbook procedure +4. **Validate Recovery:** Run validation framework +5. 
**Document Incident:** Update runbooks with lessons learned + +## Maintenance Schedules + +- **Daily:** Log rotation and cleanup +- **Weekly:** Security patching and updates +- **Monthly:** Performance optimization and capacity planning +- **Quarterly:** Architecture review and modernization \ No newline at end of file diff --git a/enterprise-infra-simulator/Makefile b/enterprise-infra-simulator/Makefile new file mode 100644 index 0000000..95be5f8 --- /dev/null +++ b/enterprise-infra-simulator/Makefile @@ -0,0 +1,166 @@ +# Enterprise Infrastructure Simulator Makefile + +.PHONY: help up down patch destroy status logs clean test + +# Default target +help: ## Show this help message + @echo "Enterprise Infrastructure Simulator" + @echo "" + @echo "Available commands:" + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf " %-15s %s\n", $$1, $$2}' + +# Infrastructure management +up: ## Start the infrastructure simulation + @echo "Starting enterprise infrastructure simulation..." + docker-compose up -d + @echo "Waiting for containers to be ready..." + @sleep 30 + ansible-playbook -i inventory/hosts.ini playbooks/provision.yml + @echo "Infrastructure simulation started successfully" + +down: ## Stop the infrastructure simulation + @echo "Stopping infrastructure simulation..." + ansible-playbook -i inventory/hosts.ini playbooks/decommission.yml || true + docker-compose down + @echo "Infrastructure simulation stopped" + +patch: ## Apply security patches to all nodes + @echo "Applying security patches..." + ansible-playbook -i inventory/hosts.ini playbooks/patch.yml + @echo "Security patches applied" + +destroy: ## Completely destroy the infrastructure + @echo "Destroying infrastructure..." 
+ ansible-playbook -i inventory/hosts.ini playbooks/decommission.yml || true + docker-compose down -v --remove-orphans + docker system prune -f + rm -rf logs/* reports/* + @echo "Infrastructure completely destroyed" + +# Scaling operations +scale-up-web: ## Scale up web servers (usage: make scale-up-web COUNT=2) + @echo "Scaling up $(COUNT) web servers..." + ./scripts/simulate_scaling.sh up $(or $(COUNT),1) web + +scale-up-db: ## Scale up database servers (usage: make scale-up-db COUNT=1) + @echo "Scaling up $(COUNT) database servers..." + ./scripts/simulate_scaling.sh up $(or $(COUNT),1) db + +scale-down-web: ## Scale down web servers (usage: make scale-down-web COUNT=1) + @echo "Scaling down $(COUNT) web servers..." + ./scripts/simulate_scaling.sh down $(or $(COUNT),1) web + +scale-down-db: ## Scale down database servers (usage: make scale-down-db COUNT=1) + @echo "Scaling down $(COUNT) database servers..." + ./scripts/simulate_scaling.sh down $(or $(COUNT),1) db + +# Failure simulation +fail-network: ## Simulate network failure (usage: make fail-network DURATION=60) + @echo "Simulating network failure for $(or $(DURATION),60) seconds..." + ./scripts/simulate_failure.sh network $(or $(DURATION),60) + +fail-disk: ## Simulate disk space exhaustion (usage: make fail-disk DURATION=120) + @echo "Simulating disk failure for $(or $(DURATION),120) seconds..." + ./scripts/simulate_failure.sh disk $(or $(DURATION),120) + +fail-service: ## Simulate service failures (usage: make fail-service DURATION=30) + @echo "Simulating service failure for $(or $(DURATION),30) seconds..." + ./scripts/simulate_failure.sh service $(or $(DURATION),30) + +fail-node: ## Simulate complete node failure (usage: make fail-node DURATION=300) + @echo "Simulating node failure for $(or $(DURATION),300) seconds..." 
+ ./scripts/simulate_failure.sh node $(or $(DURATION),300) + +# Monitoring and status +status: ## Show infrastructure status + @echo "=== Docker Containers ===" + docker-compose ps + @echo "" + @echo "=== Ansible Inventory ===" + ansible -i inventory/hosts.ini --list-hosts all || echo "Inventory check failed" + @echo "" + @echo "=== System Resources ===" + docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemPerc}}\t{{.NetIO}}" + +logs: ## Show infrastructure logs + docker-compose logs -f --tail=100 + +logs-web: ## Show web server logs + docker-compose logs -f web + +logs-db: ## Show database logs + docker-compose logs -f db + +# Testing and validation +test: ## Run infrastructure tests + @echo "Running infrastructure tests..." + ansible -i inventory/hosts.ini all -m ping + ansible-playbook -i inventory/hosts.ini --syntax-check playbooks/*.yml + @echo "Testing scaling scripts..." + ./scripts/simulate_scaling.sh up 0 web # Dry run + ./scripts/simulate_failure.sh network 1 # Quick test + @echo "All tests passed" + +validate: ## Validate infrastructure configuration + @echo "Validating configuration..." + ansible-playbook -i inventory/hosts.ini playbooks/provision.yml --check + docker-compose config + @echo "Configuration validation complete" + +# Scenarios +scenario-scaling: ## Run scaling event scenario + @echo "Running scaling event scenario..." + ansible-playbook -i inventory/hosts.ini scenarios/scaling_event.yml + +scenario-disaster: ## Run disaster recovery scenario + @echo "Running disaster recovery scenario..." + ansible-playbook -i inventory/hosts.ini scenarios/disaster_recovery.yml + +# Maintenance +clean: ## Clean up temporary files and logs + @echo "Cleaning up temporary files..." + rm -rf logs/*.log reports/*.txt + docker system prune -f + @echo "Cleanup complete" + +backup: ## Create infrastructure backup + @echo "Creating infrastructure backup..." 
+ mkdir -p backups/$(shell date +%Y%m%d_%H%M%S) + ansible-playbook -i inventory/hosts.ini playbooks/backup.yml + docker-compose exec ansible tar -czf /backups/infra_backup.tar.gz /infrastructure + @echo "Backup created" + +# Development +lint: ## Lint Ansible playbooks + @echo "Linting Ansible playbooks..." + ansible-lint playbooks/*.yml scenarios/*.yml + @echo "Linting complete" + +format: ## Format code and configuration + @echo "Formatting code..." + # Add formatting commands here + @echo "Formatting complete" + +# Security +harden: ## Apply security hardening + @echo "Applying security hardening..." + ansible-playbook -i inventory/hosts.ini playbooks/harden.yml + +security-scan: ## Run security scans + @echo "Running security scans..." + ansible-playbook -i inventory/hosts.ini playbooks/security_scan.yml + +# Help for specific targets +help-scaling: ## Show scaling-related commands + @echo "Scaling Commands:" + @echo " make scale-up-web COUNT=2 - Add 2 web servers" + @echo " make scale-up-db COUNT=1 - Add 1 database server" + @echo " make scale-down-web COUNT=1 - Remove 1 web server" + @echo " make scale-down-db COUNT=1 - Remove 1 database server" + +help-failure: ## Show failure simulation commands + @echo "Failure Simulation Commands:" + @echo " make fail-network DURATION=60 - Network failure for 60s" + @echo " make fail-disk DURATION=120 - Disk exhaustion for 120s" + @echo " make fail-service DURATION=30 - Service failure for 30s" + @echo " make fail-node DURATION=300 - Node failure for 300s" \ No newline at end of file diff --git a/enterprise-infra-simulator/README.md b/enterprise-infra-simulator/README.md new file mode 100644 index 0000000..a709f84 --- /dev/null +++ b/enterprise-infra-simulator/README.md @@ -0,0 +1,268 @@ +# Enterprise Infrastructure Simulator + +A container-based simulation environment for enterprise Linux infrastructure operations. 
This project provides Ansible automation for provisioning, patching, hardening, and decommissioning of simulated Linux nodes, along with scripts for scaling and failure simulation. + +## Overview + +The Enterprise Infrastructure Simulator creates a realistic environment for testing and demonstrating infrastructure automation at scale. It uses Docker containers to simulate multiple Linux nodes and provides comprehensive Ansible playbooks for enterprise operations. + +## Architecture + +- **Container Simulation:** Docker-based Linux nodes with realistic configurations +- **Ansible Automation:** Modular playbooks for infrastructure lifecycle management +- **Dynamic Inventory:** Automated host discovery and grouping +- **Simulation Scripts:** Automated scaling and failure injection +- **Scenario Management:** Pre-defined operational scenarios + +## Quick Start + +### Prerequisites + +- Docker and Docker Compose +- Ansible 2.9+ +- Make + +### Setup + +```bash +# Clone and navigate to project +cd enterprise-infra-simulator + +# Start the infrastructure +make up + +# Verify deployment +ansible -i inventory/hosts.ini all -m ping +``` + +## Available Operations + +### Infrastructure Management + +```bash +# Provision new nodes +ansible-playbook -i inventory/hosts.ini playbooks/provision.yml + +# Apply security patches +make patch + +# Harden systems +ansible-playbook -i inventory/hosts.ini playbooks/harden.yml + +# Decommission nodes +ansible-playbook -i inventory/hosts.ini playbooks/decommission.yml + +# Destroy infrastructure +make destroy +``` + +### Simulation Operations + +```bash +# Scale up infrastructure +./scripts/simulate_scaling.sh up 5 + +# Simulate network failure +./scripts/simulate_failure.sh --type network --duration 300 + +# Run operational scenario +ansible-playbook -i inventory/hosts.ini scenarios/scaling_event.yml +``` + +## Project Structure + +``` +enterprise-infra-simulator/ +├── inventory/ # Ansible inventory files +│ └── hosts.ini # Dynamic host 
inventory +├── playbooks/ # Ansible automation playbooks +│ ├── provision.yml # Node provisioning +│ ├── patch.yml # Security patching +│ ├── harden.yml # Security hardening +│ └── decommission.yml # Node decommissioning +├── scripts/ # Simulation and utility scripts +│ ├── simulate_scaling.sh # Infrastructure scaling +│ └── simulate_failure.sh # Failure injection +├── scenarios/ # Operational scenarios +│ └── scaling_event.yml # Scaling scenario +├── docker-compose.yml # Container orchestration +├── Makefile # Build automation +└── README.md +``` + +## Inventory Management + +The simulator uses dynamic inventory with the following groups: + +- `webservers`: Web application servers +- `databases`: Database servers +- `loadbalancers`: Load balancing infrastructure +- `monitoring`: Monitoring and logging servers + +## Playbooks + +### Provision Playbook +- Creates Docker containers with base Linux configurations +- Installs required packages and services +- Configures basic networking and security +- Registers nodes in inventory + +### Patch Playbook +- Updates system packages +- Applies security patches +- Restarts services as needed +- Generates patch reports + +### Harden Playbook +- Implements CIS security benchmarks +- Configures firewall rules +- Hardens SSH configuration +- Disables unnecessary services + +### Decommission Playbook +- Gracefully stops services +- Exports configuration and data +- Removes containers +- Cleans up inventory + +## Simulation Scripts + +### Scaling Simulation +```bash +./scripts/simulate_scaling.sh [up|down] [count] [type] +``` + +Parameters: +- `direction`: up/down +- `count`: Number of nodes to add/remove +- `type`: Node type (web/db/lb/monitor) + +### Failure Simulation +```bash +./scripts/simulate_failure.sh --type [failure_type] --duration [seconds] +``` + +Failure Types: +- `network`: Network connectivity issues +- `disk`: Disk space exhaustion +- `service`: Service failures +- `node`: Complete node outages + +## Scenarios + 
+Pre-defined operational scenarios for testing: + +- **Scaling Event:** Automated scaling during traffic spikes +- **Disaster Recovery:** Node failure and recovery procedures +- **Maintenance Window:** Scheduled patching and updates +- **Security Incident:** Breach simulation and response + +## Configuration + +### Environment Variables + +```bash +# Number of initial nodes +INFRA_NODE_COUNT=3 + +# Node types to deploy +INFRA_NODE_TYPES=web,db,lb + +# Simulation parameters +SIMULATION_DURATION=3600 +SIMULATION_INTENSITY=medium +``` + +### Docker Configuration + +Container resources and networking are configured in `docker-compose.yml`: + +```yaml +services: + infra-node: + image: ubuntu:20.04 + deploy: + replicas: 3 + resources: + limits: + memory: 512M + cpus: '0.5' +``` + +## Monitoring and Logging + +- Ansible execution logs: `ansible.log` +- Container logs: `docker logs ` +- Simulation logs: `logs/simulation.log` + +## Troubleshooting + +### Common Issues + +**Ansible Connection Failures:** +```bash +# Check container status +docker ps | grep infra-sim + +# Verify SSH connectivity +ansible -i inventory/hosts.ini all -m ping +``` + +**Container Resource Issues:** +```bash +# Check Docker resources +docker system df + +# Clean up containers +docker system prune +``` + +**Simulation Script Errors:** +```bash +# Check script permissions +chmod +x scripts/*.sh + +# Verify dependencies +./scripts/simulate_failure.sh --help +``` + +## Development + +### Adding New Playbooks + +1. Create playbook in `playbooks/` directory +2. Follow Ansible best practices +3. Test with `--check` mode +4. Update documentation + +### Custom Scenarios + +1. Define scenario in `scenarios/` directory +2. Include required variables +3. Test with dry-run +4. 
Document operational procedures + +## Security Considerations + +- Containers run with limited privileges +- SSH keys are generated per deployment +- Firewall rules are applied automatically +- Security scanning integrated in CI/CD + +## Performance Optimization + +- Container resource limits prevent resource exhaustion +- Ansible parallel execution for faster operations +- Efficient failure simulation without full outages +- Optimized Docker layer caching + +## Contributing + +1. Follow existing code structure and naming conventions +2. Add comprehensive documentation +3. Include tests for new functionality +4. Update runbooks for operational changes + +## License + +Enterprise Internal Use Only \ No newline at end of file diff --git a/enterprise-infra-simulator/inventory/hosts.ini b/enterprise-infra-simulator/inventory/hosts.ini new file mode 100644 index 0000000..c41d2fa --- /dev/null +++ b/enterprise-infra-simulator/inventory/hosts.ini @@ -0,0 +1,35 @@ +[webservers] +web01 ansible_host=172.20.0.11 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa +web02 ansible_host=172.20.0.12 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa +web03 ansible_host=172.20.0.13 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa + +[databases] +db01 ansible_host=172.20.0.21 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa +db02 ansible_host=172.20.0.22 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa + +[loadbalancers] +lb01 ansible_host=172.20.0.31 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa + +[monitoring] +mon01 ansible_host=172.20.0.41 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa + +[all:vars] +ansible_python_interpreter=/usr/bin/python3 +ansible_ssh_common_args='-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null' +ansible_connection=ssh + +[webservers:vars] +node_type=web +environment=production + +[databases:vars] +node_type=database 
+environment=production + +[loadbalancers:vars] +node_type=loadbalancer +environment=production + +[monitoring:vars] +node_type=monitoring +environment=production \ No newline at end of file diff --git a/enterprise-infra-simulator/playbooks/decommission.yml b/enterprise-infra-simulator/playbooks/decommission.yml new file mode 100644 index 0000000..99c68d0 --- /dev/null +++ b/enterprise-infra-simulator/playbooks/decommission.yml @@ -0,0 +1,181 @@ +--- +- name: Decommission Enterprise Infrastructure Nodes + hosts: all + become: true + gather_facts: true + vars: + backup_data: true + export_config: true + graceful_shutdown: true + cleanup_inventory: true + + pre_tasks: + - name: Check node health before decommissioning + uri: + url: http://localhost/health + method: GET + status_code: 200 + register: health_check + ignore_errors: true + when: "'webservers' in group_names" + + - name: Create decommissioning backup directory + file: + path: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}" + state: directory + mode: '0755' + + - name: Log decommissioning start + lineinfile: + path: "/var/log/decommission.log" + line: "{{ ansible_date_time.iso8601 }} - Starting decommissioning of {{ inventory_hostname }}" + create: yes + + tasks: + - name: Stop application services gracefully + service: + name: "{{ item }}" + state: stopped + loop: "{{ application_services | default(['nginx', 'postgresql', 'haproxy']) }}" + ignore_errors: true + when: graceful_shutdown + + - name: Wait for connections to drain + pause: + seconds: 30 + when: graceful_shutdown and "'webservers' in group_names or 'loadbalancers' in group_names" + + - name: Export configuration files + block: + - name: Create config export directory + file: + path: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/config" + state: directory + + - name: Archive system configuration + archive: + path: + - /etc/ + - /opt/application/ + dest: "/var/backups/decommission-{{ ansible_date_time.iso8601 
}}/config/system_config.tar.gz" + format: gz + + - name: Export service configurations + command: > + tar -czf /var/backups/decommission-{{ ansible_date_time.iso8601 }}/config/services.tar.gz + /etc/nginx /etc/postgresql /etc/haproxy + ignore_errors: true + when: export_config + + - name: Backup application data + block: + - name: Create data backup directory + file: + path: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/data" + state: directory + + - name: Backup database data + shell: > + pg_dumpall -U postgres > /var/backups/decommission-{{ ansible_date_time.iso8601 }}/data/database_backup.sql + ignore_errors: true + when: "'databases' in group_names" + + - name: Backup application files + archive: + path: "/var/www/html" + dest: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/data/application_data.tar.gz" + format: gz + ignore_errors: true + when: "'webservers' in group_names" + + - name: Backup monitoring data + archive: + path: "/var/lib/prometheus" + dest: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/data/monitoring_data.tar.gz" + format: gz + ignore_errors: true + when: "'monitoring' in group_names" + when: backup_data + + - name: Remove from load balancer + include_tasks: tasks/remove_from_lb.yml + when: "'webservers' in group_names or 'databases' in group_names" + + - name: Update monitoring alerts + include_tasks: tasks/update_monitoring.yml + when: "'monitoring' not in group_names" + + - name: Clean up application directories + file: + path: "{{ item }}" + state: absent + loop: + - /opt/application + - /var/www/html + - /var/lib/postgresql + - /var/lib/prometheus + ignore_errors: true + + - name: Remove application packages + apt: + name: "{{ item }}" + state: absent + purge: yes + loop: "{{ application_packages | default(['nginx', 'postgresql', 'haproxy', 'prometheus']) }}" + when: ansible_os_family == "Debian" + ignore_errors: true + + - name: Clean up system logs + command: > + find /var/log -name "*.log" -type 
f -exec truncate -s 0 {} \; + ignore_errors: true + + - name: Remove SSH keys and known hosts + file: + path: "{{ item }}" + state: absent + loop: + - /root/.ssh/authorized_keys + - /root/.ssh/known_hosts + - /home/infra-admin/.ssh/authorized_keys + ignore_errors: true + + - name: Generate decommissioning report + template: + src: templates/decommission_report.j2 + dest: "/var/log/decommission_report_{{ ansible_date_time.iso8601 }}.log" + vars: + decommission_status: "SUCCESS" + backup_location: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}" + + post_tasks: + - name: Send decommissioning notification + mail: + to: "{{ decommission_notification_email | default('infra-team@company.com') }}" + subject: "Node Decommissioned - {{ inventory_hostname }}" + body: | + Node {{ inventory_hostname }} has been successfully decommissioned. + + Backup location: /var/backups/decommission-{{ ansible_date_time.iso8601 }} + Services stopped: {{ application_services | default(['nginx', 'postgresql', 'haproxy']) | join(', ') }} + Configuration exported: {{ export_config }} + Data backed up: {{ backup_data }} + + See /var/log/decommission_report_{{ ansible_date_time.iso8601 }}.log for details + when: decommission_notification_email is defined + ignore_errors: true + + - name: Update dynamic inventory + include_tasks: tasks/update_inventory.yml + when: cleanup_inventory + + - name: Final log entry + lineinfile: + path: "/var/log/decommission.log" + line: "{{ ansible_date_time.iso8601 }} - Decommissioning completed for {{ inventory_hostname }}" + + - name: Shutdown node + command: shutdown -h now + async: 10 + poll: 0 + when: auto_shutdown | default(false) | bool \ No newline at end of file diff --git a/enterprise-infra-simulator/playbooks/harden.yml b/enterprise-infra-simulator/playbooks/harden.yml new file mode 100644 index 0000000..a623231 --- /dev/null +++ b/enterprise-infra-simulator/playbooks/harden.yml @@ -0,0 +1,210 @@ +--- +- name: Harden Enterprise Infrastructure 
Nodes + hosts: all + become: true + gather_facts: true + vars: + cis_level: 1 + disable_root_login: true + secure_ssh_config: true + firewall_policy: deny + auditd_enabled: true + selinux_mode: enforcing + apparmor_enabled: true + + tasks: + - name: Include CIS hardening tasks + include_tasks: tasks/cis_hardening.yml + + - name: Configure SSH hardening + block: + - name: Disable root SSH login + lineinfile: + path: /etc/ssh/sshd_config + regexp: '^PermitRootLogin' + line: 'PermitRootLogin no' + when: disable_root_login + + - name: Disable password authentication + lineinfile: + path: /etc/ssh/sshd_config + regexp: '^PasswordAuthentication' + line: 'PasswordAuthentication no' + + - name: Set MaxAuthTries + lineinfile: + path: /etc/ssh/sshd_config + regexp: '^MaxAuthTries' + line: 'MaxAuthTries 3' + + - name: Disable empty passwords + lineinfile: + path: /etc/ssh/sshd_config + regexp: '^PermitEmptyPasswords' + line: 'PermitEmptyPasswords no' + + - name: Set ClientAliveInterval + lineinfile: + path: /etc/ssh/sshd_config + regexp: '^ClientAliveInterval' + line: 'ClientAliveInterval 300' + + - name: Set ClientAliveCountMax + lineinfile: + path: /etc/ssh/sshd_config + regexp: '^ClientAliveCountMax' + line: 'ClientAliveCountMax 2' + + notify: restart sshd + + - name: Configure firewall + ufw: + state: enabled + policy: "{{ firewall_policy }}" + rules: + - rule: allow + port: '22' + proto: tcp + from: 10.0.0.0/8 + - rule: allow + port: '22' + proto: tcp + from: 172.16.0.0/12 + - rule: allow + port: '22' + proto: tcp + from: 192.168.0.0/16 + + - name: Disable unnecessary services + service: + name: "{{ item }}" + state: stopped + enabled: no + loop: + - cups + - avahi-daemon + - bluetooth + - nfs-server + - rpcbind + ignore_errors: true + + - name: Remove unnecessary packages + apt: + name: "{{ item }}" + state: absent + purge: yes + loop: + - telnet + - rsh-client + - talk + - ntalk + when: ansible_os_family == "Debian" + ignore_errors: true + + - name: Configure auditd + 
block: + - name: Install auditd + apt: + name: auditd + state: present + when: ansible_os_family == "Debian" + + - name: Configure audit rules + template: + src: templates/audit.rules.j2 + dest: /etc/audit/rules.d/hardening.rules + + - name: Enable auditd service + service: + name: auditd + state: started + enabled: yes + when: auditd_enabled + + - name: Configure AppArmor + block: + - name: Install apparmor + apt: + name: apparmor + state: present + when: ansible_os_family == "Debian" + + - name: Enable apparmor service + service: + name: apparmor + state: started + enabled: yes + when: apparmor_enabled and ansible_os_family == "Debian" + + - name: Configure sysctl hardening + sysctl: + name: "{{ item.key }}" + value: "{{ item.value }}" + state: present + reload: yes + loop: + - { key: 'net.ipv4.ip_forward', value: '0' } + - { key: 'net.ipv4.conf.all.send_redirects', value: '0' } + - { key: 'net.ipv4.conf.default.send_redirects', value: '0' } + - { key: 'net.ipv4.tcp_syncookies', value: '1' } + - { key: 'net.ipv4.icmp_echo_ignore_broadcasts', value: '1' } + + - name: Set secure file permissions + file: + path: "{{ item.path }}" + mode: "{{ item.mode }}" + owner: root + group: root + loop: + - { path: /etc/passwd, mode: '0644' } + - { path: /etc/group, mode: '0644' } + - { path: /etc/shadow, mode: '0640' } + - { path: /etc/gshadow, mode: '0640' } + + - name: Lock inactive user accounts + command: usermod -L "{{ item }}" + loop: "{{ inactive_users | default([]) }}" + ignore_errors: true + + - name: Configure password policies + pam_limits: + domain: '*' + limit_type: hard + limit_item: nofile + value: 1024 + + - name: Generate hardening report + template: + src: templates/hardening_report.j2 + dest: "/var/log/hardening_report_{{ ansible_date_time.iso8601 }}.log" + + handlers: + - name: restart sshd + service: + name: ssh + state: restarted + + - name: restart auditd + service: + name: auditd + state: restarted + when: auditd_enabled + + post_tasks: + - name: Run CIS compliance check + command: > + bash -c " + score=0 + total=0 + echo 'CIS Compliance Check Results:' > 
/tmp/cis_check.log + # Add CIS checks here + echo 'Overall Score: $score/$total' >> /tmp/cis_check.log + cat /tmp/cis_check.log + " + register: cis_check + changed_when: false + + - name: Archive CIS results + copy: + content: "{{ cis_check.stdout }}" + dest: "/var/log/cis_compliance_{{ ansible_date_time.iso8601 }}.log" \ No newline at end of file diff --git a/enterprise-infra-simulator/playbooks/patch.yml b/enterprise-infra-simulator/playbooks/patch.yml new file mode 100644 index 0000000..49547c9 --- /dev/null +++ b/enterprise-infra-simulator/playbooks/patch.yml @@ -0,0 +1,139 @@ +--- +- name: Apply Security Patches and Updates + hosts: all + become: true + gather_facts: true + vars: + patch_window_start: "02:00" + patch_window_end: "04:00" + reboot_required: false + security_only: true + + pre_tasks: + - name: Check patch window + assert: + that: ansible_date_time.hour|int >= patch_window_start.split(':')[0]|int and ansible_date_time.hour|int < patch_window_end.split(':')[0]|int + fail_msg: "Current time {{ ansible_date_time.hour }}:{{ ansible_date_time.minute }} is outside patch window {{ patch_window_start }}-{{ patch_window_end }}" + when: enforce_patch_window | default(true) | bool + + - name: Create patch backup + file: + path: "/var/backups/pre-patch-{{ ansible_date_time.iso8601 }}" + state: directory + + - name: Backup package list + command: dpkg --get-selections + register: package_backup + changed_when: false + + - name: Save package backup + copy: + content: "{{ package_backup.stdout }}" + dest: "/var/backups/pre-patch-{{ ansible_date_time.iso8601 }}/packages.list" + + tasks: + - name: Update package cache + apt: + update_cache: yes + cache_valid_time: 300 + when: ansible_os_family == "Debian" + + - name: Check for available updates + command: apt list --upgradable 2>/dev/null | grep -v "Listing..." 
| wc -l + register: updates_available + changed_when: false + when: ansible_os_family == "Debian" + + - name: Apply security updates only + apt: + upgrade: dist + update_cache: yes + when: security_only and ansible_os_family == "Debian" + + - name: Apply all updates + apt: + upgrade: dist + update_cache: yes + when: not security_only and ansible_os_family == "Debian" + + - name: Check if reboot required + stat: + path: /var/run/reboot-required + register: reboot_required_file + when: ansible_os_family == "Debian" + + - name: Set reboot flag + set_fact: + reboot_required: "{{ reboot_required_file.stat.exists | default(false) }}" + + - name: Restart services after patching + service: + name: "{{ item }}" + state: restarted + loop: + - sshd + - fail2ban + - unattended-upgrades + ignore_errors: true + + - name: Update monitoring agent + include_role: + name: monitoring_agent_update + when: "'monitoring' in group_names" + + - name: Verify critical services + service: + name: "{{ item }}" + state: started + loop: + - systemd-journald + - systemd-logind + - cron + ignore_errors: true + + - name: Run post-patch health checks + uri: + url: http://localhost/health + method: GET + status_code: 200 + register: health_result + ignore_errors: true + when: "'webservers' in group_names" + + post_tasks: + - name: Generate patch report + template: + src: templates/patch_report.j2 + dest: "/var/log/patch_report_{{ ansible_date_time.iso8601 }}.log" + vars: + patch_status: "{{ 'SUCCESS' if health_result.status == 200 else 'WARNING' }}" + updates_applied: "{{ updates_available.stdout | default('0') }}" + reboot_needed: "{{ reboot_required }}" + + - name: Send patch notification + mail: + to: "{{ patch_notification_email | default('infra-team@company.com') }}" + subject: "Patch Report - {{ inventory_hostname }}" + body: | + Patch completed for {{ inventory_hostname }} + + Updates applied: {{ updates_applied }} + Reboot required: {{ reboot_required }} + Health check: {{ 'PASSED' if 
health_result.status == 200 else 'FAILED' }} + + See /var/log/patch_report_{{ ansible_date_time.iso8601 }}.log for details + when: patch_notification_email is defined + ignore_errors: true + + - name: Schedule reboot if required + command: shutdown -r +5 "Rebooting for security patches" + when: reboot_required and auto_reboot | default(false) | bool + async: 600 + poll: 0 + + handlers: + - name: restart monitoring + service: + name: "{{ monitoring_service | default('prometheus-node-exporter') }}" + state: restarted + when: "'monitoring' in group_names" \ No newline at end of file diff --git a/enterprise-infra-simulator/playbooks/provision.yml b/enterprise-infra-simulator/playbooks/provision.yml new file mode 100644 index 0000000..169a023 --- /dev/null +++ b/enterprise-infra-simulator/playbooks/provision.yml @@ -0,0 +1,158 @@ +--- +- name: Provision Enterprise Infrastructure Nodes + hosts: all + become: true + gather_facts: true + vars: + node_timezone: "UTC" + admin_user: "infra-admin" + ssh_port: 22 + packages: + - curl + - wget + - vim + - htop + - net-tools + - iptables + - fail2ban + - unattended-upgrades + + tasks: + - name: Update package cache + apt: + update_cache: yes + cache_valid_time: 3600 + when: ansible_os_family == "Debian" + + - name: Install base packages + apt: + name: "{{ packages }}" + state: present + when: ansible_os_family == "Debian" + + - name: Create admin user + user: + name: "{{ admin_user }}" + groups: sudo + append: yes + create_home: yes + shell: /bin/bash + password: "{{ 'infra-admin-password' | password_hash('sha512') }}" + + - name: Configure timezone + timezone: + name: "{{ node_timezone }}" + + - name: Configure SSH + block: + - name: Disable root SSH login + lineinfile: + path: /etc/ssh/sshd_config + regexp: '^PermitRootLogin' + line: 'PermitRootLogin no' + + - name: Set SSH port + lineinfile: + path: /etc/ssh/sshd_config + regexp: '^Port' + line: "Port {{ ssh_port }}" + + - name: Disable password authentication + lineinfile: + 
path: /etc/ssh/sshd_config + regexp: '^PasswordAuthentication' + line: 'PasswordAuthentication no' + + - name: Restart SSH service + service: + name: sshd + state: restarted + + - name: Configure firewall + ufw: + state: enabled + policy: deny + rules: + - rule: allow + port: "{{ ssh_port }}" + proto: tcp + - rule: allow + port: '80' + proto: tcp + - rule: allow + port: '443' + proto: tcp + + - name: Configure fail2ban + template: + src: templates/jail.local.j2 + dest: /etc/fail2ban/jail.local + notify: restart fail2ban + + - name: Enable unattended upgrades + lineinfile: + path: /etc/apt/apt.conf.d/20auto-upgrades + regexp: '^APT::Periodic::Unattended-Upgrade' + line: 'APT::Periodic::Unattended-Upgrade "1";' + when: ansible_os_family == "Debian" + + - name: Create application directories + file: + path: "{{ item }}" + state: directory + owner: "{{ admin_user }}" + group: "{{ admin_user }}" + mode: '0755' + loop: + - /opt/application + - /var/log/application + - /etc/application + + - name: Deploy monitoring agent + include_role: + name: monitoring_agent + when: "'monitoring' in group_names" + + - name: Deploy web server + include_role: + name: nginx + when: "'webservers' in group_names" + + - name: Deploy database server + include_role: + name: postgresql + when: "'databases' in group_names" + + - name: Deploy load balancer + include_role: + name: haproxy + when: "'loadbalancers' in group_names" + + - name: Generate provisioning report + template: + src: templates/provisioning_report.j2 + dest: /var/log/provisioning_report_{{ ansible_date_time.iso8601 }}.log + delegate_to: localhost + + handlers: + - name: restart fail2ban + service: + name: fail2ban + state: restarted + + post_tasks: + - name: Verify services + service: + name: "{{ item }}" + state: started + enabled: yes + loop: "{{ services_to_verify | default([]) }}" + ignore_errors: true + + - name: Run health checks + uri: + url: http://localhost/health + method: GET + register: health_check + ignore_errors: 
true + when: "'webservers' in group_names" \ No newline at end of file diff --git a/enterprise-infra-simulator/scenarios/scaling_event.yml b/enterprise-infra-simulator/scenarios/scaling_event.yml new file mode 100644 index 0000000..68a23ac --- /dev/null +++ b/enterprise-infra-simulator/scenarios/scaling_event.yml @@ -0,0 +1,116 @@ +--- +- name: Enterprise Scaling Event Scenario + hosts: all + become: yes + gather_facts: yes + vars: + scaling_threshold: 80 + cooldown_period: 300 + max_scale_up: 5 + min_instances: 2 + + pre_tasks: + - name: Log scenario start + lineinfile: + path: "/var/log/scaling_scenario.log" + line: "{{ ansible_date_time.iso8601 }} - Starting scaling event scenario" + create: yes + + - name: Check current load + command: uptime + register: system_load + changed_when: false + + - name: Parse load average + set_fact: + load_1min: "{{ system_load.stdout.split(',')[0].split()[-1] | float }}" + load_5min: "{{ system_load.stdout.split(',')[1] | float }}" + load_15min: "{{ system_load.stdout.split(',')[2] | float }}" + + tasks: + - name: Evaluate scaling conditions + set_fact: + scale_up_needed: "{{ load_5min > scaling_threshold }}" + scale_down_needed: "{{ load_5min < (scaling_threshold * 0.3) }}" + + - name: Scale up web servers + include_role: + name: scale_up + tasks_from: web_servers + vars: + scale_count: "{{ [max_scale_up, (load_5min / 10) | int] | min }}" + when: scale_up_needed and "'webservers' in group_names" + + - name: Scale up database servers + include_role: + name: scale_up + tasks_from: database_servers + vars: + scale_count: "{{ [2, (load_5min / 20) | int] | min }}" + when: scale_up_needed and "'databases' in group_names" + + - name: Update load balancer configuration + include_role: + name: load_balancer + tasks_from: update_backends + when: scale_up_needed + + - name: Scale down web servers + include_role: + name: scale_down + tasks_from: web_servers + vars: + scale_count: "{{ [(inventory_hostname | regex_findall('[0-9]+') | first | 
int) - min_instances, 1] | max }}" + when: scale_down_needed and "'webservers' in group_names" and (inventory_hostname | regex_findall('[0-9]+') | first | int) > min_instances + + - name: Wait for cooldown period + pause: + seconds: "{{ cooldown_period }}" + when: scale_up_needed or scale_down_needed + + - name: Verify scaling results + uri: + url: http://localhost/health + method: GET + status_code: 200 + register: health_check + until: health_check.status == 200 + retries: 5 + delay: 10 + when: "'webservers' in group_names" + + - name: Update monitoring thresholds + include_role: + name: monitoring + tasks_from: update_alerts + vars: + new_threshold: "{{ scaling_threshold + 10 }}" + + - name: Send scaling notification + mail: + to: "{{ scaling_notification_email | default('infra-team@company.com') }}" + subject: "Infrastructure Scaling Event - {{ inventory_hostname }}" + body: | + Scaling event completed on {{ inventory_hostname }} + + Load averages: {{ load_1min }}, {{ load_5min }}, {{ load_15min }} + Action taken: {{ 'Scale Up' if scale_up_needed else 'Scale Down' if scale_down_needed else 'No Action' }} + Health check: {{ 'PASSED' if health_check.status == 200 else 'FAILED' }} + + See /var/log/scaling_scenario.log for details + when: scaling_notification_email is defined + ignore_errors: yes + + post_tasks: + - name: Generate scaling scenario report + template: + src: templates/scaling_scenario_report.j2 + dest: "/var/log/scaling_scenario_report_{{ ansible_date_time.iso8601 }}.log" + vars: + scenario_outcome: "{{ 'SUCCESS' if health_check.status == 200 else 'WARNING' }}" + load_metrics: "{{ load_1min }}, {{ load_5min }}, {{ load_15min }}" + + - name: Log scenario completion + lineinfile: + path: "/var/log/scaling_scenario.log" + line: "{{ ansible_date_time.iso8601 }} - Scaling event scenario completed" \ No newline at end of file diff --git a/enterprise-infra-simulator/scripts/simulate_failure.sh b/enterprise-infra-simulator/scripts/simulate_failure.sh new 
file mode 100644 index 0000000..4d22855 --- /dev/null +++ b/enterprise-infra-simulator/scripts/simulate_failure.sh @@ -0,0 +1,343 @@ +#!/bin/bash + +# Enterprise Infrastructure Failure Simulation Script +# Simulates various types of infrastructure failures for testing + +set -euo pipefail + +# Configuration +DOCKER_COMPOSE_FILE="docker-compose.yml" +INVENTORY_FILE="inventory/hosts.ini" +LOG_FILE="logs/failure_simulation.log" + +# Default values +FAILURE_TYPE="${1:-network}" +DURATION="${2:-60}" +TARGET_NODES="${3:-all}" +INTENSITY="${INTENSITY:-medium}" + +# Logging function +log() { + echo "$(date '+%Y-%m-%d %H:%M:%S') - $*" | tee -a "$LOG_FILE" +} + +# Error handling +error_exit() { + log "ERROR: $1" + # Cleanup any active failures + cleanup_failure + exit 1 +} + +# Validate inputs +validate_inputs() { + case "$FAILURE_TYPE" in + network|disk|service|node|cpu|memory) ;; + *) error_exit "Invalid failure type: $FAILURE_TYPE. Must be network, disk, service, node, cpu, or memory" ;; + esac + + if ! [[ "$DURATION" =~ ^[0-9]+$ ]] || [ "$DURATION" -lt 1 ]; then + error_exit "Invalid duration: $DURATION. Must be a positive integer (seconds)" + fi + + case "$INTENSITY" in + low|medium|high|critical) ;; + *) error_exit "Invalid intensity: $INTENSITY. 
Must be low, medium, high, or critical" ;; + esac +} + +# Get target containers +get_target_containers() { + case "$TARGET_NODES" in + all) + docker-compose ps --services | grep -v "^NAME$" || true + ;; + web) + echo "web" + ;; + db) + echo "db" + ;; + lb) + echo "lb" + ;; + monitor) + echo "monitor" + ;; + *) + echo "$TARGET_NODES" + ;; + esac +} + +# Network failure simulation +simulate_network_failure() { + local containers=$(get_target_containers) + log "Simulating network failure on containers: $containers" + + for container in $containers; do + local container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true) + + for cid in $container_ids; do + if [ -n "$cid" ]; then + log "Disconnecting network for container $cid" + + # Disconnect from network + docker network disconnect "$(docker inspect "$cid" --format '{{.HostConfig.NetworkMode}}')" "$cid" 2>/dev/null || true + + # Store original network for restoration + echo "$cid:$(docker inspect "$cid" --format '{{.HostConfig.NetworkMode}}')" >> /tmp/network_failure_state + fi + done + done +} + +# Disk failure simulation +simulate_disk_failure() { + local containers=$(get_target_containers) + log "Simulating disk space exhaustion on containers: $containers" + + for container in $containers; do + local container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true) + + for cid in $container_ids; do + if [ -n "$cid" ]; then + log "Filling disk space in container $cid" + + # Create a large file to consume disk space + local fill_size="100M" + case "$INTENSITY" in + low) fill_size="50M" ;; + medium) fill_size="100M" ;; + high) fill_size="500M" ;; + critical) fill_size="1024M" ;; + esac + + docker exec "$cid" bash -c "dd if=/dev/zero of=/tmp/disk_fill bs=1M count=${fill_size%M}" 2>/dev/null || true + echo "$cid:disk_fill" >> /tmp/disk_failure_state + fi + done + done +} + +# Service failure simulation +simulate_service_failure() { + local containers=$(get_target_containers) + log "Simulating 
service failures on containers: $containers" + + for container in $containers; do + local container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true) + + for cid in $container_ids; do + if [ -n "$cid" ]; then + log "Stopping services in container $cid" + + # Stop common services + docker exec "$cid" systemctl stop nginx 2>/dev/null || true + docker exec "$cid" systemctl stop postgresql 2>/dev/null || true + docker exec "$cid" systemctl stop haproxy 2>/dev/null || true + + echo "$cid:services" >> /tmp/service_failure_state + fi + done + done +} + +# Node failure simulation +simulate_node_failure() { + local containers=$(get_target_containers) + log "Simulating complete node failures on containers: $containers" + + for container in $containers; do + local container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true) + + for cid in $container_ids; do + if [ -n "$cid" ]; then + log "Stopping container $cid (node failure)" + docker pause "$cid" + echo "$cid:paused" >> /tmp/node_failure_state + fi + done + done +} + +# CPU stress simulation +simulate_cpu_failure() { + local containers=$(get_target_containers) + log "Simulating CPU stress on containers: $containers" + + for container in $containers; do + local container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true) + + for cid in $container_ids; do + if [ -n "$cid" ]; then + log "Starting CPU stress in container $cid" + + # Start CPU stress process + docker exec -d "$cid" bash -c "while true; do :; done" 2>/dev/null || true + echo "$cid:cpu_stress:$(docker exec "$cid" ps aux | grep "while true" | grep -v grep | awk '{print $2}' | head -1)" >> /tmp/cpu_failure_state + fi + done + done +} + +# Memory stress simulation +simulate_memory_failure() { + local containers=$(get_target_containers) + log "Simulating memory exhaustion on containers: $containers" + + for container in $containers; do + local container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true) + + for cid in 
$container_ids; do + if [ -n "$cid" ]; then + log "Starting memory stress in container $cid" + + # Start memory stress process + docker exec -d "$cid" bash -c "tail /dev/zero" 2>/dev/null || true + echo "$cid:memory_stress:$(docker exec "$cid" ps aux | grep "tail /dev/zero" | grep -v grep | awk '{print $2}' | head -1)" >> /tmp/memory_failure_state + fi + done + done +} + +# Inject failure +inject_failure() { + case "$FAILURE_TYPE" in + network) simulate_network_failure ;; + disk) simulate_disk_failure ;; + service) simulate_service_failure ;; + node) simulate_node_failure ;; + cpu) simulate_cpu_failure ;; + memory) simulate_memory_failure ;; + esac +} + +# Cleanup failure +cleanup_failure() { + log "Cleaning up failure simulation" + + # Restore network connections + if [ -f /tmp/network_failure_state ]; then + while IFS=: read -r cid network; do + docker network connect "$network" "$cid" 2>/dev/null || true + done < /tmp/network_failure_state + rm -f /tmp/network_failure_state + fi + + # Clean up disk fill files + if [ -f /tmp/disk_failure_state ]; then + while IFS=: read -r cid _; do + docker exec "$cid" rm -f /tmp/disk_fill 2>/dev/null || true + done < /tmp/disk_failure_state + rm -f /tmp/disk_failure_state + fi + + # Restart services + if [ -f /tmp/service_failure_state ]; then + while IFS=: read -r cid _; do + docker exec "$cid" systemctl start nginx 2>/dev/null || true + docker exec "$cid" systemctl start postgresql 2>/dev/null || true + docker exec "$cid" systemctl start haproxy 2>/dev/null || true + done < /tmp/service_failure_state + rm -f /tmp/service_failure_state + fi + + # Unpause containers + if [ -f /tmp/node_failure_state ]; then + while IFS=: read -r cid _; do + docker unpause "$cid" 2>/dev/null || true + done < /tmp/node_failure_state + rm -f /tmp/node_failure_state + fi + + # Kill stress processes + if [ -f /tmp/cpu_failure_state ]; then + while IFS=: read -r cid _ pid; do + docker exec "$cid" kill -9 "$pid" 2>/dev/null || true + done < 
/tmp/cpu_failure_state + rm -f /tmp/cpu_failure_state + fi + + if [ -f /tmp/memory_failure_state ]; then + while IFS=: read -r cid _ pid; do + docker exec "$cid" kill -9 "$pid" 2>/dev/null || true + done < /tmp/memory_failure_state + rm -f /tmp/memory_failure_state + fi +} + +# Monitor failure +monitor_failure() { + local end_time=$(( $(date +%s) + DURATION )) + + log "Monitoring failure for $DURATION seconds" + + while [ $(date +%s) -lt $end_time ]; do + # Check container status + if ! docker-compose ps | grep -q "Up\|Paused"; then + log "WARNING: All containers are down" + fi + + # Log system metrics + log "System status: $(docker stats --no-stream --format 'table {{.Container}}\t{{.CPUPerc}}\t{{.MemPerc}}' | tail -n +2)" + + sleep 10 + done +} + +# Generate failure report +generate_report() { + local report_file="reports/failure_simulation_$(date +%Y%m%d_%H%M%S).txt" + + cat > "$report_file" << EOF +Failure Simulation Report +======================== + +Timestamp: $(date) +Failure Type: $FAILURE_TYPE +Duration: $DURATION seconds +Target Nodes: $TARGET_NODES +Intensity: $INTENSITY + +Pre-failure Status: +$(docker-compose ps) + +Post-failure Status: +$(docker-compose ps) + +Log File: $LOG_FILE +EOF + + log "Failure simulation report generated: $report_file" +} + +# Main execution +main() { + log "Starting failure simulation: $FAILURE_TYPE for $DURATION seconds" + + validate_inputs + + # Inject failure + inject_failure + + # Monitor during failure + monitor_failure + + # Cleanup + cleanup_failure + + # Generate report + generate_report + + log "Failure simulation completed successfully" +} + +# Trap for cleanup on script exit +trap cleanup_failure EXIT + +# Initialize logging +mkdir -p logs reports + +# Run main function +main "$@" \ No newline at end of file diff --git a/enterprise-infra-simulator/scripts/simulate_scaling.sh b/enterprise-infra-simulator/scripts/simulate_scaling.sh new file mode 100644 index 0000000..4b3176d --- /dev/null +++ 
b/enterprise-infra-simulator/scripts/simulate_scaling.sh @@ -0,0 +1,208 @@ +#!/bin/bash + +# Enterprise Infrastructure Scaling Simulation Script +# Simulates scaling operations for infrastructure nodes + +set -euo pipefail + +# Configuration +DOCKER_COMPOSE_FILE="docker-compose.yml" +INVENTORY_FILE="inventory/hosts.ini" +LOG_FILE="logs/scaling_simulation.log" + +# Default values +DIRECTION="${1:-up}" +COUNT="${2:-1}" +NODE_TYPE="${3:-web}" +SIMULATION_MODE="${SIMULATION_MODE:-false}" + +# Logging function +log() { + echo "$(date '+%Y-%m-%d %H:%M:%S') - $*" | tee -a "$LOG_FILE" +} + +# Error handling +error_exit() { + log "ERROR: $1" + exit 1 +} + +# Validate inputs +validate_inputs() { + if [[ "$DIRECTION" != "up" && "$DIRECTION" != "down" ]]; then + error_exit "Invalid direction: $DIRECTION. Must be 'up' or 'down'" + fi + + if ! [[ "$COUNT" =~ ^[0-9]+$ ]] || [ "$COUNT" -lt 1 ]; then + error_exit "Invalid count: $COUNT. Must be a positive integer" + fi + + case "$NODE_TYPE" in + web|db|lb|monitor) ;; + *) error_exit "Invalid node type: $NODE_TYPE. Must be web, db, lb, or monitor" ;; + esac +} + +# Get current node count +get_current_count() { + local type="$1" + case "$type" in + web) docker-compose ps web | grep -c "Up" ;; + db) docker-compose ps db | grep -c "Up" ;; + lb) docker-compose ps lb | grep -c "Up" ;; + monitor) docker-compose ps monitor | grep -c "Up" ;; + esac +} + +# Scale up infrastructure +scale_up() { + local type="$1" + local count="$2" + + log "Scaling up $count $type nodes" + + # Update docker-compose replica count + sed -i.bak "s/replicas: [0-9]\+/replicas: $(( $(get_current_count "$type") + count ))/" "$DOCKER_COMPOSE_FILE" + + # Deploy new containers + docker-compose up -d --scale "${type}=${count}" + + # Wait for containers to be ready + log "Waiting for containers to be ready..." 
+ sleep 30 + + # Update inventory + update_inventory "$type" "$count" "add" + + # Run provisioning playbook on new nodes + if [ "$SIMULATION_MODE" = false ]; then + ansible-playbook -i "$INVENTORY_FILE" playbooks/provision.yml --limit "${type}*" + fi + + log "Successfully scaled up $count $type nodes" +} + +# Scale down infrastructure +scale_down() { + local type="$1" + local count="$2" + + local current_count=$(get_current_count "$type") + if [ "$current_count" -lt "$count" ]; then + error_exit "Cannot scale down $count nodes. Only $current_count $type nodes currently running" + fi + + log "Scaling down $count $type nodes" + + # Select nodes to remove (oldest first) + local nodes_to_remove=$(docker-compose ps "$type" | grep "Up" | head -n "$count" | awk '{print $1}') + + # Decommission nodes + for node in $nodes_to_remove; do + if [ "$SIMULATION_MODE" = false ]; then + ansible-playbook -i "$INVENTORY_FILE" playbooks/decommission.yml --limit "$node" + fi + docker stop "$node" + docker rm "$node" + done + + # Update docker-compose replica count + sed -i.bak "s/replicas: [0-9]\+/replicas: $(( current_count - count ))/" "$DOCKER_COMPOSE_FILE" + + # Update inventory + update_inventory "$type" "$count" "remove" + + log "Successfully scaled down $count $type nodes" +} + +# Update Ansible inventory +update_inventory() { + local type="$1" + local count="$2" + local action="$3" + + log "Updating inventory for $action $count $type nodes" + + # This would be more complex in a real implementation + # For simulation, we'll just log the action + case "$action" in + add) + log "Added $count $type nodes to inventory" + ;; + remove) + log "Removed $count $type nodes from inventory" + ;; + esac +} + +# Health check after scaling +health_check() { + log "Running health checks after scaling" + + # Check container status + if ! 
docker-compose ps | grep -q "Up"; then + error_exit "Some containers failed to start" + fi + + # Ansible ping check + if [ "$SIMULATION_MODE" = false ]; then + if ! ansible -i "$INVENTORY_FILE" all -m ping >/dev/null 2>&1; then + log "WARNING: Some nodes failed Ansible ping check" + fi + fi + + log "Health checks completed" +} + +# Generate scaling report +generate_report() { + local report_file="reports/scaling_report_$(date +%Y%m%d_%H%M%S).txt" + + cat > "$report_file" << EOF +Scaling Simulation Report +======================== + +Timestamp: $(date) +Direction: $DIRECTION +Node Type: $NODE_TYPE +Count: $COUNT +Simulation Mode: $SIMULATION_MODE + +Current Status: +$(docker-compose ps) + +Inventory Status: +$(ansible -i "$INVENTORY_FILE" --list-hosts all 2>/dev/null || echo "Ansible inventory check failed") + +Log File: $LOG_FILE +EOF + + log "Scaling report generated: $report_file" +} + +# Main execution +main() { + log "Starting scaling simulation: $DIRECTION $COUNT $NODE_TYPE nodes" + + validate_inputs + + case "$DIRECTION" in + up) + scale_up "$NODE_TYPE" "$COUNT" + ;; + down) + scale_down "$NODE_TYPE" "$COUNT" + ;; + esac + + health_check + generate_report + + log "Scaling simulation completed successfully" +} + +# Initialize logging +mkdir -p logs reports + +# Run main function +main "$@" \ No newline at end of file diff --git a/migration-validation-framework/README.md b/migration-validation-framework/README.md new file mode 100644 index 0000000..ef6a563 --- /dev/null +++ b/migration-validation-framework/README.md @@ -0,0 +1,389 @@ +# Migration Validation Framework + +A comprehensive Python CLI tool for validating system migrations through data collection, snapshot comparison, and automated reporting. Designed for enterprise migration workflows where system consistency and data integrity are critical. 
+ +## Overview + +The Migration Validation Framework provides a systematic approach to validating system migrations by: + +- Collecting comprehensive system data before and after migration +- Generating structured JSON snapshots for comparison +- Performing intelligent diff analysis between snapshots +- Generating detailed HTML reports with change visualization +- Providing CLI interface for integration into migration pipelines + +## Architecture + +``` +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ CLI Interface │ │ Data │ │ Validation │ +│ (cli.py) │◄──►│ Collectors │◄──►│ Engine │ +│ │ │ │ │ │ +│ - Command │ │ - mounts.py │ │ - compare.py │ +│ parsing │ │ - services.py │ │ - diff.py │ +│ - Workflow │ │ - disk_usage.py │ │ - validate.py │ +│ orchestration │ │ - network.py │ │ │ +└─────────────────┘ └─────────────────┘ └─────────────────┘ + │ │ │ + ▼ ▼ ▼ +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ JSON │ │ Comparison │ │ HTML │ +│ Snapshots │ │ Results │ │ Reports │ +│ │ │ │ │ │ +│ - Pre-migration │ │ - Differences │ │ - Summary │ +│ - Post-migration│ │ - Risk levels │ │ - Details │ +│ - Metadata │ │ - Validation │ │ - Charts │ +└─────────────────┘ └─────────────────┘ └─────────────────┘ +``` + +## Quick Start + +### Prerequisites + +- Python 3.8+ +- SSH access to target systems +- Appropriate permissions for data collection + +### Installation + +```bash +cd migration-validation-framework +pip install -r requirements.txt +``` + +### Basic Usage + +```bash +# Create pre-migration snapshot +python cli.py snapshot --env production --label pre-migration --systems web01,db01 + +# Perform migration... 
+ +# Create post-migration snapshot +python cli.py snapshot --env production --label post-migration --systems web01,db01 + +# Compare snapshots +python cli.py compare pre-migration post-migration --output comparison_001 + +# Generate HTML report +python cli.py report --comparison comparison_001 --format html --output migration_report.html +``` + +## Project Structure + +``` +migration-validation-framework/ +├── cli.py # Main CLI interface +├── collectors/ # Data collection modules +│ ├── mounts.py # Filesystem mount collection +│ ├── services.py # System services collection +│ ├── disk_usage.py # Disk usage statistics +│ ├── network.py # Network configuration +│ └── processes.py # Running processes +├── validators/ # Validation and comparison logic +│ ├── compare.py # Snapshot comparison engine +│ ├── diff.py # Difference calculation +│ └── validate.py # Validation rules +├── reports/ # Report generation +│ ├── html_report.py # HTML report generator +│ ├── json_report.py # JSON report generator +│ └── summary.py # Summary calculations +├── config/ # Configuration files +│ ├── collectors.yaml # Collector configurations +│ └── validators.yaml # Validation rules +├── tests/ # Unit and integration tests +├── logs/ # Application logs +└── snapshots/ # Stored snapshots +``` + +## Data Collectors + +### Mounts Collector (`collectors/mounts.py`) +Collects filesystem mount information including: +- Mount points and devices +- Filesystem types +- Mount options +- Capacity and usage statistics + +### Services Collector (`collectors/services.py`) +Gathers system service status: +- Running services +- Service states (active, inactive, failed) +- Startup configuration +- Dependencies + +### Disk Usage Collector (`collectors/disk_usage.py`) +Analyzes disk space utilization: +- Directory size statistics +- File system usage +- Inode usage +- Largest files and directories + +### Network Collector (`collectors/network.py`) +Captures network configuration: +- Interface configurations 
+- Routing tables +- DNS settings +- Firewall rules + +### Processes Collector (`collectors/processes.py`) +Documents running processes: +- Process lists with PIDs +- Memory and CPU usage +- Process owners +- Command lines + +## Validation Engine + +### Comparison Logic (`validators/compare.py`) +Performs intelligent comparison of snapshots: +- Structural differences detection +- Semantic change analysis +- Risk level assessment +- Change categorization + +### Difference Calculator (`validators/diff.py`) +Calculates detailed differences: +- Added/removed/modified items +- Quantitative changes +- Configuration drift detection +- Anomaly identification + +### Validation Rules (`validators/validate.py`) +Applies validation rules: +- Critical change detection +- Compliance checking +- Threshold validation +- Custom rule engine + +## Reporting + +### HTML Reports (`reports/html_report.py`) +Generates comprehensive HTML reports featuring: +- Executive summary dashboard +- Detailed change logs +- Risk assessment visualizations +- Interactive charts and graphs +- Export capabilities + +### JSON Reports (`reports/json_report.py`) +Provides structured JSON output for: +- API integration +- Automated processing +- Audit trails +- Compliance reporting + +## CLI Interface + +### Commands + +```bash +# Snapshot management +python cli.py snapshot --env --label