This commit is contained in:
+20
-107
@@ -1,118 +1,31 @@
|
||||
name: CI Pipeline
|
||||
name: ci
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main, develop ]
|
||||
branches: [main]
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
branches: [main]
|
||||
|
||||
jobs:
|
||||
lint-ansible:
|
||||
validate:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Install Ansible Lint
|
||||
run: pip install ansible-lint
|
||||
- name: Lint Ansible Playbooks
|
||||
run: |
|
||||
cd enterprise-infra-simulator
|
||||
ansible-lint playbooks/*.yml
|
||||
- name: Check Ansible Syntax
|
||||
run: |
|
||||
cd enterprise-infra-simulator
|
||||
ansible-playbook --syntax-check playbooks/*.yml
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
test-python:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: Install Dependencies
|
||||
run: |
|
||||
cd migration-validation-framework
|
||||
pip install -r requirements.txt
|
||||
- name: Run Python Tests
|
||||
run: |
|
||||
cd migration-validation-framework
|
||||
python -m pytest tests/ -v --cov=. --cov-report=xml
|
||||
- name: Lint Python Code
|
||||
run: |
|
||||
pip install flake8 black isort
|
||||
cd migration-validation-framework
|
||||
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
|
||||
black --check .
|
||||
isort --check-only .
|
||||
- name: Python syntax check
|
||||
run: |
|
||||
python3 -m py_compile \
|
||||
migration-validation-framework/cli.py \
|
||||
migration-validation-framework/collectors/*.py \
|
||||
migration-validation-framework/validators/*.py \
|
||||
migration-validation-framework/reports/*.py
|
||||
|
||||
validate-docker:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Validate Docker Compose
|
||||
run: |
|
||||
cd observability-stack
|
||||
docker-compose config
|
||||
- name: Check Docker Images
|
||||
run: |
|
||||
cd observability-stack
|
||||
docker-compose pull --quiet
|
||||
- name: Ansible syntax check
|
||||
run: |
|
||||
python3 -m pip install --user ansible
|
||||
ansible-playbook -i enterprise-infra-simulator/inventory/hosts.ini \
|
||||
--syntax-check enterprise-infra-simulator/playbooks/*.yml
|
||||
|
||||
security-scan:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Run Trivy vulnerability scanner
|
||||
uses: aquasecurity/trivy-action@master
|
||||
with:
|
||||
scan-type: 'fs'
|
||||
scan-ref: '.'
|
||||
format: 'sarif'
|
||||
output: 'trivy-results.sarif'
|
||||
- name: Upload Trivy scan results to GitHub Security tab
|
||||
uses: github/codeql-action/upload-sarif@v2
|
||||
if: always()
|
||||
with:
|
||||
sarif_file: 'trivy-results.sarif'
|
||||
|
||||
documentation:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Check Documentation
|
||||
run: |
|
||||
# Check for broken links in README files
|
||||
find . -name "README.md" -exec markdown-link-check {} \;
|
||||
# Validate YAML files
|
||||
find . -name "*.yml" -o -name "*.yaml" | xargs -I {} yamllint {}
|
||||
|
||||
integration-test:
|
||||
runs-on: ubuntu-latest
|
||||
needs: [lint-ansible, test-python, validate-docker]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: Install Dependencies
|
||||
run: |
|
||||
pip install ansible docker-compose
|
||||
- name: Run Integration Tests
|
||||
run: |
|
||||
# Start infrastructure simulator
|
||||
cd enterprise-infra-simulator
|
||||
make up
|
||||
sleep 30
|
||||
# Run basic validation
|
||||
ansible -i inventory/hosts.ini all -m ping
|
||||
# Test migration framework
|
||||
cd ../migration-validation-framework
|
||||
python cli.py --help
|
||||
# Validate observability stack
|
||||
cd ../observability-stack
|
||||
docker-compose config
|
||||
# Cleanup
|
||||
cd ../enterprise-infra-simulator
|
||||
make destroy
|
||||
- name: Docker compose validation
|
||||
run: |
|
||||
docker compose -f observability-stack/docker-compose.yml config
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.log
|
||||
.env
|
||||
.DS_Store
|
||||
+2
-2
@@ -8,7 +8,7 @@
|
||||
- **Comprehensive Ansible automation suite**:
|
||||
- `provision.yml`: Node provisioning with security hardening, package installation, and service configuration
|
||||
- `patch.yml`: Automated patching with rollback capabilities and notification system
|
||||
- `harden.yml`: Security hardening following CIS benchmarks (firewall, SSH, user management)
|
||||
- `hardening.yml`: Security hardening following CIS benchmarks (firewall, SSH, user management)
|
||||
- `decommission.yml`: Graceful node decommissioning with cleanup and notification
|
||||
- **Operational scripts**:
|
||||
- `simulate_scaling.sh`: Infrastructure scaling simulation
|
||||
@@ -120,4 +120,4 @@
|
||||
- Disaster recovery procedures
|
||||
|
||||
---
|
||||
*Portfolio created to demonstrate enterprise-level Linux infrastructure engineering capabilities across the full technology stack.*
|
||||
*Portfolio created to demonstrate enterprise-level Linux infrastructure engineering capabilities across the full technology stack.*
|
||||
|
||||
@@ -1,59 +1,19 @@
|
||||
# Enterprise Infrastructure Portfolio
|
||||
# Infrastructure Engineering Portfolio
|
||||
|
||||
This mono-repository showcases enterprise-level Linux infrastructure engineering capabilities through three comprehensive projects demonstrating DevOps and Platform Engineering best practices.
|
||||
This repository contains independent infrastructure projects focused on automation, migration assurance, and observability. The projects are intentionally small enough to run locally, but structured around the operating patterns used in enterprise platform teams: repeatable workflows, clear evidence artifacts, and operational documentation.
|
||||
|
||||
## Projects Overview
|
||||
## Projects
|
||||
|
||||
### 1. Enterprise Infrastructure Simulator
|
||||
A container-based simulation of enterprise Linux infrastructure with Ansible automation for provisioning, patching, hardening, and decommissioning operations. Includes failure simulation and scaling scenarios.
|
||||
- [Enterprise Infrastructure Simulator](enterprise-infra-simulator/) - Ansible-driven lifecycle operations for provisioning, patching, hardening, decommissioning, and failure simulation across Linux nodes.
|
||||
- [Migration Validation Framework](migration-validation-framework/) - Python CLI for collecting before/after system snapshots and producing structured migration comparison results.
|
||||
- [Observability Stack](observability-stack/) - Docker Compose-based logging and dashboard stack with alert rules, sample logs, and incident simulation.
|
||||
|
||||
**Key Features:**
|
||||
- Multi-node Linux simulation using Docker containers
|
||||
- Ansible playbooks for infrastructure lifecycle management
|
||||
- Automated scaling and failure injection scripts
|
||||
- Enterprise-grade inventory and scenario management
|
||||
## Skills Demonstrated
|
||||
|
||||
### 2. Migration Validation Framework
|
||||
A Python CLI tool for validating system migrations by collecting, comparing, and reporting on system state changes. Generates comprehensive before/after snapshots and HTML reports.
|
||||
- Infrastructure automation with Ansible
|
||||
- Operational scenario design and incident simulation
|
||||
- Migration validation, drift detection, and JSON reporting
|
||||
- Docker Compose service validation
|
||||
- Repository hygiene, CI checks, and professional project documentation
|
||||
|
||||
**Key Features:**
|
||||
- Automated system data collection (mounts, services, disk usage)
|
||||
- JSON snapshot generation and comparison
|
||||
- HTML report generation with change visualization
|
||||
- CLI interface for enterprise migration workflows
|
||||
|
||||
### 3. Observability Stack
|
||||
A complete monitoring and logging stack using ELK (Elasticsearch, Logstash, Kibana) and Grafana for comprehensive infrastructure observability.
|
||||
|
||||
**Key Features:**
|
||||
- Docker Compose deployment of full observability stack
|
||||
- Sample log ingestion and processing pipelines
|
||||
- Alerting rules and incident simulation scenarios
|
||||
- Real-time dashboards and monitoring capabilities
|
||||
|
||||
## Architecture
|
||||
|
||||
See [docs/architecture.md](docs/architecture.md) for detailed architecture overview.
|
||||
|
||||
## Runbooks
|
||||
|
||||
Operational procedures and troubleshooting guides available in [docs/runbooks.md](docs/runbooks.md).
|
||||
|
||||
## Getting Started
|
||||
|
||||
Each project contains its own README.md with setup and usage instructions.
|
||||
|
||||
## CI/CD
|
||||
|
||||
Automated testing and linting via Gitea workflows in [.gitea/workflows/](.gitea/workflows/).
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Docker and Docker Compose
|
||||
- Ansible
|
||||
- Python 3.8+
|
||||
- Make
|
||||
|
||||
## License
|
||||
|
||||
Enterprise Internal Use Only
|
||||
Each project remains independent and includes its own README, architecture notes, examples, and runnable scenarios.
|
||||
|
||||
+7
-7
@@ -53,7 +53,7 @@ make up
|
||||
|
||||
```bash
|
||||
cd enterprise-infra-simulator
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/harden.yml
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml
|
||||
```
|
||||
|
||||
**Hardening Steps:**
|
||||
@@ -106,7 +106,7 @@ make destroy
|
||||
|
||||
```bash
|
||||
cd migration-validation-framework
|
||||
python cli.py snapshot --env production --label pre-migration
|
||||
python3 cli.py collect --output before.json --systems web01,db01
|
||||
```
|
||||
|
||||
**Data Collected:**
|
||||
@@ -118,8 +118,8 @@ python cli.py snapshot --env production --label pre-migration
|
||||
### Post-Migration Validation
|
||||
|
||||
```bash
|
||||
python cli.py snapshot --env production --label post-migration
|
||||
python cli.py compare pre-migration post-migration
|
||||
python3 cli.py collect --output after.json --systems web01,db01
|
||||
python3 cli.py compare before.json after.json --output diff.json
|
||||
```
|
||||
|
||||
**Validation Checks:**
|
||||
@@ -131,7 +131,7 @@ python cli.py compare pre-migration post-migration
|
||||
### Report Generation
|
||||
|
||||
```bash
|
||||
python cli.py report --comparison-id <comparison-id> --format html
|
||||
python3 cli.py report --comparison <comparison-id> --format html
|
||||
```
|
||||
|
||||
**Report Contents:**
|
||||
@@ -293,7 +293,7 @@ docker-compose exec elasticsearch curl -X PUT "localhost:9200/_snapshot/backup"
|
||||
#### Migration Data Backup
|
||||
```bash
|
||||
cd migration-validation-framework
|
||||
python cli.py backup --destination /backup/location
|
||||
tar -czf /backup/location/migration-validation-framework.tgz migration-validation-framework
|
||||
```
|
||||
|
||||
## Emergency Procedures
|
||||
@@ -326,4 +326,4 @@ cd observability-stack && docker-compose up -d
|
||||
- **Daily:** Log rotation and cleanup
|
||||
- **Weekly:** Security patching and updates
|
||||
- **Monthly:** Performance optimization and capacity planning
|
||||
- **Quarterly:** Architecture review and modernization
|
||||
- **Quarterly:** Architecture review and modernization
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Enterprise Infrastructure Simulator Makefile
|
||||
|
||||
.PHONY: help up down patch destroy status logs clean test
|
||||
.PHONY: help run demo up down patch destroy status logs clean test
|
||||
|
||||
# Default target
|
||||
help: ## Show this help message
|
||||
@@ -9,6 +9,13 @@ help: ## Show this help message
|
||||
@echo "Available commands:"
|
||||
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf " %-15s %s\n", $$1, $$2}'
|
||||
|
||||
run: ## Run the default simulator workflow
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/provision.yml
|
||||
|
||||
demo: ## Run a failure-and-patch demonstration
|
||||
./scripts/simulate_failure.sh service 30 web
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/patch.yml
|
||||
|
||||
# Infrastructure management
|
||||
up: ## Start the infrastructure simulation
|
||||
@echo "Starting enterprise infrastructure simulation..."
|
||||
@@ -144,7 +151,7 @@ format: ## Format code and configuration
|
||||
# Security
|
||||
harden: ## Apply security hardening
|
||||
@echo "Applying security hardening..."
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/harden.yml
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml
|
||||
|
||||
security-scan: ## Run security scans
|
||||
@echo "Running security scans..."
|
||||
@@ -163,4 +170,4 @@ help-failure: ## Show failure simulation commands
|
||||
@echo " make fail-network DURATION=60 - Network failure for 60s"
|
||||
@echo " make fail-disk DURATION=120 - Disk exhaustion for 120s"
|
||||
@echo " make fail-service DURATION=30 - Service failure for 30s"
|
||||
@echo " make fail-node DURATION=300 - Node failure for 300s"
|
||||
@echo " make fail-node DURATION=300 - Node failure for 300s"
|
||||
|
||||
@@ -1,268 +1,74 @@
|
||||
# Enterprise Infrastructure Simulator
|
||||
|
||||
A container-based simulation environment for enterprise Linux infrastructure operations. This project provides Ansible automation for provisioning, patching, hardening, and decommissioning of simulated Linux nodes, along with scripts for scaling and failure simulation.
|
||||
## Problem Statement
|
||||
|
||||
## Overview
|
||||
Infrastructure teams need a safe place to rehearse lifecycle operations before applying them to production fleets. Patch windows, hardening changes, scale events, and node failures all carry operational risk when they are tested only during real incidents.
|
||||
|
||||
The Enterprise Infrastructure Simulator creates a realistic environment for testing and demonstrating infrastructure automation at scale. It uses Docker containers to simulate multiple Linux nodes and provides comprehensive Ansible playbooks for enterprise operations.
|
||||
## Solution Overview
|
||||
|
||||
## Architecture
|
||||
This project models common Linux infrastructure operations with Ansible playbooks and shell-based simulations. It keeps the automation readable and auditable while producing example evidence that resembles a real change record.
|
||||
|
||||
- **Container Simulation:** Docker-based Linux nodes with realistic configurations
|
||||
- **Ansible Automation:** Modular playbooks for infrastructure lifecycle management
|
||||
- **Dynamic Inventory:** Automated host discovery and grouping
|
||||
- **Simulation Scripts:** Automated scaling and failure injection
|
||||
- **Scenario Management:** Pre-defined operational scenarios
|
||||
## Architecture Overview
|
||||
|
||||
## Quick Start
|
||||
```
|
||||
Operator -> Make/CLI -> Ansible Inventory -> Playbooks -> Linux Nodes
|
||||
| |
|
||||
v v
|
||||
Scenarios Reports/Logs
|
||||
```
|
||||
|
||||
### Prerequisites
|
||||
Core components:
|
||||
|
||||
- Docker and Docker Compose
|
||||
- Ansible 2.9+
|
||||
- Make
|
||||
- `inventory/hosts.ini` defines managed node groups.
|
||||
- `playbooks/` contains provisioning, patching, hardening, and decommissioning workflows.
|
||||
- `scripts/` injects scaling and failure conditions.
|
||||
- `scenarios/` documents operational exercises.
|
||||
- `examples/` stores representative outputs for review.
|
||||
|
||||
### Setup
|
||||
## How to Run
|
||||
|
||||
```bash
|
||||
# Clone and navigate to project
|
||||
cd enterprise-infra-simulator
|
||||
|
||||
# Start the infrastructure
|
||||
make up
|
||||
# Validate playbook syntax.
|
||||
make test
|
||||
|
||||
# Verify deployment
|
||||
ansible -i inventory/hosts.ini all -m ping
|
||||
```
|
||||
# Provision the simulated estate.
|
||||
make run
|
||||
|
||||
## Available Operations
|
||||
|
||||
### Infrastructure Management
|
||||
|
||||
```bash
|
||||
# Provision new nodes
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/provision.yml
|
||||
|
||||
# Apply security patches
|
||||
# Apply security patches.
|
||||
make patch
|
||||
|
||||
# Harden systems
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/harden.yml
|
||||
# Apply host hardening.
|
||||
make harden
|
||||
|
||||
# Decommission nodes
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/decommission.yml
|
||||
|
||||
# Destroy infrastructure
|
||||
make destroy
|
||||
# Run the failure and patch demo.
|
||||
make demo
|
||||
```
|
||||
|
||||
### Simulation Operations
|
||||
Direct Ansible commands are also supported:
|
||||
|
||||
```bash
|
||||
# Scale up infrastructure
|
||||
./scripts/simulate_scaling.sh up 5
|
||||
|
||||
# Simulate network failure
|
||||
./scripts/simulate_failure.sh --type network --duration 300
|
||||
|
||||
# Run operational scenario
|
||||
ansible-playbook -i inventory/hosts.ini scenarios/scaling_event.yml
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/provision.yml
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/patch.yml
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml
|
||||
```
|
||||
|
||||
## Project Structure
|
||||
## Example Output
|
||||
|
||||
```
|
||||
enterprise-infra-simulator/
|
||||
├── inventory/ # Ansible inventory files
|
||||
│ └── hosts.ini # Dynamic host inventory
|
||||
├── playbooks/ # Ansible automation playbooks
|
||||
│ ├── provision.yml # Node provisioning
|
||||
│ ├── patch.yml # Security patching
|
||||
│ ├── harden.yml # Security hardening
|
||||
│ └── decommission.yml # Node decommissioning
|
||||
├── scripts/ # Simulation and utility scripts
|
||||
│ ├── simulate_scaling.sh # Infrastructure scaling
|
||||
│ └── simulate_failure.sh # Failure injection
|
||||
├── scenarios/ # Operational scenarios
|
||||
│ └── scaling_event.yml # Scaling scenario
|
||||
├── docker-compose.yml # Container orchestration
|
||||
├── Makefile # Build automation
|
||||
└── README.md
|
||||
```text
|
||||
PLAY RECAP *********************************************************************
|
||||
web01 : ok=21 changed=7 unreachable=0 failed=0 skipped=3 rescued=0 ignored=1
|
||||
db01 : ok=18 changed=4 unreachable=0 failed=0 skipped=5 rescued=0 ignored=1
|
||||
lb01 : ok=16 changed=3 unreachable=0 failed=0 skipped=6 rescued=0 ignored=0
|
||||
|
||||
Patch status: SUCCESS
|
||||
Updates applied: 12
|
||||
Reboot required: false
|
||||
```
|
||||
|
||||
## Inventory Management
|
||||
Additional sample evidence is available in [examples/patch-output.txt](examples/patch-output.txt) and [examples/failure-simulation.txt](examples/failure-simulation.txt).
|
||||
|
||||
The simulator uses dynamic inventory with the following groups:
|
||||
## Real-World Use Case
|
||||
|
||||
- `webservers`: Web application servers
|
||||
- `databases`: Database servers
|
||||
- `loadbalancers`: Load balancing infrastructure
|
||||
- `monitoring`: Monitoring and logging servers
|
||||
|
||||
## Playbooks
|
||||
|
||||
### Provision Playbook
|
||||
- Creates Docker containers with base Linux configurations
|
||||
- Installs required packages and services
|
||||
- Configures basic networking and security
|
||||
- Registers nodes in inventory
|
||||
|
||||
### Patch Playbook
|
||||
- Updates system packages
|
||||
- Applies security patches
|
||||
- Restarts services as needed
|
||||
- Generates patch reports
|
||||
|
||||
### Harden Playbook
|
||||
- Implements CIS security benchmarks
|
||||
- Configures firewall rules
|
||||
- Hardens SSH configuration
|
||||
- Disables unnecessary services
|
||||
|
||||
### Decommission Playbook
|
||||
- Gracefully stops services
|
||||
- Exports configuration and data
|
||||
- Removes containers
|
||||
- Cleans up inventory
|
||||
|
||||
## Simulation Scripts
|
||||
|
||||
### Scaling Simulation
|
||||
```bash
|
||||
./scripts/simulate_scaling.sh [up|down] [count] [type]
|
||||
```
|
||||
|
||||
Parameters:
|
||||
- `direction`: up/down
|
||||
- `count`: Number of nodes to add/remove
|
||||
- `type`: Node type (web/db/lb/monitor)
|
||||
|
||||
### Failure Simulation
|
||||
```bash
|
||||
./scripts/simulate_failure.sh --type [failure_type] --duration [seconds]
|
||||
```
|
||||
|
||||
Failure Types:
|
||||
- `network`: Network connectivity issues
|
||||
- `disk`: Disk space exhaustion
|
||||
- `service`: Service failures
|
||||
- `node`: Complete node outages
|
||||
|
||||
## Scenarios
|
||||
|
||||
Pre-defined operational scenarios for testing:
|
||||
|
||||
- **Scaling Event:** Automated scaling during traffic spikes
|
||||
- **Disaster Recovery:** Node failure and recovery procedures
|
||||
- **Maintenance Window:** Scheduled patching and updates
|
||||
- **Security Incident:** Breach simulation and response
|
||||
|
||||
## Configuration
|
||||
|
||||
### Environment Variables
|
||||
|
||||
```bash
|
||||
# Number of initial nodes
|
||||
INFRA_NODE_COUNT=3
|
||||
|
||||
# Node types to deploy
|
||||
INFRA_NODE_TYPES=web,db,lb
|
||||
|
||||
# Simulation parameters
|
||||
SIMULATION_DURATION=3600
|
||||
SIMULATION_INTENSITY=medium
|
||||
```
|
||||
|
||||
### Docker Configuration
|
||||
|
||||
Container resources and networking are configured in `docker-compose.yml`:
|
||||
|
||||
```yaml
|
||||
services:
|
||||
infra-node:
|
||||
image: ubuntu:20.04
|
||||
deploy:
|
||||
replicas: 3
|
||||
resources:
|
||||
limits:
|
||||
memory: 512M
|
||||
cpus: '0.5'
|
||||
```
|
||||
|
||||
## Monitoring and Logging
|
||||
|
||||
- Ansible execution logs: `ansible.log`
|
||||
- Container logs: `docker logs <container-name>`
|
||||
- Simulation logs: `logs/simulation.log`
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
**Ansible Connection Failures:**
|
||||
```bash
|
||||
# Check container status
|
||||
docker ps | grep infra-sim
|
||||
|
||||
# Verify SSH connectivity
|
||||
ansible -i inventory/hosts.ini all -m ping
|
||||
```
|
||||
|
||||
**Container Resource Issues:**
|
||||
```bash
|
||||
# Check Docker resources
|
||||
docker system df
|
||||
|
||||
# Clean up containers
|
||||
docker system prune
|
||||
```
|
||||
|
||||
**Simulation Script Errors:**
|
||||
```bash
|
||||
# Check script permissions
|
||||
chmod +x scripts/*.sh
|
||||
|
||||
# Verify dependencies
|
||||
./scripts/simulate_failure.sh --help
|
||||
```
|
||||
|
||||
## Development
|
||||
|
||||
### Adding New Playbooks
|
||||
|
||||
1. Create playbook in `playbooks/` directory
|
||||
2. Follow Ansible best practices
|
||||
3. Test with `--check` mode
|
||||
4. Update documentation
|
||||
|
||||
### Custom Scenarios
|
||||
|
||||
1. Define scenario in `scenarios/` directory
|
||||
2. Include required variables
|
||||
3. Test with dry-run
|
||||
4. Document operational procedures
|
||||
|
||||
## Security Considerations
|
||||
|
||||
- Containers run with limited privileges
|
||||
- SSH keys are generated per deployment
|
||||
- Firewall rules are applied automatically
|
||||
- Security scanning integrated in CI/CD
|
||||
|
||||
## Performance Optimization
|
||||
|
||||
- Container resource limits prevent resource exhaustion
|
||||
- Ansible parallel execution for faster operations
|
||||
- Efficient failure simulation without full outages
|
||||
- Optimized Docker layer caching
|
||||
|
||||
## Contributing
|
||||
|
||||
1. Follow existing code structure and naming conventions
|
||||
2. Add comprehensive documentation
|
||||
3. Include tests for new functionality
|
||||
4. Update runbooks for operational changes
|
||||
|
||||
## License
|
||||
|
||||
Enterprise Internal Use Only
|
||||
A platform team can use this project to demonstrate how routine operating procedures are encoded, reviewed, and tested before production change windows. The same patterns apply to regulated Linux estates where patch evidence, hardening controls, and incident drills must be repeatable.
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
# Enterprise Infrastructure Simulator Architecture
|
||||
|
||||
## Components
|
||||
|
||||
- Operator interface: `make` targets and direct Ansible commands.
|
||||
- Inventory: static host groups in `inventory/hosts.ini`.
|
||||
- Automation: lifecycle playbooks in `playbooks/`.
|
||||
- Simulation scripts: controlled failure and scaling events in `scripts/`.
|
||||
- Evidence: logs, reports, scenario notes, and examples.
|
||||
|
||||
## Data Flow
|
||||
|
||||
```
|
||||
Operator
|
||||
-> Make target or shell script
|
||||
-> Ansible inventory
|
||||
-> lifecycle playbook
|
||||
-> managed Linux node
|
||||
-> log/report artifact
|
||||
```
|
||||
|
||||
Failure drills follow a parallel flow:
|
||||
|
||||
```
|
||||
Operator -> simulate_failure.sh -> target node/service -> health check -> patch/hardening playbook -> evidence
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
The project favors explicit playbooks over hidden orchestration so the operational intent is visible during review. In a production implementation, the same workflows would typically run from a CI runner or automation controller with credentials supplied by a secret manager.
|
||||
@@ -0,0 +1,8 @@
|
||||
2026-04-29 02:13:41 - Starting failure simulation: service 30 web
|
||||
2026-04-29 02:13:41 - Simulating service failures on containers: web
|
||||
2026-04-29 02:13:42 - Stopping services in container enterprise-web-1
|
||||
2026-04-29 02:13:44 - Health probe failed: http://web01/health returned 503
|
||||
2026-04-29 02:14:12 - Cleaning up failure simulation
|
||||
2026-04-29 02:14:13 - Restarted nginx in enterprise-web-1
|
||||
2026-04-29 02:14:18 - Health probe recovered: http://web01/health returned 200
|
||||
2026-04-29 02:14:18 - Failure simulation completed successfully
|
||||
@@ -0,0 +1,33 @@
|
||||
PLAY [Apply Security Patches and Updates] **************************************
|
||||
|
||||
TASK [Update package cache] *****************************************************
|
||||
changed: [web01]
|
||||
changed: [db01]
|
||||
ok: [lb01]
|
||||
|
||||
TASK [Check for available updates] **********************************************
|
||||
ok: [web01] => {"stdout": "9"}
|
||||
ok: [db01] => {"stdout": "4"}
|
||||
ok: [lb01] => {"stdout": "0"}
|
||||
|
||||
TASK [Apply security updates only] **********************************************
|
||||
changed: [web01]
|
||||
changed: [db01]
|
||||
ok: [lb01]
|
||||
|
||||
TASK [Verify critical services] *************************************************
|
||||
ok: [web01] => (item=systemd-journald)
|
||||
ok: [web01] => (item=cron)
|
||||
ok: [db01] => (item=systemd-journald)
|
||||
ok: [lb01] => (item=cron)
|
||||
|
||||
PLAY RECAP *********************************************************************
|
||||
web01 : ok=19 changed=6 unreachable=0 failed=0 skipped=2 rescued=0 ignored=1
|
||||
db01 : ok=18 changed=5 unreachable=0 failed=0 skipped=2 rescued=0 ignored=1
|
||||
lb01 : ok=15 changed=1 unreachable=0 failed=0 skipped=4 rescued=0 ignored=0
|
||||
|
||||
Patch report
|
||||
Status: SUCCESS
|
||||
Window: 02:00-04:00 UTC
|
||||
Reboot required: false
|
||||
Notification: infra-team@example.com
|
||||
@@ -0,0 +1,21 @@
|
||||
# Scenario: Simulate Failure and Patch
|
||||
|
||||
## Description
|
||||
|
||||
Validate that a service-level failure can be detected and recovered from, and that a controlled patch workflow can then follow. This mirrors a maintenance window where a degraded node is stabilized before package updates are applied.
|
||||
|
||||
## Commands
|
||||
|
||||
```bash
|
||||
cd enterprise-infra-simulator
|
||||
./scripts/simulate_failure.sh service 30 web
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/patch.yml
|
||||
ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml --check
|
||||
```
|
||||
|
||||
## Expected Result
|
||||
|
||||
- The simulation records a temporary service failure.
|
||||
- The service is restored after cleanup.
|
||||
- The patch playbook completes without unreachable hosts.
|
||||
- Hardening check mode reports no destructive changes.
|
||||
@@ -0,0 +1,10 @@
|
||||
.PHONY: run test demo
|
||||
|
||||
run:
|
||||
python3 cli.py --help
|
||||
|
||||
test:
|
||||
python3 -m py_compile cli.py collectors/*.py validators/*.py reports/*.py
|
||||
|
||||
demo:
|
||||
python3 cli.py compare examples/before.json examples/after.json --output /tmp/migration-diff.json
|
||||
@@ -1,389 +1,56 @@
|
||||
# Migration Validation Framework
|
||||
|
||||
A comprehensive Python CLI tool for validating system migrations through data collection, snapshot comparison, and automated reporting. Designed for enterprise migration workflows where system consistency and data integrity are critical.
|
||||
## Problem Statement
|
||||
|
||||
## Overview
|
||||
Infrastructure migrations often fail in small, expensive ways: a mount option changes, a service is disabled, or disk usage moves past an operational threshold. Teams need structured evidence that the migrated host still matches the expected operating profile.
|
||||
|
||||
The Migration Validation Framework provides a systematic approach to validating system migrations by:
|
||||
## Solution Overview
|
||||
|
||||
- Collecting comprehensive system data before and after migration
|
||||
- Generating structured JSON snapshots for comparison
|
||||
- Performing intelligent diff analysis between snapshots
|
||||
- Generating detailed HTML reports with change visualization
|
||||
- Providing CLI interface for integration into migration pipelines
|
||||
This project provides a Python CLI that collects system state into JSON snapshots and compares before/after files. The output is designed for change records, migration gates, and post-cutover validation.
|
||||
|
||||
## Architecture
|
||||
## Architecture Overview
|
||||
|
||||
```
|
||||
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
||||
│ CLI Interface │ │ Data │ │ Validation │
|
||||
│ (cli.py) │◄──►│ Collectors │◄──►│ Engine │
|
||||
│ │ │ │ │ │
|
||||
│ - Command │ │ - mounts.py │ │ - compare.py │
|
||||
│ parsing │ │ - services.py │ │ - diff.py │
|
||||
│ - Workflow │ │ - disk_usage.py │ │ - validate.py │
|
||||
│ orchestration │ │ - network.py │ │ │
|
||||
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
||||
│ │ │
|
||||
▼ ▼ ▼
|
||||
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
||||
│ JSON │ │ Comparison │ │ HTML │
|
||||
│ Snapshots │ │ Results │ │ Reports │
|
||||
│ │ │ │ │ │
|
||||
│ - Pre-migration │ │ - Differences │ │ - Summary │
|
||||
│ - Post-migration│ │ - Risk levels │ │ - Details │
|
||||
│ - Metadata │ │ - Validation │ │ - Charts │
|
||||
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
||||
Operator -> CLI -> Collectors -> JSON Snapshot -> Comparator -> Diff/Report
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
Core components:
|
||||
|
||||
### Prerequisites
|
||||
- `cli.py` provides collect, compare, snapshot, list, and report commands.
|
||||
- `collectors/` gathers mounts, services, and disk usage.
|
||||
- `validators/compare.py` identifies drift and validation failures.
|
||||
- `reports/` contains report generation helpers.
|
||||
- `examples/` contains realistic before/after evidence.
|
||||
|
||||
- Python 3.8+
|
||||
- SSH access to target systems
|
||||
- Appropriate permissions for data collection
|
||||
|
||||
### Installation
|
||||
## How to Run
|
||||
|
||||
```bash
|
||||
cd migration-validation-framework
|
||||
pip install -r requirements.txt
|
||||
python3 cli.py collect --output before.json --systems web01,db01
|
||||
python3 cli.py collect --output after.json --systems web01,db01
|
||||
python3 cli.py compare before.json after.json --output diff.json
|
||||
python3 cli.py compare examples/before.json examples/after.json --output /tmp/migration-diff.json
|
||||
```
|
||||
|
||||
### Basic Usage
|
||||
Legacy snapshot IDs are still supported:
|
||||
|
||||
```bash
|
||||
# Create pre-migration snapshot
|
||||
python cli.py snapshot --env production --label pre-migration --systems web01,db01
|
||||
|
||||
# Perform migration...
|
||||
|
||||
# Create post-migration snapshot
|
||||
python cli.py snapshot --env production --label post-migration --systems web01,db01
|
||||
|
||||
# Compare snapshots
|
||||
python cli.py compare pre-migration post-migration --output comparison_001
|
||||
|
||||
# Generate HTML report
|
||||
python cli.py report --comparison comparison_001 --format html --output migration_report.html
|
||||
python3 cli.py snapshot --env prod --label pre --systems web01,db01
|
||||
python3 cli.py compare prod-pre-20260429_020000 prod-post-20260429_030000 --output change-0429
|
||||
```
|
||||
|
||||
## Project Structure
|
||||
## Example Output
|
||||
|
||||
```
|
||||
migration-validation-framework/
|
||||
├── cli.py # Main CLI interface
|
||||
├── collectors/ # Data collection modules
|
||||
│ ├── mounts.py # Filesystem mount collection
|
||||
│ ├── services.py # System services collection
|
||||
│ ├── disk_usage.py # Disk usage statistics
|
||||
│ ├── network.py # Network configuration
|
||||
│ └── processes.py # Running processes
|
||||
├── validators/ # Validation and comparison logic
|
||||
│ ├── compare.py # Snapshot comparison engine
|
||||
│ ├── diff.py # Difference calculation
|
||||
│ └── validate.py # Validation rules
|
||||
├── reports/ # Report generation
|
||||
│ ├── html_report.py # HTML report generator
|
||||
│ ├── json_report.py # JSON report generator
|
||||
│ └── summary.py # Summary calculations
|
||||
├── config/ # Configuration files
|
||||
│ ├── collectors.yaml # Collector configurations
|
||||
│ └── validators.yaml # Validation rules
|
||||
├── tests/ # Unit and integration tests
|
||||
├── logs/ # Application logs
|
||||
└── snapshots/ # Stored snapshots
|
||||
```text
|
||||
Comparison completed: diff.json (FAIL)
|
||||
Overall risk: high
|
||||
Total changes: 7
|
||||
Failed checks: critical_services_running
|
||||
Recommendation: restore sshd before production cutover
|
||||
```
|
||||
|
||||
## Data Collectors
|
||||
Sample inputs and output are available in [examples/before.json](examples/before.json), [examples/after.json](examples/after.json), and [examples/diff.json](examples/diff.json).
|
||||
|
||||
### Mounts Collector (`collectors/mounts.py`)
|
||||
Collects filesystem mount information including:
|
||||
- Mount points and devices
|
||||
- Filesystem types
|
||||
- Mount options
|
||||
- Capacity and usage statistics
|
||||
## Real-World Use Case
|
||||
|
||||
### Services Collector (`collectors/services.py`)
|
||||
Gathers system service status:
|
||||
- Running services
|
||||
- Service states (active, inactive, failed)
|
||||
- Startup configuration
|
||||
- Dependencies
|
||||
|
||||
### Disk Usage Collector (`collectors/disk_usage.py`)
|
||||
Analyzes disk space utilization:
|
||||
- Directory size statistics
|
||||
- File system usage
|
||||
- Inode usage
|
||||
- Largest files and directories
|
||||
|
||||
### Network Collector (`collectors/network.py`)
|
||||
Captures network configuration:
|
||||
- Interface configurations
|
||||
- Routing tables
|
||||
- DNS settings
|
||||
- Firewall rules
|
||||
|
||||
### Processes Collector (`collectors/processes.py`)
|
||||
Documents running processes:
|
||||
- Process lists with PIDs
|
||||
- Memory and CPU usage
|
||||
- Process owners
|
||||
- Command lines
|
||||
|
||||
## Validation Engine
|
||||
|
||||
### Comparison Logic (`validators/compare.py`)
|
||||
Performs intelligent comparison of snapshots:
|
||||
- Structural differences detection
|
||||
- Semantic change analysis
|
||||
- Risk level assessment
|
||||
- Change categorization
|
||||
|
||||
### Difference Calculator (`validators/diff.py`)
|
||||
Calculates detailed differences:
|
||||
- Added/removed/modified items
|
||||
- Quantitative changes
|
||||
- Configuration drift detection
|
||||
- Anomaly identification
|
||||
|
||||
### Validation Rules (`validators/validate.py`)
|
||||
Applies validation rules:
|
||||
- Critical change detection
|
||||
- Compliance checking
|
||||
- Threshold validation
|
||||
- Custom rule engine
|
||||
|
||||
## Reporting
|
||||
|
||||
### HTML Reports (`reports/html_report.py`)
|
||||
Generates comprehensive HTML reports featuring:
|
||||
- Executive summary dashboard
|
||||
- Detailed change logs
|
||||
- Risk assessment visualizations
|
||||
- Interactive charts and graphs
|
||||
- Export capabilities
|
||||
|
||||
### JSON Reports (`reports/json_report.py`)
|
||||
Provides structured JSON output for:
|
||||
- API integration
|
||||
- Automated processing
|
||||
- Audit trails
|
||||
- Compliance reporting
|
||||
|
||||
## CLI Interface
|
||||
|
||||
### Commands
|
||||
|
||||
```bash
|
||||
# Snapshot management
|
||||
python cli.py snapshot --env <env> --label <label> [--systems <hosts>]
|
||||
python cli.py list-snapshots [--env <env>]
|
||||
python cli.py delete-snapshot <snapshot-id>
|
||||
|
||||
# Comparison operations
|
||||
python cli.py compare <snapshot1> <snapshot2> [--output <comparison-id>]
|
||||
python cli.py list-comparisons
|
||||
python cli.py show-comparison <comparison-id>
|
||||
|
||||
# Reporting
|
||||
python cli.py report --comparison <comparison-id> --format <format> [--output <file>]
|
||||
python cli.py export --comparison <comparison-id> --format <format>
|
||||
|
||||
# Configuration
|
||||
python cli.py config --show
|
||||
python cli.py config --set <key> <value>
|
||||
```
|
||||
|
||||
### Options
|
||||
|
||||
- `--env`: Target environment (production, staging, development)
|
||||
- `--systems`: Comma-separated list of target systems
|
||||
- `--parallel`: Number of parallel collection threads
|
||||
- `--timeout`: Collection timeout in seconds
|
||||
- `--verbose`: Enable verbose output
|
||||
- `--dry-run`: Preview actions without execution
|
||||
|
||||
## Configuration
|
||||
|
||||
### Collector Configuration (`config/collectors.yaml`)
|
||||
|
||||
```yaml
|
||||
collectors:
|
||||
mounts:
|
||||
enabled: true
|
||||
timeout: 30
|
||||
exclude_patterns:
|
||||
- "/proc/*"
|
||||
- "/sys/*"
|
||||
|
||||
services:
|
||||
enabled: true
|
||||
include_disabled: false
|
||||
service_manager: systemd
|
||||
|
||||
disk_usage:
|
||||
enabled: true
|
||||
max_depth: 3
|
||||
exclude_paths:
|
||||
- "/tmp"
|
||||
- "/var/log"
|
||||
```
|
||||
|
||||
### Validation Rules (`config/validators.yaml`)
|
||||
|
||||
```yaml
|
||||
rules:
|
||||
critical_services:
|
||||
- sshd
|
||||
- systemd
|
||||
- network
|
||||
|
||||
filesystem_thresholds:
|
||||
warning: 80
|
||||
critical: 95
|
||||
|
||||
network_changes:
|
||||
allow_new_interfaces: false
|
||||
allow_route_changes: false
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
### Complete Migration Validation Workflow
|
||||
|
||||
```bash
|
||||
# 1. Pre-migration snapshot
|
||||
python cli.py snapshot --env production --label "migration-pre-20241201" \
|
||||
--systems web01,web02,db01,lb01 --parallel 4
|
||||
|
||||
# 2. Execute migration process
|
||||
# ... migration steps ...
|
||||
|
||||
# 3. Post-migration snapshot
|
||||
python cli.py snapshot --env production --label "migration-post-20241201" \
|
||||
--systems web01,web02,db01,lb01 --parallel 4
|
||||
|
||||
# 4. Compare snapshots
|
||||
python cli.py compare migration-pre-20241201 migration-post-20241201 \
|
||||
--output migration-dec2024
|
||||
|
||||
# 5. Generate reports
|
||||
python cli.py report --comparison migration-dec2024 --format html \
|
||||
--output migration_validation_report.html
|
||||
|
||||
python cli.py report --comparison migration-dec2024 --format json \
|
||||
--output migration_validation_data.json
|
||||
```
|
||||
|
||||
### Automated Validation in CI/CD
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# CI/CD validation script
|
||||
|
||||
ENVIRONMENT=$1
|
||||
SNAPSHOT_LABEL="ci-${BUILD_NUMBER}"
|
||||
|
||||
# Create snapshot
|
||||
python cli.py snapshot --env $ENVIRONMENT --label $SNAPSHOT_LABEL
|
||||
|
||||
# Compare with baseline
|
||||
python cli.py compare baseline-$ENVIRONMENT $SNAPSHOT_LABEL --output ci-$BUILD_NUMBER
|
||||
|
||||
# Generate report
|
||||
python cli.py report --comparison ci-$BUILD_NUMBER --format html
|
||||
|
||||
# Check for critical changes
|
||||
if python cli.py check-critical --comparison ci-$BUILD_NUMBER; then
|
||||
echo "Migration validation passed"
|
||||
exit 0
|
||||
else
|
||||
echo "Critical changes detected - review required"
|
||||
exit 1
|
||||
fi
|
||||
```
|
||||
|
||||
## Security Considerations
|
||||
|
||||
- SSH key-based authentication only
|
||||
- Encrypted snapshot storage
|
||||
- Access control for sensitive data
|
||||
- Audit logging of all operations
|
||||
- Data sanitization and filtering
|
||||
|
||||
## Performance Optimization
|
||||
|
||||
- Parallel data collection
|
||||
- Incremental snapshots
|
||||
- Compressed storage
|
||||
- Memory-efficient processing
|
||||
- Timeout handling
|
||||
|
||||
## Monitoring and Logging
|
||||
|
||||
- Comprehensive logging to `logs/validation.log`
|
||||
- Performance metrics collection
|
||||
- Error tracking and alerting
|
||||
- Audit trail generation
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
**Connection Failures:**
|
||||
```bash
|
||||
# Check SSH connectivity
|
||||
ssh -i ~/.ssh/id_rsa user@target-host
|
||||
|
||||
# Verify Python availability
|
||||
python cli.py --test-connection --systems target-host
|
||||
```
|
||||
|
||||
**Collection Timeouts:**
|
||||
```bash
|
||||
# Increase timeout
|
||||
python cli.py snapshot --timeout 300 --systems slow-host
|
||||
|
||||
# Check system load
|
||||
ssh user@target-host uptime
|
||||
```
|
||||
|
||||
**Permission Errors:**
|
||||
```bash
|
||||
# Verify sudo access
|
||||
ssh user@target-host sudo -l
|
||||
|
||||
# Check file permissions
|
||||
ssh user@target-host ls -la /etc/
|
||||
```
|
||||
|
||||
## Development
|
||||
|
||||
### Adding New Collectors
|
||||
|
||||
1. Create collector module in `collectors/`
|
||||
2. Implement collection logic
|
||||
3. Add configuration schema
|
||||
4. Update CLI interface
|
||||
5. Add unit tests
|
||||
|
||||
### Custom Validation Rules
|
||||
|
||||
1. Define rules in `config/validators.yaml`
|
||||
2. Implement validation logic in `validators/`
|
||||
3. Update report generation
|
||||
4. Test with sample data
|
||||
|
||||
## Contributing
|
||||
|
||||
1. Follow existing code structure and naming conventions
|
||||
2. Add comprehensive tests for new functionality
|
||||
3. Update documentation for API changes
|
||||
4. Ensure backward compatibility
|
||||
|
||||
## License
|
||||
|
||||
Enterprise Internal Use Only
|
||||
During a data center migration, a platform team can collect baseline state before cutover, collect the same evidence after DNS or workload migration, and attach the diff to the change ticket. The framework gives reviewers a compact signal on whether the host is ready for production traffic.
|
||||
|
||||
Binary file not shown.
@@ -29,8 +29,8 @@ class MigrationValidator:
|
||||
|
||||
def __init__(self, verbose: bool = False):
|
||||
self.verbose = verbose
|
||||
self.setup_logging()
|
||||
self.ensure_directories()
|
||||
self.setup_logging()
|
||||
|
||||
def setup_logging(self):
|
||||
"""Configure logging."""
|
||||
@@ -97,13 +97,23 @@ class MigrationValidator:
|
||||
|
||||
def load_snapshot(self, snapshot_id: str) -> Dict[str, Any]:
|
||||
"""Load snapshot from disk."""
|
||||
snapshot_file = SNAPSHOTS_DIR / f"{snapshot_id}.json"
|
||||
snapshot_path = Path(snapshot_id)
|
||||
snapshot_file = snapshot_path if snapshot_path.exists() else SNAPSHOTS_DIR / f"{snapshot_id}.json"
|
||||
if not snapshot_file.exists():
|
||||
raise FileNotFoundError(f"Snapshot {snapshot_id} not found")
|
||||
|
||||
with open(snapshot_file, 'r') as f:
|
||||
return json.load(f)
|
||||
|
||||
def collect_to_file(self, output_file: str, systems: List[str]) -> str:
|
||||
"""Collect a snapshot and write it to an explicit file path."""
|
||||
snapshot = self.collect_system_data(systems)
|
||||
with open(output_file, 'w') as f:
|
||||
json.dump(snapshot, f, indent=2)
|
||||
f.write("\n")
|
||||
self.logger.info(f"Snapshot written: {output_file}")
|
||||
return output_file
|
||||
|
||||
def create_snapshot(self, env: str, label: str, systems: List[str]) -> str:
|
||||
"""Create and save a system snapshot."""
|
||||
self.logger.info(f"Creating snapshot for environment: {env}, label: {label}")
|
||||
@@ -136,6 +146,27 @@ class MigrationValidator:
|
||||
self.logger.info(f"Comparison saved: {output_id}")
|
||||
return comparison
|
||||
|
||||
def compare_files(self, before_file: str, after_file: str, output_file: Optional[str] = None) -> Dict[str, Any]:
|
||||
"""Compare two explicit JSON snapshot files."""
|
||||
self.logger.info(f"Comparing files: {before_file} vs {after_file}")
|
||||
|
||||
before = self.load_snapshot(before_file)
|
||||
after = self.load_snapshot(after_file)
|
||||
comparison = compare.compare_snapshots(before, after)
|
||||
comparison["metadata"] = {
|
||||
"before": before_file,
|
||||
"after": after_file,
|
||||
"timestamp": datetime.now().isoformat()
|
||||
}
|
||||
|
||||
if output_file:
|
||||
with open(output_file, 'w') as f:
|
||||
json.dump(comparison, f, indent=2)
|
||||
f.write("\n")
|
||||
self.logger.info(f"Comparison written: {output_file}")
|
||||
|
||||
return comparison
|
||||
|
||||
def generate_report(self, comparison_id: str, format_type: str, output_file: Optional[str] = None) -> str:
|
||||
"""Generate a report from comparison results."""
|
||||
self.logger.info(f"Generating {format_type} report for comparison: {comparison_id}")
|
||||
@@ -169,14 +200,14 @@ def main():
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
# Create pre-migration snapshot
|
||||
python cli.py snapshot --env production --label pre-migration --systems web01,db01
|
||||
# Collect pre-migration snapshot
|
||||
python3 cli.py collect --output before.json --systems web01,db01
|
||||
|
||||
# Compare snapshots
|
||||
python cli.py compare pre-migration-snapshot post-migration-snapshot --output comparison_001
|
||||
# Compare snapshot files
|
||||
python3 cli.py compare before.json after.json --output diff.json
|
||||
|
||||
# Generate HTML report
|
||||
python cli.py report --comparison comparison_001 --format html
|
||||
python3 cli.py report --comparison comparison_001 --format html
|
||||
"""
|
||||
)
|
||||
|
||||
@@ -185,6 +216,11 @@ Examples:
|
||||
|
||||
subparsers = parser.add_subparsers(dest='command', help='Available commands')
|
||||
|
||||
# Collect command
|
||||
collect_parser = subparsers.add_parser('collect', help='Collect a system snapshot to a JSON file')
|
||||
collect_parser.add_argument('--output', required=True, help='Output JSON file')
|
||||
collect_parser.add_argument('--systems', default='localhost', help='Comma-separated list of systems')
|
||||
|
||||
# Snapshot command
|
||||
snapshot_parser = subparsers.add_parser('snapshot', help='Create system snapshot')
|
||||
snapshot_parser.add_argument('--env', required=True, help='Target environment')
|
||||
@@ -195,7 +231,7 @@ Examples:
|
||||
compare_parser = subparsers.add_parser('compare', help='Compare two snapshots')
|
||||
compare_parser.add_argument('snapshot1', help='First snapshot ID')
|
||||
compare_parser.add_argument('snapshot2', help='Second snapshot ID')
|
||||
compare_parser.add_argument('--output', required=True, help='Comparison output ID')
|
||||
compare_parser.add_argument('--output', help='Output comparison ID or JSON file')
|
||||
|
||||
# Report command
|
||||
report_parser = subparsers.add_parser('report', help='Generate report from comparison')
|
||||
@@ -217,7 +253,16 @@ Examples:
|
||||
validator = MigrationValidator(verbose=args.verbose)
|
||||
|
||||
try:
|
||||
if args.command == 'snapshot':
|
||||
if args.command == 'collect':
|
||||
systems = [system.strip() for system in args.systems.split(',') if system.strip()]
|
||||
if args.dry_run:
|
||||
print(f"DRY RUN: Would collect {systems} into {args.output}")
|
||||
return
|
||||
|
||||
output_file = validator.collect_to_file(args.output, systems)
|
||||
print(f"Snapshot written: {output_file}")
|
||||
|
||||
elif args.command == 'snapshot':
|
||||
systems = args.systems.split(',')
|
||||
if args.dry_run:
|
||||
print(f"DRY RUN: Would create snapshot for systems: {systems}")
|
||||
@@ -231,8 +276,16 @@ Examples:
|
||||
print(f"DRY RUN: Would compare {args.snapshot1} vs {args.snapshot2}")
|
||||
return
|
||||
|
||||
comparison = validator.compare_snapshots(args.snapshot1, args.snapshot2, args.output)
|
||||
print(f"Comparison completed: {args.output}")
|
||||
output = args.output
|
||||
if output and output.endswith('.json'):
|
||||
comparison = validator.compare_files(args.snapshot1, args.snapshot2, output)
|
||||
result = "PASS" if comparison.get("validation_results", {}).get("passed") else "FAIL"
|
||||
print(f"Comparison completed: {output} ({result})")
|
||||
else:
|
||||
output_id = output or datetime.now().strftime('%Y%m%d_%H%M%S')
|
||||
comparison = validator.compare_snapshots(args.snapshot1, args.snapshot2, output_id)
|
||||
result = "PASS" if comparison.get("validation_results", {}).get("passed") else "FAIL"
|
||||
print(f"Comparison completed: {output_id} ({result})")
|
||||
|
||||
elif args.command == 'report':
|
||||
if args.dry_run:
|
||||
@@ -267,4 +320,4 @@ Examples:
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,30 @@
|
||||
# Migration Validation Framework Architecture
|
||||
|
||||
## Components
|
||||
|
||||
- CLI: parses operator commands and coordinates workflows.
|
||||
- Collectors: gather mounts, services, and disk usage from target systems.
|
||||
- Snapshot files: JSON evidence used as immutable migration checkpoints.
|
||||
- Comparator: evaluates drift between before and after snapshots.
|
||||
- Reports: store JSON or HTML output for audit and review.
|
||||
|
||||
## Data Flow
|
||||
|
||||
```
|
||||
Operator
|
||||
-> python3 cli.py collect
|
||||
-> collectors over SSH
|
||||
-> before.json / after.json
|
||||
-> python3 cli.py compare
|
||||
-> diff.json with PASS/FAIL validation
|
||||
```
|
||||
|
||||
## Validation Flow
|
||||
|
||||
```
|
||||
before.json -> Comparator -> service checks
|
||||
after.json -> Comparator -> filesystem checks -> validation result
|
||||
-> mount checks
|
||||
```
|
||||
|
||||
The framework keeps collection and comparison separate so migration evidence can be reviewed, archived, and replayed without recollecting from production systems.
|
||||
@@ -0,0 +1,40 @@
|
||||
{
|
||||
"metadata": {
|
||||
"timestamp": "2026-04-29T03:40:00Z",
|
||||
"systems": ["web01"],
|
||||
"version": "1.0"
|
||||
},
|
||||
"data": {
|
||||
"web01": {
|
||||
"mounts": {
|
||||
"mounts": [
|
||||
{"device": "/dev/sda1", "mountpoint": "/", "fstype": "ext4", "options": "rw,relatime"},
|
||||
{"device": "/dev/sdb1", "mountpoint": "/var", "fstype": "xfs", "options": "rw,noatime"}
|
||||
],
|
||||
"usage": {
|
||||
"/": {"filesystem": "/dev/sda1", "use_percent": "62%"},
|
||||
"/var": {"filesystem": "/dev/sdb1", "use_percent": "94%"}
|
||||
},
|
||||
"timestamp": "2026-04-29T03:40:00Z"
|
||||
},
|
||||
"services": {
|
||||
"service_manager": "systemd",
|
||||
"services": [
|
||||
{"name": "sshd", "active_state": "failed", "sub_state": "failed"},
|
||||
{"name": "nginx", "active_state": "active", "sub_state": "running"},
|
||||
{"name": "node-exporter", "active_state": "active", "sub_state": "running"}
|
||||
],
|
||||
"timestamp": "2026-04-29T03:40:00Z"
|
||||
},
|
||||
"disk_usage": {
|
||||
"filesystem_usage": [
|
||||
{"filesystem": "/dev/sda1", "type": "ext4", "size": "80G", "used": "50G", "available": "30G", "use_percent": "62%", "mountpoint": "/"},
|
||||
{"filesystem": "/dev/sdb1", "type": "xfs", "size": "200G", "used": "188G", "available": "12G", "use_percent": "94%", "mountpoint": "/var"}
|
||||
],
|
||||
"directory_sizes": [{"path": "/var/lib/app", "size": "139G"}],
|
||||
"largest_files": [{"path": "/var/lib/app/import/archive.tar", "size": "42G"}],
|
||||
"timestamp": "2026-04-29T03:40:00Z"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,39 @@
|
||||
{
|
||||
"metadata": {
|
||||
"timestamp": "2026-04-29T01:15:00Z",
|
||||
"systems": ["web01"],
|
||||
"version": "1.0"
|
||||
},
|
||||
"data": {
|
||||
"web01": {
|
||||
"mounts": {
|
||||
"mounts": [
|
||||
{"device": "/dev/sda1", "mountpoint": "/", "fstype": "ext4", "options": "rw,relatime"},
|
||||
{"device": "/dev/sdb1", "mountpoint": "/var", "fstype": "xfs", "options": "rw,noatime"}
|
||||
],
|
||||
"usage": {
|
||||
"/": {"filesystem": "/dev/sda1", "use_percent": "61%"},
|
||||
"/var": {"filesystem": "/dev/sdb1", "use_percent": "68%"}
|
||||
},
|
||||
"timestamp": "2026-04-29T01:15:00Z"
|
||||
},
|
||||
"services": {
|
||||
"service_manager": "systemd",
|
||||
"services": [
|
||||
{"name": "sshd", "active_state": "active", "sub_state": "running"},
|
||||
{"name": "nginx", "active_state": "active", "sub_state": "running"}
|
||||
],
|
||||
"timestamp": "2026-04-29T01:15:00Z"
|
||||
},
|
||||
"disk_usage": {
|
||||
"filesystem_usage": [
|
||||
{"filesystem": "/dev/sda1", "type": "ext4", "size": "80G", "used": "49G", "available": "31G", "use_percent": "61%", "mountpoint": "/"},
|
||||
{"filesystem": "/dev/sdb1", "type": "xfs", "size": "200G", "used": "136G", "available": "64G", "use_percent": "68%", "mountpoint": "/var"}
|
||||
],
|
||||
"directory_sizes": [{"path": "/var/lib/app", "size": "84G"}],
|
||||
"largest_files": [],
|
||||
"timestamp": "2026-04-29T01:15:00Z"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,211 @@
|
||||
{
|
||||
"summary": {
|
||||
"total_systems": 1,
|
||||
"systems_with_changes": 1,
|
||||
"total_changes": 7,
|
||||
"changes_by_type": {
|
||||
"mounts": 2,
|
||||
"services": 2,
|
||||
"disk_usage": 3
|
||||
},
|
||||
"most_affected_systems": [
|
||||
[
|
||||
"web01",
|
||||
7
|
||||
]
|
||||
]
|
||||
},
|
||||
"differences": {
|
||||
"mounts": {
|
||||
"web01": {
|
||||
"added_mounts": [],
|
||||
"removed_mounts": [],
|
||||
"changed_mounts": [],
|
||||
"usage_changes": [
|
||||
{
|
||||
"mountpoint": "/",
|
||||
"before": {
|
||||
"filesystem": "/dev/sda1",
|
||||
"use_percent": "61%"
|
||||
},
|
||||
"after": {
|
||||
"filesystem": "/dev/sda1",
|
||||
"use_percent": "62%"
|
||||
}
|
||||
},
|
||||
{
|
||||
"mountpoint": "/var",
|
||||
"before": {
|
||||
"filesystem": "/dev/sdb1",
|
||||
"use_percent": "68%"
|
||||
},
|
||||
"after": {
|
||||
"filesystem": "/dev/sdb1",
|
||||
"use_percent": "94%"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"services": {
|
||||
"web01": {
|
||||
"added_services": [
|
||||
{
|
||||
"name": "node-exporter",
|
||||
"active_state": "active",
|
||||
"sub_state": "running"
|
||||
}
|
||||
],
|
||||
"removed_services": [],
|
||||
"status_changes": [
|
||||
{
|
||||
"name": "sshd",
|
||||
"before": {
|
||||
"active_state": "active",
|
||||
"sub_state": "running"
|
||||
},
|
||||
"after": {
|
||||
"active_state": "failed",
|
||||
"sub_state": "failed"
|
||||
}
|
||||
}
|
||||
],
|
||||
"configuration_changes": []
|
||||
}
|
||||
},
|
||||
"disk_usage": {
|
||||
"web01": {
|
||||
"filesystem_changes": [
|
||||
{
|
||||
"mountpoint": "/",
|
||||
"before": {
|
||||
"filesystem": "/dev/sda1",
|
||||
"type": "ext4",
|
||||
"size": "80G",
|
||||
"used": "49G",
|
||||
"available": "31G",
|
||||
"use_percent": "61%",
|
||||
"mountpoint": "/"
|
||||
},
|
||||
"after": {
|
||||
"filesystem": "/dev/sda1",
|
||||
"type": "ext4",
|
||||
"size": "80G",
|
||||
"used": "50G",
|
||||
"available": "30G",
|
||||
"use_percent": "62%",
|
||||
"mountpoint": "/"
|
||||
}
|
||||
},
|
||||
{
|
||||
"mountpoint": "/var",
|
||||
"before": {
|
||||
"filesystem": "/dev/sdb1",
|
||||
"type": "xfs",
|
||||
"size": "200G",
|
||||
"used": "136G",
|
||||
"available": "64G",
|
||||
"use_percent": "68%",
|
||||
"mountpoint": "/var"
|
||||
},
|
||||
"after": {
|
||||
"filesystem": "/dev/sdb1",
|
||||
"type": "xfs",
|
||||
"size": "200G",
|
||||
"used": "188G",
|
||||
"available": "12G",
|
||||
"use_percent": "94%",
|
||||
"mountpoint": "/var"
|
||||
}
|
||||
}
|
||||
],
|
||||
"directory_size_changes": [],
|
||||
"significant_usage_changes": [
|
||||
{
|
||||
"mountpoint": "/var",
|
||||
"change_percent": 26,
|
||||
"before": {
|
||||
"filesystem": "/dev/sdb1",
|
||||
"type": "xfs",
|
||||
"size": "200G",
|
||||
"used": "136G",
|
||||
"available": "64G",
|
||||
"use_percent": "68%",
|
||||
"mountpoint": "/var"
|
||||
},
|
||||
"after": {
|
||||
"filesystem": "/dev/sdb1",
|
||||
"type": "xfs",
|
||||
"size": "200G",
|
||||
"used": "188G",
|
||||
"available": "12G",
|
||||
"use_percent": "94%",
|
||||
"mountpoint": "/var"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"risk_assessment": {
|
||||
"overall_risk": "high",
|
||||
"risk_factors": [
|
||||
{
|
||||
"type": "service_failure",
|
||||
"description": "Service failed: sshd",
|
||||
"level": 3
|
||||
},
|
||||
{
|
||||
"type": "disk_usage_spike",
|
||||
"description": "Significant disk usage change: /var (26%)",
|
||||
"level": 2
|
||||
}
|
||||
],
|
||||
"critical_changes": [],
|
||||
"recommendations": [
|
||||
"Immediate review required - critical changes detected",
|
||||
"Consider rolling back migration if critical services are affected"
|
||||
]
|
||||
},
|
||||
"validation_results": {
|
||||
"passed": false,
|
||||
"checks": [
|
||||
{
|
||||
"name": "critical_services_running",
|
||||
"description": "Verify critical services remain operational",
|
||||
"passed": false,
|
||||
"details": [
|
||||
"Critical service sshd failed on web01"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "filesystem_integrity",
|
||||
"description": "Verify filesystem integrity maintained",
|
||||
"passed": true,
|
||||
"details": []
|
||||
},
|
||||
{
|
||||
"name": "no_critical_mounts_removed",
|
||||
"description": "Verify critical mount points remain",
|
||||
"passed": true,
|
||||
"details": []
|
||||
}
|
||||
],
|
||||
"failed_checks": [
|
||||
{
|
||||
"name": "critical_services_running",
|
||||
"description": "Verify critical services remain operational",
|
||||
"passed": false,
|
||||
"details": [
|
||||
"Critical service sshd failed on web01"
|
||||
]
|
||||
}
|
||||
],
|
||||
"result": "FAIL"
|
||||
},
|
||||
"metadata": {
|
||||
"before": "migration-validation-framework/examples/before.json",
|
||||
"after": "migration-validation-framework/examples/after.json",
|
||||
"timestamp": "2026-04-29T23:29:07.510774"
|
||||
}
|
||||
}
|
||||
Binary file not shown.
@@ -0,0 +1,19 @@
|
||||
# Scenario: Before/After Migration Comparison
|
||||
|
||||
## Description
|
||||
|
||||
Compare a pre-cutover host snapshot against a post-cutover snapshot and determine whether the migrated system is ready for production traffic.
|
||||
|
||||
## Commands
|
||||
|
||||
```bash
|
||||
cd migration-validation-framework
|
||||
python3 cli.py compare examples/before.json examples/after.json --output /tmp/migration-diff.json
|
||||
```
|
||||
|
||||
## Expected Result
|
||||
|
||||
- The command writes a JSON diff.
|
||||
- The result is `FAIL` because `sshd` is in a failed state after migration.
|
||||
- The risk assessment highlights the `/var` disk usage increase.
|
||||
- The remediation path is to restore `sshd` and either free space on `/var` or expand the filesystem before approving cutover.
|
||||
Binary file not shown.
@@ -37,9 +37,12 @@ class SnapshotComparator:
|
||||
# Compare each data type
|
||||
data_types = ["mounts", "services", "disk_usage"]
|
||||
|
||||
data1 = snapshot1.get("data", {})
|
||||
data2 = snapshot2.get("data", {})
|
||||
|
||||
for data_type in data_types:
|
||||
if data_type in snapshot1.get("data", {}) and data_type in snapshot2.get("data", {}):
|
||||
differences = self.compare_data_type(snapshot1["data"], snapshot2["data"], data_type)
|
||||
if self.data_type_exists(data1, data_type) or self.data_type_exists(data2, data_type):
|
||||
differences = self.compare_data_type(data1, data2, data_type)
|
||||
comparison["differences"][data_type] = differences
|
||||
|
||||
# Generate summary
|
||||
@@ -50,10 +53,15 @@ class SnapshotComparator:
|
||||
|
||||
# Validation results
|
||||
comparison["validation_results"] = self.validate_changes(comparison["differences"])
|
||||
comparison["validation_results"]["result"] = "PASS" if comparison["validation_results"]["passed"] else "FAIL"
|
||||
|
||||
logger.info("Snapshot comparison completed")
|
||||
return comparison
|
||||
|
||||
def data_type_exists(self, systems: Dict[str, Any], data_type: str) -> bool:
|
||||
"""Return true when at least one system has the requested collector data."""
|
||||
return any(data_type in system_data for system_data in systems.values())
|
||||
|
||||
def compare_data_type(self, data1: Dict[str, Any], data2: Dict[str, Any], data_type: str) -> Dict[str, Any]:
|
||||
"""Compare a specific data type between two snapshots."""
|
||||
differences = {}
|
||||
@@ -237,7 +245,7 @@ class SnapshotComparator:
|
||||
def generate_summary(self, differences: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Generate a summary of all differences."""
|
||||
summary = {
|
||||
"total_systems": len(differences),
|
||||
"total_systems": 0,
|
||||
"systems_with_changes": 0,
|
||||
"total_changes": 0,
|
||||
"changes_by_type": {},
|
||||
@@ -259,6 +267,8 @@ class SnapshotComparator:
|
||||
summary["changes_by_type"][data_type] += change_count
|
||||
summary["total_changes"] += change_count
|
||||
|
||||
summary["total_systems"] = len(system_change_counts)
|
||||
|
||||
# Count systems with changes
|
||||
summary["systems_with_changes"] = len([s for s in system_change_counts.values() if s > 0])
|
||||
|
||||
@@ -488,4 +498,4 @@ class SnapshotComparator:
|
||||
def compare_snapshots(snapshot1: Dict[str, Any], snapshot2: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Main comparison function."""
|
||||
comparator = SnapshotComparator()
|
||||
return comparator.compare_snapshots(snapshot1, snapshot2)
|
||||
return comparator.compare_snapshots(snapshot1, snapshot2)
|
||||
|
||||
@@ -0,0 +1,10 @@
|
||||
.PHONY: run test demo
|
||||
|
||||
run:
|
||||
docker compose up -d
|
||||
|
||||
test:
|
||||
docker compose config
|
||||
|
||||
demo:
|
||||
./scenarios/incident_simulation.sh comprehensive
|
||||
+40
-434
@@ -1,461 +1,67 @@
|
||||
# Observability Stack
|
||||
|
||||
A comprehensive monitoring and logging stack for enterprise infrastructure observability using the ELK (Elasticsearch, Logstash, Kibana) stack and Grafana. Includes sample data ingestion, alerting rules, and incident simulation scenarios.
|
||||
## Problem Statement
|
||||
|
||||
## Overview
|
||||
Operations teams need correlated logs, dashboards, and alert examples that make incidents observable before they become customer-facing outages. A stack that only starts containers is not enough; it also needs meaningful sample data and incident exercises.
|
||||
|
||||
The Observability Stack provides a complete monitoring solution with:
|
||||
## Solution Overview
|
||||
|
||||
- **Elasticsearch**: Distributed search and analytics engine for logs and metrics
|
||||
- **Logstash**: Data processing pipeline for log ingestion and transformation
|
||||
- **Kibana**: Visualization and exploration interface for Elasticsearch data
|
||||
- **Grafana**: Advanced metrics dashboarding and alerting platform
|
||||
- **Sample Logs**: Realistic log data for testing and demonstration
|
||||
- **Alerting**: Automated incident detection and notification rules
|
||||
- **Incident Simulation**: Scenarios for testing monitoring and response procedures
|
||||
This project defines a local observability environment with Elasticsearch, Logstash, Kibana, Grafana, Filebeat, alert rules, sample logs, and an incident simulation script. It is built to demonstrate practical monitoring workflows rather than a production-sized cluster.
|
||||
|
||||
## Architecture
|
||||
## Architecture Overview
|
||||
|
||||
```
|
||||
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
||||
│ Log Sources │ │ Logstash │ │ Elasticsearch │
|
||||
│ (Applications │───►│ (Ingestion & │───►│ (Storage & │
|
||||
│ / Systems) │ │ Processing) │ │ Analytics) │
|
||||
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
||||
│ │ │
|
||||
▼ ▼ ▼
|
||||
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
||||
│ Alerting │ │ Kibana │ │ Grafana │
|
||||
│ Rules │ │ (Dashboards & │ │ (Metrics & │
|
||||
│ │ │ Exploration) │ │ Dashboards) │
|
||||
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
||||
Application/System Logs -> Filebeat -> Logstash -> Elasticsearch -> Kibana
|
||||
|
|
||||
v
|
||||
Grafana
|
||||
|
||||
Incident Scenario -> Sample Logs -> Alert Rules -> Operator Review
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
Core components:
|
||||
|
||||
### Prerequisites
|
||||
- `docker-compose.yml` defines the observability services.
|
||||
- `alerting/alert_rules.yml` records alert intent and severity.
|
||||
- `logs/` contains representative operational logs.
|
||||
- `scenarios/incident_simulation.sh` emits incident activity.
|
||||
- `examples/` contains sample alert and log outputs.
|
||||
|
||||
- Docker and Docker Compose
|
||||
- At least 4GB RAM available
|
||||
- Ports 5601 (Kibana), 9200 (Elasticsearch), 3000 (Grafana) available
|
||||
|
||||
### Setup
|
||||
## How to Run
|
||||
|
||||
```bash
|
||||
cd observability-stack
|
||||
|
||||
# Start the observability stack
|
||||
docker-compose up -d
|
||||
# Validate the compose model.
|
||||
make test
|
||||
|
||||
# Wait for services to be ready (may take 2-3 minutes)
|
||||
sleep 180
|
||||
# Start the stack.
|
||||
make run
|
||||
|
||||
# Verify services are running
|
||||
curl -X GET "localhost:9200/_cluster/health?pretty"
|
||||
curl -X GET "localhost:5601/api/status"
|
||||
curl -X GET "localhost:3000/api/health"
|
||||
# Run the incident simulation.
|
||||
make demo
|
||||
|
||||
# Stop the stack.
|
||||
docker compose down
|
||||
```
|
||||
|
||||
### Access Interfaces
|
||||
When running locally:
|
||||
|
||||
- **Kibana**: http://localhost:5601 (admin/elastic)
|
||||
- **Grafana**: http://localhost:3000 (admin/admin)
|
||||
- **Elasticsearch**: http://localhost:9200
|
||||
- Kibana: `http://localhost:5601`
|
||||
- Grafana: `http://localhost:3000`
|
||||
- Elasticsearch: `http://localhost:9200`
|
||||
|
||||
## Project Structure
|
||||
## Example Output
|
||||
|
||||
```
|
||||
observability-stack/
|
||||
├── docker-compose.yml # Service orchestration
|
||||
├── logstash/ # Logstash configuration
|
||||
│ ├── pipeline/ # Processing pipelines
|
||||
│ └── config/ # Logstash settings
|
||||
├── elasticsearch/ # Elasticsearch configuration
|
||||
│ └── config/ # Cluster settings
|
||||
├── kibana/ # Kibana configuration
|
||||
│ └── config/ # Dashboard settings
|
||||
├── grafana/ # Grafana configuration
|
||||
│ ├── provisioning/ # Dashboards and datasources
|
||||
│ └── dashboards/ # Dashboard definitions
|
||||
├── logs/ # Sample log data
|
||||
│ └── sample.log # Realistic application logs
|
||||
├── alerting/ # Alert configuration
|
||||
│ └── alert_rules.yml # Alert definitions
|
||||
├── scenarios/ # Incident simulation
|
||||
│ └── incident_simulation.sh # Simulation scripts
|
||||
└── README.md
|
||||
```text
|
||||
[2026-04-29 04:18:23] WARN Database connection pool nearing capacity
|
||||
[2026-04-29 04:18:28] ERROR Database connection pool exhausted
|
||||
[2026-04-29 04:18:33] ERROR Database query timeout occurred
|
||||
[2026-04-29 04:18:44] INFO Database connections restored
|
||||
```
|
||||
|
||||
## Services Configuration
|
||||
Additional examples are available in [examples/alert-output.txt](examples/alert-output.txt) and [examples/sample-log.txt](examples/sample-log.txt).
|
||||
|
||||
### Elasticsearch
|
||||
## Real-World Use Case
|
||||
|
||||
**Configuration**: `elasticsearch/config/elasticsearch.yml`
|
||||
|
||||
Key settings:
|
||||
- Single-node cluster for development
|
||||
- Memory limits and heap sizing
|
||||
- Security enabled with basic authentication
|
||||
- CORS enabled for Kibana access
|
||||
|
||||
**Data Indices**:
|
||||
- `logs-*`: Application and system logs
|
||||
- `metrics-*`: System and application metrics
|
||||
- `alerts-*`: Alert and incident data
|
||||
|
||||
### Logstash
|
||||
|
||||
**Pipelines**: `logstash/pipeline/`
|
||||
|
||||
- **apache_logs**: Apache/Nginx access log processing
|
||||
- **system_logs**: System log parsing and enrichment
|
||||
- **application_logs**: Custom application log processing
|
||||
- **metrics_pipeline**: Metrics data processing
|
||||
|
||||
**Input Sources**:
|
||||
- Filebeat agents
|
||||
- TCP/UDP syslog inputs
|
||||
- HTTP endpoints for metrics
|
||||
- Docker container logs
|
||||
|
||||
### Kibana
|
||||
|
||||
**Dashboards**:
|
||||
- Log analysis dashboard
|
||||
- System metrics overview
|
||||
- Application performance dashboard
|
||||
- Security events dashboard
|
||||
|
||||
**Saved Objects**:
|
||||
- Index patterns for log data
|
||||
- Visualizations for common metrics
|
||||
- Search queries for troubleshooting
|
||||
|
||||
### Grafana
|
||||
|
||||
**Data Sources**:
|
||||
- Elasticsearch for logs and metrics
|
||||
- Prometheus (if available)
|
||||
- InfluxDB for time-series data
|
||||
|
||||
**Dashboards**:
|
||||
- Infrastructure overview
|
||||
- Application performance
|
||||
- System resources
|
||||
- Custom business metrics
|
||||
|
||||
## Log Ingestion
|
||||
|
||||
### Sample Data
|
||||
|
||||
The stack includes realistic sample logs for testing:
|
||||
|
||||
```bash
|
||||
# Ingest sample logs
|
||||
curl -X POST "localhost:8080" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d @logs/sample.log
|
||||
```
|
||||
|
||||
### Log Formats Supported
|
||||
|
||||
- **Apache/Nginx**: Combined log format
|
||||
- **Syslog**: RFC 3164/5424 compliant
|
||||
- **JSON**: Structured application logs
|
||||
- **Custom**: Configurable parsing rules
|
||||
|
||||
### Data Enrichment
|
||||
|
||||
Logstash pipelines add:
|
||||
- GeoIP location data
|
||||
- User agent parsing
|
||||
- Timestamp normalization
|
||||
- Host metadata enrichment
|
||||
|
||||
## Alerting and Monitoring
|
||||
|
||||
### Alert Rules
|
||||
|
||||
Located in `alerting/alert_rules.yml`:
|
||||
|
||||
```yaml
|
||||
alert_rules:
|
||||
- name: "High CPU Usage"
|
||||
condition: "cpu_usage > 90"
|
||||
duration: "5m"
|
||||
severity: "critical"
|
||||
channels: ["email", "slack"]
|
||||
|
||||
- name: "Disk Space Low"
|
||||
condition: "disk_usage > 85"
|
||||
duration: "10m"
|
||||
severity: "warning"
|
||||
channels: ["email"]
|
||||
|
||||
- name: "Service Down"
|
||||
condition: "service_status == 'down'"
|
||||
duration: "2m"
|
||||
severity: "critical"
|
||||
channels: ["email", "pagerduty"]
|
||||
```
|
||||
|
||||
### Alert Channels
|
||||
|
||||
- **Email**: SMTP-based notifications
|
||||
- **Slack**: Real-time messaging
|
||||
- **PagerDuty**: Incident management integration
|
||||
- **Webhook**: Custom HTTP endpoints
|
||||
|
||||
## Incident Simulation
|
||||
|
||||
### Available Scenarios
|
||||
|
||||
```bash
|
||||
cd scenarios
|
||||
|
||||
# Simulate disk space exhaustion
|
||||
./incident_simulation.sh --type disk-full --severity critical
|
||||
|
||||
# Simulate service failure
|
||||
./incident_simulation.sh --type service-down --service nginx
|
||||
|
||||
# Simulate network latency
|
||||
./incident_simulation.sh --type network-latency --delay 500ms
|
||||
|
||||
# Simulate high CPU usage
|
||||
./incident_simulation.sh --type high-cpu --cores 4
|
||||
```
|
||||
|
||||
### Scenario Types
|
||||
|
||||
- **disk-full**: Filesystem capacity exhaustion
|
||||
- **service-down**: Application service failures
|
||||
- **network-latency**: Network performance degradation
|
||||
- **high-cpu**: CPU utilization spikes
|
||||
- **memory-leak**: Memory consumption growth
|
||||
- **log-flood**: Excessive log generation
|
||||
|
||||
## Dashboards and Visualization
|
||||
|
||||
### Kibana Dashboards
|
||||
|
||||
Pre-configured dashboards for:
|
||||
|
||||
1. **Log Analysis**
|
||||
- Log volume over time
|
||||
- Error rate trends
|
||||
- Top error messages
|
||||
- Geographic request distribution
|
||||
|
||||
2. **System Monitoring**
|
||||
- CPU and memory usage
|
||||
- Disk I/O statistics
|
||||
- Network traffic
|
||||
- System load averages
|
||||
|
||||
3. **Application Performance**
|
||||
- Response time distributions
|
||||
- Request rate metrics
|
||||
- Error percentages
|
||||
- User session analytics
|
||||
|
||||
### Grafana Dashboards
|
||||
|
||||
Advanced visualization panels:
|
||||
|
||||
- **Infrastructure Overview**: Multi-system resource usage
|
||||
- **Application Metrics**: Custom business KPIs
|
||||
- **Alert Status**: Active alerts and trends
|
||||
- **Capacity Planning**: Resource utilization forecasting
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### Elasticsearch APIs
|
||||
|
||||
```bash
|
||||
# Cluster health
|
||||
GET /_cluster/health
|
||||
|
||||
# Index statistics
|
||||
GET /_cat/indices?v
|
||||
|
||||
# Search logs
|
||||
GET /logs-*/_search
|
||||
{
|
||||
"query": {
|
||||
"match": {
|
||||
"message": "ERROR"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Kibana APIs
|
||||
|
||||
```bash
|
||||
# Get dashboard list
|
||||
GET /api/saved_objects/_find?type=dashboard
|
||||
|
||||
# Export visualizations
|
||||
GET /api/saved_objects/visualization/{id}
|
||||
```
|
||||
|
||||
### Grafana APIs
|
||||
|
||||
```bash
|
||||
# Get dashboard list
|
||||
GET /api/search?query=*
|
||||
|
||||
# Alert status
|
||||
GET /api/alerts
|
||||
```
|
||||
|
||||
## Configuration Management
|
||||
|
||||
### Environment Variables
|
||||
|
||||
```bash
|
||||
# Elasticsearch
|
||||
ES_JAVA_OPTS="-Xms1g -Xmx1g"
|
||||
ELASTIC_PASSWORD="elastic"
|
||||
|
||||
# Logstash
|
||||
LS_JAVA_OPTS="-Xms512m -Xmx512m"
|
||||
|
||||
# Grafana
|
||||
GF_SECURITY_ADMIN_PASSWORD="admin"
|
||||
```
|
||||
|
||||
### Scaling Configuration
|
||||
|
||||
For production deployment:
|
||||
|
||||
```yaml
|
||||
version: '3.8'
|
||||
services:
|
||||
elasticsearch:
|
||||
deploy:
|
||||
replicas: 3
|
||||
resources:
|
||||
limits:
|
||||
memory: 4G
|
||||
cpus: '2.0'
|
||||
```
|
||||
|
||||
## Security Considerations
|
||||
|
||||
### Authentication
|
||||
|
||||
- Elasticsearch basic authentication enabled
|
||||
- Grafana admin credentials configured
|
||||
- Kibana anonymous access disabled
|
||||
|
||||
### Network Security
|
||||
|
||||
- Services bound to localhost only
|
||||
- Internal network for service communication
|
||||
- TLS encryption for external access (production)
|
||||
|
||||
### Data Protection
|
||||
|
||||
- Elasticsearch encryption at rest
|
||||
- Log data retention policies
|
||||
- Backup and recovery procedures
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
**Elasticsearch Won't Start:**
|
||||
```bash
|
||||
# Check memory allocation
|
||||
docker-compose logs elasticsearch
|
||||
|
||||
# Verify Java heap settings
|
||||
docker-compose exec elasticsearch ps aux
|
||||
```
|
||||
|
||||
**Logstash Pipeline Errors:**
|
||||
```bash
|
||||
# Check pipeline configuration
|
||||
docker-compose logs logstash
|
||||
|
||||
# Validate pipeline syntax
|
||||
docker-compose exec logstash logstash -t -f /usr/share/logstash/pipeline/
|
||||
```
|
||||
|
||||
**Kibana Connection Issues:**
|
||||
```bash
|
||||
# Verify Elasticsearch connectivity
|
||||
curl -u elastic:elastic "localhost:9200/_cluster/health"
|
||||
|
||||
# Check Kibana logs
|
||||
docker-compose logs kibana
|
||||
```
|
||||
|
||||
### Performance Tuning
|
||||
|
||||
**Elasticsearch:**
|
||||
- Increase heap size for larger datasets
|
||||
- Configure shard allocation
|
||||
- Enable index optimization
|
||||
|
||||
**Logstash:**
|
||||
- Adjust worker threads
|
||||
- Configure batch sizes
|
||||
- Enable persistent queues
|
||||
|
||||
**Grafana:**
|
||||
- Configure query caching
|
||||
- Set dashboard refresh intervals
|
||||
- Optimize panel queries
|
||||
|
||||
## Development and Testing
|
||||
|
||||
### Adding New Dashboards
|
||||
|
||||
1. Create dashboard JSON in `grafana/dashboards/`
|
||||
2. Update provisioning configuration
|
||||
3. Restart Grafana service
|
||||
|
||||
### Custom Alert Rules
|
||||
|
||||
1. Define rules in `alerting/alert_rules.yml`
|
||||
2. Update alerting configuration
|
||||
3. Test rules with simulation scenarios
|
||||
|
||||
### Log Pipeline Development
|
||||
|
||||
1. Add pipeline configuration in `logstash/pipeline/`
|
||||
2. Test with sample data
|
||||
3. Validate parsing with Kibana
|
||||
|
||||
## Backup and Recovery
|
||||
|
||||
### Data Backup
|
||||
|
||||
```bash
|
||||
# Elasticsearch snapshot
|
||||
curl -X PUT "localhost:9200/_snapshot/backup/snapshot_$(date +%Y%m%d_%H%M%S)" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"indices": "*"}'
|
||||
```
|
||||
|
||||
### Configuration Backup
|
||||
|
||||
```bash
|
||||
# Backup all configurations
|
||||
tar -czf backup_$(date +%Y%m%d).tar.gz \
|
||||
logstash/ elasticsearch/ kibana/ grafana/
|
||||
```
|
||||
|
||||
## Contributing
|
||||
|
||||
1. Follow existing configuration patterns
|
||||
2. Test changes with simulation scenarios
|
||||
3. Update documentation for new features
|
||||
4. Ensure backward compatibility
|
||||
|
||||
## License
|
||||
|
||||
Enterprise Internal Use Only
|
||||
A platform team can use this project to explain how logs move through an ingestion pipeline, how alert rules map to operational symptoms, and how incident exercises create evidence for on-call readiness reviews.
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
elasticsearch:
|
||||
image: docker.elastic.co/elasticsearch/elasticsearch:8.11.0
|
||||
@@ -119,4 +117,4 @@ networks:
|
||||
driver: bridge
|
||||
ipam:
|
||||
config:
|
||||
- subnet: 172.25.0.0/16
|
||||
- subnet: 172.25.0.0/16
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
# Observability Stack Architecture
|
||||
|
||||
## Components
|
||||
|
||||
- Filebeat: tails sample and container logs.
|
||||
- Logstash: receives and processes log events.
|
||||
- Elasticsearch: stores searchable observability data.
|
||||
- Kibana: supports log exploration and dashboards.
|
||||
- Grafana: provides operational dashboards.
|
||||
- Alert rules: document symptoms, thresholds, and severity.
|
||||
- Incident simulation: generates controlled failure signals.
|
||||
|
||||
## Data Flow
|
||||
|
||||
```
|
||||
Log source -> Filebeat -> Logstash -> Elasticsearch -> Kibana
|
||||
|
|
||||
v
|
||||
Grafana
|
||||
```
|
||||
|
||||
Incident exercises follow this flow:
|
||||
|
||||
```
|
||||
Operator -> incident_simulation.sh -> logs/incident_simulation.log -> Filebeat -> Logstash -> alerts/dashboards
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
This is a local demonstration stack, not a production Elasticsearch deployment. A production version would add dedicated nodes, TLS, secret management, retention policies, index lifecycle management, and external alert delivery.
|
||||
@@ -0,0 +1,4 @@
|
||||
2026-04-29T04:19:00Z alert=database_connection_pool_exhausted severity=critical service=checkout-api host=app-web-02 value=100 threshold=95 status=firing
|
||||
2026-04-29T04:19:30Z alert=api_error_rate_high severity=warning service=checkout-api host=app-web-02 value=7.8 threshold=5.0 status=firing
|
||||
2026-04-29T04:22:00Z alert=database_connection_pool_exhausted severity=critical service=checkout-api host=app-web-02 value=71 threshold=95 status=resolved
|
||||
2026-04-29T04:23:15Z alert=api_error_rate_high severity=warning service=checkout-api host=app-web-02 value=1.2 threshold=5.0 status=resolved
|
||||
@@ -0,0 +1,5 @@
|
||||
2026-04-29T04:18:21Z INFO service=checkout-api host=app-web-02 request_id=8f4b2 path=/checkout status=200 latency_ms=142
|
||||
2026-04-29T04:18:28Z WARN service=checkout-api host=app-web-02 event=db_pool_pressure active=92 max=100
|
||||
2026-04-29T04:18:33Z ERROR service=checkout-api host=app-web-02 event=db_timeout query=CreateOrder timeout_ms=5000 customer_tier=enterprise
|
||||
2026-04-29T04:18:39Z ERROR service=checkout-api host=app-web-02 event=payment_retry_exhausted order_id=ord-104288 provider=stripe
|
||||
2026-04-29T04:18:44Z INFO service=checkout-api host=app-web-02 event=recovery db_pool_active=48
|
||||
@@ -0,0 +1,21 @@
|
||||
# Scenario: Incident Simulation
|
||||
|
||||
## Description
|
||||
|
||||
Generate a controlled application and infrastructure incident so the logging pipeline, alert rules, and dashboards can be reviewed with realistic event timing.
|
||||
|
||||
## Commands
|
||||
|
||||
```bash
|
||||
cd observability-stack
|
||||
docker compose config
|
||||
./scenarios/incident_simulation.sh comprehensive
|
||||
tail -n 40 logs/incident_simulation.log
|
||||
```
|
||||
|
||||
## Expected Result
|
||||
|
||||
- The compose file validates successfully.
|
||||
- The simulation writes a sequence of CPU, memory, service, database, and application error events.
|
||||
- Alert examples indicate firing and resolved states.
|
||||
- Operators can trace incident progression through logs and dashboard queries.
|
||||
Reference in New Issue
Block a user