Compare commits
31 Commits
78bcfce43a
..
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 4e739c5c99 | |||
| 8cb92de06f | |||
| 1843796e92 | |||
| cd6830334b | |||
| e2624a7533 | |||
| 6475f76787 | |||
| e851568c8c | |||
| 8a7b7c5abc | |||
| 1636f46f81 | |||
| 5fc96348c5 | |||
| 89b7fabb96 | |||
| 2da5e8b46c | |||
| 452ff4fac1 | |||
| 5dde403ce3 | |||
| 61483c233f | |||
| a527022518 | |||
| 0d3905b8a1 | |||
| ca5a876d03 | |||
| deb12a0b4f | |||
| 02a51f72f9 | |||
| 2fd9c0b5ef | |||
| 75a11f7650 | |||
| 1e2db3e125 | |||
| c88428d092 | |||
| 65c7c82f0f | |||
| 76e24796bb | |||
| 5dd8c34952 | |||
| c42d8bfb8f | |||
| 9fb291f834 | |||
| 0a242e82b7 | |||
| 942e0a806f |
@@ -1,30 +0,0 @@
|
|||||||
name: ci
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches: [main]
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
validate:
|
|
||||||
runs-on: ubuntu
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v4
|
|
||||||
|
|
||||||
- name: Install deps
|
|
||||||
run: |
|
|
||||||
apt-get update
|
|
||||||
apt-get install -y python3 python3-pip ansible docker.io
|
|
||||||
|
|
||||||
- name: Python syntax check
|
|
||||||
run: |
|
|
||||||
find migration-validation-framework -name "*.py" -exec python3 -m py_compile {} \;
|
|
||||||
|
|
||||||
- name: Ansible syntax check
|
|
||||||
run: |
|
|
||||||
ansible-playbook -i enterprise-infra-simulator/inventory/hosts.ini \
|
|
||||||
--syntax-check enterprise-infra-simulator/playbooks/*.yml
|
|
||||||
|
|
||||||
- name: Docker compose validation
|
|
||||||
run: |
|
|
||||||
docker compose -f observability-stack/docker-compose.yml config
|
|
||||||
@@ -0,0 +1,39 @@
|
|||||||
|
---
|
||||||
|
name: lint
|
||||||
|
|
||||||
|
on:
|
||||||
|
pull_request:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
shell-yaml-ansible:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Check out repository
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Install lint tools
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install -y shellcheck yamllint python3-pip
|
||||||
|
python3 -m pip install --user ansible-lint
|
||||||
|
echo "$HOME/.local/bin" >> "$GITHUB_PATH"
|
||||||
|
|
||||||
|
- name: ShellCheck Bash scripts
|
||||||
|
run: |
|
||||||
|
find infra-run/scripts/bash -name '*.sh' -print0 | xargs -0 shellcheck -x \
|
||||||
|
-P infra-run/scripts/bash/disk-full \
|
||||||
|
-P infra-run/scripts/bash/gpfs \
|
||||||
|
-P infra-run/scripts/bash/veritas
|
||||||
|
|
||||||
|
- name: Python syntax checks
|
||||||
|
run: bash scripts/check-python.sh
|
||||||
|
|
||||||
|
- name: yamllint
|
||||||
|
run: yamllint .
|
||||||
|
|
||||||
|
- name: ansible-lint
|
||||||
|
continue-on-error: true
|
||||||
|
run: cd infra-run/ansible && ansible-lint playbooks roles
|
||||||
+3
-2
@@ -1,5 +1,6 @@
|
|||||||
**pycache**/
|
.venv/
|
||||||
|
__pycache__/
|
||||||
*.pyc
|
*.pyc
|
||||||
*.log
|
|
||||||
.env
|
.env
|
||||||
.DS_Store
|
.DS_Store
|
||||||
|
*.log
|
||||||
|
|||||||
@@ -0,0 +1,8 @@
|
|||||||
|
---
|
||||||
|
extends: default
|
||||||
|
|
||||||
|
rules:
|
||||||
|
line-length:
|
||||||
|
max: 140
|
||||||
|
truthy:
|
||||||
|
allowed-values: ["true", "false", "on"]
|
||||||
@@ -0,0 +1,126 @@
|
|||||||
|
# AGENTS.md
|
||||||
|
|
||||||
|
Guidance for Codex and other automated agents working in this repository.
|
||||||
|
|
||||||
|
## Purpose
|
||||||
|
|
||||||
|
This repository is a Linux/Unix infrastructure engineering portfolio. It shows practical operational work: incident response, troubleshooting, safe Bash tooling, Ansible hardening examples, storage workflows, runbooks, and platform/lab notes.
|
||||||
|
|
||||||
|
Treat it like internal operations tooling maintained by an infrastructure engineer. Preserve operational realism and avoid generic tutorial or template filler.
|
||||||
|
|
||||||
|
## Layout
|
||||||
|
|
||||||
|
- `infra-run/` - core operational tooling, Ansible, Bash scripts, runbooks, examples, and operations docs.
|
||||||
|
- `platform-projects/` - larger platform topics such as monitoring, storage, clustering, virtualization, and observability.
|
||||||
|
- `labs/` - experimental/lab environments for Kubernetes, Terraform, networking, CI/CD, Docker, and related work.
|
||||||
|
- `docs/codex/` - Codex workflow guidance, task templates, review checklist, and planning template.
|
||||||
|
- `scripts/` - repository validation helpers.
|
||||||
|
|
||||||
|
## Inspect First
|
||||||
|
|
||||||
|
Before editing, inspect the affected tree and nearby README files. Prefer:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
rg --files
|
||||||
|
git status --short
|
||||||
|
sed -n '1,220p' <file>
|
||||||
|
```
|
||||||
|
|
||||||
|
Check existing style before introducing new structure. Keep changes small and reviewable.
|
||||||
|
|
||||||
|
## Validation
|
||||||
|
|
||||||
|
Run the broad repo check when practical:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./scripts/validate-repo.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Focused checks:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./scripts/check-bash.sh
|
||||||
|
./scripts/check-ansible.sh
|
||||||
|
./scripts/check-python.sh
|
||||||
|
./scripts/check-docs.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Optional strict mode fails when optional tools are missing:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
STRICT=1 ./scripts/validate-repo.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Also run targeted checks for changed files, such as `bash -n`, `ansible-playbook --syntax-check`, or link checks when relevant.
|
||||||
|
|
||||||
|
## Bash Standards
|
||||||
|
|
||||||
|
- Use `#!/usr/bin/env bash`.
|
||||||
|
- Use `set -o errexit`, `set -o nounset`, and `set -o pipefail`.
|
||||||
|
- Validate input before using it.
|
||||||
|
- Handle missing commands clearly.
|
||||||
|
- Default to read-only or dry-run behavior.
|
||||||
|
- Require explicit `--execute` plus confirmation for destructive operations.
|
||||||
|
- Use clear `OK`, `WARNING`, and `CRITICAL` output.
|
||||||
|
- Exit codes: `0` OK, `1` operational issue, `2` invalid input or missing dependency.
|
||||||
|
- Keep scripts readable; separate discovery, pre-check, change, post-check, and reporting when it helps.
|
||||||
|
|
||||||
|
## Python Standards
|
||||||
|
|
||||||
|
- Use Python for parsing, reporting, and structured operational tooling where it adds value over Bash.
|
||||||
|
- Keep Python tools read-only by default.
|
||||||
|
- Prefer the Python standard library.
|
||||||
|
- Avoid frameworks and unnecessary abstractions.
|
||||||
|
- Use clear operational output and meaningful exit codes.
|
||||||
|
- Keep tools small, focused, and easy to validate.
|
||||||
|
|
||||||
|
## Ansible Standards
|
||||||
|
|
||||||
|
- Keep playbooks short and roles simple.
|
||||||
|
- Prefer modules over `shell` or `command`.
|
||||||
|
- Use `shell` or `command` only when the module set cannot express the operation, and document why if risk is not obvious.
|
||||||
|
- Preserve check-mode and diff-mode friendliness where possible.
|
||||||
|
- Use handlers, tags, defaults, and validation tasks when they clarify operations.
|
||||||
|
- Keep inventory under `inventory/hosts.yml`, `group_vars/`, and `host_vars/`.
|
||||||
|
- Do not present selected hardening examples as complete compliance certification.
|
||||||
|
|
||||||
|
## Documentation Standards
|
||||||
|
|
||||||
|
- Explain what exists, what is planned, and what is intentionally not supported.
|
||||||
|
- Prefer runbook style: scope, pre-checks, execution guardrails, rollback thinking, post-checks, and evidence.
|
||||||
|
- Avoid marketing language, fake enterprise wording, and tutorial bloat.
|
||||||
|
- Update README files and `CHANGELOG.md` when adding meaningful behavior or structure.
|
||||||
|
|
||||||
|
## Safety Rules
|
||||||
|
|
||||||
|
- Do not run destructive commands.
|
||||||
|
- Do not rename large directories unless the benefit is clear and low-risk.
|
||||||
|
- Do not hide validation failures.
|
||||||
|
- Do not claim live production validation for sanitized examples.
|
||||||
|
- Do not add secrets, real hostnames, customer identifiers, or private infrastructure details.
|
||||||
|
- Do not turn placeholders into fake completed projects.
|
||||||
|
|
||||||
|
## PR and Review Expectations
|
||||||
|
|
||||||
|
- State the operational risk of the change.
|
||||||
|
- Include commands run and whether tools were missing.
|
||||||
|
- Review scripts for dry-run behavior, input validation, dependency handling, and rollback path.
|
||||||
|
- Review Ansible for idempotency, check-mode behavior, inventory targeting, tags, handlers, and module choice.
|
||||||
|
- Keep diffs focused.
|
||||||
|
|
||||||
|
## Definition of Done
|
||||||
|
|
||||||
|
- The change preserves the repository intent.
|
||||||
|
- Relevant docs are updated.
|
||||||
|
- Changed Bash scripts pass `bash -n`.
|
||||||
|
- Available validation helpers were run.
|
||||||
|
- Missing optional tools are reported.
|
||||||
|
- Any remaining risk or follow-up is documented.
|
||||||
|
|
||||||
|
## Do Not
|
||||||
|
|
||||||
|
- Do not add an "ultimate DevOps template" structure.
|
||||||
|
- Do not replace working simple Bash with unnecessary abstractions.
|
||||||
|
- Do not make examples appear production-certified.
|
||||||
|
- Do not add destructive behavior without `--execute`, confirmation, and clear rollback notes.
|
||||||
|
- Do not delete useful content unless it is clearly duplicate, broken, or misleading.
|
||||||
-192
@@ -1,192 +0,0 @@
|
|||||||
# AI Context File - Portfolio Expansion Guide
|
|
||||||
|
|
||||||
## Portfolio Overview
|
|
||||||
This is a comprehensive enterprise Linux infrastructure portfolio demonstrating advanced engineering skills across three main domains:
|
|
||||||
1. **Enterprise Infrastructure Simulator** - Ansible-based container infrastructure automation
|
|
||||||
2. **Migration Validation Framework** - Python CLI for system migration validation
|
|
||||||
3. **Observability Stack** - ELK + Grafana monitoring platform
|
|
||||||
|
|
||||||
## Current Architecture
|
|
||||||
|
|
||||||
### Enterprise Infrastructure Simulator
|
|
||||||
**Technology Stack**: Ansible, Docker Compose, Bash
|
|
||||||
**Key Components**:
|
|
||||||
- Container-based Linux node simulation
|
|
||||||
- Ansible playbooks for provisioning, patching, hardening, decommissioning
|
|
||||||
- Operational scripts for scaling and failure simulation
|
|
||||||
- Multi-group inventory with realistic enterprise structure
|
|
||||||
|
|
||||||
**Expansion Opportunities**:
|
|
||||||
- Add Kubernetes support for container orchestration
|
|
||||||
- Implement multi-cloud deployment (AWS, Azure, GCP)
|
|
||||||
- Add Terraform integration for infrastructure provisioning
|
|
||||||
- Create custom Ansible modules for enterprise-specific tasks
|
|
||||||
- Implement backup and disaster recovery procedures
|
|
||||||
|
|
||||||
### Migration Validation Framework
|
|
||||||
**Technology Stack**: Python 3.8+, HTML/CSS/JavaScript
|
|
||||||
**Key Components**:
|
|
||||||
- CLI application with snapshot/compare/report commands
|
|
||||||
- Modular collectors (mounts, services, disk usage)
|
|
||||||
- Intelligent comparison engine with drift detection
|
|
||||||
- Interactive HTML reporting with Bootstrap styling
|
|
||||||
|
|
||||||
**Expansion Opportunities**:
|
|
||||||
- Add database migration validation (MySQL, PostgreSQL, MongoDB)
|
|
||||||
- Implement cloud migration support (AWS, Azure)
|
|
||||||
- Add performance benchmarking capabilities
|
|
||||||
- Create REST API for integration with CI/CD pipelines
|
|
||||||
- Implement machine learning for change prediction
|
|
||||||
- Add compliance validation (PCI-DSS, HIPAA, GDPR)
|
|
||||||
|
|
||||||
### Observability Stack
|
|
||||||
**Technology Stack**: ELK Stack, Grafana, Docker Compose
|
|
||||||
**Key Components**:
|
|
||||||
- Elasticsearch, Logstash, Kibana, Grafana
|
|
||||||
- Filebeat for log collection
|
|
||||||
- Comprehensive alerting rules
|
|
||||||
- Incident simulation framework
|
|
||||||
- Sample logs for testing
|
|
||||||
|
|
||||||
**Expansion Opportunities**:
|
|
||||||
- Add Prometheus and Grafana for metrics collection
|
|
||||||
- Implement distributed tracing (Jaeger, Zipkin)
|
|
||||||
- Add anomaly detection with machine learning
|
|
||||||
- Create custom dashboards for each project
|
|
||||||
- Implement log aggregation from cloud services
|
|
||||||
- Add synthetic monitoring and uptime checks
|
|
||||||
|
|
||||||
## Technical Standards & Conventions
|
|
||||||
|
|
||||||
### Code Quality
|
|
||||||
- Python: Type hints, comprehensive error handling, logging
|
|
||||||
- Ansible: Modern syntax (true/false booleans), modular structure
|
|
||||||
- Docker: Multi-stage builds, security best practices
|
|
||||||
- Documentation: Comprehensive READMEs, inline comments
|
|
||||||
|
|
||||||
### Naming Conventions
|
|
||||||
- Projects: kebab-case (enterprise-infra-simulator)
|
|
||||||
- Files: snake_case for Python, kebab-case for YAML
|
|
||||||
- Variables: snake_case, descriptive names
|
|
||||||
- Services: realistic enterprise naming (no "foo", "bar")
|
|
||||||
|
|
||||||
### Security Standards
|
|
||||||
- CIS benchmarks for Linux hardening
|
|
||||||
- Secure defaults in all configurations
|
|
||||||
- Input validation and sanitization
|
|
||||||
- Least privilege principles
|
|
||||||
|
|
||||||
## Future Development Roadmap
|
|
||||||
|
|
||||||
### Phase 1: Infrastructure Enhancement
|
|
||||||
- [ ] Add Kubernetes manifests for container orchestration
|
|
||||||
- [ ] Implement Helm charts for service deployment
|
|
||||||
- [ ] Add Terraform modules for cloud infrastructure
|
|
||||||
- [ ] Create Ansible Tower/AWX integration
|
|
||||||
|
|
||||||
### Phase 2: Application Expansion
|
|
||||||
- [ ] Extend migration framework with database support
|
|
||||||
- [ ] Add REST API to validation framework
|
|
||||||
- [ ] Implement OAuth2 authentication
|
|
||||||
- [ ] Create web-based dashboard for validation results
|
|
||||||
|
|
||||||
### Phase 3: Monitoring & Observability
|
|
||||||
- [ ] Add Prometheus metrics collection
|
|
||||||
- [ ] Implement distributed tracing
|
|
||||||
- [ ] Create ML-based anomaly detection
|
|
||||||
- [ ] Add synthetic monitoring capabilities
|
|
||||||
|
|
||||||
### Phase 4: Enterprise Integration
|
|
||||||
- [ ] Jira/ServiceNow integration for incident management
|
|
||||||
- [ ] Slack/Microsoft Teams notifications
|
|
||||||
- [ ] LDAP/Active Directory authentication
|
|
||||||
- [ ] Audit logging and compliance reporting
|
|
||||||
|
|
||||||
### Phase 5: Cloud & Multi-Platform
|
|
||||||
- [ ] AWS ECS/EKS deployment support
|
|
||||||
- [ ] Azure AKS deployment support
|
|
||||||
- [ ] GCP GKE deployment support
|
|
||||||
- [ ] Multi-cloud failover capabilities
|
|
||||||
|
|
||||||
## Development Guidelines
|
|
||||||
|
|
||||||
### Code Style
|
|
||||||
- Follow PEP 8 for Python code
|
|
||||||
- Use ansible-lint for playbook validation
|
|
||||||
- Implement comprehensive error handling
|
|
||||||
- Add logging at appropriate levels
|
|
||||||
- Write unit tests for critical functions
|
|
||||||
|
|
||||||
### Documentation Standards
|
|
||||||
- Update README.md for each new feature
|
|
||||||
- Maintain CHANGELOG.md with detailed entries
|
|
||||||
- Document API endpoints and CLI commands
|
|
||||||
- Include setup and troubleshooting guides
|
|
||||||
- Add architecture diagrams for complex features
|
|
||||||
|
|
||||||
### Testing Strategy
|
|
||||||
- Unit tests for Python modules
|
|
||||||
- Integration tests for Ansible playbooks
|
|
||||||
- End-to-end tests for complete workflows
|
|
||||||
- Performance testing for critical paths
|
|
||||||
- Security testing and vulnerability scanning
|
|
||||||
|
|
||||||
## Project Dependencies & Requirements
|
|
||||||
|
|
||||||
### System Requirements
|
|
||||||
- Docker Engine 20.10+
|
|
||||||
- Docker Compose 2.0+
|
|
||||||
- Python 3.8+
|
|
||||||
- Ansible 2.10+
|
|
||||||
- Git 2.25+
|
|
||||||
|
|
||||||
### External Services
|
|
||||||
- Gitea for CI/CD (optional)
|
|
||||||
- SMTP server for notifications (optional)
|
|
||||||
- LDAP server for authentication (optional)
|
|
||||||
|
|
||||||
## Risk Assessment & Mitigation
|
|
||||||
|
|
||||||
### Technical Risks
|
|
||||||
- **Dependency Updates**: Regular security updates and compatibility testing
|
|
||||||
- **Performance**: Monitoring and optimization of resource usage
|
|
||||||
- **Security**: Regular vulnerability scanning and patching
|
|
||||||
- **Scalability**: Load testing and capacity planning
|
|
||||||
|
|
||||||
### Operational Risks
|
|
||||||
- **Documentation**: Keep runbooks current with system changes
|
|
||||||
- **Monitoring**: Comprehensive alerting for all critical components
|
|
||||||
- **Backup**: Regular backups of configurations and data
|
|
||||||
- **Disaster Recovery**: Tested recovery procedures
|
|
||||||
|
|
||||||
## Success Metrics
|
|
||||||
|
|
||||||
### Technical Metrics
|
|
||||||
- Code coverage > 80%
|
|
||||||
- Performance benchmarks met
|
|
||||||
- Security scan clean
|
|
||||||
- Zero critical vulnerabilities
|
|
||||||
|
|
||||||
### Operational Metrics
|
|
||||||
- Successful deployments
|
|
||||||
- Incident response < 15 minutes
|
|
||||||
- System uptime > 99.9%
|
|
||||||
- User satisfaction scores
|
|
||||||
|
|
||||||
## Communication & Collaboration
|
|
||||||
|
|
||||||
### Internal Communication
|
|
||||||
- Regular architecture reviews
|
|
||||||
- Code review requirements
|
|
||||||
- Documentation standards
|
|
||||||
- Knowledge sharing sessions
|
|
||||||
|
|
||||||
### External Communication
|
|
||||||
- Clear project documentation
|
|
||||||
- API documentation
|
|
||||||
- User guides and tutorials
|
|
||||||
- Support and troubleshooting guides
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
*This context file serves as a comprehensive guide for future portfolio expansion and maintenance. Update this file as new features are added or architectural decisions are made.*
|
|
||||||
+83
-111
@@ -1,123 +1,95 @@
|
|||||||
# Portfolio Changelog
|
# Changelog
|
||||||
|
|
||||||
## [1.0.0] - 2026-04-29 - Initial Enterprise Portfolio Release
|
## [Unreleased]
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
#### Enterprise Infrastructure Simulator
|
|
||||||
- **Container-based Linux node simulation** with Docker Compose
|
|
||||||
- **Comprehensive Ansible automation suite**:
|
|
||||||
- `provision.yml`: Node provisioning with security hardening, package installation, and service configuration
|
|
||||||
- `patch.yml`: Automated patching with rollback capabilities and notification system
|
|
||||||
- `hardening.yml`: Security hardening following CIS benchmarks (firewall, SSH, user management)
|
|
||||||
- `decommission.yml`: Graceful node decommissioning with cleanup and notification
|
|
||||||
- **Operational scripts**:
|
|
||||||
- `simulate_scaling.sh`: Infrastructure scaling simulation
|
|
||||||
- `simulate_failure.sh`: Failure injection for testing resilience
|
|
||||||
- **Realistic scenarios**:
|
|
||||||
- `scaling_event.yml`: Automated scaling event playbook
|
|
||||||
- **Production Makefile** with targets: `up`, `patch`, `harden`, `destroy`
|
|
||||||
- **Multi-group Ansible inventory** (`inventory/hosts.ini`) with realistic enterprise structure
|
|
||||||
|
|
||||||
#### Migration Validation Framework
|
- Added Linux Fresh Setup Toolkit under `labs/linux/setup` for day-0 Ubuntu lab host bootstrap automation.
|
||||||
- **Python 3.8+ CLI application** (`cli.py`) with command structure:
|
- Added AI Lab Maintenance Toolkit with systemd-based Linux maintenance automation.
|
||||||
- `snapshot`: Collect system data from target hosts
|
- Python tooling validation for operational scripts.
|
||||||
- `compare`: Compare snapshots for migration validation
|
- `incident-log-summary` for general incident log summarization.
|
||||||
- `report`: Generate HTML reports from comparison results
|
- `log-diff-checker` for pre-change and post-change log comparison.
|
||||||
- **Modular collector architecture**:
|
- `auth-log-audit` for Linux authentication log review.
|
||||||
- `collectors/mounts.py`: Filesystem mount point analysis
|
- `jvm-log-analyzer` for JVM application log summaries.
|
||||||
- `collectors/services.py`: System service inventory and status
|
- `journal-analyzer` for exported `journalctl` log review.
|
||||||
- `collectors/disk_usage.py`: Disk usage statistics and trends
|
- `known-error-matcher` with JSON-based known error patterns.
|
||||||
- **Intelligent comparison engine** (`validators/compare.py`):
|
- Standalone Bash incident checks for CPU, memory/OOM, service restart loops, failed SSH logins, certificate expiry, DNS connectivity, NTP drift, read-only filesystems, inode usage, and JVM process diagnostics.
|
||||||
- Drift detection algorithms
|
- `incident_triage_report.sh` for L2 Markdown incident handover reports built from existing Bash incident checks.
|
||||||
- Change categorization (additions, modifications, removals)
|
- Repository-level Codex guidance:
|
||||||
- Risk assessment scoring
|
- `AGENTS.md`
|
||||||
- **Interactive HTML reporting** (`reports/html_report.py`):
|
- `docs/codex/README.md`
|
||||||
- Bootstrap CSS styling
|
- `docs/codex/review-checklist.md`
|
||||||
- JavaScript-powered filtering and sorting
|
- `docs/codex/task-template.md`
|
||||||
- Detailed change summaries with timestamps
|
- `docs/codex/plans-template.md`
|
||||||
- Export capabilities
|
- Lightweight validation helpers:
|
||||||
|
- `scripts/validate-repo.sh`
|
||||||
|
- `scripts/check-bash.sh`
|
||||||
|
- `scripts/check-ansible.sh`
|
||||||
|
- `scripts/check-docs.sh`
|
||||||
|
- Cross-repository operational documentation structure:
|
||||||
|
- `infra-run/docs/operations-cheatsheet.md`
|
||||||
|
- `platform-projects/docs/platform-cheatsheet.md`
|
||||||
|
- `labs/docs/lab-cheatsheet.md`
|
||||||
|
- Production-oriented Linux/Unix operations reference with incident workflows, storage and networking checks, SSL/TLS notes, AIX commands, automation safety patterns, Ansible operational usage, and observability quick-reference.
|
||||||
|
- SELinux operational coverage for mode checks, context inspection, AVC audit review, persistent relabel workflow, booleans, and SELinux-specific incident response.
|
||||||
|
- Selected baseline Ansible hardening automation:
|
||||||
|
- RHEL 9 role and playbook.
|
||||||
|
- Debian 13 / Ubuntu 26.04 role and playbook.
|
||||||
|
- IBM AIX 7 role and playbook.
|
||||||
|
- Shared sanitized Ansible inventory defaults for Linux and AIX examples.
|
||||||
|
- Role-level task structure covering pre-checks, SSH, sudo, auditing, logging, services, filesystem controls, platform-specific settings, handlers, and post-check validation.
|
||||||
|
- Slurm AI/HPC Cluster Automation Lab under `platform-projects`, covering Ansible-managed Slurm operations, GPU scheduling, cgroup enforcement, SlurmDBD accounting, QOS/fairshare, lifecycle workflows, rolling upgrades, and health remediation.
|
||||||
|
|
||||||
#### Observability Stack
|
### Changed
|
||||||
- **Complete ELK + Grafana monitoring platform** (`docker-compose.yml`):
|
|
||||||
- Elasticsearch 8.11.0 with security enabled
|
|
||||||
- Logstash 8.11.0 with custom pipelines
|
|
||||||
- Kibana 8.11.0 with pre-configured dashboards
|
|
||||||
- Grafana 10.2.0 with alerting and visualization
|
|
||||||
- Filebeat for log collection
|
|
||||||
- **Realistic sample logs** (`logs/sample.log`):
|
|
||||||
- Application logs with various log levels
|
|
||||||
- System logs (nginx, systemd, kernel)
|
|
||||||
- Database logs (PostgreSQL, Redis)
|
|
||||||
- Security events and authentication logs
|
|
||||||
- **Enterprise alerting system** (`alerting/alert_rules.yml`):
|
|
||||||
- System resource alerts (CPU, memory, disk)
|
|
||||||
- Service availability monitoring
|
|
||||||
- Application performance alerts
|
|
||||||
- Security incident detection
|
|
||||||
- Multi-channel notifications (email, Slack, PagerDuty)
|
|
||||||
- **Incident simulation framework** (`scenarios/incident_simulation.sh`):
|
|
||||||
- CPU spike simulation
|
|
||||||
- Memory leak scenarios
|
|
||||||
- Disk space exhaustion
|
|
||||||
- Network latency/packet loss
|
|
||||||
- Service crash simulation
|
|
||||||
- Database connection issues
|
|
||||||
- Application error bursts
|
|
||||||
- Comprehensive incident scenarios
|
|
||||||
|
|
||||||
#### Documentation and Infrastructure
|
- Updated root, `infra-run`, Bash, Ansible, platform, and lab README guidance for safety-first usage, validation, and future Codex-driven work.
|
||||||
- **Root documentation**:
|
- Updated repository and `infra-run` README files to surface the new documentation structure and operational cheatsheets.
|
||||||
- `README.md`: Portfolio landing page with project overview and architecture summary
|
- Updated repository, `infra-run`, and Ansible README files to describe the new hardening automation instead of placeholder-only Ansible structure.
|
||||||
- `docs/architecture.md`: Detailed system architecture and design principles
|
- Updated Python tooling documentation and repository roadmap.
|
||||||
- `docs/runbooks.md`: Operational procedures and troubleshooting guides
|
- Integrated Python syntax validation into repository validation workflow and CI.
|
||||||
- **CI/CD Pipeline** (`.gitea/workflows/ci.yml`):
|
|
||||||
- Ansible syntax validation and linting
|
|
||||||
- Python code testing and type checking
|
|
||||||
- Docker image validation
|
|
||||||
- Security scanning
|
|
||||||
- Documentation generation
|
|
||||||
|
|
||||||
### Technical Implementation Details
|
### Notes
|
||||||
- **Languages**: Python 3.8+, YAML, Bash, HTML/CSS/JavaScript
|
|
||||||
- **Frameworks**: Ansible, Docker Compose, ELK Stack, Grafana
|
|
||||||
- **Infrastructure**: Container-based with production networking
|
|
||||||
- **Security**: CIS-compliant hardening, secure defaults, input validation
|
|
||||||
- **Monitoring**: Comprehensive alerting with escalation policies
|
|
||||||
- **Testing**: Incident simulation, syntax validation, compilation checks
|
|
||||||
|
|
||||||
### Quality Assurance
|
- Hardening content covers selected baseline controls and is intended for portfolio/lab use; live use requires environment-specific review and validation.
|
||||||
- ✅ **Syntax validation**: All Ansible playbooks and Python code compile without errors
|
|
||||||
- ✅ **Boolean fixes**: Updated Ansible syntax from 'yes/no' to 'true/false' for modern compatibility
|
|
||||||
- ✅ **Enterprise naming**: Realistic hostnames, service names, and configurations
|
|
||||||
- ✅ **Production quality**: Error handling, logging, health checks, and rollback capabilities
|
|
||||||
- ✅ **Documentation**: Comprehensive READMEs, architecture docs, and operational runbooks
|
|
||||||
|
|
||||||
### Architecture Highlights
|
## [Initial Version]
|
||||||
- **Modular design**: Each project operates independently with clear interfaces
|
|
||||||
- **Enterprise patterns**: Multi-tier architecture, service separation, monitoring integration
|
|
||||||
- **Scalability**: Container-based deployment with orchestration
|
|
||||||
- **Observability**: End-to-end monitoring from infrastructure to application level
|
|
||||||
- **Automation**: Infrastructure as Code with comprehensive automation coverage
|
|
||||||
|
|
||||||
### Skills Demonstrated
|
### Added
|
||||||
- **Infrastructure Automation**: Ansible playbook development and enterprise infrastructure management
|
|
||||||
- **Application Development**: Python CLI application with modular architecture and reporting
|
|
||||||
- **Monitoring & Alerting**: ELK stack configuration, alerting rules, and incident response
|
|
||||||
- **Container Orchestration**: Docker Compose for multi-service applications
|
|
||||||
- **DevOps Practices**: CI/CD pipeline implementation, documentation, and operational procedures
|
|
||||||
- **System Administration**: Linux hardening, patching strategies, and decommissioning procedures
|
|
||||||
- **Security**: CIS benchmarks implementation and security monitoring
|
|
||||||
- **Data Analysis**: System data collection, comparison algorithms, and visualization
|
|
||||||
|
|
||||||
### Future Expansion Points
|
- Repository structure:
|
||||||
- Kubernetes orchestration integration
|
- `infra-run`
|
||||||
- Multi-cloud deployment support
|
- `platform-projects`
|
||||||
- Advanced monitoring dashboards
|
- `labs`
|
||||||
- Machine learning-based anomaly detection
|
- Linux operations Bash toolkit under `infra-run/scripts/bash/os-healthcheck/`:
|
||||||
- Integration with enterprise tools (Jira, ServiceNow)
|
- healthcheck
|
||||||
- Performance optimization and benchmarking
|
- disk usage checks
|
||||||
- Compliance automation (PCI-DSS, HIPAA)
|
- service checks
|
||||||
- Disaster recovery procedures
|
- system reporting
|
||||||
|
- Disk full incident toolkit:
|
||||||
|
- disk analysis
|
||||||
|
- large files detection
|
||||||
|
- deleted open files detection
|
||||||
|
- safe cleanup suggestions
|
||||||
|
- Network troubleshooting script under `infra-run/scripts/bash/os-healthcheck/`:
|
||||||
|
- interface, routing, DNS, connectivity checks
|
||||||
|
- Veritas storage toolkit:
|
||||||
|
- VxVM disk detection
|
||||||
|
- diskgroup extension
|
||||||
|
- volume/filesystem resize
|
||||||
|
- VCS freeze/unfreeze workflow
|
||||||
|
- GPFS storage toolkit:
|
||||||
|
- cluster validation
|
||||||
|
- NSD planning
|
||||||
|
- filesystem expansion
|
||||||
|
- rebalance
|
||||||
|
- Runbook-style structure and step-based execution.
|
||||||
|
|
||||||
---
|
### Changed
|
||||||
*Portfolio created to demonstrate enterprise-level Linux infrastructure engineering capabilities across the full technology stack.*
|
|
||||||
|
- Moved Linux operations scripts into `infra-run/scripts/bash/os-healthcheck/` to keep host health and troubleshooting checks grouped together.
|
||||||
|
|
||||||
|
### Notes
|
||||||
|
|
||||||
|
- All scripts default to dry-run where change actions are present.
|
||||||
|
- Designed for safety and readability.
|
||||||
|
- No destructive actions without explicit confirmation.
|
||||||
|
|||||||
Binary file not shown.
@@ -1,19 +1,111 @@
|
|||||||
# Infrastructure Engineering Portfolio
|
# Linux/Unix Infrastructure Engineering Portfolio
|
||||||
|
|
||||||
This repository contains independent infrastructure projects focused on automation, migration assurance, and observability. The projects are intentionally small enough to run locally, but structured around the operating patterns used in enterprise platform teams: repeatable workflows, clear evidence artifacts, and operational documentation.
|
This repository contains sanitized infrastructure automation examples based on Linux/Unix operations and infrastructure workflows. The focus is on incident response, troubleshooting, pre-checks, dry-run behavior, controlled execution, post-checks, and readable operational evidence.
|
||||||
|
|
||||||
## Projects
|
It is a technical portfolio, not a production toolkit. The examples show how operational work is structured: understand the current state, make changes only with explicit controls, verify the result, and leave enough evidence for review.
|
||||||
|
|
||||||
- [Enterprise Infrastructure Simulator](enterprise-infra-simulator/) - Ansible-driven lifecycle operations for provisioning, patching, hardening, decommissioning, and failure simulation across Linux nodes.
|
## What This Repo Is
|
||||||
- [Migration Validation Framework](migration-validation-framework/) - Python CLI for collecting before/after system snapshots and producing structured migration comparison results.
|
|
||||||
- [Observability Stack](observability-stack/) - Docker Compose based logging and dashboard stack with alert rules, sample logs, and incident simulation.
|
|
||||||
|
|
||||||
## Skills Demonstrated
|
- Practical Linux/Unix operations examples.
|
||||||
|
- Safe Bash and Ansible patterns for lab and review.
|
||||||
|
- Runbook-driven examples for incident response, storage operations, hardening, and observability.
|
||||||
|
- A place for platform and lab topics to grow without pretending unfinished areas are complete.
|
||||||
|
|
||||||
- Infrastructure automation with Ansible
|
## What This Repo Is Not
|
||||||
- Operational scenario design and incident simulation
|
|
||||||
- Migration validation, drift detection, and JSON reporting
|
|
||||||
- Docker Compose service validation
|
|
||||||
- Repository hygiene, CI checks, and professional project documentation
|
|
||||||
|
|
||||||
Each project remains independent and includes its own README, architecture notes, examples, and runnable scenarios.
|
- It is not a compliance benchmark implementation.
|
||||||
|
- It is not a drop-in change automation framework.
|
||||||
|
- It is not proof that these exact scripts ran in any production environment.
|
||||||
|
- It does not replace change review, peer review, backups, monitoring, or platform-specific runbooks.
|
||||||
|
|
||||||
|
## Repository Layout
|
||||||
|
|
||||||
|
- [infra-run](./infra-run/) - core operational tooling and automation.
|
||||||
|
- [platform-projects](./platform-projects/) - larger platform topics and case-study areas.
|
||||||
|
- [labs](./labs/) - experimental/lab environments and notes.
|
||||||
|
- [docs/codex](./docs/codex/) - guidance for future Codex-driven changes.
|
||||||
|
- [scripts](./scripts/) - lightweight repository validation helpers.
|
||||||
|
|
||||||
|
## Usable Now
|
||||||
|
|
||||||
|
- [infra-run](./infra-run/) - the main implemented project in this repository.
|
||||||
|
- [Linux healthcheck scripts](./infra-run/scripts/bash/os-healthcheck/) - host, disk, service, network, and report helpers.
|
||||||
|
- [Bash incident checks](./infra-run/scripts/bash/incident-checks/) - standalone read-only checks for common Linux incidents, plus an L2 Markdown triage report wrapper for repeatable handoff and ticket evidence.
|
||||||
|
- [Disk full workflow](./infra-run/scripts/bash/disk-full/) - triage scripts for usage, inode pressure, deleted open files, large files, log cleanup review, and postchecks.
|
||||||
|
- [Veritas examples](./infra-run/scripts/bash/veritas/) - dry-run-first VxVM/VCS storage expansion workflow examples.
|
||||||
|
- [GPFS examples](./infra-run/scripts/bash/gpfs/) - dry-run-first IBM Spectrum Scale expansion workflow examples.
|
||||||
|
- [Incident log summary](./infra-run/scripts/python/incident-log-summary/) - read-only Python helper for local incident log pattern summaries.
|
||||||
|
- [Log diff checker](./infra-run/scripts/python/log-diff-checker/) - read-only Python helper for before/after change log comparison.
|
||||||
|
- [Auth log audit](./infra-run/scripts/python/auth-log-audit/) - read-only Python helper for local authentication log review.
|
||||||
|
- [JVM log analyzer](./infra-run/scripts/python/jvm-log-analyzer/) - read-only Python helper for local JVM and Java application log review.
|
||||||
|
- [Journal analyzer](./infra-run/scripts/python/journal-analyzer/) - read-only Python helper for exported `journalctl` text review.
|
||||||
|
- [Known error matcher](./infra-run/scripts/python/known-error-matcher/) - read-only Python helper for matching logs against a JSON known-error catalog with runbook references.
|
||||||
|
- [Python operational log analysis tools](./infra-run/scripts/python/) - small standard-library helpers for local log summaries, before/after comparisons, and evidence reports.
|
||||||
|
- [Ansible hardening examples](./infra-run/ansible/) - selected Linux and AIX baseline hardening tasks organized as lab-safe roles.
|
||||||
|
- [Slurm AI/HPC cluster automation lab](./platform-projects/hpc-slurm-ai-cluster/) - Ansible-managed Slurm lab covering CPU/GPU scheduling, GRES, cgroups, accounting, QOS/fairshare, lifecycle workflows, rolling upgrades, and health remediation.
|
||||||
|
|
||||||
|
## Planned Areas
|
||||||
|
|
||||||
|
The `labs` and `platform-projects` trees are intentionally thin. Apart from the Slurm AI/HPC lab listed under "Usable Now", they are kept as planning areas for future lab notes and case studies, not as completed projects. Current planned topics are tracked in [ROADMAP.md](./ROADMAP.md).
|
||||||
|
|
||||||
|
## Documentation
|
||||||
|
|
||||||
|
### Production Operations
|
||||||
|
|
||||||
|
- [infra-run/docs/operations-cheatsheet.md](./infra-run/docs/operations-cheatsheet.md) - production-focused Linux/Unix operations reference for incident handling, validation, storage, networking, Ansible, observability, and safety-first change execution.
|
||||||
|
|
||||||
|
### Platform Engineering
|
||||||
|
|
||||||
|
- [platform-projects/docs/platform-cheatsheet.md](./platform-projects/docs/platform-cheatsheet.md) - platform operations reference for Kubernetes, Helm, containers, Terraform, CI/CD, observability, and GPU-backed infrastructure troubleshooting.
|
||||||
|
|
||||||
|
### Labs & Experiments
|
||||||
|
|
||||||
|
- [labs/docs/lab-cheatsheet.md](./labs/docs/lab-cheatsheet.md) - quick-reference scratchpad for K3s, Proxmox, Terraform, Docker, networking, and short-lived lab troubleshooting work.
|
||||||
|
|
||||||
|
### Codex and Review Guidance
|
||||||
|
|
||||||
|
- [AGENTS.md](./AGENTS.md) - repository rules for automated and assisted changes.
|
||||||
|
- [docs/codex/README.md](./docs/codex/README.md) - Codex workflow and expected final response format.
|
||||||
|
- [docs/codex/review-checklist.md](./docs/codex/review-checklist.md) - safety, Bash, Ansible, docs, and validation review checklist.
|
||||||
|
- [docs/codex/task-template.md](./docs/codex/task-template.md) - reusable scoped task templates.
|
||||||
|
|
||||||
|
## Safety-First Usage
|
||||||
|
|
||||||
|
Read scripts and playbooks before running them. Operational examples are sanitized and may need adaptation for a real system.
|
||||||
|
|
||||||
|
- Prefer read-only commands first.
|
||||||
|
- Use dry-run/check mode before execution.
|
||||||
|
- Treat `--execute` as a change-control boundary.
|
||||||
|
- Confirm backups, monitoring, application impact, and rollback steps before live use.
|
||||||
|
- Do not run platform-specific storage commands without a matching Veritas, GPFS, or AIX lab.
|
||||||
|
|
||||||
|
## Validation
|
||||||
|
|
||||||
|
Basic local validation:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./scripts/validate-repo.sh
|
||||||
|
./scripts/check-bash.sh
|
||||||
|
./scripts/check-ansible.sh
|
||||||
|
./scripts/check-python.sh
|
||||||
|
./scripts/check-docs.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
The validation helpers run required lightweight checks and use optional tools such as `shellcheck`, `yamllint`, `ansible-playbook`, `ansible-lint`, and `markdownlint` when available. Python checks use `python3 -m py_compile` and do not require external Python tooling. Set `STRICT=1` to fail when optional tools are missing.
|
||||||
|
|
||||||
|
Some scripts depend on platform tools such as `vxdisk`, `hagrp`, `mmcrnsd`, and `mmlscluster`. Those commands are not expected to exist on a normal workstation, so functional testing against Veritas or GPFS requires a real lab environment.
|
||||||
|
|
||||||
|
See [infra-run/TESTED.md](./infra-run/TESTED.md) and [infra-run/KNOWN_LIMITATIONS.md](./infra-run/KNOWN_LIMITATIONS.md) for the current validation status.
|
||||||
|
|
||||||
|
## Operational Areas Demonstrated
|
||||||
|
|
||||||
|
- Linux operations triage and reporting.
|
||||||
|
- Local operational log analysis with read-only Python helpers.
|
||||||
|
- Disk pressure and deleted-file incident analysis.
|
||||||
|
- Dry-run-first Bash automation.
|
||||||
|
- Controlled storage change workflow design.
|
||||||
|
- Veritas VxVM/VCS operational awareness.
|
||||||
|
- GPFS / IBM Spectrum Scale operational awareness.
|
||||||
|
- Ansible role organization for selected hardening controls.
|
||||||
|
- Slurm AI/HPC cluster operations with GPU scheduling, accounting, lifecycle workflows, and remediation.
|
||||||
|
- Clear documentation of what was tested and what still needs a real system.
|
||||||
|
|||||||
+38
@@ -0,0 +1,38 @@
|
|||||||
|
# Roadmap
|
||||||
|
|
||||||
|
This file keeps future portfolio ideas in one place so empty folders do not look like finished work.
|
||||||
|
|
||||||
|
## Planned Lab Areas
|
||||||
|
|
||||||
|
- Docker: image build notes, container troubleshooting, and small service examples.
|
||||||
|
- Kubernetes: workload inspection, basic operations checks, and failure scenario notes.
|
||||||
|
- Terraform: small infrastructure-as-code examples with clear plan/apply separation.
|
||||||
|
- Networking: DNS, routing, firewall, and connectivity troubleshooting labs.
|
||||||
|
- CI/CD: validation pipelines for shell, YAML, and Ansible examples.
|
||||||
|
|
||||||
|
## Planned Platform Case Studies
|
||||||
|
|
||||||
|
- Storage: expansion planning, filesystem checks, and SAN handoff documentation.
|
||||||
|
- Clustering: service group checks, failover review, and operational checklists.
|
||||||
|
- Monitoring: Zabbix-oriented alert review and host onboarding notes.
|
||||||
|
- Virtualization: VM lifecycle and platform operations examples.
|
||||||
|
- Log analysis: optional ELK-style search case study under `platform-projects`, separate from current local Python helpers.
|
||||||
|
|
||||||
|
## Implemented Portfolio Additions
|
||||||
|
|
||||||
|
- Standalone Bash incident checks under `infra-run/scripts/bash/incident-checks/` for common Linux incident triage and ticket evidence.
|
||||||
|
- Python operational log analysis suite under `infra-run/scripts/python/`:
|
||||||
|
- `incident-log-summary`
|
||||||
|
- `log-diff-checker`
|
||||||
|
- `auth-log-audit`
|
||||||
|
- `jvm-log-analyzer`
|
||||||
|
- `journal-analyzer`
|
||||||
|
- `known-error-matcher`
|
||||||
|
|
||||||
|
## Future Python Tooling Ideas
|
||||||
|
|
||||||
|
- Real-world sample report examples using sanitized evidence.
|
||||||
|
- Integration examples that combine log summaries with change evidence collection.
|
||||||
|
- A shared Python helper library only if the standalone tools begin duplicating enough stable behavior to justify it.
|
||||||
|
|
||||||
|
Planned sections remain future work unless listed as implemented.
|
||||||
@@ -1,147 +0,0 @@
|
|||||||
# Architecture Overview
|
|
||||||
|
|
||||||
## Enterprise Infrastructure Portfolio Architecture
|
|
||||||
|
|
||||||
This document provides a high-level overview of the architecture and design principles implemented across the three main projects in this portfolio.
|
|
||||||
|
|
||||||
## Overall Architecture
|
|
||||||
|
|
||||||
```
|
|
||||||
┌─────────────────────────────────────────────────────────────┐
|
|
||||||
│ Enterprise Portfolio │
|
|
||||||
├─────────────────────────────────────────────────────────────┤
|
|
||||||
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────┐ │
|
|
||||||
│ │ Infra Simulator│ │Migration │ │Observability│ │
|
|
||||||
│ │ (Ansible/Docker│ │Validation │ │Stack │ │
|
|
||||||
│ │ Container Sim) │ │(Python CLI) │ │(ELK/Grafana)│ │
|
|
||||||
│ └─────────────────┘ └─────────────────┘ └─────────────┘ │
|
|
||||||
├─────────────────────────────────────────────────────────────┤
|
|
||||||
│ Infrastructure Simulation │ Validation Framework │ Monitoring │
|
|
||||||
└─────────────────────────────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
## Project Architectures
|
|
||||||
|
|
||||||
### 1. Enterprise Infrastructure Simulator
|
|
||||||
|
|
||||||
**Architecture Pattern:** Container-based Infrastructure Simulation
|
|
||||||
|
|
||||||
```
|
|
||||||
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
|
||||||
│ Ansible │ │ Docker │ │ Simulation │
|
|
||||||
│ Controller │◄──►│ Containers │◄──►│ Scripts │
|
|
||||||
│ │ │ (Linux Nodes) │ │ │
|
|
||||||
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
|
||||||
│ │ │
|
|
||||||
▼ ▼ ▼
|
|
||||||
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
|
||||||
│ Inventory │ │ Playbooks │ │ Scenarios │
|
|
||||||
│ Management │ │ (Provision/ │ │ (Scaling/ │
|
|
||||||
│ │ │ Patch/ │ │ Failures) │
|
|
||||||
│ │ │ Harden/ │ │ │
|
|
||||||
│ │ │ Decommission)│ │ │
|
|
||||||
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
**Key Components:**
|
|
||||||
- **Ansible Controller:** Central orchestration for infrastructure operations
|
|
||||||
- **Docker Containers:** Simulated Linux nodes with realistic configurations
|
|
||||||
- **Simulation Scripts:** Automated scaling and failure injection
|
|
||||||
- **Inventory System:** Dynamic host management and grouping
|
|
||||||
- **Playbook Library:** Modular automation for different lifecycle phases
|
|
||||||
|
|
||||||
### 2. Migration Validation Framework
|
|
||||||
|
|
||||||
**Architecture Pattern:** Data Collection and Comparison Pipeline
|
|
||||||
|
|
||||||
```
|
|
||||||
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
|
||||||
│ CLI Interface │ │ Data │ │ Validation │
|
|
||||||
│ (cli.py) │◄──►│ Collectors │◄──►│ Engine │
|
|
||||||
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
|
||||||
│ │ │
|
|
||||||
▼ ▼ ▼
|
|
||||||
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
|
||||||
│ JSON │ │ Comparison │ │ HTML │
|
|
||||||
│ Snapshots │ │ Logic │ │ Reports │
|
|
||||||
│ (Before/After)│ │ │ │ │
|
|
||||||
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
**Key Components:**
|
|
||||||
- **CLI Interface:** Command-line tool for migration workflow orchestration
|
|
||||||
- **Data Collectors:** Specialized modules for system data extraction
|
|
||||||
- **Validation Engine:** Snapshot comparison and difference analysis
|
|
||||||
- **Report Generator:** HTML output with change visualization
|
|
||||||
- **JSON Storage:** Structured data persistence for before/after states
|
|
||||||
|
|
||||||
### 3. Observability Stack
|
|
||||||
|
|
||||||
**Architecture Pattern:** Distributed Monitoring and Logging
|
|
||||||
|
|
||||||
```
|
|
||||||
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
|
||||||
│ Logstash │ │ Elasticsearch │ │ Kibana │
|
|
||||||
│ (Ingestion) │◄──►│ (Storage) │◄──►│ (Visualization)│
|
|
||||||
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
|
||||||
▲ ▲ ▲
|
|
||||||
│ │ │
|
|
||||||
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
|
||||||
│ Sample Logs │ │ Alert Rules │ │ Grafana │
|
|
||||||
│ (Data Sources)│ │ (Conditions) │ │ (Dashboards) │
|
|
||||||
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
**Key Components:**
|
|
||||||
- **Logstash Pipelines:** Data ingestion and transformation
|
|
||||||
- **Elasticsearch Cluster:** Distributed search and analytics
|
|
||||||
- **Kibana Dashboards:** Real-time visualization and exploration
|
|
||||||
- **Grafana Integration:** Advanced metrics and alerting
|
|
||||||
- **Alerting Engine:** Automated incident detection and notification
|
|
||||||
|
|
||||||
## Design Principles
|
|
||||||
|
|
||||||
### Infrastructure as Code
|
|
||||||
- All infrastructure defined in code (Ansible, Docker Compose, Python)
|
|
||||||
- Version-controlled configurations and automation
|
|
||||||
- Reproducible environments and deployments
|
|
||||||
|
|
||||||
### Modular Architecture
|
|
||||||
- Separated concerns across projects and components
|
|
||||||
- Reusable modules and playbooks
|
|
||||||
- Clear interfaces between systems
|
|
||||||
|
|
||||||
### Enterprise Standards
|
|
||||||
- Realistic naming conventions and structures
|
|
||||||
- Production-quality error handling and logging
|
|
||||||
- Security hardening and compliance considerations
|
|
||||||
|
|
||||||
### Observability First
|
|
||||||
- Comprehensive logging and monitoring
|
|
||||||
- Automated alerting and incident response
|
|
||||||
- Performance metrics and health checks
|
|
||||||
|
|
||||||
## Technology Stack
|
|
||||||
|
|
||||||
- **Containerization:** Docker, Docker Compose
|
|
||||||
- **Configuration Management:** Ansible
|
|
||||||
- **Programming Language:** Python 3.8+
|
|
||||||
- **Monitoring Stack:** ELK Stack (Elasticsearch, Logstash, Kibana)
|
|
||||||
- **Visualization:** Grafana
|
|
||||||
- **CI/CD:** Gitea Actions
|
|
||||||
- **Documentation:** Markdown
|
|
||||||
|
|
||||||
## Security Considerations
|
|
||||||
|
|
||||||
- Container security scanning integration
|
|
||||||
- Ansible vault for secrets management
|
|
||||||
- Network segmentation in Docker Compose
|
|
||||||
- Least privilege access principles
|
|
||||||
- Audit logging and compliance reporting
|
|
||||||
|
|
||||||
## Scalability and Performance
|
|
||||||
|
|
||||||
- Horizontal scaling through container orchestration
|
|
||||||
- Efficient data collection and processing
|
|
||||||
- Optimized Elasticsearch indexing
|
|
||||||
- Resource-aware automation scripts
|
|
||||||
@@ -0,0 +1,55 @@
|
|||||||
|
# Codex Workflow
|
||||||
|
|
||||||
|
This directory keeps future Codex sessions consistent when working in this infrastructure portfolio.
|
||||||
|
|
||||||
|
## How To Start
|
||||||
|
|
||||||
|
1. Read [AGENTS.md](../../AGENTS.md).
|
||||||
|
2. Inspect the affected tree and nearby README files.
|
||||||
|
3. Check `git status --short` so existing user work is preserved.
|
||||||
|
4. Decide whether a plan is needed before editing.
|
||||||
|
5. Make small, reviewable changes.
|
||||||
|
6. Run focused validation plus `./scripts/validate-repo.sh` when practical.
|
||||||
|
|
||||||
|
## When To Plan First
|
||||||
|
|
||||||
|
Plan before editing when a task touches more than one subsystem, changes operational behavior, adds or modifies destructive actions, changes Ansible targeting, or updates repository conventions.
|
||||||
|
|
||||||
|
For small typo fixes, narrow README updates, or obvious syntax fixes, inspect first and then make the change directly.
|
||||||
|
|
||||||
|
Use [plans-template.md](./plans-template.md) for larger changes.
|
||||||
|
|
||||||
|
## Scoped Tasks
|
||||||
|
|
||||||
|
Good tasks name the operational goal, affected directories, constraints, validation commands, and what "done" means. Use [task-template.md](./task-template.md) for reusable prompts.
|
||||||
|
|
||||||
|
Keep scope tied to real operations:
|
||||||
|
|
||||||
|
- Bash tool: discovery, pre-check, dry-run, execute, post-check, report.
|
||||||
|
- Ansible change: inventory target, role/playbook scope, check mode, idempotency, validation.
|
||||||
|
- Runbook: incident signal, triage, decision points, rollback, evidence.
|
||||||
|
- Lab/platform project: status, prerequisites, validation, limitations.
|
||||||
|
|
||||||
|
## Validation
|
||||||
|
|
||||||
|
Prefer the repository helpers:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./scripts/check-bash.sh
|
||||||
|
./scripts/check-ansible.sh
|
||||||
|
./scripts/check-docs.sh
|
||||||
|
./scripts/validate-repo.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
If optional tools are missing, report that clearly and continue with available checks. Do not claim skipped checks passed.
|
||||||
|
|
||||||
|
## Final Response Format
|
||||||
|
|
||||||
|
End with:
|
||||||
|
|
||||||
|
1. Summary of what changed.
|
||||||
|
2. Files created or modified.
|
||||||
|
3. Validation commands run and results.
|
||||||
|
4. Skipped checks and why.
|
||||||
|
5. Risks or follow-ups.
|
||||||
|
6. Whether the repo is ready for future Codex-driven work.
|
||||||
@@ -0,0 +1,35 @@
|
|||||||
|
# Implementation Plan Template
|
||||||
|
|
||||||
|
Use this for changes that touch multiple files, alter operational behavior, or add new repository conventions.
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
|
||||||
|
State the operational or maintenance outcome.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
Summarize the directories and conventions inspected.
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
List files or directories expected to change.
|
||||||
|
|
||||||
|
## Non-Goals
|
||||||
|
|
||||||
|
Name what will not be redesigned, renamed, deleted, or claimed as complete.
|
||||||
|
|
||||||
|
## Plan
|
||||||
|
|
||||||
|
1. Inspect relevant scripts, playbooks, docs, and examples.
|
||||||
|
2. Make the smallest structural or documentation changes needed.
|
||||||
|
3. Update validation or runbook guidance.
|
||||||
|
4. Run focused checks.
|
||||||
|
5. Summarize residual risk and follow-ups.
|
||||||
|
|
||||||
|
## Validation
|
||||||
|
|
||||||
|
List commands to run, including fallback behavior for missing tools.
|
||||||
|
|
||||||
|
## Risks
|
||||||
|
|
||||||
|
Call out destructive operations, platform assumptions, missing lab environments, or checks that require real systems.
|
||||||
@@ -0,0 +1,52 @@
|
|||||||
|
# Review Checklist
|
||||||
|
|
||||||
|
Use this checklist for repository reviews and pull requests.
|
||||||
|
|
||||||
|
## Safety
|
||||||
|
|
||||||
|
- Destructive actions default to dry-run or read-only.
|
||||||
|
- Real changes require explicit `--execute` and operator confirmation.
|
||||||
|
- Inputs are validated before use.
|
||||||
|
- Paths, service names, disks, volumes, and inventory targets are constrained.
|
||||||
|
- Rollback or recovery thinking is documented where the operation can change state.
|
||||||
|
|
||||||
|
## Bash
|
||||||
|
|
||||||
|
- Uses `#!/usr/bin/env bash`.
|
||||||
|
- Uses `set -o errexit`, `set -o nounset`, and `set -o pipefail`.
|
||||||
|
- Missing commands produce a clear warning or cause an exit with code `2` (invalid input or missing dependency).
|
||||||
|
- Output uses `OK`, `WARNING`, and `CRITICAL` consistently.
|
||||||
|
- Exit codes follow repo convention: `0` OK, `1` operational issue, `2` invalid input or missing dependency.
|
||||||
|
- Help output exists for scripts that accept arguments.
|
||||||
|
|
||||||
|
## Ansible
|
||||||
|
|
||||||
|
- Target hosts are explicit and appropriate for the role.
|
||||||
|
- Modules are preferred over `shell` or `command`.
|
||||||
|
- Check mode and diff mode are considered.
|
||||||
|
- Tasks are idempotent, or are clearly documented as exceptions when a check is inherently read-only or platform-specific.
|
||||||
|
- Handlers, tags, defaults, and validation tasks are used where useful.
|
||||||
|
- Inventory, vars, and role defaults do not contain secrets or real environment data.
|
||||||
|
|
||||||
|
## Documentation
|
||||||
|
|
||||||
|
- README files explain current state without overstating completeness.
|
||||||
|
- Runbooks include scope, pre-checks, execution controls, post-checks, and evidence.
|
||||||
|
- Docs avoid tutorial filler and fake enterprise complexity.
|
||||||
|
- Important limitations are linked or documented.
|
||||||
|
- `CHANGELOG.md` is updated for meaningful repo changes.
|
||||||
|
|
||||||
|
## Operational Realism
|
||||||
|
|
||||||
|
- The change accurately reflects how the relevant platform is operated (RHEL/Oracle Linux, Debian/Ubuntu, AIX, Veritas, GPFS, Zabbix, ELK, Docker, Kubernetes/K3s, Terraform, VMware, or Proxmox).
|
||||||
|
- Examples remain sanitized.
|
||||||
|
- Placeholder projects are identified as placeholders.
|
||||||
|
- There is no unnecessary abstraction or invented complexity.
|
||||||
|
|
||||||
|
## Validation
|
||||||
|
|
||||||
|
- Changed Bash scripts pass `bash -n`.
|
||||||
|
- `shellcheck` was run if available, or its absence was reported.
|
||||||
|
- Ansible syntax/lint checks were run if available and relevant.
|
||||||
|
- YAML/Markdown sanity checks were run if available.
|
||||||
|
- Failures and skipped checks are visible in the final summary.
|
||||||
@@ -0,0 +1,276 @@
|
|||||||
|
# Task Templates
|
||||||
|
|
||||||
|
Copy the relevant section into a future Codex request and fill in the blanks.
|
||||||
|
|
||||||
|
## Operational Bash Tool
|
||||||
|
|
||||||
|
### Goal
|
||||||
|
|
||||||
|
Build or improve a Bash tool for:
|
||||||
|
|
||||||
|
### Context
|
||||||
|
|
||||||
|
Affected platform, incident, or operational workflow:
|
||||||
|
|
||||||
|
### Constraints
|
||||||
|
|
||||||
|
- Default to dry-run/read-only.
|
||||||
|
- Require `--execute` for changes.
|
||||||
|
- Use `OK`, `WARNING`, and `CRITICAL`.
|
||||||
|
- Exit `0` OK, `1` operational issue, `2` invalid input or missing dependency.
|
||||||
|
|
||||||
|
### Files/directories to inspect
|
||||||
|
|
||||||
|
- `infra-run/scripts/bash/`
|
||||||
|
- Relevant runbook or README:
|
||||||
|
|
||||||
|
### Implementation steps
|
||||||
|
|
||||||
|
1. Inspect neighboring scripts and shared helpers.
|
||||||
|
2. Add or adjust usage/help output.
|
||||||
|
3. Add discovery, pre-check, guarded change, post-check, and reporting sections where useful.
|
||||||
|
4. Update README or runbook notes.
|
||||||
|
|
||||||
|
### Validation commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bash -n <script>
|
||||||
|
./scripts/check-bash.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### Done when
|
||||||
|
|
||||||
|
The tool is readable, safe by default, validates inputs, reports clearly, and has updated docs.
|
||||||
|
|
||||||
|
## Ansible Playbook/Role
|
||||||
|
|
||||||
|
### Goal
|
||||||
|
|
||||||
|
Add or improve Ansible automation for:
|
||||||
|
|
||||||
|
### Context
|
||||||
|
|
||||||
|
Target OS and inventory group:
|
||||||
|
|
||||||
|
### Constraints
|
||||||
|
|
||||||
|
- Preserve check-mode friendliness.
|
||||||
|
- Prefer modules over shell/command.
|
||||||
|
- Keep playbooks short.
|
||||||
|
- Keep role defaults sanitized.
|
||||||
|
|
||||||
|
### Files/directories to inspect
|
||||||
|
|
||||||
|
- `infra-run/ansible/README.md`
|
||||||
|
- `infra-run/ansible/inventory/`
|
||||||
|
- `infra-run/ansible/playbooks/`
|
||||||
|
- `infra-run/ansible/roles/`
|
||||||
|
|
||||||
|
### Implementation steps
|
||||||
|
|
||||||
|
1. Inspect existing role/playbook patterns.
|
||||||
|
2. Add defaults, tasks, handlers, and tags only where needed.
|
||||||
|
3. Add validation or post-check tasks for operational evidence.
|
||||||
|
4. Update role/playbook README.
|
||||||
|
|
||||||
|
### Validation commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./scripts/check-ansible.sh
|
||||||
|
cd infra-run/ansible && ansible-playbook --syntax-check -i inventory/hosts.yml playbooks/<playbook>.yml
|
||||||
|
```
|
||||||
|
|
||||||
|
### Done when
|
||||||
|
|
||||||
|
The playbook targets the right hosts, is idempotent where practical, supports review with `--check --diff`, and docs explain limitations.
|
||||||
|
|
||||||
|
## Runbook
|
||||||
|
|
||||||
|
### Goal
|
||||||
|
|
||||||
|
Create or improve a runbook for:
|
||||||
|
|
||||||
|
### Context
|
||||||
|
|
||||||
|
Incident signal, platform, and affected service:
|
||||||
|
|
||||||
|
### Constraints
|
||||||
|
|
||||||
|
- Include pre-checks, decision points, rollback, post-checks, and evidence.
|
||||||
|
- Avoid pretending lab notes are production-certified.
|
||||||
|
|
||||||
|
### Files/directories to inspect
|
||||||
|
|
||||||
|
- `infra-run/runbooks/`
|
||||||
|
- `infra-run/docs/`
|
||||||
|
- Related scripts/examples:
|
||||||
|
|
||||||
|
### Implementation steps
|
||||||
|
|
||||||
|
1. Define scope and assumptions.
|
||||||
|
2. Add triage steps and command examples.
|
||||||
|
3. Add safe execution gates.
|
||||||
|
4. Add validation and handoff notes.
|
||||||
|
|
||||||
|
### Validation commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./scripts/check-docs.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### Done when
|
||||||
|
|
||||||
|
An operator can follow the runbook without guessing the risk, inputs, or success criteria.
|
||||||
|
|
||||||
|
## Lab Scenario
|
||||||
|
|
||||||
|
### Goal
|
||||||
|
|
||||||
|
Add or improve a lab scenario for:
|
||||||
|
|
||||||
|
### Context
|
||||||
|
|
||||||
|
Technology and local environment:
|
||||||
|
|
||||||
|
### Constraints
|
||||||
|
|
||||||
|
- Mark lab-only behavior clearly.
|
||||||
|
- Keep prerequisites and cleanup explicit.
|
||||||
|
|
||||||
|
### Files/directories to inspect
|
||||||
|
|
||||||
|
- `labs/`
|
||||||
|
- `labs/docs/lab-cheatsheet.md`
|
||||||
|
|
||||||
|
### Implementation steps
|
||||||
|
|
||||||
|
1. Document prerequisites and topology.
|
||||||
|
2. Add setup, validation, failure injection if relevant, and cleanup.
|
||||||
|
3. Link related scripts or runbooks.
|
||||||
|
|
||||||
|
### Validation commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./scripts/check-docs.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### Done when
|
||||||
|
|
||||||
|
The lab is reproducible enough to review and does not imply production readiness.
|
||||||
|
|
||||||
|
## Platform Project
|
||||||
|
|
||||||
|
### Goal
|
||||||
|
|
||||||
|
Add or improve a platform project for:
|
||||||
|
|
||||||
|
### Context
|
||||||
|
|
||||||
|
Monitoring, storage, clustering, virtualization, observability, or related topic:
|
||||||
|
|
||||||
|
### Constraints
|
||||||
|
|
||||||
|
- Keep status honest: planned, partial, lab-tested, or complete.
|
||||||
|
- Prefer operational notes over marketing language.
|
||||||
|
|
||||||
|
### Files/directories to inspect
|
||||||
|
|
||||||
|
- `platform-projects/`
|
||||||
|
- `platform-projects/docs/platform-cheatsheet.md`
|
||||||
|
|
||||||
|
### Implementation steps
|
||||||
|
|
||||||
|
1. Identify scope and current maturity.
|
||||||
|
2. Add design notes, operational workflows, and validation.
|
||||||
|
3. Link runbooks, examples, and known limitations.
|
||||||
|
|
||||||
|
### Validation commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./scripts/check-docs.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### Done when
|
||||||
|
|
||||||
|
The project explains what exists, how to validate it, and what remains unproven.
|
||||||
|
|
||||||
|
## Documentation Cleanup
|
||||||
|
|
||||||
|
### Goal
|
||||||
|
|
||||||
|
Clean up documentation for:
|
||||||
|
|
||||||
|
### Context
|
||||||
|
|
||||||
|
Current confusion, duplication, or missing links:
|
||||||
|
|
||||||
|
### Constraints
|
||||||
|
|
||||||
|
- Preserve useful operational detail.
|
||||||
|
- Avoid tutorial-style filler.
|
||||||
|
|
||||||
|
### Files/directories to inspect
|
||||||
|
|
||||||
|
- Root `README.md`
|
||||||
|
- Section README files
|
||||||
|
- Related docs/runbooks:
|
||||||
|
|
||||||
|
### Implementation steps
|
||||||
|
|
||||||
|
1. Remove duplication where it hurts navigation.
|
||||||
|
2. Add links to canonical docs.
|
||||||
|
3. Make limitations explicit.
|
||||||
|
4. Update changelog if meaningful.
|
||||||
|
|
||||||
|
### Validation commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./scripts/check-docs.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### Done when
|
||||||
|
|
||||||
|
Readers can find the right tool, runbook, or validation command quickly.
|
||||||
|
|
||||||
|
## Repository Review
|
||||||
|
|
||||||
|
### Goal
|
||||||
|
|
||||||
|
Review repository quality for:
|
||||||
|
|
||||||
|
### Context
|
||||||
|
|
||||||
|
Areas of concern:
|
||||||
|
|
||||||
|
### Constraints
|
||||||
|
|
||||||
|
- Findings first, ordered by severity.
|
||||||
|
- Include file/line references where possible.
|
||||||
|
- Do not rewrite unrelated content.
|
||||||
|
|
||||||
|
### Files/directories to inspect
|
||||||
|
|
||||||
|
- `AGENTS.md`
|
||||||
|
- `README.md`
|
||||||
|
- `infra-run/`
|
||||||
|
- `platform-projects/`
|
||||||
|
- `labs/`
|
||||||
|
- `scripts/`
|
||||||
|
|
||||||
|
### Implementation steps
|
||||||
|
|
||||||
|
1. Inspect structure and conventions.
|
||||||
|
2. Review safety, validation, docs, and maintainability.
|
||||||
|
3. Patch only low-risk issues if requested.
|
||||||
|
4. Report risks and follow-ups.
|
||||||
|
|
||||||
|
### Validation commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./scripts/validate-repo.sh
|
||||||
|
git diff --stat
|
||||||
|
```
|
||||||
|
|
||||||
|
### Done when
|
||||||
|
|
||||||
|
The review identifies practical risks and leaves a clear next action list.
|
||||||
@@ -1,329 +0,0 @@
|
|||||||
# Runbooks and Operational Procedures
|
|
||||||
|
|
||||||
This document contains operational runbooks for deploying, managing, and troubleshooting the Enterprise Infrastructure Portfolio projects.
|
|
||||||
|
|
||||||
## Table of Contents
|
|
||||||
|
|
||||||
1. [Infrastructure Simulator Operations](#infrastructure-simulator-operations)
|
|
||||||
2. [Migration Validation Procedures](#migration-validation-procedures)
|
|
||||||
3. [Observability Stack Management](#observability-stack-management)
|
|
||||||
4. [Troubleshooting Guide](#troubleshooting-guide)
|
|
||||||
|
|
||||||
## Infrastructure Simulator Operations
|
|
||||||
|
|
||||||
### Starting the Infrastructure
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd enterprise-infra-simulator
|
|
||||||
make up
|
|
||||||
```
|
|
||||||
|
|
||||||
**Expected Outcome:**
|
|
||||||
- Docker containers for simulated Linux nodes are created
|
|
||||||
- Ansible inventory is populated
|
|
||||||
- Basic services are running on all nodes
|
|
||||||
|
|
||||||
**Verification:**
|
|
||||||
```bash
|
|
||||||
docker ps | grep infra-sim
|
|
||||||
ansible -i inventory/hosts.ini all -m ping
|
|
||||||
```
|
|
||||||
|
|
||||||
### Patching Operations
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd enterprise-infra-simulator
|
|
||||||
make patch
|
|
||||||
```
|
|
||||||
|
|
||||||
**Procedure:**
|
|
||||||
1. Backup current container states
|
|
||||||
2. Apply security patches via Ansible
|
|
||||||
3. Validate service availability
|
|
||||||
4. Generate patch report
|
|
||||||
|
|
||||||
**Rollback:**
|
|
||||||
```bash
|
|
||||||
docker-compose down
|
|
||||||
docker-compose up --scale node=0
|
|
||||||
make up
|
|
||||||
```
|
|
||||||
|
|
||||||
### Hardening Operations
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd enterprise-infra-simulator
|
|
||||||
ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml
|
|
||||||
```
|
|
||||||
|
|
||||||
**Hardening Steps:**
|
|
||||||
- Disable unnecessary services
|
|
||||||
- Configure firewall rules
|
|
||||||
- Set secure SSH configurations
|
|
||||||
- Apply CIS benchmarks
|
|
||||||
|
|
||||||
### Scaling Operations
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd enterprise-infra-simulator
|
|
||||||
./scripts/simulate_scaling.sh up 3
|
|
||||||
```
|
|
||||||
|
|
||||||
**Scaling Parameters:**
|
|
||||||
- Direction: up/down
|
|
||||||
- Count: number of nodes to add/remove
|
|
||||||
- Type: web/app/db
|
|
||||||
|
|
||||||
### Failure Simulation
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd enterprise-infra-simulator
|
|
||||||
./scripts/simulate_failure.sh --type network --duration 300
|
|
||||||
```
|
|
||||||
|
|
||||||
**Failure Types:**
|
|
||||||
- network: Network partition
|
|
||||||
- disk: Disk space exhaustion
|
|
||||||
- service: Service crashes
|
|
||||||
- node: Complete node failure
|
|
||||||
|
|
||||||
### Decommissioning
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd enterprise-infra-simulator
|
|
||||||
make destroy
|
|
||||||
```
|
|
||||||
|
|
||||||
**Decommission Steps:**
|
|
||||||
1. Graceful service shutdown
|
|
||||||
2. Data backup and export
|
|
||||||
3. Configuration cleanup
|
|
||||||
4. Container removal
|
|
||||||
|
|
||||||
## Migration Validation Procedures
|
|
||||||
|
|
||||||
### Pre-Migration Snapshot
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd migration-validation-framework
|
|
||||||
python3 cli.py collect --output before.json --systems web01,db01
|
|
||||||
```
|
|
||||||
|
|
||||||
**Data Collected:**
|
|
||||||
- Mount points and filesystem usage
|
|
||||||
- Running services and their states
|
|
||||||
- Disk usage statistics
|
|
||||||
- Network configurations
|
|
||||||
|
|
||||||
### Post-Migration Validation
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 cli.py collect --output after.json --systems web01,db01
|
|
||||||
python3 cli.py compare before.json after.json --output diff.json
|
|
||||||
```
|
|
||||||
|
|
||||||
**Validation Checks:**
|
|
||||||
- Service availability verification
|
|
||||||
- Filesystem integrity
|
|
||||||
- Configuration consistency
|
|
||||||
- Performance metrics comparison
|
|
||||||
|
|
||||||
### Report Generation
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 cli.py report --comparison <comparison-id> --format html
|
|
||||||
```
|
|
||||||
|
|
||||||
**Report Contents:**
|
|
||||||
- Executive summary
|
|
||||||
- Detailed change log
|
|
||||||
- Risk assessment
|
|
||||||
- Recommendations
|
|
||||||
|
|
||||||
## Observability Stack Management
|
|
||||||
|
|
||||||
### Starting the Stack
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd observability-stack
|
|
||||||
docker-compose up -d
|
|
||||||
```
|
|
||||||
|
|
||||||
**Service Startup Order:**
|
|
||||||
1. Elasticsearch
|
|
||||||
2. Logstash
|
|
||||||
3. Kibana
|
|
||||||
4. Grafana
|
|
||||||
|
|
||||||
### Log Ingestion Testing
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Send sample logs
|
|
||||||
curl -X POST "localhost:8080" -H "Content-Type: application/json" -d @logs/sample.log
|
|
||||||
```
|
|
||||||
|
|
||||||
### Alert Configuration
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Load alert rules
|
|
||||||
curl -X POST "localhost:3000/api/alerts" -H "Authorization: Bearer <token>" -d @alerting/alert_rules.json
|
|
||||||
```
|
|
||||||
|
|
||||||
### Incident Simulation
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd observability-stack
|
|
||||||
./scenarios/incident_simulation.sh --type disk-full --severity critical
|
|
||||||
```
|
|
||||||
|
|
||||||
**Incident Types:**
|
|
||||||
- disk-full: Simulate disk space exhaustion
|
|
||||||
- service-down: Service failure simulation
|
|
||||||
- high-cpu: CPU utilization spike
|
|
||||||
- network-latency: Network performance degradation
|
|
||||||
|
|
||||||
## Troubleshooting Guide
|
|
||||||
|
|
||||||
### Common Issues
|
|
||||||
|
|
||||||
#### Ansible Connection Failures
|
|
||||||
|
|
||||||
**Symptoms:**
|
|
||||||
- `UNREACHABLE` errors in Ansible output
|
|
||||||
- SSH connection timeouts
|
|
||||||
|
|
||||||
**Resolution:**
|
|
||||||
```bash
|
|
||||||
# Check container status
|
|
||||||
docker ps | grep infra-sim
|
|
||||||
|
|
||||||
# Verify SSH keys
|
|
||||||
ansible -i inventory/hosts.ini all -m ping --private-key ~/.ssh/id_rsa
|
|
||||||
|
|
||||||
# Restart containers
|
|
||||||
make destroy && make up
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Elasticsearch Cluster Issues
|
|
||||||
|
|
||||||
**Symptoms:**
|
|
||||||
- Kibana shows "No living connections"
|
|
||||||
- Logstash pipeline failures
|
|
||||||
|
|
||||||
**Resolution:**
|
|
||||||
```bash
|
|
||||||
# Check cluster health
|
|
||||||
curl -X GET "localhost:9200/_cluster/health?pretty"
|
|
||||||
|
|
||||||
# Restart services
|
|
||||||
docker-compose restart elasticsearch logstash kibana
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Python Import Errors
|
|
||||||
|
|
||||||
**Symptoms:**
|
|
||||||
- ModuleNotFoundError in migration framework
|
|
||||||
- Collector failures
|
|
||||||
|
|
||||||
**Resolution:**
|
|
||||||
```bash
|
|
||||||
# Install dependencies
|
|
||||||
pip install -r requirements.txt
|
|
||||||
|
|
||||||
# Check Python path
|
|
||||||
python -c "import sys; print(sys.path)"
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Docker Resource Constraints
|
|
||||||
|
|
||||||
**Symptoms:**
|
|
||||||
- Container startup failures
|
|
||||||
- Out of memory errors
|
|
||||||
|
|
||||||
**Resolution:**
|
|
||||||
```bash
|
|
||||||
# Check Docker resources
|
|
||||||
docker system df
|
|
||||||
|
|
||||||
# Clean up unused resources
|
|
||||||
docker system prune -a
|
|
||||||
|
|
||||||
# Increase Docker memory limit
|
|
||||||
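# Edit /etc/docker/daemon.json -- NOTE(review): "memory" and "cpu-count" are not documented dockerd options; confirm before use, or set limits per container (docker run --memory 4g --cpus 2)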
|
|
||||||
{
|
|
||||||
"memory": "4g",
|
|
||||||
"cpu-count": 2
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Log Locations
|
|
||||||
|
|
||||||
- **Ansible:** `enterprise-infra-simulator/ansible.log`
|
|
||||||
- **Docker:** `docker logs <container-name>`
|
|
||||||
- **Elasticsearch:** `observability-stack/logs/elasticsearch.log`
|
|
||||||
- **Migration Framework:** `migration-validation-framework/logs/validation.log`
|
|
||||||
|
|
||||||
### Performance Monitoring
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Infrastructure monitoring
|
|
||||||
ansible -i inventory/hosts.ini all -m shell -a "top -b -n1 | head -20"
|
|
||||||
|
|
||||||
# Elasticsearch metrics
|
|
||||||
curl -X GET "localhost:9200/_cluster/stats?pretty"
|
|
||||||
|
|
||||||
# Python performance
|
|
||||||
python -m cProfile cli.py snapshot
|
|
||||||
```
|
|
||||||
|
|
||||||
### Backup and Recovery
|
|
||||||
|
|
||||||
#### Infrastructure Backup
|
|
||||||
```bash
|
|
||||||
cd enterprise-infra-simulator
|
|
||||||
docker-compose exec ansible ansible-playbook /playbooks/backup.yml
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Data Backup
|
|
||||||
```bash
|
|
||||||
cd observability-stack
|
|
||||||
docker-compose exec elasticsearch curl -X PUT "localhost:9200/_snapshot/backup" -H "Content-Type: application/json" -d @backup_config.json
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Migration Data Backup
|
|
||||||
```bash
|
|
||||||
cd migration-validation-framework
|
|
||||||
tar -czf /backup/location/migration-validation-framework.tgz -C .. migration-validation-framework
|
|
||||||
```
|
|
||||||
|
|
||||||
## Emergency Procedures
|
|
||||||
|
|
||||||
### Complete System Reset
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Stop all services
|
|
||||||
docker-compose down -v
|
|
||||||
(cd enterprise-infra-simulator && make destroy)
|
|
||||||
|
|
||||||
# Clean up volumes
|
|
||||||
docker volume prune -f
|
|
||||||
|
|
||||||
# Restart from clean state
|
|
||||||
(cd enterprise-infra-simulator && make up)
|
|
||||||
cd observability-stack && docker-compose up -d
|
|
||||||
```
|
|
||||||
|
|
||||||
### Incident Response
|
|
||||||
|
|
||||||
1. **Assess Impact:** Check monitoring dashboards
|
|
||||||
2. **Isolate Issue:** Use failure simulation scripts to reproduce
|
|
||||||
3. **Implement Fix:** Apply appropriate runbook procedure
|
|
||||||
4. **Validate Recovery:** Run validation framework
|
|
||||||
5. **Document Incident:** Update runbooks with lessons learned
|
|
||||||
|
|
||||||
## Maintenance Schedules
|
|
||||||
|
|
||||||
- **Daily:** Log rotation and cleanup
|
|
||||||
- **Weekly:** Security patching and updates
|
|
||||||
- **Monthly:** Performance optimization and capacity planning
|
|
||||||
- **Quarterly:** Architecture review and modernization
|
|
||||||
@@ -1,20 +0,0 @@
|
|||||||
---
|
|
||||||
# Ansible-lint configuration
|
|
||||||
#extends: default
|
|
||||||
|
|
||||||
skip_list:
|
|
||||||
- 'role-name'
|
|
||||||
- 'name[casing]'
|
|
||||||
- 'line-too-long'
|
|
||||||
|
|
||||||
# Exclude these paths from linting
|
|
||||||
exclude_paths:
|
|
||||||
- .git
|
|
||||||
- .github
|
|
||||||
- molecule/default/tests/
|
|
||||||
|
|
||||||
# Custom rules
|
|
||||||
#rules:
|
|
||||||
# line-length:
|
|
||||||
# max: 160
|
|
||||||
# level: warning
|
|
||||||
@@ -1,173 +0,0 @@
|
|||||||
# Enterprise Infrastructure Simulator Makefile
|
|
||||||
|
|
||||||
.PHONY: help run demo up down patch destroy status logs clean test
|
|
||||||
|
|
||||||
# Default target
|
|
||||||
help: ## Show this help message
|
|
||||||
@echo "Enterprise Infrastructure Simulator"
|
|
||||||
@echo ""
|
|
||||||
@echo "Available commands:"
|
|
||||||
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf " %-15s %s\n", $$1, $$2}'
|
|
||||||
|
|
||||||
run: ## Run the default simulator workflow
|
|
||||||
ansible-playbook -i inventory/hosts.ini playbooks/provision.yml
|
|
||||||
|
|
||||||
demo: ## Run a failure-and-patch demonstration
|
|
||||||
./scripts/simulate_failure.sh service 30 web
|
|
||||||
ansible-playbook -i inventory/hosts.ini playbooks/patch.yml
|
|
||||||
|
|
||||||
# Infrastructure management
|
|
||||||
up: ## Start the infrastructure simulation
|
|
||||||
@echo "Starting enterprise infrastructure simulation..."
|
|
||||||
docker-compose up -d
|
|
||||||
@echo "Waiting for containers to be ready..."
|
|
||||||
@sleep 30
|
|
||||||
ansible-playbook -i inventory/hosts.ini playbooks/provision.yml
|
|
||||||
@echo "Infrastructure simulation started successfully"
|
|
||||||
|
|
||||||
down: ## Stop the infrastructure simulation
|
|
||||||
@echo "Stopping infrastructure simulation..."
|
|
||||||
ansible-playbook -i inventory/hosts.ini playbooks/decommission.yml || true
|
|
||||||
docker-compose down
|
|
||||||
@echo "Infrastructure simulation stopped"
|
|
||||||
|
|
||||||
patch: ## Apply security patches to all nodes
|
|
||||||
@echo "Applying security patches..."
|
|
||||||
ansible-playbook -i inventory/hosts.ini playbooks/patch.yml
|
|
||||||
@echo "Security patches applied"
|
|
||||||
|
|
||||||
destroy: ## Completely destroy the infrastructure
|
|
||||||
@echo "Destroying infrastructure..."
|
|
||||||
ansible-playbook -i inventory/hosts.ini playbooks/decommission.yml || true
|
|
||||||
docker-compose down -v --remove-orphans
|
|
||||||
docker system prune -f
|
|
||||||
rm -rf logs/* reports/*
|
|
||||||
@echo "Infrastructure completely destroyed"
|
|
||||||
|
|
||||||
# Scaling operations
|
|
||||||
scale-up-web: ## Scale up web servers (usage: make scale-up-web COUNT=2)
|
|
||||||
@echo "Scaling up $(COUNT) web servers..."
|
|
||||||
./scripts/simulate_scaling.sh up $(or $(COUNT),1) web
|
|
||||||
|
|
||||||
scale-up-db: ## Scale up database servers (usage: make scale-up-db COUNT=1)
|
|
||||||
@echo "Scaling up $(COUNT) database servers..."
|
|
||||||
./scripts/simulate_scaling.sh up $(or $(COUNT),1) db
|
|
||||||
|
|
||||||
scale-down-web: ## Scale down web servers (usage: make scale-down-web COUNT=1)
|
|
||||||
@echo "Scaling down $(COUNT) web servers..."
|
|
||||||
./scripts/simulate_scaling.sh down $(or $(COUNT),1) web
|
|
||||||
|
|
||||||
scale-down-db: ## Scale down database servers (usage: make scale-down-db COUNT=1)
|
|
||||||
@echo "Scaling down $(COUNT) database servers..."
|
|
||||||
./scripts/simulate_scaling.sh down $(or $(COUNT),1) db
|
|
||||||
|
|
||||||
# Failure simulation
|
|
||||||
fail-network: ## Simulate network failure (usage: make fail-network DURATION=60)
|
|
||||||
@echo "Simulating network failure for $(or $(DURATION),60) seconds..."
|
|
||||||
./scripts/simulate_failure.sh network $(or $(DURATION),60)
|
|
||||||
|
|
||||||
fail-disk: ## Simulate disk space exhaustion (usage: make fail-disk DURATION=120)
|
|
||||||
@echo "Simulating disk failure for $(or $(DURATION),120) seconds..."
|
|
||||||
./scripts/simulate_failure.sh disk $(or $(DURATION),120)
|
|
||||||
|
|
||||||
fail-service: ## Simulate service failures (usage: make fail-service DURATION=30)
|
|
||||||
@echo "Simulating service failure for $(or $(DURATION),30) seconds..."
|
|
||||||
./scripts/simulate_failure.sh service $(or $(DURATION),30)
|
|
||||||
|
|
||||||
fail-node: ## Simulate complete node failure (usage: make fail-node DURATION=300)
|
|
||||||
@echo "Simulating node failure for $(or $(DURATION),300) seconds..."
|
|
||||||
./scripts/simulate_failure.sh node $(or $(DURATION),300)
|
|
||||||
|
|
||||||
# Monitoring and status
|
|
||||||
status: ## Show infrastructure status
|
|
||||||
@echo "=== Docker Containers ==="
|
|
||||||
docker-compose ps
|
|
||||||
@echo ""
|
|
||||||
@echo "=== Ansible Inventory ==="
|
|
||||||
ansible -i inventory/hosts.ini --list-hosts all || echo "Inventory check failed"
|
|
||||||
@echo ""
|
|
||||||
@echo "=== System Resources ==="
|
|
||||||
docker stats --no-stream --format "table {{.Container}}\t{{.CPUPerc}}\t{{.MemPerc}}\t{{.NetIO}}"
|
|
||||||
|
|
||||||
logs: ## Show infrastructure logs
|
|
||||||
docker-compose logs -f --tail=100
|
|
||||||
|
|
||||||
logs-web: ## Show web server logs
|
|
||||||
docker-compose logs -f web
|
|
||||||
|
|
||||||
logs-db: ## Show database logs
|
|
||||||
docker-compose logs -f db
|
|
||||||
|
|
||||||
# Testing and validation
|
|
||||||
test: ## Run infrastructure tests
|
|
||||||
@echo "Running infrastructure tests..."
|
|
||||||
ansible -i inventory/hosts.ini all -m ping
|
|
||||||
ansible-playbook -i inventory/hosts.ini --syntax-check playbooks/*.yml
|
|
||||||
@echo "Testing scaling scripts..."
|
|
||||||
./scripts/simulate_scaling.sh up 0 web # Dry run
|
|
||||||
./scripts/simulate_failure.sh network 1 # Quick test
|
|
||||||
@echo "All tests passed"
|
|
||||||
|
|
||||||
validate: ## Validate infrastructure configuration
|
|
||||||
@echo "Validating configuration..."
|
|
||||||
ansible-playbook -i inventory/hosts.ini playbooks/provision.yml --check
|
|
||||||
docker-compose config
|
|
||||||
@echo "Configuration validation complete"
|
|
||||||
|
|
||||||
# Scenarios
|
|
||||||
scenario-scaling: ## Run scaling event scenario
|
|
||||||
@echo "Running scaling event scenario..."
|
|
||||||
ansible-playbook -i inventory/hosts.ini scenarios/scaling_event.yml
|
|
||||||
|
|
||||||
scenario-disaster: ## Run disaster recovery scenario
|
|
||||||
@echo "Running disaster recovery scenario..."
|
|
||||||
ansible-playbook -i inventory/hosts.ini scenarios/disaster_recovery.yml
|
|
||||||
|
|
||||||
# Maintenance
|
|
||||||
clean: ## Clean up temporary files and logs
|
|
||||||
@echo "Cleaning up temporary files..."
|
|
||||||
rm -rf logs/*.log reports/*.txt
|
|
||||||
docker system prune -f
|
|
||||||
@echo "Cleanup complete"
|
|
||||||
|
|
||||||
backup: ## Create infrastructure backup
|
|
||||||
@echo "Creating infrastructure backup..."
|
|
||||||
mkdir -p backups/$(shell date +%Y%m%d_%H%M%S)
|
|
||||||
ansible-playbook -i inventory/hosts.ini playbooks/backup.yml
|
|
||||||
docker-compose exec ansible tar -czf /backups/infra_backup.tar.gz /infrastructure
|
|
||||||
@echo "Backup created"
|
|
||||||
|
|
||||||
# Development
|
|
||||||
lint: ## Lint Ansible playbooks
|
|
||||||
@echo "Linting Ansible playbooks..."
|
|
||||||
ansible-lint playbooks/*.yml scenarios/*.yml
|
|
||||||
@echo "Linting complete"
|
|
||||||
|
|
||||||
format: ## Format code and configuration
|
|
||||||
@echo "Formatting code..."
|
|
||||||
# Add formatting commands here
|
|
||||||
@echo "Formatting complete"
|
|
||||||
|
|
||||||
# Security
|
|
||||||
harden: ## Apply security hardening
|
|
||||||
@echo "Applying security hardening..."
|
|
||||||
ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml
|
|
||||||
|
|
||||||
security-scan: ## Run security scans
|
|
||||||
@echo "Running security scans..."
|
|
||||||
ansible-playbook -i inventory/hosts.ini playbooks/security_scan.yml
|
|
||||||
|
|
||||||
# Help for specific targets
|
|
||||||
help-scaling: ## Show scaling-related commands
|
|
||||||
@echo "Scaling Commands:"
|
|
||||||
@echo " make scale-up-web COUNT=2 - Add 2 web servers"
|
|
||||||
@echo " make scale-up-db COUNT=1 - Add 1 database server"
|
|
||||||
@echo " make scale-down-web COUNT=1 - Remove 1 web server"
|
|
||||||
@echo " make scale-down-db COUNT=1 - Remove 1 database server"
|
|
||||||
|
|
||||||
help-failure: ## Show failure simulation commands
|
|
||||||
@echo "Failure Simulation Commands:"
|
|
||||||
@echo " make fail-network DURATION=60 - Network failure for 60s"
|
|
||||||
@echo " make fail-disk DURATION=120 - Disk exhaustion for 120s"
|
|
||||||
@echo " make fail-service DURATION=30 - Service failure for 30s"
|
|
||||||
@echo " make fail-node DURATION=300 - Node failure for 300s"
|
|
||||||
@@ -1,74 +0,0 @@
|
|||||||
# Enterprise Infrastructure Simulator
|
|
||||||
|
|
||||||
## Problem Statement
|
|
||||||
|
|
||||||
Infrastructure teams need a safe place to rehearse lifecycle operations before applying them to production fleets. Patch windows, hardening changes, scale events, and node failures all carry operational risk when they are tested only during real incidents.
|
|
||||||
|
|
||||||
## Solution Overview
|
|
||||||
|
|
||||||
This project models common Linux infrastructure operations with Ansible playbooks and shell-based simulations. It keeps the automation readable and auditable while producing example evidence that resembles a real change record.
|
|
||||||
|
|
||||||
## Architecture Overview
|
|
||||||
|
|
||||||
```
|
|
||||||
Operator -> Make/CLI -> Ansible Inventory -> Playbooks -> Linux Nodes
|
|
||||||
| |
|
|
||||||
v v
|
|
||||||
Scenarios Reports/Logs
|
|
||||||
```
|
|
||||||
|
|
||||||
Core components:
|
|
||||||
|
|
||||||
- `inventory/hosts.ini` defines managed node groups.
|
|
||||||
- `playbooks/` contains provisioning, patching, hardening, and decommissioning workflows.
|
|
||||||
- `scripts/` injects scaling and failure conditions.
|
|
||||||
- `scenarios/` documents operational exercises.
|
|
||||||
- `examples/` stores representative outputs for review.
|
|
||||||
|
|
||||||
## How to Run
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd enterprise-infra-simulator
|
|
||||||
|
|
||||||
# Validate playbook syntax.
|
|
||||||
make test
|
|
||||||
|
|
||||||
# Provision the simulated estate.
|
|
||||||
make run
|
|
||||||
|
|
||||||
# Apply security patches.
|
|
||||||
make patch
|
|
||||||
|
|
||||||
# Apply host hardening.
|
|
||||||
make harden
|
|
||||||
|
|
||||||
# Run the failure and patch demo.
|
|
||||||
make demo
|
|
||||||
```
|
|
||||||
|
|
||||||
Direct Ansible commands are also supported:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
ansible-playbook -i inventory/hosts.ini playbooks/provision.yml
|
|
||||||
ansible-playbook -i inventory/hosts.ini playbooks/patch.yml
|
|
||||||
ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml
|
|
||||||
```
|
|
||||||
|
|
||||||
## Example Output
|
|
||||||
|
|
||||||
```text
|
|
||||||
PLAY RECAP *********************************************************************
|
|
||||||
web01 : ok=21 changed=7 unreachable=0 failed=0 skipped=3 rescued=0 ignored=1
|
|
||||||
db01 : ok=18 changed=4 unreachable=0 failed=0 skipped=5 rescued=0 ignored=1
|
|
||||||
lb01 : ok=16 changed=3 unreachable=0 failed=0 skipped=6 rescued=0 ignored=0
|
|
||||||
|
|
||||||
Patch status: SUCCESS
|
|
||||||
Updates applied: 12
|
|
||||||
Reboot required: false
|
|
||||||
```
|
|
||||||
|
|
||||||
Additional sample evidence is available in [examples/patch-output.txt](examples/patch-output.txt) and [examples/failure-simulation.txt](examples/failure-simulation.txt).
|
|
||||||
|
|
||||||
## Real-World Use Case
|
|
||||||
|
|
||||||
A platform team can use this project to demonstrate how routine operating procedures are encoded, reviewed, and tested before production change windows. The same patterns apply to regulated Linux estates where patch evidence, hardening controls, and incident drills must be repeatable.
|
|
||||||
@@ -1,207 +0,0 @@
|
|||||||
# Enterprise Infrastructure Simulator - Refactored
|
|
||||||
|
|
||||||
Refactored enterprise infrastructure automation using Ansible best practices.
|
|
||||||
|
|
||||||
## Structure
|
|
||||||
|
|
||||||
```
|
|
||||||
playbooks/ # Main playbooks
|
|
||||||
├── provision.yml # Provision infrastructure nodes
|
|
||||||
├── patch.yml # Apply security patches
|
|
||||||
├── hardening.yml # Harden infrastructure
|
|
||||||
└── decommission.yml # Decommission nodes
|
|
||||||
|
|
||||||
roles/ # Reusable Ansible roles
|
|
||||||
├── base_provision/ # Base OS provisioning
|
|
||||||
├── patching/ # Patch management
|
|
||||||
├── hardening/ # Security hardening
|
|
||||||
└── decommission/ # Node decommissioning
|
|
||||||
|
|
||||||
group_vars/ # Group-level variables
|
|
||||||
├── all.yml # All hosts
|
|
||||||
├── webservers.yml # Web servers
|
|
||||||
├── databases.yml # Database servers
|
|
||||||
├── loadbalancers.yml
|
|
||||||
├── monitoring.yml
|
|
||||||
└── vault.yml # Encrypted secrets (Vault)
|
|
||||||
|
|
||||||
molecule/default/ # Testing with Molecule
|
|
||||||
├── molecule.yml # Molecule config
|
|
||||||
├── converge.yml # Test playbook
|
|
||||||
└── verify.yml # Test verification
|
|
||||||
```
|
|
||||||
|
|
||||||
## Best Practices Implemented
|
|
||||||
|
|
||||||
### ✅ Idempotency
|
|
||||||
- All tasks use `changed_when` and `failed_when` for proper state detection
|
|
||||||
- Command modules replaced with native Ansible modules where possible
|
|
||||||
- Shell tasks include `changed_when: false` when appropriate
|
|
||||||
|
|
||||||
### ✅ Roles + Structure
|
|
||||||
- Clean role separation: `base_provision`, `patching`, `hardening`, `decommission`
|
|
||||||
- Each role has: `tasks/`, `handlers/`, `defaults/`, `templates/`, `README.md`
|
|
||||||
- Proper namespacing prevents variable conflicts
|
|
||||||
|
|
||||||
### ✅ No Hardcoded Values
|
|
||||||
- All variables in `defaults/main.yml` or `group_vars/`
|
|
||||||
- No hardcoded values in playbooks
|
|
||||||
- Configurable through `group_vars` for different environments
|
|
||||||
|
|
||||||
### ✅ Handlers Instead of Restarts
|
|
||||||
- SSH restart via handler (triggered only on config change)
|
|
||||||
- fail2ban restart via handler
|
|
||||||
- Services not restarted unnecessarily
|
|
||||||
|
|
||||||
### ✅ Vault for Secrets
|
|
||||||
- Secrets go in `group_vars/vault.yml` (encrypted with Ansible Vault)
|
|
||||||
- Admin passwords not in plaintext
|
|
||||||
- Database credentials managed via Vault
|
|
||||||
|
|
||||||
### ✅ ansible-lint
|
|
||||||
- `.ansible-lint` configuration included
|
|
||||||
- Rules configured for project standards
|
|
||||||
- Run: `ansible-lint playbooks/ roles/`
|
|
||||||
|
|
||||||
### ✅ Molecule
|
|
||||||
- Docker-based testing in `molecule/default/`
|
|
||||||
- Test convergence and verification
|
|
||||||
- Run: `molecule test`
|
|
||||||
|
|
||||||
## Usage
|
|
||||||
|
|
||||||
### Run Provisioning
|
|
||||||
|
|
||||||
```bash
|
|
||||||
ansible-playbook playbooks/provision.yml -i inventory/hosts.ini
|
|
||||||
```
|
|
||||||
|
|
||||||
### Run Patching
|
|
||||||
|
|
||||||
```bash
|
|
||||||
ansible-playbook playbooks/patch.yml -i inventory/hosts.ini --ask-vault-pass
|
|
||||||
```
|
|
||||||
|
|
||||||
### Run Hardening
|
|
||||||
|
|
||||||
```bash
|
|
||||||
ansible-playbook playbooks/hardening.yml -i inventory/hosts.ini --ask-vault-pass
|
|
||||||
```
|
|
||||||
|
|
||||||
### Run Decommissioning
|
|
||||||
|
|
||||||
```bash
|
|
||||||
ansible-playbook playbooks/decommission.yml -i inventory/hosts.ini --ask-vault-pass
|
|
||||||
```
|
|
||||||
|
|
||||||
## Vault Management
|
|
||||||
|
|
||||||
### Create Vault Password File
|
|
||||||
|
|
||||||
```bash
|
|
||||||
echo "your-secure-password" > ~/.vault_pass.txt
|
|
||||||
chmod 600 ~/.vault_pass.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
### Encrypt Secrets
|
|
||||||
|
|
||||||
```bash
|
|
||||||
ansible-vault encrypt group_vars/vault.yml --vault-password-file ~/.vault_pass.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
### Edit Encrypted Vault
|
|
||||||
|
|
||||||
```bash
|
|
||||||
ansible-vault edit group_vars/vault.yml --vault-password-file ~/.vault_pass.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
### Run with Vault
|
|
||||||
|
|
||||||
```bash
|
|
||||||
ansible-playbook playbooks/provision.yml \
|
|
||||||
--vault-password-file ~/.vault_pass.txt \
|
|
||||||
-i inventory/hosts.ini
|
|
||||||
```
|
|
||||||
|
|
||||||
## Linting
|
|
||||||
|
|
||||||
### Run ansible-lint
|
|
||||||
|
|
||||||
```bash
|
|
||||||
ansible-lint playbooks/ roles/
|
|
||||||
```
|
|
||||||
|
|
||||||
### Fix Issues
|
|
||||||
|
|
||||||
```bash
|
|
||||||
ansible-lint playbooks/ roles/ --fix
|
|
||||||
```
|
|
||||||
|
|
||||||
## Testing with Molecule
|
|
||||||
|
|
||||||
### Run All Tests
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd enterprise-infra-simulator
|
|
||||||
molecule test
|
|
||||||
```
|
|
||||||
|
|
||||||
### Run Specific Scenarios
|
|
||||||
|
|
||||||
```bash
|
|
||||||
molecule converge # Apply roles
|
|
||||||
molecule verify # Verify results
|
|
||||||
molecule destroy # Cleanup
|
|
||||||
```
|
|
||||||
|
|
||||||
## Role Documentation
|
|
||||||
|
|
||||||
Each role has a detailed README:
|
|
||||||
|
|
||||||
- [base_provision/README.md](roles/base_provision/README.md)
|
|
||||||
- [patching/README.md](roles/patching/README.md)
|
|
||||||
- [hardening/README.md](roles/hardening/README.md)
|
|
||||||
- [decommission/README.md](roles/decommission/README.md)
|
|
||||||
|
|
||||||
## Group Variables
|
|
||||||
|
|
||||||
- `group_vars/all.yml` - Global configuration
|
|
||||||
- `group_vars/webservers.yml` - Web server config
|
|
||||||
- `group_vars/databases.yml` - Database config
|
|
||||||
- `group_vars/loadbalancers.yml` - Load balancer config
|
|
||||||
- `group_vars/monitoring.yml` - Monitoring config
|
|
||||||
- `group_vars/vault.yml` - Encrypted secrets
|
|
||||||
|
|
||||||
## Tags
|
|
||||||
|
|
||||||
Use tags to run specific parts:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
ansible-playbook playbooks/provision.yml --tags base,provision
|
|
||||||
ansible-playbook playbooks/hardening.yml --tags security,hardening
|
|
||||||
```
|
|
||||||
|
|
||||||
## Error Handling
|
|
||||||
|
|
||||||
- Proper use of `failed_when` for critical failures
|
|
||||||
- Strategic use of `ignore_errors` only for optional operations
|
|
||||||
- Comprehensive assertion checks for prerequisites
|
|
||||||
|
|
||||||
## Security
|
|
||||||
|
|
||||||
- Passwords stored in encrypted Vault
|
|
||||||
- SSH key-based authentication
|
|
||||||
- Firewall configured with deny-by-default policy
|
|
||||||
- SELinux/AppArmor support
|
|
||||||
- CIS hardening levels 1-2
|
|
||||||
|
|
||||||
## Monitoring
|
|
||||||
|
|
||||||
- Health checks included in playbooks
|
|
||||||
- Service verification after operations
|
|
||||||
- Detailed logging to `/var/log/`
|
|
||||||
- Report generation for audit trails
|
|
||||||
|
|
||||||
## Support
|
|
||||||
|
|
||||||
For issues or questions about the roles, see individual role README files.
|
|
||||||
@@ -1,231 +0,0 @@
|
|||||||
# Vault Configuration Guide
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
This project uses Ansible Vault to securely manage sensitive data such as passwords, API keys, and credentials.
|
|
||||||
|
|
||||||
## Setup
|
|
||||||
|
|
||||||
### 1. Create Vault Password File
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Generate a secure password
|
|
||||||
openssl rand -base64 32 > ~/.vault_pass.txt
|
|
||||||
|
|
||||||
# Secure the file
|
|
||||||
chmod 600 ~/.vault_pass.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Add to .bashrc or .zshrc
|
|
||||||
|
|
||||||
```bash
|
|
||||||
export ANSIBLE_VAULT_PASSWORD_FILE="$HOME/.vault_pass.txt"
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Configure ansible.cfg
|
|
||||||
|
|
||||||
```ini
|
|
||||||
[defaults]
|
|
||||||
vault_password_file = ~/.vault_pass.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
## Vault Files
|
|
||||||
|
|
||||||
### group_vars/vault.yml
|
|
||||||
|
|
||||||
This file contains all encrypted secrets:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
---
|
|
||||||
# Vault variables for sensitive data
|
|
||||||
vault_admin_password: "<secure_password>"
|
|
||||||
vault_db_password: "<db_password>"
|
|
||||||
vault_grafana_password: "<grafana_password>"
|
|
||||||
vault_ssh_key_passphrase: "<ssh_passphrase>"
|
|
||||||
```
|
|
||||||
|
|
||||||
## Encrypting Secrets
|
|
||||||
|
|
||||||
### First Time - Encrypt vault.yml
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Edit the file first with plain text secrets
|
|
||||||
ansible-vault encrypt group_vars/vault.yml
|
|
||||||
|
|
||||||
# You'll be prompted for vault password
|
|
||||||
# Then the file will be automatically encrypted
|
|
||||||
```
|
|
||||||
|
|
||||||
### Edit Encrypted Vault
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Edit the vault file (will decrypt, open editor, re-encrypt)
|
|
||||||
ansible-vault edit group_vars/vault.yml
|
|
||||||
|
|
||||||
# Or view without editing
|
|
||||||
ansible-vault view group_vars/vault.yml
|
|
||||||
```
|
|
||||||
|
|
||||||
### Encrypt New Files
|
|
||||||
|
|
||||||
```bash
|
|
||||||
ansible-vault encrypt group_vars/new_secrets.yml
|
|
||||||
```
|
|
||||||
|
|
||||||
## Using Vault in Playbooks
|
|
||||||
|
|
||||||
### Import Vault Variables
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
---
|
|
||||||
- name: My Playbook
|
|
||||||
hosts: all
|
|
||||||
vars_files:
|
|
||||||
- ../group_vars/vault.yml
|
|
||||||
|
|
||||||
tasks:
|
|
||||||
- name: Use vault password
|
|
||||||
user:
|
|
||||||
name: admin
|
|
||||||
password: "{{ vault_admin_password | password_hash('sha512') }}"
|
|
||||||
```
|
|
||||||
|
|
||||||
## Running Playbooks with Vault
|
|
||||||
|
|
||||||
### Method 1: Using .vault_pass.txt
|
|
||||||
|
|
||||||
```bash
|
|
||||||
export ANSIBLE_VAULT_PASSWORD_FILE="$HOME/.vault_pass.txt"
|
|
||||||
ansible-playbook playbooks/provision.yml -i inventory/hosts.ini
|
|
||||||
```
|
|
||||||
|
|
||||||
### Method 2: Inline Flag
|
|
||||||
|
|
||||||
```bash
|
|
||||||
ansible-playbook playbooks/provision.yml \
|
|
||||||
--vault-password-file ~/.vault_pass.txt \
|
|
||||||
-i inventory/hosts.ini
|
|
||||||
```
|
|
||||||
|
|
||||||
### Method 3: Prompt for Password
|
|
||||||
|
|
||||||
```bash
|
|
||||||
ansible-playbook playbooks/provision.yml \
|
|
||||||
--ask-vault-pass \
|
|
||||||
-i inventory/hosts.ini
|
|
||||||
|
|
||||||
# You'll be prompted to enter vault password
|
|
||||||
```
|
|
||||||
|
|
||||||
## Viewing Vault Contents
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# View encrypted file
|
|
||||||
ansible-vault view group_vars/vault.yml
|
|
||||||
|
|
||||||
# View specific variable
|
|
||||||
ansible localhost \
|
|
||||||
-e "@group_vars/vault.yml" \
|
|
||||||
-e "ansible_connection=local" \
|
|
||||||
-i localhost, \
|
|
||||||
-m debug \
|
|
||||||
-a "var=vault_admin_password"
|
|
||||||
```
|
|
||||||
|
|
||||||
## Vault Best Practices
|
|
||||||
|
|
||||||
### ✅ DO
|
|
||||||
|
|
||||||
- Store all passwords in vault.yml
|
|
||||||
- Use strong vault passwords (32+ characters)
|
|
||||||
- Keep vault password file secure (chmod 600)
|
|
||||||
- Rotate vault passwords periodically
|
|
||||||
- Version control only encrypted files
|
|
||||||
- Document what each variable contains
|
|
||||||
|
|
||||||
### ❌ DON'T
|
|
||||||
|
|
||||||
- Commit unencrypted vault.yml to git
|
|
||||||
- Share vault password file
|
|
||||||
- Hardcode secrets in playbooks
|
|
||||||
- Use weak passwords
|
|
||||||
- Check plaintext secrets into version control
|
|
||||||
|
|
||||||
## Rekeying Vault
|
|
||||||
|
|
||||||
To change the vault password:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
ansible-vault rekey group_vars/vault.yml
|
|
||||||
|
|
||||||
# You'll be prompted for:
|
|
||||||
# 1. Current vault password
|
|
||||||
# 2. New vault password
|
|
||||||
# 3. Confirm new vault password
|
|
||||||
```
|
|
||||||
|
|
||||||
## CI/CD Integration
|
|
||||||
|
|
||||||
For CI/CD pipelines (GitHub Actions, GitLab CI, etc.):
|
|
||||||
|
|
||||||
### GitHub Actions Example
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
- name: Run Ansible Playbook
|
|
||||||
env:
|
|
||||||
ANSIBLE_VAULT_PASSWORD: ${{ secrets.ANSIBLE_VAULT_PASSWORD }}
|
|
||||||
run: |
|
|
||||||
(umask 077 && echo "$ANSIBLE_VAULT_PASSWORD" > ~/.vault_pass.txt)
|
|
||||||
ansible-playbook playbooks/provision.yml
|
|
||||||
```
|
|
||||||
|
|
||||||
### GitLab CI Example
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
deploy:
|
|
||||||
script:
|
|
||||||
- (umask 077 && echo "$ANSIBLE_VAULT_PASSWORD" > ~/.vault_pass.txt)
|
|
||||||
- ansible-playbook playbooks/provision.yml
|
|
||||||
# ANSIBLE_VAULT_PASSWORD is a masked CI/CD variable
|
|
||||||
# (Settings > CI/CD > Variables); it is exported to the job automatically
|
|
||||||
```
|
|
||||||
|
|
||||||
## Troubleshooting
|
|
||||||
|
|
||||||
### "Decryption failed"
|
|
||||||
|
|
||||||
- Wrong vault password
|
|
||||||
- File is corrupted
|
|
||||||
- Check file permissions
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Check if file is encrypted
|
|
||||||
head -n 1 group_vars/vault.yml
|
|
||||||
|
|
||||||
# Should show: $ANSIBLE_VAULT;1.1;AES256
|
|
||||||
```
|
|
||||||
|
|
||||||
### "vault password not found"
|
|
||||||
|
|
||||||
- ANSIBLE_VAULT_PASSWORD_FILE not set
|
|
||||||
- Path is incorrect
|
|
||||||
- File permissions wrong (needs 600)
|
|
||||||
|
|
||||||
### "Secrets leaked"
|
|
||||||
|
|
||||||
If secrets are accidentally committed:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Rotate every exposed secret first, then remove the file from git history
|
|
||||||
git filter-branch --force --index-filter \
|
|
||||||
'git rm --cached --ignore-unmatch group_vars/vault.yml' \
|
|
||||||
--prune-empty --tag-name-filter cat -- --all
|
|
||||||
|
|
||||||
# Force push (careful!)
|
|
||||||
git push origin --force --all
|
|
||||||
```
|
|
||||||
|
|
||||||
## Additional Resources
|
|
||||||
|
|
||||||
- [Ansible Vault Documentation](https://docs.ansible.com/ansible/latest/vault_guide/)
|
|
||||||
- [Vault Best Practices](https://docs.ansible.com/ansible/latest/vault_guide/vault_managing_passwords.html)
|
|
||||||
@@ -1,30 +0,0 @@
|
|||||||
# Enterprise Infrastructure Simulator Architecture
|
|
||||||
|
|
||||||
## Components
|
|
||||||
|
|
||||||
- Operator interface: `make` targets and direct Ansible commands.
|
|
||||||
- Inventory: static host groups in `inventory/hosts.ini`.
|
|
||||||
- Automation: lifecycle playbooks in `playbooks/`.
|
|
||||||
- Simulation scripts: controlled failure and scaling events in `scripts/`.
|
|
||||||
- Evidence: logs, reports, scenario notes, and examples.
|
|
||||||
|
|
||||||
## Data Flow
|
|
||||||
|
|
||||||
```
|
|
||||||
Operator
|
|
||||||
-> Make target or shell script
|
|
||||||
-> Ansible inventory
|
|
||||||
-> lifecycle playbook
|
|
||||||
-> managed Linux node
|
|
||||||
-> log/report artifact
|
|
||||||
```
|
|
||||||
|
|
||||||
Failure drills follow a parallel flow:
|
|
||||||
|
|
||||||
```
|
|
||||||
Operator -> simulate_failure.sh -> target node/service -> health check -> patch/hardening playbook -> evidence
|
|
||||||
```
|
|
||||||
|
|
||||||
## Notes
|
|
||||||
|
|
||||||
The project favors explicit playbooks over hidden orchestration so the operational intent is visible during review. In a production implementation, the same workflows would typically run from a CI runner or automation controller with credentials supplied by a secret manager.
|
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
2026-04-29 02:13:41 - Starting failure simulation: service 30 web
|
|
||||||
2026-04-29 02:13:41 - Simulating service failures on containers: web
|
|
||||||
2026-04-29 02:13:42 - Stopping services in container enterprise-web-1
|
|
||||||
2026-04-29 02:13:44 - Health probe failed: http://web01/health returned 503
|
|
||||||
2026-04-29 02:14:12 - Cleaning up failure simulation
|
|
||||||
2026-04-29 02:14:13 - Restarted nginx in enterprise-web-1
|
|
||||||
2026-04-29 02:14:18 - Health probe recovered: http://web01/health returned 200
|
|
||||||
2026-04-29 02:14:18 - Failure simulation completed successfully
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
PLAY [Apply Security Patches and Updates] **************************************
|
|
||||||
|
|
||||||
TASK [Update package cache] *****************************************************
|
|
||||||
changed: [web01]
|
|
||||||
changed: [db01]
|
|
||||||
ok: [lb01]
|
|
||||||
|
|
||||||
TASK [Check for available updates] **********************************************
|
|
||||||
ok: [web01] => {"stdout": "9"}
|
|
||||||
ok: [db01] => {"stdout": "4"}
|
|
||||||
ok: [lb01] => {"stdout": "0"}
|
|
||||||
|
|
||||||
TASK [Apply security updates only] **********************************************
|
|
||||||
changed: [web01]
|
|
||||||
changed: [db01]
|
|
||||||
ok: [lb01]
|
|
||||||
|
|
||||||
TASK [Verify critical services] *************************************************
|
|
||||||
ok: [web01] => (item=systemd-journald)
|
|
||||||
ok: [web01] => (item=cron)
|
|
||||||
ok: [db01] => (item=systemd-journald)
|
|
||||||
ok: [lb01] => (item=cron)
|
|
||||||
|
|
||||||
PLAY RECAP *********************************************************************
|
|
||||||
web01 : ok=19 changed=6 unreachable=0 failed=0 skipped=2 rescued=0 ignored=1
|
|
||||||
db01 : ok=18 changed=5 unreachable=0 failed=0 skipped=2 rescued=0 ignored=1
|
|
||||||
lb01 : ok=15 changed=1 unreachable=0 failed=0 skipped=4 rescued=0 ignored=0
|
|
||||||
|
|
||||||
Patch report
|
|
||||||
Status: SUCCESS
|
|
||||||
Window: 02:00-04:00 UTC
|
|
||||||
Reboot required: false
|
|
||||||
Notification: infra-team@example.com
|
|
||||||
@@ -1,20 +0,0 @@
|
|||||||
---
|
|
||||||
# Group variables for all hosts
|
|
||||||
|
|
||||||
# SSH Configuration
|
|
||||||
ssh_config:
|
|
||||||
port: 22
|
|
||||||
max_auth_tries: 3
|
|
||||||
alive_interval: 300
|
|
||||||
|
|
||||||
# Firewall defaults
|
|
||||||
firewall_enabled: true
|
|
||||||
firewall_default_policy: deny
|
|
||||||
|
|
||||||
# Patching defaults
|
|
||||||
patch_enabled: true
|
|
||||||
enforce_patch_window: true
|
|
||||||
|
|
||||||
# Services monitoring
|
|
||||||
enable_monitoring: false
|
|
||||||
enable_health_checks: true
|
|
||||||
@@ -1,9 +0,0 @@
|
|||||||
---
|
|
||||||
# Database servers group configuration
|
|
||||||
db_type: postgresql
|
|
||||||
db_port: 5432
|
|
||||||
db_backup_enabled: true
|
|
||||||
db_backup_path: /var/backups/database
|
|
||||||
|
|
||||||
# Database user (use vault for production)
|
|
||||||
db_admin_user: postgres
|
|
||||||
@@ -1,10 +0,0 @@
|
|||||||
---
|
|
||||||
# Load balancers group configuration
|
|
||||||
lb_type: haproxy
|
|
||||||
lb_port: 443
|
|
||||||
lb_stats_port: 8404
|
|
||||||
lb_stats_enabled: true
|
|
||||||
|
|
||||||
# Frontend configuration
|
|
||||||
frontend_host: "0.0.0.0"
|
|
||||||
frontend_port: 80
|
|
||||||
@@ -1,10 +0,0 @@
|
|||||||
---
|
|
||||||
# Monitoring servers group configuration
|
|
||||||
monitoring_type: prometheus
|
|
||||||
monitoring_port: 9090
|
|
||||||
monitoring_retention: 30d
|
|
||||||
monitoring_scrape_interval: 15s
|
|
||||||
|
|
||||||
# Grafana configuration
|
|
||||||
grafana_port: 3000
|
|
||||||
grafana_admin_password: "{{ vault_grafana_password }}"
|
|
||||||
@@ -1,9 +0,0 @@
|
|||||||
---
|
|
||||||
# Vault variables for sensitive data
|
|
||||||
# NOTE: This file should be encrypted with: ansible-vault encrypt group_vars/vault.yml
|
|
||||||
# Run: ansible-playbook --ask-vault-pass playbooks/provision.yml
|
|
||||||
|
|
||||||
# Replace each placeholder with the real secret, then encrypt this file.
|
|
||||||
# Values must be literals: grafana_admin_password is itself defined as
|
|
||||||
# "{{ vault_grafana_password }}", so pointing back at it recurses forever.
|
|
||||||
vault_admin_password: "<secure_password>"
|
|
||||||
vault_db_password: "<db_password>"
|
|
||||||
vault_grafana_password: "<grafana_password>"
|
|
||||||
vault_ssh_key_passphrase: "<ssh_passphrase>"
|
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
---
|
|
||||||
# Webservers group configuration
|
|
||||||
webserver_type: nginx
|
|
||||||
http_port: 80
|
|
||||||
https_port: 443
|
|
||||||
health_check_path: /health
|
|
||||||
|
|
||||||
# Application configuration
|
|
||||||
app_name: "{{ group_names[0] | default('app') }}"
|
|
||||||
app_user: "{{ admin_user }}"
|
|
||||||
app_group: "{{ admin_user }}"
|
|
||||||
@@ -1,35 +0,0 @@
|
|||||||
[webservers]
|
|
||||||
web01 ansible_host=172.20.0.11 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
|
|
||||||
web02 ansible_host=172.20.0.12 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
|
|
||||||
web03 ansible_host=172.20.0.13 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
|
|
||||||
|
|
||||||
[databases]
|
|
||||||
db01 ansible_host=172.20.0.21 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
|
|
||||||
db02 ansible_host=172.20.0.22 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
|
|
||||||
|
|
||||||
[loadbalancers]
|
|
||||||
lb01 ansible_host=172.20.0.31 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
|
|
||||||
|
|
||||||
[monitoring]
|
|
||||||
mon01 ansible_host=172.20.0.41 ansible_user=root ansible_ssh_private_key_file=/root/.ssh/id_rsa
|
|
||||||
|
|
||||||
[all:vars]
|
|
||||||
ansible_python_interpreter=/usr/bin/python3
|
|
||||||
ansible_ssh_common_args='-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
|
|
||||||
ansible_connection=ssh
|
|
||||||
|
|
||||||
[webservers:vars]
|
|
||||||
node_type=web
|
|
||||||
environment=production
|
|
||||||
|
|
||||||
[databases:vars]
|
|
||||||
node_type=database
|
|
||||||
environment=production
|
|
||||||
|
|
||||||
[loadbalancers:vars]
|
|
||||||
node_type=loadbalancer
|
|
||||||
environment=production
|
|
||||||
|
|
||||||
[monitoring:vars]
|
|
||||||
node_type=monitoring
|
|
||||||
environment=production
|
|
||||||
@@ -1,24 +0,0 @@
|
|||||||
---
|
|
||||||
# Molecule converge playbook - applies roles to test them
|
|
||||||
|
|
||||||
- name: Converge
|
|
||||||
hosts: all
|
|
||||||
become: true
|
|
||||||
gather_facts: true
|
|
||||||
|
|
||||||
pre_tasks:
|
|
||||||
- name: Update apt cache
|
|
||||||
apt:
|
|
||||||
update_cache: yes
|
|
||||||
cache_valid_time: 3600
|
|
||||||
when: ansible_os_family == "Debian"
|
|
||||||
|
|
||||||
roles:
|
|
||||||
- role: base_provision
|
|
||||||
- role: hardening
|
|
||||||
- role: patching
|
|
||||||
|
|
||||||
post_tasks:
|
|
||||||
- name: Print Ansible facts
|
|
||||||
debug:
|
|
||||||
var: ansible_facts
|
|
||||||
@@ -1,15 +0,0 @@
|
|||||||
---
|
|
||||||
# Molecule destroy playbook
|
|
||||||
|
|
||||||
- name: Destroy
|
|
||||||
hosts: localhost
|
|
||||||
gather_facts: false
|
|
||||||
tasks:
|
|
||||||
- name: Destroy molecule containers
|
|
||||||
docker_container:
|
|
||||||
name: "{{ item }}"
|
|
||||||
state: absent
|
|
||||||
force_kill: yes
|
|
||||||
loop: "{{ molecule_yml.platforms | map(attribute='name') | list }}"
|
|
||||||
register: destroy_result
|
|
||||||
ignore_errors: yes
|
|
||||||
@@ -1,31 +0,0 @@
|
|||||||
---
|
|
||||||
# Molecule configuration for Ansible role testing
|
|
||||||
|
|
||||||
driver:
|
|
||||||
name: docker
|
|
||||||
|
|
||||||
platforms:
|
|
||||||
- name: ubuntu-22.04
|
|
||||||
image: geerlingguy/docker-ubuntu2204-ansible:latest
|
|
||||||
pre_build_image: true
|
|
||||||
privileged: true
|
|
||||||
volumes:
|
|
||||||
- /sys/fs/cgroup:/sys/fs/cgroup:rw
|
|
||||||
|
|
||||||
provisioner:
|
|
||||||
name: ansible
|
|
||||||
config_options:
|
|
||||||
defaults:
|
|
||||||
gathering: smart
|
|
||||||
fact_caching: jsonfile
|
|
||||||
fact_caching_connection: /tmp/ansible_facts
|
|
||||||
fact_caching_timeout: 3600
|
|
||||||
deprecation_warnings: false
|
|
||||||
|
|
||||||
verifier:
|
|
||||||
name: ansible
|
|
||||||
directory: molecule/default/tests
|
|
||||||
|
|
||||||
lint: |
|
|
||||||
yamllint .
|
|
||||||
ansible-lint
|
|
||||||
@@ -1,32 +0,0 @@
|
|||||||
---
|
|
||||||
# Molecule verify playbook - runs tests to verify roles
|
|
||||||
|
|
||||||
- name: Verify
|
|
||||||
hosts: all
|
|
||||||
gather_facts: false
|
|
||||||
tasks:
|
|
||||||
- name: Check if base OS packages are installed
|
|
||||||
shell: dpkg -l | grep -E '(curl|wget|vim|htop)'
|
|
||||||
register: package_check
|
|
||||||
failed_when: package_check.rc not in [0, 1]
|
|
||||||
|
|
||||||
- name: Check SSH configuration
|
|
||||||
stat:
|
|
||||||
path: /etc/ssh/sshd_config
|
|
||||||
register: ssh_config_stat
|
|
||||||
failed_when: not ssh_config_stat.stat.exists
|
|
||||||
|
|
||||||
- name: Check firewall status
|
|
||||||
shell: ufw status | grep -q active
|
|
||||||
register: firewall_check
|
|
||||||
failed_when: false
|
|
||||||
|
|
||||||
- name: Verify admin user exists
|
|
||||||
getent:
|
|
||||||
database: passwd
|
|
||||||
key: infra-admin
|
|
||||||
failed_when: false
|
|
||||||
|
|
||||||
- name: Print verification results
|
|
||||||
debug:
|
|
||||||
msg: "Role verification completed"
|
|
||||||
@@ -1,36 +0,0 @@
|
|||||||
---
|
|
||||||
- name: Decommission Enterprise Infrastructure Nodes
|
|
||||||
hosts: all
|
|
||||||
become: true
|
|
||||||
gather_facts: true
|
|
||||||
vars_files:
|
|
||||||
- vars/vault.yml
|
|
||||||
|
|
||||||
pre_tasks:
|
|
||||||
- name: Confirm decommissioning
|
|
||||||
ansible.builtin.pause:
|
|
||||||
prompt: |
|
|
||||||
WARNING: This will decommission {{ inventory_hostname }}
|
|
||||||
Backup Data: {{ backup_data }}
|
|
||||||
Export Config: {{ export_config }}
|
|
||||||
|
|
||||||
Press ENTER to continue or Ctrl+C to cancel
|
|
||||||
|
|
||||||
- name: Display decommissioning information
|
|
||||||
ansible.builtin.debug:
|
|
||||||
msg: |
|
|
||||||
Decommissioning {{ inventory_hostname }}
|
|
||||||
Auto Shutdown: {{ auto_shutdown }}
|
|
||||||
Backup Enabled: {{ backup_data }}
|
|
||||||
|
|
||||||
roles:
|
|
||||||
- role: decommission
|
|
||||||
tags: ['decommission', 'cleanup']
|
|
||||||
|
|
||||||
post_tasks:
|
|
||||||
- name: Display decommissioning summary
|
|
||||||
ansible.builtin.debug:
|
|
||||||
msg: |
|
|
||||||
Decommissioning completed!
|
|
||||||
Host: {{ inventory_hostname }}
|
|
||||||
Backup Location: /var/backups/decommission-{{ ansible_date_time.iso8601 }}/
|
|
||||||
@@ -1,126 +0,0 @@
|
|||||||
---
|
|
||||||
- name: Harden Enterprise Infrastructure Nodes
|
|
||||||
hosts: all
|
|
||||||
become: true
|
|
||||||
gather_facts: true
|
|
||||||
vars_files:
|
|
||||||
- vars/vault.yml
|
|
||||||
|
|
||||||
pre_tasks:
|
|
||||||
- name: Validate hardening prerequisites
|
|
||||||
ansible.builtin.assert:
|
|
||||||
that:
|
|
||||||
- ansible_os_family == "Debian"
|
|
||||||
- cis_level in [1, 2]
|
|
||||||
fail_msg: "Invalid hardening configuration"
|
|
||||||
|
|
||||||
- name: Display hardening information
|
|
||||||
ansible.builtin.debug:
|
|
||||||
msg: |
|
|
||||||
Hardening {{ inventory_hostname }}
|
|
||||||
CIS Level: {{ cis_level }}
|
|
||||||
Disable Root Login: {{ disable_root_login }}
|
|
||||||
|
|
||||||
roles:
|
|
||||||
- role: hardening
|
|
||||||
tags: ['hardening', 'security']
|
|
||||||
|
|
||||||
post_tasks:
|
|
||||||
- name: Display hardening summary
|
|
||||||
ansible.builtin.debug:
|
|
||||||
msg: |
|
|
||||||
Hardening completed successfully!
|
|
||||||
Host: {{ inventory_hostname }}
|
|
||||||
|
|
||||||
when: ansible_os_family == "Debian"
|
|
||||||
|
|
||||||
- name: Configure auditd
|
|
||||||
when: auditd_enabled
|
|
||||||
block:
|
|
||||||
- name: Install auditd
|
|
||||||
ansible.builtin.apt:
|
|
||||||
name: auditd
|
|
||||||
state: present
|
|
||||||
when: ansible_os_family == "Debian"
|
|
||||||
|
|
||||||
- name: Configure audit rules
|
|
||||||
ansible.builtin.template:
|
|
||||||
src: templates/audit.rules.j2
|
|
||||||
dest: /etc/audit/rules.d/hardening.rules
|
|
||||||
mode: '0644'
|
|
||||||
|
|
||||||
- name: Enable auditd service
|
|
||||||
ansible.builtin.service:
|
|
||||||
name: auditd
|
|
||||||
state: started
|
|
||||||
enabled: true
|
|
||||||
|
|
||||||
- name: Configure AppArmor
|
|
||||||
when: apparmor_enabled and ansible_os_family == "Debian"
|
|
||||||
block:
|
|
||||||
- name: Install apparmor
|
|
||||||
ansible.builtin.apt:
|
|
||||||
name: apparmor
|
|
||||||
state: present
|
|
||||||
when: ansible_os_family == "Debian"
|
|
||||||
|
|
||||||
- name: Enable apparmor service
|
|
||||||
ansible.builtin.service:
|
|
||||||
name: apparmor
|
|
||||||
state: started
|
|
||||||
enabled: true
|
|
||||||
|
|
||||||
- name: Configure sysctl hardening
|
|
||||||
ansible.posix.sysctl:
|
|
||||||
name: "{{ item.key }}"
|
|
||||||
value: "{{ item.value }}"
|
|
||||||
state: present
|
|
||||||
reload: true
|
|
||||||
loop:
|
|
||||||
- { key: 'net.ipv4.ip_forward', value: '0' }
|
|
||||||
- { key: 'net.ipv4.conf.all.send_redirects', value: '0' }
|
|
||||||
- { key: 'net.ipv4.conf.default.send_redirects', value: '0' }
|
|
||||||
- { key: 'net.ipv4.tcp_syncookies', value: '1' }
|
|
||||||
- { key: 'net.ipv4.icmp_echo_ignore_broadcasts', value: '1' }
|
|
||||||
|
|
||||||
# Account databases stay world-readable; the shadow files hold password
|
|
||||||
# hashes and must not be (0640, group shadow is the Debian default).
|
|
||||||
- name: Set secure file permissions
|
|
||||||
ansible.builtin.file:
|
|
||||||
path: "{{ item.path }}"
|
|
||||||
mode: "{{ item.mode }}"
|
|
||||||
owner: root
|
|
||||||
group: "{{ item.group }}"
|
|
||||||
loop:
|
|
||||||
- { path: /etc/passwd, mode: '0644', group: root }
|
|
||||||
- { path: /etc/group, mode: '0644', group: root }
|
|
||||||
- { path: /etc/shadow, mode: '0640', group: shadow }
|
|
||||||
- { path: /etc/gshadow, mode: '0640', group: shadow }
|
|
||||||
|
|
||||||
- name: Lock inactive user accounts
|
|
||||||
ansible.builtin.command: usermod -L "{{ item }}"
|
|
||||||
loop: "{{ inactive_users | default([]) }}"
|
|
||||||
changed_when: false
|
|
||||||
|
|
||||||
- name: Configure password policies
|
|
||||||
community.general.pam_limits:
|
|
||||||
domain: '*'
|
|
||||||
limit_type: hard
|
|
||||||
limit_item: nofile
|
|
||||||
value: 1024
|
|
||||||
|
|
||||||
- name: Generate hardening report
|
|
||||||
ansible.builtin.template:
|
|
||||||
src: templates/hardening_report.j2
|
|
||||||
dest: "/var/log/hardening_report_{{ ansible_date_time.iso8601 }}.log"
|
|
||||||
mode: '0644'
|
|
||||||
|
|
||||||
handlers:
|
|
||||||
- name: restart sshd
|
|
||||||
ansible.builtin.service:
|
|
||||||
name: ssh
|
|
||||||
state: restarted
|
|
||||||
|
|
||||||
- name: restart auditd
|
|
||||||
ansible.builtin.service:
|
|
||||||
name: auditd
|
|
||||||
state: restarted
|
|
||||||
when: auditd_enabled
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
---
|
|
||||||
- name: Apply Security Patches and Updates
|
|
||||||
hosts: all
|
|
||||||
become: true
|
|
||||||
gather_facts: true
|
|
||||||
vars_files:
|
|
||||||
- vars/vault.yml
|
|
||||||
|
|
||||||
pre_tasks:
|
|
||||||
- name: Validate patch prerequisites
|
|
||||||
ansible.builtin.assert:
|
|
||||||
that:
|
|
||||||
- ansible_os_family == "Debian"
|
|
||||||
fail_msg: "Patching supported only on Debian-based systems"
|
|
||||||
|
|
||||||
- name: Display patch information
|
|
||||||
ansible.builtin.debug:
|
|
||||||
msg: |
|
|
||||||
Patching {{ inventory_hostname }}
|
|
||||||
Patch Window: {{ patch_window_start }} - {{ patch_window_end }}
|
|
||||||
Security Only: {{ patch_security_only }}
|
|
||||||
|
|
||||||
roles:
|
|
||||||
- role: patching
|
|
||||||
tags: ['patch', 'updates']
|
|
||||||
|
|
||||||
post_tasks:
|
|
||||||
- name: Display patching summary
|
|
||||||
ansible.builtin.debug:
|
|
||||||
msg: |
|
|
||||||
Patching completed!
|
|
||||||
Host: {{ inventory_hostname }}
|
|
||||||
Reboot Required: {{ reboot_required | default(false) }}
|
|
||||||
@@ -1,35 +0,0 @@
|
|||||||
---
|
|
||||||
- name: Provision Enterprise Infrastructure Nodes
|
|
||||||
hosts: all
|
|
||||||
become: true
|
|
||||||
gather_facts: true
|
|
||||||
vars_files:
|
|
||||||
- vars/vault.yml
|
|
||||||
|
|
||||||
pre_tasks:
|
|
||||||
- name: Validate Ansible version
|
|
||||||
ansible.builtin.assert:
|
|
||||||
that:
|
|
||||||
# Compare the full version; a separate minor >= 9 test would reject 3.0.
|
|
||||||
- ansible_version.full is version('2.9', '>=')
|
|
||||||
fail_msg: "Ansible 2.9+ is required"
|
|
||||||
|
|
||||||
- name: Display provisioning information
|
|
||||||
ansible.builtin.debug:
|
|
||||||
msg: |
|
|
||||||
Provisioning {{ inventory_hostname }}
|
|
||||||
OS: {{ ansible_os_family }}
|
|
||||||
Python: {{ ansible_python_version }}
|
|
||||||
|
|
||||||
roles:
|
|
||||||
- role: base_provision
|
|
||||||
tags: ['provision', 'base']
|
|
||||||
|
|
||||||
post_tasks:
|
|
||||||
# Prints a per-host summary from gathered facts (gather_facts is enabled
|
|
||||||
# on this play). The OS line uses the distribution facts.
|
|
||||||
- name: Generate provisioning summary
|
|
||||||
ansible.builtin.debug:
|
|
||||||
msg: |
|
|
||||||
Provisioning completed successfully!
|
|
||||||
Host: {{ inventory_hostname }}
|
|
||||||
IP: {{ ansible_default_ipv4.address }}
|
|
||||||
OS: {{ ansible_distribution }} {{ ansible_distribution_version }}
|
|
||||||
@@ -1,53 +0,0 @@
|
|||||||
# Base Provision Role
|
|
||||||
|
|
||||||
Provision basic infrastructure on enterprise nodes with security hardening.
|
|
||||||
|
|
||||||
## Features
|
|
||||||
|
|
||||||
- **Idempotent**: All tasks use proper idempotency markers (`changed_when`, `failed_when`)
|
|
||||||
- **Handlers**: SSH and fail2ban restarts use handlers instead of direct service calls
|
|
||||||
- **Variables**: All configuration in `defaults/main.yml` - no hardcoding
|
|
||||||
- **Validation**: Pre-flight checks for system requirements
|
|
||||||
- **Firewall**: UFW firewall configuration with configurable rules
|
|
||||||
- **SSH Security**: Root login disabled, password auth disabled, key-based auth only
|
|
||||||
|
|
||||||
## Role Variables
|
|
||||||
|
|
||||||
See `defaults/main.yml` for all available variables.
|
|
||||||
|
|
||||||
### Key Variables
|
|
||||||
|
|
||||||
- `node_timezone`: System timezone (default: UTC)
|
|
||||||
- `admin_user`: Admin username for infrastructure access
|
|
||||||
- `ssh_port`: SSH service port (default: 22)
|
|
||||||
- `base_packages`: List of base packages to install
|
|
||||||
- `firewall_enabled`: Enable UFW firewall (default: true)
|
|
||||||
- `firewall_allowed_tcp_ports`: Allowed TCP ports for firewall
|
|
||||||
|
|
||||||
## Vault Variables
|
|
||||||
|
|
||||||
Admin password should be stored in encrypted vault:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
# vars/vault.yml (encrypted)
|
|
||||||
admin_password: "{{ vault_admin_password }}"
|
|
||||||
```
|
|
||||||
|
|
||||||
## Usage
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
- role: base_provision
|
|
||||||
vars:
|
|
||||||
node_timezone: "Europe/Warsaw"
|
|
||||||
firewall_enabled: true
|
|
||||||
```
|
|
||||||
|
|
||||||
## Handlers
|
|
||||||
|
|
||||||
- `restart sshd`: Restarts SSH service (triggered by config changes)
|
|
||||||
- `restart fail2ban`: Restarts fail2ban service (triggered by config changes)
|
|
||||||
|
|
||||||
## Tags
|
|
||||||
|
|
||||||
- `provision`: All provisioning tasks
|
|
||||||
- `base`: Base provision role tasks
|
|
||||||
@@ -1,44 +0,0 @@
|
|||||||
---
|
|
||||||
# Base provisioning configuration
|
|
||||||
node_timezone: "UTC"
|
|
||||||
admin_user: "infra-admin"
|
|
||||||
ssh_port: 22
|
|
||||||
ssh_disabled_root_login: true
|
|
||||||
ssh_disable_password_auth: true
|
|
||||||
|
|
||||||
# Packages to install
|
|
||||||
base_packages:
|
|
||||||
- curl
|
|
||||||
- wget
|
|
||||||
- vim
|
|
||||||
- htop
|
|
||||||
- net-tools
|
|
||||||
- iptables
|
|
||||||
- fail2ban
|
|
||||||
- unattended-upgrades
|
|
||||||
|
|
||||||
# Firewall rules
|
|
||||||
firewall_enabled: true
|
|
||||||
firewall_default_policy: deny
|
|
||||||
firewall_allowed_tcp_ports:
|
|
||||||
- 22
|
|
||||||
- 80
|
|
||||||
- 443
|
|
||||||
|
|
||||||
# Application directories
|
|
||||||
app_directories:
|
|
||||||
- path: /opt/application
|
|
||||||
owner: "{{ admin_user }}"
|
|
||||||
group: "{{ admin_user }}"
|
|
||||||
mode: '0755'
|
|
||||||
- path: /var/log/application
|
|
||||||
owner: "{{ admin_user }}"
|
|
||||||
group: "{{ admin_user }}"
|
|
||||||
mode: '0755'
|
|
||||||
- path: /etc/application
|
|
||||||
owner: root
|
|
||||||
group: root
|
|
||||||
mode: '0755'
|
|
||||||
|
|
||||||
# Service verification
|
|
||||||
services_to_verify: []
|
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
---
|
|
||||||
- name: restart sshd
|
|
||||||
ansible.builtin.service:
|
|
||||||
name: sshd
|
|
||||||
state: restarted
|
|
||||||
|
|
||||||
- name: restart fail2ban
|
|
||||||
ansible.builtin.service:
|
|
||||||
name: fail2ban
|
|
||||||
state: restarted
|
|
||||||
enabled: true
|
|
||||||
@@ -1,156 +0,0 @@
|
|||||||
---
|
|
||||||
- name: Validate system requirements
|
|
||||||
ansible.builtin.assert:
|
|
||||||
that:
|
|
||||||
- ansible_os_family == "Debian"
|
|
||||||
- ansible_python_version is version('3.6', '>=')
|
|
||||||
fail_msg: "Unsupported system - requires Debian and Python 3.6+"
|
|
||||||
|
|
||||||
- name: Update package cache
|
|
||||||
ansible.builtin.apt:
|
|
||||||
update_cache: true
|
|
||||||
cache_valid_time: 3600
|
|
||||||
changed_when: false
|
|
||||||
|
|
||||||
- name: Install base packages
|
|
||||||
ansible.builtin.apt:
|
|
||||||
name: "{{ base_packages }}"
|
|
||||||
state: present
|
|
||||||
update_cache: true
|
|
||||||
|
|
||||||
- name: Check if admin user exists
|
|
||||||
ansible.builtin.getent:
|
|
||||||
database: passwd
|
|
||||||
key: "{{ admin_user }}"
|
|
||||||
register: admin_check
|
|
||||||
failed_when: false
|
|
||||||
changed_when: false
|
|
||||||
|
|
||||||
# Runs unconditionally: the user module is idempotent, and
|
|
||||||
# update_password: on_create keeps the freshly salted hash from
|
|
||||||
# rewriting an existing account's password on every run.
|
|
||||||
- name: Create admin user
|
|
||||||
ansible.builtin.user:
|
|
||||||
name: "{{ admin_user }}"
|
|
||||||
groups: sudo
|
|
||||||
append: true
|
|
||||||
create_home: true
|
|
||||||
shell: /bin/bash
|
|
||||||
password: "{{ admin_password | password_hash('sha512') }}"
|
|
||||||
update_password: on_create
|
|
||||||
no_log: true
|
|
||||||
|
|
||||||
- name: Configure timezone
|
|
||||||
community.general.timezone:
|
|
||||||
name: "{{ node_timezone }}"
|
|
||||||
|
|
||||||
- name: Configure SSH security
|
|
||||||
block:
|
|
||||||
- name: Disable root SSH login
|
|
||||||
ansible.builtin.lineinfile:
|
|
||||||
path: /etc/ssh/sshd_config
|
|
||||||
regexp: '^PermitRootLogin'
|
|
||||||
line: 'PermitRootLogin no'
|
|
||||||
state: present
|
|
||||||
when: ssh_disabled_root_login
|
|
||||||
notify: restart sshd
|
|
||||||
|
|
||||||
- name: Set SSH port
|
|
||||||
ansible.builtin.lineinfile:
|
|
||||||
path: /etc/ssh/sshd_config
|
|
||||||
regexp: '^Port'
|
|
||||||
line: "Port {{ ssh_port }}"
|
|
||||||
state: present
|
|
||||||
notify: restart sshd
|
|
||||||
|
|
||||||
- name: Disable password authentication
|
|
||||||
ansible.builtin.lineinfile:
|
|
||||||
path: /etc/ssh/sshd_config
|
|
||||||
regexp: '^PasswordAuthentication'
|
|
||||||
line: 'PasswordAuthentication no'
|
|
||||||
state: present
|
|
||||||
when: ssh_disable_password_auth
|
|
||||||
notify: restart sshd
|
|
||||||
|
|
||||||
# Allow rules are added before UFW is switched on so the default-deny
|
|
||||||
# policy never takes effect without an SSH rule in place.
|
|
||||||
- name: Configure firewall
|
|
||||||
block:
|
|
||||||
- name: Allow SSH access
|
|
||||||
community.general.ufw:
|
|
||||||
rule: allow
|
|
||||||
port: "{{ ssh_port }}"
|
|
||||||
proto: tcp
|
|
||||||
when: firewall_enabled
|
|
||||||
|
|
||||||
- name: Allow HTTP/HTTPS
|
|
||||||
community.general.ufw:
|
|
||||||
rule: allow
|
|
||||||
port: "{{ item }}"
|
|
||||||
proto: tcp
|
|
||||||
loop: "{{ firewall_allowed_tcp_ports }}"
|
|
||||||
when: firewall_enabled and item not in [ssh_port]
|
|
||||||
|
|
||||||
- name: Enable UFW firewall
|
|
||||||
community.general.ufw:
|
|
||||||
state: enabled
|
|
||||||
policy: "{{ firewall_default_policy }}"
|
|
||||||
when: firewall_enabled
|
|
||||||
|
|
||||||
- name: Configure fail2ban
|
|
||||||
ansible.builtin.template:
|
|
||||||
src: jail.local.j2
|
|
||||||
dest: /etc/fail2ban/jail.local
|
|
||||||
backup: true
|
|
||||||
mode: '0644'
|
|
||||||
notify: restart fail2ban
|
|
||||||
|
|
||||||
- name: Enable unattended upgrades
|
|
||||||
ansible.builtin.lineinfile:
|
|
||||||
path: /etc/apt/apt.conf.d/20auto-upgrades
|
|
||||||
regexp: '^APT::Periodic::Unattended-Upgrade'
|
|
||||||
line: 'APT::Periodic::Unattended-Upgrade "1";'
|
|
||||||
state: present
|
|
||||||
|
|
||||||
- name: Create application directories
|
|
||||||
ansible.builtin.file:
|
|
||||||
path: "{{ item.path }}"
|
|
||||||
state: directory
|
|
||||||
owner: "{{ item.owner }}"
|
|
||||||
group: "{{ item.group }}"
|
|
||||||
mode: "{{ item.mode }}"
|
|
||||||
loop: "{{ app_directories }}"
|
|
||||||
|
|
||||||
- name: Deploy monitoring agent
|
|
||||||
ansible.builtin.include_role:
|
|
||||||
name: monitoring_agent
|
|
||||||
when: "'monitoring' in group_names"
|
|
||||||
|
|
||||||
- name: Deploy web server
|
|
||||||
ansible.builtin.include_role:
|
|
||||||
name: nginx
|
|
||||||
when: "'webservers' in group_names"
|
|
||||||
|
|
||||||
- name: Deploy database server
|
|
||||||
ansible.builtin.include_role:
|
|
||||||
name: postgresql
|
|
||||||
when: "'databases' in group_names"
|
|
||||||
|
|
||||||
- name: Deploy load balancer
|
|
||||||
ansible.builtin.include_role:
|
|
||||||
name: haproxy
|
|
||||||
when: "'loadbalancers' in group_names"
|
|
||||||
|
|
||||||
- name: Verify services are running
|
|
||||||
ansible.builtin.service:
|
|
||||||
name: "{{ item }}"
|
|
||||||
state: started
|
|
||||||
enabled: true
|
|
||||||
loop: "{{ services_to_verify }}"
|
|
||||||
when: services_to_verify | length > 0
|
|
||||||
failed_when: false
|
|
||||||
|
|
||||||
- name: Run health checks
|
|
||||||
ansible.builtin.uri:
|
|
||||||
url: http://localhost/health
|
|
||||||
method: GET
|
|
||||||
status_code: 200
|
|
||||||
register: health_check
|
|
||||||
failed_when: false
|
|
||||||
ignore_errors: true
|
|
||||||
when: "'webservers' in group_names"
|
|
||||||
@@ -1,14 +0,0 @@
|
|||||||
# fail2ban configuration
|
|
||||||
[DEFAULT]
|
|
||||||
bantime = 3600
|
|
||||||
findtime = 600
|
|
||||||
maxretry = 5
|
|
||||||
|
|
||||||
[sshd]
|
|
||||||
enabled = true
|
|
||||||
port = {{ ssh_port }}
|
|
||||||
logpath = /var/log/auth.log
|
|
||||||
maxretry = 3
|
|
||||||
|
|
||||||
[recidive]
|
|
||||||
enabled = true
|
|
||||||
@@ -1,62 +0,0 @@
|
|||||||
# Decommission Role
|
|
||||||
|
|
||||||
Gracefully decommission enterprise infrastructure nodes with comprehensive backup and cleanup.
|
|
||||||
|
|
||||||
## Features
|
|
||||||
|
|
||||||
- **Confirmation Prompt**: Interactive confirmation before decommissioning
|
|
||||||
- **Graceful Shutdown**: Stop services gracefully with connection drain time
|
|
||||||
- **Comprehensive Backup**: Archive configurations and data before cleanup
|
|
||||||
- **Selective Cleanup**: Only remove items that were deployed
|
|
||||||
- **Logging**: Detailed decommissioning logs for audit trail
|
|
||||||
- **Notifications**: Optional email notifications on completion
|
|
||||||
|
|
||||||
## Role Variables
|
|
||||||
|
|
||||||
See `defaults/main.yml` for all available variables.
|
|
||||||
|
|
||||||
### Key Variables
|
|
||||||
|
|
||||||
- `backup_data`: Backup application data (default: true)
|
|
||||||
- `export_config`: Export system configuration (default: true)
|
|
||||||
- `graceful_shutdown`: Graceful service shutdown (default: true)
|
|
||||||
- `auto_shutdown`: Auto shutdown after decommissioning (default: false)
|
|
||||||
- `application_services`: Services to stop
|
|
||||||
- `application_packages`: Packages to remove
|
|
||||||
- `decommission_notification_email`: Email for notifications (optional)
|
|
||||||
|
|
||||||
## Usage
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
- role: decommission
|
|
||||||
vars:
|
|
||||||
backup_data: true
|
|
||||||
export_config: true
|
|
||||||
auto_shutdown: false
|
|
||||||
decommission_notification_email: "ops@company.com"
|
|
||||||
```
|
|
||||||
|
|
||||||
## Backup Locations
|
|
||||||
|
|
||||||
- Configuration: `/var/backups/decommission-<timestamp>/config/`
|
|
||||||
- Data: `/var/backups/decommission-<timestamp>/data/`
|
|
||||||
- Report: `/var/log/decommission_report_<timestamp>.log`
|
|
||||||
|
|
||||||
## Supported Groups
|
|
||||||
|
|
||||||
- `webservers`: Backs up /var/www/html
|
|
||||||
- `databases`: Backs up PostgreSQL data
|
|
||||||
- `monitoring`: Backs up Prometheus data
|
|
||||||
- `loadbalancers`: Load balancer cleanup
|
|
||||||
|
|
||||||
## Safety Features
|
|
||||||
|
|
||||||
- Interactive confirmation before execution
|
|
||||||
- Connection drain time before shutdown (30 seconds)
|
|
||||||
- Errors are logged but don't stop the process
|
|
||||||
- Comprehensive audit log
|
|
||||||
|
|
||||||
## Tags
|
|
||||||
|
|
||||||
- `decommission`: All decommissioning tasks
|
|
||||||
- `cleanup`: Cleanup-related tasks
|
|
||||||
@@ -1,34 +0,0 @@
|
|||||||
---
|
|
||||||
# Decommissioning configuration
|
|
||||||
backup_data: true
|
|
||||||
export_config: true
|
|
||||||
graceful_shutdown: true
|
|
||||||
cleanup_inventory: true
|
|
||||||
auto_shutdown: false
|
|
||||||
shutdown_delay: 10
|
|
||||||
|
|
||||||
# Services to stop gracefully
|
|
||||||
application_services:
|
|
||||||
- nginx
|
|
||||||
- postgresql
|
|
||||||
- haproxy
|
|
||||||
|
|
||||||
# Packages to remove
|
|
||||||
application_packages:
|
|
||||||
- nginx
|
|
||||||
- postgresql
|
|
||||||
- haproxy
|
|
||||||
- prometheus
|
|
||||||
|
|
||||||
# Directories to archive
|
|
||||||
config_paths:
|
|
||||||
- /etc/
|
|
||||||
- /opt/application/
|
|
||||||
|
|
||||||
data_paths:
|
|
||||||
- /var/www/html
|
|
||||||
- /var/lib/postgresql
|
|
||||||
- /var/lib/prometheus
|
|
||||||
|
|
||||||
# Notification settings
|
|
||||||
decommission_notification_email: null
|
|
||||||
@@ -1,177 +0,0 @@
|
|||||||
---
|
|
||||||
# Fail fast on malformed role input before any destructive task runs.
# The previous check (`backup_data or not backup_data`) was always true and
# therefore validated nothing.
- name: Validate decommissioning requirements
  ansible.builtin.assert:
    that:
      # These three variables feed `loop:` and must be real lists.
      - application_services is iterable and application_services is not string
      - application_packages is iterable and application_packages is not string
      - data_paths is iterable and data_paths is not string
      - shutdown_delay | int >= 0
    fail_msg: "Invalid decommissioning configuration"
|
|
||||||
|
|
||||||
- name: Pre-decommissioning checks
|
|
||||||
block:
|
|
||||||
- name: Check node health
|
|
||||||
ansible.builtin.uri:
|
|
||||||
url: http://localhost/health
|
|
||||||
method: GET
|
|
||||||
status_code: 200
|
|
||||||
register: health_check
|
|
||||||
failed_when: false
|
|
||||||
ignore_errors: true
|
|
||||||
when: "'webservers' in group_names"
|
|
||||||
|
|
||||||
- name: Create decommissioning backup directory
|
|
||||||
ansible.builtin.file:
|
|
||||||
path: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}"
|
|
||||||
state: directory
|
|
||||||
mode: '0755'
|
|
||||||
|
|
||||||
- name: Initialize decommissioning log
|
|
||||||
ansible.builtin.file:
|
|
||||||
path: "/var/log/decommission.log"
|
|
||||||
state: touch
|
|
||||||
mode: '0644'
|
|
||||||
modification_time: now
|
|
||||||
access_time: now
|
|
||||||
|
|
||||||
- name: Log decommissioning start
|
|
||||||
ansible.builtin.lineinfile:
|
|
||||||
path: "/var/log/decommission.log"
|
|
||||||
line: "{{ ansible_date_time.iso8601 }} - Starting decommissioning of {{ inventory_hostname }}"
|
|
||||||
state: present
|
|
||||||
|
|
||||||
# Stop every configured service, then hold for a fixed interval on
# traffic-facing hosts. Both steps are skipped unless `graceful_shutdown` is set.
- name: Graceful application shutdown
  block:
    - name: Stop application services
      when: graceful_shutdown
      ansible.builtin.service:
        state: stopped
        name: "{{ item }}"
      loop: "{{ application_services }}"
      # A service that is not installed on this host must not abort the run.
      failed_when: false

    - name: Wait for connections to drain
      when:
        - graceful_shutdown
        - "'webservers' in group_names or 'loadbalancers' in group_names"
      ansible.builtin.pause:
        seconds: 30
|
|
||||||
|
|
||||||
# Archive configuration and application data under the per-run backup
# directory. The archives can contain secrets (config_paths defaults to /etc/),
# so directories and archives are restricted to the owning user instead of
# relying on the default umask.
- name: Export and backup data
  block:
    - name: Create config export directory
      ansible.builtin.file:
        path: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/config"
        state: directory
        mode: '0700'

    - name: Archive system configuration
      community.general.archive:
        path: "{{ config_paths }}"
        dest: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/config/system_config.tar.gz"
        format: gz
        mode: '0600'
      when: export_config | bool
      # Best effort: a missing path must not abort the decommission run.
      failed_when: false

    - name: Create data backup directory
      ansible.builtin.file:
        path: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/data"
        state: directory
        mode: '0700'
      when: backup_data | bool

    - name: Backup individual data paths
      community.general.archive:
        path: "{{ item }}"
        # "/var/www/html" becomes "_var_www_html.tar.gz".
        dest: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}/data/{{ item | regex_replace('/', '_') }}.tar.gz"
        format: gz
        mode: '0600'
      loop: "{{ data_paths }}"
      when: backup_data | bool
      # Best effort: not every host carries every data path.
      failed_when: false
|
|
||||||
|
|
||||||
- name: Update monitoring and load balancing
|
|
||||||
block:
|
|
||||||
- name: Remove from load balancer
|
|
||||||
ansible.builtin.debug:
|
|
||||||
msg: "Would remove {{ inventory_hostname }} from load balancer"
|
|
||||||
when: "'webservers' in group_names or 'databases' in group_names"
|
|
||||||
|
|
||||||
- name: Update monitoring alerts
|
|
||||||
ansible.builtin.debug:
|
|
||||||
msg: "Would update monitoring alerts for {{ inventory_hostname }}"
|
|
||||||
when: "'monitoring' not in group_names"
|
|
||||||
|
|
||||||
# Remove application payload, packages, logs and SSH credentials.
# Every step is best effort (`failed_when: false`): hosts only carry a subset
# of the listed paths and packages.
- name: Clean up application
  block:
    - name: Remove application directories
      ansible.builtin.file:
        path: "{{ item }}"
        state: absent
      loop:
        - /opt/application
        - /var/www/html
        - /var/lib/postgresql
        - /var/lib/prometheus
      failed_when: false

    - name: Remove application packages
      ansible.builtin.apt:
        name: "{{ item }}"
        state: absent
        purge: true
      loop: "{{ application_packages }}"
      failed_when: false

    # Run `find` directly instead of through the shell: the old script began
    # with `set -o pipefail`, which the default /bin/sh may reject, and the
    # masked failure meant no log was ever truncated. The role's own
    # decommission*.log files are excluded so the audit trail written earlier
    # in this run survives the cleanup.
    - name: Clean system logs
      ansible.builtin.command:
        argv:
          - find
          - /var/log
          - "-name"
          - "*.log"
          - "!"
          - "-name"
          - "decommission*.log"
          - "-type"
          - f
          - "-size"
          - "+0"
          - "-exec"
          - truncate
          - "-s"
          - "0"
          - "{}"
          - ";"
      changed_when: false
      failed_when: false

    - name: Remove SSH credentials
      ansible.builtin.file:
        path: "{{ item }}"
        state: absent
      loop:
        - /root/.ssh/authorized_keys
        - /root/.ssh/known_hosts
        - /home/infra-admin/.ssh/authorized_keys
      failed_when: false
|
|
||||||
|
|
||||||
- name: Generate decommissioning report
|
|
||||||
ansible.builtin.template:
|
|
||||||
src: decommission_report.j2
|
|
||||||
dest: "/var/log/decommission_report_{{ ansible_date_time.iso8601 }}.log"
|
|
||||||
mode: '0644'
|
|
||||||
vars:
|
|
||||||
backup_location: "/var/backups/decommission-{{ ansible_date_time.iso8601 }}"
|
|
||||||
|
|
||||||
# Optional e-mail summary. The role default for
# `decommission_notification_email` is `null`, which still counts as
# "defined"; the condition therefore checks for a non-empty value so the mail
# module is not invoked with an empty recipient.
- name: Send decommissioning notification
  community.general.mail:
    host: localhost
    port: 25
    to: "{{ decommission_notification_email }}"
    subject: "Node Decommissioned - {{ inventory_hostname }}"
    body: |
      Node {{ inventory_hostname }} has been successfully decommissioned.

      Backup location: /var/backups/decommission-{{ ansible_date_time.iso8601 }}/
      Services stopped: {{ application_services | join(', ') }}
      Configuration exported: {{ export_config }}
      Data backed up: {{ backup_data }}

      See /var/log/decommission_report_{{ ansible_date_time.iso8601 }}.log for details
  when: decommission_notification_email | default('', true) | length > 0
  # A missing local MTA must not fail the decommission run.
  failed_when: false
|
|
||||||
|
|
||||||
# Write the closing audit line and, when requested, power the node off.
- name: Finalize decommissioning
  block:
    # `ansible_date_time` is frozen at fact-gathering time, so it would repeat
    # the start timestamp; `now()` records when the run actually finished
    # (same UTC ISO-8601 layout).
    - name: Log decommissioning completion
      ansible.builtin.lineinfile:
        path: "/var/log/decommission.log"
        line: "{{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }} - Decommissioning completed for {{ inventory_hostname }}"
        state: present

    # The reboot module restarts the host and then waits for it to return; it
    # has no `delay` argument. A decommissioned node must stay down, so use
    # the dedicated shutdown module, whose `delay` is in seconds.
    - name: Perform system shutdown
      community.general.shutdown:
        msg: "System scheduled for shutdown after decommissioning"
        delay: "{{ shutdown_delay }}"
      when: auto_shutdown | bool
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
Decommissioning Report
|
|
||||||
======================
|
|
||||||
Generated: {{ ansible_date_time.iso8601 }}
|
|
||||||
Host: {{ inventory_hostname }}
|
|
||||||
|
|
||||||
Status: COMPLETED
|
|
||||||
Backup Location: {{ backup_location }}
|
|
||||||
|
|
||||||
Configuration Exported: {{ export_config }}
|
|
||||||
Data Backed Up: {{ backup_data }}
|
|
||||||
Services Stopped: {{ application_services | join(', ') }}
|
|
||||||
|
|
||||||
Log Location: /var/log/decommission.log
|
|
||||||
@@ -1,58 +0,0 @@
|
|||||||
# Hardening Role
|
|
||||||
|
|
||||||
Apply security hardening to enterprise infrastructure nodes following CIS benchmarks.
|
|
||||||
|
|
||||||
## Features
|
|
||||||
|
|
||||||
- **CIS Compliance**: Support for CIS hardening levels 1 and 2
|
|
||||||
- **SSH Hardening**: Disable root login, password auth, set auth limits
|
|
||||||
- **Firewall Configuration**: UFW with configurable rules
|
|
||||||
- **Service Cleanup**: Disable unnecessary services and remove insecure packages
|
|
||||||
- **Handlers**: SSH restarts via handlers
|
|
||||||
|
|
||||||
## Role Variables
|
|
||||||
|
|
||||||
See `defaults/main.yml` for all available variables.
|
|
||||||
|
|
||||||
### Key Variables
|
|
||||||
|
|
||||||
- `cis_level`: CIS hardening level (1 or 2)
|
|
||||||
- `disable_root_login`: Disable root SSH login (default: true)
|
|
||||||
- `secure_ssh_config`: Apply SSH security hardening (default: true)
|
|
||||||
- `firewall_policy`: Firewall default policy (default: deny)
|
|
||||||
- `ssh_max_auth_tries`: Maximum SSH authentication attempts (default: 3)
|
|
||||||
- `ssh_client_alive_interval`: SSH client alive interval in seconds (default: 300)
|
|
||||||
- `ssh_allowed_networks`: Networks from which SSH access is allowed
|
|
||||||
|
|
||||||
### SSH Allowed Networks
|
|
||||||
|
|
||||||
Default trusted networks:
|
|
||||||
- 10.0.0.0/8 (Private Class A)
|
|
||||||
- 172.16.0.0/12 (Private Class B)
|
|
||||||
- 192.168.0.0/16 (Private Class C)
|
|
||||||
|
|
||||||
## Usage
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
- role: hardening
|
|
||||||
vars:
|
|
||||||
cis_level: 1
|
|
||||||
disable_root_login: true
|
|
||||||
ssh_allowed_networks:
|
|
||||||
- 10.0.0.0/8
|
|
||||||
- 203.0.113.0/24
|
|
||||||
```
|
|
||||||
|
|
||||||
## SSH Configuration Changes
|
|
||||||
|
|
||||||
- Root login disabled
|
|
||||||
- Password authentication disabled
|
|
||||||
- Maximum auth tries: 3
|
|
||||||
- Empty passwords prohibited
|
|
||||||
- Client alive interval: 300 seconds
|
|
||||||
- Client alive count max: 2
|
|
||||||
|
|
||||||
## Tags
|
|
||||||
|
|
||||||
- `hardening`: All hardening tasks
|
|
||||||
- `security`: Security-related tasks
|
|
||||||
@@ -1,35 +0,0 @@
|
|||||||
---
|
|
||||||
# Hardening configuration
|
|
||||||
cis_level: 1
|
|
||||||
disable_root_login: true
|
|
||||||
secure_ssh_config: true
|
|
||||||
firewall_policy: deny
|
|
||||||
auditd_enabled: true
|
|
||||||
selinux_mode: enforcing
|
|
||||||
apparmor_enabled: true
|
|
||||||
|
|
||||||
# SSH Hardening
|
|
||||||
ssh_max_auth_tries: 3
|
|
||||||
ssh_client_alive_interval: 300
|
|
||||||
ssh_client_alive_count_max: 2
|
|
||||||
|
|
||||||
# Firewall rules for SSH (trusted networks)
|
|
||||||
ssh_allowed_networks:
|
|
||||||
- 10.0.0.0/8
|
|
||||||
- 172.16.0.0/12
|
|
||||||
- 192.168.0.0/16
|
|
||||||
|
|
||||||
# Services to disable
|
|
||||||
unnecessary_services:
|
|
||||||
- cups
|
|
||||||
- avahi-daemon
|
|
||||||
- bluetooth
|
|
||||||
- nfs-server
|
|
||||||
- rpcbind
|
|
||||||
|
|
||||||
# Packages to remove
|
|
||||||
unnecessary_packages:
|
|
||||||
- telnet
|
|
||||||
- rsh-client
|
|
||||||
- talk
|
|
||||||
- ntalk
|
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
---
|
|
||||||
- name: restart sshd
|
|
||||||
ansible.builtin.service:
|
|
||||||
name: sshd
|
|
||||||
state: restarted
|
|
||||||
@@ -1,7 +0,0 @@
|
|||||||
---
|
|
||||||
# CIS Hardening Level 1 tasks (stub for future expansion)
|
|
||||||
# https://www.cisecurity.org/cis-benchmarks/
|
|
||||||
|
|
||||||
- name: Check CIS status
|
|
||||||
ansible.builtin.debug:
|
|
||||||
msg: "CIS Hardening Level {{ cis_level }} would be applied here"
|
|
||||||
@@ -1,95 +0,0 @@
|
|||||||
---
|
|
||||||
# Guard: this role only supports Debian-family hosts and CIS level 1 or 2.
# `cis_level` is cast to int because values supplied with `-e cis_level=2`
# arrive as strings and would otherwise never match the integer list.
- name: Validate hardening requirements
  ansible.builtin.assert:
    that:
      - ansible_os_family == "Debian"
      - cis_level | int in [1, 2]
    fail_msg: "Unsupported configuration for hardening"
|
|
||||||
|
|
||||||
# Pull in the CIS task file. The int cast keeps the comparison valid when
# `cis_level` is supplied as a string (e.g. via `-e`).
- name: Apply CIS hardening tasks
  ansible.builtin.include_tasks: cis_hardening.yml
  when: cis_level | int >= 1
|
|
||||||
|
|
||||||
# Enforce sshd_config settings one directive per task.
# For every task:
#   * `insertbefore` + `firstmatch` place a directive that is not yet present
#     ahead of the first `Match` block, so it applies globally instead of
#     being scoped to whatever Match block happens to end the file;
#   * `validate` runs `sshd -t` on the candidate file, so a broken config is
#     never written and the restart handler cannot take SSH down.
- name: Configure SSH hardening
  block:
    - name: Disable root SSH login
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^PermitRootLogin'
        line: 'PermitRootLogin no'
        insertbefore: '^\s*Match\s'
        firstmatch: true
        validate: '/usr/sbin/sshd -t -f %s'
        state: present
      when: disable_root_login
      notify: restart sshd

    - name: Disable password authentication
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^PasswordAuthentication'
        line: 'PasswordAuthentication no'
        insertbefore: '^\s*Match\s'
        firstmatch: true
        validate: '/usr/sbin/sshd -t -f %s'
        state: present
      when: secure_ssh_config
      notify: restart sshd

    - name: Set MaxAuthTries
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^MaxAuthTries'
        line: "MaxAuthTries {{ ssh_max_auth_tries }}"
        insertbefore: '^\s*Match\s'
        firstmatch: true
        validate: '/usr/sbin/sshd -t -f %s'
        state: present
      notify: restart sshd

    - name: Disable empty passwords
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^PermitEmptyPasswords'
        line: 'PermitEmptyPasswords no'
        insertbefore: '^\s*Match\s'
        firstmatch: true
        validate: '/usr/sbin/sshd -t -f %s'
        state: present
      notify: restart sshd

    - name: Set ClientAliveInterval
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^ClientAliveInterval'
        line: "ClientAliveInterval {{ ssh_client_alive_interval }}"
        insertbefore: '^\s*Match\s'
        firstmatch: true
        validate: '/usr/sbin/sshd -t -f %s'
        state: present
      notify: restart sshd

    - name: Set ClientAliveCountMax
      ansible.builtin.lineinfile:
        path: /etc/ssh/sshd_config
        regexp: '^ClientAliveCountMax'
        line: "ClientAliveCountMax {{ ssh_client_alive_count_max }}"
        insertbefore: '^\s*Match\s'
        firstmatch: true
        validate: '/usr/sbin/sshd -t -f %s'
        state: present
      notify: restart sshd
|
|
||||||
|
|
||||||
# Order matters: the SSH allow rules are registered BEFORE the firewall is
# switched on. Enabling UFW with a default-deny policy first risks cutting off
# the very SSH session Ansible is using.
- name: Configure firewall rules
  block:
    - name: Allow SSH from trusted networks
      community.general.ufw:
        rule: allow
        # Follows a custom `ssh_port` when one is set; otherwise the
        # previously hard-coded port 22.
        port: "{{ ssh_port | default(22) | string }}"
        proto: tcp
        from: "{{ item }}"
      loop: "{{ ssh_allowed_networks }}"

    - name: Enable firewall
      community.general.ufw:
        state: enabled
        policy: "{{ firewall_policy }}"
      when: firewall_policy is defined
|
|
||||||
|
|
||||||
# Stop and disable each service named in `unnecessary_services`.
# Failures are ignored because most hosts have only some of them installed.
- name: Disable unnecessary services
  loop: "{{ unnecessary_services }}"
  failed_when: false
  ansible.builtin.service:
    enabled: false
    state: stopped
    name: "{{ item }}"
|
|
||||||
|
|
||||||
# Purge insecure legacy packages. The whole list is handed to apt in one call
# rather than looping, so apt runs a single transaction instead of one per
# package.
- name: Remove unnecessary packages
  ansible.builtin.apt:
    name: "{{ unnecessary_packages }}"
    state: absent
    purge: true
  failed_when: false
|
|
||||||
@@ -1,45 +0,0 @@
|
|||||||
# Patching Role
|
|
||||||
|
|
||||||
Apply security patches and OS updates to enterprise infrastructure nodes.
|
|
||||||
|
|
||||||
## Features
|
|
||||||
|
|
||||||
- **Idempotent**: Properly checks for changes with `changed_when`
|
|
||||||
- **Patch Window**: Optional enforcement of patch time windows
|
|
||||||
- **Pre-patch Backup**: Backs up package list before patching
|
|
||||||
- **Smart Reboot**: Automatically detects if reboot is required
|
|
||||||
- **Service Restart**: Restarts only necessary services after patching
|
|
||||||
- **Health Checks**: Verifies services and runs health endpoint checks
|
|
||||||
|
|
||||||
## Role Variables
|
|
||||||
|
|
||||||
See `defaults/main.yml` for all available variables.
|
|
||||||
|
|
||||||
### Key Variables
|
|
||||||
|
|
||||||
- `patch_window_start`: Patch window start time (default: 02:00)
|
|
||||||
- `patch_window_end`: Patch window end time (default: 04:00)
|
|
||||||
- `enforce_patch_window`: Enforce patch time window (default: true)
|
|
||||||
- `patch_security_only`: Apply security updates only (default: true)
|
|
||||||
- `backup_before_patch`: Create backup before patching (default: true)
|
|
||||||
- `reboot_if_required`: Auto-reboot if required (default: false)
|
|
||||||
- `services_to_restart`: Services to restart after patching
|
|
||||||
- `critical_services`: Critical services to verify after patching
|
|
||||||
|
|
||||||
## Usage
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
- role: patching
|
|
||||||
vars:
|
|
||||||
patch_security_only: true
|
|
||||||
enforce_patch_window: false
|
|
||||||
reboot_if_required: true
|
|
||||||
```
|
|
||||||
|
|
||||||
## Report
|
|
||||||
|
|
||||||
Patch report is generated at: `/var/log/patch_report_<timestamp>.log`
|
|
||||||
|
|
||||||
## Backup Location
|
|
||||||
|
|
||||||
Pre-patch backups saved to: `/var/backups/pre-patch-<timestamp>/`
|
|
||||||
@@ -1,20 +0,0 @@
|
|||||||
---
|
|
||||||
# Patching configuration
|
|
||||||
patch_window_start: "02:00"
|
|
||||||
patch_window_end: "04:00"
|
|
||||||
enforce_patch_window: true
|
|
||||||
patch_security_only: true
|
|
||||||
backup_before_patch: true
|
|
||||||
reboot_if_required: false
|
|
||||||
reboot_timeout: 300
|
|
||||||
|
|
||||||
# Services to restart after patching
|
|
||||||
services_to_restart:
|
|
||||||
- sshd
|
|
||||||
- fail2ban
|
|
||||||
|
|
||||||
# Services to verify after patching
|
|
||||||
critical_services:
|
|
||||||
- systemd-journald
|
|
||||||
- systemd-logind
|
|
||||||
- cron
|
|
||||||
@@ -1,6 +0,0 @@
|
|||||||
---
|
|
||||||
- name: restart patching services
|
|
||||||
ansible.builtin.service:
|
|
||||||
name: "{{ item }}"
|
|
||||||
state: restarted
|
|
||||||
loop: "{{ services_to_restart }}"
|
|
||||||
@@ -1,105 +0,0 @@
|
|||||||
---
|
|
||||||
# Abort when the host clock (captured at fact-gathering time) is outside the
# maintenance window. The comparison uses minutes since midnight, so the
# minutes part of the bounds is honoured, and a window that crosses midnight
# (e.g. 22:00-02:00) is accepted. The window is half-open: start inclusive,
# end exclusive.
- name: Validate patch window
  when: enforce_patch_window | bool
  vars:
    patch_now_minutes: >-
      {{ (ansible_date_time.hour | int) * 60 + (ansible_date_time.minute | int) }}
    # A bound written without minutes (e.g. "2") is treated as HH:00.
    patch_start_minutes: >-
      {{ (patch_window_start.split(':')[0] | int) * 60
         + ((patch_window_start.split(':') + ['0'])[1] | int) }}
    patch_end_minutes: >-
      {{ (patch_window_end.split(':')[0] | int) * 60
         + ((patch_window_end.split(':') + ['0'])[1] | int) }}
  block:
    - name: Check current time against patch window
      ansible.builtin.assert:
        that:
          - >-
            (patch_now_minutes | int >= patch_start_minutes | int
             and patch_now_minutes | int < patch_end_minutes | int)
            if (patch_start_minutes | int <= patch_end_minutes | int) else
            (patch_now_minutes | int >= patch_start_minutes | int
             or patch_now_minutes | int < patch_end_minutes | int)
        fail_msg: |
          Current time {{ ansible_date_time.hour }}:{{ ansible_date_time.minute }} is outside patch window {{ patch_window_start }}-{{ patch_window_end }}
|
|
||||||
|
|
||||||
# Snapshot the installed-package selections so the pre-patch state can be
# compared or restored later.
- name: Create pre-patch backup
  when: backup_before_patch | bool
  block:
    - name: Create backup directory
      ansible.builtin.file:
        path: "/var/backups/pre-patch-{{ ansible_date_time.iso8601 }}"
        state: directory
        mode: '0755'

    - name: Capture current package list
      ansible.builtin.shell: |
        set -o pipefail
        dpkg --get-selections > /var/backups/pre-patch-{{ ansible_date_time.iso8601 }}/packages.list
      args:
        # `set -o pipefail` is a bash option; the shell module otherwise runs
        # /bin/sh, which may reject it and fail the task before dpkg runs.
        executable: /bin/bash
      changed_when: false
|
|
||||||
|
|
||||||
# Refresh the apt cache first, then count upgradable packages. Counting
# before the refresh reported numbers from a stale package index.
- name: Update package cache
  ansible.builtin.apt:
    update_cache: true
    cache_valid_time: 300
  changed_when: false

- name: Check for available updates
  ansible.builtin.shell: |
    set -o pipefail
    apt list --upgradable 2>/dev/null | { grep -v "Listing..." || true; } | wc -l
  args:
    # pipefail needs bash; the module default is /bin/sh.
    executable: /bin/bash
  register: updates_available_count
  changed_when: false
  # Informational only. `|| true` above keeps "nothing to upgrade" (grep exit
  # status 1) from being reported as a pipeline failure.
  failed_when: false
|
|
||||||
|
|
||||||
- name: Check if reboot required before patching
|
|
||||||
ansible.builtin.stat:
|
|
||||||
path: /var/run/reboot-required
|
|
||||||
register: reboot_required_before
|
|
||||||
changed_when: false
|
|
||||||
|
|
||||||
# One task instead of two mutually exclusive ones. Both used to register
# `apt_update_result`, and a skipped task still overwrites its register
# variable, so the result of the task that actually ran could be replaced by
# a "skipped" record, making the patch report show "No" updates applied.
# The upgrade mode per setting is unchanged: `dist` when
# `patch_security_only` is true, `full` otherwise.
# NOTE(review): `upgrade: dist` does not limit apt to the security pocket;
# confirm how "security only" is meant to be enforced (e.g. apt sources).
- name: Apply package updates
  ansible.builtin.apt:
    upgrade: "{{ patch_security_only | bool | ternary('dist', 'full') }}"
    update_cache: true
  register: apt_update_result
  notify: restart patching services
|
|
||||||
|
|
||||||
- name: Check if reboot required after patching
|
|
||||||
ansible.builtin.stat:
|
|
||||||
path: /var/run/reboot-required
|
|
||||||
register: reboot_required_after
|
|
||||||
changed_when: false
|
|
||||||
|
|
||||||
- name: Verify critical services are running
|
|
||||||
ansible.builtin.service:
|
|
||||||
name: "{{ item }}"
|
|
||||||
state: started
|
|
||||||
enabled: true
|
|
||||||
loop: "{{ critical_services }}"
|
|
||||||
failed_when: false
|
|
||||||
|
|
||||||
- name: Run post-patch health checks
|
|
||||||
ansible.builtin.uri:
|
|
||||||
url: http://localhost/health
|
|
||||||
method: GET
|
|
||||||
status_code: 200
|
|
||||||
register: health_check
|
|
||||||
failed_when: false
|
|
||||||
ignore_errors: true
|
|
||||||
when: "'webservers' in group_names"
|
|
||||||
|
|
||||||
- name: Set reboot required flag
|
|
||||||
ansible.builtin.set_fact:
|
|
||||||
reboot_required: "{{ reboot_required_after.stat.exists | default(false) }}"
|
|
||||||
|
|
||||||
# Reboot only when the host asks for it AND the operator opted in.
# The module argument is `reboot_timeout`; `timeout` is not an option of
# ansible.builtin.reboot.
- name: Perform system reboot if required
  ansible.builtin.reboot:
    msg: "Rebooting after security patches"
    reboot_timeout: "{{ reboot_timeout | int }}"
  when:
    - reboot_required | bool
    - reboot_if_required | bool
|
|
||||||
|
|
||||||
- name: Generate patching report
|
|
||||||
ansible.builtin.template:
|
|
||||||
src: patch_report.j2
|
|
||||||
dest: /var/log/patch_report_{{ ansible_date_time.iso8601 }}.log
|
|
||||||
mode: '0644'
|
|
||||||
vars:
|
|
||||||
updates_applied_count: "{{ apt_update_result.changed | ternary('Yes', 'No') }}"
|
|
||||||
reboot_required_flag: "{{ reboot_required }}"
|
|
||||||
@@ -1,10 +0,0 @@
|
|||||||
Patching Report
|
|
||||||
===============
|
|
||||||
Generated: {{ ansible_date_time.iso8601 }}
|
|
||||||
Host: {{ inventory_hostname }}
|
|
||||||
|
|
||||||
Updates Applied: {{ updates_applied_count }}
|
|
||||||
Reboot Required: {{ reboot_required_flag }}
|
|
||||||
Services Restarted: {{ services_to_restart | join(', ') }}
|
|
||||||
|
|
||||||
Backup Location: /var/backups/pre-patch-{{ ansible_date_time.iso8601 }}/
|
|
||||||
@@ -1,21 +0,0 @@
|
|||||||
# Scenario: Simulate Failure and Patch
|
|
||||||
|
|
||||||
## Description
|
|
||||||
|
|
||||||
Validate that a service-level failure can be detected, recovered, and followed by a controlled patch workflow. This mirrors a maintenance window where a degraded node is stabilized before package updates are applied.
|
|
||||||
|
|
||||||
## Commands
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd enterprise-infra-simulator
|
|
||||||
./scripts/simulate_failure.sh service 30 web
|
|
||||||
ansible-playbook -i inventory/hosts.ini playbooks/patch.yml
|
|
||||||
ansible-playbook -i inventory/hosts.ini playbooks/hardening.yml --check
|
|
||||||
```
|
|
||||||
|
|
||||||
## Expected Result
|
|
||||||
|
|
||||||
- The simulation records a temporary service failure.
|
|
||||||
- The service is restored after cleanup.
|
|
||||||
- The patch playbook completes without unreachable hosts.
|
|
||||||
- Hardening check mode reports no destructive changes.
|
|
||||||
@@ -1,116 +0,0 @@
|
|||||||
---
|
|
||||||
# Scenario play: sample the load average on each host, decide whether to scale
# up or down, apply the matching roles, wait for the cooldown, verify health
# and report.
#
# Fixes relative to the previous revision:
#   * load averages are taken from the "load average:" section of `uptime`
#     instead of from the first comma-separated fields (the uptime duration);
#   * group membership conditions are real expressions; they used to be quoted
#     string literals inside the `when`, which are always truthy;
#   * numeric facts are cast before comparison (set_fact may store strings);
#   * `health_check.status` is defaulted, because the health check is skipped
#     on hosts outside `webservers`;
#   * booleans are `true`/`false` and created files carry an explicit mode.
- name: Enterprise Scaling Event Scenario
  hosts: all
  become: true
  gather_facts: true
  vars:
    scaling_threshold: 80
    cooldown_period: 300
    max_scale_up: 5
    min_instances: 2

  pre_tasks:
    - name: Log scenario start
      ansible.builtin.lineinfile:
        path: "/var/log/scaling_scenario.log"
        line: "{{ ansible_date_time.iso8601 }} - Starting scaling event scenario"
        create: true
        mode: '0644'

    - name: Check current load
      ansible.builtin.command: uptime
      register: system_load
      changed_when: false

    - name: Parse load average
      ansible.builtin.set_fact:
        load_1min: "{{ load_fields[0] | float }}"
        load_5min: "{{ load_fields[1] | float }}"
        load_15min: "{{ load_fields[2] | float }}"
      vars:
        # Drop everything up to "load average(s):", then collect the three
        # decimal numbers; a decimal comma is normalised to a dot.
        load_fields: >-
          {{ system_load.stdout
             | regex_replace('^.*load averages?:', '')
             | regex_findall('[0-9]+[.,][0-9]+')
             | map('replace', ',', '.')
             | list }}

  tasks:
    - name: Evaluate scaling conditions
      ansible.builtin.set_fact:
        scale_up_needed: "{{ (load_5min | float) > (scaling_threshold | float) }}"
        scale_down_needed: "{{ (load_5min | float) < ((scaling_threshold | float) * 0.3) }}"

    - name: Scale up web servers
      ansible.builtin.include_role:
        name: scale_up
        tasks_from: web_servers
      vars:
        scale_count: "{{ [max_scale_up | int, ((load_5min | float) / 10) | int] | min }}"
      when:
        - scale_up_needed | bool
        - "'webservers' in group_names"

    - name: Scale up database servers
      ansible.builtin.include_role:
        name: scale_up
        tasks_from: database_servers
      vars:
        scale_count: "{{ [2, ((load_5min | float) / 20) | int] | min }}"
      when:
        - scale_up_needed | bool
        - "'databases' in group_names"

    - name: Update load balancer configuration
      ansible.builtin.include_role:
        name: load_balancer
        tasks_from: update_backends
      when: scale_up_needed | bool

    - name: Scale down web servers
      ansible.builtin.include_role:
        name: scale_down
        tasks_from: web_servers
      vars:
        # Numeric suffix of the hostname (web3 -> 3); 0 when there is none.
        host_index: "{{ inventory_hostname | regex_findall('[0-9]+') | first | default(0) | int }}"
        scale_count: "{{ [(host_index | int) - (min_instances | int), 1] | max }}"
      when:
        - scale_down_needed | bool
        - "'webservers' in group_names"
        - host_index | int > min_instances | int

    - name: Wait for cooldown period
      ansible.builtin.pause:
        seconds: "{{ cooldown_period }}"
      when: scale_up_needed | bool or scale_down_needed | bool

    - name: Verify scaling results
      ansible.builtin.uri:
        url: http://localhost/health
        method: GET
        status_code: 200
      register: health_check
      until: health_check.status == 200
      retries: 5
      delay: 10
      when: "'webservers' in group_names"

    - name: Update monitoring thresholds
      ansible.builtin.include_role:
        name: monitoring
        tasks_from: update_alerts
      vars:
        new_threshold: "{{ scaling_threshold + 10 }}"

    - name: Send scaling notification
      community.general.mail:
        to: "{{ scaling_notification_email | default('infra-team@company.com') }}"
        subject: "Infrastructure Scaling Event - {{ inventory_hostname }}"
        body: |
          Scaling event completed on {{ inventory_hostname }}

          Load averages: {{ load_1min }}, {{ load_5min }}, {{ load_15min }}
          Action taken: {{ 'Scale Up' if scale_up_needed else 'Scale Down' if scale_down_needed else 'No Action' }}
          Health check: {{ 'PASSED' if (health_check.status | default(0)) == 200 else 'FAILED' }}

          See /var/log/scaling_scenario.log for details
      when: scaling_notification_email is defined
      ignore_errors: true

  post_tasks:
    - name: Generate scaling scenario report
      ansible.builtin.template:
        src: templates/scaling_scenario_report.j2
        dest: "/var/log/scaling_scenario_report_{{ ansible_date_time.iso8601 }}.log"
        mode: '0644'
      vars:
        scenario_outcome: "{{ 'SUCCESS' if (health_check.status | default(0)) == 200 else 'WARNING' }}"
        load_metrics: "{{ load_1min }}, {{ load_5min }}, {{ load_15min }}"

    - name: Log scenario completion
      ansible.builtin.lineinfile:
        path: "/var/log/scaling_scenario.log"
        line: "{{ ansible_date_time.iso8601 }} - Scaling event scenario completed"
|
|
||||||
@@ -1,343 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Enterprise Infrastructure Failure Simulation Script
|
|
||||||
# Simulates various types of infrastructure failures for testing
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Configuration
|
|
||||||
DOCKER_COMPOSE_FILE="docker-compose.yml"
|
|
||||||
INVENTORY_FILE="inventory/hosts.ini"
|
|
||||||
LOG_FILE="logs/failure_simulation.log"
|
|
||||||
|
|
||||||
# Default values
|
|
||||||
FAILURE_TYPE="${1:-network}"
|
|
||||||
DURATION="${2:-60}"
|
|
||||||
TARGET_NODES="${3:-all}"
|
|
||||||
INTENSITY="${INTENSITY:-medium}"
|
|
||||||
|
|
||||||
# Logging function
|
|
||||||
log() {
    # Print a timestamped message to stdout and append the same line to $LOG_FILE.
    # All arguments are joined into one message.
    local stamp
    stamp="$(date '+%Y-%m-%d %H:%M:%S')"
    printf '%s - %s\n' "$stamp" "$*" | tee -a "$LOG_FILE"
}
|
|
||||||
|
|
||||||
# Error handling
|
|
||||||
error_exit() {
    # Log the given message as an ERROR, undo any injected failures, then
    # terminate the script with exit status 1.
    #   $1 - error description
    local message="$1"

    log "ERROR: $message"
    cleanup_failure   # roll back any active failures before exiting
    exit 1
}
|
|
||||||
|
|
||||||
# Validate inputs
|
|
||||||
validate_inputs() {
    # Check FAILURE_TYPE, DURATION and INTENSITY; any invalid value ends the
    # script through error_exit.
    if [[ ! "$FAILURE_TYPE" =~ ^(network|disk|service|node|cpu|memory)$ ]]; then
        error_exit "Invalid failure type: $FAILURE_TYPE. Must be network, disk, service, node, cpu, or memory"
    fi

    # Duration must be all digits and at least 1 (seconds).
    if [[ ! "$DURATION" =~ ^[0-9]+$ ]] || [ "$DURATION" -lt 1 ]; then
        error_exit "Invalid duration: $DURATION. Must be a positive integer (seconds)"
    fi

    if [[ ! "$INTENSITY" =~ ^(low|medium|high|critical)$ ]]; then
        error_exit "Invalid intensity: $INTENSITY. Must be low, medium, high, or critical"
    fi
}
|
|
||||||
|
|
||||||
# Get target containers
|
|
||||||
get_target_containers() {
    # Print the compose service names to target, based on TARGET_NODES.
    #   "all"          -> every service reported by docker-compose
    #   anything else  -> the value itself (web, db, lb, monitor or a custom name)
    if [ "$TARGET_NODES" = "all" ]; then
        # Drop a possible "NAME" header line; never fail when nothing is listed.
        docker-compose ps --services | grep -v "^NAME$" || true
        return
    fi

    echo "$TARGET_NODES"
}
|
|
||||||
|
|
||||||
# Network failure simulation
|
|
||||||
simulate_network_failure() {
    # Disconnect every targeted container from the network named in its
    # HostConfig.NetworkMode and record "<cid>:<network>" in
    # /tmp/network_failure_state so cleanup_failure can reconnect it.
    local containers container container_ids cid network

    containers=$(get_target_containers)
    log "Simulating network failure on containers: $containers"

    for container in $containers; do
        container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true)

        for cid in $container_ids; do
            # Look the network up once, before touching the container.
            network=$(docker inspect "$cid" --format '{{.HostConfig.NetworkMode}}' 2>/dev/null || true)
            if [ -z "$network" ]; then
                log "WARNING: Could not determine network for container $cid; skipping"
                continue
            fi

            log "Disconnecting network for container $cid"

            # Record the restore information BEFORE disconnecting, so an
            # interruption between the two steps cannot leave a container
            # disconnected with no record of how to restore it.
            echo "$cid:$network" >> /tmp/network_failure_state

            docker network disconnect "$network" "$cid" 2>/dev/null || true
        done
    done
}
|
|
||||||
|
|
||||||
# Disk failure simulation
|
|
||||||
simulate_disk_failure() {
    # Consume disk space inside every targeted container by writing
    # /tmp/disk_fill with dd. The amount depends on INTENSITY:
    #   low=50 MiB, medium=100 MiB, high=500 MiB, critical=1024 MiB (1 GiB)
    # Each affected container is recorded in /tmp/disk_failure_state so
    # cleanup_failure can remove the file again.
    local containers container container_ids cid fill_mb

    containers=$(get_target_containers)
    log "Simulating disk space exhaustion on containers: $containers"

    # Size in MiB, used directly as the dd block count with bs=1M.
    # (The previous "size * 1024" computation wrote 1024 times the intended
    # amount and could not parse the "1G" value at all.)
    case "$INTENSITY" in
        low)      fill_mb=50 ;;
        high)     fill_mb=500 ;;
        critical) fill_mb=1024 ;;
        *)        fill_mb=100 ;;   # medium and any unexpected value
    esac

    for container in $containers; do
        container_ids=$(docker-compose ps -q "$container" 2>/dev/null || true)

        for cid in $container_ids; do
            log "Filling disk space in container $cid"

            # Record first so an interrupted dd still gets cleaned up.
            echo "$cid:disk_fill" >> /tmp/disk_failure_state

            docker exec "$cid" bash -c "dd if=/dev/zero of=/tmp/disk_fill bs=1M count=$fill_mb" 2>/dev/null || true
        done
    done
}
|
|
||||||
|
|
||||||
# Service failure simulation
|
|
||||||
simulate_service_failure() {
    # Stop nginx, postgresql and haproxy (best effort) inside every targeted
    # container and record each container in /tmp/service_failure_state.
    local targets service ids cid unit

    targets=$(get_target_containers) || true
    log "Simulating service failures on containers: $targets"

    for service in $targets; do
        ids=$(docker-compose ps -q "$service" 2>/dev/null || true)

        for cid in $ids; do
            [ -n "$cid" ] || continue
            log "Stopping services in container $cid"

            # A unit that is absent in a given container is simply ignored.
            for unit in nginx postgresql haproxy; do
                docker exec "$cid" systemctl stop "$unit" 2>/dev/null || true
            done

            echo "$cid:services" >> /tmp/service_failure_state
        done
    done
}
|
|
||||||
|
|
||||||
# Node failure simulation
|
|
||||||
simulate_node_failure() {
    # Simulate a whole-node outage by pausing every targeted container.
    # Paused containers are recorded in /tmp/node_failure_state so that
    # cleanup_failure can unpause them.
    local targets service ids cid

    targets=$(get_target_containers) || true
    log "Simulating complete node failures on containers: $targets"

    for service in $targets; do
        ids=$(docker-compose ps -q "$service" 2>/dev/null || true)

        for cid in $ids; do
            [ -n "$cid" ] || continue
            log "Stopping container $cid (node failure)"
            docker pause "$cid"
            echo "$cid:paused" >> /tmp/node_failure_state
        done
    done
}
|
|
||||||
|
|
||||||
# CPU stress simulation
|
|
||||||
simulate_cpu_failure() {
    # Start a detached busy loop inside every targeted container and record
    # "<cid>:cpu_stress:<pid>" in /tmp/cpu_failure_state for later cleanup.
    local targets service ids cid stress_pid

    targets=$(get_target_containers) || true
    log "Simulating CPU stress on containers: $targets"

    for service in $targets; do
        ids=$(docker-compose ps -q "$service" 2>/dev/null || true)

        for cid in $ids; do
            [ -n "$cid" ] || continue
            log "Starting CPU stress in container $cid"

            docker exec -d "$cid" bash -c "while true; do :; done" 2>/dev/null || true

            # PID of the first process whose command line contains the busy
            # loop; empty when no such process is listed.
            stress_pid=$(docker exec "$cid" ps aux | grep "while true" | grep -v grep | awk '{print $2}' | head -1) || true
            echo "$cid:cpu_stress:$stress_pid" >> /tmp/cpu_failure_state
        done
    done
}
|
|
||||||
|
|
||||||
# Memory stress simulation
|
|
||||||
simulate_memory_failure() {
    # Start a detached "tail /dev/zero" inside every targeted container and
    # record "<cid>:memory_stress:<pid>" in /tmp/memory_failure_state for
    # later cleanup.
    local targets service ids cid stress_pid

    targets=$(get_target_containers) || true
    log "Simulating memory exhaustion on containers: $targets"

    for service in $targets; do
        ids=$(docker-compose ps -q "$service" 2>/dev/null || true)

        for cid in $ids; do
            [ -n "$cid" ] || continue
            log "Starting memory stress in container $cid"

            docker exec -d "$cid" bash -c "tail /dev/zero" 2>/dev/null || true

            # PID of the first listed "tail /dev/zero" process; empty when
            # none is listed.
            stress_pid=$(docker exec "$cid" ps aux | grep "tail /dev/zero" | grep -v grep | awk '{print $2}' | head -1) || true
            echo "$cid:memory_stress:$stress_pid" >> /tmp/memory_failure_state
        done
    done
}
|
|
||||||
|
|
||||||
# Inject failure
|
|
||||||
inject_failure() {
    # Dispatch to the simulation routine matching FAILURE_TYPE.
    # Unknown types do nothing here.
    if [ "$FAILURE_TYPE" = "network" ]; then
        simulate_network_failure
    elif [ "$FAILURE_TYPE" = "disk" ]; then
        simulate_disk_failure
    elif [ "$FAILURE_TYPE" = "service" ]; then
        simulate_service_failure
    elif [ "$FAILURE_TYPE" = "node" ]; then
        simulate_node_failure
    elif [ "$FAILURE_TYPE" = "cpu" ]; then
        simulate_cpu_failure
    elif [ "$FAILURE_TYPE" = "memory" ]; then
        simulate_memory_failure
    fi
}
|
|
||||||
|
|
||||||
# Cleanup failure
|
|
||||||
cleanup_failure() {
    # Undo every injected failure recorded in the /tmp/*_failure_state files.
    # Each state file is processed line by line and then removed, so calling
    # this function again (it is invoked from both the normal flow and the
    # EXIT trap) is harmless. All restore actions are best effort.
    local cid network pid unit state_file

    log "Cleaning up failure simulation"

    # Restore network connections ("<cid>:<network>" per line).
    if [ -f /tmp/network_failure_state ]; then
        while IFS=: read -r cid network; do
            docker network connect "$network" "$cid" 2>/dev/null || true
        done < /tmp/network_failure_state
        rm -f /tmp/network_failure_state
    fi

    # Remove the disk fill files.
    if [ -f /tmp/disk_failure_state ]; then
        while IFS=: read -r cid _; do
            docker exec "$cid" rm -f /tmp/disk_fill 2>/dev/null || true
        done < /tmp/disk_failure_state
        rm -f /tmp/disk_failure_state
    fi

    # Restart the services that were stopped.
    if [ -f /tmp/service_failure_state ]; then
        while IFS=: read -r cid _; do
            for unit in nginx postgresql haproxy; do
                docker exec "$cid" systemctl start "$unit" 2>/dev/null || true
            done
        done < /tmp/service_failure_state
        rm -f /tmp/service_failure_state
    fi

    # Unpause paused containers.
    if [ -f /tmp/node_failure_state ]; then
        while IFS=: read -r cid _; do
            docker unpause "$cid" 2>/dev/null || true
        done < /tmp/node_failure_state
        rm -f /tmp/node_failure_state
    fi

    # Kill CPU and memory stress processes ("<cid>:<kind>:<pid>" per line).
    for state_file in /tmp/cpu_failure_state /tmp/memory_failure_state; do
        [ -f "$state_file" ] || continue
        while IFS=: read -r cid _ pid; do
            if [ -z "$pid" ]; then
                # No PID was captured at injection time; make that visible
                # instead of silently running "kill -9" with an empty PID.
                log "WARNING: No stress PID recorded for container $cid; a stress process may still be running"
                continue
            fi
            docker exec "$cid" kill -9 "$pid" 2>/dev/null || true
        done < "$state_file"
        rm -f "$state_file"
    done
}
|
|
||||||
|
|
||||||
# Monitor failure
|
|
||||||
monitor_failure() {
    # Observe the environment for DURATION seconds while the failure is
    # active: warn when no container is Up or Paused and log a
    # docker stats snapshot on every pass (at most every 10 seconds).
    local end_time remaining ps_output

    # 10# forces base 10 so a value such as "08" is not rejected as octal.
    end_time=$(( $(date +%s) + 10#$DURATION ))

    log "Monitoring failure for $DURATION seconds"

    while [ "$(date +%s)" -lt "$end_time" ]; do
        # Capture the listing first: piping straight into "grep -q" under
        # "set -o pipefail" can fail spuriously when grep exits early and
        # docker-compose receives SIGPIPE, producing a false warning.
        ps_output=$(docker-compose ps || true)
        if ! grep -q "Up\|Paused" <<< "$ps_output"; then
            log "WARNING: All containers are down"
        fi

        # Log system metrics
        log "System status: $(docker stats --no-stream --format 'table {{.Container}}\t{{.CPUPerc}}\t{{.MemPerc}}' | tail -n +2)"

        # Sleep up to 10 seconds, but never past the requested duration.
        remaining=$(( end_time - $(date +%s) ))
        if [ "$remaining" -gt 10 ]; then
            sleep 10
        elif [ "$remaining" -gt 0 ]; then
            sleep "$remaining"
        fi
    done
}
|
|
||||||
|
|
||||||
# Generate failure report
|
|
||||||
generate_report() {
    # Write a plain-text summary of the run to
    # reports/failure_simulation_<timestamp>.txt.
    #
    # The pre-failure container status comes from PRE_FAILURE_STATUS, which
    # main() captures before injecting the failure. Previously both status
    # sections ran "docker-compose ps" at report time and were always equal.
    local report_file post_status

    report_file="reports/failure_simulation_$(date +%Y%m%d_%H%M%S).txt"
    post_status=$(docker-compose ps || true)

    cat > "$report_file" << EOF
Failure Simulation Report
========================

Timestamp: $(date)
Failure Type: $FAILURE_TYPE
Duration: $DURATION seconds
Target Nodes: $TARGET_NODES
Intensity: $INTENSITY

Pre-failure Status:
${PRE_FAILURE_STATUS:-(not captured)}

Post-failure Status:
$post_status

Log File: $LOG_FILE
EOF

    log "Failure simulation report generated: $report_file"
}

# Main execution
main() {
    # Orchestrates one run: validate -> snapshot -> inject -> monitor ->
    # cleanup -> report.
    log "Starting failure simulation: $FAILURE_TYPE for $DURATION seconds"

    validate_inputs

    # Snapshot the container status before anything is broken so the report
    # can show a real "before" state.
    PRE_FAILURE_STATUS=$(docker-compose ps || true)

    # Inject failure
    inject_failure

    # Monitor during failure
    monitor_failure

    # Cleanup
    cleanup_failure

    # Generate report
    generate_report

    log "Failure simulation completed successfully"
}
|
|
||||||
|
|
||||||
# Trap for cleanup on script exit
|
|
||||||
trap cleanup_failure EXIT
|
|
||||||
|
|
||||||
# Initialize logging
|
|
||||||
mkdir -p logs reports
|
|
||||||
|
|
||||||
# Run main function
|
|
||||||
main "$@"
|
|
||||||
@@ -1,208 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Enterprise Infrastructure Scaling Simulation Script
|
|
||||||
# Simulates scaling operations for infrastructure nodes
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Configuration
|
|
||||||
DOCKER_COMPOSE_FILE="docker-compose.yml"
|
|
||||||
INVENTORY_FILE="inventory/hosts.ini"
|
|
||||||
LOG_FILE="logs/scaling_simulation.log"
|
|
||||||
|
|
||||||
# Default values
|
|
||||||
DIRECTION="${1:-up}"
|
|
||||||
COUNT="${2:-1}"
|
|
||||||
NODE_TYPE="${3:-web}"
|
|
||||||
SIMULATION_MODE="${SIMULATION_MODE:-false}"
|
|
||||||
|
|
||||||
# Logging function
|
|
||||||
log() {
    # Emit "<timestamp> - <message>" on stdout and append it to $LOG_FILE.
    local now
    now="$(date '+%Y-%m-%d %H:%M:%S')"
    printf '%s - %s\n' "$now" "$*" | tee -a "$LOG_FILE"
}
|
|
||||||
|
|
||||||
# Error handling
|
|
||||||
error_exit() {
    # Log the given message as an ERROR and end the script with status 1.
    #   $1 - error description
    local message="$1"

    log "ERROR: $message"
    exit 1
}
|
|
||||||
|
|
||||||
# Validate inputs
|
|
||||||
validate_inputs() {
    # Check DIRECTION, COUNT and NODE_TYPE; any invalid value ends the script
    # through error_exit.
    case "$DIRECTION" in
        up|down) ;;
        *) error_exit "Invalid direction: $DIRECTION. Must be 'up' or 'down'" ;;
    esac

    # Count must be all digits and at least 1.
    if [[ ! "$COUNT" =~ ^[0-9]+$ ]] || [ "$COUNT" -lt 1 ]; then
        error_exit "Invalid count: $COUNT. Must be a positive integer"
    fi

    if [[ ! "$NODE_TYPE" =~ ^(web|db|lb|monitor)$ ]]; then
        error_exit "Invalid node type: $NODE_TYPE. Must be web, db, lb, or monitor"
    fi
}
|
|
||||||
|
|
||||||
# Get current node count
|
|
||||||
get_current_count() {
    # Print how many lines of "docker-compose ps <type>" contain "Up".
    #   $1 - node type (web, db, lb or monitor); other values print nothing
    #
    # "grep -c" exits with status 1 when the count is 0 (it still prints
    # "0"); "|| true" keeps a zero count from being treated as a failure
    # under "set -e" / "pipefail".
    local type="$1"

    case "$type" in
        web|db|lb|monitor)
            docker-compose ps "$type" | grep -c "Up" || true
            ;;
    esac
}
|
|
||||||
|
|
||||||
# Scale up infrastructure
|
|
||||||
scale_up() {
    # Add <count> nodes of the given type on top of those already running.
    #   $1 - node type (compose service name)
    #   $2 - number of nodes to add
    local type="$1"
    local count="$2"
    local current target

    log "Scaling up $count $type nodes"

    current=$(get_current_count "$type") || true
    # "docker-compose up --scale" takes the desired TOTAL, not an increment.
    # Passing only $count (as before) shrank the service whenever more than
    # $count instances were already running.
    target=$(( ${current:-0} + count ))

    # Update docker-compose replica count
    # NOTE(review): this substitution rewrites every "replicas:" entry in the
    # compose file, not only the one belonging to $type - confirm this is
    # intended.
    sed -i.bak "s/replicas: [0-9]\+/replicas: ${target}/" "$DOCKER_COMPOSE_FILE"

    # Deploy new containers
    docker-compose up -d --scale "${type}=${target}"

    # Wait for containers to be ready
    log "Waiting for containers to be ready..."
    sleep 30

    # Update inventory
    update_inventory "$type" "$count" "add"

    # Run provisioning playbook on new nodes
    if [ "$SIMULATION_MODE" = false ]; then
        ansible-playbook -i "$INVENTORY_FILE" playbooks/provision.yml --limit "${type}*"
    fi

    log "Successfully scaled up $count $type nodes"
}
|
|
||||||
|
|
||||||
# Scale down infrastructure
|
|
||||||
scale_down() {
    # Remove <count> running nodes of the given type.
    #   $1 - node type (compose service name)
    #   $2 - number of nodes to remove
    # Aborts through error_exit when fewer than <count> nodes are running.
    local type="$1"
    local count="$2"
    local running victims node

    running=$(get_current_count "$type") || true
    if [ "$running" -lt "$count" ]; then
        error_exit "Cannot scale down $count nodes. Only $running $type nodes currently running"
    fi

    log "Scaling down $count $type nodes"

    # Take the first <count> "Up" entries from the compose listing.
    victims=$(docker-compose ps "$type" | grep "Up" | head -n "$count" | awk '{print $1}') || true

    for node in $victims; do
        # Decommission through Ansible unless this is a simulation-only run.
        if [ "$SIMULATION_MODE" = false ]; then
            ansible-playbook -i "$INVENTORY_FILE" playbooks/decommission.yml --limit "$node"
        fi
        docker stop "$node"
        docker rm "$node"
    done

    # Keep the replica count in the compose file in step with reality.
    sed -i.bak "s/replicas: [0-9]\+/replicas: $(( running - count ))/" "$DOCKER_COMPOSE_FILE"

    update_inventory "$type" "$count" "remove"

    log "Successfully scaled down $count $type nodes"
}
|
|
||||||
|
|
||||||
# Update Ansible inventory
|
|
||||||
update_inventory() {
    # Record an inventory change for scaled nodes. In this simulation the
    # inventory file is not modified; the action is only logged.
    #   $1 - node type
    #   $2 - number of nodes
    #   $3 - "add" or "remove" (anything else logs only the first message)
    local type="$1"
    local count="$2"
    local action="$3"

    log "Updating inventory for $action $count $type nodes"

    if [ "$action" = "add" ]; then
        log "Added $count $type nodes to inventory"
    elif [ "$action" = "remove" ]; then
        log "Removed $count $type nodes from inventory"
    fi
}
|
|
||||||
|
|
||||||
# Health check after scaling
|
|
||||||
health_check() {
    # Post-scaling sanity checks: abort when no container is reported as
    # "Up", and (outside simulation mode) warn when the Ansible ping fails.
    local ps_output

    log "Running health checks after scaling"

    # Capture the listing before grepping: piping straight into "grep -q"
    # under "set -o pipefail" can fail spuriously when grep exits on the
    # first match and docker-compose receives SIGPIPE, which would abort a
    # healthy run.
    ps_output=$(docker-compose ps || true)
    if ! grep -q "Up" <<< "$ps_output"; then
        error_exit "Some containers failed to start"
    fi

    # Ansible ping check
    if [ "$SIMULATION_MODE" = false ]; then
        if ! ansible -i "$INVENTORY_FILE" all -m ping >/dev/null 2>&1; then
            log "WARNING: Some nodes failed Ansible ping check"
        fi
    fi

    log "Health checks completed"
}
|
|
||||||
|
|
||||||
# Generate scaling report
|
|
||||||
generate_report() {
    # Write a plain-text summary of the scaling run to
    # reports/scaling_report_<timestamp>.txt and log the file name.
    local stamp report_file generated_at compose_status inventory_status

    stamp=$(date +%Y%m%d_%H%M%S)
    report_file="reports/scaling_report_${stamp}.txt"

    # Collect the dynamic sections up front, in the order they appear below.
    generated_at=$(date) || true
    compose_status=$(docker-compose ps) || true
    inventory_status=$(ansible -i "$INVENTORY_FILE" --list-hosts all 2>/dev/null || echo "Ansible inventory check failed")

    cat > "$report_file" << EOF
Scaling Simulation Report
========================

Timestamp: $generated_at
Direction: $DIRECTION
Node Type: $NODE_TYPE
Count: $COUNT
Simulation Mode: $SIMULATION_MODE

Current Status:
$compose_status

Inventory Status:
$inventory_status

Log File: $LOG_FILE
EOF

    log "Scaling report generated: $report_file"
}
|
|
||||||
|
|
||||||
# Main execution
|
|
||||||
main() {
    # Entry point: validate the inputs, scale in the requested direction,
    # then run the health checks and write the report.
    log "Starting scaling simulation: $DIRECTION $COUNT $NODE_TYPE nodes"

    validate_inputs

    if [ "$DIRECTION" = "up" ]; then
        scale_up "$NODE_TYPE" "$COUNT"
    elif [ "$DIRECTION" = "down" ]; then
        scale_down "$NODE_TYPE" "$COUNT"
    fi

    health_check
    generate_report

    log "Scaling simulation completed successfully"
}
|
|
||||||
|
|
||||||
# Initialize logging
|
|
||||||
mkdir -p logs reports
|
|
||||||
|
|
||||||
# Run main function
|
|
||||||
main "$@"
|
|
||||||
@@ -0,0 +1,9 @@
|
|||||||
|
# Known Limitations
|
||||||
|
|
||||||
|
- Veritas scripts require manual review before real use. VxVM and VCS behavior varies by version, cluster design, naming convention, and operational policy.
|
||||||
|
- GPFS commands require a real cluster and must be adapted to the site layout, NSD naming standard, failure groups, storage pools, and maintenance process.
|
||||||
|
- The AIX Ansible role is a portfolio example unless tested on a real AIX LPAR with the target OpenSSH, sudo, audit, and OS levels.
|
||||||
|
- SSH hardening must be validated against the full `sshd` configuration, not only a managed drop-in file.
|
||||||
|
- The hardening examples cover selected controls only. They are not a full CIS benchmark implementation or compliance attestation.
|
||||||
|
- Scripts do not replace formal change procedures, peer review, backups, monitoring checks, or rollback planning.
|
||||||
|
- Sample outputs are fake and sanitized. They should be used for documentation review, not operational decisions.
|
||||||
@@ -0,0 +1,101 @@
|
|||||||
|
# infra-run
|
||||||
|
|
||||||
|
`infra-run` is a sanitized infrastructure operations project. It contains Bash, Ansible, Python, and documentation examples based on Linux administration, incident response, storage operations, hardening, prechecks, postchecks, and controlled change workflows.
|
||||||
|
|
||||||
|
The goal is to show operational judgment, not to ship a universal automation product.
|
||||||
|
|
||||||
|
## Current Contents
|
||||||
|
|
||||||
|
### Bash Operational Scripts
|
||||||
|
|
||||||
|
- [scripts/bash/os-healthcheck](./scripts/bash/os-healthcheck/) - general Linux health, service, disk, network, and report scripts.
|
||||||
|
- [scripts/bash/incident-checks](./scripts/bash/incident-checks/) - standalone read-only incident checks for CPU, memory/OOM, SSH failures, TLS expiry, DNS, NTP, filesystems, inodes, services, JVM diagnostics, and an L2 Markdown triage report wrapper.
|
||||||
|
- [scripts/bash/disk-full](./scripts/bash/disk-full/) - disk-full triage and cleanup review workflow.
|
||||||
|
- [scripts/bash/veritas](./scripts/bash/veritas/) - Veritas VxVM/VCS storage expansion workflow examples.
|
||||||
|
- [scripts/bash/gpfs](./scripts/bash/gpfs/) - GPFS / IBM Spectrum Scale expansion workflow examples.
|
||||||
|
|
||||||
|
### Python Log And Reporting Tools
|
||||||
|
|
||||||
|
- [scripts/python](./scripts/python/) - read-only Python operational helpers using the standard library only.
|
||||||
|
- [scripts/python/incident-log-summary](./scripts/python/incident-log-summary/) - read-only Python log summary helper for incident pattern review.
|
||||||
|
- [scripts/python/log-diff-checker](./scripts/python/log-diff-checker/) - read-only Python before/after log comparison helper for change review.
|
||||||
|
- [scripts/python/auth-log-audit](./scripts/python/auth-log-audit/) - read-only Python authentication log audit helper for SSH, sudo, su, and PAM review.
|
||||||
|
- [scripts/python/jvm-log-analyzer](./scripts/python/jvm-log-analyzer/) - read-only Python JVM and Java application log analyzer for exception, stack trace, HTTP 5xx, database, and TLS review.
|
||||||
|
- [scripts/python/journal-analyzer](./scripts/python/journal-analyzer/) - read-only Python exported journal analyzer for failed units, restart patterns, OOM events, and service warnings.
|
||||||
|
- [scripts/python/known-error-matcher](./scripts/python/known-error-matcher/) - read-only Python matcher for local logs and JSON known-error catalogs with runbook references.
|
||||||
|
|
||||||
|
### Ansible Automation
|
||||||
|
|
||||||
|
- [ansible](./ansible/) - selected baseline hardening examples for RHEL-like Linux, Debian/Ubuntu, and AIX.
|
||||||
|
|
||||||
|
### Runbooks And Documentation
|
||||||
|
|
||||||
|
- [examples](./examples/) - sanitized sample command outputs and incident notes.
|
||||||
|
|
||||||
|
## Documentation
|
||||||
|
|
||||||
|
- [docs/operations-cheatsheet.md](./docs/operations-cheatsheet.md) - production operations quick reference covering Linux/Unix triage, text processing, incident workflows, networking, storage, AIX, SSL/TLS, automation safety, Ansible execution, observability, and operational habits.
|
||||||
|
|
||||||
|
## What This Is
|
||||||
|
|
||||||
|
- A portfolio project for Linux and infrastructure operations roles.
|
||||||
|
- A set of readable examples showing precheck, dry-run, execution guardrails, postcheck, and reporting patterns.
|
||||||
|
- A place to demonstrate Bash, Ansible, storage workflow, and troubleshooting habits with sanitized inputs.
|
||||||
|
|
||||||
|
## What This Is Not
|
||||||
|
|
||||||
|
- Not intended for direct live use.
|
||||||
|
- Not a complete CIS benchmark implementation.
|
||||||
|
- Not a replacement for site-specific change procedures.
|
||||||
|
- Not tested against live Veritas, GPFS, or AIX systems in this repository.
|
||||||
|
- Not safe to run blindly on servers without review.
|
||||||
|
|
||||||
|
## Currently Usable
|
||||||
|
|
||||||
|
- Bash syntax can be checked locally.
|
||||||
|
- Shell scripts can be reviewed and partially exercised on a Linux workstation when platform commands are available or mocked.
|
||||||
|
- Disk-full read-only scripts can be run against local paths for basic behavior checks.
|
||||||
|
- Python log analysis examples can be run against sanitized sample logs under each tool directory.
|
||||||
|
- Ansible YAML and role structure can be linted locally.
|
||||||
|
|
||||||
|
## Running Safely
|
||||||
|
|
||||||
|
- Start with the relevant README or runbook before executing a script.
|
||||||
|
- Prefer read-only discovery scripts before remediation scripts.
|
||||||
|
- Use dry-run mode unless a script explicitly documents safe local behavior.
|
||||||
|
- Only use `--execute` after reviewing inputs, affected systems, rollback options, and post-checks.
|
||||||
|
- For Ansible, start with `--check --diff` against a lab inventory.
|
||||||
|
|
||||||
|
## Lab-Safe Examples
|
||||||
|
|
||||||
|
- Veritas and GPFS scripts default to dry-run behavior where they plan destructive or platform-changing operations.
|
||||||
|
- Ansible hardening roles are examples of selected controls and need adaptation before use.
|
||||||
|
- Sample outputs under [examples](./examples/) are fake and sanitized.
|
||||||
|
|
||||||
|
## Tested
|
||||||
|
|
||||||
|
See [TESTED.md](./TESTED.md) for current validation status.
|
||||||
|
|
||||||
|
Short version:
|
||||||
|
|
||||||
|
- Shell scripts were reviewed for dry-run behavior and obvious quoting issues.
|
||||||
|
- YAML and Ansible files are intended for local linting.
|
||||||
|
- Veritas, GPFS, and AIX behavior was not validated against real systems here.
|
||||||
|
|
||||||
|
## Basic Validation
|
||||||
|
|
||||||
|
From the repository root:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./scripts/validate-repo.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Focused checks are available in `scripts/check-bash.sh`, `scripts/check-ansible.sh`, `scripts/check-python.sh`, and `scripts/check-docs.sh`. If `ansible-lint` reports collection-related issues, install the collections listed in [ansible/collections/requirements.yml](./ansible/collections/requirements.yml) and rerun it. Treat lint as a starting point; platform testing still requires actual target systems.
|
||||||
|
|
||||||
|
## Supporting Notes
|
||||||
|
|
||||||
|
- [SOURCE.md](./SOURCE.md) explains why this project exists and what experience shaped it.
|
||||||
|
- [TESTED.md](./TESTED.md) lists what was checked locally and what was not.
|
||||||
|
- [KNOWN_LIMITATIONS.md](./KNOWN_LIMITATIONS.md) documents technical limits and operational cautions.
|
||||||
|
- [ROADMAP.md](./ROADMAP.md) tracks planned additions without presenting them as completed work.
|
||||||
|
- [../AGENTS.md](../AGENTS.md) and [../docs/codex](../docs/codex/) document repository working rules and review expectations.
|
||||||
@@ -0,0 +1,31 @@
|
|||||||
|
# infra-run Roadmap
|
||||||
|
|
||||||
|
This file tracks planned `infra-run` additions without presenting them as completed work.
|
||||||
|
|
||||||
|
## Candidate Additions
|
||||||
|
|
||||||
|
- More sample reports for disk pressure, service failures, and network incidents.
|
||||||
|
- A small Python parser for converting script output into a markdown change note.
|
||||||
|
- Additional Ansible molecule or container-based syntax checks where platform support is realistic.
|
||||||
|
- Standalone runbooks that reference the existing Bash workflows.
|
||||||
|
- Shared known-error pattern catalog review.
|
||||||
|
- Additional links between Python findings and existing runbooks.
|
||||||
|
- Change evidence collector for pre-check and post-check notes.
|
||||||
|
- Report examples suitable for incident and change tickets.
|
||||||
|
- Optional wrapper command only after the standalone Python tools stabilize.
|
||||||
|
|
||||||
|
## Implemented Additions
|
||||||
|
|
||||||
|
- `infra-run/scripts/bash/incident-checks/` - standalone read-only Bash checks for CPU, memory/OOM, service restart loops, failed SSH logins, TLS certificate expiry, DNS connectivity, time sync drift, read-only filesystems, inode pressure, and JVM process diagnostics.
|
||||||
|
- `infra-run/scripts/python/incident-log-summary/` - first read-only Python log analysis helper for summarizing configured incident patterns from local log files.
|
||||||
|
- `infra-run/scripts/python/log-diff-checker/` - read-only before/after log comparison helper for post-change pattern review.
|
||||||
|
- `infra-run/scripts/python/auth-log-audit/` - read-only authentication log audit helper for local SSH, sudo, su, and PAM review.
|
||||||
|
- `infra-run/scripts/python/jvm-log-analyzer/` - read-only JVM and Java application log analyzer for exceptions, stack traces, HTTP 5xx entries, database issues, TLS failures, and JVM failure symptoms.
|
||||||
|
- `infra-run/scripts/python/journal-analyzer/` - read-only exported `journalctl` text analyzer for summarizing failed units, dependency issues, restart patterns, OOM findings, disk/filesystem symptoms, and related service warnings.
|
||||||
|
- `infra-run/scripts/python/known-error-matcher/` - read-only known-error matcher for local logs and JSON pattern catalogs with severity, category, samples, and runbook references.
|
||||||
|
|
||||||
|
## Not Planned
|
||||||
|
|
||||||
|
- A full compliance benchmark implementation.
|
||||||
|
- Automated production changes without review gates.
|
||||||
|
- Vendor-specific storage actions that cannot be tested in a lab.
|
||||||
@@ -0,0 +1,37 @@
|
|||||||
|
# Source And Intent
|
||||||
|
|
||||||
|
`infra-run` exists to present infrastructure operations work in a form that can be reviewed without exposing employer systems, hostnames, storage identifiers, tickets, or internal procedures.
|
||||||
|
|
||||||
|
The project is inspired by professional Linux and infrastructure operations work: prechecks before changes, postchecks after changes, disk-pressure incidents, SSH and sudo hardening, storage expansion planning, cluster awareness, and the need to leave clear notes for other engineers.
|
||||||
|
|
||||||
|
## What Is Realistic
|
||||||
|
|
||||||
|
- The workflow shape: precheck, dry-run, execute only with explicit approval, postcheck, and report.
|
||||||
|
- The operational topics: Linux health checks, disk-full triage, Veritas VxVM/VCS concepts, GPFS / IBM Spectrum Scale concepts, and selected OS hardening controls.
|
||||||
|
- The caution around storage, clustering, SSH, sudo, audit, and filesystem changes.
|
||||||
|
|
||||||
|
## What Is Simplified
|
||||||
|
|
||||||
|
- Commands are written as examples and do not cover every vendor, OS release, package layout, or site standard.
|
||||||
|
- The Veritas and GPFS scripts model common workflow steps but cannot validate a real cluster from this repository.
|
||||||
|
- The Ansible roles apply selected baseline controls; they are not full compliance implementations.
|
||||||
|
- Reporting examples use sanitized sample data.
|
||||||
|
|
||||||
|
## What Was Sanitized
|
||||||
|
|
||||||
|
- Hostnames, IP addresses, disk names, WWNs, ticket numbers, application names, company names, and environment-specific values.
|
||||||
|
- Exact production procedures and internal approval paths.
|
||||||
|
- Any data that could identify a real system or organization.
|
||||||
|
|
||||||
|
## Production Caution
|
||||||
|
|
||||||
|
Do not run these scripts blindly on production systems. Review every command, adapt variables and paths, test in a lab, confirm backups and rollback plans, and follow the local change process.
|
||||||
|
|
||||||
|
This project does not claim that the exact scripts were used in production.
|
||||||
|
|
||||||
|
## Roles This Supports
|
||||||
|
|
||||||
|
- Linux System Administrator
|
||||||
|
- Infrastructure Engineer
|
||||||
|
- SRE / DevOps Operations Engineer
|
||||||
|
- Linux Platform Engineer
|
||||||
@@ -0,0 +1,44 @@
|
|||||||
|
# Tested
|
||||||
|
|
||||||
|
This file documents the validation status for `infra-run`.
|
||||||
|
|
||||||
|
## Tested Locally
|
||||||
|
|
||||||
|
- Repository structure and documentation links were reviewed.
|
||||||
|
- Bash scripts were reviewed for dry-run defaults, quoting, and obvious unsafe cleanup behavior.
|
||||||
|
- Disk-full examples use fake data and can be read without access to production systems.
|
||||||
|
|
||||||
|
## Syntax Checked
|
||||||
|
|
||||||
|
Recommended local checks:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
find infra-run/scripts/bash -name '*.sh' -print0 | xargs -0 shellcheck -x -P infra-run/scripts/bash/disk-full -P infra-run/scripts/bash/gpfs -P infra-run/scripts/bash/veritas
|
||||||
|
yamllint .
|
||||||
|
cd infra-run/ansible && ansible-lint playbooks roles
|
||||||
|
```
|
||||||
|
|
||||||
|
The GitHub Actions workflow runs shell and YAML validation. `ansible-lint` is non-blocking because role behavior depends on platform facts, installed collections, and target OS support.
|
||||||
|
|
||||||
|
## Not Tested Against Real Systems
|
||||||
|
|
||||||
|
- Veritas VxVM/VCS commands were not tested against a live Veritas cluster here.
|
||||||
|
- GPFS / IBM Spectrum Scale commands were not tested against a live GPFS cluster here.
|
||||||
|
- AIX hardening tasks were not tested against a real AIX LPAR here.
|
||||||
|
- SSH hardening was not validated across every possible `sshd_config` layout.
|
||||||
|
|
||||||
|
## Known Limitations
|
||||||
|
|
||||||
|
- Destructive storage operations are dry-run by default where applicable, but dry-run output is not a substitute for peer review.
|
||||||
|
- Some scripts require vendor commands that are not available on a normal Linux workstation.
|
||||||
|
- Ansible examples are selected baseline controls, not full hardening benchmarks.
|
||||||
|
- Local linting does not prove production safety.
|
||||||
|
|
||||||
|
## Suggested Validation Steps
|
||||||
|
|
||||||
|
1. Run `shellcheck` against all Bash scripts.
|
||||||
|
2. Run `yamllint` against repository YAML.
|
||||||
|
3. Run `cd infra-run/ansible && ansible-lint playbooks roles` and review any non-blocking warnings.
|
||||||
|
4. Run disk-full read-only scripts on disposable local paths.
|
||||||
|
5. For Veritas or GPFS, test only in a lab with fake volumes/disks or a controlled training environment.
|
||||||
|
6. Validate SSH changes on a disposable host using the full effective `sshd` configuration.
|
||||||
@@ -0,0 +1,38 @@
|
|||||||
|
# infra-run/ansible
|
||||||
|
|
||||||
|
This directory contains Ansible automation for infrastructure operations and OS hardening. It is organized around the standard separation of inventory, roles, playbooks, collections, and tests.
|
||||||
|
|
||||||
|
## Diagram
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart TD
|
||||||
|
A["ansible"] --> B["collections"]
|
||||||
|
A --> C["inventory"]
|
||||||
|
A --> D["playbooks"]
|
||||||
|
A --> E["roles"]
|
||||||
|
A --> F["tests"]
|
||||||
|
C --> C1["group_vars"]
|
||||||
|
C --> C2["host_vars"]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
- `collections` - collection requirements for supported automation targets.
|
||||||
|
- `inventory` - sanitized Linux and AIX inventory examples with shared defaults.
|
||||||
|
- `playbooks` - runnable playbooks that apply the selected baseline hardening roles.
|
||||||
|
- `roles` - reusable hardening roles for supported operating systems.
|
||||||
|
- `tests` - validation and test harnesses for Ansible content.
|
||||||
|
|
||||||
|
## Hardening Coverage
|
||||||
|
|
||||||
|
- `cis-rhel9-hardening` - RHEL 9 baseline tasks for packages, services, SSH, sudo, sysctl, auditing, logging, filesystem controls, and validation.
|
||||||
|
- `cis-debian-ubuntu-hardening` - Debian 13 and Ubuntu 26.04 baseline tasks for apt packages, services, SSH, sudo, sysctl, auditing, logging, filesystem controls, and validation.
|
||||||
|
- `cis-aix7-hardening` - IBM AIX 7 baseline tasks for SSH, sudo, audit, logging, cron, users, password policy, network settings, filesystem controls, services, and validation.
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Roles are selected baseline examples intended for portfolio and lab use, not a drop-in compliance certification.
|
||||||
|
- Defaults are sanitized and configurable through inventory or `--extra-vars`.
|
||||||
|
- Run platform-specific playbooks against appropriate test hosts before adapting them to managed environments.
|
||||||
|
- Prefer `--check --diff` for review runs before applying changes.
|
||||||
|
- Validate from the repository root with `./scripts/check-ansible.sh`.
|
||||||
@@ -0,0 +1,9 @@
|
|||||||
|
# Project-local Ansible settings. Relative paths resolve from the directory
# that holds this file.
[defaults]
inventory = inventory/hosts.yml
roles_path = roles
# NOTE(review): host key checking is disabled, which suits the sanitized
# localhost/lab inventory only. Re-enable it before targeting real hosts.
host_key_checking = False
retry_files_enabled = False
# Render task results as YAML through the built-in default callback rather
# than the deprecated community.general `yaml` callback plugin.
stdout_callback = default
callback_result_format = yaml

[privilege_escalation]
become = True
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
|
||||||
@@ -0,0 +1,23 @@
|
|||||||
|
# infra-run/ansible/collections
|
||||||
|
|
||||||
|
This folder is reserved for Ansible collections used by the `infra-run` automation area. It is intended for dependencies or custom collections that support playbooks and roles.
|
||||||
|
|
||||||
|
## Diagram
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart TD
|
||||||
|
A["collections"] --> B["External or custom collections"]
|
||||||
|
B --> C["Modules"]
|
||||||
|
B --> D["Plugins"]
|
||||||
|
B --> E["Roles integration"]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
- Store collection dependencies close to the operational automation they support.
|
||||||
|
- Keep collection usage explicit for repeatable execution in controlled environments.
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- The directory currently contains only a collection requirements file and a placeholder file.
|
||||||
|
- A local README helps preserve intent before real collection content is added.
|
||||||
@@ -0,0 +1,4 @@
|
|||||||
|
---
# Collections this automation depends on. No versions are pinned here, so the
# installer resolves whatever release is current at install time.
collections:
  - name: ansible.posix
  - name: community.general
|
||||||
@@ -0,0 +1,30 @@
|
|||||||
|
# infra-run/ansible/inventory
|
||||||
|
|
||||||
|
This directory is intended for Ansible inventory definitions. It separates shared variables from host-specific values to support clean environment modeling and safer automation.
|
||||||
|
|
||||||
|
## Diagram
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart TD
|
||||||
|
A["inventory"] --> B["group_vars"]
|
||||||
|
A --> C["host_vars"]
|
||||||
|
B --> D["Shared environment variables"]
|
||||||
|
C --> E["Per-host overrides"]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
- `group_vars` - variables applied at group or environment level.
|
||||||
|
- `host_vars` - variables tailored to individual nodes.
|
||||||
|
- `hosts.yml` - sanitized example groups for Linux and AIX hardening targets.
|
||||||
|
|
||||||
|
## Current Inventory Shape
|
||||||
|
|
||||||
|
- `linux` - local example host for Linux hardening playbooks.
|
||||||
|
- `aix` - empty sanitized group ready for AIX host definitions.
|
||||||
|
- `group_vars/all.yml` - shared hardening defaults such as NTP servers, SSH behavior, audit/logging toggles, sysctl hardening, and optional mount management.
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Inventory values are intentionally sanitized.
|
||||||
|
- Override defaults per host, per group, or per run before applying any hardening playbook.
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
|
||||||
@@ -0,0 +1,17 @@
|
|||||||
|
# infra-run/ansible/inventory/group_vars
|
||||||
|
|
||||||
|
This folder is reserved for shared Ansible variables applied to inventory groups. It is the right place for environment defaults, role inputs, and group-level operational settings.
|
||||||
|
|
||||||
|
## Diagram
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart LR
|
||||||
|
A["group_vars"] --> B["Environment defaults"]
|
||||||
|
A --> C["Role parameters"]
|
||||||
|
A --> D["Shared operational values"]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Use this layer when multiple hosts should inherit the same configuration.
|
||||||
|
- The directory currently holds only sanitized shared defaults in `all.yml` and does not expose environment-specific data.
|
||||||
@@ -0,0 +1,18 @@
|
|||||||
|
---
# Shared, sanitized defaults for the hardening playbooks.
# Override per run with --extra-vars or inventory when needed.

timezone: UTC

# NOTE(review): these hosts are the RHEL vendor zone of pool.ntp.org, while the
# same defaults appear to be shared with non-RHEL targets - confirm the NTP
# sources that each platform should really use.
cis_ntp_servers:
  - "0.rhel.pool.ntp.org"
  - "1.rhel.pool.ntp.org"
  - "2.rhel.pool.ntp.org"
  - "3.rhel.pool.ntp.org"

# SSH behavior. Password authentication stays enabled by default.
cis_disable_root_login: true
cis_disable_password_auth: false

# Package and service toggles.
cis_install_auditd: true
cis_enable_chrony: true
cis_enable_rsyslog: true
cis_remove_legacy_packages: true

# Kernel and filesystem toggles. Mount option management is opt-in.
cis_enable_sysctl_hardening: true
cis_manage_mount_options: false
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
|
||||||
@@ -0,0 +1,17 @@
|
|||||||
|
# infra-run/ansible/inventory/host_vars
|
||||||
|
|
||||||
|
This folder is intended for host-specific Ansible variables. It complements `group_vars` by capturing node-level differences that should not be shared across an entire inventory group.
|
||||||
|
|
||||||
|
## Diagram
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart LR
|
||||||
|
A["host_vars"] --> B["Host A overrides"]
|
||||||
|
A --> C["Host B overrides"]
|
||||||
|
A --> D["Per-node secrets or tuning"]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Use this area when operational differences are specific to one server.
|
||||||
|
- The directory is intentionally empty in the sanitized portfolio state.
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
---
# Sanitized example inventory with one group per platform family.

# Linux hardening playbooks run against the control node itself through the
# local connection plugin, so no SSH access is needed.
linux:
  hosts:
    localhost:
      ansible_connection: local

# Intentionally empty; add real AIX hosts here before running the AIX playbook.
aix:
  hosts: {}
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
|
||||||
@@ -0,0 +1,20 @@
|
|||||||
|
# infra-run/ansible/playbooks
|
||||||
|
|
||||||
|
This directory contains executable Ansible playbooks that coordinate roles, inventories, and operational hardening tasks.
|
||||||
|
|
||||||
|
## Diagram
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart TD
|
||||||
|
A["playbooks"] --> B["Provisioning flows"]
|
||||||
|
A --> C["Hardening flows"]
|
||||||
|
A --> D["Patch workflows"]
|
||||||
|
A --> E["Decommission workflows"]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- `cis-rhel9-hardening.yml` applies the RHEL 9 selected baseline hardening role to Linux inventory targets.
|
||||||
|
- `cis-debian-ubuntu-hardening.yml` applies the Debian 13 / Ubuntu 26.04 selected baseline hardening role to Linux inventory targets.
|
||||||
|
- `cis-aix7-hardening.yml` applies the IBM AIX 7 selected baseline hardening role to AIX inventory targets.
|
||||||
|
- Use the sanitized inventory under `../inventory/` as a starting point and override defaults per environment.
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
---
# Applies the cis-aix7-hardening role to the `aix` inventory group and then
# prints the validation summary, if the role produced one.
- name: Apply selected baseline IBM AIX 7 hardening controls
  hosts: aix
  gather_facts: true
  become: true

  roles:
    - role: cis-aix7-hardening
      tags: [cis, aix7, hardening]

  post_tasks:
    # Tagged `always` so the summary is shown for any --tags selection.
    - name: Show AIX hardening validation summary
      when: cis_aix_validation_summary is defined
      ansible.builtin.debug:
        var: cis_aix_validation_summary
      tags: [always, postcheck]
|
||||||
@@ -0,0 +1,20 @@
|
|||||||
|
---
# Applies the cis-debian-ubuntu-hardening role to the `linux` inventory group
# and then prints the validation summary, if the role produced one.
# NOTE(review): the RHEL 9 playbook targets the same `linux` group; presumably
# the role's own checks reject an unsupported OS family - confirm before use.
- name: Apply selected baseline Debian and Ubuntu hardening controls
  hosts: linux
  gather_facts: true
  become: true

  roles:
    - role: cis-debian-ubuntu-hardening
      tags: [cis, hardening]

  post_tasks:
    # Tagged `always` so the summary is shown for any --tags selection.
    - name: Show validation summary
      when: cis_validation_summary is defined
      ansible.builtin.debug:
        var: cis_validation_summary
      tags: [always, postcheck]
|
||||||
@@ -0,0 +1,20 @@
|
|||||||
|
---
# Applies the cis-rhel9-hardening role to the `linux` inventory group and then
# prints the validation summary, if the role produced one.
# NOTE(review): the Debian/Ubuntu playbook targets the same `linux` group;
# presumably the role's own checks reject an unsupported OS family - confirm
# before use.
- name: Apply selected baseline RHEL 9 hardening controls
  hosts: linux
  gather_facts: true
  become: true

  roles:
    - role: cis-rhel9-hardening
      tags: [cis, hardening]

  post_tasks:
    # Tagged `always` so the summary is shown for any --tags selection.
    - name: Show validation summary
      when: cis_validation_summary is defined
      ansible.builtin.debug:
        var: cis_validation_summary
      tags: [always, postcheck]
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
|
||||||
@@ -0,0 +1,27 @@
|
|||||||
|
# infra-run/ansible/roles
|
||||||
|
|
||||||
|
This folder contains reusable Ansible roles. Roles organize configuration logic into predictable, testable units that can be shared across playbooks.
|
||||||
|
|
||||||
|
## Diagram
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart TD
|
||||||
|
A["roles"] --> B["common"]
|
||||||
|
A --> C["monitoring"]
|
||||||
|
A --> D["storage"]
|
||||||
|
A --> E["security"]
|
||||||
|
E --> E1["cis-rhel9-hardening"]
|
||||||
|
E --> E2["cis-debian-ubuntu-hardening"]
|
||||||
|
E --> E3["cis-aix7-hardening"]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Current Roles
|
||||||
|
|
||||||
|
- `cis-rhel9-hardening` - RHEL 9 baseline example with package, service, SSH, sudo, sysctl, audit, logging, filesystem, and validation tasks.
|
||||||
|
- `cis-debian-ubuntu-hardening` - Debian 13 and Ubuntu 26.04 baseline example with apt, service, SSH, sudo, sysctl, audit, logging, filesystem, and validation tasks.
|
||||||
|
- `cis-aix7-hardening` - IBM AIX 7 baseline example with SSH, sudo, audit, logging, cron, user, password, network, filesystem, service, and validation tasks.
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Each role includes defaults, task includes, handlers where needed, and role-specific README guidance.
|
||||||
|
- The hardening content is sanitized for portfolio use and should be reviewed against site policy before live use.
|
||||||
@@ -0,0 +1,67 @@
|
|||||||
|
# cis-aix7-hardening
|
||||||
|
|
||||||
|
Operational IBM AIX 7.x hardening role inspired by CIS Benchmark 1.2.0 and common Unix security practices.
|
||||||
|
|
||||||
|
Reference: https://www.cisecurity.org/benchmark/aix
|
||||||
|
|
||||||
|
This role is intended for infrastructure and security operations teams that manage AIX estates. It favors readable, conservative controls over broad benchmark coverage.
|
||||||
|
|
||||||
|
## Supported OS
|
||||||
|
|
||||||
|
- IBM AIX 7.x
|
||||||
|
|
||||||
|
## Implemented Areas
|
||||||
|
|
||||||
|
- Platform prechecks for AIX 7.x, SRC, SSH, audit tooling, required commands, disk safety, and baseline security state.
|
||||||
|
- SSH daemon hardening in `/etc/ssh/sshd_config` with validation through `sshd -t`.
|
||||||
|
- Account and password controls through AIX-native `lssec`, `chsec`, and `pwdadm`.
|
||||||
|
- Network tunable validation and optional hardening through `no`, with optional `nfso` support.
|
||||||
|
- SRC-aware service checks and safe inetd legacy service disablement.
|
||||||
|
- Filesystem review for JFS2, world-writable directories, and invalid owners or groups.
|
||||||
|
- Syslog and audit validation, with audit enablement disabled by default.
|
||||||
|
- Cron and at permission hardening under `/var/adm/cron`.
|
||||||
|
- Sudo defaults with validation through `visudo -cf` when sudo is present.
|
||||||
|
- Postcheck reporting for SSH, services, network values, and password policy.
|
||||||
|
|
||||||
|
## AIX Operational Notes
|
||||||
|
|
||||||
|
AIX is not Linux. This role does not assume systemd, sysctl, Linux package managers, or Linux service paths. Service operations use SRC commands such as `lssrc`, `startsrc`, `stopsrc`, and `refresh`.
|
||||||
|
|
||||||
|
AIX configurations vary heavily between environments. Filesystem layout, OpenSSH source, sudo packaging, audit classes, NFS tuning, and security policy ownership should be validated before managed rollout.
|
||||||
|
|
||||||
|
## Safety Philosophy
|
||||||
|
|
||||||
|
- Defaults are conservative.
|
||||||
|
- Audit enablement is opt-in with `cis_enable_audit`.
|
||||||
|
- Filesystem mount option management is opt-in with `cis_manage_mount_options`.
|
||||||
|
- SSH password authentication is not disabled by default.
|
||||||
|
- Native AIX security files are updated with targeted `chsec` calls instead of wholesale replacement.
|
||||||
|
- Check mode is supported where practical, though AIX command modules may still need read-only probes for validation.
|
||||||
|
|
||||||
|
## Check Mode Examples
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ansible-playbook playbooks/cis-aix7-hardening.yml --check
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ansible-playbook playbooks/cis-aix7-hardening.yml --check --tags precheck,ssh,postcheck
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tag Examples
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ansible-playbook playbooks/cis-aix7-hardening.yml --tags precheck
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ansible-playbook playbooks/cis-aix7-hardening.yml --tags ssh,password_policy,network
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ansible-playbook playbooks/cis-aix7-hardening.yml --tags audit -e cis_enable_audit=true
|
||||||
|
```
|
||||||
|
|
||||||
|
## Important Warning
|
||||||
|
|
||||||
|
This is not a full compliance certification implementation and does not implement the entire CIS AIX benchmark. It is a practical baseline example that should be reviewed by infrastructure, security, and application owners before managed enforcement.
|
||||||
@@ -0,0 +1,98 @@
|
|||||||
|
---
# Default variables for the AIX 7 hardening role. Every value can be
# overridden from inventory or with --extra-vars.

cis_benchmark_version: "1.2.0"

# Feature toggles. Audit enablement and mount option management are opt-in.
cis_disable_root_login: true
cis_disable_password_auth: false
cis_enable_network_hardening: true
cis_enable_password_policy: true
cis_enable_audit: false
cis_manage_mount_options: false

# SSH daemon settings and the command used to validate the configuration.
cis_ssh_max_auth_tries: 4
cis_ssh_login_grace_time: 60
cis_ssh_client_alive_interval: 300
cis_ssh_client_alive_count_max: 3
cis_ssh_config_path: /etc/ssh/sshd_config
cis_sshd_test_command: sshd -t

# Minimum free space threshold in MB (presumably for the root filesystem
# precheck - the consuming task is outside this file).
cis_min_root_free_mb: 1024

# Password and login policy values.
cis_password_minlen: 14
cis_password_histsize: 10
cis_password_maxage_weeks: 12
cis_password_minalpha: 1
cis_password_minother: 1
cis_password_maxrepeats: 2
cis_password_minage_weeks: 1
cis_login_retries: 5
cis_login_lockout: 30

# Commands the role expects on the target. "no" is quoted so YAML keeps it
# as the string name of the AIX network tunables command, not a boolean.
cis_required_commands:
  - lsattr
  - chdev
  - lssrc
  - chsec
  - lssec
  - pwdadm
  - "no"
  - audit
  - cron

# Candidate locations for the sshd binary.
cis_ssh_candidate_paths:
  - /usr/sbin/sshd
  - /usr/bin/sshd
  - /opt/freeware/sbin/sshd
  - /opt/freeware/bin/sshd

# Desired `no` tunables, kept as strings.
cis_network_no_settings:
  ipforwarding: "0"
  ipsendredirects: "0"
  ipignoreredirects: "1"
  ipsrcrouteforward: "0"
  clean_partial_conns: "1"
  tcp_pmtu_discover: "0"

# Optional `nfso` tunables; empty by default.
cis_network_nfso_settings: {}

# Legacy inetd service names.
cis_legacy_inetd_services:
  - telnet
  - shell
  - login
  - exec
  - comsat
  - talk
  - ntalk
  - tftp
  - uucp
  - finger

# SRC subsystem names.
cis_src_subsystems:
  - sshd
  - inetd
  - syslogd
  - audit

# Mount option targets; only applied when cis_manage_mount_options is true.
cis_mount_option_targets:
  - path: /tmp
    options: [nosuid]
  - path: /var/tmp
    options: [nosuid]

# Sudo settings.
cis_manage_sudo: true
cis_sudoers_path: /etc/sudoers
cis_sudo_logfile: /var/log/sudo.log
cis_sudo_use_pty: true

# Cron and at control files and directories.
cis_cron_allow_path: /var/adm/cron/cron.allow
cis_cron_deny_path: /var/adm/cron/cron.deny
cis_at_allow_path: /var/adm/cron/at.allow
cis_at_deny_path: /var/adm/cron/at.deny
cis_cron_directories:
  - /var/adm/cron
  - /var/spool/cron
  - /var/spool/cron/crontabs

# Logging and audit configuration file locations.
cis_syslog_config_path: /etc/syslog.conf
cis_audit_config_path: /etc/security/audit/config
|
||||||
@@ -0,0 +1,44 @@
|
|||||||
|
---
# Handlers for the AIX 7 hardening role. Handlers run in the order they are
# defined in this file, so when both sshd topics are notified the
# configuration is validated before sshd is restarted.

- name: Validate sshd configuration
  listen: validate sshd
  ansible.builtin.command: "{{ cis_sshd_test_command }}"
  changed_when: false

# Restarts sshd only when `lssrc -s sshd` succeeds; otherwise the script does
# nothing. A failed stop is tolerated so the start is still attempted.
- name: Restart sshd using SRC
  listen: restart sshd
  ansible.builtin.shell: |
    set -o pipefail
    if lssrc -s sshd >/dev/null 2>&1; then
      stopsrc -s sshd >/dev/null 2>&1 || true
      startsrc -s sshd
    fi
  args:
    executable: /bin/ksh
  changed_when: true

# Best effort: a failed refresh is deliberately ignored.
- name: Refresh inetd
  listen: refresh inetd
  ansible.builtin.command: refresh -s inetd
  changed_when: true
  failed_when: false

# Best effort: a failed refresh is deliberately ignored.
- name: Refresh syslog
  listen: refresh syslog
  ansible.builtin.command: refresh -s syslogd
  changed_when: true
  failed_when: false

# Both branches stop before they start, so this is a real restart. The
# fallback branch previously ran `audit start` alone, which was not a restart
# and repeated the start already issued by the task that notifies this
# handler. A failed stop or shutdown is tolerated so the start is still
# attempted. Runs only when audit enablement is switched on.
- name: Restart audit subsystem
  listen: restart audit
  ansible.builtin.shell: |
    set -o pipefail
    if lssrc -s audit >/dev/null 2>&1; then
      stopsrc -s audit >/dev/null 2>&1 || true
      startsrc -s audit
    else
      audit shutdown >/dev/null 2>&1 || true
      audit start
    fi
  args:
    executable: /bin/ksh
  changed_when: true
  when: cis_enable_audit | bool
|
||||||
@@ -0,0 +1,32 @@
|
|||||||
|
---
# Audit validation for AIX. Read-only by default; the audit subsystem is only
# started when cis_enable_audit is true.

- name: Validate AIX audit configuration file
  ansible.builtin.stat:
    path: "{{ cis_audit_config_path }}"
  register: cis_aix_audit_config

# Read-only probe, so it also runs in check mode. A failing `audit query` is
# tolerated here and surfaces in the report below.
- name: Collect AIX audit query status
  ansible.builtin.command: audit query
  changed_when: false
  failed_when: false
  check_mode: false
  register: cis_aix_audit_status

# Starts auditing only when it is enabled by variable, the configuration file
# exists, and the query failed or reported auditing as off. No restart handler
# is notified: this task has just started the subsystem, so an immediate
# restart would only repeat the start.
- name: Enable AIX audit subsystem when explicitly configured
  ansible.builtin.command: audit start
  changed_when: true
  when:
    - cis_enable_audit | bool
    - cis_aix_audit_config.stat.exists
    - cis_aix_audit_status.rc != 0 or 'auditing off' in (cis_aix_audit_status.stdout | default('') | lower)

# The query line is labelled by its return code so that a failed query is not
# reported as OK.
- name: Report audit status
  ansible.builtin.debug:
    msg:
      - >-
        {{ 'OK: AIX audit configuration file exists.'
        if cis_aix_audit_config.stat.exists else 'WARNING: AIX audit configuration file was not found.' }}
      - >-
        {{ 'OK: Audit enablement is explicitly allowed by cis_enable_audit.'
        if cis_enable_audit | bool else 'WARNING: Audit enablement is disabled by default; validation only was performed.' }}
      - >-
        {{ 'OK' if cis_aix_audit_status.rc == 0 else 'WARNING' }}: audit query
        rc={{ cis_aix_audit_status.rc }} output={{ cis_aix_audit_status.stdout | default('') }}
|
||||||
@@ -0,0 +1,49 @@
|
|||||||
|
---
# Cron and at permission hardening for AIX.

# Creates the allow files when missing; timestamps are preserved so an
# existing file is not reported as changed on every run.
# NOTE(review): an empty allow file normally restricts scheduling to the users
# listed in it - confirm root and required service accounts are listed before
# rollout.
- name: Ensure cron and at control files exist with safe ownership
  ansible.builtin.file:
    path: "{{ item }}"
    state: touch
    owner: root
    group: cron
    mode: "0600"
    modification_time: preserve
    access_time: preserve
  loop:
    - "{{ cis_cron_allow_path }}"
    - "{{ cis_at_allow_path }}"

# Deny files are optional. They are inspected first so that a missing file is
# skipped explicitly instead of hiding every error with failed_when: false.
- name: Inspect optional cron and at deny files
  ansible.builtin.stat:
    path: "{{ item }}"
    follow: true
  loop:
    - "{{ cis_cron_deny_path }}"
    - "{{ cis_at_deny_path }}"
  register: cis_aix_cron_deny_stat

- name: Ensure deny files are not world readable when present
  ansible.builtin.file:
    path: "{{ item.item }}"
    state: file
    owner: root
    group: cron
    mode: "0600"
  loop: "{{ cis_aix_cron_deny_stat.results }}"
  loop_control:
    label: "{{ item.item }}"
  when:
    - item.stat.exists
    - item.stat.isreg | default(false)

# Directories are inspected first because `state: directory` would otherwise
# create any missing path, contradicting "when present".
- name: Inspect cron directories
  ansible.builtin.stat:
    path: "{{ item }}"
    follow: true
  loop: "{{ cis_cron_directories }}"
  register: cis_aix_cron_dir_stat

- name: Secure cron directories when present
  ansible.builtin.file:
    path: "{{ item.item }}"
    state: directory
    owner: root
    group: cron
    mode: "0750"
  loop: "{{ cis_aix_cron_dir_stat.results }}"
  loop_control:
    label: "{{ item.item }}"
  when:
    - item.stat.exists
    - item.stat.isdir | default(false)

# Read-only probe, so it also runs in check mode.
- name: Validate cron SRC state
  ansible.builtin.command: lssrc -s cron
  changed_when: false
  failed_when: false
  check_mode: false
  register: cis_aix_cron_state

- name: Report cron and at hardening status
  ansible.builtin.debug:
    msg:
      - "OK: cron.allow and at.allow ownership and permissions are managed."
      - >-
        {{ 'OK: cron SRC subsystem exists.'
        if cis_aix_cron_state.rc == 0 else 'WARNING: cron SRC subsystem was not found.' }}
|
||||||
@@ -0,0 +1,60 @@
|
|||||||
|
---
# Filesystem review for AIX. Everything is read-only unless
# cis_manage_mount_options is enabled.

# Falls back to an empty list when the gathered facts carry no mount data, so
# later `in` tests do not fail on an undefined variable.
- name: Build mounted filesystem list from gathered facts
  ansible.builtin.set_fact:
    cis_aix_mount_points: "{{ ansible_mounts | default([]) | map(attribute='mount') | list }}"

# Prints the line that precedes each "vfs = jfs2" line of `lsfs -q` output.
- name: Validate JFS2 filesystems
  ansible.builtin.shell: |
    set -o pipefail
    lsfs -q | awk '/vfs[[:space:]]*=[[:space:]]*jfs2/{print prev} {prev=$0}'
  args:
    executable: /bin/ksh
  changed_when: false
  failed_when: false
  check_mode: false
  register: cis_aix_jfs2_filesystems

- name: Review configured mount option targets
  ansible.builtin.debug:
    msg: >-
      OK: Mount option management is disabled by default.
      Review target {{ item.path }} for options {{ item.options | join(', ') }} before managed rollout.
  loop: "{{ cis_mount_option_targets }}"
  when: not cis_manage_mount_options | bool

# NOTE(review): `chfs -a options=` is given only the configured options and is
# always reported as changed - confirm it does not drop options a filesystem
# already has.
- name: Apply configured mount options only when explicitly enabled
  ansible.builtin.command: "chfs -a options={{ item.options | join(',') }} {{ item.path }}"
  changed_when: true
  loop: "{{ cis_mount_option_targets }}"
  when:
    - cis_manage_mount_options | bool
    - item.path in cis_aix_mount_points

# -xdev keeps the scan on the root filesystem; output is capped at 200 lines.
- name: Identify world-writable directories on local filesystems
  ansible.builtin.shell: |
    set -o pipefail
    find / -xdev -type d -perm -0002 -print 2>/dev/null | head -200
  args:
    executable: /bin/ksh
  changed_when: false
  failed_when: false
  check_mode: false
  register: cis_aix_world_writable_dirs

- name: Identify files without valid owner or group on local filesystems
  ansible.builtin.shell: |
    set -o pipefail
    find / -xdev \( -nouser -o -nogroup \) -print 2>/dev/null | head -200
  args:
    executable: /bin/ksh
  changed_when: false
  failed_when: false
  check_mode: false
  register: cis_aix_unowned_files

# A WARNING line is emitted only when the matching scan returned results;
# an empty scan is reported as OK instead of "WARNING ... found: []".
- name: Report filesystem review findings
  ansible.builtin.debug:
    msg:
      - "OK: JFS2 filesystem review completed."
      - >-
        {{ ('WARNING: World-writable directories found: ' ~ (cis_aix_world_writable_dirs.stdout_lines | default([])))
        if (cis_aix_world_writable_dirs.stdout_lines | default([]) | length > 0)
        else 'OK: No world-writable directories were found.' }}
      - >-
        {{ ('WARNING: Files without valid owner/group found: ' ~ (cis_aix_unowned_files.stdout_lines | default([])))
        if (cis_aix_unowned_files.stdout_lines | default([]) | length > 0)
        else 'OK: No files without a valid owner or group were found.' }}
|
||||||
@@ -0,0 +1,40 @@
|
|||||||
|
---
# Syslog validation for AIX.

# Read-only probe, so it also runs in check mode.
- name: Collect syslog SRC state
  ansible.builtin.command: lssrc -s syslogd
  changed_when: false
  failed_when: false
  check_mode: false
  register: cis_aix_syslog_state

- name: Ensure syslog configuration exists
  ansible.builtin.stat:
    path: "{{ cis_syslog_config_path }}"
  register: cis_aix_syslog_config

# Starts syslogd only when `lssrc` succeeded and its output does not contain
# "active".
- name: Start syslogd when installed but inactive
  ansible.builtin.command: startsrc -s syslogd
  changed_when: true
  when:
    - cis_aix_syslog_state.rc == 0
    - "'active' not in cis_aix_syslog_state.stdout"

# awk exits 0 when at least one non-blank line does not start with "#".
# Uses `command` with argv instead of a shell string: no shell features are
# needed, and the path is passed as one argument even if it contains spaces.
- name: Validate syslog configuration has active entries
  ansible.builtin.command:
    argv:
      - awk
      - 'NF && $1 !~ /^#/ {found=1} END {exit found ? 0 : 1}'
      - "{{ cis_syslog_config_path }}"
  changed_when: false
  failed_when: false
  check_mode: false
  register: cis_aix_syslog_has_rules
  when: cis_aix_syslog_config.stat.exists

# When the probe was skipped there is no rc, so default(1) reports the
# configuration as not validated.
- name: Report logging status
  ansible.builtin.debug:
    msg:
      - >-
        {{ 'OK: syslogd SRC subsystem exists.'
        if cis_aix_syslog_state.rc == 0 else 'WARNING: syslogd SRC subsystem was not found.' }}
      - >-
        {{ 'OK: syslog configuration has active rules.'
        if cis_aix_syslog_has_rules.rc | default(1) == 0
        else 'WARNING: syslog configuration has no active rules or could not be validated.' }}
|
||||||
@@ -0,0 +1,65 @@
|
|||||||
|
---
# Entry point for the AIX 7 hardening role. Each stage is a static import, so
# the tags and `when` conditions set here apply to every imported task.
# Prechecks and postchecks carry the `always` tag and therefore run for any
# --tags selection.

- name: Run AIX platform safety prechecks
  tags: [always, precheck]
  ansible.builtin.import_tasks: precheck.yml

- name: Harden AIX SSH daemon configuration
  tags: [ssh]
  ansible.builtin.import_tasks: ssh.yml

- name: Apply AIX user account controls
  tags: [users]
  ansible.builtin.import_tasks: users.yml

- name: Apply AIX password policy controls
  tags: [password_policy]
  when: cis_enable_password_policy | bool
  ansible.builtin.import_tasks: password_policy.yml

- name: Apply AIX network hardening controls
  tags: [network]
  when: cis_enable_network_hardening | bool
  ansible.builtin.import_tasks: network.yml

- name: Manage AIX baseline services
  tags: [services]
  ansible.builtin.import_tasks: services.yml

- name: Review AIX filesystem controls
  tags: [filesystem]
  ansible.builtin.import_tasks: filesystem.yml

- name: Validate AIX logging controls
  tags: [logging]
  ansible.builtin.import_tasks: logging.yml

- name: Validate AIX audit controls
  tags: [audit]
  ansible.builtin.import_tasks: audit.yml

- name: Harden AIX cron and at controls
  tags: [cron]
  ansible.builtin.import_tasks: cron.yml

- name: Harden sudo configuration
  tags: [sudo]
  when: cis_manage_sudo | bool
  ansible.builtin.import_tasks: sudo.yml

- name: Run AIX validation postchecks
  tags: [always, postcheck]
  ansible.builtin.import_tasks: postcheck.yml
|
||||||
@@ -0,0 +1,65 @@
|
|||||||
|
---
|
||||||
|
# Read-only snapshot of all `no` tunables; executes in check mode as well
# and never marks the host changed or failed.
- name: Collect current AIX network tunables
  ansible.builtin.command:
    cmd: no -a
  register: cis_aix_no_current
  check_mode: false
  changed_when: false
  failed_when: false
|
||||||
|
|
||||||
|
# Ask `no` for the value of every key in cis_network_no_settings.
# A non-zero rc is tolerated here and interpreted by the following tasks.
- name: Query configured AIX network tunables
  ansible.builtin.command:
    cmd: "no -o {{ item.key }}"
  loop: "{{ cis_network_no_settings | dict2items }}"
  register: cis_aix_no_query
  check_mode: false
  changed_when: false
  failed_when: false
|
||||||
|
|
||||||
|
# Set a tunable with `no -p -o` only when its query succeeded and the
# reported value does not already equal the desired one.
- name: Apply configured AIX network tunables
  ansible.builtin.command:
    cmd: "no -p -o {{ item.item.key }}={{ item.item.value }}"
  loop: "{{ cis_aix_no_query.results }}"
  when: >-
    item.rc == 0
    and item.stdout is not search('=\\s*' ~ (item.item.value | string) ~ '\\b')
  changed_when: true
|
||||||
|
|
||||||
|
# Surface every tunable whose `no -o` query returned a non-zero rc.
- name: Warn about unsupported AIX network tunables
  loop: "{{ cis_aix_no_query.results }}"
  when: item.rc != 0
  ansible.builtin.debug:
    msg: "WARNING: AIX network tunable {{ item.item.key }} is not supported on this host."
|
||||||
|
|
||||||
|
# Probe for the nfso command under ksh; rc 0 means it was found.
# The result only gates later tasks, so the probe itself never fails.
- name: Check nfso availability
  ansible.builtin.shell:
    cmd: "command -v nfso >/dev/null 2>&1 || whence nfso >/dev/null 2>&1"
    executable: /bin/ksh
  register: cis_aix_nfso_available
  check_mode: false
  changed_when: false
  failed_when: false
|
||||||
|
|
||||||
|
# Query each key of cis_network_nfso_settings, but only when nfso was
# found and at least one setting is configured.
- name: Query configured AIX NFS tunables
  ansible.builtin.command:
    cmd: "nfso -o {{ item.key }}"
  loop: "{{ cis_network_nfso_settings | dict2items }}"
  when: >-
    cis_aix_nfso_available.rc == 0
    and cis_network_nfso_settings | length > 0
  register: cis_aix_nfso_query
  check_mode: false
  changed_when: false
  failed_when: false
|
||||||
|
|
||||||
|
# Persistently apply each desired nfso tunable whose queried value differs.
# When nfso is missing, the query task is skipped per item and its results
# carry `skipped: true` but no rc/stdout. Those entries are filtered first,
# and both lookups use defaults, so the condition cannot raise an
# undefined-attribute error.
- name: Apply configured AIX NFS tunables
  ansible.builtin.command: "nfso -p -o {{ item.item.key }}={{ item.item.value }}"
  changed_when: true
  loop: "{{ cis_aix_nfso_query.results | default([]) }}"
  when:
    - item is not skipped
    - item.rc | default(1) == 0
    - item.stdout | default('') is not search('=\\s*' ~ (item.item.value | string) ~ '\\b')
|
||||||
|
|
||||||
|
# Two-line summary; the second line depends on the nfso probe's rc.
- name: Report network hardening status
  ansible.builtin.debug:
    msg:
      - "OK: AIX network tunables were validated before changes."
      - "{{ 'OK: nfso is available for optional NFS network tunables.' if cis_aix_nfso_available.rc == 0 else 'WARNING: nfso was not found; NFS tunables were skipped.' }}"
|
||||||
@@ -0,0 +1,66 @@
|
|||||||
|
---
|
||||||
|
# Read the `default` stanza of /etc/security/user before any change.
# Runs in check mode too; failures are tolerated and handled downstream.
- name: Collect current default password policy
  ansible.builtin.command:
    cmd: >-
      lssec -f /etc/security/user -s default
      -a minlen histsize maxage minage minalpha minother maxrepeats loginretries
  register: cis_aix_password_policy_current
  check_mode: false
  changed_when: false
  failed_when: false
|
||||||
|
|
||||||
|
# Read the `usw` stanza of /etc/security/login.cfg before any change.
- name: Collect current default login policy
  ansible.builtin.command:
    cmd: >-
      lssec -f /etc/security/login.cfg -s usw
      -a logindisable logininterval loginreenable
  register: cis_aix_login_policy_current
  check_mode: false
  changed_when: false
  failed_when: false
|
||||||
|
|
||||||
|
# Set each password attribute on the `default` stanza with chsec, but only
# when the collected lssec output does not already contain that exact
# key=value token.
# The comparison is anchored on whitespace or string boundaries: a plain
# substring test would treat `maxage=1` as already present when the
# current value is `maxage=13` and silently skip the change.
- name: Manage default password security attributes
  ansible.builtin.command: "chsec -f /etc/security/user -s default -a {{ item.key }}={{ item.value }}"
  changed_when: true
  loop:
    - key: minlen
      value: "{{ cis_password_minlen }}"
    - key: histsize
      value: "{{ cis_password_histsize }}"
    - key: maxage
      value: "{{ cis_password_maxage_weeks }}"
    - key: minage
      value: "{{ cis_password_minage_weeks }}"
    - key: minalpha
      value: "{{ cis_password_minalpha }}"
    - key: minother
      value: "{{ cis_password_minother }}"
    - key: maxrepeats
      value: "{{ cis_password_maxrepeats }}"
    - key: loginretries
      value: "{{ cis_login_retries }}"
  # An empty or missing stdout (lssec failed) matches nothing, so every
  # attribute is applied, as before.
  when: >-
    (cis_aix_password_policy_current.stdout | default(''))
    is not search('(^|\\s)' ~ ((item.key ~ '=' ~ (item.value | string)) | regex_escape) ~ '(\\s|$)')
|
||||||
|
|
||||||
|
# Set loginreenable on the `usw` stanza unless the collected lssec output
# already holds exactly that value.
# The match is anchored on token boundaries so that a desired
# `loginreenable=3` is not mistaken for an existing `loginreenable=30`.
- name: Manage login lockout interval
  ansible.builtin.command: "chsec -f /etc/security/login.cfg -s usw -a loginreenable={{ cis_login_lockout }}"
  changed_when: true
  when: >-
    (cis_aix_login_policy_current.stdout | default(''))
    is not search('(^|\\s)' ~ (('loginreenable=' ~ (cis_login_lockout | string)) | regex_escape) ~ '(\\s|$)')
|
||||||
|
|
||||||
|
# Re-read the `default` stanza after the chsec tasks for the final report.
- name: Collect updated default password policy
  ansible.builtin.command:
    cmd: >-
      lssec -f /etc/security/user -s default
      -a minlen histsize maxage minage minalpha minother maxrepeats loginretries
  register: cis_aix_password_policy_updated
  check_mode: false
  changed_when: false
  failed_when: false
|
||||||
|
|
||||||
|
# Query root's password status with pwdadm; the output is only reported.
- name: Validate password database state
  ansible.builtin.command:
    cmd: pwdadm -q root
  register: cis_aix_pwdadm_root
  check_mode: false
  changed_when: false
  failed_when: false
|
||||||
|
|
||||||
|
# Print the post-change lssec output and the pwdadm status for root,
# substituting 'unavailable' when a command produced no stdout attribute.
- name: Report password policy status
  ansible.builtin.debug:
    msg:
      - "OK: Password policy managed through AIX chsec defaults, without replacing security files."
      - >-
        OK: Current default policy:
        {{ cis_aix_password_policy_updated.stdout | default('unavailable') }}
      - >-
        OK: pwdadm root status:
        {{ cis_aix_pwdadm_root.stdout | default('unavailable') }}
|
||||||
@@ -0,0 +1,58 @@
|
|||||||
|
---
|
||||||
|
# Run the configured sshd test command; its rc feeds the summary below
# instead of failing the play directly.
- name: Validate sshd configuration after hardening
  ansible.builtin.command:
    cmd: "{{ cis_sshd_test_command }}"
  register: cis_aix_post_sshd
  check_mode: false
  changed_when: false
  failed_when: false
|
||||||
|
|
||||||
|
# Read back every managed `no` tunable for the validation summary.
- name: Show selected AIX network security values
  ansible.builtin.command:
    cmd: "no -o {{ item.key }}"
  loop: "{{ cis_network_no_settings | dict2items }}"
  register: cis_aix_post_network
  check_mode: false
  changed_when: false
  failed_when: false
|
||||||
|
|
||||||
|
# Capture `lssrc -s` output for the three subsystems listed below.
- name: Show key SRC service states
  ansible.builtin.command:
    cmd: "lssrc -s {{ item }}"
  loop: [sshd, syslogd, audit]
  register: cis_aix_post_services
  check_mode: false
  changed_when: false
  failed_when: false
|
||||||
|
|
||||||
|
# Read back the managed password attributes of the `default` stanza.
# maxrepeats is included so the summary covers the same attribute set
# that the password policy tasks collect and manage.
- name: Show password policy summary
  ansible.builtin.command: lssec -f /etc/security/user -s default -a minlen histsize maxage minage minalpha minother maxrepeats loginretries
  changed_when: false
  failed_when: false
  check_mode: false
  register: cis_aix_post_password
|
||||||
|
|
||||||
|
# Assemble one fact from the postcheck results for reporting.
- name: Build AIX hardening validation summary
  ansible.builtin.set_fact:
    cis_aix_validation_summary:
      oslevel: "{{ cis_aix_oslevel.stdout | default('unavailable') }}"
      sshd_config_valid: "{{ cis_aix_post_sshd.rc == 0 }}"
      # A finished command always defines stderr, often as an empty string.
      # default(..., true) also falls back on empty values, so stdout is
      # used whenever stderr carries no text.
      sshd_validation_output: "{{ cis_aix_post_sshd.stderr | default(cis_aix_post_sshd.stdout | default(''), true) }}"
      network_values: "{{ cis_aix_post_network.results | map(attribute='stdout') | list }}"
      service_states: "{{ cis_aix_post_services.results | map(attribute='stdout') | list }}"
      password_policy: "{{ cis_aix_post_password.stdout | default('unavailable') }}"
      recommendations:
        - "Validate SSH access from a second privileged session before enforcing passwordless-only access."
        - "Review audit classes and events with security operations before setting cis_enable_audit=true."
        - "Keep cis_manage_mount_options=false until filesystem owners approve remount or chfs behavior."
|
||||||
|
|
||||||
|
# Final operator-facing report built from cis_aix_validation_summary and
# the sshd validation rc. The last entry is the recommendations list itself.
- name: Print AIX operational postcheck recommendations
  ansible.builtin.debug:
    msg:
      - "{{ 'OK: sshd configuration validates.' if cis_aix_post_sshd.rc == 0 else 'CRITICAL: sshd validation failed; review SSH config before restarting sessions.' }}"
      - >-
        OK: Service states:
        {{ cis_aix_validation_summary.service_states }}
      - >-
        OK: Password policy summary:
        {{ cis_aix_validation_summary.password_policy }}
      - "WARNING: This role is selected baseline and does not represent a complete compliance certification implementation."
      - "{{ cis_aix_validation_summary.recommendations }}"
|
||||||
@@ -0,0 +1,147 @@
|
|||||||
|
---
|
||||||
|
# Pick the ansible_mounts entry whose mount point is '/', or an empty
# mapping when none is reported.
- name: Determine root filesystem free space
  ansible.builtin.set_fact:
    cis_aix_root_mount: >-
      {{ ansible_mounts | selectattr('mount', 'equalto', '/') | list | first | default({}) }}
|
||||||
|
|
||||||
|
# Divide size_available by 1024 twice and floor to a whole number;
# a missing size_available counts as 0.
- name: Calculate root filesystem free space in MB
  ansible.builtin.set_fact:
    cis_aix_root_free_mb: >-
      {{ ((cis_aix_root_mount.size_available | default(0) | int) / 1024 / 1024) | round(0, 'floor') | int }}
|
||||||
|
|
||||||
|
# Record `oslevel -s` output for the precheck and postcheck reports.
- name: Collect AIX maintenance level
  ansible.builtin.command:
    cmd: oslevel -s
  register: cis_aix_oslevel
  check_mode: false
  changed_when: false
  failed_when: false
|
||||||
|
|
||||||
|
# Probe each entry of cis_required_commands under ksh; rc 0 means found.
# Names are shell-quoted before interpolation.
- name: Check required AIX commands
  ansible.builtin.shell:
    cmd: "command -v {{ item | quote }} >/dev/null 2>&1 || whence {{ item | quote }} >/dev/null 2>&1"
    executable: /bin/ksh
  loop: "{{ cis_required_commands }}"
  register: cis_aix_required_command_checks
  check_mode: false
  changed_when: false
  failed_when: false
|
||||||
|
|
||||||
|
# Keep the names of all probed commands whose lookup returned non-zero.
- name: Build missing required command list
  ansible.builtin.set_fact:
    cis_aix_missing_required_commands: "{{ cis_aix_required_command_checks.results | selectattr('rc', 'ne', 0) | map(attribute='item') | list }}"
|
||||||
|
|
||||||
|
# Stat every candidate path; the next task picks the first that exists.
- name: Locate sshd binary
  loop: "{{ cis_ssh_candidate_paths }}"
  register: cis_aix_sshd_path_checks
  ansible.builtin.stat:
    path: "{{ item }}"
|
||||||
|
|
||||||
|
# First candidate path whose stat reports it exists; empty string if none.
- name: Store detected sshd binary
  ansible.builtin.set_fact:
    cis_aix_sshd_path: >-
      {{
        (cis_aix_sshd_path_checks.results | selectattr('stat.exists') | map(attribute='item') | list | first)
        | default('')
      }}
|
||||||
|
|
||||||
|
# List all SRC subsystems; the rc and stdout are consumed by later tasks.
- name: Validate SRC subsystem availability
  ansible.builtin.command:
    cmd: lssrc -a
  register: cis_aix_src_summary
  check_mode: false
  changed_when: false
  failed_when: false
|
||||||
|
|
||||||
|
# Run `audit query`; a non-zero rc is reported as a warning, not a failure.
- name: Validate audit subsystem availability
  ansible.builtin.command:
    cmd: audit query
  register: cis_aix_audit_query
  check_mode: false
  changed_when: false
  failed_when: false
|
||||||
|
|
||||||
|
# Run `lparstat -i` if the command exists; the trailing `|| true` keeps
# the shell rc at 0 either way.
- name: Collect LPAR summary when available
  ansible.builtin.shell:
    cmd: "command -v lparstat >/dev/null 2>&1 && lparstat -i || true"
    executable: /bin/ksh
  register: cis_aix_lparstat
  check_mode: false
  changed_when: false
  failed_when: false
|
||||||
|
|
||||||
|
# Read-only dump of all `no` tunables captured during the precheck.
- name: Collect current network tunable summary
  ansible.builtin.command:
    cmd: no -a
  register: cis_aix_network_summary
  check_mode: false
  changed_when: false
  failed_when: false
|
||||||
|
|
||||||
|
# Capture every attribute of the `default` stanza in /etc/security/user.
- name: Collect default AIX user security summary
  ansible.builtin.command:
    cmd: lssec -f /etc/security/user -s default -a ALL
  register: cis_aix_security_user_summary
  check_mode: false
  changed_when: false
  failed_when: false
|
||||||
|
|
||||||
|
# Human-readable precheck report. This task only prints; the assert tasks
# that follow enforce the critical conditions.
- name: Report AIX precheck status
  ansible.builtin.debug:
    msg:
      - >-
        OK: Facts gathered for
        {{ ansible_distribution | default(ansible_system | default('unknown')) }}
        {{ ansible_distribution_version | default(ansible_kernel | default('unknown')) }}.
      - "OK: oslevel -s reports {{ cis_aix_oslevel.stdout | default('unavailable') }}."
      - "OK: Root filesystem free space is {{ cis_aix_root_free_mb }} MB."
      - >-
        {{
          'OK: sshd binary detected at ' ~ cis_aix_sshd_path
          if cis_aix_sshd_path | length > 0
          else 'CRITICAL: sshd binary was not found in expected AIX paths.'
        }}
      - >-
        {{
          'OK: SRC subsystem commands are functional.'
          if cis_aix_src_summary.rc == 0
          else 'CRITICAL: lssrc failed; SRC is unavailable or not usable.'
        }}
      - >-
        {{
          'OK: AIX audit subsystem responded to audit query.'
          if cis_aix_audit_query.rc == 0
          else 'WARNING: audit query did not complete; audit may be disabled or unconfigured.'
        }}
      - >-
        {{
          'OK: Required commands are present.'
          if cis_aix_missing_required_commands | length == 0
          else 'CRITICAL: Missing required commands: ' ~ (cis_aix_missing_required_commands | join(', '))
        }}
|
||||||
|
|
||||||
|
# Hard stop unless the gathered facts report AIX with a 7.x version.
- name: Fail when operating system is unsupported
  ansible.builtin.assert:
    success_msg: "OK: Supported IBM AIX 7.x platform detected."
    fail_msg: >-
      CRITICAL: This role supports IBM AIX 7.x only. Detected
      {{ ansible_distribution | default(ansible_system | default('unknown')) }}
      {{ ansible_distribution_version | default('unknown') }}.
    that:
      - ansible_system | default(ansible_distribution | default('')) == 'AIX'
      - ansible_distribution_version | default('') is match('^7\\.')
|
||||||
|
|
||||||
|
# Hard stop when the computed free space on / is under cis_min_root_free_mb.
- name: Fail when root filesystem free space is below safety threshold
  ansible.builtin.assert:
    success_msg: "OK: Root filesystem free space meets the safety threshold."
    fail_msg: >-
      CRITICAL: Root filesystem has {{ cis_aix_root_free_mb }} MB free. Minimum
      required free space is {{ cis_min_root_free_mb }} MB.
    that:
      - cis_aix_root_free_mb | int >= cis_min_root_free_mb | int
|
||||||
|
|
||||||
|
# Hard stop unless all required commands exist, lssrc succeeded and an
# sshd binary was located.
- name: Fail when critical AIX commands are missing
  ansible.builtin.assert:
    success_msg: "OK: Critical AIX hardening prerequisites are available."
    fail_msg: >-
      CRITICAL: Required AIX hardening prerequisites are missing. Missing
      commands={{ cis_aix_missing_required_commands | join(', ') | default('none', true) }},
      SRC rc={{ cis_aix_src_summary.rc }},
      sshd={{ cis_aix_sshd_path | default('not found', true) }}.
    that:
      - cis_aix_missing_required_commands | length == 0
      - cis_aix_src_summary.rc == 0
      - cis_aix_sshd_path | length > 0
|
||||||
@@ -0,0 +1,51 @@
|
|||||||
|
---
|
||||||
|
# Capture `lssrc -s` output for every entry in cis_src_subsystems.
- name: Collect SRC subsystem states
  ansible.builtin.command:
    cmd: "lssrc -s {{ item }}"
  loop: "{{ cis_src_subsystems }}"
  register: cis_aix_src_service_states
  check_mode: false
  changed_when: false
  failed_when: false
|
||||||
|
|
||||||
|
# Record whether /etc/inetd.conf is present; later tasks are gated on it.
- name: Validate inetd configuration exists
  register: cis_aix_inetd_config
  ansible.builtin.stat:
    path: /etc/inetd.conf
|
||||||
|
|
||||||
|
# Fetch the file content into a variable when the file exists.
- name: Read inetd configuration
  when: cis_aix_inetd_config.stat.exists
  register: cis_aix_inetd_conf_content
  ansible.builtin.slurp:
    src: /etc/inetd.conf
|
||||||
|
|
||||||
|
# Comment out every active /etc/inetd.conf entry for each legacy service.
# `replace` rewrites all matching lines. `lineinfile` only touches the last
# match, which leaves the other entries enabled when a service is listed
# more than once.
# The original entry text is kept after the marker so it can be restored by
# hand. Rewritten lines start with `#`, so reruns no longer match them.
- name: Disable insecure inetd services when present
  ansible.builtin.replace:
    path: /etc/inetd.conf
    # [ \t]+ (not \s+) keeps the match from running onto the next line.
    regexp: '^(?!#)({{ item }}[ \t]+.*)$'
    replace: '# disabled by cis-aix7-hardening: \1'
    backup: true
  loop: "{{ cis_legacy_inetd_services }}"
  when: cis_aix_inetd_config.stat.exists
  notify: refresh inetd
|
||||||
|
|
||||||
|
# Summarise whether inetd.conf was reviewed and which SRC subsystems
# were queried.
- name: Report inetd configuration status
  ansible.builtin.debug:
    msg:
      - "{{ 'OK: /etc/inetd.conf exists and legacy entries were reviewed.' if cis_aix_inetd_config.stat.exists else 'WARNING: /etc/inetd.conf was not found; inetd review skipped.' }}"
      - "OK: SRC states collected for {{ cis_src_subsystems | join(', ') }}."
|
||||||
|
|
||||||
|
# Stop routed/gated/named, but only when the `lssrc -a` output captured in
# cis_aix_src_summary has a line for that subsystem ending in `active`.
# Despite the task name, an inoperative subsystem is left alone.
# The previous condition looked for the word 'active' anywhere in the whole
# listing, so it fired whenever any subsystem at all was active.
# stopsrc errors remain tolerated (best effort) via failed_when: false.
- name: Stop inactive legacy SRC subsystems when present
  ansible.builtin.command: "stopsrc -s {{ item }}"
  changed_when: true
  failed_when: false
  loop:
    - routed
    - gated
    - named
  # Line-anchored match: optional indent, subsystem name, blank-separated
  # columns, then `active` as the final word. `inoperative` does not match.
  when: >-
    cis_aix_src_summary.stdout is defined
    and cis_aix_src_summary.stdout
    is search('^[ \\t]*' ~ item ~ '[ \\t].*\\bactive[ \\t]*$', multiline=true)
|
||||||
@@ -0,0 +1,42 @@
|
|||||||
|
---
|
||||||
|
# Stat the configured sshd_config path; the following assert checks it.
- name: Ensure sshd configuration exists
  register: cis_aix_sshd_config
  ansible.builtin.stat:
    path: "{{ cis_ssh_config_path }}"
|
||||||
|
|
||||||
|
# Refuse to continue SSH hardening when the config file does not exist.
- name: Fail when sshd configuration is missing
  ansible.builtin.assert:
    success_msg: "OK: {{ cis_ssh_config_path }} exists."
    fail_msg: "CRITICAL: {{ cis_ssh_config_path }} was not found; refusing to manage SSH hardening."
    that:
      - cis_aix_sshd_config.stat.exists
|
||||||
|
|
||||||
|
# Override cis_sshd_test_command with "<detected sshd> -t" when a binary
# path was detected; otherwise the existing value is left untouched.
- name: Set sshd validation command from detected binary
  when: cis_aix_sshd_path is defined and cis_aix_sshd_path | length > 0
  ansible.builtin.set_fact:
    cis_sshd_test_command: "{{ cis_aix_sshd_path }} -t"
|
||||||
|
|
||||||
|
# Maintain a marker-delimited block of sshd directives in the config file.
# The candidate file is checked with the sshd test command (`-f %s`) before
# it replaces the original, and a backup copy is kept.
- name: Apply managed AIX sshd hardening block
  notify:
    - validate sshd
    - restart sshd
  ansible.builtin.blockinfile:
    path: "{{ cis_ssh_config_path }}"
    backup: true
    validate: "{{ cis_sshd_test_command }} -f %s"
    owner: root
    group: system
    mode: "0600"
    marker: "# {mark} ANSIBLE MANAGED BLOCK cis-aix7-hardening"
    block: |
      PermitRootLogin {{ 'no' if cis_disable_root_login | bool else 'prohibit-password' }}
      PermitEmptyPasswords no
      PasswordAuthentication {{ 'no' if cis_disable_password_auth | bool else 'yes' }}
      MaxAuthTries {{ cis_ssh_max_auth_tries }}
      LoginGraceTime {{ cis_ssh_login_grace_time }}
      ClientAliveInterval {{ cis_ssh_client_alive_interval }}
      ClientAliveCountMax {{ cis_ssh_client_alive_count_max }}
|
||||||
|
|
||||||
|
- name: Validate effective sshd configuration
|
||||||
|
ansible.builtin.command: "{{ cis_sshd_test_command }}"
|
||||||
|
changed_when: false
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user