Rework portfolio around Linux operations, Zabbix monitoring, migration validation, and ELK/Grafana log observability. Add AAP-style LVM resize workflow, Zabbix server/proxy/agent automation assets, Linux/AIX monitoring templates, and updated validation CI.
This commit is contained in:
@@ -0,0 +1,20 @@
|
||||
# Convenience targets for the local ELK/Grafana demo stack.

# Compose front-end shared by every target that talks to Docker.
COMPOSE := docker compose

.PHONY: run test demo down

# Start all services in the background.
run:
	$(COMPOSE) up -d

# Validate the compose file, then confirm that the config files and Grafana
# directories it bind-mounts are present in the working tree.
test:
	$(COMPOSE) config --quiet
	test -f elasticsearch/config/elasticsearch.yml
	test -f logstash/config/logstash.yml
	test -f logstash/pipeline/logstash.conf
	test -f kibana/config/kibana.yml
	test -f filebeat/config/filebeat.yml
	test -d grafana/provisioning
	test -d grafana/dashboards

# Emit a short burst of application errors into the simulation log.
demo:
	bash ./scenarios/incident_simulation.sh app-errors 3

# Stop and remove the stack's containers.
down:
	$(COMPOSE) down
|
||||
@@ -0,0 +1,98 @@
|
||||
# Log Observability ELK/Grafana
|
||||
|
||||
## Problem
|
||||
|
||||
Operations teams need searchable logs and reviewable incident evidence in addition to simple OS checks. Zabbix is useful for host and service health signals; ELK/Grafana is better suited for log ingestion, error analysis, dashboards, and environment-level observability.
|
||||
|
||||
## CV Relevance
|
||||
|
||||
This project supports the monitoring and troubleshooting part of the CV by showing how incident logs can be collected, parsed, searched, and reviewed. It is separate from the Zabbix project: Zabbix handles simple checks, while this project focuses on logs and observability evidence.
|
||||
|
||||
## What This Project Demonstrates
|
||||
|
||||
- A local Docker Compose scaffold for Elasticsearch, Logstash, Kibana, Grafana, and Filebeat.
|
||||
- Minimal configs required for the stack to validate independently.
|
||||
- Sample logs and alert intent that can be reviewed without starting the full stack.
|
||||
- An incident simulation script for generating operational log evidence.
|
||||
|
||||
This is a local demo stack. The default credentials are for non-production use only.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
Application/System Logs -> Filebeat -> Logstash -> Elasticsearch -> Kibana
|
||||
|
|
||||
v
|
||||
Grafana
|
||||
|
||||
Incident Scenario -> Sample Logs -> Alert Rules -> Operator Review
|
||||
```
|
||||
|
||||
Core components:
|
||||
|
||||
- `docker-compose.yml` defines the observability services.
|
||||
- `alerting/alert_rules.yml` records alert intent and severity.
|
||||
- `examples/` contains representative operational logs and alert output.
|
||||
- `scenarios/incident_simulation.sh` emits incident activity.
|
||||
- `grafana/`, `kibana/`, `logstash/`, `filebeat/`, and `elasticsearch/` contain minimal local configs.
|
||||
|
||||
## Quickstart
|
||||
|
||||
```bash
|
||||
cd professional-infra/log-observability-elk-grafana
|
||||
make test
|
||||
make demo
|
||||
```
|
||||
|
||||
Start the full local stack with Docker:
|
||||
|
||||
```bash
|
||||
make test
|
||||
make run
|
||||
make down
|
||||
```
|
||||
|
||||
When running locally:
|
||||
|
||||
- Kibana: `http://localhost:5601`
|
||||
- Grafana: `http://localhost:3000`
|
||||
- Elasticsearch: `http://localhost:9200`
|
||||
|
||||
Default demo credentials:
|
||||
|
||||
- Elasticsearch/Kibana: `elastic` / `elastic`
|
||||
- Grafana: `admin` / `admin`
|
||||
|
||||
## Validation
|
||||
|
||||
```bash
|
||||
make test
|
||||
docker compose config --quiet
|
||||
```
|
||||
|
||||
`make test` also checks that the bind-mounted config files and the Grafana provisioning and dashboard directories exist; the `logs/` mount is not checked.
|
||||
|
||||
## Example Output
|
||||
|
||||
```text
|
||||
[2026-04-29 04:18:23] WARN Database connection pool nearing capacity
|
||||
[2026-04-29 04:18:28] ERROR Database connection pool exhausted
|
||||
[2026-04-29 04:18:33] ERROR Database query timeout occurred
|
||||
[2026-04-29 04:18:44] INFO Database connections restored
|
||||
```
|
||||
|
||||
Additional examples are available in [examples/alert-output.txt](examples/alert-output.txt) and [examples/sample-log.txt](examples/sample-log.txt).
|
||||
|
||||
## Interview Talking Points
|
||||
|
||||
- When to use Zabbix checks versus ELK log analysis.
|
||||
- How Filebeat, Logstash, and Elasticsearch fit into a basic log pipeline.
|
||||
- How incident simulations create evidence for troubleshooting discussions.
|
||||
- Why local demo credentials and single-node Elasticsearch are not production architecture.
|
||||
|
||||
## Roadmap
|
||||
|
||||
- Add curated Grafana and Kibana dashboards.
|
||||
- Add Prometheus metrics collection.
|
||||
- Add distributed tracing with Jaeger or OpenTelemetry.
|
||||
- Add synthetic monitoring checks.
|
||||
@@ -0,0 +1,326 @@
|
||||
# Enterprise Observability Alert Rules
|
||||
# Alert definitions for automated incident detection and notification
|
||||
|
||||
alert_rules:
|
||||
# System Resource Alerts
|
||||
- name: "High CPU Usage"
|
||||
description: "CPU utilization exceeds threshold"
|
||||
condition: "cpu_usage_percent > 90"
|
||||
duration: "5m"
|
||||
severity: "critical"
|
||||
tags:
|
||||
- system
|
||||
- performance
|
||||
channels:
|
||||
- email
|
||||
- slack
|
||||
labels:
|
||||
team: "platform"
|
||||
component: "system"
|
||||
|
||||
- name: "High Memory Usage"
|
||||
description: "Memory utilization exceeds threshold"
|
||||
condition: "memory_usage_percent > 85"
|
||||
duration: "3m"
|
||||
severity: "warning"
|
||||
tags:
|
||||
- system
|
||||
- memory
|
||||
channels:
|
||||
- email
|
||||
labels:
|
||||
team: "platform"
|
||||
component: "system"
|
||||
|
||||
- name: "Disk Space Critical"
|
||||
description: "Disk usage exceeds critical threshold"
|
||||
condition: "disk_usage_percent > 95"
|
||||
duration: "2m"
|
||||
severity: "critical"
|
||||
tags:
|
||||
- storage
|
||||
- disk
|
||||
channels:
|
||||
- email
|
||||
- pagerduty
|
||||
labels:
|
||||
team: "platform"
|
||||
component: "storage"
|
||||
|
||||
- name: "Disk Space Warning"
|
||||
description: "Disk usage exceeds warning threshold"
|
||||
condition: "disk_usage_percent > 85"
|
||||
duration: "10m"
|
||||
severity: "warning"
|
||||
tags:
|
||||
- storage
|
||||
- disk
|
||||
channels:
|
||||
- email
|
||||
labels:
|
||||
team: "platform"
|
||||
component: "storage"
|
||||
|
||||
# Service Availability Alerts
|
||||
- name: "Service Down"
|
||||
description: "Critical service is not responding"
|
||||
condition: "service_status == 'down' OR http_status_code >= 500"
|
||||
duration: "2m"
|
||||
severity: "critical"
|
||||
tags:
|
||||
- service
|
||||
- availability
|
||||
channels:
|
||||
- email
|
||||
- slack
|
||||
- pagerduty
|
||||
labels:
|
||||
team: "application"
|
||||
component: "service"
|
||||
|
||||
- name: "Database Connection Failed"
|
||||
description: "Database connection pool exhausted or unresponsive"
|
||||
condition: "db_connections_active == 0 OR db_response_time > 5000"
|
||||
duration: "1m"
|
||||
severity: "critical"
|
||||
tags:
|
||||
- database
|
||||
- connectivity
|
||||
channels:
|
||||
- email
|
||||
- pagerduty
|
||||
labels:
|
||||
team: "database"
|
||||
component: "postgresql"
|
||||
|
||||
- name: "Cache Unavailable"
|
||||
description: "Cache service is down or unresponsive"
|
||||
condition: "cache_hit_ratio < 0.1 OR cache_response_time > 1000"
|
||||
duration: "3m"
|
||||
severity: "warning"
|
||||
tags:
|
||||
- cache
|
||||
- performance
|
||||
channels:
|
||||
- email
|
||||
labels:
|
||||
team: "infrastructure"
|
||||
component: "redis"
|
||||
|
||||
# Application Performance Alerts
|
||||
- name: "High Error Rate"
|
||||
description: "Application error rate exceeds threshold"
|
||||
condition: "error_rate_percent > 5"
|
||||
duration: "5m"
|
||||
severity: "critical"
|
||||
tags:
|
||||
- application
|
||||
- errors
|
||||
channels:
|
||||
- email
|
||||
- slack
|
||||
labels:
|
||||
team: "application"
|
||||
component: "api"
|
||||
|
||||
- name: "Slow Response Time"
|
||||
description: "API response time exceeds SLA"
|
||||
condition: "response_time_p95 > 2000"
|
||||
duration: "5m"
|
||||
severity: "warning"
|
||||
tags:
|
||||
- application
|
||||
- performance
|
||||
channels:
|
||||
- email
|
||||
labels:
|
||||
team: "application"
|
||||
component: "api"
|
||||
|
||||
- name: "High Request Queue"
|
||||
description: "Request queue depth is too high"
|
||||
condition: "queue_depth > 100"
|
||||
duration: "3m"
|
||||
severity: "warning"
|
||||
tags:
|
||||
- application
|
||||
- queue
|
||||
channels:
|
||||
- email
|
||||
labels:
|
||||
team: "application"
|
||||
component: "queue"
|
||||
|
||||
# Infrastructure Alerts
|
||||
- name: "Network Latency High"
|
||||
description: "Network round-trip time exceeds threshold"
|
||||
condition: "network_rtt > 100"
|
||||
duration: "5m"
|
||||
severity: "warning"
|
||||
tags:
|
||||
- network
|
||||
- latency
|
||||
channels:
|
||||
- email
|
||||
labels:
|
||||
team: "network"
|
||||
component: "infrastructure"
|
||||
|
||||
- name: "Load Balancer Unhealthy"
|
||||
description: "Load balancer backend servers are unhealthy"
|
||||
condition: "lb_unhealthy_backends > 0"
|
||||
duration: "2m"
|
||||
severity: "critical"
|
||||
tags:
|
||||
- loadbalancer
|
||||
- availability
|
||||
channels:
|
||||
- email
|
||||
- pagerduty
|
||||
labels:
|
||||
team: "infrastructure"
|
||||
component: "loadbalancer"
|
||||
|
||||
# Security Alerts
|
||||
- name: "Failed Login Attempts"
|
||||
description: "Multiple failed authentication attempts detected"
|
||||
condition: "failed_login_attempts > 5"
|
||||
duration: "5m"
|
||||
severity: "warning"
|
||||
tags:
|
||||
- security
|
||||
- authentication
|
||||
channels:
|
||||
- email
|
||||
- slack
|
||||
labels:
|
||||
team: "security"
|
||||
component: "authentication"
|
||||
|
||||
- name: "Suspicious Network Traffic"
|
||||
description: "Unusual network traffic patterns detected"
|
||||
condition: "network_bytes_unusual > 1000000"
|
||||
duration: "10m"
|
||||
severity: "warning"
|
||||
tags:
|
||||
- security
|
||||
- network
|
||||
channels:
|
||||
- email
|
||||
labels:
|
||||
team: "security"
|
||||
component: "network"
|
||||
|
||||
# Log-based Alerts
|
||||
- name: "Application Errors"
|
||||
description: "High volume of application error logs"
|
||||
condition: "log_errors_per_minute > 10"
|
||||
duration: "2m"
|
||||
severity: "warning"
|
||||
tags:
|
||||
- logs
|
||||
- errors
|
||||
channels:
|
||||
- email
|
||||
labels:
|
||||
team: "application"
|
||||
component: "logs"
|
||||
|
||||
- name: "Out of Memory Errors"
|
||||
description: "Out of memory errors detected in logs"
|
||||
condition: "log_oom_errors > 0"
|
||||
duration: "1m"
|
||||
severity: "critical"
|
||||
tags:
|
||||
- memory
|
||||
- errors
|
||||
channels:
|
||||
- email
|
||||
- pagerduty
|
||||
labels:
|
||||
team: "application"
|
||||
component: "memory"
|
||||
|
||||
# Business Logic Alerts
|
||||
- name: "Low Business Transactions"
|
||||
description: "Business transaction volume below expected threshold"
|
||||
condition: "business_transactions_per_hour < 100"
|
||||
duration: "15m"
|
||||
severity: "warning"
|
||||
tags:
|
||||
- business
|
||||
- transactions
|
||||
channels:
|
||||
- email
|
||||
labels:
|
||||
team: "business"
|
||||
component: "transactions"
|
||||
|
||||
- name: "Payment Failures"
|
||||
description: "Payment processing failure rate is high"
|
||||
condition: "payment_failure_rate > 0.05"
|
||||
duration: "5m"
|
||||
severity: "critical"
|
||||
tags:
|
||||
- payments
|
||||
- business
|
||||
channels:
|
||||
- email
|
||||
- pagerduty
|
||||
labels:
|
||||
team: "payments"
|
||||
component: "processing"
|
||||
|
||||
# Alert Channels Configuration
|
||||
alert_channels:
|
||||
email:
|
||||
type: "email"
|
||||
recipients:
|
||||
- "platform-team@company.com"
|
||||
- "oncall@company.com"
|
||||
subject_template: "[{{severity}}] {{name}} - {{description}}"
|
||||
|
||||
slack:
|
||||
type: "slack"
|
||||
webhook_url: "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK"
|
||||
channel: "#alerts"
|
||||
username: "Observability Bot"
|
||||
icon_emoji: ":warning:"
|
||||
|
||||
pagerduty:
|
||||
type: "pagerduty"
|
||||
integration_key: "your-pagerduty-integration-key"
|
||||
severity_mapping:
|
||||
critical: "critical"
|
||||
warning: "warning"
|
||||
info: "info"
|
||||
|
||||
# Alert Silencing Rules
|
||||
silence_rules:
|
||||
- name: "Maintenance Window"
|
||||
condition: "maintenance_window == true"
|
||||
duration: "4h"
|
||||
comment: "Silenced during scheduled maintenance"
|
||||
|
||||
- name: "Known Issue"
|
||||
condition: "known_issue_id == 'TICKET-123'"
|
||||
duration: "24h"
|
||||
comment: "Silenced for known issue resolution"
|
||||
|
||||
# Escalation Policies
|
||||
escalation_policies:
|
||||
- name: "Default Escalation"
|
||||
steps:
|
||||
- delay: "5m"
|
||||
channels: ["email"]
|
||||
- delay: "15m"
|
||||
channels: ["slack"]
|
||||
- delay: "30m"
|
||||
channels: ["pagerduty"]
|
||||
|
||||
- name: "Critical Escalation"
|
||||
steps:
|
||||
- delay: "0m"
|
||||
channels: ["email", "slack", "pagerduty"]
|
||||
- delay: "10m"
|
||||
channels: ["pagerduty"] # Escalation
|
||||
@@ -0,0 +1,120 @@
|
||||
services:
|
||||
elasticsearch:
|
||||
image: docker.elastic.co/elasticsearch/elasticsearch:8.11.0
|
||||
container_name: observability-elasticsearch
|
||||
environment:
|
||||
- discovery.type=single-node
|
||||
- xpack.security.enabled=true
|
||||
- ELASTIC_PASSWORD=elastic
|
||||
- "ES_JAVA_OPTS=-Xms1g -Xmx1g"
|
||||
volumes:
|
||||
- elasticsearch_data:/usr/share/elasticsearch/data
|
||||
- ./elasticsearch/config/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml
|
||||
ports:
|
||||
- "9200:9200"
|
||||
- "9300:9300"
|
||||
networks:
|
||||
- observability
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "curl -u elastic:elastic -f http://localhost:9200/_cluster/health || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 5
|
||||
|
||||
logstash:
|
||||
image: docker.elastic.co/logstash/logstash:8.11.0
|
||||
container_name: observability-logstash
|
||||
environment:
|
||||
- "LS_JAVA_OPTS=-Xms512m -Xmx512m"
|
||||
volumes:
|
||||
- ./logstash/config/logstash.yml:/usr/share/logstash/config/logstash.yml
|
||||
- ./logstash/pipeline:/usr/share/logstash/pipeline
|
||||
- ./logs:/usr/share/logstash/logs
|
||||
ports:
|
||||
- "5044:5044"
|
||||
- "8080:8080"
|
||||
networks:
|
||||
- observability
|
||||
depends_on:
|
||||
elasticsearch:
|
||||
condition: service_healthy
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "curl -f http://localhost:9600/_node/pipelines || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
kibana:
|
||||
image: docker.elastic.co/kibana/kibana:8.11.0
|
||||
container_name: observability-kibana
|
||||
environment:
|
||||
- ELASTICSEARCH_HOSTS=http://elasticsearch:9200
|
||||
- ELASTICSEARCH_USERNAME=elastic
|
||||
- ELASTICSEARCH_PASSWORD=elastic
|
||||
volumes:
|
||||
- ./kibana/config/kibana.yml:/usr/share/kibana/config/kibana.yml
|
||||
ports:
|
||||
- "5601:5601"
|
||||
networks:
|
||||
- observability
|
||||
depends_on:
|
||||
elasticsearch:
|
||||
condition: service_healthy
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "curl -f http://localhost:5601/api/status || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 5
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:10.2.0
|
||||
container_name: observability-grafana
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_USER=admin
|
||||
- GF_SECURITY_ADMIN_PASSWORD=admin
|
||||
- GF_USERS_ALLOW_SIGN_UP=false
|
||||
volumes:
|
||||
- grafana_data:/var/lib/grafana
|
||||
- ./grafana/provisioning:/etc/grafana/provisioning
|
||||
- ./grafana/dashboards:/var/lib/grafana/dashboards
|
||||
ports:
|
||||
- "3000:3000"
|
||||
networks:
|
||||
- observability
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "curl -f http://localhost:3000/api/health || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
filebeat:
|
||||
image: docker.elastic.co/beats/filebeat:8.11.0
|
||||
container_name: observability-filebeat
|
||||
user: root
|
||||
volumes:
|
||||
- ./filebeat/config/filebeat.yml:/usr/share/filebeat/filebeat.yml
|
||||
- ./logs:/var/log/sample
|
||||
- /var/lib/docker/containers:/var/lib/docker/containers:ro
|
||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||
networks:
|
||||
- observability
|
||||
depends_on:
|
||||
- logstash
|
||||
restart: unless-stopped
|
||||
|
||||
volumes:
|
||||
elasticsearch_data:
|
||||
driver: local
|
||||
grafana_data:
|
||||
driver: local
|
||||
|
||||
networks:
|
||||
observability:
|
||||
driver: bridge
|
||||
ipam:
|
||||
config:
|
||||
- subnet: 172.25.0.0/16
|
||||
@@ -0,0 +1,30 @@
|
||||
# Log Observability ELK/Grafana Architecture
|
||||
|
||||
## Components
|
||||
|
||||
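- Filebeat: tails sample logs from `logs/*.log`; the Docker container log directory is mounted into the container, but no container-log input is configured yet.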
|
||||
- Logstash: receives and processes log events.
|
||||
- Elasticsearch: stores searchable observability data.
|
||||
- Kibana: supports log exploration and dashboards.
|
||||
- Grafana: provides operational dashboards.
|
||||
- Alert rules: document symptoms, thresholds, and severity.
|
||||
- Incident simulation: generates controlled failure signals.
|
||||
|
||||
## Data Flow
|
||||
|
||||
```
|
||||
Log source -> Filebeat -> Logstash -> Elasticsearch -> Kibana
|
||||
|
|
||||
v
|
||||
Grafana
|
||||
```
|
||||
|
||||
Incident exercises follow this flow:
|
||||
|
||||
```
|
||||
Operator -> incident_simulation.sh -> logs/incident_simulation.log -> Filebeat -> Logstash -> alerts/dashboards
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
This is a local demonstration stack, not a production Elasticsearch deployment. A production version would add dedicated nodes, TLS, secret management, retention policies, index lifecycle management, and external alert delivery.
|
||||
+4
@@ -0,0 +1,4 @@
|
||||
cluster.name: portfolio-observability
|
||||
node.name: elasticsearch-demo
|
||||
network.host: 0.0.0.0
|
||||
xpack.security.enabled: true
|
||||
@@ -0,0 +1,4 @@
|
||||
2026-04-29T04:19:00Z alert=database_connection_pool_exhausted severity=critical service=checkout-api host=app-web-02 value=100 threshold=95 status=firing
|
||||
2026-04-29T04:19:30Z alert=api_error_rate_high severity=warning service=checkout-api host=app-web-02 value=7.8 threshold=5.0 status=firing
|
||||
2026-04-29T04:22:00Z alert=database_connection_pool_exhausted severity=critical service=checkout-api host=app-web-02 value=71 threshold=95 status=resolved
|
||||
2026-04-29T04:23:15Z alert=api_error_rate_high severity=warning service=checkout-api host=app-web-02 value=1.2 threshold=5.0 status=resolved
|
||||
@@ -0,0 +1,5 @@
|
||||
2026-04-29T04:18:21Z INFO service=checkout-api host=app-web-02 request_id=8f4b2 path=/checkout status=200 latency_ms=142
|
||||
2026-04-29T04:18:28Z WARN service=checkout-api host=app-web-02 event=db_pool_pressure active=92 max=100
|
||||
2026-04-29T04:18:33Z ERROR service=checkout-api host=app-web-02 event=db_timeout query=CreateOrder timeout_ms=5000 customer_tier=enterprise
|
||||
2026-04-29T04:18:39Z ERROR service=checkout-api host=app-web-02 event=payment_retry_exhausted order_id=ord-104288 provider=stripe
|
||||
2026-04-29T04:18:44Z INFO service=checkout-api host=app-web-02 event=recovery db_pool_active=48
|
||||
@@ -0,0 +1,11 @@
|
||||
filebeat.inputs:
|
||||
- type: filestream
|
||||
id: portfolio-sample-logs
|
||||
enabled: true
|
||||
paths:
|
||||
- /var/log/sample/*.log
|
||||
|
||||
output.logstash:
|
||||
hosts: ["logstash:5044"]
|
||||
|
||||
logging.level: info
|
||||
@@ -0,0 +1,3 @@
|
||||
# Dashboards
|
||||
|
||||
This directory is reserved for local demo dashboards. The current portfolio scope validates the observability stack scaffold, sample logs, alert intent, and incident simulation without claiming production-ready dashboards.
|
||||
+14
@@ -0,0 +1,14 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Elasticsearch
|
||||
type: elasticsearch
|
||||
access: proxy
|
||||
url: http://elasticsearch:9200
|
||||
basicAuth: true
|
||||
basicAuthUser: elastic
|
||||
jsonData:
|
||||
index: portfolio-logs-*
|
||||
timeField: "@timestamp"
|
||||
secureJsonData:
|
||||
basicAuthPassword: elastic
|
||||
@@ -0,0 +1,4 @@
|
||||
server.host: 0.0.0.0
|
||||
elasticsearch.hosts: ["http://elasticsearch:9200"]
|
||||
elasticsearch.username: elastic
|
||||
elasticsearch.password: elastic
|
||||
@@ -0,0 +1,2 @@
|
||||
http.host: 0.0.0.0
|
||||
pipeline.ecs_compatibility: disabled
|
||||
@@ -0,0 +1,24 @@
|
||||
input {
|
||||
beats {
|
||||
port => 5044
|
||||
}
|
||||
}
|
||||
|
||||
filter {
|
||||
grok {
|
||||
match => { "message" => "\[%{TIMESTAMP_ISO8601:observed_at}\] %{LOGLEVEL:level} %{GREEDYDATA:event_message}" }
|
||||
tag_on_failure => ["portfolio_parse_failure"]
|
||||
}
|
||||
}
|
||||
|
||||
output {
|
||||
elasticsearch {
|
||||
hosts => ["http://elasticsearch:9200"]
|
||||
user => "elastic"
|
||||
password => "elastic"
|
||||
index => "portfolio-logs-%{+YYYY.MM.dd}"
|
||||
}
|
||||
stdout {
|
||||
codec => rubydebug
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,21 @@
|
||||
# Scenario: Incident Simulation
|
||||
|
||||
## Description
|
||||
|
||||
Generate a controlled application and infrastructure incident so the logging pipeline, alert rules, and dashboards can be reviewed with realistic event timing.
|
||||
|
||||
## Commands
|
||||
|
||||
```bash
|
||||
cd professional-infra/log-observability-elk-grafana
|
||||
docker compose config
|
||||
./scenarios/incident_simulation.sh comprehensive
|
||||
tail -n 40 logs/incident_simulation.log
|
||||
```
|
||||
|
||||
## Expected Result
|
||||
|
||||
- The compose file validates successfully.
|
||||
- The simulation writes a sequence of CPU, memory, service, database, application error, and disk events.
|
||||
- Alert examples indicate firing and resolved states.
|
||||
- Operators can trace incident progression through logs and dashboard queries.
|
||||
+318
@@ -0,0 +1,318 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Enterprise Incident Simulation Script
|
||||
# Simulates various failure scenarios for testing observability stack
|
||||
|
||||
# Strict mode: abort on an unhandled command failure (-e), on expansion of an
# unset variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Resolve paths relative to this script so it can be run from any directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
# File that every log() call appends to.
LOG_FILE="$PROJECT_ROOT/logs/incident_simulation.log"
readonly SCRIPT_DIR PROJECT_ROOT LOG_FILE

# ANSI colour escape sequences for terminal output.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
readonly RED GREEN YELLOW BLUE NC
|
||||
|
||||
#######################################
# Record one timestamped log entry.
# Globals:   LOG_FILE (appended to), BLUE, NC (read)
# Arguments: $1 - level label (e.g. INFO, WARN, ERROR)
#            $2 - message text
# Outputs:   appends "[YYYY-MM-DD HH:MM:SS] LEVEL MESSAGE" to LOG_FILE and
#            prints the same line to stdout with a coloured timestamp
#######################################
log() {
  local level=${1:-}
  local message=${2:-}
  local timestamp
  # Assigned separately from `local` so a failing `date` is not masked.
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  printf '[%s] %s %s\n' "$timestamp" "$level" "$message" >> "$LOG_FILE"
  # %b expands the escape sequences in the colour variables only; the message
  # is printed verbatim so the terminal line matches the line in the file.
  printf '%b[%s]%b %s %s\n' "$BLUE" "$timestamp" "$NC" "$level" "$message"
}
|
||||
|
||||
#######################################
# Burn CPU in four parallel background subshells for a fixed time.
# Arguments: $1 - duration in seconds (default: 60)
# Outputs:   start/finish entries via log()
#######################################
simulate_cpu_spike() {
  local duration=${1:-60}
  local -r workers=4
  local -a worker_pids=()
  local n pid

  log "INFO" "Starting CPU spike simulation for ${duration} seconds"

  # Launch the CPU-intensive workers.
  for ((n = 0; n < workers; n++)); do
    (
      end_time=$((SECONDS + duration))
      while ((SECONDS < end_time)); do
        # Busy arithmetic loop; the sum itself is discarded.
        result=0
        for j in {1..100000}; do
          result=$((result + j))
        done
      done
    ) &
    worker_pids+=("$!")
  done

  # Block until every worker has finished; a worker's status is irrelevant.
  for pid in "${worker_pids[@]}"; do
    wait "$pid" 2>/dev/null || true
  done

  log "INFO" "CPU spike simulation completed"
}
|
||||
|
||||
#######################################
# Run a background subshell that grows a string for a fixed time.
# NOTE(review): the string grows by one character every 0.1 s, so the actual
# memory footprint is tiny; this produces log evidence rather than real
# memory pressure.
# Arguments: $1 - duration in seconds (default: 30)
# Outputs:   start/finish entries via log()
#######################################
simulate_memory_leak() {
  local duration=${1:-30}
  local leak_pid

  log "INFO" "Starting memory leak simulation for ${duration} seconds"

  (
    data=""
    end_time=$((SECONDS + duration))
    while ((SECONDS < end_time)); do
      # Gradually consume memory
      data="${data}X"
      sleep 0.1
    done
  ) &
  leak_pid=$!

  # The subshell's exit status is irrelevant to the simulation.
  wait "$leak_pid" 2>/dev/null || true
  log "INFO" "Memory leak simulation completed"
}
|
||||
|
||||
#######################################
# Write 100 MB files into a directory for a fixed time, then delete them.
# Arguments: $1 - target directory (default: /tmp)
#            $2 - duration in seconds (default: 30)
# Outputs:   start/finish (or error) entries via log()
# Returns:   1 if the target is not a writable directory, 0 otherwise
#######################################
simulate_disk_full() {
  local target_dir=${1:-"/tmp"}
  local duration=${2:-30}
  local writer_pid

  # Fail loudly on an unusable target instead of looping over failing writes
  # and then reporting success.
  if [[ ! -d "$target_dir" || ! -w "$target_dir" ]]; then
    log "ERROR" "Disk simulation target ${target_dir} is not a writable directory"
    return 1
  fi

  log "INFO" "Starting disk space exhaustion simulation in ${target_dir} for ${duration} seconds"

  (
    end_time=$((SECONDS + duration))
    while ((SECONDS < end_time)); do
      # One 100 MB file per iteration; a failed write (e.g. the disk really is
      # full) is expected and deliberately ignored.
      dd if=/dev/zero of="${target_dir}/incident_test_file_$(date +%s).tmp" bs=1M count=100 2>/dev/null || true
      sleep 2
    done
  ) &
  writer_pid=$!

  wait "$writer_pid" 2>/dev/null || true

  # Cleanup test files. `:?` aborts rather than globbing from "/" if the
  # directory variable were ever empty.
  rm -f -- "${target_dir:?}"/incident_test_file_*.tmp 2>/dev/null || true
  log "INFO" "Disk space exhaustion simulation completed and cleaned up"
}
|
||||
|
||||
#######################################
# Add delay and packet loss to an interface with tc/netem for a fixed time.
# Requires sudo and tc; when the qdisc cannot be installed the function still
# waits for the duration but says so in the log.
# Arguments: $1 - network interface (default: lo)
#            $2 - duration in seconds (default: 20)
# Outputs:   start/finish and warning entries via log()
#######################################
simulate_network_issues() {
  local interface=${1:-"lo"}
  local duration=${2:-20}
  local netem_applied=0

  log "INFO" "Starting network issues simulation on ${interface} for ${duration} seconds"

  # Add network delay and packet loss
  if sudo tc qdisc add dev "$interface" root netem delay 100ms 50ms loss 10% 2>/dev/null; then
    netem_applied=1
  else
    log "WARN" "Could not apply netem qdisc on ${interface}; continuing without network impairment"
  fi

  sleep "$duration"

  # Only remove what this function installed: deleting unconditionally would
  # drop a root qdisc that existed before the simulation started.
  if ((netem_applied)); then
    sudo tc qdisc del dev "$interface" root 2>/dev/null ||
      log "WARN" "Failed to remove netem qdisc from ${interface}; remove it manually"
  fi

  log "INFO" "Network issues simulation completed"
}
|
||||
|
||||
#######################################
# Emit log entries describing a service that crashes and recovers repeatedly.
# No real service is touched; only log() is called.
# Arguments: $1 - service name used in the messages (default: test-service)
# Outputs:   one initial crash/restart pair, then three crash/recover pairs
#######################################
simulate_service_crash() {
  local service_name=${1:-"test-service"}
  local attempt

  log "INFO" "Starting service crash simulation for ${service_name}"

  # Simulate service going down
  log "ERROR" "Service ${service_name} crashed unexpectedly"
  sleep 5
  log "INFO" "Service ${service_name} restarted automatically"

  # Simulate multiple crashes
  for attempt in 1 2 3; do
    sleep 2
    log "ERROR" "Service ${service_name} crashed again (attempt $attempt)"
    sleep 1
    log "INFO" "Service ${service_name} recovered after crash $attempt"
  done
}
|
||||
|
||||
#######################################
# Emit log entries describing a connection-pool exhaustion and recovery.
# The requested duration is split over the four pauses between events, with
# the last pause absorbing the remainder so the total equals the duration.
# Arguments: $1 - total duration in whole seconds (default: 25)
# Outputs:   WARN/ERROR/INFO progression via log()
# Returns:   1 if the duration is not a non-negative integer, 0 otherwise
#######################################
simulate_database_issues() {
  local duration=${1:-25}
  local step final_step

  if [[ ! "$duration" =~ ^[0-9]+$ ]]; then
    log "ERROR" "Database simulation duration must be a non-negative integer, got '${duration}'"
    return 1
  fi
  step=$((duration / 4))
  final_step=$((duration - 3 * step))

  log "INFO" "Starting database issues simulation for ${duration} seconds"

  # Simulate connection pool exhaustion
  log "WARN" "Database connection pool nearing capacity"
  sleep "$step"
  log "ERROR" "Database connection pool exhausted"
  sleep "$step"
  log "ERROR" "Database query timeout occurred"
  sleep "$step"
  log "WARN" "Database connections recovering"
  sleep "$final_step"
  log "INFO" "Database connections restored"

  log "INFO" "Database issues simulation completed"
}
|
||||
|
||||
#######################################
# Emit a burst of randomly chosen application error log entries, pausing a
# random 1-3 seconds after each one.
# Arguments: $1 - number of errors to emit (default: 10)
# Outputs:   start entry, one ERROR entry per iteration, finish entry via log()
# Returns:   1 if the count is not a non-negative integer, 0 otherwise
#######################################
simulate_application_errors() {
  local error_count=${1:-10}
  local i

  # Reject a bad count up front; otherwise the loop would run zero times and
  # the function would still report a completed simulation.
  if [[ ! "$error_count" =~ ^[0-9]+$ ]]; then
    log "ERROR" "Application error count must be a non-negative integer, got '${error_count}'"
    return 1
  fi

  log "INFO" "Starting application error simulation (${error_count} errors)"

  for ((i = 1; i <= error_count; i++)); do
    case $((RANDOM % 4)) in
      0)
        log "ERROR" "NullPointerException in UserService.getUser($i)"
        ;;
      1)
        log "ERROR" "TimeoutException: Database query timed out for user ID: $i"
        ;;
      2)
        log "ERROR" "ValidationException: Invalid input data for request $i"
        ;;
      3)
        log "ERROR" "IOException: Failed to write to log file"
        ;;
    esac
    sleep "$((RANDOM % 3 + 1))"
  done

  log "INFO" "Application error simulation completed"
}
|
||||
|
||||
#######################################
# Run the individual simulations as overlapping background jobs, staggered
# into five logged phases, and wait for all of them to finish.
# Arguments: none
# Outputs:   phase markers via log(), plus whatever each simulation logs
#######################################
run_comprehensive_scenario() {
  # PIDs are kept in a local array so they cannot collide with variables of
  # the same name used by the individual simulation functions.
  local -a phase_pids=()
  local pid

  log "INFO" "Starting comprehensive incident scenario simulation"

  # Phase 1: Initial system stress
  log "INFO" "Phase 1: System stress simulation"
  simulate_cpu_spike 30 &
  phase_pids+=("$!")
  simulate_memory_leak 20 &
  phase_pids+=("$!")

  sleep 10

  # Phase 2: Service degradation
  log "INFO" "Phase 2: Service degradation simulation"
  simulate_service_crash "web-service" &
  phase_pids+=("$!")

  sleep 5

  # Phase 3: Database issues
  log "INFO" "Phase 3: Database issues simulation"
  simulate_database_issues 15 &
  phase_pids+=("$!")

  # Phase 4: Application errors
  log "INFO" "Phase 4: Application error burst"
  simulate_application_errors 15 &
  phase_pids+=("$!")

  # Phase 5: Infrastructure issues
  log "INFO" "Phase 5: Infrastructure issues simulation"
  simulate_disk_full "/tmp" 10 &
  phase_pids+=("$!")

  # Wait for all simulations to complete; one failing does not abort the rest.
  for pid in "${phase_pids[@]}"; do
    wait "$pid" 2>/dev/null || true
  done

  log "INFO" "Comprehensive incident scenario completed"
}
|
||||
|
||||
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   usage text on stdout; $0 is expanded to the invoked script name
#######################################
show_usage() {
  cat <<EOF
Enterprise Incident Simulation Script
Usage: $0 [SCENARIO] [OPTIONS]

SCENARIOS:
 cpu [DURATION] - Simulate CPU spike (default: 60s)
 memory [DURATION] - Simulate memory leak (default: 30s)
 disk [DIR] [DURATION] - Simulate disk space exhaustion (default: /tmp, 30s)
 network [INTERFACE] [DURATION] - Simulate network issues (default: lo, 20s)
 service [NAME] - Simulate service crashes (default: test-service)
 database [DURATION] - Simulate database issues (default: 25s)
 app-errors [COUNT] - Simulate application errors (default: 10)
 comprehensive - Run full incident scenario
 all - Run all individual scenarios sequentially
 help - Show this help message

EXAMPLES:
 $0 cpu 120 - CPU spike for 2 minutes
 $0 disk /var/log 45 - Disk full simulation in /var/log for 45 seconds
 $0 comprehensive - Full incident simulation

EOF
}
|
||||
|
||||
#######################################
# Entry point: select and run one simulation scenario.
# Globals:   LOG_FILE, RED, GREEN, NC (read)
# Arguments: $1 - scenario name (default: comprehensive)
#            $2, $3 - scenario-specific options; see show_usage
# Outputs:   progress on stdout; the unknown-scenario error and its usage text
#            on stderr
# Exits:     0 after help, 1 on an unknown scenario
#######################################
main() {
  local scenario=${1:-"comprehensive"}

  # Handle help and unknown scenarios first, so that neither creates the log
  # directory nor appends to the log.
  case "$scenario" in
    help | -h | --help)
      show_usage
      exit 0
      ;;
    cpu | memory | disk | network | service | database | app-errors | comprehensive | all) ;;
    *)
      echo -e "${RED}Error: Unknown scenario '$scenario'${NC}" >&2
      echo "" >&2
      show_usage >&2
      exit 1
      ;;
  esac

  # Create log directory if it doesn't exist
  mkdir -p "$(dirname "$LOG_FILE")"

  log "INFO" "Incident simulation script started"
  log "INFO" "Scenario: $scenario"

  case "$scenario" in
    cpu)
      simulate_cpu_spike "${2:-60}"
      ;;
    memory)
      simulate_memory_leak "${2:-30}"
      ;;
    disk)
      simulate_disk_full "${2:-/tmp}" "${3:-30}"
      ;;
    network)
      simulate_network_issues "${2:-lo}" "${3:-20}"
      ;;
    service)
      simulate_service_crash "${2:-test-service}"
      ;;
    database)
      simulate_database_issues "${2:-25}"
      ;;
    app-errors)
      simulate_application_errors "${2:-10}"
      ;;
    comprehensive)
      run_comprehensive_scenario
      ;;
    all)
      log "INFO" "Running all scenarios sequentially"
      simulate_cpu_spike 30
      sleep 5
      simulate_memory_leak 20
      sleep 5
      simulate_disk_full "/tmp" 15
      sleep 5
      simulate_service_crash "test-service"
      sleep 5
      simulate_database_issues 15
      sleep 5
      simulate_application_errors 8
      sleep 5
      simulate_network_issues "lo" 10
      ;;
  esac

  log "INFO" "Incident simulation script completed successfully"
  echo -e "${GREEN}Simulation completed. Check logs at: $LOG_FILE${NC}"
}
|
||||
|
||||
# Run main function with all arguments
|
||||
main "$@"
|
||||
Reference in New Issue
Block a user