feat: Add comprehensive enterprise Linux infrastructure portfolio with Ansible, Python, and ELK stack
CI Pipeline / lint-ansible (push) Waiting to run
CI Pipeline / test-python (push) Waiting to run
CI Pipeline / validate-docker (push) Waiting to run
CI Pipeline / security-scan (push) Waiting to run
CI Pipeline / documentation (push) Waiting to run
CI Pipeline / integration-test (push) Blocked by required conditions
CI Pipeline / lint-ansible (push) Waiting to run
CI Pipeline / test-python (push) Waiting to run
CI Pipeline / validate-docker (push) Waiting to run
CI Pipeline / security-scan (push) Waiting to run
CI Pipeline / documentation (push) Waiting to run
CI Pipeline / integration-test (push) Blocked by required conditions
This commit is contained in:
@@ -0,0 +1,326 @@
|
||||
# Enterprise Observability Alert Rules
# Alert definitions for automated incident detection and notification.
#
# Each rule carries: name, description, condition, duration, severity,
# tags, channels and labels (team, component).
# NOTE(review): presumably `condition` must hold for `duration` before the
# rule fires — confirm against the engine that consumes this file.

alert_rules:
# System Resource Alerts
# CPU: critical on cpu_usage_percent > 90, duration 5m, to email + slack.
# NOTE(review): the other critical rules visible in this file also list
# pagerduty — confirm that leaving it out here is intentional.
- name: 'High CPU Usage'
  description: 'CPU utilization exceeds threshold'
  condition: 'cpu_usage_percent > 90'
  duration: '5m'
  severity: 'critical'
  tags:
  - system
  - performance
  channels:
  - email
  - slack
  labels:
    team: 'platform'
    component: 'system'

# Memory: warning on memory_usage_percent > 85, duration 3m, to email only.
- name: 'High Memory Usage'
  description: 'Memory utilization exceeds threshold'
  condition: 'memory_usage_percent > 85'
  duration: '3m'
  severity: 'warning'
  tags:
  - system
  - memory
  channels:
  - email
  labels:
    team: 'platform'
    component: 'system'

# Disk (critical tier): disk_usage_percent > 95, duration 2m,
# to email + pagerduty.
- name: 'Disk Space Critical'
  description: 'Disk usage exceeds critical threshold'
  condition: 'disk_usage_percent > 95'
  duration: '2m'
  severity: 'critical'
  tags:
  - storage
  - disk
  channels:
  - email
  - pagerduty
  labels:
    team: 'platform'
    component: 'storage'

# Disk (warning tier): disk_usage_percent > 85, duration 10m, to email only.
# The same metric is also used by the 'Disk Space Critical' rule (> 95).
- name: 'Disk Space Warning'
  description: 'Disk usage exceeds warning threshold'
  condition: 'disk_usage_percent > 85'
  duration: '10m'
  severity: 'warning'
  tags:
  - storage
  - disk
  channels:
  - email
  labels:
    team: 'platform'
    component: 'storage'

# Service Availability Alerts
# Service down: critical when either clause of the OR condition matches,
# duration 2m, to email + slack + pagerduty.
# The condition embeds single quotes, so it stays double-quoted.
- name: 'Service Down'
  description: 'Critical service is not responding'
  condition: "service_status == 'down' OR http_status_code >= 500"
  duration: '2m'
  severity: 'critical'
  tags:
  - service
  - availability
  channels:
  - email
  - slack
  - pagerduty
  labels:
    team: 'application'
    component: 'service'

# Database: critical on zero active connections or a slow response,
# duration 1m, to email + pagerduty.
# NOTE(review): the unit of db_response_time is not stated here
# (presumably milliseconds) — confirm.
- name: 'Database Connection Failed'
  description: 'Database connection pool exhausted or unresponsive'
  condition: 'db_connections_active == 0 OR db_response_time > 5000'
  duration: '1m'
  severity: 'critical'
  tags:
  - database
  - connectivity
  channels:
  - email
  - pagerduty
  labels:
    team: 'database'
    component: 'postgresql'

# Cache: warning on a low hit ratio or a slow response, duration 3m,
# to email only.
# NOTE(review): the description says "down or unresponsive", but the first
# clause also matches a merely cold cache (hit ratio < 0.1) — confirm that
# this is the intended trigger.
- name: 'Cache Unavailable'
  description: 'Cache service is down or unresponsive'
  condition: 'cache_hit_ratio < 0.1 OR cache_response_time > 1000'
  duration: '3m'
  severity: 'warning'
  tags:
  - cache
  - performance
  channels:
  - email
  labels:
    team: 'infrastructure'
    component: 'redis'

# Application Performance Alerts
# Error rate: critical on error_rate_percent > 5, duration 5m,
# to email + slack.
# NOTE(review): the other critical rules visible in this file also list
# pagerduty — confirm that leaving it out here is intentional.
- name: 'High Error Rate'
  description: 'Application error rate exceeds threshold'
  condition: 'error_rate_percent > 5'
  duration: '5m'
  severity: 'critical'
  tags:
  - application
  - errors
  channels:
  - email
  - slack
  labels:
    team: 'application'
    component: 'api'

# Latency: warning on response_time_p95 > 2000, duration 5m, to email only.
# NOTE(review): unit of the threshold is not stated (presumably ms) — confirm.
- name: 'Slow Response Time'
  description: 'API response time exceeds SLA'
  condition: 'response_time_p95 > 2000'
  duration: '5m'
  severity: 'warning'
  tags:
  - application
  - performance
  channels:
  - email
  labels:
    team: 'application'
    component: 'api'

# Queue depth: warning on queue_depth > 100, duration 3m, to email only.
- name: 'High Request Queue'
  description: 'Request queue depth is too high'
  condition: 'queue_depth > 100'
  duration: '3m'
  severity: 'warning'
  tags:
  - application
  - queue
  channels:
  - email
  labels:
    team: 'application'
    component: 'queue'

# Infrastructure Alerts
# Network RTT: warning on network_rtt > 100, duration 5m, to email only.
# NOTE(review): unit of network_rtt is not stated (presumably ms) — confirm.
- name: 'Network Latency High'
  description: 'Network round-trip time exceeds threshold'
  condition: 'network_rtt > 100'
  duration: '5m'
  severity: 'warning'
  tags:
  - network
  - latency
  channels:
  - email
  labels:
    team: 'network'
    component: 'infrastructure'

# Load balancer: critical as soon as any backend is counted unhealthy
# (lb_unhealthy_backends > 0), duration 2m, to email + pagerduty.
- name: 'Load Balancer Unhealthy'
  description: 'Load balancer backend servers are unhealthy'
  condition: 'lb_unhealthy_backends > 0'
  duration: '2m'
  severity: 'critical'
  tags:
  - loadbalancer
  - availability
  channels:
  - email
  - pagerduty
  labels:
    team: 'infrastructure'
    component: 'loadbalancer'

# Security Alerts
# Authentication: warning on failed_login_attempts > 5, duration 5m,
# to email + slack.
- name: 'Failed Login Attempts'
  description: 'Multiple failed authentication attempts detected'
  condition: 'failed_login_attempts > 5'
  duration: '5m'
  severity: 'warning'
  tags:
  - security
  - authentication
  channels:
  - email
  - slack
  labels:
    team: 'security'
    component: 'authentication'

# Traffic anomaly: warning on network_bytes_unusual > 1000000, duration 10m,
# to email only.
- name: 'Suspicious Network Traffic'
  description: 'Unusual network traffic patterns detected'
  condition: 'network_bytes_unusual > 1000000'
  duration: '10m'
  severity: 'warning'
  tags:
  - security
  - network
  channels:
  - email
  labels:
    team: 'security'
    component: 'network'

# Log-based Alerts
# Error logs: warning on log_errors_per_minute > 10, duration 2m,
# to email only.
- name: 'Application Errors'
  description: 'High volume of application error logs'
  condition: 'log_errors_per_minute > 10'
  duration: '2m'
  severity: 'warning'
  tags:
  - logs
  - errors
  channels:
  - email
  labels:
    team: 'application'
    component: 'logs'

# OOM: critical on any logged out-of-memory error (log_oom_errors > 0),
# duration 1m, to email + pagerduty.
- name: 'Out of Memory Errors'
  description: 'Out of memory errors detected in logs'
  condition: 'log_oom_errors > 0'
  duration: '1m'
  severity: 'critical'
  tags:
  - memory
  - errors
  channels:
  - email
  - pagerduty
  labels:
    team: 'application'
    component: 'memory'

# Business Logic Alerts
# Transaction volume: warning when business_transactions_per_hour < 100,
# duration 15m, to email only. This is a lower-bound check, unlike most
# rules in this file, which compare against an upper bound.
- name: 'Low Business Transactions'
  description: 'Business transaction volume below expected threshold'
  condition: 'business_transactions_per_hour < 100'
  duration: '15m'
  severity: 'warning'
  tags:
  - business
  - transactions
  channels:
  - email
  labels:
    team: 'business'
    component: 'transactions'

# Payments: critical on payment_failure_rate > 0.05, duration 5m,
# to email + pagerduty.
# NOTE(review): the threshold looks like a fraction (0.05), whereas the
# *_percent metrics elsewhere use whole percentages — confirm the metric's
# scale.
- name: 'Payment Failures'
  description: 'Payment processing failure rate is high'
  condition: 'payment_failure_rate > 0.05'
  duration: '5m'
  severity: 'critical'
  tags:
  - payments
  - business
  channels:
  - email
  - pagerduty
  labels:
    team: 'payments'
    component: 'processing'

# Alert Channels Configuration
# The keys below (email, slack, pagerduty) match the channel names used in
# the rules' `channels` lists.
alert_channels:
  email:
    type: 'email'
    recipients:
    - 'platform-team@company.com'
    - 'oncall@company.com'
    # Template placeholders: severity, name, description.
    subject_template: '[{{severity}}] {{name}} - {{description}}'

  slack:
    type: 'slack'
    # NOTE(review): this looks like a placeholder URL. Supply the real
    # webhook from a secret store instead of committing it here.
    webhook_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
    channel: '#alerts'
    username: 'Observability Bot'
    icon_emoji: ':warning:'

  pagerduty:
    type: 'pagerduty'
    # NOTE(review): placeholder value. The real integration key is a secret
    # and should not be stored in version control.
    integration_key: 'your-pagerduty-integration-key'
    # Maps each severity name to the identically named value on the
    # PagerDuty side.
    severity_mapping:
      critical: 'critical'
      warning: 'warning'
      info: 'info'

# Alert Silencing Rules
# Each entry carries: name, condition, duration, and a free-text comment.
silence_rules:
- name: 'Maintenance Window'
  condition: 'maintenance_window == true'
  duration: '4h'
  comment: 'Silenced during scheduled maintenance'

# NOTE(review): 'TICKET-123' looks like a sample ticket id — confirm.
# The condition embeds single quotes, so it stays double-quoted.
- name: 'Known Issue'
  condition: "known_issue_id == 'TICKET-123'"
  duration: '24h'
  comment: 'Silenced for known issue resolution'

# Escalation Policies
# Each policy is an ordered list of steps; a step has a delay and the
# channels to notify.
escalation_policies:
# One channel per step: email at 5m, slack at 15m, pagerduty at 30m.
- name: 'Default Escalation'
  steps:
  - delay: '5m'
    channels:
    - email
  - delay: '15m'
    channels:
    - slack
  - delay: '30m'
    channels:
    - pagerduty

# All three channels with a 0m delay, then pagerduty alone at 10m.
- name: 'Critical Escalation'
  steps:
  - delay: '0m'
    channels:
    - email
    - slack
    - pagerduty
  # Escalation step
  - delay: '10m'
    channels:
    - pagerduty
Reference in New Issue
Block a user