---
# Enterprise Observability Alert Rules
# Alert definitions for automated incident detection and notification

alert_rules:
  # System Resource Alerts
  - name: "High CPU Usage"
    description: "CPU utilization exceeds threshold"
    condition: "cpu_usage_percent > 90"
    duration: "5m"
    severity: "critical"
    tags:
      - system
      - performance
    channels:
      - email
      - slack
    labels:
      team: "platform"
      component: "system"

  - name: "High Memory Usage"
    description: "Memory utilization exceeds threshold"
    condition: "memory_usage_percent > 85"
    duration: "3m"
    severity: "warning"
    tags:
      - system
      - memory
    channels:
      - email
    labels:
      team: "platform"
      component: "system"

  - name: "Disk Space Critical"
    description: "Disk usage exceeds critical threshold"
    condition: "disk_usage_percent > 95"
    duration: "2m"
    severity: "critical"
    tags:
      - storage
      - disk
    channels:
      - email
      - pagerduty
    labels:
      team: "platform"
      component: "storage"

  - name: "Disk Space Warning"
    description: "Disk usage exceeds warning threshold"
    condition: "disk_usage_percent > 85"
    duration: "10m"
    severity: "warning"
    tags:
      - storage
      - disk
    channels:
      - email
    labels:
      team: "platform"
      component: "storage"

  # Service Availability Alerts
  - name: "Service Down"
    description: "Critical service is not responding"
    condition: "service_status == 'down' OR http_status_code >= 500"
    duration: "2m"
    severity: "critical"
    tags:
      - service
      - availability
    channels:
      - email
      - slack
      - pagerduty
    labels:
      team: "application"
      component: "service"

  - name: "Database Connection Failed"
    description: "Database connection pool exhausted or unresponsive"
    condition: "db_connections_active == 0 OR db_response_time > 5000"
    duration: "1m"
    severity: "critical"
    tags:
      - database
      - connectivity
    channels:
      - email
      - pagerduty
    labels:
      team: "database"
      component: "postgresql"

  - name: "Cache Unavailable"
    description: "Cache service is down or unresponsive"
    condition: "cache_hit_ratio < 0.1 OR cache_response_time > 1000"
    duration: "3m"
    severity: "warning"
    tags:
      - cache
      - performance
    channels:
      - email
    labels:
      team: "infrastructure"
      component: "redis"

  # Application Performance Alerts
  - name: "High Error Rate"
    description: "Application error rate exceeds threshold"
    condition: "error_rate_percent > 5"
    duration: "5m"
    severity: "critical"
    tags:
      - application
      - errors
    channels:
      - email
      - slack
    labels:
      team: "application"
      component: "api"

  - name: "Slow Response Time"
    description: "API response time exceeds SLA"
    condition: "response_time_p95 > 2000"
    duration: "5m"
    severity: "warning"
    tags:
      - application
      - performance
    channels:
      - email
    labels:
      team: "application"
      component: "api"

  - name: "High Request Queue"
    description: "Request queue depth is too high"
    condition: "queue_depth > 100"
    duration: "3m"
    severity: "warning"
    tags:
      - application
      - queue
    channels:
      - email
    labels:
      team: "application"
      component: "queue"

  # Infrastructure Alerts
  - name: "Network Latency High"
    description: "Network round-trip time exceeds threshold"
    condition: "network_rtt > 100"
    duration: "5m"
    severity: "warning"
    tags:
      - network
      - latency
    channels:
      - email
    labels:
      team: "network"
      component: "infrastructure"

  - name: "Load Balancer Unhealthy"
    description: "Load balancer backend servers are unhealthy"
    condition: "lb_unhealthy_backends > 0"
    duration: "2m"
    severity: "critical"
    tags:
      - loadbalancer
      - availability
    channels:
      - email
      - pagerduty
    labels:
      team: "infrastructure"
      component: "loadbalancer"

  # Security Alerts
  - name: "Failed Login Attempts"
    description: "Multiple failed authentication attempts detected"
    condition: "failed_login_attempts > 5"
    duration: "5m"
    severity: "warning"
    tags:
      - security
      - authentication
    channels:
      - email
      - slack
    labels:
      team: "security"
      component: "authentication"

  - name: "Suspicious Network Traffic"
    description: "Unusual network traffic patterns detected"
    condition: "network_bytes_unusual > 1000000"
    duration: "10m"
    severity: "warning"
    tags:
      - security
      - network
    channels:
      - email
    labels:
      team: "security"
      component: "network"

  # Log-based Alerts
  - name: "Application Errors"
    description: "High volume of application error logs"
    condition: "log_errors_per_minute > 10"
    duration: "2m"
    severity: "warning"
    tags:
      - logs
      - errors
    channels:
      - email
    labels:
      team: "application"
      component: "logs"

  - name: "Out of Memory Errors"
    description: "Out of memory errors detected in logs"
    condition: "log_oom_errors > 0"
    duration: "1m"
    severity: "critical"
    tags:
      - memory
      - errors
    channels:
      - email
      - pagerduty
    labels:
      team: "application"
      component: "memory"

  # Business Logic Alerts
  - name: "Low Business Transactions"
    description: "Business transaction volume below expected threshold"
    condition: "business_transactions_per_hour < 100"
    duration: "15m"
    severity: "warning"
    tags:
      - business
      - transactions
    channels:
      - email
    labels:
      team: "business"
      component: "transactions"

  - name: "Payment Failures"
    description: "Payment processing failure rate is high"
    condition: "payment_failure_rate > 0.05"
    duration: "5m"
    severity: "critical"
    tags:
      - payments
      - business
    channels:
      - email
      - pagerduty
    labels:
      team: "payments"
      component: "processing"

# Alert Channels Configuration
alert_channels:
  email:
    type: "email"
    recipients:
      - "platform-team@company.com"
      - "oncall@company.com"
    subject_template: "[{{severity}}] {{name}} - {{description}}"

  slack:
    type: "slack"
    # NOTE(review): placeholder value — inject the real webhook URL from a
    # secret store / environment at deploy time; do not commit it to VCS.
    webhook_url: "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK"
    channel: "#alerts"
    username: "Observability Bot"
    icon_emoji: ":warning:"

  pagerduty:
    type: "pagerduty"
    # NOTE(review): placeholder value — source the integration key from a
    # secret store; committing real keys to VCS is a security risk.
    integration_key: "your-pagerduty-integration-key"
    # Maps internal severity levels to PagerDuty event severities.
    severity_mapping:
      critical: "critical"
      warning: "warning"
      info: "info"

# Alert Silencing Rules
silence_rules:
  - name: "Maintenance Window"
    condition: "maintenance_window == true"
    duration: "4h"
    comment: "Silenced during scheduled maintenance"

  - name: "Known Issue"
    condition: "known_issue_id == 'TICKET-123'"
    duration: "24h"
    comment: "Silenced for known issue resolution"

# Escalation Policies
escalation_policies:
  - name: "Default Escalation"
    steps:
      - delay: "5m"
        channels: ["email"]
      - delay: "15m"
        channels: ["slack"]
      - delay: "30m"
        channels: ["pagerduty"]

  - name: "Critical Escalation"
    steps:
      - delay: "0m"
        channels: ["email", "slack", "pagerduty"]
      - delay: "10m"
        channels: ["pagerduty"]

# Escalation