# NOTE(review): the lines below are web file-viewer chrome (repository UI,
# CI pipeline status, commit metadata) that was captured together with the
# file content. Preserved here as comments so the document parses as YAML;
# safe to delete once confirmed against the original repository file.
# Files
# portfolio/observability-stack/alerting/alert_rules.yml
# T
# Mateusz Suski 7757020014
# CI Pipeline / lint-ansible (push) Waiting to run
# CI Pipeline / test-python (push) Waiting to run
# CI Pipeline / validate-docker (push) Waiting to run
# CI Pipeline / security-scan (push) Waiting to run
# CI Pipeline / documentation (push) Waiting to run
# CI Pipeline / integration-test (push) Blocked by required conditions
# feat: Add comprehensive enterprise Linux infrastructure portfolio with Ansible, Python, and ELK stack
# 2026-04-29 23:14:14 +00:00
# 326 lines | 7.4 KiB | YAML
---
# Enterprise Observability Alert Rules
# Alert definitions for automated incident detection and notification.
#
# Each rule defines:
#   name        - human-readable alert identifier
#   description - what the alert means
#   condition   - expression evaluated by the alerting engine
#   duration    - how long the condition must hold before firing
#   severity    - "critical" or "warning"
#   tags        - free-form categorization tags
#   channels    - notification channels (see alert_channels)
#   labels      - routing labels (team / component)
alert_rules:
  # System Resource Alerts
  - name: "High CPU Usage"
    description: "CPU utilization exceeds threshold"
    condition: "cpu_usage_percent > 90"
    duration: "5m"
    severity: "critical"
    tags:
      - system
      - performance
    channels:
      - email
      - slack
    labels:
      team: "platform"
      component: "system"

  - name: "High Memory Usage"
    description: "Memory utilization exceeds threshold"
    condition: "memory_usage_percent > 85"
    duration: "3m"
    severity: "warning"
    tags:
      - system
      - memory
    channels:
      - email
    labels:
      team: "platform"
      component: "system"

  - name: "Disk Space Critical"
    description: "Disk usage exceeds critical threshold"
    condition: "disk_usage_percent > 95"
    duration: "2m"
    severity: "critical"
    tags:
      - storage
      - disk
    channels:
      - email
      - pagerduty
    labels:
      team: "platform"
      component: "storage"

  - name: "Disk Space Warning"
    description: "Disk usage exceeds warning threshold"
    condition: "disk_usage_percent > 85"
    duration: "10m"
    severity: "warning"
    tags:
      - storage
      - disk
    channels:
      - email
    labels:
      team: "platform"
      component: "storage"

  # Service Availability Alerts
  - name: "Service Down"
    description: "Critical service is not responding"
    condition: "service_status == 'down' OR http_status_code >= 500"
    duration: "2m"
    severity: "critical"
    tags:
      - service
      - availability
    channels:
      - email
      - slack
      - pagerduty
    labels:
      team: "application"
      component: "service"

  - name: "Database Connection Failed"
    description: "Database connection pool exhausted or unresponsive"
    condition: "db_connections_active == 0 OR db_response_time > 5000"
    duration: "1m"
    severity: "critical"
    tags:
      - database
      - connectivity
    channels:
      - email
      - pagerduty
    labels:
      team: "database"
      component: "postgresql"

  - name: "Cache Unavailable"
    description: "Cache service is down or unresponsive"
    condition: "cache_hit_ratio < 0.1 OR cache_response_time > 1000"
    duration: "3m"
    severity: "warning"
    tags:
      - cache
      - performance
    channels:
      - email
    labels:
      team: "infrastructure"
      component: "redis"

  # Application Performance Alerts
  - name: "High Error Rate"
    description: "Application error rate exceeds threshold"
    condition: "error_rate_percent > 5"
    duration: "5m"
    severity: "critical"
    tags:
      - application
      - errors
    channels:
      - email
      - slack
    labels:
      team: "application"
      component: "api"

  - name: "Slow Response Time"
    description: "API response time exceeds SLA"
    condition: "response_time_p95 > 2000"
    duration: "5m"
    severity: "warning"
    tags:
      - application
      - performance
    channels:
      - email
    labels:
      team: "application"
      component: "api"

  - name: "High Request Queue"
    description: "Request queue depth is too high"
    condition: "queue_depth > 100"
    duration: "3m"
    severity: "warning"
    tags:
      - application
      - queue
    channels:
      - email
    labels:
      team: "application"
      component: "queue"

  # Infrastructure Alerts
  - name: "Network Latency High"
    description: "Network round-trip time exceeds threshold"
    condition: "network_rtt > 100"
    duration: "5m"
    severity: "warning"
    tags:
      - network
      - latency
    channels:
      - email
    labels:
      team: "network"
      component: "infrastructure"

  - name: "Load Balancer Unhealthy"
    description: "Load balancer backend servers are unhealthy"
    condition: "lb_unhealthy_backends > 0"
    duration: "2m"
    severity: "critical"
    tags:
      - loadbalancer
      - availability
    channels:
      - email
      - pagerduty
    labels:
      team: "infrastructure"
      component: "loadbalancer"

  # Security Alerts
  - name: "Failed Login Attempts"
    description: "Multiple failed authentication attempts detected"
    condition: "failed_login_attempts > 5"
    duration: "5m"
    severity: "warning"
    tags:
      - security
      - authentication
    channels:
      - email
      - slack
    labels:
      team: "security"
      component: "authentication"

  - name: "Suspicious Network Traffic"
    description: "Unusual network traffic patterns detected"
    condition: "network_bytes_unusual > 1000000"
    duration: "10m"
    severity: "warning"
    tags:
      - security
      - network
    channels:
      - email
    labels:
      team: "security"
      component: "network"

  # Log-based Alerts
  - name: "Application Errors"
    description: "High volume of application error logs"
    condition: "log_errors_per_minute > 10"
    duration: "2m"
    severity: "warning"
    tags:
      - logs
      - errors
    channels:
      - email
    labels:
      team: "application"
      component: "logs"

  - name: "Out of Memory Errors"
    description: "Out of memory errors detected in logs"
    condition: "log_oom_errors > 0"
    duration: "1m"
    severity: "critical"
    tags:
      - memory
      - errors
    channels:
      - email
      - pagerduty
    labels:
      team: "application"
      component: "memory"

  # Business Logic Alerts
  - name: "Low Business Transactions"
    description: "Business transaction volume below expected threshold"
    condition: "business_transactions_per_hour < 100"
    duration: "15m"
    severity: "warning"
    tags:
      - business
      - transactions
    channels:
      - email
    labels:
      team: "business"
      component: "transactions"

  - name: "Payment Failures"
    description: "Payment processing failure rate is high"
    condition: "payment_failure_rate > 0.05"
    duration: "5m"
    severity: "critical"
    tags:
      - payments
      - business
    channels:
      - email
      - pagerduty
    labels:
      team: "payments"
      component: "processing"
# Alert Channels Configuration
# Defines how each notification channel referenced by alert_rules is
# delivered. Channel names here must match the entries used in each
# rule's `channels` list (email, slack, pagerduty).
alert_channels:
  email:
    type: "email"
    recipients:
      - "platform-team@company.com"
      - "oncall@company.com"
    # Template placeholders are filled from the firing rule's fields.
    subject_template: "[{{severity}}] {{name}} - {{description}}"
  slack:
    type: "slack"
    # Placeholder URL -- replace with the real incoming-webhook URL
    # (keep it out of version control; inject via secret management).
    webhook_url: "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK"
    channel: "#alerts"
    username: "Observability Bot"
    icon_emoji: ":warning:"
  pagerduty:
    type: "pagerduty"
    # Placeholder key -- inject the real integration key via secret management.
    integration_key: "your-pagerduty-integration-key"
    # Maps rule severity to the PagerDuty event severity field.
    severity_mapping:
      critical: "critical"
      warning: "warning"
      info: "info"
# Alert Silencing Rules
# Conditions under which matching alerts are suppressed for `duration`.
silence_rules:
  - name: "Maintenance Window"
    condition: "maintenance_window == true"
    duration: "4h"
    comment: "Silenced during scheduled maintenance"
  - name: "Known Issue"
    # Ticket-specific silence; remove once the referenced issue is resolved.
    condition: "known_issue_id == 'TICKET-123'"
    duration: "24h"
    comment: "Silenced for known issue resolution"
# Escalation Policies
# Ordered notification steps; `delay` is measured from when the alert fires,
# and each step notifies the listed channels if the alert is still active.
escalation_policies:
  - name: "Default Escalation"
    steps:
      - delay: "5m"
        channels: ["email"]
      - delay: "15m"
        channels: ["slack"]
      - delay: "30m"
        channels: ["pagerduty"]
  - name: "Critical Escalation"
    steps:
      - delay: "0m"
        channels: ["email", "slack", "pagerduty"]
      - delay: "10m"
        channels: ["pagerduty"]  # Escalation