# NOTE(review): the lines below are web file-viewer chrome (repository UI,
# CI pipeline status, commit metadata) that was captured together with the
# file content. Preserved here as comments so the document parses as YAML;
# safe to delete once confirmed against the original repository file.
# Files
# portfolio/observability-stack/alerting/alert_rules.yml
# T
# Mateusz Suski 7757020014
# CI Pipeline / lint-ansible (push) Waiting to run
# CI Pipeline / test-python (push) Waiting to run
# CI Pipeline / validate-docker (push) Waiting to run
# CI Pipeline / security-scan (push) Waiting to run
# CI Pipeline / documentation (push) Waiting to run
# CI Pipeline / integration-test (push) Blocked by required conditions
# feat: Add comprehensive enterprise Linux infrastructure portfolio with Ansible, Python, and ELK stack
# 2026-04-29 23:14:14 +00:00
# 326 lines | 7.4 KiB | YAML
---
# Enterprise Observability Alert Rules
# Alert definitions for automated incident detection and notification.
#
# Each rule defines:
#   name        - human-readable alert identifier
#   description - what the alert means
#   condition   - expression evaluated by the alerting engine
#   duration    - how long the condition must hold before firing
#   severity    - "critical" or "warning"
#   tags        - free-form categorization tags
#   channels    - notification channels (see alert_channels)
#   labels      - routing labels (team / component)
alert_rules:
  # System Resource Alerts
  - name: "High CPU Usage"
    description: "CPU utilization exceeds threshold"
    condition: "cpu_usage_percent > 90"
    duration: "5m"
    severity: "critical"
    tags:
      - system
      - performance
    channels:
      - email
      - slack
    labels:
      team: "platform"
      component: "system"

  - name: "High Memory Usage"
    description: "Memory utilization exceeds threshold"
    condition: "memory_usage_percent > 85"
    duration: "3m"
    severity: "warning"
    tags:
      - system
      - memory
    channels:
      - email
    labels:
      team: "platform"
      component: "system"

  - name: "Disk Space Critical"
    description: "Disk usage exceeds critical threshold"
    condition: "disk_usage_percent > 95"
    duration: "2m"
    severity: "critical"
    tags:
      - storage
      - disk
    channels:
      - email
      - pagerduty
    labels:
      team: "platform"
      component: "storage"

  - name: "Disk Space Warning"
    description: "Disk usage exceeds warning threshold"
    condition: "disk_usage_percent > 85"
    duration: "10m"
    severity: "warning"
    tags:
      - storage
      - disk
    channels:
      - email
    labels:
      team: "platform"
      component: "storage"

  # Service Availability Alerts
  - name: "Service Down"
    description: "Critical service is not responding"
    condition: "service_status == 'down' OR http_status_code >= 500"
    duration: "2m"
    severity: "critical"
    tags:
      - service
      - availability
    channels:
      - email
      - slack
      - pagerduty
    labels:
      team: "application"
      component: "service"

  - name: "Database Connection Failed"
    description: "Database connection pool exhausted or unresponsive"
    condition: "db_connections_active == 0 OR db_response_time > 5000"
    duration: "1m"
    severity: "critical"
    tags:
      - database
      - connectivity
    channels:
      - email
      - pagerduty
    labels:
      team: "database"
      component: "postgresql"

  - name: "Cache Unavailable"
    description: "Cache service is down or unresponsive"
    condition: "cache_hit_ratio < 0.1 OR cache_response_time > 1000"
    duration: "3m"
    severity: "warning"
    tags:
      - cache
      - performance
    channels:
      - email
    labels:
      team: "infrastructure"
      component: "redis"

  # Application Performance Alerts
  - name: "High Error Rate"
    description: "Application error rate exceeds threshold"
    condition: "error_rate_percent > 5"
    duration: "5m"
    severity: "critical"
    tags:
      - application
      - errors
    channels:
      - email
      - slack
    labels:
      team: "application"
      component: "api"

  - name: "Slow Response Time"
    description: "API response time exceeds SLA"
    condition: "response_time_p95 > 2000"
    duration: "5m"
    severity: "warning"
    tags:
      - application
      - performance
    channels:
      - email
    labels:
      team: "application"
      component: "api"

  - name: "High Request Queue"
    description: "Request queue depth is too high"
    condition: "queue_depth > 100"
    duration: "3m"
    severity: "warning"
    tags:
      - application
      - queue
    channels:
      - email
    labels:
      team: "application"
      component: "queue"

  # Infrastructure Alerts
  - name: "Network Latency High"
    description: "Network round-trip time exceeds threshold"
    condition: "network_rtt > 100"
    duration: "5m"
    severity: "warning"
    tags:
      - network
      - latency
    channels:
      - email
    labels:
      team: "network"
      component: "infrastructure"

  - name: "Load Balancer Unhealthy"
    description: "Load balancer backend servers are unhealthy"
    condition: "lb_unhealthy_backends > 0"
    duration: "2m"
    severity: "critical"
    tags:
      - loadbalancer
      - availability
    channels:
      - email
      - pagerduty
    labels:
      team: "infrastructure"
      component: "loadbalancer"

  # Security Alerts
  - name: "Failed Login Attempts"
    description: "Multiple failed authentication attempts detected"
    condition: "failed_login_attempts > 5"
    duration: "5m"
    severity: "warning"
    tags:
      - security
      - authentication
    channels:
      - email
      - slack
    labels:
      team: "security"
      component: "authentication"

  - name: "Suspicious Network Traffic"
    description: "Unusual network traffic patterns detected"
    condition: "network_bytes_unusual > 1000000"
    duration: "10m"
    severity: "warning"
    tags:
      - security
      - network
    channels:
      - email
    labels:
      team: "security"
      component: "network"

  # Log-based Alerts
  - name: "Application Errors"
    description: "High volume of application error logs"
    condition: "log_errors_per_minute > 10"
    duration: "2m"
    severity: "warning"
    tags:
      - logs
      - errors
    channels:
      - email
    labels:
      team: "application"
      component: "logs"

  - name: "Out of Memory Errors"
    description: "Out of memory errors detected in logs"
    condition: "log_oom_errors > 0"
    duration: "1m"
    severity: "critical"
    tags:
      - memory
      - errors
    channels:
      - email
      - pagerduty
    labels:
      team: "application"
      component: "memory"

  # Business Logic Alerts
  - name: "Low Business Transactions"
    description: "Business transaction volume below expected threshold"
    condition: "business_transactions_per_hour < 100"
    duration: "15m"
    severity: "warning"
    tags:
      - business
      - transactions
    channels:
      - email
    labels:
      team: "business"
      component: "transactions"

  - name: "Payment Failures"
    description: "Payment processing failure rate is high"
    condition: "payment_failure_rate > 0.05"
    duration: "5m"
    severity: "critical"
    tags:
      - payments
      - business
    channels:
      - email
      - pagerduty
    labels:
      team: "payments"
      component: "processing"
# Alert Channels Configuration
# Defines how each notification channel referenced by alert_rules is
# delivered. Channel names here must match the entries used in each
# rule's `channels` list (email, slack, pagerduty).
alert_channels:
  email:
    type: "email"
    recipients:
      - "platform-team@company.com"
      - "oncall@company.com"
    # Template placeholders are filled from the firing rule's fields.
    subject_template: "[{{severity}}] {{name}} - {{description}}"
  slack:
    type: "slack"
    # Placeholder URL -- replace with the real incoming-webhook URL
    # (keep it out of version control; inject via secret management).
    webhook_url: "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK"
    channel: "#alerts"
    username: "Observability Bot"
    icon_emoji: ":warning:"
  pagerduty:
    type: "pagerduty"
    # Placeholder key -- inject the real integration key via secret management.
    integration_key: "your-pagerduty-integration-key"
    # Maps rule severity to the PagerDuty event severity field.
    severity_mapping:
      critical: "critical"
      warning: "warning"
      info: "info"
# Alert Silencing Rules
# Conditions under which matching alerts are suppressed for `duration`.
silence_rules:
  - name: "Maintenance Window"
    condition: "maintenance_window == true"
    duration: "4h"
    comment: "Silenced during scheduled maintenance"
  - name: "Known Issue"
    # Ticket-specific silence; remove once the referenced issue is resolved.
    condition: "known_issue_id == 'TICKET-123'"
    duration: "24h"
    comment: "Silenced for known issue resolution"
# Escalation Policies
# Ordered notification steps; `delay` is measured from when the alert fires,
# and each step notifies the listed channels if the alert is still active.
escalation_policies:
  - name: "Default Escalation"
    steps:
      - delay: "5m"
        channels: ["email"]
      - delay: "15m"
        channels: ["slack"]
      - delay: "30m"
        channels: ["pagerduty"]
  - name: "Critical Escalation"
    steps:
      - delay: "0m"
        channels: ["email", "slack", "pagerduty"]
      - delay: "10m"
        channels: ["pagerduty"]  # Escalation