{
  "patterns": [
    {
      "id": "disk_full",
      "name": "Disk full",
      "severity": "CRITICAL",
      "regex": "No space left on device|disk full|filesystem full",
      "category": "storage",
      "runbook": "infra-run/scripts/bash/disk-full/README.md",
      "description": "Filesystem or application failed because free space was exhausted."
    },
    {
      "id": "inode_exhaustion",
      "name": "Inode exhaustion",
      "severity": "CRITICAL",
      "regex": "No space left on device.*inode|inode.*exhaust|free inodes.*\\b0\\b",
      "category": "storage",
      "runbook": "infra-run/scripts/bash/disk-full/README.md",
      "description": "Filesystem may have free blocks but too few available inodes."
    },
    {
      "id": "read_only_filesystem",
      "name": "Read-only filesystem",
      "severity": "CRITICAL",
      "regex": "read-only file system|read-only filesystem|Remounting filesystem read-only",
      "category": "storage",
      "runbook": "infra-run/runbooks/incidents/read-only-filesystem.md",
      "description": "Filesystem writes failed because the mount was read-only or remounted read-only."
    },
    {
      "id": "io_error",
      "name": "I/O error",
      "severity": "CRITICAL",
      "regex": "\\bI/O error\\b|Buffer I/O error|blk_update_request.*I/O error",
      "category": "storage",
      "runbook": "infra-run/runbooks/incidents/storage-io-error.md",
      "description": "Kernel or application reported storage I/O errors that require device and filesystem review."
    },
    {
      "id": "out_of_memory",
      "name": "Out of memory",
      "severity": "CRITICAL",
      "regex": "\\bout of memory\\b|Cannot allocate memory",
      "category": "memory",
      "runbook": "infra-run/runbooks/incidents/memory-pressure.md",
      "description": "Process or host reported memory exhaustion symptoms."
    },
    {
      "id": "oom_killer",
      "name": "OOM killer invoked",
      "severity": "CRITICAL",
      "regex": "oom-killer|Killed process \\d+|Out of memory: Killed process",
      "category": "memory",
      "runbook": "infra-run/runbooks/incidents/oom-killer.md",
      "description": "Kernel OOM killer activity was logged and affected processes should be reviewed."
    },
    {
      "id": "segmentation_fault",
      "name": "Segmentation fault",
      "severity": "CRITICAL",
      "regex": "segmentation fault|segfault",
      "category": "process",
      "runbook": "infra-run/runbooks/incidents/process-crash.md",
      "description": "A process crash pattern was logged."
    },
    {
      "id": "connection_refused",
      "name": "Connection refused",
      "severity": "WARNING",
      "regex": "connection refused|ConnectException: Connection refused",
      "category": "network",
      "runbook": "infra-run/scripts/bash/os-healthcheck/README.md",
      "description": "Client connection attempts were refused by the destination service or host."
    },
    {
      "id": "connection_reset",
      "name": "Connection reset",
      "severity": "WARNING",
      "regex": "connection reset|Connection reset by peer",
      "category": "network",
      "runbook": "infra-run/scripts/bash/os-healthcheck/README.md",
      "description": "Established network connections were reset and require endpoint review."
    },
    {
      "id": "timeout",
      "name": "Timeout",
      "severity": "WARNING",
      "regex": "\\btimeout\\b|timed out|TimeoutException|SocketTimeoutException",
      "category": "network",
      "runbook": "infra-run/scripts/bash/os-healthcheck/README.md",
      "description": "Operation timed out and may require network, service, or dependency review."
    },
    {
      "id": "dns_resolution_failure",
      "name": "DNS resolution failure",
      "severity": "WARNING",
      "regex": "Temporary failure in name resolution|Name or service not known|NXDOMAIN|UnknownHostException|could not resolve host",
      "category": "network",
      "runbook": "infra-run/runbooks/incidents/dns-resolution.md",
      "description": "Name resolution failed for a host or service dependency."
    },
    {
      "id": "certificate_expired",
      "name": "Certificate expired",
      "severity": "CRITICAL",
      "regex": "certificate expired|CertificateExpiredException|certificate has expired",
      "category": "tls",
      "runbook": "infra-run/runbooks/incidents/certificate-expired.md",
      "description": "TLS certificate expiry was logged and certificate state should be reviewed."
    },
    {
      "id": "tls_handshake_failed",
      "name": "TLS handshake failed",
      "severity": "WARNING",
      "regex": "TLS handshake failed|SSL handshake failed|handshake_failure",
      "category": "tls",
      "runbook": "infra-run/runbooks/incidents/tls-handshake.md",
      "description": "TLS handshake failed and may require certificate, protocol, or trust-store review."
    },
    {
      "id": "authentication_failure",
      "name": "Authentication failure",
      "severity": "WARNING",
      "regex": "authentication failure|Failed password|authentication failed",
      "category": "security",
      "runbook": "infra-run/scripts/python/auth-log-audit/README.md",
      "description": "Authentication failures were logged and may require access review."
    },
    {
      "id": "permission_denied",
      "name": "Permission denied",
      "severity": "WARNING",
      "regex": "permission denied|access denied|denied by policy",
      "category": "security",
      "runbook": "infra-run/runbooks/incidents/permission-denied.md",
      "description": "Access or permission denial was logged."
    },
    {
      "id": "invalid_user",
      "name": "Invalid user",
      "severity": "WARNING",
      "regex": "Invalid user|invalid user|user unknown|User not known",
      "category": "security",
      "runbook": "infra-run/scripts/python/auth-log-audit/README.md",
      "description": "Log contains attempts involving invalid or unknown users."
    },
    {
      "id": "java_out_of_memory",
      "name": "Java OutOfMemoryError",
      "severity": "CRITICAL",
      "regex": "OutOfMemoryError|Java heap space|GC overhead limit exceeded",
      "category": "application_jvm",
      "runbook": "infra-run/scripts/python/jvm-log-analyzer/README.md",
      "description": "Java process logged memory exhaustion symptoms."
    },
    {
      "id": "ssl_handshake_exception",
      "name": "SSLHandshakeException",
      "severity": "CRITICAL",
      "regex": "SSLHandshakeException",
      "category": "application_jvm",
      "runbook": "infra-run/scripts/python/jvm-log-analyzer/README.md",
      "description": "Java TLS handshake exception was logged."
    },
    {
      "id": "database_unavailable",
      "name": "Database unavailable",
      "severity": "CRITICAL",
      "regex": "database unavailable|database is unavailable|SQLRecoverableException|CommunicationsException|connection pool exhausted",
      "category": "application",
      "runbook": "infra-run/scripts/python/jvm-log-analyzer/README.md",
      "description": "Application logged unavailable database or database connectivity symptoms."
    },
    {
      "id": "http_500",
      "name": "HTTP 500",
      "severity": "CRITICAL",
      "regex": "\\bHTTP\\s+500\\b|\\bstatus=500\\b|\\s500\\s",
      "category": "application",
      "runbook": "infra-run/runbooks/incidents/http-5xx.md",
      "description": "Application or proxy logged HTTP 500 responses."
    },
    {
      "id": "http_503",
      "name": "HTTP 503",
      "severity": "CRITICAL",
      "regex": "\\bHTTP\\s+503\\b|\\bstatus=503\\b|\\s503\\s|Service Unavailable",
      "category": "application",
      "runbook": "infra-run/runbooks/incidents/http-5xx.md",
      "description": "Application or proxy logged HTTP 503 service unavailable responses."
    },
    {
      "id": "service_failed",
      "name": "Systemd service failed",
      "severity": "CRITICAL",
      "regex": "Failed to start .*\\.service|entered failed state|Unit .*\\.service failed|Main process exited.*status=",
      "category": "systemd",
      "runbook": "infra-run/scripts/python/journal-analyzer/README.md",
      "description": "Systemd logged a failed service or failed service start."
    },
    {
      "id": "dependency_failed",
      "name": "Systemd dependency failed",
      "severity": "CRITICAL",
      "regex": "Dependency failed for|dependency failed",
      "category": "systemd",
      "runbook": "infra-run/scripts/python/journal-analyzer/README.md",
      "description": "Systemd logged a unit dependency failure."
    },
    {
      "id": "start_request_repeated",
      "name": "Start request repeated too quickly",
      "severity": "WARNING",
      "regex": "Start request repeated too quickly|start request repeated too quickly",
      "category": "systemd",
      "runbook": "infra-run/scripts/python/journal-analyzer/README.md",
      "description": "Systemd throttled service restarts after repeated start failures."
    }
  ]
}