{
  "patterns": [
    {
      "id": "disk_full",
      "name": "Disk full",
      "severity": "CRITICAL",
      "regex": "No space left on device|disk full|filesystem full",
      "category": "storage",
      "runbook": "infra-run/scripts/bash/disk-full/README.md",
      "description": "Filesystem or application failed because free space was exhausted."
    },
    {
      "id": "inode_exhaustion",
      "name": "Inode exhaustion",
      "severity": "CRITICAL",
      "regex": "No space left on device.*inode|inode.*exhaust|free inodes.*0",
      "category": "storage",
      "runbook": "infra-run/scripts/bash/disk-full/README.md",
      "description": "Filesystem may have free blocks but too few available inodes."
    },
    {
      "id": "read_only_filesystem",
      "name": "Read-only filesystem",
      "severity": "CRITICAL",
      "regex": "read-only file system|read-only filesystem|Remounting filesystem read-only",
      "category": "storage",
      "runbook": "infra-run/runbooks/incidents/read-only-filesystem.md",
      "description": "Filesystem writes failed because the mount was read-only or remounted read-only."
    },
    {
      "id": "io_error",
      "name": "I/O error",
      "severity": "CRITICAL",
      "regex": "\\bI/O error\\b|Buffer I/O error|blk_update_request.*I/O error",
      "category": "storage",
      "runbook": "infra-run/runbooks/incidents/storage-io-error.md",
      "description": "Kernel or application reported storage I/O errors that require device and filesystem review."
    },
    {
      "id": "out_of_memory",
      "name": "Out of memory",
      "severity": "CRITICAL",
      "regex": "\\bout of memory\\b|Cannot allocate memory",
      "category": "memory",
      "runbook": "infra-run/runbooks/incidents/memory-pressure.md",
      "description": "Process or host reported memory exhaustion symptoms."
    },
    {
      "id": "oom_killer",
      "name": "OOM killer invoked",
      "severity": "CRITICAL",
      "regex": "oom-killer|Killed process \\d+|Out of memory: Killed process",
      "category": "memory",
      "runbook": "infra-run/runbooks/incidents/oom-killer.md",
      "description": "Kernel OOM killer activity was logged and affected processes should be reviewed."
    },
    {
      "id": "segmentation_fault",
      "name": "Segmentation fault",
      "severity": "CRITICAL",
      "regex": "segmentation fault|segfault",
      "category": "process",
      "runbook": "infra-run/runbooks/incidents/process-crash.md",
      "description": "A process crash pattern was logged."
    },
    {
      "id": "connection_refused",
      "name": "Connection refused",
      "severity": "WARNING",
      "regex": "connection refused|ConnectException: Connection refused",
      "category": "network",
      "runbook": "infra-run/scripts/bash/os-healthcheck/README.md",
      "description": "Client connection attempts were refused by the destination service or host."
    },
    {
      "id": "connection_reset",
      "name": "Connection reset",
      "severity": "WARNING",
      "regex": "connection reset|Connection reset by peer",
      "category": "network",
      "runbook": "infra-run/scripts/bash/os-healthcheck/README.md",
      "description": "Established network connections were reset and require endpoint review."
    },
    {
      "id": "timeout",
      "name": "Timeout",
      "severity": "WARNING",
      "regex": "\\btimeout\\b|timed out|TimeoutException|SocketTimeoutException",
      "category": "network",
      "runbook": "infra-run/scripts/bash/os-healthcheck/README.md",
      "description": "Operation timed out and may require network, service, or dependency review."
    },
    {
      "id": "dns_resolution_failure",
      "name": "DNS resolution failure",
      "severity": "WARNING",
      "regex": "Temporary failure in name resolution|Name or service not known|NXDOMAIN|UnknownHostException|could not resolve host",
      "category": "network",
      "runbook": "infra-run/runbooks/incidents/dns-resolution.md",
      "description": "Name resolution failed for a host or service dependency."
    },
    {
      "id": "certificate_expired",
      "name": "Certificate expired",
      "severity": "CRITICAL",
      "regex": "certificate expired|CertificateExpiredException|certificate has expired|notAfter",
      "category": "tls",
      "runbook": "infra-run/runbooks/incidents/certificate-expired.md",
      "description": "TLS certificate expiry was logged and certificate state should be reviewed."
    },
    {
      "id": "tls_handshake_failed",
      "name": "TLS handshake failed",
      "severity": "WARNING",
      "regex": "TLS handshake failed|SSL handshake failed|handshake_failure",
      "category": "tls",
      "runbook": "infra-run/runbooks/incidents/tls-handshake.md",
      "description": "TLS handshake failed and may require certificate, protocol, or trust-store review."
    },
    {
      "id": "authentication_failure",
      "name": "Authentication failure",
      "severity": "WARNING",
      "regex": "authentication failure|Failed password|authentication failed",
      "category": "security",
      "runbook": "infra-run/scripts/python/auth-log-audit/README.md",
      "description": "Authentication failures were logged and may require access review."
    },
    {
      "id": "permission_denied",
      "name": "Permission denied",
      "severity": "WARNING",
      "regex": "permission denied|access denied|denied by policy",
      "category": "security",
      "runbook": "infra-run/runbooks/incidents/permission-denied.md",
      "description": "Access or permission denial was logged."
    },
    {
      "id": "invalid_user",
      "name": "Invalid user",
      "severity": "WARNING",
      "regex": "Invalid user|invalid user|user unknown|User not known",
      "category": "security",
      "runbook": "infra-run/scripts/python/auth-log-audit/README.md",
      "description": "Log contains attempts involving invalid or unknown users."
    },
    {
      "id": "java_out_of_memory",
      "name": "Java OutOfMemoryError",
      "severity": "CRITICAL",
      "regex": "OutOfMemoryError|Java heap space|GC overhead limit exceeded",
      "category": "application_jvm",
      "runbook": "infra-run/scripts/python/jvm-log-analyzer/README.md",
      "description": "Java process logged memory exhaustion symptoms."
    },
    {
      "id": "ssl_handshake_exception",
      "name": "SSLHandshakeException",
      "severity": "CRITICAL",
      "regex": "SSLHandshakeException|javax\\.net\\.ssl\\.SSLHandshakeException",
      "category": "application_jvm",
      "runbook": "infra-run/scripts/python/jvm-log-analyzer/README.md",
      "description": "Java TLS handshake exception was logged."
    },
    {
      "id": "database_unavailable",
      "name": "Database unavailable",
      "severity": "CRITICAL",
      "regex": "database unavailable|database is unavailable|SQLRecoverableException|CommunicationsException|connection pool exhausted",
      "category": "application",
      "runbook": "infra-run/scripts/python/jvm-log-analyzer/README.md",
      "description": "Application logged unavailable database or database connectivity symptoms."
    },
    {
      "id": "http_500",
      "name": "HTTP 500",
      "severity": "CRITICAL",
      "regex": "\\bHTTP\\s+500\\b|\\bstatus=500\\b|\\s500\\s",
      "category": "application",
      "runbook": "infra-run/runbooks/incidents/http-5xx.md",
      "description": "Application or proxy logged HTTP 500 responses."
    },
    {
      "id": "http_503",
      "name": "HTTP 503",
      "severity": "CRITICAL",
      "regex": "\\bHTTP\\s+503\\b|\\bstatus=503\\b|\\s503\\s|Service Unavailable",
      "category": "application",
      "runbook": "infra-run/runbooks/incidents/http-5xx.md",
      "description": "Application or proxy logged HTTP 503 service unavailable responses."
    },
    {
      "id": "service_failed",
      "name": "Systemd service failed",
      "severity": "CRITICAL",
      "regex": "Failed to start .*\\.service|entered failed state|Unit .*\\.service failed|Main process exited.*status=",
      "category": "systemd",
      "runbook": "infra-run/scripts/python/journal-analyzer/README.md",
      "description": "Systemd logged a failed service or failed service start."
    },
    {
      "id": "dependency_failed",
      "name": "Systemd dependency failed",
      "severity": "CRITICAL",
      "regex": "Dependency failed for|dependency failed",
      "category": "systemd",
      "runbook": "infra-run/scripts/python/journal-analyzer/README.md",
      "description": "Systemd logged a unit dependency failure."
    },
    {
      "id": "start_request_repeated",
      "name": "Start request repeated too quickly",
      "severity": "WARNING",
      "regex": "Start request repeated too quickly|start request repeated too quickly",
      "category": "systemd",
      "runbook": "infra-run/scripts/python/journal-analyzer/README.md",
      "description": "Systemd throttled service restarts after repeated start failures."
    }
  ]
}