Add known error matcher tool

This commit is contained in:
Mateusz Suski
2026-05-11 17:06:46 +00:00
parent 5fc96348c5
commit 1636f46f81
6 changed files with 1096 additions and 0 deletions
@@ -0,0 +1,220 @@
{
"patterns": [
{
"id": "disk_full",
"name": "Disk full",
"severity": "CRITICAL",
"regex": "No space left on device|disk full|filesystem full",
"category": "storage",
"runbook": "infra-run/scripts/bash/disk-full/README.md",
"description": "Filesystem or application failed because free space was exhausted."
},
{
"id": "inode_exhaustion",
"name": "Inode exhaustion",
"severity": "CRITICAL",
"regex": "No space left on device.*inode|inode.*exhaust|free inodes.*0",
"category": "storage",
"runbook": "infra-run/scripts/bash/disk-full/README.md",
"description": "Filesystem ran out of available inodes, which can happen even when free blocks remain."
},
{
"id": "read_only_filesystem",
"name": "Read-only filesystem",
"severity": "CRITICAL",
"regex": "read-only file system|read-only filesystem|Remounting filesystem read-only",
"category": "storage",
"runbook": "infra-run/runbooks/incidents/read-only-filesystem.md",
"description": "Filesystem writes failed because the mount was read-only or remounted read-only."
},
{
"id": "io_error",
"name": "I/O error",
"severity": "CRITICAL",
"regex": "\\bI/O error\\b|Buffer I/O error|blk_update_request.*I/O error",
"category": "storage",
"runbook": "infra-run/runbooks/incidents/storage-io-error.md",
"description": "Kernel or application reported storage I/O errors that require device and filesystem review."
},
{
"id": "out_of_memory",
"name": "Out of memory",
"severity": "CRITICAL",
"regex": "\\bout of memory\\b|Cannot allocate memory",
"category": "memory",
"runbook": "infra-run/runbooks/incidents/memory-pressure.md",
"description": "Process or host reported memory exhaustion symptoms."
},
{
"id": "oom_killer",
"name": "OOM killer invoked",
"severity": "CRITICAL",
"regex": "oom-killer|Killed process \\d+|Out of memory: Killed process",
"category": "memory",
"runbook": "infra-run/runbooks/incidents/oom-killer.md",
"description": "Kernel OOM killer activity was logged and affected processes should be reviewed."
},
{
"id": "segmentation_fault",
"name": "Segmentation fault",
"severity": "CRITICAL",
"regex": "segmentation fault|segfault",
"category": "process",
"runbook": "infra-run/runbooks/incidents/process-crash.md",
"description": "A process logged a segmentation fault (a crash caused by invalid memory access)."
},
{
"id": "connection_refused",
"name": "Connection refused",
"severity": "WARNING",
"regex": "connection refused|ConnectException: Connection refused",
"category": "network",
"runbook": "infra-run/scripts/bash/os-healthcheck/README.md",
"description": "Client connection attempts were refused by the destination service or host."
},
{
"id": "connection_reset",
"name": "Connection reset",
"severity": "WARNING",
"regex": "connection reset|Connection reset by peer",
"category": "network",
"runbook": "infra-run/scripts/bash/os-healthcheck/README.md",
"description": "Established network connections were reset and require endpoint review."
},
{
"id": "timeout",
"name": "Timeout",
"severity": "WARNING",
"regex": "\\btimeout\\b|timed out|TimeoutException|SocketTimeoutException",
"category": "network",
"runbook": "infra-run/scripts/bash/os-healthcheck/README.md",
"description": "Operation timed out and may require network, service, or dependency review."
},
{
"id": "dns_resolution_failure",
"name": "DNS resolution failure",
"severity": "WARNING",
"regex": "Temporary failure in name resolution|Name or service not known|NXDOMAIN|UnknownHostException|could not resolve host",
"category": "network",
"runbook": "infra-run/runbooks/incidents/dns-resolution.md",
"description": "Name resolution failed for a host or service dependency."
},
{
"id": "certificate_expired",
"name": "Certificate expired",
"severity": "CRITICAL",
"regex": "certificate expired|CertificateExpiredException|certificate has expired|notAfter",
"category": "tls",
"runbook": "infra-run/runbooks/incidents/certificate-expired.md",
"description": "TLS certificate expiry was logged and certificate state should be reviewed."
},
{
"id": "tls_handshake_failed",
"name": "TLS handshake failed",
"severity": "WARNING",
"regex": "TLS handshake failed|SSL handshake failed|handshake_failure",
"category": "tls",
"runbook": "infra-run/runbooks/incidents/tls-handshake.md",
"description": "TLS handshake failed and may require certificate, protocol, or trust-store review."
},
{
"id": "authentication_failure",
"name": "Authentication failure",
"severity": "WARNING",
"regex": "authentication failure|Failed password|authentication failed",
"category": "security",
"runbook": "infra-run/scripts/python/auth-log-audit/README.md",
"description": "Authentication failures were logged and may require access review."
},
{
"id": "permission_denied",
"name": "Permission denied",
"severity": "WARNING",
"regex": "permission denied|access denied|denied by policy",
"category": "security",
"runbook": "infra-run/runbooks/incidents/permission-denied.md",
"description": "Access or permission denial was logged."
},
{
"id": "invalid_user",
"name": "Invalid user",
"severity": "WARNING",
"regex": "Invalid user|invalid user|user unknown|User not known",
"category": "security",
"runbook": "infra-run/scripts/python/auth-log-audit/README.md",
"description": "Attempts involving invalid or unknown users were logged."
},
{
"id": "java_out_of_memory",
"name": "Java OutOfMemoryError",
"severity": "CRITICAL",
"regex": "OutOfMemoryError|Java heap space|GC overhead limit exceeded",
"category": "application_jvm",
"runbook": "infra-run/scripts/python/jvm-log-analyzer/README.md",
"description": "Java process logged memory exhaustion symptoms."
},
{
"id": "ssl_handshake_exception",
"name": "SSLHandshakeException",
"severity": "CRITICAL",
"regex": "SSLHandshakeException|javax\\.net\\.ssl\\.SSLHandshakeException",
"category": "application_jvm",
"runbook": "infra-run/scripts/python/jvm-log-analyzer/README.md",
"description": "Java TLS handshake exception was logged."
},
{
"id": "database_unavailable",
"name": "Database unavailable",
"severity": "CRITICAL",
"regex": "database unavailable|database is unavailable|SQLRecoverableException|CommunicationsException|connection pool exhausted",
"category": "application",
"runbook": "infra-run/scripts/python/jvm-log-analyzer/README.md",
"description": "Application logged unavailable database or database connectivity symptoms."
},
{
"id": "http_500",
"name": "HTTP 500",
"severity": "CRITICAL",
"regex": "\\bHTTP\\s+500\\b|\\bstatus=500\\b|\\s500\\s",
"category": "application",
"runbook": "infra-run/runbooks/incidents/http-5xx.md",
"description": "Application or proxy logged HTTP 500 responses."
},
{
"id": "http_503",
"name": "HTTP 503",
"severity": "CRITICAL",
"regex": "\\bHTTP\\s+503\\b|\\bstatus=503\\b|\\s503\\s|Service Unavailable",
"category": "application",
"runbook": "infra-run/runbooks/incidents/http-5xx.md",
"description": "Application or proxy logged HTTP 503 service unavailable responses."
},
{
"id": "service_failed",
"name": "Systemd service failed",
"severity": "CRITICAL",
"regex": "Failed to start .*\\.service|entered failed state|Unit .*\\.service failed|Main process exited.*status=",
"category": "systemd",
"runbook": "infra-run/scripts/python/journal-analyzer/README.md",
"description": "Systemd logged a failed service or failed service start."
},
{
"id": "dependency_failed",
"name": "Systemd dependency failed",
"severity": "CRITICAL",
"regex": "Dependency failed for|dependency failed",
"category": "systemd",
"runbook": "infra-run/scripts/python/journal-analyzer/README.md",
"description": "Systemd logged a unit dependency failure."
},
{
"id": "start_request_repeated",
"name": "Start request repeated too quickly",
"severity": "WARNING",
"regex": "Start request repeated too quickly|start request repeated too quickly",
"category": "systemd",
"runbook": "infra-run/scripts/python/journal-analyzer/README.md",
"description": "Systemd throttled service restarts after repeated start failures."
}
]
}