Files
Mateusz Suski e851568c8c
lint / shell-yaml-ansible (push) Failing after 16s
Add standalone Bash incident check scripts
2026-05-11 18:49:00 +00:00

112 lines
3.9 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# check_service_restart_loop.sh - read-only restart-loop check for one
# systemd service.
#
# Exit status used by this script:
#   0  OK
#   1  WARNING
#   2  invalid usage or a required command is missing
#   3  CRITICAL

# Strict mode: abort on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -euo pipefail

# Option defaults; each can be overridden from the command line.
service_name=""            # --service (mandatory)
since_value="1 hour ago"   # --since
warning_count=3            # --warning
critical_count=10          # --critical
#######################################
# Print the command-line synopsis and a one-line description.
# Arguments: none
# Outputs:   two lines of help text on stdout
#######################################
usage() {
  printf '%s\n' \
    'Usage: check_service_restart_loop.sh --service SERVICE_NAME [--since TEXT] [--warning COUNT] [--critical COUNT] [--help]' \
    'Detect restart-loop evidence for a systemd service. Read-only.'
}
#######################################
# Test whether a string consists solely of one or more decimal digits.
# Arguments: $1 - string to test
# Returns:   0 if $1 is all digits, 1 otherwise (including the empty string)
#######################################
is_number() {
  local candidate="$1"
  local -r digits_only='^[0-9]+$'
  if [[ "$candidate" =~ $digits_only ]]; then
    return 0
  fi
  return 1
}
#######################################
# Ensure a command is available; terminate the script otherwise.
# Arguments: $1 - command name to look up with `command -v`
# Outputs:   a CRITICAL line on stdout when the command is missing
# Returns:   0 when the command exists; exits the shell with status 2 if not
#######################################
require_cmd() {
  local tool="$1"
  command -v "$tool" >/dev/null 2>&1 && return 0
  printf 'CRITICAL: required command not found: %s\n' "$tool"
  exit 2
}
# Consume the command line one option at a time. Every value-taking option
# must be followed by an argument; a missing value, like an unrecognized
# option, ends the script with exit status 2.
while (($# > 0)); do
  case "$1" in
    --service)
      if (($# < 2)); then
        printf 'CRITICAL: --service requires a value\n'
        exit 2
      fi
      service_name="$2"
      shift 2
      ;;
    --since)
      if (($# < 2)); then
        printf 'CRITICAL: --since requires a value\n'
        exit 2
      fi
      since_value="$2"
      shift 2
      ;;
    --warning)
      if (($# < 2)); then
        printf 'CRITICAL: --warning requires a value\n'
        exit 2
      fi
      warning_count="$2"
      shift 2
      ;;
    --critical)
      if (($# < 2)); then
        printf 'CRITICAL: --critical requires a value\n'
        exit 2
      fi
      critical_count="$2"
      shift 2
      ;;
    --help | -h)
      usage
      exit 0
      ;;
    *)
      printf 'CRITICAL: unknown option: %s\n' "$1"
      usage
      exit 2
      ;;
  esac
done
# The target service is mandatory: without it, print the problem followed by
# the usage text and stop with exit status 2.
[[ -n "$service_name" ]] || {
  printf 'CRITICAL: --service is required\n'
  usage
  exit 2
}
# Both thresholds must be plain non-negative integers.
for value in "$warning_count" "$critical_count"; do
  if ! is_number "$value"; then
    printf 'CRITICAL: numeric option expected, got: %s\n' "$value"
    exit 2
  fi
done

# Force base-10 interpretation. Inside (( )) a leading zero means octal, so
# "010" would silently become 8 and "08"/"09" would raise an arithmetic error
# that the surrounding `if` conditions swallow as "false", disabling the
# threshold checks. The digits-only validation above makes 10# safe here.
warning_count=$((10#$warning_count))
critical_count=$((10#$critical_count))

# The warning threshold has to trip strictly before the critical one.
if ((warning_count >= critical_count)); then
  printf 'CRITICAL: --warning must be lower than --critical\n'
  exit 2
fi
# systemctl is the one hard dependency; stop early if it is missing.
require_cmd systemctl

#######################################
# Print one property of the service under inspection.
# Globals:   service_name (read)
# Arguments: $1 - systemd property name
#            $2 - text to print if `systemctl show` fails
# Outputs:   the property value (or whatever systemctl printed followed by
#            the fallback text on failure) on stdout
#######################################
unit_property() {
  systemctl show "$service_name" --property="$1" --value 2>/dev/null || printf '%s' "$2"
}

active_state="$(unit_property ActiveState 'unknown')"
sub_state="$(unit_property SubState 'unknown')"
n_restarts="$(unit_property NRestarts '')"

# Anything that is not a plain digit string (including an empty value) is
# counted as zero restarts.
if is_number "$n_restarts"; then
  restart_count="$n_restarts"
else
  restart_count=0
fi
# Derive the exit code first, then the matching label.
#   3 / CRITICAL: unit is in the "failed" state, or restarts reached the
#                 critical threshold
#   1 / WARNING:  restarts reached the warning threshold, or the unit is in
#                 any state other than "active"
#   0 / OK:       none of the above
exit_code=0
if [[ "$active_state" == "failed" ]]; then
  exit_code=3
elif ((restart_count >= critical_count)); then
  exit_code=3
elif ((restart_count >= warning_count)); then
  exit_code=1
elif [[ "$active_state" != "active" ]]; then
  exit_code=1
fi

status="OK"
case "$exit_code" in
  3) status="CRITICAL" ;;
  1) status="WARNING" ;;
esac
# Headline: overall verdict plus the values it was derived from.
printf '%s: Service %s state=%s substate=%s restarts=%s\n\n' \
  "$status" "$service_name" "$active_state" "$sub_state" "$restart_count"

printf 'Service state:\n'
# Per systemctl(1), `systemctl status` exits 3 when the unit is not active,
# and that includes failed units. The status text is still printed in that
# case, so only other non-zero codes (for example 4, no such unit) mean the
# status could not be read. Capturing the code through `||` also keeps
# errexit from aborting the script here.
status_rc=0
systemctl status "$service_name" --no-pager --lines=8 2>/dev/null || status_rc=$?
if ((status_rc != 0 && status_rc != 3)); then
  printf 'WARNING: unable to read service status for %s\n' "$service_name"
fi
printf '\n'

printf 'Systemd properties:\n'
# Best-effort dump: a failure here must not abort the report.
systemctl show "$service_name" --property=Id,Names,LoadState,ActiveState,SubState,Result,ExecMainStatus,NRestarts,Restart,RestartUSec --no-pager 2>/dev/null || true
printf '\n'
# Show up to 40 recent journal lines that mention lifecycle keywords.
printf 'Recent start/stop/failure log lines since %s:\n' "$since_value"
if command -v journalctl >/dev/null 2>&1; then
  # Exit statuses of the three pipeline stages (journalctl, grep, tail).
  # They stay at 0 unless the pipeline fails. With pipefail enabled by this
  # script, any failing stage triggers the `||` branch, which copies
  # PIPESTATUS before another command can overwrite it.
  journal_status=(0 0 0)
  journalctl -u "$service_name" --since "$since_value" --no-pager 2>/dev/null \
    | grep -Ei 'start|stop|fail|restart|exit|status|main process' \
    | tail -n 40 || journal_status=("${PIPESTATUS[@]}")

  if ((journal_status[0] != 0)); then
    # journalctl itself failed (e.g. unparsable --since text or no access to
    # the journal). Do not report that as "no matching lines".
    printf 'WARNING: journalctl exited with status %s; journal lines may be missing\n' "${journal_status[0]}"
  elif ((journal_status[1] == 1)); then
    # grep exit status 1 means it ran fine but nothing matched.
    printf 'OK: no matching journal lines found\n'
  elif ((journal_status[1] > 1 || journal_status[2] != 0)); then
    printf 'WARNING: unable to filter journal output\n'
  fi
else
  printf 'WARNING: journalctl not available; service logs unavailable from this script\n'
fi
printf '\n'

# Record the thresholds and time window this run was invoked with.
printf 'Evidence:\n'
printf 'Thresholds: warning=%s restarts critical=%s restarts since="%s"\n' "$warning_count" "$critical_count" "$since_value"

# Prefer the shell's EUID; fall back to `id -u`, and to 1 (treated as
# non-root) if that fails as well.
effective_uid="${EUID:-$(id -u 2>/dev/null || printf '1')}"
[[ "$effective_uid" == "0" ]] || printf 'WARNING: running without root; journal visibility may be limited\n'
printf '\n'

# Static guidance for whoever picks up the incident.
printf '%s\n' \
  'Recommended next steps:' \
  '- Inspect the unit file and drop-in overrides' \
  '- Review application logs around the restart timestamps' \
  '- Check dependencies such as network, storage, database, or secrets' \
  '- Verify recent configuration or package changes' \
  '- Do not restart blindly; attach this output to the incident ticket'

# Hand the verdict computed earlier back to the caller.
exit "$exit_code"