#!/usr/bin/env bash
#
# check_service_restart_loop.sh - report restart-loop evidence for a systemd unit.
#
# Read-only diagnostic: queries systemctl (and journalctl when available) and
# prints a one-line verdict followed by supporting evidence on stdout.
#
# Options:
#   --service NAME    unit to inspect (required)
#   --since TEXT      time expression handed to `journalctl --since`
#                     (default: "1 hour ago")
#   --warning COUNT   NRestarts value at which the verdict becomes WARNING
#   --critical COUNT  NRestarts value at which the verdict becomes CRITICAL
#   --help, -h        print usage and exit 0
#
# Exit codes:
#   0  OK
#   1  WARNING  (unit not active, or NRestarts >= --warning)
#   2  usage error or required command missing
#   3  CRITICAL (unit failed, or NRestarts >= --critical)
#
# NOTE: the restart count compared against the thresholds is the NRestarts
# property reported by `systemctl show`. --since is only passed to journalctl
# and echoed in the report; it does not window the restart count.

set -o errexit
set -o nounset
set -o pipefail

# Print the usage text to stdout.
usage() {
  cat <<'USAGE'
Usage: check_service_restart_loop.sh --service SERVICE_NAME [--since TEXT] [--warning COUNT] [--critical COUNT] [--help]

Detect restart-loop evidence for a systemd service. Read-only.
USAGE
}

# Succeed when $1 consists solely of decimal digits.
is_number() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

# Exit 2 with a CRITICAL message when command $1 is not on PATH.
require_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    printf 'CRITICAL: required command not found: %s\n' "$1"
    exit 2
  fi
}

# Print "CRITICAL: <message>" and exit 2.
die() {
  printf 'CRITICAL: %s\n' "$1"
  exit 2
}

# Exit 2 unless option $1 is followed by a value.
# $2 is the number of arguments remaining, including the option itself.
require_value() {
  if (($2 < 2)); then
    die "$1 requires a value"
  fi
}

# Print digit string $1 as a base-10 integer without leading zeros.
# Without the explicit base, bash arithmetic treats "010" as octal and
# rejects "08"/"09" outright.
to_decimal() {
  printf '%d' "$((10#$1))"
}

# Print property $2 of unit $1, or fallback $3 when systemctl fails.
# "--" keeps a unit name that starts with "-" from being parsed as an option.
unit_property() {
  local unit="$1" property="$2" fallback="$3" value
  if value="$(systemctl show --property="$property" --value -- "$unit" 2>/dev/null)"; then
    printf '%s' "$value"
  else
    printf '%s' "$fallback"
  fi
}

main() {
  local service_name=""
  local since_value="1 hour ago"
  local warning_count=3
  local critical_count=10

  while (($# > 0)); do
    case "$1" in
      --service)
        require_value "$1" "$#"
        service_name="$2"
        shift 2
        ;;
      --since)
        require_value "$1" "$#"
        since_value="$2"
        shift 2
        ;;
      --warning)
        require_value "$1" "$#"
        warning_count="$2"
        shift 2
        ;;
      --critical)
        require_value "$1" "$#"
        critical_count="$2"
        shift 2
        ;;
      --help | -h)
        usage
        exit 0
        ;;
      *)
        printf 'CRITICAL: unknown option: %s\n' "$1"
        usage
        exit 2
        ;;
    esac
  done

  if [[ -z "$service_name" ]]; then
    printf 'CRITICAL: --service is required\n'
    usage
    exit 2
  fi

  local value
  for value in "$warning_count" "$critical_count"; do
    if ! is_number "$value"; then
      die "numeric option expected, got: $value"
    fi
  done

  # Normalise before any (( )) comparison so leading zeros cannot turn the
  # thresholds into octal literals or arithmetic errors.
  warning_count="$(to_decimal "$warning_count")"
  critical_count="$(to_decimal "$critical_count")"

  if ((warning_count >= critical_count)); then
    die '--warning must be lower than --critical'
  fi

  require_cmd systemctl

  local active_state sub_state n_restarts restart_count
  active_state="$(unit_property "$service_name" ActiveState unknown)"
  sub_state="$(unit_property "$service_name" SubState unknown)"
  n_restarts="$(unit_property "$service_name" NRestarts '')"

  # Treat a missing or non-numeric NRestarts as zero restarts.
  restart_count="${n_restarts:-0}"
  if is_number "$restart_count"; then
    restart_count="$(to_decimal "$restart_count")"
  else
    restart_count=0
  fi

  local verdict="OK"
  local exit_code=0
  if [[ "$active_state" == "failed" ]] || ((restart_count >= critical_count)); then
    verdict="CRITICAL"
    exit_code=3
  elif ((restart_count >= warning_count)) || [[ "$active_state" != "active" ]]; then
    verdict="WARNING"
    exit_code=1
  fi

  printf '%s: Service %s state=%s substate=%s restarts=%s\n\n' \
    "$verdict" "$service_name" "$active_state" "$sub_state" "$restart_count"

  printf 'Service state:\n'
  # `systemctl status` exits non-zero for units that are not running even
  # though it printed a valid report, so judge success by whether any report
  # was produced rather than by the exit status.
  local status_output=""
  status_output="$(systemctl status --no-pager --lines=8 -- "$service_name" 2>/dev/null)" || true
  if [[ -n "$status_output" ]]; then
    printf '%s\n' "$status_output"
  else
    printf 'WARNING: unable to read service status for %s\n' "$service_name"
  fi
  printf '\n'

  printf 'Systemd properties:\n'
  # Best effort: this section is supplementary evidence only.
  systemctl show \
    --property=Id,Names,LoadState,ActiveState,SubState,Result,ExecMainStatus,NRestarts,Restart,RestartUSec \
    --no-pager -- "$service_name" 2>/dev/null || true
  printf '\n'

  printf 'Recent start/stop/failure log lines since %s:\n' "$since_value"
  if command -v journalctl >/dev/null 2>&1; then
    local journal_matches="" journal_rc=0
    # The subshell exits with journalctl's own status (PIPESTATUS[0]) so a
    # journal read failure is not confused with "grep matched nothing".
    journal_matches="$(
      set +o errexit +o pipefail
      journalctl -u "$service_name" --since "$since_value" --no-pager 2>/dev/null \
        | grep -Ei 'start|stop|fail|restart|exit|status|main process' \
        | tail -n 40
      exit "${PIPESTATUS[0]}"
    )" || journal_rc=$?

    if [[ -n "$journal_matches" ]]; then
      printf '%s\n' "$journal_matches"
    fi
    if ((journal_rc != 0)); then
      printf 'WARNING: unable to read journal for %s\n' "$service_name"
    elif [[ -z "$journal_matches" ]]; then
      printf 'OK: no matching journal lines found\n'
    fi
  else
    printf 'WARNING: journalctl not available; service logs unavailable from this script\n'
  fi
  printf '\n'

  printf 'Evidence:\n'
  printf 'Thresholds: warning=%s restarts critical=%s restarts since="%s"\n' \
    "$warning_count" "$critical_count" "$since_value"
  if [[ "${EUID:-$(id -u 2>/dev/null || printf '1')}" != "0" ]]; then
    printf 'WARNING: running without root; journal visibility may be limited\n'
  fi
  printf '\n'

  printf 'Recommended next steps:\n'
  printf -- '- Inspect the unit file and drop-in overrides\n'
  printf -- '- Review application logs around the restart timestamps\n'
  printf -- '- Check dependencies such as network, storage, database, or secrets\n'
  printf -- '- Verify recent configuration or package changes\n'
  printf -- '- Do not restart blindly; attach this output to the incident ticket\n'

  exit "$exit_code"
}

main "$@"