#!/usr/bin/env bash
#
# check_service_restart_loop.sh - report restart-loop evidence for a systemd
# service. The script only reads state; it never changes the unit.
#
# Exit codes used by this script:
#   0  OK
#   1  WARNING  - restart count reached --warning, or the unit is not active
#   2  usage error or a required command is missing
#   3  CRITICAL - unit is failed, or restart count reached --critical

# Strict mode: stop on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -o errexit
set -o nounset
set -o pipefail

# Defaults; each one can be overridden from the command line.
service_name=""          # --service (required)
since_value="1 hour ago" # --since, handed to journalctl --since
warning_count=3          # --warning restart threshold
critical_count=10        # --critical restart threshold
#######################################
# Print the command-line synopsis and a one-line description.
# Arguments: none
# Outputs:   usage text on stdout
#######################################
usage() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'USAGE'
Usage: check_service_restart_loop.sh --service SERVICE_NAME [--since TEXT] [--warning COUNT] [--critical COUNT] [--help]

Detect restart-loop evidence for a systemd service. Read-only.
USAGE
}
#######################################
# Test whether a string consists only of one or more decimal digits.
# Arguments: $1 - string to test (a missing argument counts as empty)
# Returns:   0 if $1 is all digits, non-zero otherwise
#######################################
is_number() {
  # ${1-} keeps a call with no argument from aborting under `set -o nounset`.
  [[ "${1-}" =~ ^[0-9]+$ ]]
}
#######################################
# Abort the script unless a command is available.
# Arguments: $1 - command name to look up
# Outputs:   a CRITICAL line on stdout when the command is missing
# Exits:     2 when the command cannot be found; otherwise returns 0
#######################################
require_cmd() {
  local cmd="$1"
  if ! command -v -- "$cmd" >/dev/null 2>&1; then
    printf 'CRITICAL: required command not found: %s\n' "$cmd"
    exit 2
  fi
}
#######################################
# Parse command-line options into the global settings.
# Globals:   service_name, since_value, warning_count, critical_count (written)
# Arguments: the script's command-line arguments
# Outputs:   error and usage text on stdout for bad input
# Exits:     0 after --help; 2 on an unknown option or a missing option value
#######################################
parse_args() {
  while (($# > 0)); do
    case "$1" in
      --service | --since | --warning | --critical)
        # Every value-taking option needs one more argument after it.
        if (($# < 2)); then
          printf 'CRITICAL: %s requires a value\n' "$1"
          exit 2
        fi
        case "$1" in
          --service) service_name="$2" ;;
          --since) since_value="$2" ;;
          --warning) warning_count="$2" ;;
          --critical) critical_count="$2" ;;
        esac
        shift 2
        ;;
      --help | -h)
        usage
        exit 0
        ;;
      *)
        printf 'CRITICAL: unknown option: %s\n' "$1"
        usage
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
# Validate the parsed options before querying systemd. Every failure prints a
# CRITICAL line on stdout and exits with status 2.
if [[ -z "$service_name" ]]; then
  printf 'CRITICAL: --service is required\n'
  usage
  exit 2
fi

for value in "$warning_count" "$critical_count"; do
  if ! is_number "$value"; then
    printf 'CRITICAL: numeric option expected, got: %s\n' "$value"
    exit 2
  fi
done

# is_number accepts leading zeros, but shell arithmetic reads a leading 0 as
# octal: "010" would silently mean 8 and "08" is an arithmetic error. Force
# base 10 so the thresholds mean what the user typed.
warning_count=$((10#$warning_count))
critical_count=$((10#$critical_count))

if ((warning_count >= critical_count)); then
  printf 'CRITICAL: --warning must be lower than --critical\n'
  exit 2
fi
#######################################
# Map a unit state and restart count to a status label and exit code.
# Globals:   status, exit_code (written)
# Arguments: $1 - ActiveState of the unit
#            $2 - restart count
#            $3 - warning threshold
#            $4 - critical threshold
#######################################
classify_status() {
  local state="$1"
  local restarts="$2"
  local warn_at="$3"
  local crit_at="$4"

  status="OK"
  exit_code=0
  if [[ "$state" == "failed" ]] || ((restarts >= crit_at)); then
    status="CRITICAL"
    exit_code=3
  elif ((restarts >= warn_at)) || [[ "$state" != "active" ]]; then
    # Any state other than "active" is at least a warning.
    status="WARNING"
    exit_code=1
  fi
}

require_cmd systemctl

# Fallbacks used when systemctl fails or does not report a property.
active_state="unknown"
sub_state="unknown"
restart_count=0

# Read all three properties in a single call so they describe the same moment;
# separate calls can straddle a state change while the unit is restarting.
# Options come before `--` so a unit name starting with "-" is not parsed as
# an option.
if unit_props="$(systemctl show --property=ActiveState,SubState,NRestarts -- "$service_name" 2>/dev/null)"; then
  # Output is one Key=Value pair per line.
  while IFS='=' read -r prop_name prop_value; do
    case "$prop_name" in
      ActiveState) active_state="${prop_value:-unknown}" ;;
      SubState) sub_state="${prop_value:-unknown}" ;;
      NRestarts) restart_count="${prop_value:-0}" ;;
    esac
  done <<<"$unit_props"
fi

# A missing or non-numeric NRestarts is treated as zero restarts.
if ! is_number "$restart_count"; then
  restart_count=0
fi

classify_status "$active_state" "$restart_count" "$warning_count" "$critical_count"
# Headline: overall status plus the values it was derived from.
printf '%s: Service %s state=%s substate=%s restarts=%s\n\n' "$status" "$service_name" "$active_state" "$sub_state" "$restart_count"

printf 'Service state:\n'
# `systemctl status` exits non-zero for inactive or failed units even though
# it has printed their status, so decide on the captured output rather than on
# the exit code. Options come before `--` so the unit name is never parsed as
# an option.
unit_status="$(systemctl status --no-pager --lines=8 -- "$service_name" 2>/dev/null)" || true
if [[ -n "$unit_status" ]]; then
  printf '%s\n' "$unit_status"
else
  printf 'WARNING: unable to read service status for %s\n' "$service_name"
fi
printf '\n'

printf 'Systemd properties:\n'
# Best effort: this section is informational, so a failure here is ignored.
systemctl show --property=Id,Names,LoadState,ActiveState,SubState,Result,ExecMainStatus,NRestarts,Restart,RestartUSec --no-pager -- "$service_name" 2>/dev/null || true
printf '\n'
printf 'Recent start/stop/failure log lines since %s:\n' "$since_value"
if command -v journalctl >/dev/null 2>&1; then
  # Stage order: journalctl, grep, tail. The statuses are captured only when
  # the pipeline fails, which relies on the pipefail option set at the top of
  # the script.
  stage_status=(0 0 0)
  journalctl -u "$service_name" --since "$since_value" --no-pager 2>/dev/null \
    | grep -Ei 'start|stop|fail|restart|exit|status|main process' \
    | tail -n 40 || stage_status=("${PIPESTATUS[@]}")

  if ((stage_status[0] != 0)); then
    # journalctl itself failed (for example an unparsable --since value);
    # reporting "no matching lines" here would be misleading.
    printf 'WARNING: journalctl failed; journal lines may be missing or incomplete\n'
  elif ((stage_status[1] == 1)); then
    # grep exits 1 when nothing matched.
    printf 'OK: no matching journal lines found\n'
  fi
else
  printf 'WARNING: journalctl not available; service logs unavailable from this script\n'
fi
printf '\n'
#######################################
# Print the thresholds and time window used for this run, plus a note when the
# script is not running as root.
# Globals:   warning_count, critical_count, since_value (read)
# Outputs:   the "Evidence:" section on stdout
#######################################
print_evidence() {
  printf 'Evidence:\n'
  printf 'Thresholds: warning=%s restarts critical=%s restarts since="%s"\n' "$warning_count" "$critical_count" "$since_value"
  # Use EUID when the shell provides it, otherwise ask `id`; if neither works,
  # fall back to a non-root uid so the note is still printed.
  if [[ "${EUID:-$(id -u 2>/dev/null || printf '1')}" != "0" ]]; then
    printf 'WARNING: running without root; journal visibility may be limited\n'
  fi
  printf '\n'
}

print_evidence
printf 'Recommended next steps:\n'
# printf reuses the format for each argument, giving one bullet per line.
printf -- '- %s\n' \
  'Inspect the unit file and drop-in overrides' \
  'Review application logs around the restart timestamps' \
  'Check dependencies such as network, storage, database, or secrets' \
  'Verify recent configuration or package changes' \
  'Do not restart blindly; attach this output to the incident ticket'

# Finish with the exit code chosen when the status was determined.
exit "$exit_code"