This commit is contained in:
+146
@@ -0,0 +1,146 @@
|
||||
#!/usr/bin/env bash
#
# check_high_cpu.sh - compare the 1-minute load average with the CPU count
# and list the top CPU-consuming processes.

# Strict mode: abort on unhandled command failure (-e), on expansion of an
# unset variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Built-in defaults. The --warning, --critical and --top options replace
# these values during option parsing.
top_count=10
critical_threshold=90
warning_threshold=75
|
||||
|
||||
# usage
# Write the command synopsis, a one-line description and the exit-code
# legend to stdout. Takes no arguments.
usage() {
  printf '%s\n' \
    'Usage: check_high_cpu.sh [--warning PERCENT] [--critical PERCENT] [--top N] [--help]' \
    '' \
    'Detect high CPU load and show top CPU-consuming processes.' \
    '' \
    'Exit codes:' \
    '0 OK' \
    '1 WARNING / operational issue detected' \
    '2 invalid input / missing required dependency' \
    '3 CRITICAL issue detected'
}
|
||||
|
||||
# is_number VALUE
# Succeed (status 0) when VALUE is a non-negative decimal integer written
# without leading zeros ("0", "7", "100"); fail otherwise.
#
# Leading zeros are rejected on purpose: the value is later used inside
# (( ... )) arithmetic, where Bash treats a leading 0 as an octal prefix.
# "010" would silently mean 8 and "08" would be an arithmetic error.
# A missing argument is treated as "not a number" instead of tripping nounset.
is_number() {
  [[ "${1-}" =~ ^(0|[1-9][0-9]*)$ ]]
}
|
||||
|
||||
# require_cmd NAME
# Return 0 when NAME resolves to a command via `command -v`. Otherwise print
# a CRITICAL message to stdout and terminate the script with exit code 2.
require_cmd() {
  local tool="$1"

  command -v "$tool" >/dev/null 2>&1 && return 0

  printf 'CRITICAL: required command not found: %s\n' "$tool"
  exit 2
}
|
||||
|
||||
# Option parsing: consume the positional parameters left to right.
# Value-taking options need a following argument and are shifted off in
# pairs; anything unrecognised prints the usage text and exits with code 2.
while [[ $# -gt 0 ]]; do
  case "$1" in
    -h | --help)
      usage
      exit 0
      ;;
    --warning)
      if (($# < 2)); then
        printf 'CRITICAL: --warning requires a value\n'
        exit 2
      fi
      warning_threshold="$2"
      shift 2
      ;;
    --critical)
      if (($# < 2)); then
        printf 'CRITICAL: --critical requires a value\n'
        exit 2
      fi
      critical_threshold="$2"
      shift 2
      ;;
    --top)
      if (($# < 2)); then
        printf 'CRITICAL: --top requires a value\n'
        exit 2
      fi
      top_count="$2"
      shift 2
      ;;
    *)
      printf 'CRITICAL: unknown option: %s\n' "$1"
      usage
      exit 2
      ;;
  esac
done
|
||||
|
||||
# Validate the effective option values, in the order warning, critical, top.
# The first one that is_number rejects is reported and ends the script with
# exit code 2.
for candidate in "$warning_threshold" "$critical_threshold" "$top_count"; do
  is_number "$candidate" && continue
  printf 'CRITICAL: numeric option expected, got: %s\n' "$candidate"
  exit 2
done

# The warning threshold has to sit strictly below the critical one.
if ((warning_threshold >= critical_threshold)); then
  printf 'CRITICAL: --warning must be lower than --critical\n'
  exit 2
fi
|
||||
|
||||
# Mandatory external tools, checked in this order; require_cmd ends the
# script with exit code 2 at the first one that is missing.
for tool in ps awk head; do
  require_cmd "$tool"
done
|
||||
|
||||
# Work out the number of online CPUs, defaulting to 1.
# getconf is used when it is installed; /proc/cpuinfo is only consulted when
# getconf is absent. Either probe yields "1" appended on failure.
cpu_count=1
if command -v getconf >/dev/null 2>&1; then
  cpu_count="$(getconf _NPROCESSORS_ONLN 2>/dev/null || printf '1')"
elif [[ -r /proc/cpuinfo ]]; then
  cpu_count="$(grep -c '^processor' /proc/cpuinfo 2>/dev/null || printf '1')"
fi

# Anything that is empty or contains a non-digit (including multi-line
# output from a failed probe) is replaced by 1.
case "$cpu_count" in
  '' | *[!0-9]*) cpu_count=1 ;;
esac

# A count that is not strictly positive is also replaced by 1.
if ! ((cpu_count > 0)); then
  cpu_count=1
fi
|
||||
|
||||
# load_pct_of_cpus LOAD CPUS
# Print LOAD expressed as a whole-number percentage of CPUS, truncated
# toward zero (for example LOAD=2.00 CPUS=4 prints 50).
load_pct_of_cpus() {
  awk -v load="$1" -v cpus="$2" 'BEGIN { printf "%d", (load / cpus) * 100 }'
}

# parse_uptime_loads LINE
# Extract the three load averages from one line of `uptime` output and store
# them in the globals load_1m, load_5m and load_15m.
# Accepts both "load average: 0.52, 0.48, 0.45" (comma-separated) and
# "load averages: 1.23 1.45 1.67" (space-separated).
# Commas are treated as field separators, so LINE must use "." as the decimal
# mark; the caller runs uptime under LC_ALL=C for that reason.
# Returns 1, leaving the globals untouched, when no load figures are found.
parse_uptime_loads() {
  local line="${1-}" rest one five fifteen

  [[ "$line" == *"load average"* ]] || return 1

  # Keep only what follows the "load average" / "load averages" label.
  rest="${line##*load average}"
  rest="${rest#s}"
  rest="${rest#:}"
  rest="${rest//,/ }"

  read -r one five fifteen _ <<<"$rest" || return 1
  [[ -n "$one" ]] || return 1

  load_1m="$one"
  load_5m="${five:-unavailable}"
  load_15m="${fifteen:-unavailable}"
}

# Collect the load averages: /proc/loadavg when readable, otherwise the
# output of uptime. The placeholders remain when neither source works.
load_1m="unavailable"
load_5m="unavailable"
load_15m="unavailable"
load_per_cpu_pct=0
if [[ -r /proc/loadavg ]]; then
  read -r load_1m load_5m load_15m _ < /proc/loadavg
elif command -v uptime >/dev/null 2>&1; then
  # Best effort: a failing or unparsable uptime must not abort the script.
  parse_uptime_loads "$(LC_ALL=C uptime 2>/dev/null || true)" || true
fi

# Derive the percentage for whichever source supplied the 1-minute load, so
# the uptime fallback is evaluated against the thresholds as well.
# cpu_count falls back to 1 here if it is unset or empty.
if [[ "$load_1m" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
  load_per_cpu_pct="$(load_pct_of_cpus "$load_1m" "${cpu_count:-1}")"
else
  load_1m="unavailable"
fi
|
||||
|
||||
# Classify the load percentage against the thresholds. The critical check
# comes first, so it wins whenever both thresholds are reached.
# exit_code follows the legend printed by usage: 0 OK, 1 WARNING, 3 CRITICAL.
if ((load_per_cpu_pct >= critical_threshold)); then
  status="CRITICAL"
  exit_code=3
elif ((load_per_cpu_pct >= warning_threshold)); then
  status="WARNING"
  exit_code=1
else
  status="OK"
  exit_code=0
fi
|
||||
|
||||
# print_summary
# Emit the headline status line, followed by the "Load average" and
# "CPU count" sections, each terminated by an empty line. Reads the globals
# status, load_1m, load_5m, load_15m, cpu_count and load_per_cpu_pct.
print_summary() {
  printf '%s: 1-minute load is %s across %s CPU(s) (%s%% of CPU count)\n\n' \
    "$status" "$load_1m" "$cpu_count" "$load_per_cpu_pct"

  printf 'Load average:\n'
  printf '1m=%s 5m=%s 15m=%s\n\n' "$load_1m" "$load_5m" "$load_15m"

  printf 'CPU count:\n'
  printf '%s\n\n' "$cpu_count"
}

print_summary
|
||||
|
||||
# first_lines COUNT
# Copy the first COUNT lines of stdin to stdout while still reading the rest
# of the input to the end. Unlike `head`, it never closes the pipe early, so
# the upstream command cannot be killed by SIGPIPE. Under `pipefail` plus
# `errexit`, that SIGPIPE would abort the whole script with status 141.
first_lines() {
  awk -v limit="$1" 'NR <= limit'
}

# Process table sorted by CPU usage, highest first. One extra line is kept
# for the column header that ps prints.
printf 'Top CPU processes:\n'
ps -eo pid,ppid,user,pcpu,pmem,comm,args --sort=-pcpu | first_lines "$((top_count + 1))"
printf '\n'
|
||||
|
||||
# print_evidence
# Emit the "Evidence" section: raw uptime output when the command exists,
# a note on whether the load percentage has reached 100, and a caveat when
# the script is not running as uid 0. Ends with an empty line.
print_evidence() {
  local uid

  printf 'Evidence:\n'

  if command -v uptime >/dev/null 2>&1; then
    # A failing uptime is tolerated; this section is informational only.
    uptime || true
  else
    printf 'WARNING: uptime command not available; used /proc/loadavg where possible\n'
  fi

  if ((load_per_cpu_pct >= 100)); then
    printf 'WARNING: load is higher than online CPU count; runnable task saturation is possible\n'
  else
    printf 'OK: load is not above online CPU count at collection time\n'
  fi

  # EUID is used when set and non-empty; otherwise `id -u` is asked, and a
  # non-root value of 1 is assumed if that fails too.
  uid="${EUID:-$(id -u 2>/dev/null || printf '1')}"
  [[ "$uid" == "0" ]] ||
    printf 'WARNING: running without root; process ownership details are usually available, but some command lines may be limited\n'

  printf '\n'
}

print_evidence
|
||||
|
||||
# Closing section: a fixed checklist of follow-up actions, one per line.
next_steps=(
  '- Check process ownership and whether the top process is expected'
  '- Check recent deployments, cron jobs, batch jobs, or maintenance activity'
  '- Review logs for the top CPU-consuming process'
  '- Compare with longer trend data from monitoring before taking action'
  '- Attach this output to the incident ticket'
)

printf 'Recommended next steps:\n'
printf '%s\n' "${next_steps[@]}"

# Finish with the status code chosen during threshold classification.
exit "$exit_code"
|
||||
Reference in New Issue
Block a user