147 lines
4.0 KiB
Bash
Executable File
147 lines
4.0 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# check_high_cpu.sh - compare the 1-minute load average with the number of
# online CPUs and list the top CPU-consuming processes.

# Strict mode: stop on unhandled command failures, on expansion of unset
# variables, and when any stage of a pipeline fails.
set -o errexit
set -o nounset
set -o pipefail

# Defaults, overridable with --warning, --critical and --top.
# The thresholds are compared against (1-minute load / CPU count) * 100.
warning_threshold=75
critical_threshold=90
top_count=10

# Print the command-line synopsis and the exit-code legend to stdout.
usage() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'USAGE'
Usage: check_high_cpu.sh [--warning PERCENT] [--critical PERCENT] [--top N] [--help]

Detect high CPU load and show top CPU-consuming processes.

Exit codes:
  0 OK
  1 WARNING / operational issue detected
  2 invalid input / missing required dependency
  3 CRITICAL issue detected
USAGE
}

# Succeed when $1 consists solely of one or more decimal digits.
# Arguments: $1 - string to test (a missing argument is treated as empty)
# Returns:   0 when the string is all digits, 1 otherwise
# Note: values with leading zeros (e.g. 08) are accepted; callers that feed
# the value to shell arithmetic must force base 10 themselves.
is_number() {
  # ${1-} keeps a missing argument from aborting under `set -o nounset`;
  # the empty string simply fails the match.
  [[ "${1-}" =~ ^[0-9]+$ ]]
}

# Ensure an external command is available, otherwise stop the script.
# Arguments: $1 - command name to look up with `command -v`
# Outputs:   a CRITICAL line on stdout when the command is missing
# Exits:     2 when the command cannot be found; returns 0 otherwise
require_cmd() {
  local cmd="${1-}"

  if ! command -v "$cmd" >/dev/null 2>&1; then
    printf 'CRITICAL: required command not found: %s\n' "$cmd"
    exit 2
  fi
}

# Parse command-line options into the global settings warning_threshold,
# critical_threshold and top_count. Values are stored as given; they are
# validated after parsing.
# Arguments: the script's positional parameters
# Outputs:   usage text for --help/-h; for invalid input a CRITICAL line on
#            stdout (followed by the usage text for an unknown option)
# Exits:     0 after --help/-h; 2 on a missing option value or unknown option
parse_args() {
  while (($# > 0)); do
    case "$1" in
      --warning)
        [[ $# -ge 2 ]] || { printf 'CRITICAL: --warning requires a value\n'; exit 2; }
        warning_threshold="$2"
        shift 2
        ;;
      --critical)
        [[ $# -ge 2 ]] || { printf 'CRITICAL: --critical requires a value\n'; exit 2; }
        critical_threshold="$2"
        shift 2
        ;;
      --top)
        [[ $# -ge 2 ]] || { printf 'CRITICAL: --top requires a value\n'; exit 2; }
        top_count="$2"
        shift 2
        ;;
      --help | -h)
        usage
        exit 0
        ;;
      *)
        printf 'CRITICAL: unknown option: %s\n' "$1"
        usage
        exit 2
        ;;
    esac
  done
}

parse_args "$@"

# Reject anything that is not a plain digit string before arithmetic sees it.
for value in "$warning_threshold" "$critical_threshold" "$top_count"; do
  if ! is_number "$value"; then
    printf 'CRITICAL: numeric option expected, got: %s\n' "$value"
    exit 2
  fi
done

# Force base 10. Shell arithmetic reads a leading 0 as octal, so without this
# "08" would abort the script with an arithmetic error and "075" would
# silently be treated as 61.
warning_threshold="$((10#$warning_threshold))"
critical_threshold="$((10#$critical_threshold))"
top_count="$((10#$top_count))"

# The WARNING band must sit strictly below the CRITICAL band.
if ((warning_threshold >= critical_threshold)); then
  printf 'CRITICAL: --warning must be lower than --critical\n'
  exit 2
fi

# Fail fast (exit 2 via require_cmd) when a tool used further down is missing.
require_cmd ps
require_cmd awk
require_cmd head

# Determine the number of online CPUs: getconf first, then /proc/cpuinfo,
# and finally a safe default of 1.
cpu_count=""
if command -v getconf >/dev/null 2>&1; then
  cpu_count="$(getconf _NPROCESSORS_ONLN 2>/dev/null || true)"
fi

# Fall through to /proc/cpuinfo whenever getconf is missing, failed, or
# printed something that is not a positive integer.
if ! [[ "$cpu_count" =~ ^[1-9][0-9]*$ ]] && [[ -r /proc/cpuinfo ]]; then
  # grep -c prints "0" AND exits non-zero when nothing matches, so the output
  # is validated below rather than appending a fallback digit to it.
  cpu_count="$(grep -c '^processor' /proc/cpuinfo 2>/dev/null || true)"
fi

# Anything other than a positive integer without leading zeros becomes 1,
# which keeps the later division and arithmetic well-defined.
[[ "$cpu_count" =~ ^[1-9][0-9]*$ ]] || cpu_count=1

# Extract the three load averages from a line of `uptime` output.
# Arguments: $1 - uptime output line
# Outputs:   "<1m> <5m> <15m>" on stdout
# Returns:   0 when load averages were found, 1 otherwise
parse_uptime_loadavg() {
  # Matches both "load average:" and "load averages:", with the values
  # separated by commas and/or whitespace.
  local re='load averages?:[[:space:]]*([0-9.]+),?[[:space:]]+([0-9.]+),?[[:space:]]+([0-9.]+)'

  [[ "${1-}" =~ $re ]] || return 1
  printf '%s %s %s\n' "${BASH_REMATCH[1]}" "${BASH_REMATCH[2]}" "${BASH_REMATCH[3]}"
}

# Express a load average as an integer percentage of the CPU count.
# Arguments: $1 - load average, $2 - CPU count
# Outputs:   (load / cpus) * 100, truncated by awk's %d, on stdout
load_pct_per_cpu() {
  awk -v load_avg="$1" -v cpus="$2" \
    'BEGIN { if (cpus + 0 < 1) cpus = 1; printf "%d", (load_avg / cpus) * 100 }'
}

load_1m="unavailable"
load_5m="unavailable"
load_15m="unavailable"
load_per_cpu_pct=0

if [[ -r /proc/loadavg ]]; then
  read -r load_1m load_5m load_15m _ < /proc/loadavg
elif command -v uptime >/dev/null 2>&1; then
  # LC_ALL=C keeps the decimal separator a dot regardless of the user's locale.
  if load_fields="$(parse_uptime_loadavg "$(LC_ALL=C uptime 2>/dev/null || true)")"; then
    read -r load_1m load_5m load_15m <<<"$load_fields"
  fi
fi

# Compute the percentage for whichever source produced a numeric 1-minute
# value; otherwise load_per_cpu_pct stays 0 and load_1m stays "unavailable".
if [[ "$load_1m" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
  load_per_cpu_pct="$(load_pct_per_cpu "$load_1m" "${cpu_count:-1}")"
fi

# Map the load percentage onto a status label and the script's exit code.
# The CRITICAL threshold is checked first so it wins over WARNING.
status="OK"
exit_code=0
if ((load_per_cpu_pct >= critical_threshold)); then
  status="CRITICAL"
  exit_code=3
elif ((load_per_cpu_pct >= warning_threshold)); then
  status="WARNING"
  exit_code=1
fi

# Headline first, then the raw figures it was derived from.
printf '%s: 1-minute load is %s across %s CPU(s) (%s%% of CPU count)\n\n' "$status" "$load_1m" "$cpu_count" "$load_per_cpu_pct"

printf 'Load average:\n'
printf '1m=%s 5m=%s 15m=%s\n\n' "$load_1m" "$load_5m" "$load_15m"

printf 'CPU count:\n'
printf '%s\n\n' "$cpu_count"

# Print the ps header line plus the N processes using the most CPU.
# Arguments: $1 - number of processes to show (digits; leading zeros allowed)
# Outputs:   at most N + 1 lines of ps output on stdout
print_top_processes() {
  # 10# forces base 10 so a value such as "08" is not parsed as octal.
  local limit="$((10#${1:-10}))"
  local listing

  # Capture ps output before truncating it. Piping ps straight into head lets
  # head exit early; on a busy host ps is then killed by SIGPIPE and, with
  # pipefail + errexit, the whole script dies with status 141.
  listing="$(ps -eo pid,ppid,user,pcpu,pmem,comm,args --sort=-pcpu)"
  head -n "$((limit + 1))" <<<"$listing"
}

printf 'Top CPU processes:\n'
print_top_processes "$top_count"
printf '\n'

# Supporting evidence: raw uptime output plus notes on how to read the result.
printf 'Evidence:\n'
if command -v uptime >/dev/null 2>&1; then
  # Best effort: a failing uptime must not abort the report.
  uptime || true
else
  printf 'WARNING: uptime command not available; used /proc/loadavg where possible\n'
fi

# 100% means the 1-minute load equals the number of online CPUs.
if ((load_per_cpu_pct >= 100)); then
  printf 'WARNING: load is higher than online CPU count; runnable task saturation is possible\n'
else
  printf 'OK: load is not above online CPU count at collection time\n'
fi

# EUID is normally set by bash; `id -u` is the fallback, and "1" (treated as
# non-root) is assumed when neither is available.
if [[ "${EUID:-$(id -u 2>/dev/null || printf '1')}" != "0" ]]; then
  printf 'WARNING: running without root; process ownership details are usually available, but some command lines may be limited\n'
fi
printf '\n'

# Static checklist for the operator. `--` stops printf from treating the
# leading "-" of each bullet as an option.
printf 'Recommended next steps:\n'
printf -- '- Check process ownership and whether the top process is expected\n'
printf -- '- Check recent deployments, cron jobs, batch jobs, or maintenance activity\n'
printf -- '- Review logs for the top CPU-consuming process\n'
printf -- '- Compare with longer trend data from monitoring before taking action\n'
printf -- '- Attach this output to the incident ticket\n'

# Report the status computed from the load percentage: 0 OK, 1 WARNING,
# 3 CRITICAL.
exit "$exit_code"