#!/usr/bin/env bash
#
# check_high_cpu.sh - compare the 1-minute load average with the number of
# online CPUs and list the top CPU-consuming processes.
#
# Usage:
#   check_high_cpu.sh [--warning PERCENT] [--critical PERCENT] [--top N] [--help]
#
# Exit codes:
#   0  OK
#   1  WARNING / operational issue detected
#   2  invalid input / missing required dependency
#   3  CRITICAL issue detected
#
# All messages, including errors, are written to stdout, exactly as the
# script has always done, so anything capturing stdout keeps seeing them.

set -o errexit
set -o nounset
set -o pipefail

# Defaults; overridable with --warning, --critical and --top.
warning_threshold=75
critical_threshold=90
top_count=10

#######################################
# Print the command-line help text.
# Outputs: usage text on stdout
#######################################
usage() {
  cat <<'USAGE'
Usage: check_high_cpu.sh [--warning PERCENT] [--critical PERCENT] [--top N] [--help]

Detect high CPU load and show top CPU-consuming processes.

Exit codes:
  0 OK
  1 WARNING / operational issue detected
  2 invalid input / missing required dependency
  3 CRITICAL issue detected
USAGE
}

#######################################
# Report an input/dependency problem and terminate with exit code 2.
# Arguments: $* - message text (printed after the "CRITICAL: " prefix)
# Outputs:   one line on stdout
#######################################
die() {
  printf 'CRITICAL: %s\n' "$*"
  exit 2
}

#######################################
# Test whether a value consists solely of decimal digits.
# Arguments: $1 - value to test
# Returns:   0 if $1 is a non-empty digit string, 1 otherwise
#######################################
is_number() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

#######################################
# Abort with exit code 2 unless the named command can be found.
# Arguments: $1 - command name
#######################################
require_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    die "required command not found: $1"
  fi
}

# ---------------------------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------------------------
while (($# > 0)); do
  case "$1" in
    --warning)
      [[ $# -ge 2 ]] || die '--warning requires a value'
      warning_threshold="$2"
      shift 2
      ;;
    --critical)
      [[ $# -ge 2 ]] || die '--critical requires a value'
      critical_threshold="$2"
      shift 2
      ;;
    --top)
      [[ $# -ge 2 ]] || die '--top requires a value'
      top_count="$2"
      shift 2
      ;;
    --help | -h)
      usage
      exit 0
      ;;
    *)
      printf 'CRITICAL: unknown option: %s\n' "$1"
      usage
      exit 2
      ;;
  esac
done

# Only plain digit strings are accepted. More than 18 digits could exceed the
# signed 64-bit range used by shell arithmetic, so such values are rejected
# instead of being allowed to wrap around silently.
for value in "$warning_threshold" "$critical_threshold" "$top_count"; do
  if ! is_number "$value" || ((${#value} > 18)); then
    die "numeric option expected, got: $value"
  fi
done

# Force base 10: without this a leading zero makes (( )) parse the value as
# octal ("010" would mean 8 and "08" would be an arithmetic error).
warning_threshold=$((10#$warning_threshold))
critical_threshold=$((10#$critical_threshold))
top_count=$((10#$top_count))

if ((warning_threshold >= critical_threshold)); then
  die '--warning must be lower than --critical'
fi

require_cmd ps
require_cmd awk

# ---------------------------------------------------------------------------
# CPU count (falls back to 1 whenever it cannot be determined)
# ---------------------------------------------------------------------------
cpu_count=1
if command -v getconf >/dev/null 2>&1; then
  cpu_count="$(getconf _NPROCESSORS_ONLN 2>/dev/null || printf '1')"
elif [[ -r /proc/cpuinfo ]]; then
  cpu_count="$(grep -c '^processor' /proc/cpuinfo 2>/dev/null || printf '1')"
fi
[[ "$cpu_count" =~ ^[0-9]+$ ]] || cpu_count=1
((10#$cpu_count > 0)) || cpu_count=1
cpu_count=$((10#$cpu_count))

# ---------------------------------------------------------------------------
# Load averages
# ---------------------------------------------------------------------------
load_1m="unavailable"
load_5m="unavailable"
load_15m="unavailable"
load_per_cpu_pct=0

if [[ -r /proc/loadavg ]]; then
  read -r load_1m load_5m load_15m _ < /proc/loadavg
elif command -v uptime >/dev/null 2>&1; then
  # LC_ALL=C keeps the decimal separator a dot so the pattern below applies.
  load_line="$(LC_ALL=C uptime 2>/dev/null || true)"
  # Accepts both "load average: a, b, c" and "load averages: a b c".
  load_re='load averages?:[[:space:]]*([0-9.]+)[,[:space:]]+([0-9.]+)[,[:space:]]+([0-9.]+)'
  if [[ "$load_line" =~ $load_re ]]; then
    load_1m="${BASH_REMATCH[1]}"
    load_5m="${BASH_REMATCH[2]}"
    load_15m="${BASH_REMATCH[3]}"
  fi
fi

# Derive the percentage from whichever source supplied the load, so that the
# uptime fallback is evaluated against the thresholds too.
if [[ "$load_1m" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
  # The small epsilon stops binary floating-point artefacts (0.29 * 100 is
  # slightly below 29) from truncating to the next lower integer.
  load_per_cpu_pct="$(awk -v load="$load_1m" -v cpus="$cpu_count" \
    'BEGIN { printf "%d", (load * 100 / cpus) + 0.000001 }')"
fi

# ---------------------------------------------------------------------------
# Status evaluation
# ---------------------------------------------------------------------------
status="OK"
exit_code=0
if ((load_per_cpu_pct >= critical_threshold)); then
  status="CRITICAL"
  exit_code=3
elif ((load_per_cpu_pct >= warning_threshold)); then
  status="WARNING"
  exit_code=1
fi

# ---------------------------------------------------------------------------
# Report
# ---------------------------------------------------------------------------
printf '%s: 1-minute load is %s across %s CPU(s) (%s%% of CPU count)\n\n' \
  "$status" "$load_1m" "$cpu_count" "$load_per_cpu_pct"

printf 'Load average:\n'
printf '1m=%s 5m=%s 15m=%s\n\n' "$load_1m" "$load_5m" "$load_15m"

printf 'CPU count:\n'
printf '%s\n\n' "$cpu_count"

printf 'Top CPU processes:\n'
# awk reads ps's output to the end and prints only the header plus the first
# top_count rows. A reader that quits early (such as head) can make ps die
# from SIGPIPE, which pipefail + errexit would turn into an abort of this
# script before the computed status is returned.
if ! ps -eo pid,ppid,user,pcpu,pmem,comm,args --sort=-pcpu |
  awk -v limit="$((top_count + 1))" 'NR <= limit'; then
  # A failed listing must not replace the status computed above.
  printf 'WARNING: unable to list processes with ps\n'
fi
printf '\n'

printf 'Evidence:\n'
if command -v uptime >/dev/null 2>&1; then
  uptime || true
else
  printf 'WARNING: uptime command not available; used /proc/loadavg where possible\n'
fi

if [[ "$load_1m" == "unavailable" ]]; then
  printf 'WARNING: load average could not be determined; status reflects a load of 0\n'
fi

if ((load_per_cpu_pct >= 100)); then
  printf 'WARNING: load is higher than online CPU count; runnable task saturation is possible\n'
else
  printf 'OK: load is not above online CPU count at collection time\n'
fi

if [[ "${EUID:-$(id -u 2>/dev/null || printf '1')}" != "0" ]]; then
  printf 'WARNING: running without root; process ownership details are usually available, but some command lines may be limited\n'
fi

printf '\n'
printf 'Recommended next steps:\n'
printf -- '- Check process ownership and whether the top process is expected\n'
printf -- '- Check recent deployments, cron jobs, batch jobs, or maintenance activity\n'
printf -- '- Review logs for the top CPU-consuming process\n'
printf -- '- Compare with longer trend data from monitoring before taking action\n'
printf -- '- Attach this output to the incident ticket\n'

exit "$exit_code"