Files
portfolio/infra-run/scripts/bash/incident-checks/check_inode_usage.sh
T

104 lines
2.9 KiB
Bash
Raw Normal View History

2026-05-11 18:49:00 +00:00
#!/usr/bin/env bash
# Inode exhaustion check built on `df -i`.
#
# Exit statuses used by this script:
#   0  OK        highest usage is below the warning threshold
#   1  WARNING   highest usage reached the warning threshold
#   3  CRITICAL  highest usage reached the critical threshold
#   2  invalid invocation or a missing prerequisite

# Strict mode: stop on command failure, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# Defaults; each is overridden by the matching command-line option.
warning_threshold=80  # --warning PERCENT
critical_threshold=90 # --critical PERCENT
top_count=10          # --top N
# Print the command-line synopsis and a one-line description to stdout.
usage() {
  printf '%s\n' \
    'Usage: check_inode_usage.sh [--warning PERCENT] [--critical PERCENT] [--top N] [--help]' \
    'Detect inode exhaustion using df -i.'
}
# Succeed (status 0) when $1 consists solely of one or more decimal digits;
# return 1 for anything else, including the empty string.
is_number() {
  local digits_only='^[0-9]+$'
  if [[ "$1" =~ $digits_only ]]; then
    return 0
  fi
  return 1
}
# Consume the command line one option at a time. Options that take a value
# require a following argument; any problem is reported on stdout and ends
# the script with status 2.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --warning)
      if [[ $# -lt 2 ]]; then
        printf 'CRITICAL: --warning requires a value\n'
        exit 2
      fi
      warning_threshold="$2"
      shift 2
      ;;
    --critical)
      if [[ $# -lt 2 ]]; then
        printf 'CRITICAL: --critical requires a value\n'
        exit 2
      fi
      critical_threshold="$2"
      shift 2
      ;;
    --top)
      if [[ $# -lt 2 ]]; then
        printf 'CRITICAL: --top requires a value\n'
        exit 2
      fi
      top_count="$2"
      shift 2
      ;;
    --help | -h)
      usage
      exit 0
      ;;
    *)
      printf 'CRITICAL: unknown option: %s\n' "$1"
      usage
      exit 2
      ;;
  esac
done
# Validate the option values. Any violation is reported on stdout and ends
# the script with status 2.
for value in "$warning_threshold" "$critical_threshold" "$top_count"; do
  if ! is_number "$value"; then
    printf 'CRITICAL: numeric option expected, got: %s\n' "$value"
    exit 2
  fi
done

# Force base 10. Bash arithmetic reads a leading zero as octal, so "075"
# would silently be compared as 61 and "08"/"09" would raise an arithmetic
# error that makes every later threshold comparison evaluate as false.
# The values are known to be digits-only at this point, so 10# is safe.
warning_threshold=$((10#$warning_threshold))
critical_threshold=$((10#$critical_threshold))
top_count=$((10#$top_count))

# The warning level has to trigger strictly before the critical level.
if ((warning_threshold >= critical_threshold)); then
  printf 'CRITICAL: --warning must be lower than --critical\n'
  exit 2
fi
# df supplies the inode figures; stop with status 2 when it is not available.
command -v df >/dev/null 2>&1 || {
  printf 'CRITICAL: required command not found: df\n'
  exit 2
}
# Scratch files: tmp_df holds the raw `df -Pi` table, tmp_alerts the rows at
# or above the warning threshold. Each failure below is reported explicitly
# with status 2; relying on errexit would abort silently with the failing
# command's own status (typically 1, which this script uses for WARNING).
tmp_df="$(mktemp)" || {
  printf 'CRITICAL: unable to create temporary file\n'
  exit 2
}
# Register cleanup right away so a failure of the second mktemp cannot leave
# the first file behind.
trap 'rm -f -- "$tmp_df"' EXIT
tmp_alerts="$(mktemp)" || {
  printf 'CRITICAL: unable to create temporary file\n'
  exit 2
}
trap 'rm -f -- "$tmp_df" "$tmp_alerts"' EXIT

# -P selects the POSIX output format, -i reports inode counts.
if ! df -Pi > "$tmp_df"; then
  printf 'CRITICAL: df -Pi failed\n'
  exit 2
fi
# inode_alert_rows WARN FILE
# Print the data rows (header skipped) of the saved `df -Pi` table FILE whose
# usage column (field 5) is at or above WARN.
inode_alert_rows() {
  awk -v warn="$1" '
    NR > 1 {
      # "$5 + 0" converts "85%" to the number 85 (and "-" to 0). Stripping
      # the percent sign with gsub() instead leaves a plain string, and awk
      # may then compare lexicographically, so "9" >= "80" would be true
      # and "100" >= "80" would be false.
      if ($5 + 0 >= warn + 0) {
        print $0
      }
    }
  ' < "$2"
}

# inode_max_pct FILE
# Print the highest usage percentage found in field 5 of the data rows of
# FILE as an integer (0 when there are no data rows).
inode_max_pct() {
  awk '
    NR > 1 {
      pct = $5 + 0 # numeric on purpose, see inode_alert_rows
      if (pct > max) {
        max = pct
      }
    }
    END { printf "%d", max }
  ' < "$1"
}

inode_alert_rows "$warning_threshold" "$tmp_df" > "$tmp_alerts"
max_pct="$(inode_max_pct "$tmp_df")"
# Translate the highest usage into the reported status word and the exit code
# of the script: CRITICAL -> 3, WARNING -> 1, otherwise OK -> 0.
if ((max_pct >= critical_threshold)); then
  status="CRITICAL"
  exit_code=3
elif ((max_pct >= warning_threshold)); then
  status="WARNING"
  exit_code=1
else
  status="OK"
  exit_code=0
fi
# Headline with the overall status, then the rows that reached the warning
# threshold (or a note that there are none), then the full table from df.
printf '%s: Highest inode usage is %s%%\n\n' "$status" "$max_pct"

printf 'Filesystems above threshold:\n'
if [[ ! -s "$tmp_alerts" ]]; then
  printf 'OK: no filesystems above warning threshold\n'
else
  cat "$tmp_alerts"
fi
printf '\n'

printf 'Inode usage table:\n'
cat "$tmp_df"
printf '\n'
# top_inode_mounts COUNT FILE
# Print up to COUNT data rows of the saved `df -Pi` table FILE, highest usage
# first, formatted as "PCT% MOUNT FILESYSTEM inodes=N used=N free=N".
top_inode_mounts() {
  # The final awk applies the row limit while still reading all of its input.
  # Limiting with `head` lets the reader exit early; sort can then be killed
  # by SIGPIPE (status 141), which under pipefail and errexit aborts the
  # whole script. That is near-certain with a limit of 0.
  awk 'NR > 1 { pct = $5; gsub(/%/, "", pct); print pct, $6, $1, $2, $3, $4 }' < "$2" \
    | sort -rn \
    | awk -v limit="$1" 'NR <= limit + 0 { printf "%s%% %s %s inodes=%s used=%s free=%s\n", $1, $2, $3, $4, $5, $6 }'
}

printf 'Top affected mount points:\n'
top_inode_mounts "$top_count" "$tmp_df"
printf '\n'
# Closing sections: the thresholds this run used, a fixed checklist for the
# responder, and finally the exit status derived from the highest usage.
printf 'Evidence:\n'
printf 'Thresholds: warning=%s%% critical=%s%%\n\n' "$warning_threshold" "$critical_threshold"

printf '%s\n' \
  'Recommended next steps:' \
  '- Find directories with many small files under affected mount points' \
  '- Check logs, cache, spool, session, and temporary directories' \
  '- Avoid deleting blindly; confirm ownership and application impact first' \
  '- Confirm whether inode exhaustion is causing write or deploy failures' \
  '- Attach this output to incident ticket'

exit "$exit_code"