Add disk full incident response toolkit
This commit is contained in:
+90
@@ -0,0 +1,90 @@
|
||||
#!/usr/bin/env bash
# Strict mode: stop on a failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
|
||||
|
||||
# Absolute path of the directory containing this script.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"

# Load 00_env.sh from the same directory as this script.
# shellcheck source=00_env.sh
source "${SCRIPT_DIR}/00_env.sh"

# Path supplied with --before-file; empty means none was supplied.
BEFORE_FILE=
# Status the script finally exits with; starts out as success.
exit_code=0
|
||||
|
||||
# Print a one-line synopsis of the accepted options to stdout.
usage() {
  # Strip everything up to the last slash to get the bare script name.
  local script_name="${0##*/}"
  printf 'Usage: %s [--before-file <df_output_file>]\n' "$script_name"
}
|
||||
|
||||
#######################################
# Parse the command-line options of this script.
# Globals:   BEFORE_FILE (written)
# Arguments: the script arguments, passed through as "$@"
# Exits:     0 after -h/--help; 2 on an unknown argument or when
#            --before-file is given without a value
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --before-file)
        # Without this check, `shift 2` fails when the value is missing and
        # errexit ends the script silently with status 1 and no explanation.
        if (( $# < 2 )); then
          critical "Missing value for --before-file"
          usage
          exit 2
        fi
        BEFORE_FILE="$2"
        shift 2
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        critical "Unknown argument: $1"
        usage
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
|
||||
|
||||
# Stop with status 1 when require_cmd reports failure for df.
# NOTE(review): require_cmd is defined outside this file; presumably it
# checks that the command is available — confirm in 00_env.sh.
require_cmd df || exit 1

section "Post-Cleanup Disk Space"
# This report is informational. With errexit and pipefail active, an
# unguarded failure of df or tee (for example tee being unable to append to
# the log) would end the script before the threshold check runs, so a
# failure is downgraded to a warning, matching the inode report.
df -h 2>&1 | tee -a "$LOG_FILE" || warning "Unable to collect disk usage"

section "Post-Cleanup Inodes"
df -i 2>&1 | tee -a "$LOG_FILE" || warning "Unable to collect inode usage"
|
||||
|
||||
section "Critical Filesystem Check"
# Classify every filesystem listed by `df -P -h` against the thresholds and
# record an emergency in exit_code.
# Whole data lines (header skipped) are handed to `read`: its last variable
# receives the remainder of the line, so a mount point that contains spaces
# is reported in full instead of being cut off after its first word.
while read -r fs size used avail pct mount; do
  # NOTE(review): usage_percent_number is defined outside this file;
  # presumably it turns a value such as "85%" into a bare integer — confirm.
  percent="$(usage_percent_number "$pct")"
  if (( percent >= EMERGENCY_THRESHOLD )); then
    critical "$mount is still ${pct} full on $fs (size=$size used=$used avail=$avail)"
    exit_code=1
  elif (( percent >= CRIT_THRESHOLD )); then
    warning "$mount remains high at ${pct} on $fs"
  else
    ok "$mount is ${pct} full on $fs"
  fi
done < <(df -P -h | awk 'NR > 1')
|
||||
|
||||
#######################################
# Compare two df reports and print one line per mount point that appears
# in both, stating whether its usage percentage went down, stayed the same
# or went up.
# Arguments: $1 - earlier df report
#            $2 - later df report
#            (both: first line is a header, column 5 is the usage
#            percentage, column 6 is the mount point)
# Outputs:   "<STATUS>: <mount> before=<pct> after=<pct> (<result>)" lines
#            on stdout; STATUS is OK for an improvement, WARNING otherwise
#######################################
compare_df_reports() {
  awk '
    # First file: remember the usage of every mount point (header skipped).
    NR == FNR && FNR > 1 {
      before[$6] = $5
      next
    }
    # Second file: report every mount point that was also in the first one.
    FNR > 1 {
      mount = $6
      if (mount in before) {
        before_pct = before[mount]
        after_pct = $5
        gsub(/%/, "", before_pct)
        gsub(/%/, "", after_pct)
        # gsub leaves string values behind and awk compares two strings
        # character by character, which ranks 9 above 10 and 100 below 90.
        # Adding zero forces a numeric comparison.
        before_pct += 0
        after_pct += 0

        if (after_pct < before_pct) {
          status = "OK"
          result = "improved"
        } else if (after_pct == before_pct) {
          status = "WARNING"
          result = "unchanged"
        } else {
          status = "WARNING"
          result = "increased"
        }

        printf "%s: %s before=%s after=%s (%s)\n", status, mount, before[mount], $5, result
      }
    }
  ' "$1" "$2"
}

# Run the comparison only when a before-file was supplied and exists.
if [[ -n "$BEFORE_FILE" ]]; then
  section "Before And After Comparison"
  if [[ ! -f "$BEFORE_FILE" ]]; then
    warning "Before file not found: $BEFORE_FILE"
  else
    # Current usage is read from a live `df -P -h`; results go to the
    # terminal and are appended to the log.
    compare_df_reports "$BEFORE_FILE" <(df -P -h) | tee -a "$LOG_FILE"
  fi
else
  warning "No --before-file supplied. Improvement comparison skipped."
fi
|
||||
|
||||
# Final verdict: report whether the threshold check recorded an emergency,
# then hand that result to the caller as the exit status.
if (( exit_code != 0 )); then
  critical "One or more filesystems remain at or above ${EMERGENCY_THRESHOLD}%."
else
  ok "Post-check completed without emergency-threshold filesystems."
fi

exit "$exit_code"
|
||||
Reference in New Issue
Block a user