#!/usr/bin/env bash
#
# Post-check for a GPFS filesystem: runs a series of read-only status commands
# (daemon state, mount/disk/NSD listings, df, mmhealth, journal and kernel
# logs), mirrors their output into the log file and exits non-zero when any
# check reports a problem.
#
# Usage: <script> --fs <filesystem>

# Strict mode: abort on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -euo pipefail

# Absolute directory of this script, so the environment file is found no
# matter which working directory the script is started from.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Shared environment. The helpers section/ok/warning/critical and LOG_FILE are
# used below but not defined in this file - presumably they come from
# 00_env.sh (TODO confirm).
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"

#######################################
# Print the one-line command synopsis.
# Arguments: none
# Outputs:   the usage line on stdout, using the script's base name
#######################################
usage() {
  # ${0##*/} strips the directory part without spawning basename.
  printf 'Usage: %s --fs <filesystem>\n' "${0##*/}"
}

# Parse command-line options. Unknown arguments and a missing --fs value are
# usage errors (exit status 2); -h/--help prints the synopsis and exits 0.
while [[ "$#" -gt 0 ]]; do
  case "$1" in
    --fs)
      # A trailing --fs without a value would make `shift 2` fail, and under
      # `set -e` the script would then stop silently; report it instead.
      if [[ "$#" -lt 2 ]]; then
        critical "Option --fs requires a <filesystem> value"
        usage
        exit 2
      fi
      FILESYSTEM="$2"
      shift 2
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      critical "Unknown argument: $1"
      usage
      exit 2
      ;;
  esac
done

# --fs is mandatory. The ${FILESYSTEM:-} form keeps `set -u` from aborting
# with an "unbound variable" error when the option was never given, so the
# user gets the intended message and usage text instead.
if [[ -z "${FILESYSTEM:-}" ]]; then
  critical "Missing required --fs <filesystem>"
  usage
  exit 2
fi

# Sticky failure flag: set to 1 by any failing check and inspected at the end
# of the script to choose the exit status.
issues=0

#######################################
# Run one diagnostic command under a section heading and record a failure.
# Globals:
#   LOG_FILE (read)  - file the command output is appended to via tee
#   issues (written) - set to 1 when the command pipeline fails
# Arguments:
#   $1   - human-readable description (section heading and failure message)
#   $2.. - the command to run, followed by its arguments
# Outputs:
#   The command's stdout and stderr, on stdout and appended to LOG_FILE.
#   A missing command is reported with a warning and is not counted as a
#   failure.
#######################################
run_check() {
  local description="$1"
  shift

  section "$description"

  if ! command -v "$1" >/dev/null 2>&1; then
    warning "$1 command not available, skipping"
    return
  fi

  # Detecting a failure of "$@" relies on `set -o pipefail` being active in
  # the calling script; without it only tee's exit status would be seen.
  if ! "$@" 2>&1 | tee -a "$LOG_FILE"; then
    critical "$description failed"
    issues=1
  fi
}

# Core GPFS status commands. Through run_check, a command that exits non-zero
# sets issues=1, while a command that is not installed is skipped with a
# warning.
run_check "GPFS daemon state" mmgetstate -a
run_check "Target filesystem mount state" mmlsmount "$FILESYSTEM"
run_check "Target filesystem disks" mmlsdisk "$FILESYSTEM"
run_check "NSD inventory" mmlsnsd

section "Filesystem capacity"
if command -v df >/dev/null 2>&1; then
  # Keep the df header row plus every row that mentions the target filesystem
  # or looks like a GPFS mount. index() matches the name literally, so regex
  # metacharacters in the name (for example ".") cannot widen the match.
  #
  # This section is informational: a failing pipeline is downgraded to a
  # warning so that `set -e` with pipefail does not abort the script before
  # the remaining checks and the final summary have run.
  df -h 2>&1 \
    | awk -v fs="$FILESYSTEM" 'NR == 1 || index($0, fs) > 0 || /gpfs|mmfs/' \
    | tee -a "$LOG_FILE" \
    || warning "df query failed"
else
  warning "df command not available, skipping"
fi

#######################################
# Decide whether `mmhealth cluster show` output (read from stdin) reports a
# problem.
#
# NOTE(review): the table layout assumed here - a header row starting with
# "Component" followed by rows of the form
#   <component> <total> <failed> <degraded> <healthy> <other>
# with a number or "-" in each count column - is taken from the IBM
# documentation examples; confirm against the installed release.
#
# A plain keyword search cannot be used on the whole output because the
# header row itself contains the words "Failed" and "Degraded".
#
# Returns: 0 when an issue is detected, 1 otherwise.
#######################################
cluster_health_has_issue() {
  awk '
    # Column header row: names the Failed/Degraded columns, not a finding.
    tolower($1) == "component" { next }

    # Blank lines and dashed separator lines carry no information.
    /^[ \t]*-*[ \t]*$/ { next }

    # Summary row: flag a positive count in the Failed or Degraded column.
    NF >= 5 && $2 ~ /^[0-9]+$/ && $3 ~ /^([0-9]+|-)$/ && $4 ~ /^([0-9]+|-)$/ {
      if (($3 != "-" && $3 + 0 > 0) || ($4 != "-" && $4 + 0 > 0)) {
        found = 1
      }
      next
    }

    # Anything else (for example an error message): keyword scan.
    tolower($0) ~ /degraded|failed|down|error|unhealthy/ { found = 1 }

    END { exit (found ? 0 : 1) }
  '
}

section "Cluster health"
if command -v mmhealth >/dev/null 2>&1; then
  # A non-zero exit from mmhealth is tolerated on purpose: the captured
  # output (stdout and stderr) is logged and inspected instead.
  health_output="$(mmhealth cluster show 2>&1 || true)"
  printf '%s\n' "$health_output" | tee -a "$LOG_FILE"
  if printf '%s\n' "$health_output" | cluster_health_has_issue; then
    critical "Cluster health output indicates an issue"
    issues=1
  fi
else
  warning "mmhealth command not available, skipping"
fi

section "Recent GPFS journal entries"
if command -v journalctl >/dev/null 2>&1; then
  # Last 50 journal lines of units matching gpfs*. Informational only: a
  # failing query produces a warning and does not set the issues flag.
  if ! journalctl -u 'gpfs*' -n 50 --no-pager 2>&1 | tee -a "$LOG_FILE"; then
    warning "journalctl GPFS query failed"
  fi
else
  warning "journalctl command not available, skipping"
fi

section "Recent kernel messages"
if command -v dmesg >/dev/null 2>&1; then
  # Last 50 kernel log lines. dmesg's own stderr is discarded; when the
  # pipeline fails (pipefail is active) a warning is logged instead, and the
  # issues flag is left untouched. `tail -n 50` replaces the obsolete
  # `tail -50` spelling.
  if ! dmesg -T 2>/dev/null | tail -n 50 | tee -a "$LOG_FILE"; then
    warning "dmesg query failed"
  fi
else
  warning "dmesg command not available, skipping"
fi

# Final verdict: exit 1 when any check set the issues flag, otherwise exit 0.
if [[ "$issues" -ne 0 ]]; then
  critical "Post-check detected one or more issues"
  exit 1
fi

ok "Post-check completed without detected operational failures"
exit 0