#!/usr/bin/env bash
#
# GPFS post-check: runs a series of read-only diagnostics (daemon state,
# mount state, disks, NSDs, capacity, cluster health, recent journal and
# kernel messages) and reports whether any of them indicated a problem.
#
# Usage:
#   <script> --fs <filesystem>
#
# Exit status:
#   0  no check reported a failure
#   1  at least one check failed or the health output looked unhealthy
#   2  usage error (unknown argument, missing --fs or missing --fs value)
#
# NOTE(review): section, ok, warning, critical and LOG_FILE are not defined
# in this file; presumably they are provided by 00_env.sh -- confirm.

set -o errexit
set -o nounset
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"

# Print the one-line usage summary to stdout.
usage() {
  printf 'Usage: %s --fs <filesystem>\n' "$(basename "$0")"
}

# Keep any value already present (environment or sourced file), but make sure
# the variable is set: with nounset, testing an unset FILESYSTEM below would
# abort with "unbound variable" instead of printing the usage error.
FILESYSTEM="${FILESYSTEM:-}"

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    --fs)
      # Check the arity first: "shift 2" with a single remaining argument
      # fails, and under errexit that would end the script without a message.
      if [[ "$#" -lt 2 ]]; then
        critical "Option --fs requires a value"
        usage
        exit 2
      fi
      FILESYSTEM="$2"
      shift 2
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      critical "Unknown argument: $1"
      usage
      exit 2
      ;;
  esac
done

if [[ -z "$FILESYSTEM" ]]; then
  critical "Missing required --fs <filesystem>"
  usage
  exit 2
fi

# Set to 1 as soon as any check reports a problem; decides the exit status.
issues=0

#######################################
# Run one diagnostic command under a section heading and append its combined
# stdout/stderr to the log file. A missing command is only warned about.
# Globals:
#   LOG_FILE (read), issues (set to 1 when the command pipeline fails)
# Arguments:
#   $1   - human-readable description, used as heading and in the error text
#   $2.. - command to run, followed by its arguments
# Outputs:
#   The command's output on stdout, duplicated into LOG_FILE via tee.
#######################################
run_check() {
  local description="$1"
  shift

  section "$description"
  if command -v "$1" >/dev/null 2>&1; then
    # pipefail is on, so a failure of either the command or tee lands in the
    # "||" branch; the "||" also keeps errexit from aborting the script here.
    "$@" 2>&1 | tee -a "$LOG_FILE" || {
      critical "$description failed"
      issues=1
    }
  else
    warning "$1 command not available, skipping"
  fi
}

run_check "GPFS daemon state" mmgetstate -a
run_check "Target filesystem mount state" mmlsmount "$FILESYSTEM"
run_check "Target filesystem disks" mmlsdisk "$FILESYSTEM"
run_check "NSD inventory" mmlsnsd

section "Filesystem capacity"
if command -v df >/dev/null 2>&1; then
  # index() matches the filesystem name literally, so regex metacharacters in
  # the name cannot widen or break the match. The "||" keeps a non-zero df
  # status from terminating the remaining checks through errexit/pipefail.
  df -h 2>&1 \
    | awk -v fs="$FILESYSTEM" \
      'NR == 1 || index($0, fs) || $0 ~ /gpfs|mmfs/' \
    | tee -a "$LOG_FILE" || warning "df query failed"
else
  warning "df command not available, skipping"
fi

section "Cluster health"
if command -v mmhealth >/dev/null 2>&1; then
  # The exit status of mmhealth is deliberately ignored; the verdict is taken
  # from keywords in its output instead.
  health_output="$(mmhealth cluster show 2>&1 || true)"
  printf '%s\n' "$health_output" | tee -a "$LOG_FILE"
  # A here-string instead of "printf | grep -q": with pipefail, grep exiting
  # on the first match can make the writer fail with SIGPIPE, which would turn
  # a positive match into a failed pipeline and hide the issue.
  if grep -Eiq 'degraded|failed|down|error|unhealthy' <<<"$health_output"; then
    critical "Cluster health output indicates an issue"
    issues=1
  fi
else
  warning "mmhealth command not available, skipping"
fi

section "Recent GPFS journal entries"
if command -v journalctl >/dev/null 2>&1; then
  journalctl -u 'gpfs*' -n 50 --no-pager 2>&1 \
    | tee -a "$LOG_FILE" || warning "journalctl GPFS query failed"
else
  warning "journalctl command not available, skipping"
fi

section "Recent kernel messages"
if command -v dmesg >/dev/null 2>&1; then
  dmesg -T 2>/dev/null \
    | tail -n 50 \
    | tee -a "$LOG_FILE" || warning "dmesg query failed"
else
  warning "dmesg command not available, skipping"
fi

if [[ "$issues" -eq 0 ]]; then
  ok "Post-check completed without detected operational failures"
  exit 0
fi

critical "Post-check detected one or more issues"
exit 1