From 5dd8c34952585018cf34db9f986d79bea2902b74 Mon Sep 17 00:00:00 2001
From: Mateusz Suski
Date: Tue, 5 May 2026 21:40:46 +0000
Subject: [PATCH] Add GPFS storage expansion toolkit

---
 infra-run/scripts/bash/gpfs/00_env.sh         | 149 ++++++++++++++++++
 .../scripts/bash/gpfs/01_cluster_overview.sh  |  41 +++++
 .../scripts/bash/gpfs/02_precheck_gpfs.sh     | 111 +++++++++++++
 .../scripts/bash/gpfs/03_detect_new_disks.sh  | 110 +++++++++++++
 .../scripts/bash/gpfs/04_create_nsd_stanza.sh |  93 +++++++++++
 .../bash/gpfs/05_add_nsd_to_filesystem.sh     |  63 ++++++++
 .../bash/gpfs/06_rebalance_filesystem.sh      |  59 +++++++
 .../scripts/bash/gpfs/07_postcheck_gpfs.sh    |  96 ++++++++++++
 .../scripts/bash/gpfs/08_generate_report.sh   |  84 ++++++++++
 infra-run/scripts/bash/gpfs/README.md         | 139 +++++++++++++++++
 .../scripts/bash/gpfs/gpfs_extend_runbook.sh  |  97 ++++++++++++
 11 files changed, 1042 insertions(+)
 create mode 100755 infra-run/scripts/bash/gpfs/00_env.sh
 create mode 100755 infra-run/scripts/bash/gpfs/01_cluster_overview.sh
 create mode 100755 infra-run/scripts/bash/gpfs/02_precheck_gpfs.sh
 create mode 100755 infra-run/scripts/bash/gpfs/03_detect_new_disks.sh
 create mode 100755 infra-run/scripts/bash/gpfs/04_create_nsd_stanza.sh
 create mode 100755 infra-run/scripts/bash/gpfs/05_add_nsd_to_filesystem.sh
 create mode 100755 infra-run/scripts/bash/gpfs/06_rebalance_filesystem.sh
 create mode 100755 infra-run/scripts/bash/gpfs/07_postcheck_gpfs.sh
 create mode 100755 infra-run/scripts/bash/gpfs/08_generate_report.sh
 create mode 100644 infra-run/scripts/bash/gpfs/README.md
 create mode 100755 infra-run/scripts/bash/gpfs/gpfs_extend_runbook.sh

diff --git a/infra-run/scripts/bash/gpfs/00_env.sh b/infra-run/scripts/bash/gpfs/00_env.sh
new file mode 100755
index 0000000..d2d1bcc
--- /dev/null
+++ b/infra-run/scripts/bash/gpfs/00_env.sh
@@ -0,0 +1,149 @@
+#!/usr/bin/env bash
+# Shared configuration and helper functions for the GPFS expansion toolkit.
+# Sourced by the other scripts in this directory.
+set -o errexit
+set -o nounset
+set -o pipefail
+
+TIMESTAMP="${TIMESTAMP:-$(date +%Y%m%d_%H%M%S)}"
+DRY_RUN="${DRY_RUN:-true}"
+# Fail safe: only the literal value "false" selects execution mode, so inputs
+# such as DRY_RUN=1 or DRY_RUN=yes keep dry-run enabled.
+if [[ "$DRY_RUN" != "false" ]]; then
+  DRY_RUN=true
+fi
+LOG_FILE="${LOG_FILE:-/tmp/gpfs_extend_${TIMESTAMP}.log}"
+
+FILESYSTEM="${FILESYSTEM:-}"
+NSD_STANZA="${NSD_STANZA:-}"
+FAILURE_GROUP="${FAILURE_GROUP:-}"
+STORAGE_POOL="${STORAGE_POOL:-system}"
+USAGE="${USAGE:-dataAndMetadata}"
+
+# Print "LEVEL: message" to stdout and append the same line to $LOG_FILE.
+log() {
+  local level="$1"
+  shift
+  local message="$*"
+
+  printf '%s: %s\n' "$level" "$message" | tee -a "$LOG_FILE"
+}
+
+# Level-specific wrappers around log: ok, warning, critical.
+ok() {
+  log "OK" "$@"
+}
+
+warning() {
+  log "WARNING" "$@"
+}
+
+critical() {
+  log "CRITICAL" "$@"
+}
+
+# Return 0 (and log OK) when command $1 is on PATH; log CRITICAL and return 1 otherwise.
+require_cmd() {
+  local cmd="$1"
+
+  if command -v "$cmd" >/dev/null 2>&1; then
+    ok "Command available: $cmd"
+    return 0
+  fi
+
+  critical "Required command not found: $cmd"
+  return 1
+}
+
+# Like require_cmd, but silent on success and only a WARNING when $1 is missing.
+validate_gpfs_command() {
+  local cmd="$1"
+
+  if command -v "$cmd" >/dev/null 2>&1; then
+    return 0
+  fi
+
+  warning "GPFS command not available, skipping: $cmd"
+  return 1
+}
+
+# Run a state-changing command, or only log it when DRY_RUN is "true".
+# Output is appended to $LOG_FILE; returns 2 when called without a command.
+run_cmd() {
+  if [[ "$#" -eq 0 ]]; then
+    critical "run_cmd called without a command"
+    return 2
+  fi
+
+  if [[ "$DRY_RUN" == "true" ]]; then
+    log "OK" "DRY-RUN: $*"
+    return 0
+  fi
+
+  log "OK" "RUN: $*"
+  "$@" 2>&1 | tee -a "$LOG_FILE"
+}
+
+# Run a command regardless of DRY_RUN and append its output to $LOG_FILE.
+run_readonly() {
+  if [[ "$#" -eq 0 ]]; then
+    critical "run_readonly called without a command"
+    return 2
+  fi
+
+  log "OK" "READ-ONLY: $*"
+  "$@" 2>&1 | tee -a "$LOG_FILE"
+}
+
+# Gate for real changes: a no-op in dry-run mode, otherwise the operator must
+# type EXECUTE or the script exits 1. $1 optionally describes the change.
+confirm_execute() {
+  local target="${1:-GPFS change}"
+  local confirmation=""
+
+  if [[ "$DRY_RUN" == "true" ]]; then
+    ok "Dry-run mode enabled. No changes will be made."
+    return 0
+  fi
+
+  warning "Execution mode requested for: $target"
+  warning "Coordinate this change with storage, GPFS, application, and change-management teams."
+  printf 'Type EXECUTE to continue: '
+  # EOF on stdin must reach the explicit abort below, not trip errexit silently.
+  read -r confirmation || confirmation=""
+
+  if [[ "$confirmation" != "EXECUTE" ]]; then
+    critical "Confirmation failed. Aborting."
+    exit 1
+  fi
+
+  ok "Execution confirmed by operator."
+}
+
+# Return 0 when $1 is one of the disk usage values accepted by this toolkit.
+usage_value_valid() {
+  case "$1" in
+    dataOnly|metadataOnly|dataAndMetadata) return 0 ;;
+    *) return 1 ;;
+  esac
+}
+
+# Read `mmhealth cluster show` output on stdin; return 0 when it needs attention:
+# a component row with a non-zero Failed or Degraded count, or no component rows
+# at all (command error or unrecognised layout).
+# NOTE(review): assumes the column order "Component Total Failed Degraded Healthy
+# Other" - confirm against the Spectrum Scale release in use.
+health_output_has_issues() {
+  awk '
+    $2 ~ /^[0-9]+$/ {
+      rows++
+      if (($3 ~ /^[0-9]+$/ && $3 > 0) || ($4 ~ /^[0-9]+$/ && $4 > 0)) bad = 1
+    }
+    END { exit (bad || !rows) ? 0 : 1 }
+  '
+}
+
+# Print a "== title ==" heading to stdout and append it to $LOG_FILE.
+section() {
+  printf '\n== %s ==\n' "$1" | tee -a "$LOG_FILE"
+}
diff --git a/infra-run/scripts/bash/gpfs/01_cluster_overview.sh b/infra-run/scripts/bash/gpfs/01_cluster_overview.sh
new file mode 100755
index 0000000..ad9da82
--- /dev/null
+++ b/infra-run/scripts/bash/gpfs/01_cluster_overview.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+# Read-only overview of the GPFS / Spectrum Scale cluster.
+# GPFS commands that are not installed are skipped with a WARNING.
+set -o errexit
+set -o nounset
+set -o pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=00_env.sh
+. "$SCRIPT_DIR/00_env.sh"
+
+# Print a section titled $1, then run the remaining arguments as a read-only
+# command when its executable exists; a failing command only logs a WARNING.
+run_optional() {
+  local description="$1"
+  shift
+
+  section "$description"
+  if validate_gpfs_command "$1"; then
+    run_readonly "$@" || warning "$description command failed"
+  fi
+}
+
+section "GPFS / Spectrum Scale Cluster Overview"
+ok "Log file: $LOG_FILE"
+
+run_optional "GPFS daemon state on all nodes" mmgetstate -a
+run_optional "Cluster definition" mmlscluster
+run_optional "Cluster configuration" mmlsconfig
+run_optional "Managers and quorum information" mmlsmgr
+run_optional "NSD inventory" mmlsnsd
+run_optional "Disk inventory for all filesystems" mmlsdisk all
+run_optional "Filesystem definitions" mmlsfs all
+run_optional "Mount state for all filesystems" mmlsmount all
+
+section "Mounted GPFS filesystems from df"
+if command -v df >/dev/null 2>&1; then
+  df -h -t gpfs 2>/dev/null | tee -a "$LOG_FILE" || df -h | awk 'NR == 1 || /gpfs|mmfs/' | tee -a "$LOG_FILE"
+else
+  warning "df command not available"
+fi
diff --git a/infra-run/scripts/bash/gpfs/02_precheck_gpfs.sh b/infra-run/scripts/bash/gpfs/02_precheck_gpfs.sh
new file mode 100755
index 0000000..333d7b4
--- /dev/null
+++ b/infra-run/scripts/bash/gpfs/02_precheck_gpfs.sh
@@ -0,0 +1,111 @@
+#!/usr/bin/env bash
+# Read-only pre-expansion validation for one GPFS filesystem.
+# Exit codes: 0 OK, 1 operational validation failure, 2 invalid input or missing command.
+set -o errexit
+set -o nounset
+set -o pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=00_env.sh
+. "$SCRIPT_DIR/00_env.sh"
+
+usage() {
+  printf 'Usage: %s --fs <filesystem>\n' "$(basename "$0")"
+}
+
+while [[ "$#" -gt 0 ]]; do
+  case "$1" in
+    --fs)
+      FILESYSTEM="${2:-}"
+      # Shift only what exists so a missing value reaches the check below (exit 2).
+      shift "$(( $# > 1 ? 2 : 1 ))"
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      critical "Unknown argument: $1"
+      usage
+      exit 2
+      ;;
+  esac
+done
+
+if [[ -z "$FILESYSTEM" ]]; then
+  critical "Missing required --fs <filesystem>"
+  usage
+  exit 2
+fi
+
+missing=0
+for cmd in mmgetstate mmlscluster mmlsfs mmlsdisk mmlsmount mmlsmgr df; do
+  require_cmd "$cmd" || missing=1
+done
+
+if [[ "$missing" -ne 0 ]]; then
+  exit 2
+fi
+
+issues=0
+
+section "GPFS daemon state"
+state_output="$(mmgetstate -a 2>&1 || true)"
+printf '%s\n' "$state_output" | tee -a "$LOG_FILE"
+# Judge node rows only (first field is the node number); header and separator
+# lines must not count as inactive nodes. No node rows at all also warns.
+if printf '%s\n' "$state_output" | awk '$1 ~ /^[0-9]+$/ { rows++; if ($NF != "active") bad = 1 } END { exit (bad || !rows) ? 0 : 1 }'; then
+  warning "Not all GPFS nodes appear active"
+fi
+
+section "Target filesystem definition"
+if mmlsfs "$FILESYSTEM" 2>&1 | tee -a "$LOG_FILE"; then
+  ok "Filesystem exists: $FILESYSTEM"
+else
+  critical "Filesystem does not exist or cannot be queried: $FILESYSTEM"
+  exit 1
+fi
+
+section "Target filesystem mount state"
+mount_output="$(mmlsmount "$FILESYSTEM" 2>&1 || true)"
+printf '%s\n' "$mount_output" | tee -a "$LOG_FILE"
+if printf '%s\n' "$mount_output" | grep -Eiq 'not mounted|no file systems were found|not found'; then
+  warning "Filesystem may not be mounted anywhere: $FILESYSTEM"
+fi
+
+section "Existing disks"
+if ! mmlsdisk "$FILESYSTEM" 2>&1 | tee -a "$LOG_FILE"; then
+  critical "Unable to list disks for filesystem: $FILESYSTEM"
+  issues=1
+fi
+
+section "Filesystem capacity"
+# index() matches the filesystem name literally instead of as a regular expression.
+df -h 2>&1 | awk -v fs="$FILESYSTEM" 'NR == 1 || index($0, fs) || $0 ~ /gpfs|mmfs/' | tee -a "$LOG_FILE"
+
+section "Cluster health"
+if command -v mmhealth >/dev/null 2>&1; then
+  health_output="$(mmhealth cluster show 2>&1 || true)"
+  printf '%s\n' "$health_output" | tee -a "$LOG_FILE"
+  # Evaluate the per-component counters; a plain word search for "failed" or
+  # "degraded" would also match the column headings of a healthy cluster.
+  if printf '%s\n' "$health_output" | health_output_has_issues; then
+    warning "Cluster health output indicates a degraded condition"
+  fi
+else
+  warning "mmhealth command not available, skipping health check"
+fi
+
+section "Managers and quorum"
+mmlsmgr 2>&1 | tee -a "$LOG_FILE" || {
+  critical "Unable to query GPFS manager/quorum information"
+  issues=1
+}
+
+if [[ "$issues" -eq 0 ]]; then
+  ok "Precheck completed for filesystem: $FILESYSTEM"
+  exit 0
+fi
+
+critical "Precheck found operational validation failures"
+exit 1
diff --git a/infra-run/scripts/bash/gpfs/03_detect_new_disks.sh b/infra-run/scripts/bash/gpfs/03_detect_new_disks.sh
new file mode 100755
index 0000000..7641235
--- /dev/null
+++ b/infra-run/scripts/bash/gpfs/03_detect_new_disks.sh
@@ -0,0 +1,110 @@
+#!/usr/bin/env bash
+# Read-only discovery of block devices that might be usable as new NSDs.
+# Never partitions, formats, wipes, or otherwise modifies a device.
+set -o errexit
+set -o nounset
+set -o pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=00_env.sh
+. "$SCRIPT_DIR/00_env.sh"
+
+EXCLUDE_MOUNTED=false
+EXCLUDE_EXISTING_NSD=false
+
+usage() {
+  printf 'Usage: %s [--exclude-mounted] [--exclude-existing-nsd]\n' "$(basename "$0")"
+}
+
+# Print lsblk column $1 for whole device $2 with blanks squeezed; empty when unknown.
+device_field() {
+  lsblk -dno "$1" "$2" 2>/dev/null | awk 'NR == 1 { $1 = $1; print }' || true
+}
+
+# Return 0 when device $1, or any partition or child device on it, is mounted or used as swap.
+device_in_use() {
+  local mounts
+
+  # Without -d lsblk also lists children, so a disk whose partitions are mounted counts as in use.
+  mounts="$(lsblk -nro MOUNTPOINT "$1" 2>/dev/null || true)"
+  if [[ "$mounts" =~ [^[:space:]] ]]; then
+    return 0
+  fi
+
+  findmnt -rn --source "$1" >/dev/null 2>&1
+}
+
+while [[ "$#" -gt 0 ]]; do
+  case "$1" in
+    --exclude-mounted)
+      EXCLUDE_MOUNTED=true
+      shift
+      ;;
+    --exclude-existing-nsd)
+      EXCLUDE_EXISTING_NSD=true
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      critical "Unknown argument: $1"
+      usage
+      exit 2
+      ;;
+  esac
+done
+
+for cmd in lsblk findmnt; do
+  require_cmd "$cmd" || exit 2
+done
+
+warning "Candidate devices are not automatically safe. Confirm every device with the storage and cluster teams before use."
+
+existing_gpfs_devices=""
+if [[ "$EXCLUDE_EXISTING_NSD" == "true" ]]; then
+  if command -v mmlsnsd >/dev/null 2>&1; then
+    # -X adds the local device path of each NSD, which the comparison below needs.
+    existing_gpfs_devices="$(mmlsnsd -X 2>/dev/null || true)"
+  elif command -v mmlsdisk >/dev/null 2>&1; then
+    # NOTE(review): mmlsdisk output may not contain device paths - verify this fallback.
+    existing_gpfs_devices="$(mmlsdisk all 2>/dev/null || true)"
+  else
+    warning "mmlsnsd and mmlsdisk are unavailable; cannot exclude existing GPFS devices"
+  fi
+fi
+
+section "Block device inventory"
+lsblk -dpno NAME,TYPE,SIZE,MODEL,SERIAL,MOUNTPOINT 2>&1 | tee -a "$LOG_FILE"
+
+section "Candidate devices"
+found=0
+# Only NAME and TYPE are split from the listing; MODEL, SERIAL and MOUNTPOINT can
+# be empty or contain spaces, so those columns are queried per device instead.
+while read -r name type; do
+  [[ "$type" == "disk" ]] || continue
+
+  if [[ "$EXCLUDE_MOUNTED" == "true" ]] && device_in_use "$name"; then
+    continue
+  fi
+
+  if [[ "$EXCLUDE_EXISTING_NSD" == "true" ]] && [[ -n "$existing_gpfs_devices" ]]; then
+    # -w keeps /dev/sdb from matching /dev/sdba or /dev/sdb1.
+    if grep -Fwq -- "$name" <<< "$existing_gpfs_devices"; then
+      continue
+    fi
+  fi
+
+  size="$(device_field SIZE "$name")"
+  model="$(device_field MODEL "$name")"
+  serial="$(device_field SERIAL "$name")"
+  mountpoint="$(device_field MOUNTPOINT "$name")"
+  printf 'OK: candidate=%s size=%s model=%s serial=%s mountpoint=%s\n' \
+    "$name" "${size:-unknown}" "${model:-unknown}" "${serial:-unknown}" "${mountpoint:-none}" | tee -a "$LOG_FILE"
+  found=1
+done < <(lsblk -dpno NAME,TYPE)
+
+if [[ "$found" -eq 0 ]]; then
+  warning "No candidate devices found with the selected filters"
+fi
diff --git a/infra-run/scripts/bash/gpfs/04_create_nsd_stanza.sh b/infra-run/scripts/bash/gpfs/04_create_nsd_stanza.sh
new file mode 100755
index 0000000..7ec352c
--- /dev/null
+++ b/infra-run/scripts/bash/gpfs/04_create_nsd_stanza.sh
@@ -0,0 +1,93 @@
+#!/usr/bin/env bash
+# Generate an NSD stanza file for new disks.
+# Only writes the stanza file; it never creates NSDs or modifies GPFS.
+set -o errexit
+set -o nounset
+set -o pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=00_env.sh
+. "$SCRIPT_DIR/00_env.sh"
+
+DEVICES=""
+SERVERS=""
+OUTPUT=""
+
+usage() {
+  printf 'Usage: %s --fs <filesystem> --devices "/dev/sdb /dev/sdc" --servers "node1,node2" --failure-group <id> --pool <pool> --usage <usage> [--output <file>]\n' "$(basename "$0")"
+}
+
+# Value options shift only what exists, so a missing value is caught by the checks below (exit 2).
+while [[ "$#" -gt 0 ]]; do
+  case "$1" in
+    --fs) FILESYSTEM="${2:-}"; shift "$(( $# > 1 ? 2 : 1 ))" ;;
+    --devices) DEVICES="${2:-}"; shift "$(( $# > 1 ? 2 : 1 ))" ;;
+    --servers) SERVERS="${2:-}"; shift "$(( $# > 1 ? 2 : 1 ))" ;;
+    --failure-group) FAILURE_GROUP="${2:-}"; shift "$(( $# > 1 ? 2 : 1 ))" ;;
+    --pool) STORAGE_POOL="${2:-}"; shift "$(( $# > 1 ? 2 : 1 ))" ;;
+    --usage) USAGE="${2:-}"; shift "$(( $# > 1 ? 2 : 1 ))" ;;
+    --output) OUTPUT="${2:-}"; shift "$(( $# > 1 ? 2 : 1 ))" ;;
+    -h|--help) usage; exit 0 ;;
+    *) critical "Unknown argument: $1"; usage; exit 2 ;;
+  esac
+done
+
+if [[ -z "$FILESYSTEM" || -z "$DEVICES" || -z "$SERVERS" || -z "$FAILURE_GROUP" || -z "$STORAGE_POOL" || -z "$USAGE" ]]; then
+  critical "Missing required input"
+  usage
+  exit 2
+fi
+
+if ! [[ "$FAILURE_GROUP" =~ ^-?[0-9]+$ ]]; then
+  critical "--failure-group must be an integer"
+  exit 2
+fi
+
+if ! usage_value_valid "$USAGE"; then
+  critical "--usage must be one of: dataOnly, metadataOnly, dataAndMetadata"
+  exit 2
+fi
+
+# Split --devices on whitespace without pathname expansion. read returns
+# non-zero at end of input even though the array was filled, hence "|| true".
+IFS=$' \t\n' read -r -d '' -a device_list <<< "$DEVICES" || true
+if [[ "${#device_list[@]}" -eq 0 ]]; then
+  critical "--devices does not contain any device path"
+  exit 2
+fi
+
+# Validate every device before the output file is created, so a bad argument
+# never leaves a partial stanza behind.
+for device in "${device_list[@]}"; do
+  if [[ "$device" != /dev/* ]]; then
+    critical "Device must be an absolute /dev path: $device"
+    exit 2
+  fi
+done
+
+# Sanitised filesystem name, used for NSD names and the default output path.
+safe_fs="$(printf '%s' "$FILESYSTEM" | tr -c '[:alnum:]_' '_')"
+
+if [[ -z "$OUTPUT" ]]; then
+  OUTPUT="/tmp/gpfs_nsd_${safe_fs}_${TIMESTAMP}.stanza"
+fi
+
+{
+  printf '# Generated GPFS NSD stanza for filesystem %s\n' "$FILESYSTEM"
+  printf '# Review with storage and cluster teams before use.\n\n'
+  for device in "${device_list[@]}"; do
+    # printf avoids the trailing newline that tr would turn into a "_".
+    device_base="$(printf '%s' "${device##*/}" | tr -c '[:alnum:]_' '_')"
+    nsd_name="nsd_${safe_fs}_${device_base}"
+    printf '%%nsd:\n'
+    printf '  device=%s\n' "$device"
+    printf '  nsd=%s\n' "$nsd_name"
+    printf '  servers=%s\n' "$SERVERS"
+    printf '  usage=%s\n' "$USAGE"
+    printf '  failureGroup=%s\n' "$FAILURE_GROUP"
+    printf '  pool=%s\n\n' "$STORAGE_POOL"
+  done
+} > "$OUTPUT"
+
+ok "Generated NSD stanza: $OUTPUT"
+warning "This script only writes a stanza file. It does not create NSDs or modify GPFS."
diff --git a/infra-run/scripts/bash/gpfs/05_add_nsd_to_filesystem.sh b/infra-run/scripts/bash/gpfs/05_add_nsd_to_filesystem.sh
new file mode 100755
index 0000000..082901f
--- /dev/null
+++ b/infra-run/scripts/bash/gpfs/05_add_nsd_to_filesystem.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# Create NSDs from a stanza file and add them to a filesystem.
+# Dry-run by default; real changes need --execute plus the typed EXECUTE confirmation.
+set -o errexit
+set -o nounset
+set -o pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=00_env.sh
+. "$SCRIPT_DIR/00_env.sh"
+
+usage() {
+  printf 'Usage: %s --fs <filesystem> --stanza <file> [--execute]\n' "$(basename "$0")"
+}
+
+# Value options shift only what exists, so a missing value is caught by the checks below (exit 2).
+while [[ "$#" -gt 0 ]]; do
+  case "$1" in
+    --fs) FILESYSTEM="${2:-}"; shift "$(( $# > 1 ? 2 : 1 ))" ;;
+    --stanza) NSD_STANZA="${2:-}"; shift "$(( $# > 1 ? 2 : 1 ))" ;;
+    --execute) DRY_RUN=false; shift ;;
+    -h|--help) usage; exit 0 ;;
+    *) critical "Unknown argument: $1"; usage; exit 2 ;;
+  esac
+done
+
+if [[ -z "$FILESYSTEM" || -z "$NSD_STANZA" ]]; then
+  critical "Missing required --fs or --stanza"
+  usage
+  exit 2
+fi
+
+if [[ ! -r "$NSD_STANZA" ]]; then
+  critical "Stanza file does not exist or is not readable: $NSD_STANZA"
+  exit 2
+fi
+
+for cmd in mmlsfs mmcrnsd mmadddisk; do
+  require_cmd "$cmd" || exit 2
+done
+
+if ! mmlsfs "$FILESYSTEM" >/dev/null 2>&1; then
+  critical "Filesystem does not exist or cannot be queried: $FILESYSTEM"
+  exit 1
+fi
+
+warning "Adding NSDs must be coordinated with storage, GPFS, application, and change-management teams."
+section "Planned GPFS changes"
+# Labelled PLAN because this preview is printed in execution mode as well.
+ok "PLAN: mmcrnsd -F $NSD_STANZA"
+ok "PLAN: mmadddisk $FILESYSTEM -F $NSD_STANZA"
+
+confirm_execute "create NSDs and add disks to $FILESYSTEM"
+
+if [[ "$DRY_RUN" == "false" ]]; then
+  run_cmd mmcrnsd -F "$NSD_STANZA"
+  run_cmd mmadddisk "$FILESYSTEM" -F "$NSD_STANZA"
+
+  section "Post-add NSD inventory"
+  mmlsnsd 2>&1 | tee -a "$LOG_FILE" || warning "mmlsnsd command failed after execution"
+  section "Post-add filesystem disks"
+  mmlsdisk "$FILESYSTEM" 2>&1 | tee -a "$LOG_FILE" || warning "mmlsdisk command failed after execution"
+fi
diff --git a/infra-run/scripts/bash/gpfs/06_rebalance_filesystem.sh b/infra-run/scripts/bash/gpfs/06_rebalance_filesystem.sh
new file mode 100755
index 0000000..a918220
--- /dev/null
+++ b/infra-run/scripts/bash/gpfs/06_rebalance_filesystem.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+# Optional restripe/rebalance of a filesystem after disks were added.
+# Dry-run by default; real changes need --execute plus the typed EXECUTE confirmation.
+set -o errexit
+set -o nounset
+set -o pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=00_env.sh
+. "$SCRIPT_DIR/00_env.sh"
+
+BACKGROUND=false
+
+usage() {
+  printf 'Usage: %s --fs <filesystem> [--execute] [--background]\n' "$(basename "$0")"
+}
+
+# Value options shift only what exists, so a missing value is caught by the check below (exit 2).
+while [[ "$#" -gt 0 ]]; do
+  case "$1" in
+    --fs) FILESYSTEM="${2:-}"; shift "$(( $# > 1 ? 2 : 1 ))" ;;
+    --execute) DRY_RUN=false; shift ;;
+    --background) BACKGROUND=true; shift ;;
+    -h|--help) usage; exit 0 ;;
+    *) critical "Unknown argument: $1"; usage; exit 2 ;;
+  esac
+done
+
+if [[ -z "$FILESYSTEM" ]]; then
+  critical "Missing required --fs <filesystem>"
+  usage
+  exit 2
+fi
+
+for cmd in mmlsdisk mmrestripefs; do
+  require_cmd "$cmd" || exit 2
+done
+
+warning "Restripe/rebalance can be I/O intensive. Run only in an approved change window."
+
+section "Current disk balance"
+mmlsdisk "$FILESYSTEM" 2>&1 | tee -a "$LOG_FILE" || warning "Unable to show current disk state"
+
+section "Planned rebalance"
+if [[ "$BACKGROUND" == "true" ]]; then
+  if [[ "$DRY_RUN" == "true" ]]; then
+    ok "DRY-RUN: mmrestripefs $FILESYSTEM -b &"
+  else
+    confirm_execute "background restripe for $FILESYSTEM"
+    ok "RUN: mmrestripefs $FILESYSTEM -b &"
+    mmrestripefs "$FILESYSTEM" -b 2>&1 | tee -a "$LOG_FILE" &
+  fi
+else
+  ok "PLAN: mmrestripefs $FILESYSTEM -b"
+  confirm_execute "restripe for $FILESYSTEM"
+  if [[ "$DRY_RUN" == "false" ]]; then
+    run_cmd mmrestripefs "$FILESYSTEM" -b
+  fi
+fi
diff --git a/infra-run/scripts/bash/gpfs/07_postcheck_gpfs.sh b/infra-run/scripts/bash/gpfs/07_postcheck_gpfs.sh
new file mode 100755
index 0000000..080a52f
--- /dev/null
+++ b/infra-run/scripts/bash/gpfs/07_postcheck_gpfs.sh
@@ -0,0 +1,96 @@
+#!/usr/bin/env bash
+# Read-only post-change validation for one GPFS filesystem.
+# Exit codes: 0 OK, 1 one or more checks reported an issue, 2 invalid input.
+set -o errexit
+set -o nounset
+set -o pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=00_env.sh
+. "$SCRIPT_DIR/00_env.sh"
+
+usage() {
+  printf 'Usage: %s --fs <filesystem>\n' "$(basename "$0")"
+}
+
+# Value options shift only what exists, so a missing value is caught by the check below (exit 2).
+while [[ "$#" -gt 0 ]]; do
+  case "$1" in
+    --fs) FILESYSTEM="${2:-}"; shift "$(( $# > 1 ? 2 : 1 ))" ;;
+    -h|--help) usage; exit 0 ;;
+    *) critical "Unknown argument: $1"; usage; exit 2 ;;
+  esac
+done
+
+if [[ -z "$FILESYSTEM" ]]; then
+  critical "Missing required --fs <filesystem>"
+  usage
+  exit 2
+fi
+
+issues=0
+
+# Print a section titled $1 and run the remaining arguments when the command
+# exists; a failing command is logged as CRITICAL and sets issues=1.
+run_check() {
+  local description="$1"
+  shift
+
+  section "$description"
+  if command -v "$1" >/dev/null 2>&1; then
+    "$@" 2>&1 | tee -a "$LOG_FILE" || {
+      critical "$description failed"
+      issues=1
+    }
+  else
+    warning "$1 command not available, skipping"
+  fi
+}
+
+run_check "GPFS daemon state" mmgetstate -a
+run_check "Target filesystem mount state" mmlsmount "$FILESYSTEM"
+run_check "Target filesystem disks" mmlsdisk "$FILESYSTEM"
+run_check "NSD inventory" mmlsnsd
+
+section "Filesystem capacity"
+if command -v df >/dev/null 2>&1; then
+  df -h 2>&1 | awk -v fs="$FILESYSTEM" 'NR == 1 || index($0, fs) || $0 ~ /gpfs|mmfs/' | tee -a "$LOG_FILE"
+else
+  warning "df command not available, skipping"
+fi
+
+section "Cluster health"
+if command -v mmhealth >/dev/null 2>&1; then
+  health_output="$(mmhealth cluster show 2>&1 || true)"
+  printf '%s\n' "$health_output" | tee -a "$LOG_FILE"
+  # Evaluate the per-component counters; a plain word search for "failed" or
+  # "degraded" would also match the column headings of a healthy cluster.
+  if printf '%s\n' "$health_output" | health_output_has_issues; then
+    critical "Cluster health output indicates an issue"
+    issues=1
+  fi
+else
+  warning "mmhealth command not available, skipping"
+fi
+
+section "Recent GPFS journal entries"
+if command -v journalctl >/dev/null 2>&1; then
+  journalctl -u 'gpfs*' -n 50 --no-pager 2>&1 | tee -a "$LOG_FILE" || warning "journalctl GPFS query failed"
+else
+  warning "journalctl command not available, skipping"
+fi
+
+section "Recent kernel messages"
+if command -v dmesg >/dev/null 2>&1; then
+  dmesg -T 2>/dev/null | tail -50 | tee -a "$LOG_FILE" || warning "dmesg query failed"
+else
+  warning "dmesg command not available, skipping"
+fi
+
+if [[ "$issues" -eq 0 ]]; then
+  ok "Post-check completed without detected operational failures"
+  exit 0
+fi
+
+critical "Post-check detected one or more issues"
+exit 1
diff --git a/infra-run/scripts/bash/gpfs/08_generate_report.sh b/infra-run/scripts/bash/gpfs/08_generate_report.sh
new file mode 100755
index 0000000..be71a83
--- /dev/null
+++ b/infra-run/scripts/bash/gpfs/08_generate_report.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+# Write a plain-text report about one GPFS filesystem for the change record.
+# Runs read-only commands only; missing or failing commands become WARNING lines.
+set -o errexit
+set -o nounset
+set -o pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=00_env.sh
+. "$SCRIPT_DIR/00_env.sh"
+
+REPORT_FILE=""
+
+usage() {
+  printf 'Usage: %s --fs <filesystem>\n' "$(basename "$0")"
+}
+
+# Value options shift only what exists, so a missing value is caught by the check below (exit 2).
+while [[ "$#" -gt 0 ]]; do
+  case "$1" in
+    --fs) FILESYSTEM="${2:-}"; shift "$(( $# > 1 ? 2 : 1 ))" ;;
+    -h|--help) usage; exit 0 ;;
+    *) critical "Unknown argument: $1"; usage; exit 2 ;;
+  esac
+done
+
+if [[ -z "$FILESYSTEM" ]]; then
+  critical "Missing required --fs <filesystem>"
+  usage
+  exit 2
+fi
+
+# Sanitise the name so a filesystem given as a path cannot break the report file name.
+REPORT_FILE="/tmp/gpfs_extend_report_$(printf '%s' "$FILESYSTEM" | tr -c '[:alnum:]_' '_')_${TIMESTAMP}.txt"
+
+# Append a section titled $1 to $REPORT_FILE containing the output of the
+# remaining arguments, or a WARNING line when the command is missing or fails.
+append_section() {
+  local title="$1"
+  shift
+
+  {
+    printf '\n== %s ==\n' "$title"
+    if command -v "$1" >/dev/null 2>&1; then
+      "$@" 2>&1 || printf 'WARNING: command failed: %s\n' "$*"
+    else
+      printf 'WARNING: command not available: %s\n' "$1"
+    fi
+  } >> "$REPORT_FILE"
+}
+
+{
+  printf 'GPFS / Spectrum Scale Filesystem Expansion Report\n'
+  printf 'Hostname: %s\n' "$(hostname 2>/dev/null || printf 'unknown')"
+  printf 'Date: %s\n' "$(date)"
+  printf 'Target filesystem: %s\n' "$FILESYSTEM"
+} > "$REPORT_FILE"
+
+append_section "GPFS daemon state" mmgetstate -a
+append_section "Cluster definition" mmlscluster
+append_section "Managers and quorum" mmlsmgr
+append_section "Target filesystem mount state" mmlsmount "$FILESYSTEM"
+append_section "Target filesystem disks" mmlsdisk "$FILESYSTEM"
+append_section "NSD inventory" mmlsnsd
+append_section "Filesystem capacity" df -h
+
+if command -v mmhealth >/dev/null 2>&1; then
+  append_section "Cluster health" mmhealth cluster show
+else
+  printf '\n== Cluster health ==\nWARNING: mmhealth command not available\n' >> "$REPORT_FILE"
+fi
+
+if command -v journalctl >/dev/null 2>&1; then
+  append_section "Recent GPFS journal entries" journalctl -u 'gpfs*' -n 50 --no-pager
+fi
+
+if command -v dmesg >/dev/null 2>&1; then
+  {
+    printf '\n== Recent kernel messages ==\n'
+    dmesg -T 2>/dev/null | tail -50 || printf 'WARNING: dmesg query failed\n'
+  } >> "$REPORT_FILE"
+fi
+
+ok "Generated report: $REPORT_FILE"
diff --git a/infra-run/scripts/bash/gpfs/README.md b/infra-run/scripts/bash/gpfs/README.md
new file mode 100644
index 0000000..383e093
--- /dev/null
+++ b/infra-run/scripts/bash/gpfs/README.md
@@ -0,0 +1,139 @@
+# GPFS / IBM Spectrum Scale Filesystem Expansion Toolkit
+
+Safe, sanitized Bash examples for planning and executing a GPFS / IBM Spectrum Scale filesystem expansion. The scripts are written as portfolio-grade operational tooling for a Linux Infrastructure Engineer: conservative defaults, clear validation, dry-run behavior, and explicit operator confirmation before changes.
+
+These scripts are examples. Exact GPFS commands, flags, quorum practices, failure-group design, and storage naming standards vary by Spectrum Scale version and site policy.
+
+## Concepts
+
+- **Cluster** - the Spectrum Scale administrative domain containing the nodes, daemon configuration, quorum policy, filesystems, and NSDs.
+- **Node** - a server participating in the GPFS cluster. Nodes may be clients, NSD servers, quorum nodes, manager-capable nodes, or a mix of roles.
+- **Quorum** - the voting mechanism that protects the cluster from split-brain conditions. Expansion work should not proceed during quorum instability.
+- **Filesystem** - the GPFS namespace and data layout presented to clients, backed by one or more NSDs.
+- **NSD** - Network Shared Disk, the GPFS abstraction for a disk or LUN that is served to the cluster.
+- **Failure group** - a placement hint that tells GPFS which disks share a failure domain, such as an enclosure, rack, site, controller pair, or storage array.
+- **Storage pool** - a named pool of NSDs used for placement and lifecycle policy, commonly `system` plus optional data pools.
+- **Restripe/rebalance** - the operation that redistributes data after disks are added. It can be I/O intensive and should run only in an approved change window.
+
+## Required Tools
+
+Common GPFS / Spectrum Scale tools expected in production include:
+
+- `mmgetstate`
+- `mmlscluster`
+- `mmlsfs`
+- `mmlsdisk`
+- `mmlsmount`
+- `mmlsmgr`
+- `mmlsnsd`
+- `mmcrnsd`
+- `mmadddisk`
+- `mmrestripefs`
+
+The toolkit also uses common Linux tools such as `df`, `lsblk`, `findmnt`, `journalctl`, and `dmesg` where available. Missing optional commands are reported as `WARNING` and skipped.
+
+## Safety Model
+
+- Default mode is dry-run.
+- Any `DRY_RUN` value other than `false` keeps dry-run mode enabled.
+- Real GPFS modifications require `--execute`.
+- Destructive or high-impact steps also prompt for `EXECUTE`.
+- Disk detection is read-only and never partitions, formats, wipes, or modifies devices.
+- Device selection must always be confirmed with the storage team and cluster owners.
+- The scripts do not assume production disk names.
+
+Output uses a consistent status format:
+
+- `OK`
+- `WARNING`
+- `CRITICAL`
+
+Exit codes:
+
+- `0` - OK
+- `1` - operational validation failure
+- `2` - invalid input or missing requirement
+
+## Scripts
+
+- `00_env.sh` - shared configuration and helper functions.
+- `01_cluster_overview.sh` - read-only cluster overview.
+- `02_precheck_gpfs.sh` - pre-expansion validation for a target filesystem.
+- `03_detect_new_disks.sh` - read-only candidate block-device discovery.
+- `04_create_nsd_stanza.sh` - generate an NSD stanza file.
+- `05_add_nsd_to_filesystem.sh` - create NSDs and add disks to a filesystem, dry-run by default.
+- `06_rebalance_filesystem.sh` - optional restripe/rebalance, dry-run by default.
+- `07_postcheck_gpfs.sh` - post-change validation.
+- `08_generate_report.sh` - text report for the change record.
+- `gpfs_extend_runbook.sh` - guided order of operations plus safe read-only checks.
+
+## Example Workflow
+
+```bash
+cd infra-run/scripts/bash/gpfs
+
+./01_cluster_overview.sh
+./02_precheck_gpfs.sh --fs gpfs01
+./03_detect_new_disks.sh --exclude-mounted --exclude-existing-nsd
+
+./04_create_nsd_stanza.sh \
+  --fs gpfs01 \
+  --devices "/dev/sdb /dev/sdc" \
+  --servers "gpfsnsd01,gpfsnsd02" \
+  --failure-group 10 \
+  --pool system \
+  --usage dataAndMetadata
+```
+
+Review the generated stanza with the storage and cluster teams. Confirm device identity, LUN masking, multipath naming, failure group placement, and site standards before continuing.
+
+Dry-run the add step:
+
+```bash
+./05_add_nsd_to_filesystem.sh \
+  --fs gpfs01 \
+  --stanza /tmp/gpfs_nsd_gpfs01_YYYYmmdd_HHMMSS.stanza
+```
+
+Execute only in an approved change window:
+
+```bash
+./05_add_nsd_to_filesystem.sh \
+  --fs gpfs01 \
+  --stanza /tmp/gpfs_nsd_gpfs01_YYYYmmdd_HHMMSS.stanza \
+  --execute
+```
+
+Optional rebalance:
+
+```bash
+./06_rebalance_filesystem.sh --fs gpfs01
+./06_rebalance_filesystem.sh --fs gpfs01 --execute --background
+```
+
+Post-check and report:
+
+```bash
+./07_postcheck_gpfs.sh --fs gpfs01
+./08_generate_report.sh --fs gpfs01
+```
+
+Runbook helper:
+
+```bash
+./gpfs_extend_runbook.sh \
+  --fs gpfs01 \
+  --devices "/dev/sdb /dev/sdc" \
+  --servers "gpfsnsd01,gpfsnsd02" \
+  --failure-group 10 \
+  --pool system \
+  --usage dataAndMetadata
+```
+
+## Operational Notes
+
+- Do not run these scripts blindly on production clusters.
+- Confirm disk and multipath identity with the storage team before creating NSDs.
+- Validate quorum and manager health before expansion.
+- Confirm application I/O risk and rollback procedures before `mmadddisk` or `mmrestripefs`.
+- Confirm the Spectrum Scale version and local standards for stanza fields before executing changes.
diff --git a/infra-run/scripts/bash/gpfs/gpfs_extend_runbook.sh b/infra-run/scripts/bash/gpfs/gpfs_extend_runbook.sh
new file mode 100755
index 0000000..6465d9c
--- /dev/null
+++ b/infra-run/scripts/bash/gpfs/gpfs_extend_runbook.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+# Print the recommended expansion flow and run the read-only steps (01-03),
+# plus stanza generation (04) when devices, servers and failure group are given.
+set -o errexit
+set -o nounset
+set -o pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=00_env.sh
+. "$SCRIPT_DIR/00_env.sh"
+
+DEVICES=""
+SERVERS=""
+EXECUTE=false
+
+usage() {
+  printf 'Usage: %s --fs <filesystem> --devices "/dev/sdb /dev/sdc" --servers "node1,node2" --failure-group <id> --pool <pool> --usage <usage> [--execute]\n' "$(basename "$0")"
+}
+
+# Value options shift only what exists, so a missing value is handled like an omitted option.
+while [[ "$#" -gt 0 ]]; do
+  case "$1" in
+    --fs) FILESYSTEM="${2:-}"; shift "$(( $# > 1 ? 2 : 1 ))" ;;
+    --devices) DEVICES="${2:-}"; shift "$(( $# > 1 ? 2 : 1 ))" ;;
+    --servers) SERVERS="${2:-}"; shift "$(( $# > 1 ? 2 : 1 ))" ;;
+    --failure-group) FAILURE_GROUP="${2:-}"; shift "$(( $# > 1 ? 2 : 1 ))" ;;
+    --pool) STORAGE_POOL="${2:-}"; shift "$(( $# > 1 ? 2 : 1 ))" ;;
+    --usage) USAGE="${2:-}"; shift "$(( $# > 1 ? 2 : 1 ))" ;;
+    --execute) EXECUTE=true; DRY_RUN=false; shift ;;
+    -h|--help) usage; exit 0 ;;
+    *) critical "Unknown argument: $1"; usage; exit 2 ;;
+  esac
+done
+
+section "Recommended GPFS Expansion Flow"
+cat <<FLOW
+Step 1: Cluster overview
+  $SCRIPT_DIR/01_cluster_overview.sh
+
+Step 2: Precheck target filesystem
+  $SCRIPT_DIR/02_precheck_gpfs.sh --fs <filesystem>
+
+Step 3: Detect candidate disks
+  $SCRIPT_DIR/03_detect_new_disks.sh --exclude-mounted --exclude-existing-nsd
+
+Step 4: Generate NSD stanza
+  $SCRIPT_DIR/04_create_nsd_stanza.sh --fs <filesystem> --devices "/dev/sdb /dev/sdc" --servers "node1,node2" --failure-group <id> --pool <pool> --usage <usage>
+
+Step 5: Create NSDs and add disks to filesystem
+  $SCRIPT_DIR/05_add_nsd_to_filesystem.sh --fs <filesystem> --stanza <file> [--execute]
+
+Step 6: Optional restripe/rebalance
+  $SCRIPT_DIR/06_rebalance_filesystem.sh --fs <filesystem> [--execute] [--background]
+
+Step 7: Post-check
+  $SCRIPT_DIR/07_postcheck_gpfs.sh --fs <filesystem>
+
+Step 8: Generate report
+  $SCRIPT_DIR/08_generate_report.sh --fs <filesystem>
+FLOW
+
+if [[ -z "$FILESYSTEM" ]]; then
+  warning "No --fs supplied. Printed runbook only."
+  exit 0
+fi
+
+if [[ "$EXECUTE" == "true" ]]; then
+  warning "--execute was supplied. Destructive steps still require the individual script confirmation prompt."
+else
+  DRY_RUN=true
+fi
+
+section "Running Safe Read-Only Steps"
+"$SCRIPT_DIR/01_cluster_overview.sh" || warning "Cluster overview reported warnings or failures"
+"$SCRIPT_DIR/02_precheck_gpfs.sh" --fs "$FILESYSTEM" || warning "Precheck reported warnings or failures"
+"$SCRIPT_DIR/03_detect_new_disks.sh" --exclude-mounted --exclude-existing-nsd || warning "Disk detection reported warnings or failures"
+
+if [[ -n "$DEVICES" || -n "$SERVERS" || -n "$FAILURE_GROUP" ]]; then
+  if [[ -z "$DEVICES" || -z "$SERVERS" || -z "$FAILURE_GROUP" ]]; then
+    warning "NSD stanza generation requires --devices, --servers, --failure-group, --pool, and --usage"
+  else
+    "$SCRIPT_DIR/04_create_nsd_stanza.sh" \
+      --fs "$FILESYSTEM" \
+      --devices "$DEVICES" \
+      --servers "$SERVERS" \
+      --failure-group "$FAILURE_GROUP" \
+      --pool "$STORAGE_POOL" \
+      --usage "$USAGE" || warning "NSD stanza generation reported warnings or failures"
+  fi
+fi
+
+section "Next Manual Step"
+if [[ "$EXECUTE" == "true" ]]; then
+  warning "Run 05_add_nsd_to_filesystem.sh manually with --execute after reviewing the generated stanza."
+else
+  ok "Review outputs and generated stanza. Add disks only through 05_add_nsd_to_filesystem.sh with --execute."
+fi