Add GPFS storage expansion toolkit

This commit is contained in:
Mateusz Suski
2026-05-05 21:40:46 +00:00
parent c42d8bfb8f
commit 5dd8c34952
11 changed files with 925 additions and 0 deletions
+114
View File
@@ -0,0 +1,114 @@
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
# Shared defaults for the toolkit. Each setting may be pre-set in the caller's
# environment; a default is applied only when the variable is unset or empty.
if [[ -z "${TIMESTAMP:-}" ]]; then
  TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
fi
: "${DRY_RUN:=true}"
# The log file name embeds the timestamp, so it must be resolved after it.
: "${LOG_FILE:=/tmp/gpfs_extend_${TIMESTAMP}.log}"
: "${FILESYSTEM:=}"
: "${NSD_STANZA:=}"
: "${FAILURE_GROUP:=}"
: "${STORAGE_POOL:=system}"
: "${USAGE:=dataAndMetadata}"
# Print one status line of the form "LEVEL: message" to stdout and append the
# same line to $LOG_FILE.
#   $1    - level tag (for example OK, WARNING, CRITICAL)
#   $2... - message words, joined with single spaces
log() {
  local level="$1"
  local message="${*:2}"
  printf '%s\n' "${level}: ${message}" | tee -a "$LOG_FILE"
}
# Level-specific wrappers around log(); all arguments form the message.
ok() { log "OK" "$@"; }
warning() { log "WARNING" "$@"; }
critical() { log "CRITICAL" "$@"; }
# Check that a mandatory command is on PATH and log the result.
#   $1 - command name
# Returns 0 when the command is found, 1 otherwise.
require_cmd() {
  local cmd="$1"
  if ! command -v "$cmd" >/dev/null 2>&1; then
    critical "Required command not found: $cmd"
    return 1
  fi
  ok "Command available: $cmd"
  return 0
}
# Check that an optional GPFS command is on PATH. Stays silent on success and
# logs a WARNING when the command is missing.
#   $1 - command name
# Returns 0 when the command is found, 1 otherwise.
validate_gpfs_command() {
  local cmd="$1"
  command -v "$cmd" >/dev/null 2>&1 && return 0
  warning "GPFS command not available, skipping: $cmd"
  return 1
}
# Run a state-changing command, honouring dry-run mode.
#   $@ - command and its arguments
# When DRY_RUN is "true" the command is only logged. Otherwise it is executed
# with stdout and stderr shown and appended to $LOG_FILE.
# Returns 2 when called without a command, 0 in dry-run mode, otherwise the
# status of the command/tee pipeline.
run_cmd() {
  if (( $# == 0 )); then
    critical "run_cmd called without a command"
    return 2
  fi
  case "$DRY_RUN" in
    true)
      log "OK" "DRY-RUN: $*"
      return 0
      ;;
  esac
  log "OK" "RUN: $*"
  "$@" 2>&1 | tee -a "$LOG_FILE"
}
# Run a read-only command regardless of dry-run mode.
#   $@ - command and its arguments
# Output (stdout and stderr) is shown and appended to $LOG_FILE.
# Returns 2 when called without a command, otherwise the status of the
# command/tee pipeline.
run_readonly() {
  if (( $# == 0 )); then
    critical "run_readonly called without a command"
    return 2
  fi
  log "OK" "READ-ONLY: $*"
  "$@" 2>&1 | tee -a "$LOG_FILE"
}
# Gate a state-changing step behind an explicit operator confirmation.
#   $1 - human-readable description of the change (default: "GPFS change")
# In dry-run mode (DRY_RUN == "true") it only logs a notice and returns 0.
# Otherwise the operator must type EXECUTE on stdin; any other answer,
# including end-of-input, logs a CRITICAL message and exits the script with
# status 1.
confirm_execute() {
  local target="${1:-GPFS change}"
  local confirmation=""
  if [[ "$DRY_RUN" == "true" ]]; then
    ok "Dry-run mode enabled. No changes will be made."
    return 0
  fi
  warning "Execution mode requested for: $target"
  warning "Coordinate this change with storage, GPFS, application, and change-management teams."
  printf 'Type EXECUTE to continue: '
  # read returns non-zero at end-of-input (closed stdin, empty pipe). Without
  # the guard, errexit would kill the script here before the refusal is
  # logged; the answer check below still rejects an empty reply.
  read -r confirmation || true
  if [[ "$confirmation" != "EXECUTE" ]]; then
    critical "Confirmation failed. Aborting."
    exit 1
  fi
  ok "Execution confirmed by operator."
}
# Validate an NSD usage keyword.
#   $1 - candidate value
# Returns 0 for dataOnly, metadataOnly or dataAndMetadata (exact, case-sensitive
# match) and 1 for anything else.
usage_value_valid() {
  local candidate="$1"
  [[ "$candidate" == "dataOnly" \
    || "$candidate" == "metadataOnly" \
    || "$candidate" == "dataAndMetadata" ]]
}
# Print a section banner (a blank line followed by "== title ==") to stdout and
# append it to $LOG_FILE.
#   $1 - section title
section() {
  local title="$1"
  {
    printf '\n'
    printf '== %s ==\n' "$title"
  } | tee -a "$LOG_FILE"
}
+37
View File
@@ -0,0 +1,37 @@
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"
# Print a section banner and run one optional read-only GPFS command.
#   $1    - section title / description
#   $2... - command and its arguments
# A command that is not installed is skipped (validate_gpfs_command logs the
# warning); a command that fails is reported as a WARNING, not as an error.
run_optional() {
  local description="$1"
  local -a command_line=("${@:2}")
  section "$description"
  validate_gpfs_command "${command_line[0]}" || return 0
  if ! run_readonly "${command_line[@]}"; then
    warning "$description command failed"
  fi
}
section "GPFS / Spectrum Scale Cluster Overview"
ok "Log file: $LOG_FILE"

# Read-only GPFS queries. Each one is skipped with a warning when its command
# is not installed, and a failing query never aborts the overview.
run_optional "GPFS daemon state on all nodes" mmgetstate -a
run_optional "Cluster definition" mmlscluster
run_optional "Cluster configuration" mmlsconfig
run_optional "Managers and quorum information" mmlsmgr
run_optional "NSD inventory" mmlsnsd
# NOTE(review): confirm that mmlsdisk accepts "all" as a device on the target
# Spectrum Scale release; if it does not, this step only produces a warning.
run_optional "Disk inventory for all filesystems" mmlsdisk all
run_optional "Filesystem definitions" mmlsfs all
run_optional "Mount state for all filesystems" mmlsmount all

section "Mounted GPFS filesystems from df"
if ! command -v df >/dev/null 2>&1; then
  warning "df command not available"
elif ! df -h -t gpfs 2>/dev/null | tee -a "$LOG_FILE"; then
  # The type filter failed, so fall back to filtering the full df listing.
  df -h | awk 'NR == 1 || /gpfs|mmfs/' | tee -a "$LOG_FILE"
fi
+103
View File
@@ -0,0 +1,103 @@
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"
# Print the one-line command synopsis for this script to stdout.
usage() {
  local prog
  prog="$(basename "$0")"
  printf 'Usage: %s --fs <filesystem>\n' "$prog"
}
# Parse the command line into the global FILESYSTEM setting.
#   --fs <filesystem>  target filesystem
#   -h | --help        print usage and exit 0
# An unknown argument, or --fs without a value, is reported and exits with
# status 2. The explicit value check matters: with only one argument left,
# `shift 2` fails and errexit would end the script silently with status 1.
parse_args() {
  while [[ "$#" -gt 0 ]]; do
    case "$1" in
      --fs)
        if [[ "$#" -lt 2 ]]; then
          critical "Option $1 requires a value"
          usage
          exit 2
        fi
        FILESYSTEM="$2"
        shift 2
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        critical "Unknown argument: $1"
        usage
        exit 2
        ;;
    esac
  done
}
parse_args "$@"
# A target filesystem is mandatory.
[[ -n "$FILESYSTEM" ]] || {
  critical "Missing required --fs <filesystem>"
  usage
  exit 2
}

# Check every required command before giving up, so that one run reports all
# missing tools.
missing=0
required_cmds=(mmgetstate mmlscluster mmlsfs mmlsdisk mmlsmount mmlsmgr df)
for cmd in "${required_cmds[@]}"; do
  if ! require_cmd "$cmd"; then
    missing=1
  fi
done
(( missing == 0 )) || exit 2

# Set to 1 by any later check that counts as an operational failure.
issues=0
# Read `mmgetstate -a` output on stdin and decide whether any node is not
# active. Only node rows (first field is the numeric node number) are
# inspected, so the column header and separator line are not mistaken for
# inactive nodes; the state is taken from the last field, so no trailing blank
# is needed after "active".
# Returns 0 when a node is not active or no node rows were found, 1 when every
# listed node is active.
gpfs_state_has_inactive() {
  awk '
    $1 ~ /^[0-9]+$/ {
      rows++
      if ($NF != "active") { bad = 1 }
    }
    END { exit (bad || rows == 0) ? 0 : 1 }
  '
}

section "GPFS daemon state"
# A failing mmgetstate must not abort the precheck; its output is still logged.
state_output="$(mmgetstate -a 2>&1 || true)"
printf '%s\n' "$state_output" | tee -a "$LOG_FILE"
if printf '%s\n' "$state_output" | gpfs_state_has_inactive; then
  warning "Not all GPFS nodes appear active"
fi
# The target filesystem must be known to GPFS; nothing else can be checked
# without it, so a failure here ends the precheck immediately.
section "Target filesystem definition"
if ! mmlsfs "$FILESYSTEM" 2>&1 | tee -a "$LOG_FILE"; then
  critical "Filesystem does not exist or cannot be queried: $FILESYSTEM"
  exit 1
fi
ok "Filesystem exists: $FILESYSTEM"

# Mount state is informational: an unmounted filesystem only raises a warning.
section "Target filesystem mount state"
mount_output="$(mmlsmount "$FILESYSTEM" 2>&1 || true)"
printf '%s\n' "$mount_output" | tee -a "$LOG_FILE"
if printf '%s\n' "$mount_output" | grep -Eiq 'not mounted|no file systems were found|not found'; then
  warning "Filesystem may not be mounted anywhere: $FILESYSTEM"
fi

# Failing to list the current disks is recorded as an operational failure.
section "Existing disks"
mmlsdisk "$FILESYSTEM" 2>&1 | tee -a "$LOG_FILE" || {
  critical "Unable to list disks for filesystem: $FILESYSTEM"
  issues=1
}
section "Filesystem capacity"
# Show the df header plus every line that mentions the target filesystem or a
# GPFS mount. The name is matched literally with index(), not as a regular
# expression. df exits non-zero when any mount cannot be queried (for example
# a stale network mount); under pipefail and errexit that would abort the
# precheck, so it is reduced to a warning.
df -h 2>&1 \
  | awk -v fs="$FILESYSTEM" 'NR == 1 || index($0, fs) || /gpfs|mmfs/' \
  | tee -a "$LOG_FILE" \
  || warning "df reported an error; the capacity listing may be incomplete"
# Read `mmhealth cluster show` output on stdin and decide whether it reports a
# problem. The summary table has the columns
#   Component  Total  Failed  Degraded  Healthy  Other
# so a plain keyword search always matches the header. The header and separator
# lines are skipped; for table rows only a positive Failed or Degraded count
# counts as a problem. Any other line (for example an error message) is still
# checked for the problem keywords.
# Returns 0 when a problem is reported, 1 otherwise.
mmhealth_reports_problem() {
  awk '
    $1 == "Component" { next }
    /^[-[:space:]]*$/ { next }
    NF >= 4 && $2 ~ /^[0-9]+$/ {
      if ($3 + 0 > 0 || $4 + 0 > 0) { bad = 1 }
      next
    }
    tolower($0) ~ /degraded|failed|down|error|unhealthy/ { bad = 1 }
    END { exit bad ? 0 : 1 }
  '
}

section "Cluster health"
if command -v mmhealth >/dev/null 2>&1; then
  # A failing mmhealth must not abort the precheck; its output is still logged.
  health_output="$(mmhealth cluster show 2>&1 || true)"
  printf '%s\n' "$health_output" | tee -a "$LOG_FILE"
  if printf '%s\n' "$health_output" | mmhealth_reports_problem; then
    warning "Cluster health output indicates a degraded condition"
  fi
else
  warning "mmhealth command not available, skipping health check"
fi
# Manager/quorum information must be queryable before an expansion.
section "Managers and quorum"
if ! mmlsmgr 2>&1 | tee -a "$LOG_FILE"; then
  critical "Unable to query GPFS manager/quorum information"
  issues=1
fi

# Final verdict: exit 1 when any check above recorded an operational failure.
if [[ "$issues" -ne 0 ]]; then
  critical "Precheck found operational validation failures"
  exit 1
fi
ok "Precheck completed for filesystem: $FILESYSTEM"
exit 0
+83
View File
@@ -0,0 +1,83 @@
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"
# Filter switches; both are off unless requested on the command line.
EXCLUDE_MOUNTED=false EXCLUDE_EXISTING_NSD=false

# Print the one-line command synopsis for this script to stdout.
usage() {
  local prog
  prog="$(basename "$0")"
  printf 'Usage: %s [--exclude-mounted] [--exclude-existing-nsd]\n' "$prog"
}
# All options are value-less switches, so the arguments can be walked directly.
# An unknown argument is reported and exits with status 2.
for arg in "$@"; do
  case "$arg" in
    --exclude-mounted) EXCLUDE_MOUNTED=true ;;
    --exclude-existing-nsd) EXCLUDE_EXISTING_NSD=true ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      critical "Unknown argument: $arg"
      usage
      exit 2
      ;;
  esac
done
for cmd in lsblk findmnt; do
  require_cmd "$cmd" || exit 2
done

warning "Candidate devices are not automatically safe. Confirm every device with the storage and cluster teams before use."

# Print one lsblk column for one device, with surrounding blanks removed.
#   $1 - lsblk column name, $2 - device path
# Each column is queried on its own because MODEL may contain blanks and
# SERIAL/MOUNTPOINT may be empty, so a multi-column line cannot be split on
# whitespace reliably.
lsblk_field() {
  local raw
  raw="$(lsblk -dno "$1" "$2" 2>/dev/null || true)"
  raw="${raw#"${raw%%[![:space:]]*}"}"
  raw="${raw%"${raw##*[![:space:]]}"}"
  printf '%s' "$raw"
}

# Print a comma-separated list of the mount points lsblk reports for a device
# and the devices listed beneath it (partitions and other dependents).
# Prints nothing when none of them is mounted.
#   $1 - device path
device_mountpoints() {
  local line joined=""
  while IFS= read -r line; do
    [[ -n "${line//[[:space:]]/}" ]] || continue
    joined="${joined:+$joined,}$line"
  done < <(lsblk -no MOUNTPOINT "$1" 2>/dev/null || true)
  printf '%s' "$joined"
}

existing_gpfs_devices=""
if [[ "$EXCLUDE_EXISTING_NSD" == "true" ]]; then
  if command -v mmlsnsd >/dev/null 2>&1; then
    # -m maps each NSD to its local /dev device; the default mmlsnsd listing
    # contains no device paths to compare against.
    existing_gpfs_devices="$(mmlsnsd -m 2>/dev/null || true)"
  elif command -v mmlsdisk >/dev/null 2>&1; then
    # NOTE(review): mmlsdisk output is not expected to contain /dev paths, so
    # this fallback may never exclude anything - confirm before relying on it.
    existing_gpfs_devices="$(mmlsdisk all 2>/dev/null || true)"
  else
    warning "mmlsnsd and mmlsdisk are unavailable; cannot exclude existing GPFS devices"
  fi
  if [[ -z "$existing_gpfs_devices" ]]; then
    warning "No GPFS device information was obtained; existing NSD devices are NOT excluded"
  fi
fi

section "Block device inventory"
lsblk -dpno NAME,TYPE,SIZE,MODEL,SERIAL,MOUNTPOINT 2>&1 | tee -a "$LOG_FILE"

section "Candidate devices"
found=0
# NAME and TYPE never contain blanks, so these two columns are safe to split.
while read -r name type; do
  [[ "$type" == "disk" ]] || continue
  mountpoint="$(device_mountpoints "$name")"
  if [[ "$EXCLUDE_MOUNTED" == "true" ]]; then
    if [[ -n "$mountpoint" ]] || findmnt -rn --source "$name" >/dev/null 2>&1; then
      continue
    fi
  fi
  if [[ "$EXCLUDE_EXISTING_NSD" == "true" ]] && [[ -n "$existing_gpfs_devices" ]]; then
    # A here-string instead of a pipe: with pipefail, grep -q closing the pipe
    # early could make the pipeline fail and hide a real match.
    if grep -Fq -- "$name" <<<"$existing_gpfs_devices"; then
      continue
    fi
  fi
  size="$(lsblk_field SIZE "$name")"
  model="$(lsblk_field MODEL "$name")"
  serial="$(lsblk_field SERIAL "$name")"
  printf 'OK: candidate=%s size=%s model=%s serial=%s mountpoint=%s\n' \
    "$name" "${size:-unknown}" "${model:-unknown}" "${serial:-unknown}" "${mountpoint:-none}" | tee -a "$LOG_FILE"
  found=1
done < <(lsblk -dpno NAME,TYPE)

if [[ "$found" -eq 0 ]]; then
  warning "No candidate devices found with the selected filters"
fi
+76
View File
@@ -0,0 +1,76 @@
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"
# Script-local inputs; all are empty until set on the command line.
DEVICES="" SERVERS="" OUTPUT=""

# Print the one-line command synopsis for this script to stdout.
usage() {
  local prog
  prog="$(basename "$0")"
  printf 'Usage: %s --fs <filesystem> --devices "/dev/sdb /dev/sdc" --servers "node1,node2" --failure-group <number> --pool <storage_pool> --usage <dataOnly|metadataOnly|dataAndMetadata> [--output <path>]\n' "$prog"
}
# Parse the command line into the global settings FILESYSTEM, DEVICES, SERVERS,
# FAILURE_GROUP, STORAGE_POOL, USAGE and OUTPUT.
# An unknown argument, or a value option given without a value, is reported and
# exits with status 2. The explicit value check matters: with only one argument
# left, `shift 2` fails and errexit would end the script silently with status 1.
parse_args() {
  while [[ "$#" -gt 0 ]]; do
    case "$1" in
      --fs|--devices|--servers|--failure-group|--pool|--usage|--output)
        if [[ "$#" -lt 2 ]]; then
          critical "Option $1 requires a value"
          usage
          exit 2
        fi
        case "$1" in
          --fs) FILESYSTEM="$2" ;;
          --devices) DEVICES="$2" ;;
          --servers) SERVERS="$2" ;;
          --failure-group) FAILURE_GROUP="$2" ;;
          --pool) STORAGE_POOL="$2" ;;
          --usage) USAGE="$2" ;;
          --output) OUTPUT="$2" ;;
        esac
        shift 2
        ;;
      -h|--help) usage; exit 0 ;;
      *) critical "Unknown argument: $1"; usage; exit 2 ;;
    esac
  done
}
parse_args "$@"
if [[ -z "$FILESYSTEM" || -z "$DEVICES" || -z "$SERVERS" || -z "$FAILURE_GROUP" || -z "$STORAGE_POOL" || -z "$USAGE" ]]; then
  critical "Missing required input"
  usage
  exit 2
fi

if ! [[ "$FAILURE_GROUP" =~ ^-?[0-9]+$ ]]; then
  critical "--failure-group must be an integer"
  exit 2
fi

if ! usage_value_valid "$USAGE"; then
  critical "--usage must be one of: dataOnly, metadataOnly, dataAndMetadata"
  exit 2
fi

# Filesystem name reduced to characters that are valid in an NSD name.
safe_fs="$(printf '%s' "$FILESYSTEM" | tr -c '[:alnum:]_' '_')"

if [[ -z "$OUTPUT" ]]; then
  # The default file name must not contain '/' (the filesystem may be given as
  # /dev/<name>); dots and hyphens are kept so ordinary names are unchanged.
  output_fs="$(printf '%s' "$FILESYSTEM" | tr -c '[:alnum:]_.-' '_')"
  OUTPUT="/tmp/gpfs_nsd_${output_fs}_${TIMESTAMP}.stanza"
fi

# Split the device list on whitespace without glob expansion. read returns
# non-zero at end of input when no NUL delimiter is found, hence `|| true`.
device_list=()
read -r -d '' -a device_list <<<"$DEVICES" || true
if [[ "${#device_list[@]}" -eq 0 ]]; then
  critical "No devices supplied in --devices"
  exit 2
fi

# Validate every device before the stanza file is opened. Inside the redirected
# block below, log output would be written into the stanza file instead of the
# terminal, and a partial file would be left behind.
for device in "${device_list[@]}"; do
  if [[ "$device" != /dev/* ]]; then
    critical "Device must be an absolute /dev path: $device"
    exit 2
  fi
done

{
  printf '# Generated GPFS NSD stanza for filesystem %s\n' "$FILESYSTEM"
  printf '# Review with storage and cluster teams before use.\n\n'
  for device in "${device_list[@]}"; do
    device_base="$(basename "$device" | tr -c '[:alnum:]_' '_')"
    nsd_name="nsd_${safe_fs}_${device_base}"
    printf '%%nsd:\n'
    printf ' device=%s\n' "$device"
    printf ' nsd=%s\n' "$nsd_name"
    printf ' servers=%s\n' "$SERVERS"
    printf ' usage=%s\n' "$USAGE"
    printf ' failureGroup=%s\n' "$FAILURE_GROUP"
    printf ' pool=%s\n\n' "$STORAGE_POOL"
  done
} > "$OUTPUT"

ok "Generated NSD stanza: $OUTPUT"
warning "This script only writes a stanza file. It does not create NSDs or modify GPFS."
+59
View File
@@ -0,0 +1,59 @@
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"
# Print the one-line command synopsis for this script to stdout.
usage() {
  local prog
  prog="$(basename "$0")"
  printf 'Usage: %s --fs <filesystem> --stanza <stanza_file> [--execute]\n' "$prog"
}
# Parse the command line into the global settings FILESYSTEM, NSD_STANZA and
# DRY_RUN (--execute switches DRY_RUN to "false").
# An unknown argument, or a value option given without a value, is reported and
# exits with status 2. The explicit value check matters: with only one argument
# left, `shift 2` fails and errexit would end the script silently with status 1.
parse_args() {
  while [[ "$#" -gt 0 ]]; do
    case "$1" in
      --fs|--stanza)
        if [[ "$#" -lt 2 ]]; then
          critical "Option $1 requires a value"
          usage
          exit 2
        fi
        case "$1" in
          --fs) FILESYSTEM="$2" ;;
          --stanza) NSD_STANZA="$2" ;;
        esac
        shift 2
        ;;
      --execute) DRY_RUN=false; shift ;;
      -h|--help) usage; exit 0 ;;
      *) critical "Unknown argument: $1"; usage; exit 2 ;;
    esac
  done
}
parse_args "$@"
if [[ -z "$FILESYSTEM" || -z "$NSD_STANZA" ]]; then
  critical "Missing required --fs or --stanza"
  usage
  exit 2
fi

if [[ ! -r "$NSD_STANZA" ]]; then
  critical "Stanza file does not exist or is not readable: $NSD_STANZA"
  exit 2
fi

for cmd in mmlsfs mmcrnsd mmadddisk; do
  require_cmd "$cmd" || exit 2
done

if ! mmlsfs "$FILESYSTEM" >/dev/null 2>&1; then
  critical "Filesystem does not exist or cannot be queried: $FILESYSTEM"
  exit 1
fi

warning "Adding NSDs must be coordinated with storage, GPFS, application, and change-management teams."

section "Planned GPFS changes"
# Label the plan according to the mode, so an execute-mode log does not claim
# that the commands were only a dry run.
if [[ "$DRY_RUN" == "true" ]]; then
  plan_label="DRY-RUN"
else
  plan_label="PLANNED"
fi
ok "$plan_label: mmcrnsd -F $NSD_STANZA"
ok "$plan_label: mmadddisk $FILESYSTEM -F $NSD_STANZA"

confirm_execute "create NSDs and add disks to $FILESYSTEM"

if [[ "$DRY_RUN" == "false" ]]; then
  # Each step is checked explicitly so a failure is logged with its
  # consequence instead of ending the script silently through errexit.
  if ! run_cmd mmcrnsd -F "$NSD_STANZA"; then
    critical "mmcrnsd failed; mmadddisk was not attempted. Review NSD state with mmlsnsd before retrying."
    exit 1
  fi
  if ! run_cmd mmadddisk "$FILESYSTEM" -F "$NSD_STANZA"; then
    critical "mmadddisk failed after mmcrnsd succeeded; NSDs from $NSD_STANZA may exist without being part of $FILESYSTEM."
    exit 1
  fi

  section "Post-add NSD inventory"
  mmlsnsd 2>&1 | tee -a "$LOG_FILE" || warning "mmlsnsd command failed after execution"

  section "Post-add filesystem disks"
  mmlsdisk "$FILESYSTEM" 2>&1 | tee -a "$LOG_FILE" || warning "mmlsdisk command failed after execution"
fi
+56
View File
@@ -0,0 +1,56 @@
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"
# Run the restripe in the background only when --background is given.
BACKGROUND="false"

# Print the one-line command synopsis for this script to stdout.
usage() {
  local prog
  prog="$(basename "$0")"
  printf 'Usage: %s --fs <filesystem> [--execute] [--background]\n' "$prog"
}
# Parse the command line into the global settings FILESYSTEM, DRY_RUN
# (--execute switches it to "false") and BACKGROUND.
# An unknown argument, or --fs without a value, is reported and exits with
# status 2. The explicit value check matters: with only one argument left,
# `shift 2` fails and errexit would end the script silently with status 1.
parse_args() {
  while [[ "$#" -gt 0 ]]; do
    case "$1" in
      --fs)
        if [[ "$#" -lt 2 ]]; then
          critical "Option $1 requires a value"
          usage
          exit 2
        fi
        FILESYSTEM="$2"
        shift 2
        ;;
      --execute) DRY_RUN=false; shift ;;
      --background) BACKGROUND=true; shift ;;
      -h|--help) usage; exit 0 ;;
      *) critical "Unknown argument: $1"; usage; exit 2 ;;
    esac
  done
}
parse_args "$@"
if [[ -z "$FILESYSTEM" ]]; then
  critical "Missing required --fs <filesystem>"
  usage
  exit 2
fi

for cmd in mmlsdisk mmrestripefs; do
  require_cmd "$cmd" || exit 2
done

warning "Restripe/rebalance can be I/O intensive. Run only in an approved change window."

section "Current disk balance"
mmlsdisk "$FILESYSTEM" 2>&1 | tee -a "$LOG_FILE" || warning "Unable to show current disk state"

section "Planned rebalance"
planned_cmd="mmrestripefs $FILESYSTEM -b"
confirm_target="restripe for $FILESYSTEM"
if [[ "$BACKGROUND" == "true" ]]; then
  planned_cmd="$planned_cmd &"
  confirm_target="background restripe for $FILESYSTEM"
fi

# Label the plan according to the mode, so an execute-mode log does not claim
# that the command was only a dry run.
if [[ "$DRY_RUN" == "true" ]]; then
  ok "DRY-RUN: $planned_cmd"
else
  ok "PLANNED: $planned_cmd"
fi

confirm_execute "$confirm_target"

# Only an explicit DRY_RUN=false runs anything, in both modes.
if [[ "$DRY_RUN" == "false" ]]; then
  if [[ "$BACKGROUND" == "true" ]]; then
    ok "RUN: $planned_cmd"
    # The script exits while the restripe is still running, so the job must
    # not stay attached to the terminal: output goes straight to the log file,
    # stdin is detached, and nohup is used when available so that a hang-up of
    # the operator's session does not stop the restripe.
    if command -v nohup >/dev/null 2>&1; then
      nohup mmrestripefs "$FILESYSTEM" -b </dev/null >>"$LOG_FILE" 2>&1 &
    else
      mmrestripefs "$FILESYSTEM" -b </dev/null >>"$LOG_FILE" 2>&1 &
    fi
    ok "Background restripe started (PID $!). Output is appended to $LOG_FILE"
  else
    if ! run_cmd mmrestripefs "$FILESYSTEM" -b; then
      critical "mmrestripefs failed for $FILESYSTEM"
      exit 1
    fi
  fi
fi
+89
View File
@@ -0,0 +1,89 @@
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"
# Print the one-line command synopsis for this script to stdout.
usage() {
  local prog
  prog="$(basename "$0")"
  printf 'Usage: %s --fs <filesystem>\n' "$prog"
}
# Parse the command line into the global FILESYSTEM setting.
# An unknown argument, or --fs without a value, is reported and exits with
# status 2. The explicit value check matters: with only one argument left,
# `shift 2` fails and errexit would end the script silently with status 1.
parse_args() {
  while [[ "$#" -gt 0 ]]; do
    case "$1" in
      --fs)
        if [[ "$#" -lt 2 ]]; then
          critical "Option $1 requires a value"
          usage
          exit 2
        fi
        FILESYSTEM="$2"
        shift 2
        ;;
      -h|--help) usage; exit 0 ;;
      *) critical "Unknown argument: $1"; usage; exit 2 ;;
    esac
  done
}
parse_args "$@"
# A target filesystem is mandatory.
[[ -n "$FILESYSTEM" ]] || {
  critical "Missing required --fs <filesystem>"
  usage
  exit 2
}

# Set to 1 by any check that counts as an operational failure.
issues=0
# Print a section banner and run one post-change check.
#   $1    - section title / description
#   $2... - command and its arguments
# A command that is not installed is skipped with a WARNING. A command that
# fails is logged as CRITICAL and sets the global `issues` flag to 1.
run_check() {
  local description="$1"
  local -a check_cmd=("${@:2}")
  section "$description"
  if ! command -v "${check_cmd[0]}" >/dev/null 2>&1; then
    warning "${check_cmd[0]} command not available, skipping"
    return
  fi
  if ! "${check_cmd[@]}" 2>&1 | tee -a "$LOG_FILE"; then
    critical "$description failed"
    issues=1
  fi
}
# GPFS state after the change; a failing command sets the issues flag.
run_check "GPFS daemon state" mmgetstate -a
run_check "Target filesystem mount state" mmlsmount "$FILESYSTEM"
run_check "Target filesystem disks" mmlsdisk "$FILESYSTEM"
run_check "NSD inventory" mmlsnsd

section "Filesystem capacity"
if command -v df >/dev/null 2>&1; then
  # Show the df header plus every line that mentions the target filesystem or
  # a GPFS mount. The name is matched literally with index(), not as a regular
  # expression. df exits non-zero when any mount cannot be queried (for
  # example a stale network mount); under pipefail and errexit that would
  # abort the post-check, so it is reduced to a warning.
  df -h 2>&1 \
    | awk -v fs="$FILESYSTEM" 'NR == 1 || index($0, fs) || /gpfs|mmfs/' \
    | tee -a "$LOG_FILE" \
    || warning "df reported an error; the capacity listing may be incomplete"
else
  warning "df command not available, skipping"
fi
# Read `mmhealth cluster show` output on stdin and decide whether it reports a
# problem. The summary table has the columns
#   Component  Total  Failed  Degraded  Healthy  Other
# so a plain keyword search always matches the header and would fail every
# post-check. The header and separator lines are skipped; for table rows only a
# positive Failed or Degraded count counts as a problem. Any other line (for
# example an error message) is still checked for the problem keywords.
# Returns 0 when a problem is reported, 1 otherwise.
mmhealth_reports_problem() {
  awk '
    $1 == "Component" { next }
    /^[-[:space:]]*$/ { next }
    NF >= 4 && $2 ~ /^[0-9]+$/ {
      if ($3 + 0 > 0 || $4 + 0 > 0) { bad = 1 }
      next
    }
    tolower($0) ~ /degraded|failed|down|error|unhealthy/ { bad = 1 }
    END { exit bad ? 0 : 1 }
  '
}

section "Cluster health"
if command -v mmhealth >/dev/null 2>&1; then
  # A failing mmhealth must not abort the post-check; its output is still logged.
  health_output="$(mmhealth cluster show 2>&1 || true)"
  printf '%s\n' "$health_output" | tee -a "$LOG_FILE"
  if printf '%s\n' "$health_output" | mmhealth_reports_problem; then
    critical "Cluster health output indicates an issue"
    issues=1
  fi
else
  warning "mmhealth command not available, skipping"
fi
# Recent log context for the change record. Both queries are best-effort: a
# missing tool or a failing query only raises a warning.
section "Recent GPFS journal entries"
if ! command -v journalctl >/dev/null 2>&1; then
  warning "journalctl command not available, skipping"
elif ! journalctl -u 'gpfs*' -n 50 --no-pager 2>&1 | tee -a "$LOG_FILE"; then
  warning "journalctl GPFS query failed"
fi

section "Recent kernel messages"
if ! command -v dmesg >/dev/null 2>&1; then
  warning "dmesg command not available, skipping"
elif ! dmesg -T 2>/dev/null | tail -50 | tee -a "$LOG_FILE"; then
  warning "dmesg query failed"
fi

# Final verdict: exit 1 when any check recorded an operational failure.
if [[ "$issues" -ne 0 ]]; then
  critical "Post-check detected one or more issues"
  exit 1
fi
ok "Post-check completed without detected operational failures"
exit 0
+78
View File
@@ -0,0 +1,78 @@
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"
# Path of the generated report; set once the filesystem name is known.
REPORT_FILE=

# Print the one-line command synopsis for this script to stdout.
usage() {
  local prog
  prog="$(basename "$0")"
  printf 'Usage: %s --fs <filesystem>\n' "$prog"
}
# Parse the command line into the global FILESYSTEM setting.
# An unknown argument, or --fs without a value, is reported and exits with
# status 2. The explicit value check matters: with only one argument left,
# `shift 2` fails and errexit would end the script silently with status 1.
parse_args() {
  while [[ "$#" -gt 0 ]]; do
    case "$1" in
      --fs)
        if [[ "$#" -lt 2 ]]; then
          critical "Option $1 requires a value"
          usage
          exit 2
        fi
        FILESYSTEM="$2"
        shift 2
        ;;
      -h|--help) usage; exit 0 ;;
      *) critical "Unknown argument: $1"; usage; exit 2 ;;
    esac
  done
}
parse_args "$@"
if [[ -z "$FILESYSTEM" ]]; then
  critical "Missing required --fs <filesystem>"
  usage
  exit 2
fi

# The filesystem name becomes part of a file name, so anything outside
# letters, digits, '_', '.' and '-' is replaced. Without this a name given as
# /dev/<name> would point the report into a directory that does not exist.
report_fs="$(printf '%s' "$FILESYSTEM" | tr -c '[:alnum:]_.-' '_')"
REPORT_FILE="/tmp/gpfs_extend_report_${report_fs}_${TIMESTAMP}.txt"
# Append one titled section to $REPORT_FILE.
#   $1    - section title
#   $2... - command and its arguments
# The section holds the command's stdout and stderr. A command that is not
# installed, or that exits non-zero, is noted in the report as a WARNING line;
# neither case is treated as an error by this function.
append_section() {
  local title="$1"
  local -a cmd=("${@:2}")
  {
    printf '\n== %s ==\n' "$title"
    if ! command -v "${cmd[0]}" >/dev/null 2>&1; then
      printf 'WARNING: command not available: %s\n' "${cmd[0]}"
    elif ! "${cmd[@]}" 2>&1; then
      printf 'WARNING: command failed: %s\n' "${cmd[*]}"
    fi
  } >> "$REPORT_FILE"
}
# Start a fresh report with a short header; every later section is appended.
printf 'GPFS / Spectrum Scale Filesystem Expansion Report\nHostname: %s\nDate: %s\nTarget filesystem: %s\n' \
  "$(hostname 2>/dev/null || printf 'unknown')" \
  "$(date)" \
  "$FILESYSTEM" > "$REPORT_FILE"

# GPFS state sections. append_section records missing or failing commands in
# the report itself.
append_section "GPFS daemon state" mmgetstate -a
append_section "Cluster definition" mmlscluster
append_section "Managers and quorum" mmlsmgr
append_section "Target filesystem mount state" mmlsmount "$FILESYSTEM"
append_section "Target filesystem disks" mmlsdisk "$FILESYSTEM"
append_section "NSD inventory" mmlsnsd
append_section "Filesystem capacity" df -h

if ! command -v mmhealth >/dev/null 2>&1; then
  printf '\n== Cluster health ==\nWARNING: mmhealth command not available\n' >> "$REPORT_FILE"
else
  append_section "Cluster health" mmhealth cluster show
fi

# The journal section is left out entirely when journalctl is not installed.
if command -v journalctl >/dev/null 2>&1; then
  append_section "Recent GPFS journal entries" journalctl -u 'gpfs*' -n 50 --no-pager
fi

# Kernel messages need a pipeline, so the section is written here directly.
if command -v dmesg >/dev/null 2>&1; then
  {
    printf '\n== Recent kernel messages ==\n'
    if ! dmesg -T 2>/dev/null | tail -50; then
      printf 'WARNING: dmesg query failed\n'
    fi
  } >> "$REPORT_FILE"
fi

ok "Generated report: $REPORT_FILE"
+136
View File
@@ -0,0 +1,136 @@
# GPFS / IBM Spectrum Scale Filesystem Expansion Toolkit
Safe, sanitized Bash examples for planning and executing a GPFS / IBM Spectrum Scale filesystem expansion. The scripts are written as portfolio-grade operational tooling for a Linux Infrastructure Engineer: conservative defaults, clear validation, dry-run behavior, and explicit operator confirmation before changes.
These scripts are examples. Exact GPFS commands, flags, quorum practices, failure-group design, and storage naming standards vary by Spectrum Scale version and site policy.
## Concepts
- **Cluster** - the Spectrum Scale administrative domain containing the nodes, daemon configuration, quorum policy, filesystems, and NSDs.
- **Node** - a server participating in the GPFS cluster. Nodes may be clients, NSD servers, quorum nodes, manager-capable nodes, or a mix of roles.
- **Quorum** - the voting mechanism that protects the cluster from split-brain conditions. Expansion work should not proceed during quorum instability.
- **Filesystem** - the GPFS namespace and data layout presented to clients, backed by one or more NSDs.
- **NSD** - Network Shared Disk, the GPFS abstraction for a disk or LUN that is served to the cluster.
- **Failure group** - a placement hint that tells GPFS which disks share a failure domain, such as an enclosure, rack, site, controller pair, or storage array.
- **Storage pool** - a named pool of NSDs used for placement and lifecycle policy, commonly `system` plus optional data pools.
- **Restripe/rebalance** - the operation that redistributes data after disks are added. It can be I/O intensive and should run only in an approved change window.
## Required Tools
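Common GPFS / Spectrum Scale tools expected in production include the following. The precheck script additionally requires `mmlsmount` and `mmlsmgr`, and `mmlsconfig` and `mmhealth` are used when they are available: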
- `mmgetstate`
- `mmlscluster`
- `mmlsfs`
- `mmlsdisk`
- `mmlsnsd`
- `mmcrnsd`
- `mmadddisk`
- `mmrestripefs`
The toolkit also uses common Linux tools such as `df`, `lsblk`, `findmnt`, `journalctl`, and `dmesg` where available. Missing optional commands are reported as `WARNING` and skipped.
## Safety Model
- Default mode is dry-run.
- Real GPFS modifications require `--execute`.
- Destructive or high-impact steps additionally require the operator to type `EXECUTE` at an interactive prompt before anything is changed.
- Disk detection is read-only and never partitions, formats, wipes, or modifies devices.
- Device selection must always be confirmed with the storage team and cluster owners.
- The scripts do not assume production disk names.
Output uses a consistent status format:
- `OK`
- `WARNING`
- `CRITICAL`
Exit codes:
- `0` - OK
- `1` - operational validation failure
- `2` - invalid input or missing requirement
## Scripts
- `00_env.sh` - shared configuration and helper functions.
- `01_cluster_overview.sh` - read-only cluster overview.
- `02_precheck_gpfs.sh` - pre-expansion validation for a target filesystem.
- `03_detect_new_disks.sh` - read-only candidate block-device discovery.
- `04_create_nsd_stanza.sh` - generate an NSD stanza file.
- `05_add_nsd_to_filesystem.sh` - create NSDs and add disks to a filesystem, dry-run by default.
- `06_rebalance_filesystem.sh` - optional restripe/rebalance, dry-run by default.
- `07_postcheck_gpfs.sh` - post-change validation.
- `08_generate_report.sh` - text report for the change record.
- `gpfs_extend_runbook.sh` - guided order of operations plus safe read-only checks.
## Example Workflow
```bash
cd infra-run/scripts/bash/gpfs
./01_cluster_overview.sh
./02_precheck_gpfs.sh --fs gpfs01
./03_detect_new_disks.sh --exclude-mounted --exclude-existing-nsd
./04_create_nsd_stanza.sh \
--fs gpfs01 \
--devices "/dev/sdb /dev/sdc" \
--servers "gpfsnsd01,gpfsnsd02" \
--failure-group 10 \
--pool system \
--usage dataAndMetadata
```
Review the generated stanza with the storage and cluster teams. Confirm device identity, LUN masking, multipath naming, failure group placement, and site standards before continuing.
Dry-run the add step:
```bash
./05_add_nsd_to_filesystem.sh \
--fs gpfs01 \
--stanza /tmp/gpfs_nsd_gpfs01_YYYYmmdd_HHMMSS.stanza
```
Execute only in an approved change window:
```bash
./05_add_nsd_to_filesystem.sh \
--fs gpfs01 \
--stanza /tmp/gpfs_nsd_gpfs01_YYYYmmdd_HHMMSS.stanza \
--execute
```
Optional rebalance:
```bash
./06_rebalance_filesystem.sh --fs gpfs01
./06_rebalance_filesystem.sh --fs gpfs01 --execute --background
```
Post-check and report:
```bash
./07_postcheck_gpfs.sh --fs gpfs01
./08_generate_report.sh --fs gpfs01
```
Runbook helper:
```bash
./gpfs_extend_runbook.sh \
--fs gpfs01 \
--devices "/dev/sdb /dev/sdc" \
--servers "gpfsnsd01,gpfsnsd02" \
--failure-group 10 \
--pool system \
--usage dataAndMetadata
```
## Operational Notes
- Do not run these scripts blindly on production clusters.
- Confirm disk and multipath identity with the storage team before creating NSDs.
- Validate quorum and manager health before expansion.
- Confirm application I/O risk and rollback procedures before `mmadddisk` or `mmrestripefs`.
- Confirm the Spectrum Scale version and local standards for stanza fields before executing changes.
+94
View File
@@ -0,0 +1,94 @@
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"
# Script-local inputs; empty or false until set on the command line.
DEVICES="" SERVERS=""
EXECUTE="false"

# Print the one-line command synopsis for this script to stdout.
usage() {
  local prog
  prog="$(basename "$0")"
  printf 'Usage: %s --fs <filesystem> --devices "/dev/sdb /dev/sdc" --servers "node1,node2" --failure-group <number> --pool <storage_pool> --usage <dataOnly|metadataOnly|dataAndMetadata> [--execute]\n' "$prog"
}
# Parse the command line into the global settings FILESYSTEM, DEVICES, SERVERS,
# FAILURE_GROUP, STORAGE_POOL and USAGE; --execute sets EXECUTE=true and
# DRY_RUN=false.
# An unknown argument, or a value option given without a value, is reported and
# exits with status 2. The explicit value check matters: with only one argument
# left, `shift 2` fails and errexit would end the script silently with status 1.
parse_args() {
  while [[ "$#" -gt 0 ]]; do
    case "$1" in
      --fs|--devices|--servers|--failure-group|--pool|--usage)
        if [[ "$#" -lt 2 ]]; then
          critical "Option $1 requires a value"
          usage
          exit 2
        fi
        case "$1" in
          --fs) FILESYSTEM="$2" ;;
          --devices) DEVICES="$2" ;;
          --servers) SERVERS="$2" ;;
          --failure-group) FAILURE_GROUP="$2" ;;
          --pool) STORAGE_POOL="$2" ;;
          --usage) USAGE="$2" ;;
        esac
        shift 2
        ;;
      --execute) EXECUTE=true; DRY_RUN=false; shift ;;
      -h|--help) usage; exit 0 ;;
      *) critical "Unknown argument: $1"; usage; exit 2 ;;
    esac
  done
}
parse_args "$@"
# Always print the recommended order of operations first.
section "Recommended GPFS Expansion Flow"
cat <<FLOW
Step 1: Cluster overview
$SCRIPT_DIR/01_cluster_overview.sh
Step 2: GPFS precheck
$SCRIPT_DIR/02_precheck_gpfs.sh --fs <filesystem>
Step 3: Detect candidate disks
$SCRIPT_DIR/03_detect_new_disks.sh --exclude-mounted --exclude-existing-nsd
Step 4: Generate NSD stanza
$SCRIPT_DIR/04_create_nsd_stanza.sh --fs <filesystem> --devices "/dev/sdb /dev/sdc" --servers "node1,node2" --failure-group <number> --pool <storage_pool> --usage <usage>
Step 5: Create NSDs and add disks to filesystem
$SCRIPT_DIR/05_add_nsd_to_filesystem.sh --fs <filesystem> --stanza <stanza_file> [--execute]
Step 6: Optional restripe/rebalance
$SCRIPT_DIR/06_rebalance_filesystem.sh --fs <filesystem> [--execute] [--background]
Step 7: Post-check
$SCRIPT_DIR/07_postcheck_gpfs.sh --fs <filesystem>
Step 8: Generate report
$SCRIPT_DIR/08_generate_report.sh --fs <filesystem>
FLOW

# Without a target filesystem there is nothing to check.
if [[ -z "$FILESYSTEM" ]]; then
  warning "No --fs supplied. Printed runbook only."
  exit 0
fi

case "$EXECUTE" in
  true)
    warning "--execute was supplied. Destructive steps still require the individual script confirmation prompt."
    ;;
  *)
    DRY_RUN=true
    ;;
esac

# Steps 1-3 are read-only; a failure in any of them is only reported.
section "Running Safe Read-Only Steps"
"$SCRIPT_DIR/01_cluster_overview.sh" || warning "Cluster overview reported warnings or failures"
"$SCRIPT_DIR/02_precheck_gpfs.sh" --fs "$FILESYSTEM" || warning "Precheck reported warnings or failures"
"$SCRIPT_DIR/03_detect_new_disks.sh" --exclude-mounted --exclude-existing-nsd || warning "Disk detection reported warnings or failures"

# Step 4 runs only when devices, servers and failure group are all supplied;
# a partial set is reported, and none at all skips the step silently.
if [[ -n "$DEVICES" && -n "$SERVERS" && -n "$FAILURE_GROUP" ]]; then
  "$SCRIPT_DIR/04_create_nsd_stanza.sh" \
    --fs "$FILESYSTEM" \
    --devices "$DEVICES" \
    --servers "$SERVERS" \
    --failure-group "$FAILURE_GROUP" \
    --pool "$STORAGE_POOL" \
    --usage "$USAGE"
elif [[ -n "${DEVICES}${SERVERS}${FAILURE_GROUP}" ]]; then
  warning "NSD stanza generation requires --devices, --servers, --failure-group, --pool, and --usage"
fi

# The add step is never run from here; the operator is pointed to it.
section "Next Manual Step"
if [[ "$EXECUTE" != "true" ]]; then
  ok "Review outputs and generated stanza. Add disks only through 05_add_nsd_to_filesystem.sh with --execute."
else
  warning "Run 05_add_nsd_to_filesystem.sh manually with --execute after reviewing the generated stanza."
fi