---
# Query-only health report collected on the Slurm controller: controller ping,
# node/partition/queue listings, selected config keys and today's accounting.
- name: Validate cluster after OS rolling upgrade
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    # `set -euo pipefail` makes the first failing command fail the task; that
    # includes the grep below exiting non-zero when no config key matches.
    - name: Validate Slurm controller and cluster state
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          echo "### slurmctld ping"
          scontrol ping

          echo
          echo "### nodes"
          sinfo -N

          echo
          echo "### partitions"
          sinfo

          echo
          echo "### queue"
          squeue

          echo
          echo "### important config"
          scontrol show config | grep -E "AccountingStorage|JobAcctGather|TaskPlugin|ProctrackType|SelectType|ClusterName"

          echo
          echo "### accounting recent jobs"
          sacct -S today --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList | tail -30
      # The script only reads state, so never report a change.
      changed_when: false
      register: cluster_state

    - name: Print cluster state
      ansible.builtin.debug:
        var: cluster_state.stdout_lines

# Per-worker checks on every host in slurm_compute or slurm_gpu: identity and
# kernel, munge/slurmd service state, a local munge round trip, controller
# reachability, config checksums and an optional GPU listing.
- name: Validate worker services after OS rolling upgrade
  hosts: slurm_compute:slurm_gpu
  gather_facts: true
  become: true

  tasks:
    # Under `set -euo pipefail` an inactive service, a failed munge round trip
    # or a failed controller ping fails the task. The checksum and nvidia-smi
    # steps end in `|| true`, so they are informational only.
    - name: Validate local worker services and Slurm connectivity
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          echo "HOST=$(hostname)"
          echo "FQDN=$(hostname -f 2>/dev/null || hostname)"
          echo "KERNEL=$(uname -r)"
          echo "UPTIME=$(uptime -p)"

          echo
          echo "### services"
          systemctl is-active munge
          systemctl is-active slurmd

          echo
          echo "### munge local test"
          munge -n | unmunge >/dev/null
          echo "munge OK"

          echo
          echo "### controller ping"
          scontrol ping

          echo
          echo "### local slurm.conf checksum"
          sha256sum /etc/slurm/slurm.conf /etc/slurm/cgroup.conf 2>/dev/null || true

          echo
          echo "### gpu check if present"
          if command -v nvidia-smi >/dev/null 2>&1; then
            nvidia-smi --query-gpu=index,name,driver_version,memory.total --format=csv,noheader || true
          else
            echo "NO_NVIDIA_SMI"
          fi
      # The script only inspects the host, so never report a change.
      changed_when: false
      register: worker_state

    - name: Print worker state
      ansible.builtin.debug:
        var: worker_state.stdout_lines

# Submits a one-CPU batch job to the debug partition as slurmuser, waits for it
# to leave the queue, and fails unless accounting reports it as COMPLETED.
- name: Submit post-upgrade CPU validation job
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Submit CPU validation job to debug partition
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          # The batch script goes to sbatch on stdin. The quoted heredoc
          # delimiter keeps $(...) and $VARS unexpanded until the job runs.
          job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
          #!/bin/bash
          #SBATCH --job-name=os-upgrade-cpu-test
          #SBATCH --partition=debug
          #SBATCH --cpus-per-task=1
          #SBATCH --mem=256M
          #SBATCH --time=00:02:00
          #SBATCH --output=/shared/os-upgrade-cpu-test-%j.out

          echo "HOST=$(hostname)"
          echo "USER=$(whoami)"
          echo "SLURM_JOB_ID=$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
          echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
          echo "KERNEL=$(uname -r)"
          date
          SBATCH
          )"

          # --parsable prints "jobid" or "jobid;cluster"; keep only the id so
          # the later -j arguments and the output path stay valid.
          job_id="${job_id%%;*}"
          out_file="/shared/os-upgrade-cpu-test-${job_id}.out"

          echo "JOB_ID=$job_id"

          # Wait up to 90 polls, one second apart, while squeue still lists
          # the job. The output is captured once per poll instead of piping
          # into `grep -q`, which can trip pipefail by closing the pipe early,
          # and instead of calling squeue a second time, which could abort the
          # script if the job vanished in between.
          for _ in $(seq 1 90); do
            queue_entry="$(squeue -h -j "$job_id" || true)"
            [[ -n "$queue_entry" ]] || break
            echo "$queue_entry"
            sleep 1
          done

          # Accounting may trail the controller briefly, so re-read the state
          # for a few seconds before judging it.
          job_state=""
          for _ in $(seq 1 10); do
            job_state="$(sacct -X -n -P -j "$job_id" --format=State || true)"
            job_state="${job_state%%$'\n'*}"
            case "$job_state" in
              ""|PENDING|RUNNING|COMPLETING) sleep 1 ;;
              *) break ;;
            esac
          done

          echo "### sacct"
          sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

          # A job that failed, timed out, was cancelled or never ran must fail
          # the validation even if an output file happens to exist.
          if [[ "$job_state" != "COMPLETED" ]]; then
            echo "ERROR: job $job_id is in state '${job_state:-unknown}', expected COMPLETED" >&2
            if [[ -r "$out_file" ]]; then
              echo "### output"
              cat "$out_file"
            fi
            exit 1
          fi

          echo "### output"
          cat "$out_file"
      register: cpu_validation_job
      # Submitting a job always alters cluster state.
      changed_when: true

    - name: Print CPU validation job
      ansible.builtin.debug:
        var: cpu_validation_job.stdout_lines

# Submits a one-GPU batch job to the gpu partition as slurmuser, waits for it
# to leave the queue, and fails unless accounting reports it as COMPLETED.
- name: Submit post-upgrade GPU validation job
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Submit GPU validation job to gpu partition
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          # The batch script goes to sbatch on stdin. The quoted heredoc
          # delimiter keeps $(...) and $VARS unexpanded until the job runs.
          # nvidia-smi is the script's last command, so its exit status
          # becomes the job's exit status.
          job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
          #!/bin/bash
          #SBATCH --job-name=os-upgrade-gpu-test
          #SBATCH --partition=gpu
          #SBATCH --gres=gpu:1
          #SBATCH --cpus-per-task=2
          #SBATCH --mem=1G
          #SBATCH --time=00:03:00
          #SBATCH --output=/shared/os-upgrade-gpu-test-%j.out

          echo "HOST=$(hostname)"
          echo "USER=$(whoami)"
          echo "SLURM_JOB_ID=$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
          echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
          echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
          echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
          echo "KERNEL=$(uname -r)"
          echo
          nvidia-smi
          SBATCH
          )"

          # --parsable prints "jobid" or "jobid;cluster"; keep only the id so
          # the later -j arguments and the output path stay valid.
          job_id="${job_id%%;*}"
          out_file="/shared/os-upgrade-gpu-test-${job_id}.out"

          echo "JOB_ID=$job_id"

          # Wait up to 120 polls, one second apart, while squeue still lists
          # the job. The output is captured once per poll instead of piping
          # into `grep -q`, which can trip pipefail by closing the pipe early,
          # and instead of calling squeue a second time, which could abort the
          # script if the job vanished in between.
          for _ in $(seq 1 120); do
            queue_entry="$(squeue -h -j "$job_id" || true)"
            [[ -n "$queue_entry" ]] || break
            echo "$queue_entry"
            sleep 1
          done

          # Accounting may trail the controller briefly, so re-read the state
          # for a few seconds before judging it.
          job_state=""
          for _ in $(seq 1 10); do
            job_state="$(sacct -X -n -P -j "$job_id" --format=State || true)"
            job_state="${job_state%%$'\n'*}"
            case "$job_state" in
              ""|PENDING|RUNNING|COMPLETING) sleep 1 ;;
              *) break ;;
            esac
          done

          echo "### sacct"
          sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

          # A failing nvidia-smi leaves the job FAILED while its output file
          # still exists, so the state, not the file, decides the result.
          if [[ "$job_state" != "COMPLETED" ]]; then
            echo "ERROR: job $job_id is in state '${job_state:-unknown}', expected COMPLETED" >&2
            if [[ -r "$out_file" ]]; then
              echo "### output"
              cat "$out_file"
            fi
            exit 1
          fi

          echo "### output"
          cat "$out_file"
      register: gpu_validation_job
      # Submitting a job always alters cluster state.
      changed_when: true

    - name: Print GPU validation job
      ansible.builtin.debug:
        var: gpu_validation_job.stdout_lines