---
# Post-upgrade validation playbook for a Slurm cluster.
#
# Plays, in order:
#   1. Controller: print slurmctld reachability, node/partition/queue state,
#      selected configuration keys and today's accounting records.
#   2. Workers (slurm_compute and slurm_gpu groups): check munge/slurmd,
#      a local munge round-trip, controller reachability, config checksums
#      and, when nvidia-smi is installed, the GPU inventory.
#   3. Controller: submit a small CPU job to the "debug" partition and fail
#      unless it finishes in state COMPLETED.
#   4. Controller: submit a single-GPU job to the "gpu" partition and fail
#      unless it finishes in state COMPLETED.
#
# Every shell task runs under bash with `set -euo pipefail`, so any failing
# command fails the task.

- name: Validate cluster after OS rolling upgrade
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # Read-only report, hence changed_when: false.
    - name: Validate Slurm controller and cluster state
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### slurmctld ping"
        scontrol ping
        echo

        echo "### nodes"
        sinfo -N
        echo

        echo "### partitions"
        sinfo
        echo

        echo "### queue"
        squeue
        echo

        echo "### important config"
        # With pipefail, the task fails if none of these keys are present.
        scontrol show config |
          grep -E "AccountingStorage|JobAcctGather|TaskPlugin|ProctrackType|SelectType|ClusterName"
        echo

        echo "### accounting recent jobs"
        sacct -S today \
          --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList |
          tail -30
      args:
        executable: /bin/bash
      register: cluster_state
      changed_when: false

    - name: Print cluster state
      ansible.builtin.debug:
        var: cluster_state.stdout_lines

- name: Validate worker services after OS rolling upgrade
  hosts: slurm_compute:slurm_gpu
  become: true
  gather_facts: true
  tasks:
    # Read-only report, hence changed_when: false.
    - name: Validate local worker services and Slurm connectivity
      ansible.builtin.shell: |
        set -euo pipefail

        echo "HOST=$(hostname)"
        echo "FQDN=$(hostname -f 2>/dev/null || hostname)"
        echo "KERNEL=$(uname -r)"
        echo "UPTIME=$(uptime -p)"
        echo

        echo "### services"
        # `systemctl is-active` exits non-zero for an inactive unit, which
        # fails the task because of `set -e`.
        systemctl is-active munge
        systemctl is-active slurmd
        echo

        echo "### munge local test"
        munge -n | unmunge >/dev/null
        echo "munge OK"
        echo

        echo "### controller ping"
        scontrol ping
        echo

        echo "### local slurm.conf checksum"
        # Best effort: a missing file must not fail the validation.
        sha256sum /etc/slurm/slurm.conf /etc/slurm/cgroup.conf 2>/dev/null || true
        echo

        echo "### gpu check if present"
        if command -v nvidia-smi >/dev/null 2>&1; then
          # Best effort: a failing query is reported but does not fail the task.
          nvidia-smi --query-gpu=index,name,driver_version,memory.total --format=csv,noheader || true
        else
          echo "NO_NVIDIA_SMI"
        fi
      args:
        executable: /bin/bash
      register: worker_state
      changed_when: false

    - name: Print worker state
      ansible.builtin.debug:
        var: worker_state.stdout_lines

- name: Submit post-upgrade CPU validation job
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # Submits a real job (and writes an output file under /shared), so the
    # task always reports "changed".
    - name: Submit CPU validation job to debug partition
      ansible.builtin.shell: |
        set -euo pipefail

        wait_seconds=90

        # The job script is fed to sbatch on stdin; the quoted heredoc
        # delimiter keeps $(...) and $VARS unexpanded until the job runs.
        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=os-upgrade-cpu-test
        #SBATCH --partition=debug
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/os-upgrade-cpu-test-%j.out

        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        echo "KERNEL=$(uname -r)"
        date
        SBATCH
        )"
        # `sbatch --parsable` may print "jobid;cluster"; keep only the job id.
        job_id="${job_id%%;*}"
        echo "JOB_ID=$job_id"

        # Wait until the job leaves the queue. The squeue output is captured
        # rather than piped into `grep -q`, so pipefail cannot misread an
        # early grep exit as "job gone".
        finished=false
        for _ in $(seq 1 "$wait_seconds"); do
          if [[ -z "$(squeue -h -j "$job_id" || true)" ]]; then
            finished=true
            break
          fi
          squeue -j "$job_id" || true
          sleep 1
        done

        if [[ "$finished" != true ]]; then
          echo "ERROR: job $job_id still queued or running after ${wait_seconds}s; cancelling" >&2
          scancel "$job_id" || true
          exit 1
        fi

        # Accounting can trail the controller briefly; retry until sacct
        # reports a final state.
        job_state=""
        for _ in $(seq 1 10); do
          job_state="$(sacct -X -n -P -j "$job_id" --format=State)"
          case "$job_state" in
            "" | PENDING | RUNNING | COMPLETING) sleep 1 ;;
            *) break ;;
          esac
        done

        echo "### sacct"
        sacct -j "$job_id" \
          --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

        echo "### output"
        cat "/shared/os-upgrade-cpu-test-${job_id}.out"

        # The validation only passes when the job actually succeeded.
        if [[ "$job_state" != "COMPLETED" ]]; then
          echo "ERROR: job $job_id ended in state '${job_state:-unknown}', expected COMPLETED" >&2
          exit 1
        fi
      args:
        executable: /bin/bash
      register: cpu_validation_job
      changed_when: true

    - name: Print CPU validation job
      ansible.builtin.debug:
        var: cpu_validation_job.stdout_lines

- name: Submit post-upgrade GPU validation job
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # Submits a real job (and writes an output file under /shared), so the
    # task always reports "changed".
    - name: Submit GPU validation job to gpu partition
      ansible.builtin.shell: |
        set -euo pipefail

        wait_seconds=120

        # The job script is fed to sbatch on stdin; the quoted heredoc
        # delimiter keeps $(...) and $VARS unexpanded until the job runs.
        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=os-upgrade-gpu-test
        #SBATCH --partition=gpu
        #SBATCH --gres=gpu:1
        #SBATCH --cpus-per-task=2
        #SBATCH --mem=1G
        #SBATCH --time=00:03:00
        #SBATCH --output=/shared/os-upgrade-gpu-test-%j.out

        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
        echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        echo "KERNEL=$(uname -r)"
        echo
        nvidia-smi
        SBATCH
        )"
        # `sbatch --parsable` may print "jobid;cluster"; keep only the job id.
        job_id="${job_id%%;*}"
        echo "JOB_ID=$job_id"

        # Wait until the job leaves the queue. The squeue output is captured
        # rather than piped into `grep -q`, so pipefail cannot misread an
        # early grep exit as "job gone".
        finished=false
        for _ in $(seq 1 "$wait_seconds"); do
          if [[ -z "$(squeue -h -j "$job_id" || true)" ]]; then
            finished=true
            break
          fi
          squeue -j "$job_id" || true
          sleep 1
        done

        if [[ "$finished" != true ]]; then
          echo "ERROR: job $job_id still queued or running after ${wait_seconds}s; cancelling" >&2
          scancel "$job_id" || true
          exit 1
        fi

        # Accounting can trail the controller briefly; retry until sacct
        # reports a final state.
        job_state=""
        for _ in $(seq 1 10); do
          job_state="$(sacct -X -n -P -j "$job_id" --format=State)"
          case "$job_state" in
            "" | PENDING | RUNNING | COMPLETING) sleep 1 ;;
            *) break ;;
          esac
        done

        echo "### sacct"
        sacct -j "$job_id" \
          --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

        echo "### output"
        cat "/shared/os-upgrade-gpu-test-${job_id}.out"

        # The validation only passes when the job actually succeeded.
        if [[ "$job_state" != "COMPLETED" ]]; then
          echo "ERROR: job $job_id ended in state '${job_state:-unknown}', expected COMPLETED" >&2
          exit 1
        fi
      args:
        executable: /bin/bash
      register: gpu_validation_job
      changed_when: true

    - name: Print GPU validation job
      ansible.builtin.debug:
        var: gpu_validation_job.stdout_lines