# Listing metadata from the original paste: 236 lines, 6.6 KiB, YAML
---
# Validation playbook for Slurm QOS, fairshare and priority settings.
#
# Runs against the `slurm_controller` host group with privilege escalation
# and without fact gathering. The play:
#   1. prints the priority/accounting configuration, the QOS table, the
#      associations and the fairshare tree for `slurm_account_name`;
#   2. submits one job under the `debug-short` QOS and one under the
#      `gpu-short` QOS as `slurmuser`, polls `squeue` until each job is no
#      longer listed (or the poll budget is used up) and prints the `sacct`
#      record and the job output file;
#   3. submits a job asking for 30 minutes under `debug-short` and passes only
#      if the submission is rejected or the job stays pending with a
#      limit-related reason;
#   4. prints a queue / sprio / sshare / sacct snapshot and every captured
#      stdout.
#
# Required variable: `slurm_account_name` (passed to `sshare -A`).
# NOTE(review): the test jobs hard-code `--account=lab`; confirm whether they
# should use `slurm_account_name` instead.
- name: Validate Slurm QOS, fairshare and priority
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # Read-only dump of scheduler and accounting state. Because of
    # `set -euo pipefail`, a grep that matches nothing or a failing
    # `sacctmgr` call fails the task; only the final `sshare` may fail.
    - name: Validate priority runtime config
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### priority config"
        scontrol show config | grep -E "PriorityType|PriorityWeight|PriorityDecay|PriorityCalc|PriorityMaxAge|PriorityFavor"

        echo
        echo "### accounting enforcement"
        scontrol show config | grep -E "AccountingStorageType|AccountingStorageEnforce|AccountingStorageTRES"

        echo
        echo "### QOS"
        sacctmgr show qos format=Name%20,Priority,MaxWall,MaxTRESPU%50,MaxJobsPU

        echo
        echo "### associations"
        sacctmgr show assoc format=Cluster,Account,User,Share,QOS%80,DefaultQOS,Fairshare

        echo
        echo "### fairshare"
        sshare -A {{ slurm_account_name | quote }} || true
      args:
        executable: /bin/bash
      register: priority_state
      changed_when: false

    # Submits a 1-CPU, 256M, 2-minute job to the `debug` partition under the
    # `debug-short` QOS as `slurmuser`, then polls `squeue` once per second
    # for up to 90 iterations. The task fails if `sbatch`, `sacct` or reading
    # the output file under /shared fails.
    - name: Submit debug-short QOS job
      ansible.builtin.shell: |
        set -euo pipefail

        submit_out="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=qos-debug-test
        #SBATCH --partition=debug
        #SBATCH --qos=debug-short
        #SBATCH --account=lab
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/qos-debug-test-%j.out

        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "QOS=${SLURM_JOB_QOS:-}"
        echo "ACCOUNT=${SLURM_JOB_ACCOUNT:-}"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        date
        SBATCH
        )"

        # `--parsable` prints "jobid" or "jobid;cluster"; keep only the
        # numeric id so it is safe to use in commands and in the file name.
        job_id="$(printf '%s\n' "$submit_out" | sed -nE 's/^([0-9]+)(;.*)?$/\1/p' | tail -n 1)"
        if [ -z "$job_id" ]; then
          echo "Could not parse a job id from sbatch output: $submit_out" >&2
          exit 1
        fi

        echo "JOB_ID=$job_id"

        # Poll while squeue still lists the job; stop as soon as it does not.
        for _ in $(seq 1 90); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done

        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

        echo "### output"
        cat "/shared/qos-debug-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: debug_qos_job
      changed_when: true

    # Same flow as the debug job, but on the `gpu` partition under the
    # `gpu-short` QOS with one GPU, 2 CPUs, 1G and a 3-minute limit. The job
    # also prints the GPU-related environment and runs `nvidia-smi`. The poll
    # budget is 120 iterations.
    - name: Submit gpu-short QOS job
      ansible.builtin.shell: |
        set -euo pipefail

        submit_out="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=qos-gpu-test
        #SBATCH --partition=gpu
        #SBATCH --qos=gpu-short
        #SBATCH --account=lab
        #SBATCH --gres=gpu:1
        #SBATCH --cpus-per-task=2
        #SBATCH --mem=1G
        #SBATCH --time=00:03:00
        #SBATCH --output=/shared/qos-gpu-test-%j.out

        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "QOS=${SLURM_JOB_QOS:-}"
        echo "ACCOUNT=${SLURM_JOB_ACCOUNT:-}"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
        echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        echo
        nvidia-smi
        SBATCH
        )"

        # `--parsable` prints "jobid" or "jobid;cluster"; keep only the
        # numeric id so it is safe to use in commands and in the file name.
        job_id="$(printf '%s\n' "$submit_out" | sed -nE 's/^([0-9]+)(;.*)?$/\1/p' | tail -n 1)"
        if [ -z "$job_id" ]; then
          echo "Could not parse a job id from sbatch output: $submit_out" >&2
          exit 1
        fi

        echo "JOB_ID=$job_id"

        # Poll while squeue still lists the job; stop as soon as it does not.
        for _ in $(seq 1 120); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done

        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

        echo "### output"
        cat "/shared/qos-gpu-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: gpu_qos_job
      changed_when: true

    # Negative test: requests 30 minutes under `debug-short`.
    #   - sbatch exits non-zero          -> pass (rejected at submit time)
    #   - job is PENDING/CONFIGURING with a reason matching
    #     qos|limit|time|max|assoc       -> pass, job is cancelled
    #   - anything else                  -> job is cancelled, task fails
    # NOTE(review): reported as unchanged even though a job may be submitted
    # and cancelled; confirm that is intended.
    - name: Validate debug-short walltime limit behavior
      ansible.builtin.shell: |
        set -euo pipefail

        # Errexit is suspended so a rejected submission can be inspected
        # instead of aborting the script.
        set +e
        output="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH' 2>&1
        #!/bin/bash
        #SBATCH --job-name=qos-limit-fail
        #SBATCH --partition=debug
        #SBATCH --qos=debug-short
        #SBATCH --account=lab
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:30:00
        #SBATCH --output=/shared/qos-limit-fail-%j.out

        sleep 10
        SBATCH
        )"
        rc=$?
        set -e

        echo "RC=$rc"
        echo "$output"

        if [ "$rc" -ne 0 ]; then
          echo "Limit rejection test passed at submit time"
          exit 0
        fi

        # stderr is merged into $output and `--parsable` may append
        # ";cluster", so pick the last line that is a bare job id.
        job_id="$(printf '%s\n' "$output" | sed -nE 's/^([0-9]+)(;.*)?$/\1/p' | tail -n 1)"
        if [ -z "$job_id" ]; then
          echo "Could not parse a job id from sbatch output" >&2
          exit 1
        fi
        echo "Submitted job despite expected limit check: $job_id"

        # Short pause before reading the job's state and pending reason.
        sleep 3

        echo "### squeue"
        squeue -j "$job_id" -o "%.18i %.9P %.20j %.8u %.2t %.10M %.6D %R" || true

        echo
        echo "### job detail"
        scontrol show job "$job_id" || true

        state="$(squeue -h -j "$job_id" -o "%T" || true)"
        reason="$(squeue -h -j "$job_id" -o "%R" || true)"

        echo "STATE=$state"
        echo "REASON=$reason"

        if echo "$state" | grep -qE "PENDING|CONFIGURING"; then
          if echo "$reason" | grep -qiE "qos|limit|time|max|assoc"; then
            echo "Limit enforcement test passed via pending reason"
            scancel "$job_id" || true
            exit 0
          fi
        fi

        echo "Job was accepted without an obvious QOS/limit pending reason"
        scancel "$job_id" || true
        exit 1
      args:
        executable: /bin/bash
      register: limit_rejection
      changed_when: false

    # Best-effort snapshot: squeue, sprio and sshare may fail without failing
    # the task; the trailing `sacct | tail` pipeline is not guarded.
    - name: Show priority and fairshare snapshot
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### queue"
        squeue || true

        echo
        echo "### sprio"
        sprio || true

        echo
        echo "### sshare"
        sshare -A {{ slurm_account_name | quote }} || true

        echo
        echo "### recent sacct"
        sacct -S today --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList | tail -40
      args:
        executable: /bin/bash
      register: priority_snapshot
      changed_when: false

    # Prints the stdout captured by every preceding task, each under a
    # heading.
    - name: Print validation result
      ansible.builtin.debug:
        msg:
          - "### priority state"
          - "{{ priority_state.stdout_lines }}"
          - "### debug QOS job"
          - "{{ debug_qos_job.stdout_lines }}"
          - "### GPU QOS job"
          - "{{ gpu_qos_job.stdout_lines }}"
          - "### limit rejection"
          - "{{ limit_rejection.stdout_lines }}"
          - "### priority snapshot"
          - "{{ priority_snapshot.stdout_lines }}"