---
# Validation playbook for Slurm QOS, fairshare and job priority.
#
# Runs on the `slurm_controller` group and:
#   1. dumps the priority/accounting configuration, QOS and associations,
#   2. submits one short job under the `debug-short` QOS and one under the
#      `gpu-short` QOS, waits for each to leave the queue, and prints the
#      accounting record and the job output file,
#   3. submits a job asking for 30 minutes under `debug-short` and expects it
#      to be rejected at submit time or to be held pending with a
#      limit-related reason,
#   4. prints a queue / sprio / sshare / sacct snapshot.
#
# Required variable: `slurm_account_name` (account passed to `sshare -A`).
#
# NOTE(review): the test jobs hard-code `--account=lab` and submit as
# `slurmuser`, while the sshare calls use `slurm_account_name`. Confirm these
# are meant to refer to the same account.
#
# Layout note: inside the shell block scalars the heredoc bodies and their
# `SBATCH` terminators sit at the block's base indentation on purpose. YAML
# strips that common indent, so in the resulting script `#!/bin/bash` and the
# terminator start at column 0, which `<<'SBATCH'` requires.
- name: Validate Slurm QOS, fairshare and priority
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # Read-only dump of the scheduler configuration. The grep pipelines run
    # under `pipefail`, so the task fails if none of the expected keys are
    # present; only the final sshare call is best-effort.
    - name: Validate priority runtime config
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### priority config"
        scontrol show config \
          | grep -E "PriorityType|PriorityWeight|PriorityDecay|PriorityCalc|PriorityMaxAge|PriorityFavor"
        echo
        echo "### accounting enforcement"
        scontrol show config \
          | grep -E "AccountingStorageType|AccountingStorageEnforce|AccountingStorageTRES"
        echo
        echo "### QOS"
        sacctmgr show qos format=Name%20,Priority,MaxWall,MaxTRESPU%50,MaxJobsPU
        echo
        echo "### associations"
        sacctmgr show assoc format=Cluster,Account,User,Share,QOS%80,DefaultQOS,Fairshare
        echo
        echo "### fairshare"
        sshare -A {{ slurm_account_name | quote }} || true
      args:
        executable: /bin/bash
      register: priority_state
      changed_when: false

    # Submits a trivial job under the debug-short QOS and polls the queue for
    # up to 90 one-second intervals. Reports changed because a job is
    # actually submitted.
    - name: Submit debug-short QOS job
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=qos-debug-test
        #SBATCH --partition=debug
        #SBATCH --qos=debug-short
        #SBATCH --account=lab
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/qos-debug-test-%j.out
        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "QOS=${SLURM_JOB_QOS:-}"
        echo "ACCOUNT=${SLURM_JOB_ACCOUNT:-}"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        date
        SBATCH
        )"
        # --parsable may print "jobid;cluster"; keep only the numeric id so
        # squeue, sacct and the output path below receive a valid value.
        job_id="${job_id%%;*}"
        echo "JOB_ID=$job_id"
        for _ in $(seq 1 90); do
          # Empty listing (or a squeue error) means the job left the queue.
          squeue -h -j "$job_id" | grep -q . || break
          squeue -j "$job_id" || true
          sleep 1
        done
        # Still queued after the polling window: cancel the test job and fail
        # explicitly instead of tripping over a missing output file below.
        if squeue -h -j "$job_id" | grep -q .; then
          echo "Timed out waiting for job $job_id to leave the queue" >&2
          scancel "$job_id" || true
          exit 1
        fi
        echo "### sacct"
        sacct -j "$job_id" \
          --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
        echo "### output"
        cat "/shared/qos-debug-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: debug_qos_job
      changed_when: true

    # Same flow as the debug job, but on the gpu partition with one GPU and a
    # 120-interval polling window. The job script ends with nvidia-smi.
    - name: Submit gpu-short QOS job
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=qos-gpu-test
        #SBATCH --partition=gpu
        #SBATCH --qos=gpu-short
        #SBATCH --account=lab
        #SBATCH --gres=gpu:1
        #SBATCH --cpus-per-task=2
        #SBATCH --mem=1G
        #SBATCH --time=00:03:00
        #SBATCH --output=/shared/qos-gpu-test-%j.out
        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "QOS=${SLURM_JOB_QOS:-}"
        echo "ACCOUNT=${SLURM_JOB_ACCOUNT:-}"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
        echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        echo
        nvidia-smi
        SBATCH
        )"
        # --parsable may print "jobid;cluster"; keep only the numeric id so
        # squeue, sacct and the output path below receive a valid value.
        job_id="${job_id%%;*}"
        echo "JOB_ID=$job_id"
        for _ in $(seq 1 120); do
          # Empty listing (or a squeue error) means the job left the queue.
          squeue -h -j "$job_id" | grep -q . || break
          squeue -j "$job_id" || true
          sleep 1
        done
        # Still queued after the polling window: cancel the test job and fail
        # explicitly instead of tripping over a missing output file below.
        if squeue -h -j "$job_id" | grep -q .; then
          echo "Timed out waiting for job $job_id to leave the queue" >&2
          scancel "$job_id" || true
          exit 1
        fi
        echo "### sacct"
        sacct -j "$job_id" \
          --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
        echo "### output"
        cat "/shared/qos-gpu-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: gpu_qos_job
      changed_when: true

    # Negative test: requests 30 minutes under debug-short and passes when
    # either (a) sbatch refuses the job with a limit/policy message, or
    # (b) the job is accepted but sits PENDING/CONFIGURING with a
    # limit-related reason. Any accepted job is cancelled before exiting, so
    # the task reports no change.
    - name: Validate debug-short walltime limit behavior
      ansible.builtin.shell: |
        set -euo pipefail
        # Submission is expected to fail, so capture its status by hand.
        set +e
        output="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH' 2>&1
        #!/bin/bash
        #SBATCH --job-name=qos-limit-fail
        #SBATCH --partition=debug
        #SBATCH --qos=debug-short
        #SBATCH --account=lab
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:30:00
        #SBATCH --output=/shared/qos-limit-fail-%j.out
        sleep 10
        SBATCH
        )"
        rc=$?
        set -e
        echo "RC=$rc"
        echo "$output"
        if [ "$rc" -ne 0 ]; then
          # Only count the failure as a pass when the message is about a
          # limit or policy; an unrelated failure (sudo, bad account,
          # controller unreachable) must not look like successful enforcement.
          if echo "$output" | grep -qiE "limit|policy"; then
            echo "Limit rejection test passed at submit time"
            exit 0
          fi
          echo "Submission failed for a reason unrelated to QOS limits" >&2
          exit 1
        fi
        # stderr was merged into $output, so warnings may precede the id:
        # take the last line and drop any ";cluster" suffix.
        job_id="$(printf '%s\n' "$output" | tail -n 1)"
        job_id="${job_id%%;*}"
        if ! [[ "$job_id" =~ ^[0-9]+$ ]]; then
          echo "Could not parse a job id from sbatch output" >&2
          exit 1
        fi
        echo "Submitted job despite expected limit check: $job_id"
        # Give the scheduler a moment to assign a pending reason.
        sleep 3
        echo "### squeue"
        squeue -j "$job_id" -o "%.18i %.9P %.20j %.8u %.2t %.10M %.6D %R" || true
        echo
        echo "### job detail"
        scontrol show job "$job_id" || true
        state="$(squeue -h -j "$job_id" -o "%T" || true)"
        reason="$(squeue -h -j "$job_id" -o "%R" || true)"
        echo "STATE=$state"
        echo "REASON=$reason"
        if echo "$state" | grep -qE "PENDING|CONFIGURING"; then
          if echo "$reason" | grep -qiE "qos|limit|time|max|assoc"; then
            echo "Limit enforcement test passed via pending reason"
            scancel "$job_id" || true
            exit 0
          fi
        fi
        echo "Job was accepted without an obvious QOS/limit pending reason"
        scancel "$job_id" || true
        exit 1
      args:
        executable: /bin/bash
      register: limit_rejection
      changed_when: false

    # Read-only snapshot; every command except the final sacct pipeline is
    # best-effort.
    - name: Show priority and fairshare snapshot
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### queue"
        squeue || true
        echo
        echo "### sprio"
        sprio || true
        echo
        echo "### sshare"
        sshare -A {{ slurm_account_name | quote }} || true
        echo
        echo "### recent sacct"
        sacct -S today \
          --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList \
          | tail -40
      args:
        executable: /bin/bash
      register: priority_snapshot
      changed_when: false

    # Prints the captured stdout of every step under a labelled heading.
    - name: Print validation result
      ansible.builtin.debug:
        msg:
          - "### priority state"
          - "{{ priority_state.stdout_lines }}"
          - "### debug QOS job"
          - "{{ debug_qos_job.stdout_lines }}"
          - "### GPU QOS job"
          - "{{ gpu_qos_job.stdout_lines }}"
          - "### limit rejection"
          - "{{ limit_rejection.stdout_lines }}"
          - "### priority snapshot"
          - "{{ priority_snapshot.stdout_lines }}"