Add Slurm AI/HPC cluster platform project
lint / shell-yaml-ansible (push) Failing after 47s

This commit is contained in:
Mateusz Suski
2026-06-04 19:41:05 +00:00
parent e2624a7533
commit d300d490f5
49 changed files with 4777 additions and 0 deletions
@@ -0,0 +1,235 @@
---
# Validation playbook for the Slurm priority / QOS / fairshare configuration.
#
# Runs on the controller and, in order:
#   1. dumps the priority, accounting, QOS, association and fairshare state;
#   2. submits a short CPU job under the debug-short QOS and checks it finishes;
#   3. submits a short GPU job under the gpu-short QOS and checks it finishes;
#   4. submits a job asking for 30 minutes under debug-short and expects it to
#      be rejected or held (presumably the QOS MaxWall is lower - confirm
#      against the QOS definition);
#   5. prints a queue / sprio / sshare / sacct snapshot and all captured output.
#
# Requires the variable `slurm_account_name`.
# NOTE(review): the batch scripts hard-code `--account=lab` while the sshare
# calls use `slurm_account_name`; confirm both refer to the same account.
- name: Validate Slurm QOS, fairshare and priority
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # Read-only report. Under `set -euo pipefail` each grep fails the task
    # when none of the expected config keys are present; sshare is best-effort.
    - name: Validate priority runtime config
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### priority config"
        scontrol show config \
          | grep -E "PriorityType|PriorityWeight|PriorityDecay|PriorityCalc|PriorityMaxAge|PriorityFavor"
        echo
        echo "### accounting enforcement"
        scontrol show config \
          | grep -E "AccountingStorageType|AccountingStorageEnforce|AccountingStorageTRES"
        echo
        echo "### QOS"
        sacctmgr show qos format=Name%20,Priority,MaxWall,MaxTRESPU%50,MaxJobsPU
        echo
        echo "### associations"
        sacctmgr show assoc format=Cluster,Account,User,Share,QOS%80,DefaultQOS,Fairshare
        echo
        echo "### fairshare"
        sshare -A {{ slurm_account_name | quote }} || true
      args:
        executable: /bin/bash
      register: priority_state
      changed_when: false

    # Submits a 1-CPU job as slurmuser, waits for it to reach a terminal
    # state, prints its accounting record and output file, and fails unless
    # accounting reports COMPLETED.
    - name: Submit debug-short QOS job
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=qos-debug-test
        #SBATCH --partition=debug
        #SBATCH --qos=debug-short
        #SBATCH --account=lab
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/qos-debug-test-%j.out
        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "QOS=${SLURM_JOB_QOS:-}"
        echo "ACCOUNT=${SLURM_JOB_ACCOUNT:-}"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        date
        SBATCH
        )"
        # --parsable may print "jobid;cluster"; keep only the job id.
        job_id="${job_id%%;*}"
        echo "JOB_ID=$job_id"
        # Poll on job state rather than on presence in squeue: a finished job
        # may still be listed by `squeue -j` until slurmctld purges it.
        finished=false
        for _ in $(seq 1 90); do
          state="$(squeue -h -j "$job_id" -o "%T" 2>/dev/null || true)"
          case "$state" in
            ""|COMPLETED|FAILED|CANCELLED|TIMEOUT|NODE_FAIL|PREEMPTED|BOOT_FAIL|DEADLINE|OUT_OF_MEMORY|SPECIAL_EXIT|REVOKED)
              finished=true
              break
              ;;
          esac
          squeue -j "$job_id" || true
          sleep 1
        done
        if [ "$finished" != true ]; then
          echo "Job $job_id still active after 90 polls; cancelling" >&2
          scancel "$job_id" || true
          exit 1
        fi
        echo "### sacct"
        sacct -j "$job_id" \
          --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
        # The accounting record may trail the controller briefly, so retry
        # while it is empty or still reports a non-terminal state.
        final_state=""
        for _ in $(seq 1 15); do
          final_state="$(sacct -j "$job_id" -X -n -P --format=State)"
          case "$final_state" in
            ""|PENDING|RUNNING|COMPLETING) sleep 1 ;;
            *) break ;;
          esac
        done
        echo "FINAL_STATE=$final_state"
        echo "### output"
        cat "/shared/qos-debug-test-${job_id}.out"
        if [ "$final_state" != "COMPLETED" ]; then
          echo "Job $job_id ended in state '${final_state:-unknown}', expected COMPLETED" >&2
          exit 1
        fi
      args:
        executable: /bin/bash
      register: debug_qos_job
      changed_when: true

    # Same flow as the debug job, but requests one GPU on the gpu partition
    # under the gpu-short QOS and runs nvidia-smi inside the allocation.
    - name: Submit gpu-short QOS job
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=qos-gpu-test
        #SBATCH --partition=gpu
        #SBATCH --qos=gpu-short
        #SBATCH --account=lab
        #SBATCH --gres=gpu:1
        #SBATCH --cpus-per-task=2
        #SBATCH --mem=1G
        #SBATCH --time=00:03:00
        #SBATCH --output=/shared/qos-gpu-test-%j.out
        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "QOS=${SLURM_JOB_QOS:-}"
        echo "ACCOUNT=${SLURM_JOB_ACCOUNT:-}"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
        echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        echo
        nvidia-smi
        SBATCH
        )"
        # --parsable may print "jobid;cluster"; keep only the job id.
        job_id="${job_id%%;*}"
        echo "JOB_ID=$job_id"
        # Poll on job state rather than on presence in squeue: a finished job
        # may still be listed by `squeue -j` until slurmctld purges it.
        finished=false
        for _ in $(seq 1 120); do
          state="$(squeue -h -j "$job_id" -o "%T" 2>/dev/null || true)"
          case "$state" in
            ""|COMPLETED|FAILED|CANCELLED|TIMEOUT|NODE_FAIL|PREEMPTED|BOOT_FAIL|DEADLINE|OUT_OF_MEMORY|SPECIAL_EXIT|REVOKED)
              finished=true
              break
              ;;
          esac
          squeue -j "$job_id" || true
          sleep 1
        done
        if [ "$finished" != true ]; then
          echo "Job $job_id still active after 120 polls; cancelling" >&2
          scancel "$job_id" || true
          exit 1
        fi
        echo "### sacct"
        sacct -j "$job_id" \
          --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
        # The accounting record may trail the controller briefly, so retry
        # while it is empty or still reports a non-terminal state.
        final_state=""
        for _ in $(seq 1 15); do
          final_state="$(sacct -j "$job_id" -X -n -P --format=State)"
          case "$final_state" in
            ""|PENDING|RUNNING|COMPLETING) sleep 1 ;;
            *) break ;;
          esac
        done
        echo "FINAL_STATE=$final_state"
        echo "### output"
        cat "/shared/qos-gpu-test-${job_id}.out"
        if [ "$final_state" != "COMPLETED" ]; then
          echo "Job $job_id ended in state '${final_state:-unknown}', expected COMPLETED" >&2
          exit 1
        fi
      args:
        executable: /bin/bash
      register: gpu_qos_job
      changed_when: true

    # Negative test: a 30-minute request under debug-short must either be
    # rejected by sbatch or sit pending with a limit-related reason.
    # Any accepted job is cancelled before the task returns.
    - name: Validate debug-short walltime limit behavior
      ansible.builtin.shell: |
        set -euo pipefail
        # sbatch is expected to fail here, so capture its status by hand.
        set +e
        output="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH' 2>&1
        #!/bin/bash
        #SBATCH --job-name=qos-limit-fail
        #SBATCH --partition=debug
        #SBATCH --qos=debug-short
        #SBATCH --account=lab
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:30:00
        #SBATCH --output=/shared/qos-limit-fail-%j.out
        sleep 10
        SBATCH
        )"
        rc=$?
        set -e
        echo "RC=$rc"
        echo "$output"
        if [ "$rc" -ne 0 ]; then
          echo "Limit rejection test passed at submit time"
          exit 0
        fi
        # stderr is merged into $output, so warnings may precede the id, and
        # --parsable may append ";cluster". Keep the last line's leading field.
        job_id="$(printf '%s\n' "$output" | tail -n 1)"
        job_id="${job_id%%;*}"
        if ! [[ "$job_id" =~ ^[0-9]+$ ]]; then
          echo "Could not parse a job id from sbatch output" >&2
          exit 1
        fi
        echo "Submitted job despite expected limit check: $job_id"
        # Give the scheduler a moment to assign a pending reason.
        sleep 3
        echo "### squeue"
        squeue -j "$job_id" -o "%.18i %.9P %.20j %.8u %.2t %.10M %.6D %R" || true
        echo
        echo "### job detail"
        scontrol show job "$job_id" || true
        state="$(squeue -h -j "$job_id" -o "%T" || true)"
        reason="$(squeue -h -j "$job_id" -o "%R" || true)"
        echo "STATE=$state"
        echo "REASON=$reason"
        if echo "$state" | grep -qE "PENDING|CONFIGURING"; then
          if echo "$reason" | grep -qiE "qos|limit|time|max|assoc"; then
            echo "Limit enforcement test passed via pending reason"
            scancel "$job_id" || true
            exit 0
          fi
        fi
        echo "Job was accepted without an obvious QOS/limit pending reason"
        scancel "$job_id" || true
        exit 1
      args:
        executable: /bin/bash
      register: limit_rejection
      changed_when: false

    # Best-effort snapshot; only the final sacct pipeline can fail the task.
    - name: Show priority and fairshare snapshot
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### queue"
        squeue || true
        echo
        echo "### sprio"
        sprio || true
        echo
        echo "### sshare"
        sshare -A {{ slurm_account_name | quote }} || true
        echo
        echo "### recent sacct"
        sacct -S today \
          --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList \
          | tail -40
      args:
        executable: /bin/bash
      register: priority_snapshot
      changed_when: false

    # Emit everything captured above in one place for the run log.
    - name: Print validation result
      ansible.builtin.debug:
        msg:
          - "### priority state"
          - "{{ priority_state.stdout_lines }}"
          - "### debug QOS job"
          - "{{ debug_qos_job.stdout_lines }}"
          - "### GPU QOS job"
          - "{{ gpu_qos_job.stdout_lines }}"
          - "### limit rejection"
          - "{{ limit_rejection.stdout_lines }}"
          - "### priority snapshot"
          - "{{ priority_snapshot.stdout_lines }}"