Add Slurm AI/HPC cluster platform project

This commit is contained in:
Mateusz Suski
2026-06-04 19:41:05 +00:00
parent e2624a7533
commit cd6830334b
47 changed files with 4727 additions and 0 deletions
@@ -0,0 +1,169 @@
---
# Creates the Slurm QOS levels, applies their limits, and attaches them to the
# account named by `slurm_account_name` and to the slurmuser association.
# `slurm_account_name` is not defined in this play; it has to be supplied by
# inventory, group vars or extra vars.
- name: Configure Slurm QOS, limits and fairshare
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # Read-only probe: the play stops here if sacctmgr exits non-zero.
    - name: Ensure sacctmgr is available
      ansible.builtin.command:
        cmd: sacctmgr -n list cluster
      changed_when: false

    # Prints the accounting configuration and the known TRES, then lets awk
    # exit non-zero unless a TRES row with type "gres" and name "gpu" exists.
    # Needs a real shell because of the pipelines.
    - name: Validate accounting GPU TRES exists
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### configured AccountingStorageTRES"
        scontrol show config | grep -E "AccountingStorageTRES|AccountingStorageType|AccountingStorageEnforce"
        echo
        echo "### known TRES"
        sacctmgr show tres
        echo
        echo "### checking gres/gpu"
        sacctmgr -n show tres format=Type,Name | awk '$1=="gres" && $2=="gpu" {found=1} END {exit !found}'
      args:
        executable: /bin/bash
      register: gpu_tres_check
      changed_when: false

    # The four "add qos" tasks share one pattern: report "changed" only when
    # sacctmgr announces it is adding the QOS, and tolerate a non-zero exit
    # code when the output says the QOS is already present.
    - name: Ensure normal QOS exists
      ansible.builtin.command:
        cmd: sacctmgr -i add qos normal Priority=100
      register: add_qos_normal
      changed_when: "'Adding QOS' in (add_qos_normal.stdout + add_qos_normal.stderr)"
      failed_when: >
        add_qos_normal.rc != 0 and
        'Nothing new added' not in (add_qos_normal.stdout + add_qos_normal.stderr) and
        'already exists' not in (add_qos_normal.stdout + add_qos_normal.stderr) and
        'Already existing' not in (add_qos_normal.stdout + add_qos_normal.stderr)

    - name: Ensure debug-short QOS exists
      ansible.builtin.command:
        cmd: sacctmgr -i add qos debug-short Priority=500
      register: add_qos_debug
      changed_when: "'Adding QOS' in (add_qos_debug.stdout + add_qos_debug.stderr)"
      failed_when: >
        add_qos_debug.rc != 0 and
        'Nothing new added' not in (add_qos_debug.stdout + add_qos_debug.stderr) and
        'already exists' not in (add_qos_debug.stdout + add_qos_debug.stderr) and
        'Already existing' not in (add_qos_debug.stdout + add_qos_debug.stderr)

    - name: Ensure gpu-short QOS exists
      ansible.builtin.command:
        cmd: sacctmgr -i add qos gpu-short Priority=1000
      register: add_qos_gpu
      changed_when: "'Adding QOS' in (add_qos_gpu.stdout + add_qos_gpu.stderr)"
      failed_when: >
        add_qos_gpu.rc != 0 and
        'Nothing new added' not in (add_qos_gpu.stdout + add_qos_gpu.stderr) and
        'already exists' not in (add_qos_gpu.stdout + add_qos_gpu.stderr) and
        'Already existing' not in (add_qos_gpu.stdout + add_qos_gpu.stderr)

    - name: Ensure maintenance QOS exists
      ansible.builtin.command:
        cmd: sacctmgr -i add qos maintenance Priority=5000
      register: add_qos_maintenance
      changed_when: "'Adding QOS' in (add_qos_maintenance.stdout + add_qos_maintenance.stderr)"
      failed_when: >
        add_qos_maintenance.rc != 0 and
        'Nothing new added' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr) and
        'already exists' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr) and
        'Already existing' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr)

    # The "modify" tasks below re-apply the desired values on every run. Their
    # output is not inspected, so they always report "changed".
    - name: Normalize normal QOS settings
      ansible.builtin.command:
        cmd: sacctmgr -i modify qos normal set Priority=100
      changed_when: true

    - name: Normalize debug-short QOS settings
      ansible.builtin.command:
        cmd: >-
          sacctmgr -i modify qos debug-short set
          Priority=500 MaxWall=00:10:00 MaxTRESPU=cpu=2 MaxJobsPU=4
      changed_when: true

    - name: Normalize gpu-short QOS settings
      ansible.builtin.command:
        cmd: >-
          sacctmgr -i modify qos gpu-short set
          Priority=1000 MaxWall=01:00:00 MaxTRESPU=gres/gpu=1,cpu=12 MaxJobsPU=2
      changed_when: true

    - name: Normalize maintenance QOS settings
      ansible.builtin.command:
        cmd: sacctmgr -i modify qos maintenance set Priority=5000 MaxWall=02:00:00
      changed_when: true

    - name: Assign QOS set to lab account
      ansible.builtin.command:
        cmd: >-
          sacctmgr -i modify account {{ slurm_account_name }} set
          QOS=normal,debug-short,gpu-short,maintenance DefaultQOS=normal Fairshare=100
      changed_when: true

    - name: Assign default account to slurmuser
      ansible.builtin.command:
        cmd: >-
          sacctmgr -i modify user where name=slurmuser set
          DefaultAccount={{ slurm_account_name }}
      changed_when: true

    - name: Assign QOS set to slurmuser association
      ansible.builtin.command:
        cmd: >-
          sacctmgr -i modify user where name=slurmuser account={{ slurm_account_name }} set
          QOS=normal,debug-short,gpu-short,maintenance DefaultQOS=normal Fairshare=100
      changed_when: true

    # Read-only report of the resulting state. A failing sshare is ignored
    # (`|| true`) so it cannot fail the play.
    - name: Show configured QOS and associations
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### TRES"
        sacctmgr show tres
        echo
        echo "### QOS"
        sacctmgr show qos format=Name%20,Priority,MaxWall,MaxTRESPU%40,MaxJobsPU
        echo
        echo "### Associations"
        sacctmgr show assoc format=Cluster,Account,User,Share,QOS%60,DefaultQOS,Fairshare
        echo
        echo "### Fairshare"
        sshare -A {{ slurm_account_name }} || true
      args:
        executable: /bin/bash
      register: qos_state
      changed_when: false

    - name: Print QOS state
      ansible.builtin.debug:
        var: qos_state.stdout_lines
@@ -0,0 +1,235 @@
---
# Validates the QOS setup end to end: dumps the priority/accounting config,
# runs one job under debug-short and one under gpu-short as slurmuser, checks
# that a job exceeding the debug-short walltime is rejected or held, and prints
# a priority/fairshare snapshot.
# `slurm_account_name` is not defined in this play; it has to be supplied by
# inventory, group vars or extra vars.
- name: Validate Slurm QOS, fairshare and priority
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # Read-only report. Under pipefail, the two grep pipelines fail the task
    # when none of the expected config keys are printed.
    - name: Validate priority runtime config
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### priority config"
        scontrol show config | grep -E "PriorityType|PriorityWeight|PriorityDecay|PriorityCalc|PriorityMaxAge|PriorityFavor"
        echo
        echo "### accounting enforcement"
        scontrol show config | grep -E "AccountingStorageType|AccountingStorageEnforce|AccountingStorageTRES"
        echo
        echo "### QOS"
        sacctmgr show qos format=Name%20,Priority,MaxWall,MaxTRESPU%50,MaxJobsPU
        echo
        echo "### associations"
        sacctmgr show assoc format=Cluster,Account,User,Share,QOS%80,DefaultQOS,Fairshare
        echo
        echo "### fairshare"
        sshare -A {{ slurm_account_name }} || true
      args:
        executable: /bin/bash
      register: priority_state
      changed_when: false

    # Submits a small CPU job under the debug-short QOS, polls squeue once per
    # second (at most 90 times) until the job leaves the queue, then prints the
    # accounting record and the job's output file.
    # The heredoc body must stay at the base indentation of the script: the
    # terminator is only recognised at the start of a line.
    - name: Submit debug-short QOS job
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=qos-debug-test
        #SBATCH --partition=debug
        #SBATCH --qos=debug-short
        #SBATCH --account={{ slurm_account_name }}
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/qos-debug-test-%j.out
        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "QOS=${SLURM_JOB_QOS:-}"
        echo "ACCOUNT=${SLURM_JOB_ACCOUNT:-}"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        date
        SBATCH
        )"
        # --parsable may print "jobid;cluster"; keep only the job id.
        job_id="${job_id%%;*}"
        echo "JOB_ID=$job_id"
        finished=0
        for _ in $(seq 1 90); do
          # Empty squeue output (or a failing squeue) means the job left the queue.
          if [ -n "$(squeue -h -j "$job_id" || true)" ]; then
            squeue -j "$job_id"
            sleep 1
          else
            finished=1
            break
          fi
        done
        if [ "$finished" -ne 1 ]; then
          # Fail explicitly instead of reading a missing or partial output file.
          echo "Job $job_id did not leave the queue within 90 polls" >&2
          squeue -j "$job_id" || true
          exit 1
        fi
        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
        echo "### output"
        cat "/shared/qos-debug-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: debug_qos_job
      changed_when: true

    # Same flow as the debug-short job, but requests one GPU under gpu-short
    # and allows up to 120 polls.
    - name: Submit gpu-short QOS job
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=qos-gpu-test
        #SBATCH --partition=gpu
        #SBATCH --qos=gpu-short
        #SBATCH --account={{ slurm_account_name }}
        #SBATCH --gres=gpu:1
        #SBATCH --cpus-per-task=2
        #SBATCH --mem=1G
        #SBATCH --time=00:03:00
        #SBATCH --output=/shared/qos-gpu-test-%j.out
        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "QOS=${SLURM_JOB_QOS:-}"
        echo "ACCOUNT=${SLURM_JOB_ACCOUNT:-}"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
        echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        echo
        nvidia-smi
        SBATCH
        )"
        # --parsable may print "jobid;cluster"; keep only the job id.
        job_id="${job_id%%;*}"
        echo "JOB_ID=$job_id"
        finished=0
        for _ in $(seq 1 120); do
          # Empty squeue output (or a failing squeue) means the job left the queue.
          if [ -n "$(squeue -h -j "$job_id" || true)" ]; then
            squeue -j "$job_id"
            sleep 1
          else
            finished=1
            break
          fi
        done
        if [ "$finished" -ne 1 ]; then
          # Fail explicitly instead of reading a missing or partial output file.
          echo "Job $job_id did not leave the queue within 120 polls" >&2
          squeue -j "$job_id" || true
          exit 1
        fi
        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
        echo "### output"
        cat "/shared/qos-gpu-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: gpu_qos_job
      changed_when: true

    # Requests 30 minutes under debug-short. The test passes when sbatch
    # rejects the submission, or when the job is accepted but stays pending
    # with a reason that mentions a QOS/limit keyword. Any accepted job is
    # cancelled before the task exits.
    - name: Validate debug-short walltime limit behavior
      ansible.builtin.shell: |
        set -euo pipefail
        # sbatch is allowed to fail here, so capture its exit code instead of aborting.
        set +e
        output="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH' 2>&1
        #!/bin/bash
        #SBATCH --job-name=qos-limit-fail
        #SBATCH --partition=debug
        #SBATCH --qos=debug-short
        #SBATCH --account={{ slurm_account_name }}
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:30:00
        #SBATCH --output=/shared/qos-limit-fail-%j.out
        sleep 10
        SBATCH
        )"
        rc=$?
        set -e
        echo "RC=$rc"
        echo "$output"
        if [ "$rc" -ne 0 ]; then
          echo "Limit rejection test passed at submit time"
          exit 0
        fi
        # stderr is merged into $output, so pick the line that looks like
        # "jobid" or "jobid;cluster" and drop the cluster suffix.
        job_id="$(printf '%s\n' "$output" | grep -E '^[0-9]+(;.*)?$' | tail -n 1 || true)"
        job_id="${job_id%%;*}"
        if [ -z "$job_id" ]; then
          echo "Could not parse a job id from the sbatch output" >&2
          exit 1
        fi
        echo "Submitted job despite expected limit check: $job_id"
        sleep 3
        echo "### squeue"
        squeue -j "$job_id" -o "%.18i %.9P %.20j %.8u %.2t %.10M %.6D %R" || true
        echo
        echo "### job detail"
        scontrol show job "$job_id" || true
        state="$(squeue -h -j "$job_id" -o "%T" || true)"
        reason="$(squeue -h -j "$job_id" -o "%R" || true)"
        echo "STATE=$state"
        echo "REASON=$reason"
        if echo "$state" | grep -qE "PENDING|CONFIGURING"; then
          if echo "$reason" | grep -qiE "qos|limit|time|max|assoc"; then
            echo "Limit enforcement test passed via pending reason"
            scancel "$job_id" || true
            exit 0
          fi
        fi
        echo "Job was accepted without an obvious QOS/limit pending reason"
        scancel "$job_id" || true
        exit 1
      args:
        executable: /bin/bash
      register: limit_rejection
      changed_when: false

    # Read-only snapshot; squeue, sprio and sshare failures are ignored.
    - name: Show priority and fairshare snapshot
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### queue"
        squeue || true
        echo
        echo "### sprio"
        sprio || true
        echo
        echo "### sshare"
        sshare -A {{ slurm_account_name }} || true
        echo
        echo "### recent sacct"
        sacct -S today --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList | tail -40
      args:
        executable: /bin/bash
      register: priority_snapshot
      changed_when: false

    - name: Print validation result
      ansible.builtin.debug:
        msg:
          - "### priority state"
          - "{{ priority_state.stdout_lines }}"
          - "### debug QOS job"
          - "{{ debug_qos_job.stdout_lines }}"
          - "### GPU QOS job"
          - "{{ gpu_qos_job.stdout_lines }}"
          - "### limit rejection"
          - "{{ limit_rejection.stdout_lines }}"
          - "### priority snapshot"
          - "{{ priority_snapshot.stdout_lines }}"