Add Slurm AI/HPC cluster platform project
This commit is contained in:
@@ -0,0 +1,169 @@
|
||||
---
|
||||
# Playbook: create the Slurm QOS levels (normal, debug-short, gpu-short,
# maintenance), apply their limits, and attach the QOS set to the account named
# by `slurm_account_name` and to the `slurmuser` association in that account.
#
# Required variables:
#   slurm_account_name - Slurm accounting account that receives the QOS set.
- name: Configure Slurm QOS, limits and fairshare
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # Read-only probe: the play stops here if sacctmgr exits non-zero.
    - name: Ensure sacctmgr is available
      ansible.builtin.command:
        cmd: sacctmgr -n list cluster
      changed_when: false

    # Prints the accounting configuration and fails (awk exits non-zero) when
    # no TRES row with Type "gres" and Name "gpu" is registered.
    - name: Validate accounting GPU TRES exists
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### configured AccountingStorageTRES"
        scontrol show config | grep -E "AccountingStorageTRES|AccountingStorageType|AccountingStorageEnforce"

        echo
        echo "### known TRES"
        sacctmgr show tres

        echo
        echo "### checking gres/gpu"
        sacctmgr -n show tres format=Type,Name | awk '$1=="gres" && $2=="gpu" {found=1} END {exit !found}'
      args:
        executable: /bin/bash
      register: gpu_tres_check
      changed_when: false

    # The four "Ensure ... QOS exists" tasks share one pattern: report
    # "changed" only when sacctmgr prints "Adding QOS", and tolerate a non-zero
    # exit code when the output says the QOS is already present.
    - name: Ensure normal QOS exists
      ansible.builtin.command:
        cmd: sacctmgr -i add qos normal Priority=100
      register: add_qos_normal
      changed_when: "'Adding QOS' in (add_qos_normal.stdout + add_qos_normal.stderr)"
      failed_when: >
        add_qos_normal.rc != 0 and
        'Nothing new added' not in (add_qos_normal.stdout + add_qos_normal.stderr) and
        'already exists' not in (add_qos_normal.stdout + add_qos_normal.stderr) and
        'Already existing' not in (add_qos_normal.stdout + add_qos_normal.stderr)

    - name: Ensure debug-short QOS exists
      ansible.builtin.command:
        cmd: sacctmgr -i add qos debug-short Priority=500
      register: add_qos_debug
      changed_when: "'Adding QOS' in (add_qos_debug.stdout + add_qos_debug.stderr)"
      failed_when: >
        add_qos_debug.rc != 0 and
        'Nothing new added' not in (add_qos_debug.stdout + add_qos_debug.stderr) and
        'already exists' not in (add_qos_debug.stdout + add_qos_debug.stderr) and
        'Already existing' not in (add_qos_debug.stdout + add_qos_debug.stderr)

    - name: Ensure gpu-short QOS exists
      ansible.builtin.command:
        cmd: sacctmgr -i add qos gpu-short Priority=1000
      register: add_qos_gpu
      changed_when: "'Adding QOS' in (add_qos_gpu.stdout + add_qos_gpu.stderr)"
      failed_when: >
        add_qos_gpu.rc != 0 and
        'Nothing new added' not in (add_qos_gpu.stdout + add_qos_gpu.stderr) and
        'already exists' not in (add_qos_gpu.stdout + add_qos_gpu.stderr) and
        'Already existing' not in (add_qos_gpu.stdout + add_qos_gpu.stderr)

    - name: Ensure maintenance QOS exists
      ansible.builtin.command:
        cmd: sacctmgr -i add qos maintenance Priority=5000
      register: add_qos_maintenance
      changed_when: "'Adding QOS' in (add_qos_maintenance.stdout + add_qos_maintenance.stderr)"
      failed_when: >
        add_qos_maintenance.rc != 0 and
        'Nothing new added' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr) and
        'already exists' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr) and
        'Already existing' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr)

    # The "Normalize"/"Assign" tasks below re-apply the desired settings on
    # every run and therefore always report "changed" (changed_when: true).
    - name: Normalize normal QOS settings
      ansible.builtin.command:
        cmd: sacctmgr -i modify qos normal set Priority=100
      changed_when: true

    - name: Normalize debug-short QOS settings
      ansible.builtin.command:
        cmd: >-
          sacctmgr -i modify qos debug-short set
          Priority=500 MaxWall=00:10:00 MaxTRESPU=cpu=2 MaxJobsPU=4
      changed_when: true

    - name: Normalize gpu-short QOS settings
      ansible.builtin.command:
        cmd: >-
          sacctmgr -i modify qos gpu-short set
          Priority=1000 MaxWall=01:00:00 MaxTRESPU=gres/gpu=1,cpu=12 MaxJobsPU=2
      changed_when: true

    - name: Normalize maintenance QOS settings
      ansible.builtin.command:
        cmd: sacctmgr -i modify qos maintenance set Priority=5000 MaxWall=02:00:00
      changed_when: true

    - name: Assign QOS set to lab account
      ansible.builtin.command:
        cmd: >-
          sacctmgr -i modify account {{ slurm_account_name }} set
          QOS=normal,debug-short,gpu-short,maintenance
          DefaultQOS=normal Fairshare=100
      changed_when: true

    - name: Assign default account to slurmuser
      ansible.builtin.command:
        cmd: >-
          sacctmgr -i modify user where name=slurmuser
          set DefaultAccount={{ slurm_account_name }}
      changed_when: true

    - name: Assign QOS set to slurmuser association
      ansible.builtin.command:
        cmd: >-
          sacctmgr -i modify user where name=slurmuser
          account={{ slurm_account_name }} set
          QOS=normal,debug-short,gpu-short,maintenance
          DefaultQOS=normal Fairshare=100
      changed_when: true

    # Read-only report; the sshare call is best-effort (`|| true`) so a
    # failure there does not fail the play.
    - name: Show configured QOS and associations
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### TRES"
        sacctmgr show tres

        echo
        echo "### QOS"
        sacctmgr show qos format=Name%20,Priority,MaxWall,MaxTRESPU%40,MaxJobsPU

        echo
        echo "### Associations"
        sacctmgr show assoc format=Cluster,Account,User,Share,QOS%60,DefaultQOS,Fairshare

        echo
        echo "### Fairshare"
        sshare -A {{ slurm_account_name | quote }} || true
      args:
        executable: /bin/bash
      register: qos_state
      changed_when: false

    - name: Print QOS state
      ansible.builtin.debug:
        var: qos_state.stdout_lines
|
||||
@@ -0,0 +1,235 @@
|
||||
---
|
||||
# Playbook: inspect the priority/accounting configuration, submit one test job
# per QOS (debug-short, gpu-short) as `slurmuser`, check that an over-long
# debug-short request is rejected or held, and print everything collected.
#
# Required variables:
#   slurm_account_name - Slurm account used for the test jobs and for sshare.
#
# NOTE: inside the shell scripts the heredoc bodies and their `SBATCH`
# terminators must stay at the script's left margin; the batch script has to
# begin with `#!` and the terminator is only recognised at column 0.
- name: Validate Slurm QOS, fairshare and priority
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # Read-only report. With `pipefail`, a grep that matches nothing fails the
    # task; only the sshare call is best-effort.
    - name: Validate priority runtime config
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### priority config"
        scontrol show config | grep -E "PriorityType|PriorityWeight|PriorityDecay|PriorityCalc|PriorityMaxAge|PriorityFavor"

        echo
        echo "### accounting enforcement"
        scontrol show config | grep -E "AccountingStorageType|AccountingStorageEnforce|AccountingStorageTRES"

        echo
        echo "### QOS"
        sacctmgr show qos format=Name%20,Priority,MaxWall,MaxTRESPU%50,MaxJobsPU

        echo
        echo "### associations"
        sacctmgr show assoc format=Cluster,Account,User,Share,QOS%80,DefaultQOS,Fairshare

        echo
        echo "### fairshare"
        sshare -A {{ slurm_account_name | quote }} || true
      args:
        executable: /bin/bash
      register: priority_state
      changed_when: false

    # Submits a short CPU job under the debug-short QOS, polls squeue for up to
    # 90 iterations until the job leaves the queue, then prints the accounting
    # record and the job's output file.
    - name: Submit debug-short QOS job
      ansible.builtin.shell: |
        set -euo pipefail

        job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=qos-debug-test
        #SBATCH --partition=debug
        #SBATCH --qos=debug-short
        #SBATCH --account={{ slurm_account_name }}
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/qos-debug-test-%j.out

        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "QOS=${SLURM_JOB_QOS:-}"
        echo "ACCOUNT=${SLURM_JOB_ACCOUNT:-}"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        date
        SBATCH
        )"

        echo "JOB_ID=$job_id"

        for _ in $(seq 1 90); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done

        # Fail with a clear message instead of reading a missing or partial
        # output file when the job is still queued after the polling window.
        if squeue -h -j "$job_id" | grep -q .; then
          echo "Timed out waiting for job $job_id to leave the queue" >&2
          scancel "$job_id" || true
          exit 1
        fi

        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

        echo "### output"
        cat "/shared/qos-debug-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: debug_qos_job
      changed_when: true

    # Same flow as the debug-short job, but requests one GPU under the
    # gpu-short QOS and polls for up to 120 iterations.
    - name: Submit gpu-short QOS job
      ansible.builtin.shell: |
        set -euo pipefail

        job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=qos-gpu-test
        #SBATCH --partition=gpu
        #SBATCH --qos=gpu-short
        #SBATCH --account={{ slurm_account_name }}
        #SBATCH --gres=gpu:1
        #SBATCH --cpus-per-task=2
        #SBATCH --mem=1G
        #SBATCH --time=00:03:00
        #SBATCH --output=/shared/qos-gpu-test-%j.out

        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "QOS=${SLURM_JOB_QOS:-}"
        echo "ACCOUNT=${SLURM_JOB_ACCOUNT:-}"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
        echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        echo
        nvidia-smi
        SBATCH
        )"

        echo "JOB_ID=$job_id"

        for _ in $(seq 1 120); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done

        # Fail with a clear message instead of reading a missing or partial
        # output file when the job is still queued after the polling window.
        if squeue -h -j "$job_id" | grep -q .; then
          echo "Timed out waiting for job $job_id to leave the queue" >&2
          scancel "$job_id" || true
          exit 1
        fi

        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

        echo "### output"
        cat "/shared/qos-gpu-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: gpu_qos_job
      changed_when: true

    # Requests 30 minutes under debug-short, which is expected to exceed that
    # QOS's wall-time limit. The task passes when sbatch rejects the job, or
    # when the job is accepted but sits PENDING/CONFIGURING with a reason that
    # mentions qos/limit/time/max/assoc; otherwise it cancels the job and fails.
    - name: Validate debug-short walltime limit behavior
      ansible.builtin.shell: |
        set -euo pipefail

        # Capture sbatch's exit code ourselves instead of aborting on failure.
        set +e
        output="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH' 2>&1
        #!/bin/bash
        #SBATCH --job-name=qos-limit-fail
        #SBATCH --partition=debug
        #SBATCH --qos=debug-short
        #SBATCH --account={{ slurm_account_name }}
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:30:00
        #SBATCH --output=/shared/qos-limit-fail-%j.out

        sleep 10
        SBATCH
        )"
        rc=$?
        set -e

        echo "RC=$rc"
        echo "$output"

        if [ "$rc" -ne 0 ]; then
          echo "Limit rejection test passed at submit time"
          exit 0
        fi

        job_id="$output"
        echo "Submitted job despite expected limit check: $job_id"

        sleep 3

        echo "### squeue"
        squeue -j "$job_id" -o "%.18i %.9P %.20j %.8u %.2t %.10M %.6D %R" || true

        echo
        echo "### job detail"
        scontrol show job "$job_id" || true

        state="$(squeue -h -j "$job_id" -o "%T" || true)"
        reason="$(squeue -h -j "$job_id" -o "%R" || true)"

        echo "STATE=$state"
        echo "REASON=$reason"

        if echo "$state" | grep -qE "PENDING|CONFIGURING"; then
          if echo "$reason" | grep -qiE "qos|limit|time|max|assoc"; then
            echo "Limit enforcement test passed via pending reason"
            scancel "$job_id" || true
            exit 0
          fi
        fi

        echo "Job was accepted without an obvious QOS/limit pending reason"
        scancel "$job_id" || true
        exit 1
      args:
        executable: /bin/bash
      register: limit_rejection
      changed_when: false

    # Read-only snapshot; squeue, sprio and sshare are best-effort, while the
    # final sacct pipeline can still fail the task.
    - name: Show priority and fairshare snapshot
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### queue"
        squeue || true

        echo
        echo "### sprio"
        sprio || true

        echo
        echo "### sshare"
        sshare -A {{ slurm_account_name | quote }} || true

        echo
        echo "### recent sacct"
        sacct -S today --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList | tail -40
      args:
        executable: /bin/bash
      register: priority_snapshot
      changed_when: false

    - name: Print validation result
      ansible.builtin.debug:
        msg:
          - "### priority state"
          - "{{ priority_state.stdout_lines }}"
          - "### debug QOS job"
          - "{{ debug_qos_job.stdout_lines }}"
          - "### GPU QOS job"
          - "{{ gpu_qos_job.stdout_lines }}"
          - "### limit rejection"
          - "{{ limit_rejection.stdout_lines }}"
          - "### priority snapshot"
          - "{{ priority_snapshot.stdout_lines }}"
|
||||
Reference in New Issue
Block a user