Add Slurm AI/HPC cluster platform project
This commit is contained in:
@@ -0,0 +1,235 @@
|
||||
---
# Play header: runs every task below on the Slurm controller host group with
# privilege escalation. Fact gathering is disabled; none of the visible tasks
# read host facts.
- name: "Validate Slurm QOS, fairshare and priority"
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
|
||||
# Dumps the priority and accounting settings reported by `scontrol show
# config`, the QOS table, the associations and the fairshare data for the
# configured account. Read-only, so it never reports "changed"; stdout is
# registered as `priority_state`.
- name: Validate priority runtime config
  ansible.builtin.shell:
    executable: /bin/bash
    cmd: |
      set -euo pipefail

      # With pipefail, a grep that matches nothing fails the whole task.
      echo "### priority config"
      scontrol show config | grep -E "PriorityType|PriorityWeight|PriorityDecay|PriorityCalc|PriorityMaxAge|PriorityFavor"

      echo
      echo "### accounting enforcement"
      scontrol show config | grep -E "AccountingStorageType|AccountingStorageEnforce|AccountingStorageTRES"

      echo
      echo "### QOS"
      sacctmgr show qos format=Name%20,Priority,MaxWall,MaxTRESPU%50,MaxJobsPU

      echo
      echo "### associations"
      sacctmgr show assoc format=Cluster,Account,User,Share,QOS%80,DefaultQOS,Fairshare

      echo
      echo "### fairshare"
      # The account name is shell-quoted so an unusual value cannot be
      # re-interpreted by bash; sshare failing is tolerated (best effort).
      sshare -A {{ slurm_account_name | quote }} || true
  register: priority_state
  changed_when: false
|
||||
|
||||
# Submits a small CPU job as `slurmuser` on the `debug` partition with the
# `debug-short` QOS, waits for it to leave the active states, then prints the
# accounting record and the job's output file. stdout is registered as
# `debug_qos_job`. The task fails when the job id cannot be parsed, when the
# job is still active after the polling window, when the output file cannot
# be read, or when the controller reports a final state other than COMPLETED.
- name: Submit debug-short QOS job
  ansible.builtin.shell:
    executable: /bin/bash
    cmd: |
      set -euo pipefail

      submit_out="$(
      sudo -iu slurmuser sbatch --parsable <<'SBATCH'
      #!/bin/bash
      #SBATCH --job-name=qos-debug-test
      #SBATCH --partition=debug
      #SBATCH --qos=debug-short
      #SBATCH --account=lab
      #SBATCH --cpus-per-task=1
      #SBATCH --mem=256M
      #SBATCH --time=00:02:00
      #SBATCH --output=/shared/qos-debug-test-%j.out

      echo "HOST=$(hostname)"
      echo "USER=$(whoami)"
      echo "QOS=${SLURM_JOB_QOS:-}"
      echo "ACCOUNT=${SLURM_JOB_ACCOUNT:-}"
      echo "SLURM_JOB_ID=$SLURM_JOB_ID"
      echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
      echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
      date
      SBATCH
      )"

      # --parsable prints "jobid" or "jobid;cluster"; keep only the numeric id
      # so the squeue/sacct calls and the output path below stay valid.
      job_id="${submit_out%%;*}"
      if ! [[ "$job_id" =~ ^[0-9]+$ ]]; then
        echo "Could not parse a job id from sbatch output: $submit_out" >&2
        exit 1
      fi

      echo "JOB_ID=$job_id"

      out_file="/shared/qos-debug-test-${job_id}.out"

      # Prints the job's state while it is pending/running, nothing otherwise.
      # Filtering on state matters because squeue may keep listing a finished
      # job for a while when it is asked for by id.
      active_state() {
        squeue -h -j "$job_id" -o '%T' \
          -t PENDING,CONFIGURING,RUNNING,SUSPENDED,COMPLETING 2>/dev/null || true
      }

      # Poll up to 90 times, one second apart.
      for _ in $(seq 1 90); do
        [ -n "$(active_state)" ] || break
        squeue -j "$job_id" || true
        sleep 1
      done

      # A job that is still active here would make the report below misleading.
      if [ -n "$(active_state)" ]; then
        echo "Job $job_id is still active after the polling window; cancelling it" >&2
        scancel "$job_id" || true
        exit 1
      fi

      # Final state as seen by the controller; empty if the record is gone.
      job_state="$(scontrol -o show job "$job_id" 2>/dev/null \
        | grep -oE 'JobState=[A-Z_]+' | cut -d= -f2 || true)"
      echo "JOB_STATE=${job_state:-unknown}"

      # Give the shared filesystem a moment to expose the output file.
      for _ in $(seq 1 15); do
        if [ -e "$out_file" ]; then
          break
        fi
        sleep 1
      done

      echo "### sacct"
      sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

      echo "### output"
      cat "$out_file"

      # Only fail on a state we actually know; an unknown state is reported
      # above but not treated as an error.
      if [ -n "$job_state" ] && [ "$job_state" != "COMPLETED" ]; then
        echo "Job $job_id finished in state $job_state" >&2
        exit 1
      fi
  register: debug_qos_job
  changed_when: true
|
||||
|
||||
# Submits a one-GPU job as `slurmuser` on the `gpu` partition with the
# `gpu-short` QOS, waits for it to leave the active states, then prints the
# accounting record and the job's output file (which includes `nvidia-smi`).
# stdout is registered as `gpu_qos_job`. The task fails when the job id cannot
# be parsed, when the job is still active after the polling window, when the
# output file cannot be read, or when the controller reports a final state
# other than COMPLETED.
- name: Submit gpu-short QOS job
  ansible.builtin.shell:
    executable: /bin/bash
    cmd: |
      set -euo pipefail

      submit_out="$(
      sudo -iu slurmuser sbatch --parsable <<'SBATCH'
      #!/bin/bash
      #SBATCH --job-name=qos-gpu-test
      #SBATCH --partition=gpu
      #SBATCH --qos=gpu-short
      #SBATCH --account=lab
      #SBATCH --gres=gpu:1
      #SBATCH --cpus-per-task=2
      #SBATCH --mem=1G
      #SBATCH --time=00:03:00
      #SBATCH --output=/shared/qos-gpu-test-%j.out

      echo "HOST=$(hostname)"
      echo "USER=$(whoami)"
      echo "QOS=${SLURM_JOB_QOS:-}"
      echo "ACCOUNT=${SLURM_JOB_ACCOUNT:-}"
      echo "SLURM_JOB_ID=$SLURM_JOB_ID"
      echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
      echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
      echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
      echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
      echo
      nvidia-smi
      SBATCH
      )"

      # --parsable prints "jobid" or "jobid;cluster"; keep only the numeric id
      # so the squeue/sacct calls and the output path below stay valid.
      job_id="${submit_out%%;*}"
      if ! [[ "$job_id" =~ ^[0-9]+$ ]]; then
        echo "Could not parse a job id from sbatch output: $submit_out" >&2
        exit 1
      fi

      echo "JOB_ID=$job_id"

      out_file="/shared/qos-gpu-test-${job_id}.out"

      # Prints the job's state while it is pending/running, nothing otherwise.
      # Filtering on state matters because squeue may keep listing a finished
      # job for a while when it is asked for by id.
      active_state() {
        squeue -h -j "$job_id" -o '%T' \
          -t PENDING,CONFIGURING,RUNNING,SUSPENDED,COMPLETING 2>/dev/null || true
      }

      # Poll up to 120 times, one second apart.
      for _ in $(seq 1 120); do
        [ -n "$(active_state)" ] || break
        squeue -j "$job_id" || true
        sleep 1
      done

      # A job that is still active here would make the report below misleading.
      if [ -n "$(active_state)" ]; then
        echo "Job $job_id is still active after the polling window; cancelling it" >&2
        scancel "$job_id" || true
        exit 1
      fi

      # Final state as seen by the controller; empty if the record is gone.
      job_state="$(scontrol -o show job "$job_id" 2>/dev/null \
        | grep -oE 'JobState=[A-Z_]+' | cut -d= -f2 || true)"
      echo "JOB_STATE=${job_state:-unknown}"

      # Give the shared filesystem a moment to expose the output file.
      for _ in $(seq 1 15); do
        if [ -e "$out_file" ]; then
          break
        fi
        sleep 1
      done

      echo "### sacct"
      sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

      echo "### output"
      cat "$out_file"

      # Only fail on a state we actually know; an unknown state is reported
      # above but not treated as an error.
      if [ -n "$job_state" ] && [ "$job_state" != "COMPLETED" ]; then
        echo "Job $job_id finished in state $job_state" >&2
        exit 1
      fi
  register: gpu_qos_job
  changed_when: true
|
||||
|
||||
# Submits a job that asks for 30 minutes under the `debug-short` QOS and
# checks that the request is not simply accepted. The task passes when sbatch
# rejects the submission, or when the job is accepted but sits in
# PENDING/CONFIGURING with a reason that mentions a QOS/limit keyword.
# Otherwise it fails. Any job that was accepted is cancelled before the task
# returns. stdout is registered as `limit_rejection`.
- name: Validate debug-short walltime limit behavior
  ansible.builtin.shell:
    executable: /bin/bash
    cmd: |
      set -euo pipefail

      # sbatch may legitimately fail here, so errexit is suspended around it
      # and the exit status is inspected by hand.
      set +e
      output="$(
      sudo -iu slurmuser sbatch --parsable <<'SBATCH' 2>&1
      #!/bin/bash
      #SBATCH --job-name=qos-limit-fail
      #SBATCH --partition=debug
      #SBATCH --qos=debug-short
      #SBATCH --account=lab
      #SBATCH --cpus-per-task=1
      #SBATCH --mem=256M
      #SBATCH --time=00:30:00
      #SBATCH --output=/shared/qos-limit-fail-%j.out

      sleep 10
      SBATCH
      )"
      rc=$?
      set -e

      echo "RC=$rc"
      echo "$output"

      if [ "$rc" -ne 0 ]; then
        echo "Limit rejection test passed at submit time"
        exit 0
      fi

      # stderr is merged into $output, so it may hold warnings next to the id.
      # Pick the line shaped like --parsable output ("jobid" or
      # "jobid;cluster") and keep only the numeric id.
      job_id="$(printf '%s\n' "$output" \
        | grep -E '^[0-9]+(;.*)?$' | tail -n 1 | cut -d';' -f1 || true)"
      if [ -z "$job_id" ]; then
        echo "Could not find a job id in sbatch output" >&2
        exit 1
      fi

      echo "Submitted job despite expected limit check: $job_id"

      # Short pause before inspecting the job's state and pending reason.
      sleep 3

      echo "### squeue"
      squeue -j "$job_id" -o "%.18i %.9P %.20j %.8u %.2t %.10M %.6D %R" || true

      echo
      echo "### job detail"
      scontrol show job "$job_id" || true

      state="$(squeue -h -j "$job_id" -o "%T" || true)"
      reason="$(squeue -h -j "$job_id" -o "%R" || true)"

      echo "STATE=$state"
      echo "REASON=$reason"

      # Accepted-but-held counts as enforcement only when the job has not
      # started and the reason text mentions one of the limit keywords.
      if grep -qE "PENDING|CONFIGURING" <<<"$state" \
        && grep -qiE "qos|limit|time|max|assoc" <<<"$reason"; then
        echo "Limit enforcement test passed via pending reason"
        scancel "$job_id" || true
        exit 0
      fi

      echo "Job was accepted without an obvious QOS/limit pending reason"
      scancel "$job_id" || true
      exit 1
  register: limit_rejection
  changed_when: false
|
||||
|
||||
# Prints the current queue, per-job priority factors, fairshare data for the
# configured account and the last 40 lines of today's accounting records.
# Read-only, so it never reports "changed"; stdout is registered as
# `priority_snapshot`.
- name: Show priority and fairshare snapshot
  ansible.builtin.shell:
    executable: /bin/bash
    cmd: |
      set -euo pipefail

      # The first three commands are best effort: a failure is ignored.
      echo "### queue"
      squeue || true

      echo
      echo "### sprio"
      sprio || true

      echo
      echo "### sshare"
      # The account name is shell-quoted so an unusual value cannot be
      # re-interpreted by bash.
      sshare -A {{ slurm_account_name | quote }} || true

      echo
      echo "### recent sacct"
      # Not guarded: with pipefail a failing sacct fails the task.
      sacct -S today --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList | tail -40
  register: priority_snapshot
  changed_when: false
|
||||
|
||||
# Prints the stdout captured by the five preceding tasks, each list of lines
# introduced by its own "###" heading.
- name: Print validation result
  ansible.builtin.debug:
    msg:
      # Output of "Validate priority runtime config".
      - '### priority state'
      - '{{ priority_state.stdout_lines }}'
      # Output of "Submit debug-short QOS job".
      - '### debug QOS job'
      - '{{ debug_qos_job.stdout_lines }}'
      # Output of "Submit gpu-short QOS job".
      - '### GPU QOS job'
      - '{{ gpu_qos_job.stdout_lines }}'
      # Output of "Validate debug-short walltime limit behavior".
      - '### limit rejection'
      - '{{ limit_rejection.stdout_lines }}'
      # Output of "Show priority and fairshare snapshot".
      - '### priority snapshot'
      - '{{ priority_snapshot.stdout_lines }}'
|
||||
Reference in New Issue
Block a user