Add Slurm AI/HPC cluster platform project

This commit is contained in:
Mateusz Suski
2026-06-04 19:41:05 +00:00
parent e2624a7533
commit cd6830334b
47 changed files with 4727 additions and 0 deletions
@@ -0,0 +1,59 @@
---
# Submits a short batch job pinned to gpu01 (2 CPUs, 1G) as slurmuser and
# prints what the job itself observes: allowed CPU/memory lists from
# /proc/self/status, its /proc/self/cgroup entry and the mounted cgroups.
- name: Test CPU cgroup enforcement on gpu01
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # The job script is fed to sbatch on stdin through a quoted heredoc, so
    # every $... in it is expanded on the compute node, not on the controller.
    - name: Submit cgroup CPU test to gpu01
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=cgroup-cpu-test
        #SBATCH --partition=all
        #SBATCH --nodelist=gpu01
        #SBATCH --cpus-per-task=2
        #SBATCH --mem=1G
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/cgroup-cpu-test-%j.out
        echo "HOST=$(hostname)"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "SLURM_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK:-}"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        echo "MEM_ALLOWED=$(grep Mems_allowed_list /proc/self/status || true)"
        echo
        echo "### cgroup"
        cat /proc/self/cgroup
        echo
        echo "### mounted cgroups"
        mount | grep cgroup || true
        sleep 5
        SBATCH
        )"
        # --parsable may print "jobid;cluster"; keep only the job id part.
        job_id="${job_id%%;*}"
        echo "JOB_ID=$job_id"
        out_file="/shared/cgroup-cpu-test-${job_id}.out"
        # Poll for up to 60 iterations while the job is still listed by squeue.
        for _ in $(seq 1 60); do
          if sudo -iu slurmuser squeue -h -j "$job_id" | grep -q .; then
            sudo -iu slurmuser squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done
        echo "### output"
        if [ -f "$out_file" ]; then
          cat "$out_file"
        else
          # Fail with a readable diagnostic instead of a bare cat error.
          echo "Output file not found: $out_file"
          find /shared -maxdepth 1 -name "cgroup-cpu-test-*.out" -ls | tail -5 || true
          exit 1
        fi
      args:
        executable: /bin/bash
      register: cgroup_cpu_result
      # A job is submitted on every run, so the task always reports "changed".
      changed_when: true
    - name: Show cgroup CPU result
      ansible.builtin.debug:
        var: cgroup_cpu_result.stdout_lines
@@ -0,0 +1,60 @@
---
# Runs a one-CPU, 512M batch job in the "debug" partition as slurmuser, then
# prints its sacct record and the captured job output from /shared.
- name: Submit CPU test job
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    - name: Submit test job to debug partition
      register: cpu_job_result
      # Each run submits a new job, hence always "changed".
      changed_when: true
      ansible.builtin.shell:
        executable: /bin/bash
        # The quoted heredoc keeps $... unexpanded until the job runs.
        cmd: |
          set -euo pipefail
          job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
          #!/bin/bash
          #SBATCH --job-name=cpu-test
          #SBATCH --partition=debug
          #SBATCH --cpus-per-task=1
          #SBATCH --mem=512M
          #SBATCH --time=00:02:00
          #SBATCH --output=/shared/cpu-test-%j.out
          echo "HOST=$(hostname)"
          echo "USER=$(whoami)"
          echo "SLURM_JOB_ID=$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
          echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
          date
          SBATCH
          )"
          echo "JOB_ID=$job_id"
          result_file="/shared/cpu-test-${job_id}.out"
          # Up to 60 polls; stop as soon as squeue no longer lists the job.
          for _ in {1..60}; do
            sudo -iu slurmuser squeue -h -j "$job_id" | grep -q . || break
            sudo -iu slurmuser squeue -j "$job_id"
            sleep 1
          done
          echo "### sacct"
          sudo -iu slurmuser sacct -j "$job_id" --format=JobID,JobName,Partition,State,ExitCode 2>/dev/null || true
          echo "### output"
          if [ ! -f "$result_file" ]; then
            echo "Output file not found: $result_file"
            find /shared -maxdepth 1 -name "cpu-test-*.out" -ls | tail -5 || true
            exit 1
          fi
          cat "$result_file"
    - name: Show CPU job result
      ansible.builtin.debug:
        var: cpu_job_result.stdout_lines
@@ -0,0 +1,58 @@
---
# Submits a job to gpu01 that requests no --gres and prints what it can see:
# SLURM_JOB_GPUS / CUDA_VISIBLE_DEVICES, the /dev/nvidia* listing and the
# nvidia-smi output. Both probes are allowed to fail inside the job.
- name: Test GPU access without GRES allocation
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # The job script goes to sbatch on stdin via a quoted heredoc, so its
    # $... expansions happen on the compute node.
    - name: Submit job to gpu01 without requesting GPU
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=gpu-deny-test
        #SBATCH --partition=all
        #SBATCH --nodelist=gpu01
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=1G
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/gpu-deny-test-%j.out
        echo "HOST=$(hostname)"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
        echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        echo
        echo "### ls nvidia devices"
        ls -l /dev/nvidia* 2>&1 || true
        echo
        echo "### nvidia-smi without GRES"
        nvidia-smi 2>&1 || true
        SBATCH
        )"
        # --parsable may print "jobid;cluster"; keep only the job id part.
        job_id="${job_id%%;*}"
        echo "JOB_ID=$job_id"
        out_file="/shared/gpu-deny-test-${job_id}.out"
        # Poll for up to 60 iterations while the job is still listed by squeue.
        for _ in $(seq 1 60); do
          if sudo -iu slurmuser squeue -h -j "$job_id" | grep -q .; then
            sudo -iu slurmuser squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done
        echo "### output"
        if [ -f "$out_file" ]; then
          cat "$out_file"
        else
          # Fail with a readable diagnostic instead of a bare cat error.
          echo "Output file not found: $out_file"
          find /shared -maxdepth 1 -name "gpu-deny-test-*.out" -ls | tail -5 || true
          exit 1
        fi
      args:
        executable: /bin/bash
      register: gpu_deny_result
      # A job is submitted on every run, so the task always reports "changed".
      changed_when: true
    - name: Show GPU deny test result
      ansible.builtin.debug:
        var: gpu_deny_result.stdout_lines
@@ -0,0 +1,70 @@
---
# Runs a batch job in the "gpu" partition with --gres=gpu:1 as slurmuser; the
# job prints its Slurm/CUDA environment and nvidia-smi output, and the play
# then shows the sacct record and the captured output file.
- name: Submit GPU test job
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    - name: Submit test job to gpu partition
      register: gpu_job_result
      # Each run submits a new job, hence always "changed".
      changed_when: true
      ansible.builtin.shell:
        executable: /bin/bash
        # The quoted heredoc keeps $... unexpanded until the job runs.
        cmd: |
          set -euo pipefail
          job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
          #!/bin/bash
          #SBATCH --job-name=gpu-test
          #SBATCH --partition=gpu
          #SBATCH --gres=gpu:1
          #SBATCH --cpus-per-task=2
          #SBATCH --mem=2G
          #SBATCH --time=00:03:00
          #SBATCH --output=/shared/gpu-test-%j.out
          echo "HOST=$(hostname)"
          echo "USER=$(whoami)"
          echo "SLURM_JOB_ID=$SLURM_JOB_ID"
          echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
          echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
          echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
          echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
          echo
          echo "### nvidia-smi"
          nvidia-smi
          echo
          echo "### GPU process table"
          nvidia-smi pmon -c 1 || true
          SBATCH
          )"
          echo "JOB_ID=$job_id"
          result_file="/shared/gpu-test-${job_id}.out"
          # At most 90 polls; leave the loop once squeue stops listing the job.
          polls=0
          while [ "$polls" -lt 90 ] && sudo -iu slurmuser squeue -h -j "$job_id" | grep -q .; do
            sudo -iu slurmuser squeue -j "$job_id"
            sleep 1
            polls=$((polls + 1))
          done
          echo "### sacct"
          sudo -iu slurmuser sacct -j "$job_id" --format=JobID,JobName,Partition,State,ExitCode 2>/dev/null || true
          echo "### output"
          if [ ! -f "$result_file" ]; then
            echo "Output file not found: $result_file"
            find /shared -maxdepth 1 -name "gpu-test-*.out" -ls | tail -5 || true
            exit 1
          fi
          cat "$result_file"
    - name: Show GPU job result
      ansible.builtin.debug:
        var: gpu_job_result.stdout_lines
@@ -0,0 +1,95 @@
---
# Submits a small job (debug partition, account "lab", QOS "normal") to the
# node named by the target_node extra-var, waits for it to leave the queue,
# and fails unless sacct reports the job as COMPLETED.
- name: Submit job to specific Slurm node
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # Reject a missing or blank target_node before anything is submitted.
    - name: Require target_node
      ansible.builtin.fail:
        msg: "Use: ansible-playbook test-specific-node.yml -e target_node=<hostname>"
      when: target_node is not defined or (target_node | string | trim | length) == 0
    - name: Submit test job to target node
      # target_node is passed as a shell-quoted sbatch argument rather than
      # interpolated into the script, and the heredoc is quoted, so nothing in
      # the value is expanded by the controller's shell.
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$(
        sudo -iu slurmuser sbatch --parsable --nodelist={{ target_node | quote }} <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=node-test
        #SBATCH --partition=debug
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --account=lab
        #SBATCH --qos=normal
        #SBATCH --output=/shared/node-test-%j.out
        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        date
        SBATCH
        )"
        # --parsable may print "jobid;cluster"; keep only the job id part.
        job_id="${job_id%%;*}"
        echo "JOB_ID=$job_id"
        out_file="/shared/node-test-${job_id}.out"
        echo "### waiting for job to leave queue"
        for _ in $(seq 1 120); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done
        echo "### waiting for output file"
        for _ in $(seq 1 30); do
          if [ -s "$out_file" ]; then
            break
          fi
          sleep 1
        done
        echo "### waiting for sacct final state"
        final_state=""
        for _ in $(seq 1 30); do
          # "|| true": under errexit/pipefail a single failing sacct call
          # would otherwise abort the script instead of being retried.
          final_state="$(
            sacct -n -P -j "$job_id" --format=State 2>/dev/null \
              | head -n 1 \
              | cut -d'|' -f1 \
              | awk '{print $1}'
          )" || true
          if echo "$final_state" | grep -qE "COMPLETED|FAILED|CANCELLED|TIMEOUT|NODE_FAIL|OUT_OF_MEMORY"; then
            break
          fi
          sleep 1
        done
        echo "FINAL_STATE=${final_state:-UNKNOWN}"
        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
        echo "### output"
        if [ -f "$out_file" ]; then
          cat "$out_file"
        else
          echo "Output file not found: $out_file"
          exit 1
        fi
        if [ "${final_state:-UNKNOWN}" != "COMPLETED" ]; then
          echo "Job did not reach COMPLETED state according to sacct"
          exit 1
        fi
      args:
        executable: /bin/bash
      register: node_test
      # A job is submitted on every run, so the task always reports "changed".
      changed_when: true
    - name: Show node test result
      ansible.builtin.debug:
        var: node_test.stdout_lines
@@ -0,0 +1,60 @@
---
# Submits a 2-CPU job in the "debug" partition that runs two busy loops for
# 90 seconds each, then prints the job's sacct record and its output file.
- name: Generate measurable Slurm usage for sreport
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # sbatch runs as slurmuser; squeue and sacct run as the become user.
    - name: Submit CPU usage job
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=sreport-usage
        #SBATCH --partition=debug
        #SBATCH --cpus-per-task=2
        #SBATCH --mem=512M
        #SBATCH --time=00:03:00
        #SBATCH --output=/shared/sreport-usage-%j.out
        echo "HOST=$(hostname)"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK:-}"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        echo "Burning CPU for 90 seconds"
        timeout 90 bash -c 'while true; do :; done' &
        timeout 90 bash -c 'while true; do :; done' &
        wait
        echo "Done"
        date
        SBATCH
        )"
        # --parsable may print "jobid;cluster"; keep only the job id part.
        job_id="${job_id%%;*}"
        echo "JOB_ID=$job_id"
        out_file="/shared/sreport-usage-${job_id}.out"
        # Up to 150 polls, 2 s apart, while squeue still lists the job.
        for _ in $(seq 1 150); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 2
          else
            break
          fi
        done
        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
        echo "### output"
        if [ -f "$out_file" ]; then
          cat "$out_file"
        else
          # Fail with a readable diagnostic instead of a bare cat error.
          echo "Output file not found: $out_file"
          find /shared -maxdepth 1 -name "sreport-usage-*.out" -ls | tail -5 || true
          exit 1
        fi
      args:
        executable: /bin/bash
      register: sreport_usage_job
      # A job is submitted on every run, so the task always reports "changed".
      changed_when: true
    - name: Show usage job result
      ansible.builtin.debug:
        var: sreport_usage_job.stdout_lines
@@ -0,0 +1,140 @@
---
# On every host in slurm_cluster: check that the operator account exists, that
# it can run sinfo and squeue, and that it can ssh non-interactively to each
# cluster host.
- name: Validate Slurm operator user and SSH mesh
  hosts: slurm_cluster
  become: true
  gather_facts: false
  vars:
    # Plain literal, matching the other plays in this playbook. A play var
    # whose default refers to itself is reported by Ansible as a recursive
    # template loop. Override with `-e slurm_operator_user=<name>` if needed.
    slurm_operator_user: slurmuser
    slurm_hosts: "{{ groups['slurm_cluster'] }}"
  tasks:
    - name: Validate slurmuser exists
      ansible.builtin.command:
        cmd: id {{ slurm_operator_user }}
      changed_when: false
    - name: Validate sinfo as slurmuser
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} sinfo
      changed_when: false
    - name: Validate squeue as slurmuser
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} squeue
      changed_when: false
    # BatchMode=yes makes ssh fail instead of prompting, and errexit stops the
    # loop at the first host that cannot be reached.
    - name: Validate SSH mesh as slurmuser
      ansible.builtin.shell: |
        set -euo pipefail
        for h in {{ slurm_hosts | join(' ') }}; do
          echo "=== $h ==="
          ssh -o BatchMode=yes -o ConnectTimeout=5 "$h" hostname
        done
      args:
        executable: /bin/bash
      become_user: "{{ slurm_operator_user }}"
      changed_when: false
# Controller checks: the operator account can query slurmctld's unit status
# through passwordless sudo and can run sinfo, squeue and scontrol.
- name: Validate Slurm controller commands
  hosts: slurm_controller
  gather_facts: false
  become: true
  vars:
    slurm_operator_user: slurmuser
  tasks:
    - name: Validate slurmctld status through sudo
      changed_when: false
      # Inner "sudo -n" never prompts; it fails if a password would be needed.
      ansible.builtin.command:
        argv:
          - sudo
          - "-iu"
          - "{{ slurm_operator_user }}"
          - sudo
          - "-n"
          - systemctl
          - status
          - slurmctld
          - "--no-pager"
    - name: Validate controller Slurm commands
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          as_operator=(sudo -iu {{ slurm_operator_user }})
          "${as_operator[@]}" sinfo
          "${as_operator[@]}" squeue
          "${as_operator[@]}" scontrol show nodes
# Worker checks on the slurm_compute and slurm_gpu groups: the operator
# account can query slurmd's unit status through passwordless sudo and can run
# sinfo, squeue and scontrol.
- name: Validate Slurm worker commands
  hosts: slurm_compute:slurm_gpu
  gather_facts: false
  become: true
  vars:
    slurm_operator_user: slurmuser
  tasks:
    - name: Validate slurmd status through sudo
      changed_when: false
      # Inner "sudo -n" never prompts; it fails if a password would be needed.
      ansible.builtin.command:
        argv:
          - sudo
          - "-iu"
          - "{{ slurm_operator_user }}"
          - sudo
          - "-n"
          - systemctl
          - status
          - slurmd
          - "--no-pager"
    - name: Validate worker Slurm commands
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          # $query is left unquoted on purpose so "scontrol show nodes" splits
          # into a command plus its arguments.
          for query in "sinfo" "squeue" "scontrol show nodes"; do
            sudo -iu {{ slurm_operator_user }} $query
          done
# Submits a trivial job (hostname / whoami / date) to the "debug" partition as
# the operator account, follows its squeue state for up to 20 polls, then
# prints the sacct record and, if readable on this host, the job output.
- name: Validate basic job submission
  hosts: slurm_controller
  become: true
  gather_facts: false
  vars:
    slurm_operator_user: slurmuser
  tasks:
    - name: Submit simple Slurm test job as slurmuser
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$(
        sudo -iu {{ slurm_operator_user }} sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=ansible-validate
        #SBATCH --partition=debug
        #SBATCH --time=00:01:00
        #SBATCH --output=/tmp/ansible-validate-%j.out
        hostname
        whoami
        date
        SBATCH
        )"
        # --parsable may print "jobid;cluster"; keep only the job id part.
        job_id="${job_id%%;*}"
        echo "$job_id"
        out_file="/tmp/ansible-validate-${job_id}.out"
        for _ in $(seq 1 20); do
          # An empty squeue answer (or a squeue error) means the job has left
          # the queue.
          state="$(sudo -iu {{ slurm_operator_user }} squeue -h -j "$job_id" -o "%T" || true)"
          if [ -z "$state" ]; then
            break
          fi
          echo "job_state=$state"
          sleep 1
        done
        sudo -iu {{ slurm_operator_user }} sacct -j "$job_id" --format=JobID,JobName,State,ExitCode 2>/dev/null || true
        # Printing the output stays best-effort, but a missing file is now
        # reported instead of being skipped silently.
        # NOTE(review): the job writes to /tmp, which is presumably local to
        # the node that ran it - confirm whether /shared should be used here.
        if [ -f "$out_file" ]; then
          cat "$out_file"
        else
          echo "Output file not present on this host: $out_file"
        fi
      args:
        executable: /bin/bash
      register: slurm_job_test
      # A job is submitted on every run, so the task always reports "changed".
      changed_when: true
    - name: Show basic job submission result
      ansible.builtin.debug:
        var: slurm_job_test.stdout_lines