Files
portfolio/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-cgroup-cpu-gpu-node.yml
T

60 lines
1.6 KiB
YAML
Raw Normal View History

2026-06-04 19:41:05 +00:00
---
# Smoke test for Slurm CPU cgroup confinement on node gpu01.
#
# Runs against the `slurm_controller` host group. It submits a short batch job
# pinned to gpu01 as `slurmuser`, waits for the job to leave the queue, and then
# prints the job's output file. That file records the hostname, the Slurm job
# variables, the CPU/memory allowed lists from /proc/self/status, the job's
# cgroup membership and the mounted cgroup filesystems.
#
# The play only displays the collected data; it contains no assertion, so the
# result has to be inspected by the operator.
- name: Test CPU cgroup enforcement on gpu01
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # NOTE: the heredoc body and its SBATCH terminator must stay at the base
    # indentation of this block scalar. `<<'SBATCH'` does not strip leading
    # whitespace, so indenting them would break both the terminator match and
    # the `#!` line of the submitted script.
    - name: Submit cgroup CPU test to gpu01
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=cgroup-cpu-test
        #SBATCH --partition=all
        #SBATCH --nodelist=gpu01
        #SBATCH --cpus-per-task=2
        #SBATCH --mem=1G
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/cgroup-cpu-test-%j.out
        echo "HOST=$(hostname)"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "SLURM_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK:-}"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        echo "MEM_ALLOWED=$(grep Mems_allowed_list /proc/self/status || true)"
        echo
        echo "### cgroup"
        cat /proc/self/cgroup
        echo
        echo "### mounted cgroups"
        mount | grep cgroup || true
        sleep 5
        SBATCH
        )"
        # `sbatch --parsable` may print "jobid;cluster"; keep only the job id so
        # it is usable for `squeue -j` and for the %j-based output file name.
        job_id="${job_id%%;*}"
        echo "JOB_ID=$job_id"

        # Poll the queue, at most 60 times with a one-second pause, until the
        # job is no longer listed.
        for attempt in $(seq 1 60); do
          if sudo -iu slurmuser squeue -h -j "$job_id" | grep -q .; then
            sudo -iu slurmuser squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done

        # If the job is still listed after the polling window, its output file
        # is missing or incomplete: fail with a clear message instead of
        # printing partial data.
        if sudo -iu slurmuser squeue -h -j "$job_id" | grep -q .; then
          echo "Job $job_id is still queued or running after 60 polls" >&2
          exit 1
        fi

        echo "### output"
        cat "/shared/cgroup-cpu-test-${job_id}.out"
      args:
        # bash is required for `set -o pipefail`.
        executable: /bin/bash
      register: cgroup_cpu_result
      # Every run submits a new job, so the task is always reported as changed.
      changed_when: true

    # Print the captured stdout (job id, squeue polls, job output) line by line.
    - name: Show cgroup CPU result
      ansible.builtin.debug:
        var: cgroup_cpu_result.stdout_lines