Files

71 lines
1.9 KiB
YAML
Raw Permalink Normal View History

2026-06-04 19:41:05 +00:00
---
# Submits a short GPU smoke-test job through Slurm from the controller host,
# waits for it to leave the queue, then prints the accounting record and the
# job's output file. The second task echoes everything the script printed.
- name: Submit GPU test job
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # The whole flow is one bash script so the job ID captured from sbatch can
    # be reused by the polling, accounting and output steps. The task fails if
    # submission fails, if sbatch returns something that is not a job ID, or if
    # the job's output file is missing once polling ends.
    - name: Submit test job to gpu partition
      ansible.builtin.shell: |
        set -euo pipefail

        # The heredoc body and its terminator must sit at the left margin of
        # this script, because <<'SBATCH' does not strip leading whitespace.
        # The quoted delimiter keeps $(...) and $VAR unexpanded here so they
        # are evaluated inside the job rather than at submission time.
        job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=gpu-test
        #SBATCH --partition=gpu
        #SBATCH --gres=gpu:1
        #SBATCH --cpus-per-task=2
        #SBATCH --mem=2G
        #SBATCH --time=00:03:00
        #SBATCH --output=/shared/gpu-test-%j.out
        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
        echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        echo
        echo "### nvidia-smi"
        nvidia-smi
        echo
        echo "### GPU process table"
        nvidia-smi pmon -c 1 || true
        SBATCH
        )"

        # --parsable may print "jobid;cluster"; keep only the job ID so the
        # squeue/sacct calls and the output path below receive a clean value.
        job_id="${job_id%%;*}"
        if ! [[ "$job_id" =~ ^[0-9]+$ ]]; then
          echo "sbatch did not return a numeric job id: '$job_id'" >&2
          exit 1
        fi
        echo "JOB_ID=$job_id"

        # Poll once per second, up to wait_limit times, while squeue still
        # lists the job.
        wait_limit=90
        still_listed=true
        for _ in $(seq 1 "$wait_limit"); do
          if sudo -iu slurmuser squeue -h -j "$job_id" | grep -q .; then
            # Informational only: a non-zero exit here (for example the job
            # record disappearing between the two squeue calls) must not abort
            # the script under `set -e`.
            sudo -iu slurmuser squeue -j "$job_id" || true
            sleep 1
          else
            still_listed=false
            break
          fi
        done
        if [ "$still_listed" = true ]; then
          echo "WARNING: job $job_id still listed by squeue after ${wait_limit} polls; output below may be incomplete"
        fi

        # Accounting lookup is best-effort: errors are hidden and ignored.
        echo "### sacct"
        sudo -iu slurmuser sacct -j "$job_id" --format=JobID,JobName,Partition,State,ExitCode 2>/dev/null || true

        echo "### output"
        if [ -f "/shared/gpu-test-${job_id}.out" ]; then
          cat "/shared/gpu-test-${job_id}.out"
        else
          echo "Output file not found: /shared/gpu-test-${job_id}.out"
          # List a few existing output files to help spot a job-ID or path mismatch.
          find /shared -maxdepth 1 -name "gpu-test-*.out" -ls | tail -5 || true
          exit 1
        fi
      args:
        # bash is required for `set -o pipefail` and the [[ ... =~ ... ]] test.
        executable: /bin/bash
      register: gpu_job_result
      # Each run submits a new job, so the task always reports "changed".
      changed_when: true

    - name: Show GPU job result
      ansible.builtin.debug:
        var: gpu_job_result.stdout_lines