---
# Smoke test for the Slurm "gpu" partition.
#
# Submits a short batch job as slurmuser on the controller, polls until the
# job has left the queue (at most 90 polls, one second apart), then prints the
# sacct record and the job's output file. The task fails when the job id
# cannot be parsed, when the job is still queued after the last poll, or when
# the output file is missing.
- name: Submit GPU test job
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # NOTE: the heredoc body and its closing SBATCH marker must stay at the
    # base indentation of the block scalar below. YAML strips that common
    # indent, which leaves the marker in column 0 where bash requires it.
    - name: Submit test job to gpu partition
      ansible.builtin.shell: |
        set -euo pipefail

        # The heredoc marker is quoted, so $(...) and $VARS inside the batch
        # script are expanded on the compute node, not on the controller.
        job_id="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=gpu-test
        #SBATCH --partition=gpu
        #SBATCH --gres=gpu:1
        #SBATCH --cpus-per-task=2
        #SBATCH --mem=2G
        #SBATCH --time=00:03:00
        #SBATCH --output=/shared/gpu-test-%j.out
        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}"
        echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        echo
        echo "### nvidia-smi"
        nvidia-smi
        echo
        echo "### GPU process table"
        nvidia-smi pmon -c 1 || true
        SBATCH
        )"

        # --parsable may print "jobid;cluster"; keep only the numeric id so it
        # can be used with squeue/sacct and in the output file name.
        job_id="${job_id%%;*}"
        if ! [[ "$job_id" =~ ^[0-9]+$ ]]; then
          echo "Could not parse a job id from sbatch output: ${job_id}" >&2
          exit 1
        fi
        echo "JOB_ID=$job_id"

        # Poll until squeue no longer lists the job. The listing is captured
        # instead of piped into `grep -q`, because under pipefail an early grep
        # exit can fail the pipeline and be mistaken for "job finished".
        timed_out=true
        for _ in $(seq 1 90); do
          queue_entry="$(sudo -iu slurmuser squeue -h -j "$job_id" 2>/dev/null || true)"
          if [ -z "$queue_entry" ]; then
            timed_out=false
            break
          fi
          # Informational only: must not abort the task under `set -e` if the
          # job record disappears between the two squeue calls.
          sudo -iu slurmuser squeue -j "$job_id" || true
          sleep 1
        done

        if [ "$timed_out" = true ]; then
          echo "Timed out waiting for job ${job_id} to leave the queue" >&2
        fi

        # Best effort: sacct output is shown when available but never fails
        # the task.
        echo "### sacct"
        sudo -iu slurmuser sacct -j "$job_id" \
          --format=JobID,JobName,Partition,State,ExitCode 2>/dev/null || true

        echo "### output"
        if [ -f "/shared/gpu-test-${job_id}.out" ]; then
          cat "/shared/gpu-test-${job_id}.out"
        else
          echo "Output file not found: /shared/gpu-test-${job_id}.out"
          # List a few existing output files to help spot a path mismatch.
          find /shared -maxdepth 1 -name "gpu-test-*.out" -ls | tail -5 || true
          exit 1
        fi

        # Whatever was printed above may be partial if the job never left the
        # queue, so report the run as failed.
        if [ "$timed_out" = true ]; then
          exit 1
        fi
      args:
        executable: /bin/bash
      register: gpu_job_result
      # Every run submits a new job, so the task always reports "changed".
      changed_when: true

    - name: Show GPU job result
      ansible.builtin.debug:
        var: gpu_job_result.stdout_lines