# Original listing: 96 lines, 2.6 KiB, YAML
---
# Smoke test for a single Slurm node.
#
# Submits a tiny batch job pinned to `target_node`, waits for it to leave the
# queue, prints the accounting record and the job's output file, and fails
# unless sacct reports the job as COMPLETED.
#
# Usage:
#   ansible-playbook test-specific-node.yml -e target_node=<hostname>
- name: Submit job to specific Slurm node
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # Guard: stop with a usage hint when target_node is missing or blank.
    # A blank value would otherwise be templated into "--nodelist=" below.
    - name: Require target_node
      ansible.builtin.fail:
        msg: "Use: ansible-playbook test-specific-node.yml -e target_node=<hostname>"
      when: target_node is not defined or (target_node | string | trim | length == 0)

    # Submits the job and polls until it finishes. The heredoc body and its
    # SBATCH terminator must stay at the base indentation of this block scalar
    # so that, once YAML strips the common indent, the shebang and the
    # terminator start in column 0.
    - name: Submit test job to target node
      ansible.builtin.shell: |
        set -euo pipefail

        # The batch script is fed to sbatch on stdin as slurmuser. The heredoc
        # delimiter is unquoted, so the backslash-escaped dollars are expanded
        # by the job itself rather than by this shell.
        job_id="$(
          sudo -iu slurmuser sbatch --parsable <<SBATCH
        #!/bin/bash
        #SBATCH --job-name=node-test
        #SBATCH --partition=debug
        #SBATCH --nodelist={{ target_node }}
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --account=lab
        #SBATCH --qos=normal
        #SBATCH --output=/shared/node-test-%j.out

        echo "HOST=\$(hostname)"
        echo "USER=\$(whoami)"
        echo "SLURM_JOB_ID=\$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=\$(grep Cpus_allowed_list /proc/self/status)"
        date
        SBATCH
        )"

        # With --parsable, sbatch prints "jobid;cluster" on multi-cluster
        # setups; keep only the numeric job id for the commands below.
        job_id="${job_id%%;*}"

        echo "JOB_ID=$job_id"

        echo "### waiting for job to leave queue"
        for _ in $(seq 1 120); do
          if squeue -h -j "$job_id" | grep -q .; then
            # Progress display only; the job may vanish between the two squeue
            # calls, which must not abort the script under "set -e".
            squeue -j "$job_id" || true
            sleep 1
          else
            break
          fi
        done

        echo "### waiting for output file"
        out_file="/shared/node-test-${job_id}.out"
        for _ in $(seq 1 30); do
          if [ -s "$out_file" ]; then
            break
          fi
          sleep 1
        done

        echo "### waiting for sacct final state"
        final_state=""
        for _ in $(seq 1 30); do
          # "|| true": a failing sacct combined with pipefail would otherwise
          # abort the script here instead of retrying on the next iteration.
          final_state="$(
            sacct -n -P -j "$job_id" --format=State 2>/dev/null \
              | head -n 1 \
              | cut -d'|' -f1 \
              | awk '{print $1}' \
              || true
          )"

          # Stop polling as soon as a terminal job state is reported.
          if echo "$final_state" | grep -qE "COMPLETED|FAILED|CANCELLED|TIMEOUT|NODE_FAIL|OUT_OF_MEMORY|BOOT_FAIL|DEADLINE"; then
            break
          fi

          sleep 1
        done

        echo "FINAL_STATE=${final_state:-UNKNOWN}"

        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

        echo "### output"
        # A missing or unreadable output file is still a failure, but it is
        # reported after the final-state check so that diagnosis is not lost.
        output_ok=true
        if ! cat "$out_file"; then
          echo "Output file $out_file could not be read"
          output_ok=false
        fi

        if [ "${final_state:-UNKNOWN}" != "COMPLETED" ]; then
          echo "Job did not reach COMPLETED state according to sacct"
          exit 1
        fi

        if [ "$output_ok" != true ]; then
          exit 1
        fi
      args:
        # bash is required for "set -o pipefail".
        executable: /bin/bash
      register: node_test
      # Every run submits a new job, so the task always reports a change.
      changed_when: true

    # Print everything the script wrote to stdout.
    - name: Show node test result
      ansible.builtin.debug:
        var: node_test.stdout_lines