Files
2026-06-05 15:38:56 +00:00

141 lines
3.7 KiB
YAML

---
# Play: on every host in the slurm_cluster group, check that the operator
# account exists, that it can run sinfo/squeue from a login shell, and that it
# can SSH non-interactively to every host in the group.
- name: Validate Slurm operator user and SSH mesh
  hosts: slurm_cluster
  become: true
  gather_facts: false
  vars:
    # Plain default, consistent with the other plays in this file. A variable
    # defined in terms of itself ("{{ slurm_operator_user | default(...) }}")
    # makes Ansible raise a recursive-loop templating error whenever no
    # higher-precedence value exists, so the default could never apply.
    # Override with: -e slurm_operator_user=<name>
    slurm_operator_user: slurmuser
    # Every inventory host in the slurm_cluster group (includes this host).
    slurm_hosts: "{{ groups['slurm_cluster'] }}"

  tasks:
    # `id` exits non-zero for an unknown user, which fails the task.
    - name: Validate slurmuser exists
      ansible.builtin.command:
        cmd: id {{ slurm_operator_user }}
      changed_when: false

    # `sudo -i` runs the command through the operator's login shell.
    - name: Validate sinfo as slurmuser
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} sinfo
      changed_when: false

    - name: Validate squeue as slurmuser
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} squeue
      changed_when: false

    # BatchMode=yes makes ssh fail instead of prompting for a password, and
    # `set -e` aborts the loop at the first host that cannot be reached.
    - name: Validate SSH mesh as slurmuser
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          for h in {{ slurm_hosts | join(' ') }}; do
            echo "=== $h ==="
            ssh -o BatchMode=yes -o ConnectTimeout=5 "$h" hostname
          done
        executable: /bin/bash
      become_user: "{{ slurm_operator_user }}"
      changed_when: false
# Play: on the controller group, confirm the operator account can query the
# slurmctld unit through passwordless sudo and run the basic Slurm client
# commands from a login shell.
- name: Validate Slurm controller commands
  hosts: slurm_controller
  gather_facts: false
  become: true
  vars:
    slurm_operator_user: slurmuser

  tasks:
    # Inner `sudo -n` never prompts: it fails if the operator would need a
    # password. `systemctl status` also exits non-zero when the unit is not
    # active, which fails the task.
    - name: Validate slurmctld status through sudo
      changed_when: false
      ansible.builtin.command:
        cmd: >-
          sudo -iu {{ slurm_operator_user }}
          sudo -n systemctl status slurmctld --no-pager

    # Read-only queries; `set -e` stops at the first command that fails.
    - name: Validate controller Slurm commands
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          sudo -iu {{ slurm_operator_user }} sinfo
          sudo -iu {{ slurm_operator_user }} squeue
          sudo -iu {{ slurm_operator_user }} scontrol show nodes
# Play: on the compute and GPU groups, confirm the operator account can query
# the slurmd unit through passwordless sudo and run the basic Slurm client
# commands from a login shell.
- name: Validate Slurm worker commands
  hosts: slurm_compute:slurm_gpu
  gather_facts: false
  become: true
  vars:
    slurm_operator_user: slurmuser

  tasks:
    # Inner `sudo -n` never prompts: it fails if the operator would need a
    # password. `systemctl status` also exits non-zero when the unit is not
    # active, which fails the task.
    - name: Validate slurmd status through sudo
      changed_when: false
      ansible.builtin.command:
        cmd: >-
          sudo -iu {{ slurm_operator_user }}
          sudo -n systemctl status slurmd --no-pager

    # Read-only queries; `set -e` stops at the first command that fails.
    - name: Validate worker Slurm commands
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          sudo -iu {{ slurm_operator_user }} sinfo
          sudo -iu {{ slurm_operator_user }} squeue
          sudo -iu {{ slurm_operator_user }} scontrol show nodes
# Play: from the controller, submit a trivial batch job as the operator user,
# poll the queue for up to ~20 seconds, then print whatever accounting data
# and job output are available. Only a failed submission fails the task; the
# follow-up reporting is best-effort.
- name: Validate basic job submission
  hosts: slurm_controller
  become: true
  gather_facts: false
  vars:
    slurm_operator_user: slurmuser

  tasks:
    - name: Submit simple Slurm test job as slurmuser
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          # The batch script is fed to sbatch on stdin via a quoted heredoc,
          # so nothing inside it is expanded by this shell.
          job_id="$(
          sudo -iu {{ slurm_operator_user }} sbatch --parsable <<'SBATCH'
          #!/bin/bash
          #SBATCH --job-name=ansible-validate
          #SBATCH --partition=debug
          #SBATCH --time=00:01:00
          #SBATCH --output=/tmp/ansible-validate-%j.out
          hostname
          whoami
          date
          SBATCH
          )"
          # --parsable prints "jobid" or "jobid;cluster"; keep only the id so
          # it is usable with squeue/sacct and in the output file name.
          job_id="${job_id%%;*}"
          echo "$job_id"
          # Poll until the job leaves the queue (empty squeue output) or the
          # polling window expires. squeue errors are tolerated on purpose.
          state=""
          for _ in $(seq 1 20); do
            state="$(sudo -iu {{ slurm_operator_user }} squeue -h -j "$job_id" -o "%T" || true)"
            if [ -z "$state" ]; then
              break
            fi
            echo "job_state=$state"
            sleep 1
          done
          # Report, rather than silently ignore, a job that never left the queue.
          if [ -n "$state" ]; then
            echo "job $job_id still in state $state after polling window"
          fi
          # Best-effort: sacct errors are deliberately ignored.
          sudo -iu {{ slurm_operator_user }} sacct -j "$job_id" --format=JobID,JobName,State,ExitCode 2>/dev/null || true
          # NOTE(review): the output file is presumably written on the node
          # that ran the job, so it is only found here if /tmp is shared or
          # the job ran on this host -- confirm for this cluster.
          out_file="/tmp/ansible-validate-${job_id}.out"
          if [ -f "$out_file" ]; then
            cat "$out_file"
          fi
        executable: /bin/bash
      register: slurm_job_test
      # Submitting a job always alters cluster state.
      changed_when: true

    - name: Show basic job submission result
      ansible.builtin.debug:
        var: slurm_job_test.stdout_lines