---
# Validation playbook for a Slurm cluster operated by a dedicated user.
#
# Plays, in order:
#   1. All cluster hosts: the operator user exists, can run sinfo/squeue, and
#      can SSH non-interactively to every host in the `slurm_cluster` group.
#   2. Controller hosts: the operator can query slurmctld through `sudo -n`
#      and run sinfo/squeue/scontrol.
#   3. Worker hosts (compute and GPU groups): same checks against slurmd.
#   4. Controller hosts: submit a trivial batch job and report its state.
#
# Every check is read-only (`changed_when: false`) except the job submission.
# Commands run through `sudo -iu <user>` so they execute in that user's login
# environment.

- name: Validate Slurm operator user and SSH mesh
  hosts: slurm_cluster
  become: true
  gather_facts: false
  vars:
    # A play var must not be templated from itself
    # ("{{ slurm_operator_user | default(...) }}" is a recursive template and
    # fails at run time). Use the same literal as the other plays; override
    # with `-e slurm_operator_user=<name>` when needed.
    slurm_operator_user: slurmuser
    slurm_hosts: "{{ groups['slurm_cluster'] }}"
  tasks:
    - name: Validate slurmuser exists
      ansible.builtin.command:
        cmd: id {{ slurm_operator_user }}
      changed_when: false

    - name: Validate sinfo as slurmuser
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} sinfo
      changed_when: false

    - name: Validate squeue as slurmuser
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} squeue
      changed_when: false

    # BatchMode=yes makes ssh fail instead of prompting, so a missing key or
    # unknown host key aborts the loop (set -e) rather than hanging the play.
    - name: Validate SSH mesh as slurmuser
      ansible.builtin.shell: |
        set -euo pipefail
        for h in {{ slurm_hosts | join(' ') }}; do
          echo "=== $h ==="
          ssh -o BatchMode=yes -o ConnectTimeout=5 "$h" hostname
        done
      args:
        executable: /bin/bash
      become_user: "{{ slurm_operator_user }}"
      changed_when: false

- name: Validate Slurm controller commands
  hosts: slurm_controller
  become: true
  gather_facts: false
  vars:
    slurm_operator_user: slurmuser
  tasks:
    # The inner `sudo -n` never prompts: the task fails if the operator user
    # cannot run systemctl without a password.
    - name: Validate slurmctld status through sudo
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} sudo -n systemctl status slurmctld --no-pager
      changed_when: false

    - name: Validate controller Slurm commands
      ansible.builtin.shell: |
        set -euo pipefail
        sudo -iu {{ slurm_operator_user }} sinfo
        sudo -iu {{ slurm_operator_user }} squeue
        sudo -iu {{ slurm_operator_user }} scontrol show nodes
      args:
        executable: /bin/bash
      changed_when: false

- name: Validate Slurm worker commands
  hosts: slurm_compute:slurm_gpu
  become: true
  gather_facts: false
  vars:
    slurm_operator_user: slurmuser
  tasks:
    # Same passwordless-sudo check as on the controller, against slurmd.
    - name: Validate slurmd status through sudo
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} sudo -n systemctl status slurmd --no-pager
      changed_when: false

    - name: Validate worker Slurm commands
      ansible.builtin.shell: |
        set -euo pipefail
        sudo -iu {{ slurm_operator_user }} sinfo
        sudo -iu {{ slurm_operator_user }} squeue
        sudo -iu {{ slurm_operator_user }} scontrol show nodes
      args:
        executable: /bin/bash
      changed_when: false

- name: Validate basic job submission
  hosts: slurm_controller
  become: true
  gather_facts: false
  vars:
    slurm_operator_user: slurmuser
  tasks:
    # Submits a one-minute job to the `debug` partition, polls squeue for up
    # to 20 seconds until the job leaves the queue, then prints accounting
    # data and the job output on a best-effort basis (sacct errors are
    # ignored; the output file is printed only if it exists on this host).
    #
    # The heredoc body and its terminator sit at the script's left margin on
    # purpose: the quoted heredoc is passed verbatim, so `#!/bin/bash` must be
    # the first characters sbatch reads and `SBATCH` must start its line.
    #
    # `sbatch --parsable` can print "jobid;cluster"; only the job id part is
    # kept so it is usable with squeue/sacct and in the output file name.
    - name: Submit simple Slurm test job as slurmuser
      ansible.builtin.shell: |
        set -euo pipefail
        submit_out="$(
        sudo -iu {{ slurm_operator_user }} sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=ansible-validate
        #SBATCH --partition=debug
        #SBATCH --time=00:01:00
        #SBATCH --output=/tmp/ansible-validate-%j.out
        hostname
        whoami
        date
        SBATCH
        )"
        job_id="${submit_out%%;*}"
        echo "$job_id"
        for i in $(seq 1 20); do
          state="$(sudo -iu {{ slurm_operator_user }} squeue -h -j "$job_id" -o "%T" || true)"
          if [ -z "$state" ]; then
            break
          fi
          echo "job_state=$state"
          sleep 1
        done
        sudo -iu {{ slurm_operator_user }} sacct -j "$job_id" --format=JobID,JobName,State,ExitCode 2>/dev/null || true
        if [ -f /tmp/ansible-validate-"$job_id".out ]; then
          cat /tmp/ansible-validate-"$job_id".out
        fi
      args:
        executable: /bin/bash
      register: slurm_job_test
      # Submitting a job always alters scheduler state.
      changed_when: true

    - name: Show basic job submission result
      ansible.builtin.debug:
        var: slurm_job_test.stdout_lines