---
# Play: on every host in the slurm_cluster group, check that the operator
# account exists, that it can run the Slurm client tools from a login shell,
# and that it can SSH non-interactively to every host in the group.
- name: Validate Slurm operator user and SSH mesh
  hosts: slurm_cluster
  become: true
  gather_facts: false

  vars:
    # Plain default, matching the other plays in this file. A play var that
    # templates itself ("{{ slurm_operator_user | default(...) }}") shadows any
    # inventory value and resolves to a recursive template error; override
    # with `-e slurm_operator_user=<name>` instead.
    slurm_operator_user: slurmuser
    # Every inventory host in the slurm_cluster group; targets of the SSH check.
    slurm_hosts: "{{ groups['slurm_cluster'] }}"

  tasks:
    # `id` exits non-zero when the account does not exist, failing the task.
    - name: Validate slurmuser exists
      ansible.builtin.command:
        cmd: id {{ slurm_operator_user }}
      changed_when: false

    # `sudo -iu` starts a login shell for the operator, so the command runs
    # with that user's own login environment.
    - name: Validate sinfo as slurmuser
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} sinfo
      changed_when: false

    - name: Validate squeue as slurmuser
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} squeue
      changed_when: false

    # BatchMode=yes makes ssh fail instead of prompting, so a missing key or
    # an unknown host key aborts the loop (set -e) and fails the task.
    # Host names are shell-quoted before being spliced into the script.
    - name: Validate SSH mesh as slurmuser
      ansible.builtin.shell: |
        set -euo pipefail
        for h in {{ slurm_hosts | map('quote') | join(' ') }}; do
          echo "=== $h ==="
          ssh -o BatchMode=yes -o ConnectTimeout=5 "$h" hostname
        done
      args:
        executable: /bin/bash
      become_user: "{{ slurm_operator_user }}"
      changed_when: false
# Play: on the slurm_controller group, confirm the operator account can query
# the slurmctld unit through passwordless sudo and run the Slurm client tools.
- name: Validate Slurm controller commands
  hosts: slurm_controller
  gather_facts: false
  become: true

  vars:
    slurm_operator_user: slurmuser

  tasks:
    # Outer sudo switches to the operator's login shell; inner `sudo -n`
    # refuses to prompt, so the task fails if a password would be required.
    - name: Validate slurmctld status through sudo
      changed_when: false
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} sudo -n systemctl status slurmctld --no-pager

    # The three read-only queries run in order; `set -e` stops at the first
    # one that exits non-zero.
    - name: Validate controller Slurm commands
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          sudo -iu {{ slurm_operator_user }} sinfo
          sudo -iu {{ slurm_operator_user }} squeue
          sudo -iu {{ slurm_operator_user }} scontrol show nodes
# Play: on the union of the slurm_compute and slurm_gpu groups, confirm the
# operator account can query the slurmd unit through passwordless sudo and
# run the Slurm client tools.
- name: Validate Slurm worker commands
  hosts: slurm_compute:slurm_gpu
  gather_facts: false
  become: true

  vars:
    slurm_operator_user: slurmuser

  tasks:
    # Outer sudo switches to the operator's login shell; inner `sudo -n`
    # refuses to prompt, so the task fails if a password would be required.
    - name: Validate slurmd status through sudo
      changed_when: false
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} sudo -n systemctl status slurmd --no-pager

    # The three read-only queries run in order; `set -e` stops at the first
    # one that exits non-zero.
    - name: Validate worker Slurm commands
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          sudo -iu {{ slurm_operator_user }} sinfo
          sudo -iu {{ slurm_operator_user }} squeue
          sudo -iu {{ slurm_operator_user }} scontrol show nodes
# Play: from the slurm_controller group, submit a trivial batch job as the
# operator account, poll it for up to ~20 seconds, and print what was observed.
- name: Validate basic job submission
  hosts: slurm_controller
  become: true
  gather_facts: false

  vars:
    slurm_operator_user: slurmuser
    # Partition the test job is submitted to; override with -e if the cluster
    # has no partition named "debug".
    slurm_validate_partition: debug

  tasks:
    # Submits a real job, hence changed_when: true. The task fails when
    # sbatch fails (set -e) or when the job is still pending/running once the
    # polling window has elapsed.
    - name: Submit simple Slurm test job as slurmuser
      ansible.builtin.shell: |
        set -euo pipefail

        job_id="$(
        sudo -iu {{ slurm_operator_user }} sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=ansible-validate
        #SBATCH --partition={{ slurm_validate_partition }}
        #SBATCH --time=00:01:00
        #SBATCH --output=/tmp/ansible-validate-%j.out

        hostname
        whoami
        date
        SBATCH
        )"

        # --parsable prints "jobid;cluster" when a cluster name is present;
        # keep only the numeric id for the commands and the file path below.
        job_id="${job_id%%;*}"
        echo "$job_id"

        # Poll once per second; an empty squeue answer means the job has
        # left the queue (or squeue itself failed, which is tolerated here).
        state=""
        for attempt in $(seq 1 20); do
          state="$(sudo -iu {{ slurm_operator_user }} squeue -h -j "$job_id" -o "%T" || true)"
          if [ -z "$state" ]; then
            break
          fi
          echo "job_state=$state"
          sleep 1
        done

        # A job that never got past these states did not validate anything:
        # cancel it so it does not linger, and fail the task.
        case "$state" in
          PENDING|CONFIGURING|RUNNING)
            echo "job $job_id still $state after polling; cancelling" >&2
            sudo -iu {{ slurm_operator_user }} scancel "$job_id" || true
            exit 1
            ;;
        esac

        # Best effort: sacct errors are deliberately ignored.
        sudo -iu {{ slurm_operator_user }} sacct -j "$job_id" --format=JobID,JobName,State,ExitCode 2>/dev/null || true

        # The output file is only printed if it exists on this host.
        # NOTE(review): presumably it lands on the node that ran the job, so
        # it may be absent here unless /tmp is shared -- confirm.
        out_file="/tmp/ansible-validate-${job_id}.out"
        if [ -f "$out_file" ]; then
          cat "$out_file"
        fi
      args:
        executable: /bin/bash
      register: slurm_job_test
      changed_when: true

    # Echo the captured script output (job id, polled states, sacct table,
    # job output file if found) into the play log.
    - name: Show basic job submission result
      ansible.builtin.debug:
        var: slurm_job_test.stdout_lines