Add Slurm AI/HPC cluster platform project
This commit is contained in:
@@ -0,0 +1,140 @@
|
||||
---
|
||||
# Play: on every cluster node, confirm the operator account exists, can run
# the basic Slurm query commands, and can SSH to every node in the cluster
# without an interactive prompt.
- name: Validate Slurm operator user and SSH mesh
  hosts: slurm_cluster
  become: true
  gather_facts: false

  vars:
    # A play var must not reference itself: Ansible would abort with a
    # "recursive loop detected in template string" error whenever no
    # extra-var override is supplied. Use a literal default instead;
    # `-e slurm_operator_user=<name>` still overrides it.
    slurm_operator_user: slurmuser
    # Every host in the slurm_cluster inventory group is an SSH target.
    slurm_hosts: "{{ groups['slurm_cluster'] }}"

  tasks:
    # `id` exits non-zero when the account is unknown, failing the task.
    - name: Validate slurmuser exists
      ansible.builtin.command:
        cmd: id {{ slurm_operator_user }}
      changed_when: false

    # `sudo -i` starts a login shell for the operator, so the command runs
    # with that user's own environment.
    - name: Validate sinfo as slurmuser
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} sinfo
      changed_when: false

    - name: Validate squeue as slurmuser
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} squeue
      changed_when: false

    # BatchMode=yes makes ssh fail instead of prompting, so a missing key or
    # an unknown host key surfaces as a task failure. Host names are
    # shell-quoted before being spliced into the script.
    - name: Validate SSH mesh as slurmuser
      ansible.builtin.shell: |
        set -euo pipefail
        for h in {{ slurm_hosts | map('quote') | join(' ') }}; do
          echo "=== $h ==="
          ssh -o BatchMode=yes -o ConnectTimeout=5 "$h" hostname
        done
      args:
        executable: /bin/bash
      become_user: "{{ slurm_operator_user }}"
      changed_when: false
|
||||
|
||||
|
||||
# Play: on the controller, check that the operator account can inspect the
# slurmctld service through passwordless sudo and can run the standard Slurm
# query commands.
- name: Validate Slurm controller commands
  hosts: slurm_controller
  gather_facts: false
  become: true

  vars:
    slurm_operator_user: slurmuser

  tasks:
    # Outer sudo switches to the operator's login shell; inner `sudo -n`
    # refuses to prompt, so a missing sudoers rule fails the task.
    - name: Validate slurmctld status through sudo
      changed_when: false
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} sudo -n systemctl status slurmctld --no-pager

    # Read-only queries; `set -e` stops at the first command that fails.
    - name: Validate controller Slurm commands
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          sudo -iu {{ slurm_operator_user }} sinfo
          sudo -iu {{ slurm_operator_user }} squeue
          sudo -iu {{ slurm_operator_user }} scontrol show nodes
|
||||
|
||||
|
||||
# Play: on every compute and GPU node, check that the operator account can
# inspect the slurmd service through passwordless sudo and can run the
# standard Slurm query commands.
- name: Validate Slurm worker commands
  hosts: slurm_compute:slurm_gpu
  gather_facts: false
  become: true

  vars:
    slurm_operator_user: slurmuser

  tasks:
    # Outer sudo switches to the operator's login shell; inner `sudo -n`
    # refuses to prompt, so a missing sudoers rule fails the task.
    - name: Validate slurmd status through sudo
      changed_when: false
      ansible.builtin.command:
        cmd: sudo -iu {{ slurm_operator_user }} sudo -n systemctl status slurmd --no-pager

    # Read-only queries; `set -e` stops at the first command that fails.
    - name: Validate worker Slurm commands
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          sudo -iu {{ slurm_operator_user }} sinfo
          sudo -iu {{ slurm_operator_user }} squeue
          sudo -iu {{ slurm_operator_user }} scontrol show nodes
|
||||
|
||||
|
||||
# Play: submit a trivial batch job from the controller as the operator
# account, wait briefly for it to leave the queue, and fail if Slurm reports
# that it ended in anything other than COMPLETED.
- name: Validate basic job submission
  hosts: slurm_controller
  become: true
  gather_facts: false

  vars:
    slurm_operator_user: slurmuser

  tasks:
    # The heredoc body and its terminator are kept at the script's base
    # indentation on purpose: after YAML strips the common indent they must
    # land in column 0, so the batch script starts with `#!` and the
    # `SBATCH` terminator is recognised.
    - name: Submit simple Slurm test job as slurmuser
      ansible.builtin.shell: |
        set -euo pipefail

        job_id="$(
        sudo -iu {{ slurm_operator_user }} sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=ansible-validate
        #SBATCH --partition=debug
        #SBATCH --time=00:01:00
        #SBATCH --output=/tmp/ansible-validate-%j.out

        hostname
        whoami
        date
        SBATCH
        )"

        # --parsable prints "jobid" or "jobid;cluster"; keep only the id.
        job_id="${job_id%%;*}"
        echo "$job_id"

        # Poll for up to ~20 s; an empty squeue answer means the job has
        # left the queue.
        for _ in $(seq 1 20); do
          state="$(sudo -iu {{ slurm_operator_user }} squeue -h -j "$job_id" -o "%T" || true)"
          if [ -z "$state" ]; then
            break
          fi
          echo "job_state=$state"
          sleep 1
        done

        # Ask the controller for the job's final state. The record may be
        # unavailable (e.g. already purged), in which case the state stays
        # empty and is reported as UNKNOWN rather than treated as a failure.
        final_state="$(
          sudo -iu {{ slurm_operator_user }} scontrol show job "$job_id" 2>/dev/null \
            | sed -n 's/.*JobState=\([A-Z_]*\).*/\1/p' | head -n 1 || true
        )"
        final_state="${final_state:-UNKNOWN}"
        echo "final_state=$final_state"

        # Accounting may not be configured, so sacct is best-effort.
        sudo -iu {{ slurm_operator_user }} sacct -j "$job_id" --format=JobID,JobName,State,ExitCode 2>/dev/null || true

        # NOTE(review): the --output file is created on the node that runs
        # the job, so it is only present here if /tmp is shared or the job
        # ran on this host - confirm for this cluster.
        if [ -f /tmp/ansible-validate-"$job_id".out ]; then
          cat /tmp/ansible-validate-"$job_id".out
        fi

        # Fail the validation when Slurm positively reports a bad or
        # unfinished job (FAILED, TIMEOUT, still PENDING/RUNNING, ...).
        case "$final_state" in
          COMPLETED|UNKNOWN) ;;
          *)
            echo "Slurm test job $job_id did not complete (state: $final_state)" >&2
            exit 1
            ;;
        esac
      args:
        executable: /bin/bash
      register: slurm_job_test
      # Submitting a job always creates state on the cluster.
      changed_when: true

    - name: Show basic job submission result
      ansible.builtin.debug:
        var: slurm_job_test.stdout_lines
|
||||
Reference in New Issue
Block a user