Add Slurm AI/HPC cluster platform project

This commit is contained in:
Mateusz Suski
2026-06-04 19:41:05 +00:00
parent e2624a7533
commit cd6830334b
47 changed files with 4727 additions and 0 deletions
@@ -0,0 +1,216 @@
---
# Play 1 of 3: query the scheduler on the controller for nodes whose state
# looks unhealthy and publish the result as the `bad_nodes` host fact, which
# the later plays read back through hostvars.
- name: Detect problematic Slurm nodes
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    - name: Detect nodes needing remediation
      # Pure query: nothing on the cluster is modified, so never "changed".
      changed_when: false
      register: bad_nodes_raw
      ansible.builtin.shell:
        executable: /bin/bash
        # Prints the first column (node name) of every `sinfo` line whose
        # second column (state), lower-cased, matches the pattern below.
        # `sort -u` collapses duplicate node names in the output.
        cmd: |
          set -euo pipefail
          sinfo -N -h -o "%N %T" | awk '
            tolower($2) ~ /down|drain|fail|unknown|not_responding|idle\*/ {print $1}
          ' | sort -u

    - name: Store bad node list
      # One list element per line of output; empty list when nothing matched.
      ansible.builtin.set_fact:
        bad_nodes: "{{ bad_nodes_raw.stdout_lines }}"

    - name: Show detected problematic nodes
      ansible.builtin.debug:
        var: bad_nodes
# Play 2 of 3: on each node the controller flagged as unhealthy, restart the
# local Munge and slurmd services and run a set of local sanity checks.
#
# Hosts are processed one at a time (`serial: 1`). With a batch size of one,
# any failed or unreachable host means "the whole batch failed", which makes
# Ansible abort the playbook before the controller play gets a chance to drain
# the nodes that could not be repaired. To keep the run going:
#   * `ignore_unreachable: true` lets an SSH-unreachable node be skipped over;
#   * the remediation tasks live in a block whose `rescue` section absorbs
#     task failures and reports them instead of failing the host.
- name: Attempt auto-remediation on problematic nodes
  hosts: slurm_compute:slurm_gpu
  become: true
  gather_facts: false
  serial: 1
  ignore_unreachable: true
  vars:
    # List stored by the detection play on the first controller host; falls
    # back to an empty list when that fact was never set.
    bad_nodes_from_controller: "{{ hostvars[groups['slurm_controller'][0]].bad_nodes | default([]) }}"
  tasks:
    - name: Skip healthy nodes
      # Ends the play for this host only; other hosts are unaffected.
      ansible.builtin.meta: end_host
      when: inventory_hostname not in bad_nodes_from_controller

    - name: Remediate node locally
      block:
        - name: Restart Munge
          ansible.builtin.systemd:
            name: munge
            state: restarted
            enabled: true

        - name: Restart slurmd
          ansible.builtin.systemd:
            name: slurmd
            state: restarted
            enabled: true

        - name: Validate local services after remediation attempt
          # Because of `set -e`, the first failing check (inactive service,
          # Munge round-trip failure, controller not answering) fails the
          # task. The listener and journal sections are informational only
          # and are guarded with `|| true`.
          ansible.builtin.shell: |
            set -euo pipefail
            echo "HOST=$(hostname)"
            echo
            echo "### services"
            systemctl is-active munge
            systemctl is-active slurmd
            echo
            echo "### munge"
            munge -n | unmunge >/dev/null
            echo "munge OK"
            echo
            echo "### controller ping"
            scontrol ping
            echo
            echo "### slurmd listener"
            ss -lntp | grep ':6818 ' || true
            echo
            echo "### recent slurmd logs"
            journalctl -u slurmd -n 30 --no-pager || true
          args:
            executable: /bin/bash
          register: local_repair_check
          changed_when: false

        - name: Print local remediation result
          ansible.builtin.debug:
            var: local_repair_check.stdout_lines

      rescue:
        # Reached when any task in the block above fails. Running rescue
        # tasks successfully clears the host's failed status, so the playbook
        # continues with the remaining nodes and the controller play.
        - name: Report failed local remediation
          ansible.builtin.debug:
            msg:
              - "Local remediation failed on {{ inventory_hostname }} in task '{{ ansible_failed_task.name }}'."
              - "The controller play will drain this node if Slurm still reports it unhealthy."

        - name: Print diagnostics captured before the failure
          # `local_repair_check` is only defined if the validation task ran;
          # the defaults keep this task from failing when an earlier task
          # (a service restart) was the one that broke.
          ansible.builtin.debug:
            msg: "{{ (local_repair_check | default({})).stdout_lines | default([]) }}"
# Play 3 of 3: back on the controller, refresh slurmctld, try to return the
# previously flagged nodes to service, re-run the health query, drain whatever
# is still unhealthy, and print a summary.
#
# Reads the `bad_nodes` fact stored on this host by the detection play and
# defines `still_bad_nodes` for the nodes that did not recover.
- name: Refresh controller and validate remediated nodes
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    - name: Restart slurmctld to refresh node states
      # Only bounce the controller daemon when there was something to
      # remediate; a run against a healthy cluster leaves it untouched.
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted
      when: bad_nodes | default([]) | length > 0

    - name: Wait for controller
      # Polls up to 15 times, 2 seconds apart, until `scontrol ping` exits 0.
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping
      retries: 15
      delay: 2
      until: slurmctld_ping.rc == 0
      changed_when: false

    - name: Clear maintenance state on previously bad nodes
      # Best effort: each state update is attempted in turn and its errors
      # are deliberately discarded (`2>/dev/null || true`), so a node that
      # rejects one of the transitions does not fail the task.
      ansible.builtin.shell: |
        set -euo pipefail
        bad_nodes="{{ (bad_nodes | default([])) | join(' ') }}"
        if [ -z "$bad_nodes" ]; then
          echo "No bad nodes detected. Nothing to clear."
          sinfo -N
          exit 0
        fi
        for node in $bad_nodes; do
          echo "### clearing state on $node"
          scontrol update NodeName="$node" State=RESUME 2>/dev/null || true
          scontrol update NodeName="$node" State=UNDRAIN 2>/dev/null || true
          scontrol update NodeName="$node" State=IDLE 2>/dev/null || true
        done
        sleep 5
        sinfo -N
      args:
        executable: /bin/bash
      register: clear_result
      # Report a change only when state updates were actually attempted; the
      # "Nothing to clear" path above modifies nothing.
      changed_when: bad_nodes | default([]) | length > 0

    - name: Print clear-state result
      ansible.builtin.debug:
        var: clear_result.stdout_lines

    - name: Detect nodes still unhealthy after remediation
      # Same query and state pattern as the initial detection play.
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -h -o "%N %T" | awk '
          tolower($2) ~ /down|drain|fail|unknown|not_responding|idle\*/ {print $1}
        ' | sort -u
      args:
        executable: /bin/bash
      register: still_bad_nodes_raw
      changed_when: false

    - name: Store still bad nodes
      ansible.builtin.set_fact:
        still_bad_nodes: "{{ still_bad_nodes_raw.stdout_lines }}"

    - name: Drain nodes that remain unhealthy
      # Unlike the clear step, a failing `scontrol update` here is NOT
      # suppressed: `set -e` makes the task fail if a drain request fails.
      ansible.builtin.shell: |
        set -euo pipefail
        unresolved_nodes="{{ (still_bad_nodes | default([])) | join(' ') }}"
        if [ -z "$unresolved_nodes" ]; then
          echo "No unresolved unhealthy nodes."
          sinfo -N
          exit 0
        fi
        for node in $unresolved_nodes; do
          echo "### draining unresolved node $node"
          scontrol update NodeName="$node" State=DRAIN Reason="auto-remediation failed"
        done
        sinfo -N
      args:
        executable: /bin/bash
      register: drain_unresolved
      changed_when: still_bad_nodes | default([]) | length > 0

    - name: Show remediation summary
      # Read-only report: initial list, remaining list, node and queue views.
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### initial bad nodes"
        bad_nodes="{{ (bad_nodes | default([])) | join(' ') }}"
        if [ -z "$bad_nodes" ]; then
          echo "none"
        else
          printf '%s\n' $bad_nodes
        fi
        echo
        echo "### still bad nodes"
        still_bad_nodes="{{ (still_bad_nodes | default([])) | join(' ') }}"
        if [ -z "$still_bad_nodes" ]; then
          echo "none"
        else
          printf '%s\n' $still_bad_nodes
        fi
        echo
        echo "### final sinfo"
        sinfo -N
        echo
        echo "### queue"
        squeue
      args:
        executable: /bin/bash
      register: remediation_summary
      changed_when: false

    - name: Print remediation summary
      ansible.builtin.debug:
        var: remediation_summary.stdout_lines