217 lines
5.5 KiB
YAML
217 lines
5.5 KiB
YAML
---
|
|
# Play 1: ask the Slurm controller which nodes are in a problematic state and
# store the resulting list as the `bad_nodes` host fact, which the later plays
# read back through hostvars / the controller's own facts.
- name: Detect problematic Slurm nodes
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    # `sinfo -N -h -o "%N %T"` is filtered by awk: a node name (column 1) is
    # printed when its lower-cased state (column 2) matches any alternative in
    # the regex. `sort -u` removes duplicate names. The task is read-only, so
    # it never reports "changed".
    - name: Detect nodes needing remediation
      register: bad_nodes_raw
      changed_when: false
      ansible.builtin.shell:
        # bash is required for `set -o pipefail`.
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          sinfo -N -h -o "%N %T" | awk '
            tolower($2) ~ /down|drain|fail|unknown|not_responding|idle\*/ {print $1}
          ' | sort -u

    # One list element per output line, i.e. one element per node name.
    - name: Store bad node list
      ansible.builtin.set_fact:
        bad_nodes: "{{ bad_nodes_raw.stdout_lines }}"

    - name: Show detected problematic nodes
      ansible.builtin.debug:
        var: bad_nodes
|
|
|
|
|
|
# Play 2: on every compute/GPU host that the controller flagged as bad,
# restart munge and slurmd and then run a set of local health checks.
# Hosts that were not flagged leave the play immediately.
- name: Attempt auto-remediation on problematic nodes
  hosts: slurm_compute:slurm_gpu
  become: true
  gather_facts: false
  # Hosts are processed one at a time.
  serial: 1
  # With `serial: 1` each batch holds a single host, so a host that is
  # unreachable would fail its entire batch and stop the run before the
  # controller play gets a chance to drain it. Nodes reported down are the
  # ones most likely to be unreachable, so unreachable errors are ignored here.
  ignore_unreachable: true

  vars:
    # List of bad node names recorded on the first host of `slurm_controller`;
    # falls back to an empty list when that fact is not defined.
    bad_nodes_from_controller: "{{ hostvars[groups['slurm_controller'][0]].bad_nodes | default([]) }}"

  tasks:
    # The comparison is by inventory hostname, so inventory names must match
    # the node names printed by sinfo for a host to be remediated.
    - name: Skip healthy nodes
      ansible.builtin.meta: end_host
      when: inventory_hostname not in bad_nodes_from_controller

    # The remediation steps live in a block so that a failure on this host is
    # rescued instead of marking the host failed: with single-host batches a
    # failed host would otherwise end the run before the controller play.
    - name: Remediate and validate local Slurm services
      block:
        - name: Restart Munge
          ansible.builtin.systemd:
            name: munge
            state: restarted
            enabled: true

        - name: Restart slurmd
          ansible.builtin.systemd:
            name: slurmd
            state: restarted
            enabled: true

        # Under `set -e`, the task fails if munge or slurmd is not active, if
        # the munge round-trip fails, or if `scontrol ping` fails. The
        # listener and journal sections are informational only (`|| true`).
        - name: Validate local services after remediation attempt
          ansible.builtin.shell: |
            set -euo pipefail

            echo "HOST=$(hostname)"

            echo
            echo "### services"
            systemctl is-active munge
            systemctl is-active slurmd

            echo
            echo "### munge"
            munge -n | unmunge >/dev/null
            echo "munge OK"

            echo
            echo "### controller ping"
            scontrol ping

            echo
            echo "### slurmd listener"
            ss -lntp | grep ':6818 ' || true

            echo
            echo "### recent slurmd logs"
            journalctl -u slurmd -n 30 --no-pager || true
          args:
            executable: /bin/bash
          register: local_repair_check
          changed_when: false

        - name: Print local remediation result
          ansible.builtin.debug:
            var: local_repair_check.stdout_lines

      rescue:
        # Reached when any task in the block fails on this host. The node is
        # left as-is; the controller play re-runs detection afterwards.
        - name: Report failed local remediation
          ansible.builtin.debug:
            msg: >-
              Local remediation failed on {{ inventory_hostname }}
              (task: {{ ansible_failed_task.name | default('unknown') }}).
              Continuing so the controller play can re-check this node.
|
|
|
|
|
|
# Play 3: back on the controller, restart slurmctld, try to clear the state of
# the nodes recorded in `bad_nodes`, re-run detection, drain whatever is still
# unhealthy, and print a summary.
- name: Refresh controller and validate remediated nodes
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Restart slurmctld to refresh node states
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted

    # Polls `scontrol ping` up to 15 times, 2 seconds apart, until it exits 0.
    - name: Wait for controller
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping
      retries: 15
      delay: 2
      until: slurmctld_ping.rc == 0
      changed_when: false

    # Best-effort state reset: for every initially bad node, RESUME, UNDRAIN
    # and IDLE are each attempted with stderr discarded and failures ignored.
    # With an empty list the script only prints `sinfo -N` and exits.
    - name: Clear maintenance state on previously bad nodes
      ansible.builtin.shell: |
        set -euo pipefail

        bad_nodes="{{ (bad_nodes | default([])) | join(' ') }}"

        if [ -z "$bad_nodes" ]; then
          echo "No bad nodes detected. Nothing to clear."
          sinfo -N
          exit 0
        fi

        for node in $bad_nodes; do
          echo "### clearing state on $node"
          scontrol update NodeName="$node" State=RESUME 2>/dev/null || true
          scontrol update NodeName="$node" State=UNDRAIN 2>/dev/null || true
          scontrol update NodeName="$node" State=IDLE 2>/dev/null || true
        done

        sleep 5
        sinfo -N
      args:
        executable: /bin/bash
      register: clear_result
      # Report "changed" only when there was at least one node to update; the
      # empty-list path exits before any `scontrol update` is issued.
      changed_when: bad_nodes | default([]) | length > 0

    - name: Print clear-state result
      ansible.builtin.debug:
        var: clear_result.stdout_lines

    # Same read-only filter as the initial detection: prints the name of each
    # node whose lower-cased sinfo state matches the regex, de-duplicated.
    - name: Detect nodes still unhealthy after remediation
      ansible.builtin.shell: |
        set -euo pipefail

        sinfo -N -h -o "%N %T" | awk '
          tolower($2) ~ /down|drain|fail|unknown|not_responding|idle\*/ {print $1}
        ' | sort -u
      args:
        executable: /bin/bash
      register: still_bad_nodes_raw
      changed_when: false

    - name: Store still bad nodes
      ansible.builtin.set_fact:
        still_bad_nodes: "{{ still_bad_nodes_raw.stdout_lines }}"

    # Unlike the clear step, errors from `scontrol update` are not suppressed
    # here, so a failed drain fails the task.
    - name: Drain nodes that remain unhealthy
      ansible.builtin.shell: |
        set -euo pipefail

        unresolved_nodes="{{ still_bad_nodes | join(' ') }}"

        if [ -z "$unresolved_nodes" ]; then
          echo "No unresolved unhealthy nodes."
          sinfo -N
          exit 0
        fi

        for node in $unresolved_nodes; do
          echo "### draining unresolved node $node"
          scontrol update NodeName="$node" State=DRAIN Reason="auto-remediation failed"
        done

        sinfo -N
      args:
        executable: /bin/bash
      register: drain_unresolved
      changed_when: still_bad_nodes | length > 0

    # Read-only report: the initial and remaining bad-node lists (one name per
    # line, or "none"), followed by the current `sinfo -N` and `squeue` output.
    - name: Show remediation summary
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### initial bad nodes"
        bad_nodes="{{ (bad_nodes | default([])) | join(' ') }}"
        if [ -z "$bad_nodes" ]; then
          echo "none"
        else
          printf '%s\n' $bad_nodes
        fi

        echo
        echo "### still bad nodes"
        still_bad_nodes="{{ (still_bad_nodes | default([])) | join(' ') }}"
        if [ -z "$still_bad_nodes" ]; then
          echo "none"
        else
          printf '%s\n' $still_bad_nodes
        fi

        echo
        echo "### final sinfo"
        sinfo -N

        echo
        echo "### queue"
        squeue
      args:
        executable: /bin/bash
      register: remediation_summary
      changed_when: false

    - name: Print remediation summary
      ansible.builtin.debug:
        var: remediation_summary.stdout_lines
|