This commit is contained in:
+216
@@ -0,0 +1,216 @@
|
||||
---
|
||||
# Detection play: query the Slurm controller for nodes whose state matches the
# unhealthy pattern list and publish the result as the `bad_nodes` host fact.
- name: Detect problematic Slurm nodes
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    # Emits one node name per line (de-duplicated) for every node whose
    # lower-cased sinfo state matches any alternative in the awk pattern.
    - name: Detect nodes needing remediation
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          sinfo -N -h -o "%N %T" | awk '
            tolower($2) ~ /down|drain|fail|unknown|not_responding|idle\*/ {print $1}
          ' | sort -u
      changed_when: false
      register: bad_nodes_raw

    # Keep the list as a fact on the controller host so that other plays can
    # read it through hostvars.
    - name: Store bad node list
      ansible.builtin.set_fact:
        bad_nodes: "{{ bad_nodes_raw.stdout_lines }}"

    - name: Show detected problematic nodes
      ansible.builtin.debug:
        var: bad_nodes
|
||||
|
||||
|
||||
# Remediation play: walk the worker groups one host at a time and restart the
# local Munge and slurmd services, but only on hosts that the controller
# recorded in its `bad_nodes` fact.
- name: Attempt auto-remediation on problematic nodes
  hosts: slurm_compute:slurm_gpu
  serial: 1
  gather_facts: false
  become: true

  vars:
    # Read from the first host of the slurm_controller group; falls back to an
    # empty list when that fact is not defined.
    bad_nodes_from_controller: "{{ hostvars[groups['slurm_controller'][0]].bad_nodes | default([]) }}"

  tasks:
    # Ends the play for this host, so none of the tasks below run on it.
    - name: Skip healthy nodes
      when: inventory_hostname not in bad_nodes_from_controller
      ansible.builtin.meta: end_host

    - name: Restart Munge
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: restarted

    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: restarted

    # Hard checks (abort the script on failure): both services active, a local
    # munge/unmunge round trip, and `scontrol ping`.
    # Soft checks (`|| true`): listener on port 6818 and recent slurmd logs.
    - name: Validate local services after remediation attempt
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          echo "HOST=$(hostname)"

          echo
          echo "### services"
          systemctl is-active munge
          systemctl is-active slurmd

          echo
          echo "### munge"
          munge -n | unmunge >/dev/null
          echo "munge OK"

          echo
          echo "### controller ping"
          scontrol ping

          echo
          echo "### slurmd listener"
          ss -lntp | grep ':6818 ' || true

          echo
          echo "### recent slurmd logs"
          journalctl -u slurmd -n 30 --no-pager || true
      changed_when: false
      register: local_repair_check

    - name: Print local remediation result
      ansible.builtin.debug:
        var: local_repair_check.stdout_lines
|
||||
|
||||
|
||||
# Controller-side follow-up: refresh slurmctld, try to return the previously
# flagged nodes to service, re-detect, drain whatever is still unhealthy and
# print a summary. Reads the `bad_nodes` fact of this controller host and
# treats it as an empty list when it is not defined.
- name: Refresh controller and validate remediated nodes
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # Restarting the controller is only useful when something was remediated;
    # skip it when no node was flagged so a clean run does not bounce slurmctld.
    - name: Restart slurmctld to refresh node states
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted
      when: bad_nodes | default([]) | length > 0

    # Poll for up to 15 attempts, 2 seconds apart, until `scontrol ping`
    # exits 0.
    - name: Wait for controller
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping
      retries: 15
      delay: 2
      until: slurmctld_ping.rc == 0
      changed_when: false

    # Best effort: each state update may be rejected depending on the node's
    # current state, so individual failures are deliberately ignored.
    - name: Clear maintenance state on previously bad nodes
      ansible.builtin.shell: |
        set -euo pipefail

        bad_nodes="{{ (bad_nodes | default([])) | join(' ') }}"

        if [ -z "$bad_nodes" ]; then
          echo "No bad nodes detected. Nothing to clear."
          sinfo -N
          exit 0
        fi

        for node in $bad_nodes; do
          echo "### clearing state on $node"
          scontrol update NodeName="$node" State=RESUME 2>/dev/null || true
          scontrol update NodeName="$node" State=UNDRAIN 2>/dev/null || true
          scontrol update NodeName="$node" State=IDLE 2>/dev/null || true
        done

        sleep 5
        sinfo -N
      args:
        executable: /bin/bash
      register: clear_result
      # Only report a change when there was at least one node to clear; the
      # "Nothing to clear" path above modifies nothing.
      changed_when: bad_nodes | default([]) | length > 0

    - name: Print clear-state result
      ansible.builtin.debug:
        var: clear_result.stdout_lines

    # Same state pattern list as the initial detection, evaluated again after
    # the clear attempt.
    - name: Detect nodes still unhealthy after remediation
      ansible.builtin.shell: |
        set -euo pipefail

        sinfo -N -h -o "%N %T" | awk '
          tolower($2) ~ /down|drain|fail|unknown|not_responding|idle\*/ {print $1}
        ' | sort -u
      args:
        executable: /bin/bash
      register: still_bad_nodes_raw
      changed_when: false

    - name: Store still bad nodes
      ansible.builtin.set_fact:
        still_bad_nodes: "{{ still_bad_nodes_raw.stdout_lines }}"

    # Nodes that did not recover are explicitly drained with a fixed reason.
    - name: Drain nodes that remain unhealthy
      ansible.builtin.shell: |
        set -euo pipefail

        unresolved_nodes="{{ still_bad_nodes | join(' ') }}"

        if [ -z "$unresolved_nodes" ]; then
          echo "No unresolved unhealthy nodes."
          sinfo -N
          exit 0
        fi

        for node in $unresolved_nodes; do
          echo "### draining unresolved node $node"
          scontrol update NodeName="$node" State=DRAIN Reason="auto-remediation failed"
        done

        sinfo -N
      args:
        executable: /bin/bash
      register: drain_unresolved
      changed_when: still_bad_nodes | length > 0

    # Read-only report: initial list, remaining list, final sinfo and queue.
    # The node lists are expanded unquoted on purpose so printf prints one
    # name per line.
    - name: Show remediation summary
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### initial bad nodes"
        bad_nodes="{{ (bad_nodes | default([])) | join(' ') }}"
        if [ -z "$bad_nodes" ]; then
          echo "none"
        else
          printf '%s\n' $bad_nodes
        fi

        echo
        echo "### still bad nodes"
        still_bad_nodes="{{ (still_bad_nodes | default([])) | join(' ') }}"
        if [ -z "$still_bad_nodes" ]; then
          echo "none"
        else
          printf '%s\n' $still_bad_nodes
        fi

        echo
        echo "### final sinfo"
        sinfo -N

        echo
        echo "### queue"
        squeue
      args:
        executable: /bin/bash
      register: remediation_summary
      changed_when: false

    - name: Print remediation summary
      ansible.builtin.debug:
        var: remediation_summary.stdout_lines
|
||||
@@ -0,0 +1,149 @@
|
||||
---
|
||||
# Read-only health report gathered on the Slurm controller: service status,
# controller ping, node/partition/queue listings, problematic nodes,
# accounting clusters and today's failed jobs.
- name: Check Slurm controller health
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # munge, slurmctld and `scontrol ping` are hard requirements (the script
    # aborts if they fail); slurmdbd, mariadb, sacctmgr and sacct are reported
    # on a best-effort basis (`|| true`).
    - name: Check controller services and cluster state
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### controller services"
        systemctl is-active munge
        systemctl is-active slurmctld
        systemctl is-active slurmdbd || true
        systemctl is-active mariadb || true

        echo
        echo "### slurm ping"
        scontrol ping

        echo
        echo "### nodes"
        sinfo -N

        echo
        echo "### partitions"
        sinfo

        echo
        echo "### queue"
        squeue

        echo
        echo "### problematic nodes"
        # A state ending in "*" (e.g. idle*) is a non-responding node; list it
        # even though its base state looks healthy.
        sinfo -N -h -o "%N %T %E" | awk '$2 !~ /idle|alloc|mix/ || $2 ~ /\*$/ {print}' || true

        echo
        echo "### accounting"
        sacctmgr -n list cluster || true

        echo
        echo "### recent failed jobs"
        sacct -S today --state=FAILED,CANCELLED,TIMEOUT,NODE_FAIL,OUT_OF_MEMORY \
          --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,NodeList | tail -30 || true
      args:
        executable: /bin/bash
      register: controller_health
      changed_when: false

    - name: Print controller health
      ansible.builtin.debug:
        var: controller_health.stdout_lines
|
||||
|
||||
|
||||
# Per-worker health report for the compute and GPU groups: identity, local
# services, Munge, controller reachability, slurmd listener, config
# checksums, shared filesystem write test, cgroup mounts and GPU inventory.
- name: Check Slurm worker health
  hosts: slurm_compute:slurm_gpu
  gather_facts: true
  become: true

  tasks:
    # Hard checks (abort on failure): munge and slurmd active, munge/unmunge
    # round trip, `scontrol ping`, and create/list/remove of a marker file
    # under /shared. Everything suffixed with `|| true` is informational.
    - name: Check worker services, config and connectivity
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          echo "HOST=$(hostname)"
          echo "FQDN=$(hostname -f 2>/dev/null || hostname)"
          echo "KERNEL=$(uname -r)"
          echo "UPTIME=$(uptime -p)"

          echo
          echo "### services"
          systemctl is-active munge
          systemctl is-active slurmd

          echo
          echo "### munge local test"
          munge -n | unmunge >/dev/null
          echo "munge OK"

          echo
          echo "### controller connectivity"
          getent hosts slurm-ctl01 || true
          scontrol ping

          echo
          echo "### slurmd listener"
          ss -lntp | grep ':6818 ' || true

          echo
          echo "### config checksums"
          sha256sum /etc/slurm/slurm.conf /etc/slurm/cgroup.conf 2>/dev/null || true

          echo
          echo "### shared filesystem"
          test -d /shared
          touch /shared/.slurm-health-$(hostname)
          ls -l /shared/.slurm-health-$(hostname)
          rm -f /shared/.slurm-health-$(hostname)

          echo
          echo "### cgroup"
          mount | grep cgroup || true

          echo
          echo "### gpu check"
          if command -v nvidia-smi >/dev/null 2>&1; then
            nvidia-smi --query-gpu=index,name,driver_version,memory.total,temperature.gpu,utilization.gpu --format=csv,noheader || true
          else
            echo "NO_NVIDIA_SMI"
          fi
      changed_when: false
      register: worker_health

    - name: Print worker health
      ansible.builtin.debug:
        var: worker_health.stdout_lines
|
||||
|
||||
|
||||
# Controller-side view of node health: a one-line-per-node summary followed
# by the full `scontrol show node` record of every node in an unhealthy state.
- name: Check Slurm-reported node state consistency
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Build Slurm node health summary
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### node summary"
        sinfo -N -o "%N %P %T %C %m %G %E"

        echo
        echo "### full problematic node details"
        # "idle\*" matches only idle nodes carrying the "*" flag. A doubled
        # backslash here would mean "idle followed by any number of
        # backslashes" and would select every healthy idle node as well.
        for node in $(sinfo -N -h -o "%N %T" | awk 'tolower($2) ~ /down|drain|fail|unk|not_responding|idle\*/ {print $1}' | sort -u); do
          echo
          echo "### $node"
          scontrol show node "$node"
        done
      args:
        executable: /bin/bash
      register: slurm_node_summary
      changed_when: false

    - name: Print Slurm node summary
      ansible.builtin.debug:
        var: slurm_node_summary.stdout_lines
|
||||
@@ -0,0 +1,217 @@
|
||||
---
|
||||
# Pre-flight play run on the control machine: refuse to continue unless
# `target_node` was supplied and names a host known to the inventory.
- name: Validate target node
  hosts: localhost
  gather_facts: false

  tasks:
    # Fails with a usage hint when the extra var is missing.
    - name: Require target_node
      when: target_node is not defined
      ansible.builtin.fail:
        msg: "Use: ansible-playbook repair-slurm-node.yml -e target_node=<hostname>"

    # Fails when the supplied name is not a member of the `all` group.
    - name: Ensure target_node is in inventory
      when: target_node not in groups['all']
      ansible.builtin.fail:
        msg: "target_node={{ target_node }} is not in Ansible inventory"
|
||||
|
||||
|
||||
# Snapshot play: record what the controller reports about `target_node`
# before anything is restarted.
- name: Capture node state before repair
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    # All three queries are informational; a failing command does not abort
    # the script because each one is followed by `|| true`.
    - name: Show target node state before repair
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          echo "### sinfo"
          sinfo -N -n {{ target_node }} || true

          echo
          echo "### scontrol"
          scontrol show node {{ target_node }} || true

          echo
          echo "### jobs"
          squeue -w {{ target_node }} || true
      changed_when: false
      register: node_state_before

    - name: Print target node state before repair
      ansible.builtin.debug:
        var: node_state_before.stdout_lines
|
||||
|
||||
|
||||
# Repair play executed on the host named by `target_node`: restart Munge,
# restart slurmd on worker-group members, then verify the local services.
- name: Repair local services on target node
  hosts: "{{ target_node }}"
  gather_facts: false
  become: true

  tasks:
    - name: Restart Munge
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: restarted

    # Only hosts that belong to slurm_compute or slurm_gpu get slurmd
    # restarted; a missing group is treated as empty.
    - name: Restart slurmd
      when: >-
        inventory_hostname in groups.get('slurm_compute', [])
        or inventory_hostname in groups.get('slurm_gpu', [])
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: restarted

    # NOTE(review): slurmd is checked here unconditionally, while the restart
    # above is limited to worker groups - confirm this playbook is only meant
    # for worker nodes.
    - name: Validate local repair
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          echo "### services"
          systemctl is-active munge
          systemctl is-active slurmd

          echo
          echo "### munge"
          munge -n | unmunge >/dev/null
          echo "munge OK"

          echo
          echo "### controller ping"
          scontrol ping

          echo
          echo "### slurmd listener"
          ss -lntp | grep ':6818 ' || true

          echo
          echo "### recent slurmd logs"
          journalctl -u slurmd -n 40 --no-pager || true
      changed_when: false
      register: local_repair_state

    - name: Print local repair state
      ansible.builtin.debug:
        var: local_repair_state.stdout_lines
|
||||
|
||||
|
||||
# Controller-side play: restart slurmctld, wait for it to answer, try to
# return `target_node` to service and poll until its reported state no longer
# contains any of the unhealthy markers.
- name: Clear Slurm maintenance/down state after repair
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    - name: Restart controller to refresh node state
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted

    # Up to 15 attempts, 2 seconds apart, until `scontrol ping` exits 0.
    - name: Wait for controller
      ansible.builtin.command:
        cmd: scontrol ping
      changed_when: false
      register: slurmctld_ping
      until: slurmctld_ping.rc == 0
      retries: 15
      delay: 2

    # The three state updates are best effort (errors are discarded); the two
    # queries at the end are not guarded, so a failure there fails the task.
    - name: Clear target node state
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          scontrol update NodeName={{ target_node }} State=RESUME 2>/dev/null || true
          scontrol update NodeName={{ target_node }} State=UNDRAIN 2>/dev/null || true
          scontrol update NodeName={{ target_node }} State=IDLE 2>/dev/null || true

          sleep 5

          sinfo -N -n {{ target_node }}
          scontrol show node {{ target_node }}
      changed_when: true
      register: clear_state

    # Re-queries the node up to 30 times, 5 seconds apart.
    # NOTE(review): the conditions are substring checks over the combined
    # sinfo + scontrol output, so any field containing e.g. "down" would keep
    # the loop retrying - confirm this is acceptable.
    - name: Wait until node is healthy
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          sinfo -N -n {{ target_node }}
          scontrol show node {{ target_node }}
      changed_when: false
      register: node_health_after
      until:
        - node_health_after.rc == 0
        - "'not_responding' not in node_health_after.stdout.lower()"
        - "'down' not in node_health_after.stdout.lower()"
        - "'drain' not in node_health_after.stdout.lower()"
        - "'idle*' not in node_health_after.stdout.lower()"
      retries: 30
      delay: 5

    - name: Print node state after repair
      ansible.builtin.debug:
        var: node_health_after.stdout_lines
|
||||
|
||||
|
||||
# Final play: submit a tiny batch job pinned to `target_node`, wait for it to
# leave the queue, then print its accounting record and output file.
- name: Submit repair validation job
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # The heredoc below is the batch script itself; its lines and the closing
    # SBATCH marker must stay at the left edge of this block scalar. Escaped
    # "\$" sequences are expanded by the job, unescaped ones by this shell.
    - name: Submit validation job to repaired node
      ansible.builtin.shell: |
        set -euo pipefail

        job_id="$(
          sudo -iu slurmuser sbatch --parsable <<SBATCH
        #!/bin/bash
        #SBATCH --job-name=repair-node-test
        #SBATCH --partition=all
        #SBATCH --nodelist={{ target_node }}
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --account=lab
        #SBATCH --qos=normal
        #SBATCH --output=/shared/repair-node-test-%j.out

        echo "HOST=\$(hostname)"
        echo "USER=\$(whoami)"
        echo "SLURM_JOB_ID=\$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=\$(grep Cpus_allowed_list /proc/self/status)"
        date
        SBATCH
        )"

        # sbatch --parsable may print "<jobid>;<cluster>"; keep only the job id
        # so squeue/sacct and the output file path below receive a valid value.
        job_id="${job_id%%;*}"

        echo "JOB_ID=$job_id"

        # Poll roughly once per second, at most 90 times, while the job is
        # still listed by squeue.
        for i in $(seq 1 90); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done

        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList

        echo "### output"
        cat "/shared/repair-node-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: repair_validation_job
      changed_when: true

    - name: Print repair validation job
      ansible.builtin.debug:
        var: repair_validation_job.stdout_lines
|
||||
Reference in New Issue
Block a user