---
# Pre-flight checks, evaluated against localhost before any other play runs.
# Both tasks only inspect variables; nothing is changed on any host.
- name: Validate target node
  gather_facts: false
  hosts: localhost

  tasks:
    # Stop with a usage hint when the caller did not supply target_node.
    - name: Require target_node
      when: target_node is undefined
      ansible.builtin.fail:
        msg: 'Use: ansible-playbook repair-slurm-node.yml -e target_node=<hostname>'

    # Stop when the supplied name is not one of the hosts in groups['all'].
    - name: Ensure target_node is in inventory
      when: target_node not in groups['all']
      ansible.builtin.fail:
        msg: 'target_node={{ target_node }} is not in Ansible inventory'

# Records what sinfo, scontrol and squeue report for the target node before
# any repair step is attempted, and prints that report.
- name: Capture node state before repair
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    # Diagnostics only: every query is followed by "|| true", so a failing
    # query does not abort the script even though "set -e" is active, and the
    # task is never reported as changed.
    - name: Show target node state before repair
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          echo "### sinfo"
          sinfo -N -n {{ target_node }} || true

          echo
          echo "### scontrol"
          scontrol show node {{ target_node }} || true

          echo
          echo "### jobs"
          squeue -w {{ target_node }} || true
      changed_when: false
      register: node_state_before

    # Echo the captured output line by line into the playbook log.
    - name: Print target node state before repair
      ansible.builtin.debug:
        var: node_state_before.stdout_lines

# Restarts the node-local daemons on the target host and then verifies them
# from the node itself (service state, a munge round trip, controller ping).
- name: Repair local services on target node
  hosts: "{{ target_node }}"
  become: true
  gather_facts: false

  vars:
    # True when the host belongs to slurm_compute or slurm_gpu (either group
    # may be absent from the inventory). Used both to decide whether slurmd is
    # restarted and whether the validation step requires it to be active, so
    # the two decisions cannot drift apart.
    target_runs_slurmd: >-
      {{ inventory_hostname in groups.get('slurm_compute', [])
      or inventory_hostname in groups.get('slurm_gpu', []) }}

  tasks:
    - name: Restart Munge
      ansible.builtin.systemd:
        name: munge
        state: restarted
        enabled: true

    # Skipped for hosts outside slurm_compute / slurm_gpu.
    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        state: restarted
        enabled: true
      when: target_runs_slurmd | bool

    # Hard checks (abort the task on failure): munge active, slurmd active
    # where it was restarted above, munge encode/decode round trip, and
    # "scontrol ping". The listener and journal sections are best-effort
    # diagnostics and never fail the task.
    - name: Validate local repair
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### services"
        systemctl is-active munge
        # Require slurmd only on hosts where this play restarted it; without
        # this guard a host outside the slurmd groups would always fail here.
        if [[ "${EXPECT_SLURMD}" == "true" ]]; then
          systemctl is-active slurmd
        else
          echo "slurmd: host is not in slurm_compute or slurm_gpu, check skipped"
        fi

        echo
        echo "### munge"
        munge -n | unmunge >/dev/null
        echo "munge OK"

        echo
        echo "### controller ping"
        scontrol ping

        echo
        echo "### slurmd listener"
        ss -lntp | grep ':6818 ' || true

        echo
        echo "### recent slurmd logs"
        journalctl -u slurmd -n 40 --no-pager || true
      args:
        executable: /bin/bash
      environment:
        # Passed as a plain string so the script can test it with [[ ... ]].
        EXPECT_SLURMD: "{{ 'true' if (target_runs_slurmd | bool) else 'false' }}"
      register: local_repair_state
      changed_when: false

    # Echo the validation output line by line into the playbook log.
    - name: Print local repair state
      ansible.builtin.debug:
        var: local_repair_state.stdout_lines

# Controller-side recovery: restart slurmctld, ask Slurm to resume the target
# node, then poll until the node report carries no failure marker.
- name: Clear Slurm maintenance/down state after repair
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    - name: Restart controller to refresh node state
      ansible.builtin.systemd:
        state: restarted
        name: slurmctld

    # Re-run "scontrol ping" (retries: 15, delay: 2 s) until it exits 0.
    - name: Wait for controller
      ansible.builtin.command:
        cmd: scontrol ping
      changed_when: false
      register: slurmctld_ping
      until: slurmctld_ping.rc == 0
      delay: 2
      retries: 15

    # The three state updates are best-effort: their stderr is discarded and a
    # failure is ignored. After a 5 s pause the node is queried again, and
    # those two final queries do fail the task if they return non-zero.
    - name: Clear target node state
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail

          scontrol update NodeName={{ target_node }} State=RESUME 2>/dev/null || true
          scontrol update NodeName={{ target_node }} State=UNDRAIN 2>/dev/null || true
          scontrol update NodeName={{ target_node }} State=IDLE 2>/dev/null || true

          sleep 5

          sinfo -N -n {{ target_node }}
          scontrol show node {{ target_node }}
      changed_when: true
      register: clear_state

    # Poll (retries: 30, delay: 5 s) until both queries succeed and none of
    # the markers below occurs anywhere in the lower-cased combined output.
    # NOTE(review): this is a plain substring match over the whole report, so
    # a host or partition name containing "down", "drain" or "idle*" would
    # keep the loop waiting - confirm naming makes that impossible.
    - name: Wait until node is healthy
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          sinfo -N -n {{ target_node }}
          scontrol show node {{ target_node }}
      changed_when: false
      register: node_health_after
      delay: 5
      retries: 30
      until:
        - node_health_after.rc == 0
        - "'not_responding' not in (node_health_after.stdout | lower)"
        - "'down' not in (node_health_after.stdout | lower)"
        - "'drain' not in (node_health_after.stdout | lower)"
        - "'idle*' not in (node_health_after.stdout | lower)"

    # Echo the last node report line by line into the playbook log.
    - name: Print node state after repair
      ansible.builtin.debug:
        var: node_health_after.stdout_lines

# End-to-end check: submit a small batch job pinned to the repaired node,
# wait for it to leave the queue, then print its accounting record and output.
- name: Submit repair validation job
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # The job script is fed to sbatch through an unquoted here-document:
    #   * "\$" keeps the shell on the controller from expanding the variable
    #     or command substitution, so it is evaluated inside the job instead;
    #   * the closing SBATCH marker must stay at the script's left margin,
    #     because "<<" (unlike "<<-") does not accept an indented terminator.
    # The job is submitted as slurmuser via sudo; everything else in the
    # script runs as the become user.
    - name: Submit validation job to repaired node
      ansible.builtin.shell: |
        set -euo pipefail

        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<SBATCH
        #!/bin/bash
        #SBATCH --job-name=repair-node-test
        #SBATCH --partition=all
        #SBATCH --nodelist={{ target_node }}
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --account=lab
        #SBATCH --qos=normal
        #SBATCH --output=/shared/repair-node-test-%j.out

        echo "HOST=\$(hostname)"
        echo "USER=\$(whoami)"
        echo "SLURM_JOB_ID=\$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=\$(grep Cpus_allowed_list /proc/self/status)"
        date
        SBATCH
        )"

        echo "JOB_ID=$job_id"

        # Poll once per second, at most 90 times, while squeue still lists
        # the job.
        for _ in $(seq 1 90); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done

        # If the job is still listed once the polling window is over, cancel
        # it so it does not stay queued against the node after this playbook
        # has stopped waiting, and say so on stderr. The steps below still
        # run, so the accounting record of the cancelled job is printed and a
        # missing output file fails the task as before.
        if squeue -h -j "$job_id" | grep -q .; then
          echo "Validation job $job_id is still listed by squeue after the polling window; cancelling it" >&2
          scancel "$job_id" || true
        fi

        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList

        echo "### output"
        cat "/shared/repair-node-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: repair_validation_job
      changed_when: true

    # Echo the job id, accounting record and job output into the playbook log.
    - name: Print repair validation job
      ansible.builtin.debug:
        var: repair_validation_job.stdout_lines