---
# Repair workflow for a single Slurm node, selected with the extra var
# `target_node`. The plays run in order:
#   1. validate that target_node is defined and present in the inventory
#   2. record the node's state as seen from the controller
#   3. restart munge/slurmd on the node and validate the local services
#   4. restart slurmctld, clear the node state and wait until it looks healthy
#   5. submit a validation job
#
# target_node is passed through the `quote` filter wherever it is
# interpolated into a shell script so that it always reaches the shell as a
# single, literal word.

- name: Validate target node
  hosts: localhost
  gather_facts: false
  tasks:
    - name: Require target_node
      ansible.builtin.fail:
        msg: "Use: ansible-playbook repair-slurm-node.yml -e target_node=<inventory_hostname>"
      when: target_node is not defined

    - name: Ensure target_node is in inventory
      ansible.builtin.fail:
        msg: "target_node={{ target_node }} is not in Ansible inventory"
      when: target_node not in groups['all']

- name: Capture node state before repair
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # Purely informational: every query is followed by `|| true`, so a failing
    # sinfo/scontrol/squeue call does not fail the task.
    - name: Show target node state before repair
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### sinfo"
        sinfo -N -n {{ target_node | quote }} || true
        echo
        echo "### scontrol"
        scontrol show node {{ target_node | quote }} || true
        echo
        echo "### jobs"
        squeue -w {{ target_node | quote }} || true
      args:
        executable: /bin/bash
      register: node_state_before
      changed_when: false

    - name: Print target node state before repair
      ansible.builtin.debug:
        var: node_state_before.stdout_lines

- name: Repair local services on target node
  hosts: "{{ target_node }}"
  become: true
  gather_facts: false
  tasks:
    - name: Restart Munge
      ansible.builtin.systemd:
        name: munge
        state: restarted
        enabled: true

    # Only hosts in the slurm_compute or slurm_gpu inventory groups get slurmd
    # restarted; a missing group is treated as empty.
    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        state: restarted
        enabled: true
      when: >-
        inventory_hostname in groups.get('slurm_compute', [])
        or inventory_hostname in groups.get('slurm_gpu', [])

    # Hard checks (fail the task under `set -e`): munge and slurmd are active,
    # a munge credential round-trips through unmunge, and `scontrol ping`
    # succeeds. The listener and journal sections are informational only
    # (`|| true`).
    # NOTE(review): `systemctl is-active slurmd` runs unconditionally although
    # the slurmd restart above is conditional on group membership, so a target
    # outside slurm_compute/slurm_gpu that does not run slurmd fails here.
    # Confirm this is intended.
    - name: Validate local repair
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### services"
        systemctl is-active munge
        systemctl is-active slurmd
        echo
        echo "### munge"
        munge -n | unmunge >/dev/null
        echo "munge OK"
        echo
        echo "### controller ping"
        scontrol ping
        echo
        echo "### slurmd listener"
        ss -lntp | grep ':6818 ' || true
        echo
        echo "### recent slurmd logs"
        journalctl -u slurmd -n 40 --no-pager || true
      args:
        executable: /bin/bash
      register: local_repair_state
      changed_when: false

    - name: Print local repair state
      ansible.builtin.debug:
        var: local_repair_state.stdout_lines

- name: Clear Slurm maintenance/down state after repair
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    - name: Restart controller to refresh node state
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted

    # Poll `scontrol ping` up to 15 times, 2 seconds apart, until it exits 0.
    - name: Wait for controller
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping
      retries: 15
      delay: 2
      until: slurmctld_ping.rc == 0
      changed_when: false

    # The three state updates are best-effort: stderr is discarded and a
    # failure is ignored. The trailing sinfo/scontrol calls are not guarded,
    # so the task fails if either of them fails.
    - name: Clear target node state
      ansible.builtin.shell: |
        set -euo pipefail
        scontrol update NodeName={{ target_node | quote }} State=RESUME 2>/dev/null || true
        scontrol update NodeName={{ target_node | quote }} State=UNDRAIN 2>/dev/null || true
        scontrol update NodeName={{ target_node | quote }} State=IDLE 2>/dev/null || true
        sleep 5
        sinfo -N -n {{ target_node | quote }}
        scontrol show node {{ target_node | quote }}
      args:
        executable: /bin/bash
      register: clear_state
      changed_when: true

    # Retry up to 30 times, 5 seconds apart, until both commands succeed and
    # the lower-cased combined output contains none of the substrings
    # 'not_responding', 'down', 'drain' or 'idle*'.
    # NOTE(review): the match is against the whole output, not only the state
    # field, so any other occurrence of these substrings (for example in a
    # node name) would presumably keep this loop retrying - confirm against
    # the site's naming.
    - name: Wait until node is healthy
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ target_node | quote }}
        scontrol show node {{ target_node | quote }}
      args:
        executable: /bin/bash
      register: node_health_after
      retries: 30
      delay: 5
      until:
        - node_health_after.rc == 0
        - "'not_responding' not in node_health_after.stdout.lower()"
        - "'down' not in node_health_after.stdout.lower()"
        - "'drain' not in node_health_after.stdout.lower()"
        - "'idle*' not in node_health_after.stdout.lower()"
      changed_when: false

    - name: Print node state after repair
      ansible.builtin.debug:
        var: node_health_after.stdout_lines

- name: Submit repair validation job
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # Runs `sbatch --parsable` through `sudo -iu slurmuser` and captures its
    # output in the shell variable job_id.
    - name: Submit validation job to repaired node
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$(
          sudo -iu slurmuser sbatch --parsable <