---
# Slurm node auto-remediation playbook.
#
# Flow:
#   1. On the controller, list nodes whose sinfo state matches the
#      "unhealthy" pattern and store them in the `bad_nodes` fact.
#   2. On each listed compute/GPU node (one at a time), restart munge and
#      slurmd, then run local validation checks.
#   3. Back on the controller, restart slurmctld (only if something was
#      remediated), try to clear the state of the previously bad nodes,
#      re-run detection, drain whatever is still unhealthy, and print a
#      summary.
#
# NOTE(review): play 2 matches `inventory_hostname` against the node names
# printed by sinfo, so inventory names are assumed to equal Slurm NodeNames
# -- confirm against the inventory.

- name: Detect problematic Slurm nodes
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # Prints "<node> <state>" pairs and keeps the node names whose
    # lower-cased state matches the pattern below. Read-only, hence
    # changed_when: false.
    - name: Detect nodes needing remediation
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -h -o "%N %T" | awk '
          tolower($2) ~ /down|drain|fail|unknown|not_responding|idle\*/ {print $1}
        ' | sort -u
      args:
        executable: /bin/bash
      register: bad_nodes_raw
      changed_when: false

    # Stored as a host fact so later plays can read it through hostvars.
    - name: Store bad node list
      ansible.builtin.set_fact:
        bad_nodes: "{{ bad_nodes_raw.stdout_lines }}"

    - name: Show detected problematic nodes
      ansible.builtin.debug:
        var: bad_nodes

- name: Attempt auto-remediation on problematic nodes
  hosts: slurm_compute:slurm_gpu
  become: true
  gather_facts: false
  # One node at a time. With a batch size of 1, a host that fails or is
  # unreachable fails its whole batch, which makes Ansible abort the run.
  # The reachability probe and the block/rescue below keep such hosts from
  # being marked failed, so the remaining nodes are still processed and the
  # controller play (which drains unresolved nodes) still runs.
  serial: 1
  vars:
    bad_nodes_from_controller: "{{ hostvars[groups['slurm_controller'][0]].bad_nodes | default([]) }}"
  tasks:
    - name: Skip healthy nodes
      ansible.builtin.meta: end_host
      when: inventory_hostname not in bad_nodes_from_controller

    # Nodes reported as down are the most likely to be unreachable; probe
    # once and tolerate the error instead of losing the host (and the play).
    - name: Check that the node is reachable
      ansible.builtin.ping:
      register: node_reachability
      ignore_unreachable: true

    # Nothing can be repaired over a dead connection. The node stays in its
    # bad state and is handled by the drain step of the controller play.
    - name: Skip unreachable nodes
      ansible.builtin.meta: end_host
      when: node_reachability.unreachable | default(false)

    - name: Remediate and validate the node
      block:
        - name: Restart Munge
          ansible.builtin.systemd:
            name: munge
            state: restarted
            enabled: true

        - name: Restart slurmd
          ansible.builtin.systemd:
            name: slurmd
            state: restarted
            enabled: true

        # `set -e` makes the service, munge and controller-ping checks
        # fatal; the listener and journal sections are informational only
        # (`|| true`).
        - name: Validate local services after remediation attempt
          ansible.builtin.shell: |
            set -euo pipefail
            echo "HOST=$(hostname)"
            echo
            echo "### services"
            systemctl is-active munge
            systemctl is-active slurmd
            echo
            echo "### munge"
            munge -n | unmunge >/dev/null
            echo "munge OK"
            echo
            echo "### controller ping"
            scontrol ping
            echo
            echo "### slurmd listener"
            ss -lntp | grep ':6818 ' || true
            echo
            echo "### recent slurmd logs"
            journalctl -u slurmd -n 30 --no-pager || true
          args:
            executable: /bin/bash
          register: local_repair_check
          changed_when: false

        - name: Print local remediation result
          ansible.builtin.debug:
            var: local_repair_check.stdout_lines

      rescue:
        # A failed remediation is an expected outcome, not a playbook error:
        # report it and move on to the next node.
        - name: Report failed remediation attempt
          ansible.builtin.debug:
            msg: >-
              Remediation failed on {{ inventory_hostname }} at task
              '{{ ansible_failed_task.name | default("unknown") }}';
              the node will be drained by the controller play if it is
              still unhealthy.

- name: Refresh controller and validate remediated nodes
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # Only bounce the controller daemon when there was something to
    # remediate; a run that detected no bad nodes leaves slurmctld alone.
    - name: Restart slurmctld to refresh node states
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted
      when: bad_nodes | default([]) | length > 0

    # Polls `scontrol ping` up to 15 times, 2 seconds apart.
    - name: Wait for controller
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping
      retries: 15
      delay: 2
      until: slurmctld_ping.rc == 0
      changed_when: false

    # Best effort: each state update may be rejected for a given node, so
    # individual scontrol errors are deliberately suppressed. The task only
    # reports "changed" when there were nodes to act on.
    - name: Clear maintenance state on previously bad nodes
      ansible.builtin.shell: |
        set -euo pipefail
        bad_nodes="{{ (bad_nodes | default([])) | join(' ') }}"
        if [ -z "$bad_nodes" ]; then
          echo "No bad nodes detected. Nothing to clear."
          sinfo -N
          exit 0
        fi
        for node in $bad_nodes; do
          echo "### clearing state on $node"
          scontrol update NodeName="$node" State=RESUME 2>/dev/null || true
          scontrol update NodeName="$node" State=UNDRAIN 2>/dev/null || true
          scontrol update NodeName="$node" State=IDLE 2>/dev/null || true
        done
        sleep 5
        sinfo -N
      args:
        executable: /bin/bash
      register: clear_result
      changed_when: bad_nodes | default([]) | length > 0

    - name: Print clear-state result
      ansible.builtin.debug:
        var: clear_result.stdout_lines

    # Same detection pipeline as in the first play, run again after the
    # remediation and state-clearing steps.
    - name: Detect nodes still unhealthy after remediation
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -h -o "%N %T" | awk '
          tolower($2) ~ /down|drain|fail|unknown|not_responding|idle\*/ {print $1}
        ' | sort -u
      args:
        executable: /bin/bash
      register: still_bad_nodes_raw
      changed_when: false

    - name: Store still bad nodes
      ansible.builtin.set_fact:
        still_bad_nodes: "{{ still_bad_nodes_raw.stdout_lines }}"

    # Unlike the clearing step, a failing scontrol call here is fatal
    # (no `|| true`), so a node that cannot be drained fails the task.
    - name: Drain nodes that remain unhealthy
      ansible.builtin.shell: |
        set -euo pipefail
        unresolved_nodes="{{ still_bad_nodes | join(' ') }}"
        if [ -z "$unresolved_nodes" ]; then
          echo "No unresolved unhealthy nodes."
          sinfo -N
          exit 0
        fi
        for node in $unresolved_nodes; do
          echo "### draining unresolved node $node"
          scontrol update NodeName="$node" State=DRAIN Reason="auto-remediation failed"
        done
        sinfo -N
      args:
        executable: /bin/bash
      register: drain_unresolved
      changed_when: still_bad_nodes | length > 0

    # Read-only report: initial vs. remaining bad nodes, final node table
    # and the job queue.
    - name: Show remediation summary
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### initial bad nodes"
        bad_nodes="{{ (bad_nodes | default([])) | join(' ') }}"
        if [ -z "$bad_nodes" ]; then
          echo "none"
        else
          printf '%s\n' $bad_nodes
        fi
        echo
        echo "### still bad nodes"
        still_bad_nodes="{{ (still_bad_nodes | default([])) | join(' ') }}"
        if [ -z "$still_bad_nodes" ]; then
          echo "none"
        else
          printf '%s\n' $still_bad_nodes
        fi
        echo
        echo "### final sinfo"
        sinfo -N
        echo
        echo "### queue"
        squeue
      args:
        executable: /bin/bash
      register: remediation_summary
      changed_when: false

    - name: Print remediation summary
      ansible.builtin.debug:
        var: remediation_summary.stdout_lines