---
# Decommission a Slurm worker node in four plays:
#   1. localhost:        check that target_node was passed and is an
#                        inventory host;
#   2. slurm_controller: drain the node, then poll squeue until no jobs
#                        are listed for it;
#   3. target node:      stop and disable slurmd;
#   4. slurm_controller: mark the node DOWN.
#
# Extra vars:
#   target_node         (required) inventory hostname; also used as the
#                       Slurm NodeName in every scontrol/sinfo/squeue call
#   decom_reason        (optional) drain reason,
#                       default 'decommission by Ansible'
#   decom_wait_retries  (optional) number of squeue polls, default 120
#   decom_wait_delay    (optional) seconds between polls, default 10

- name: Validate target_node variable
  hosts: localhost
  gather_facts: false
  tasks:
    - name: Require target_node
      ansible.builtin.fail:
        msg: >-
          Use: ansible-playbook decommission-slurm-node.yml
          -e target_node=<node> [-e decom_reason='reason']
      when: target_node is not defined

    - name: Ensure target_node is in inventory
      ansible.builtin.fail:
        msg: "target_node={{ target_node }} is not in Ansible inventory"
      when: target_node not in groups['all']

- name: Drain target node and wait for jobs to leave
  hosts: slurm_controller
  become: true
  gather_facts: false
  vars:
    decom_reason_effective: "{{ decom_reason | default('decommission by Ansible') }}"
    decom_wait_retries_effective: "{{ decom_wait_retries | default(120) }}"
    decom_wait_delay_effective: "{{ decom_wait_delay | default(10) }}"
  tasks:
    # Diagnostic snapshot only: every command is allowed to fail (|| true),
    # so this task never aborts the play.
    - name: Show current target node state
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ target_node }} || true
        scontrol show node {{ target_node }} || true
      args:
        executable: /bin/bash
      register: node_state_before
      changed_when: false

    - name: Print current target node state
      ansible.builtin.debug:
        var: node_state_before.stdout_lines

    # argv keeps the reason as a single argument no matter which
    # characters (spaces, quotes) the operator passes in decom_reason.
    - name: Drain target node
      ansible.builtin.command:
        argv:
          - scontrol
          - update
          - "NodeName={{ target_node }}"
          - State=DRAIN
          - "Reason={{ decom_reason_effective }}"
      changed_when: true

    # A failing squeue also produces empty stdout, so the loop must not
    # treat "empty output" alone as "no jobs": the poll only ends once
    # squeue succeeded AND listed nothing. A failed squeue is retried and,
    # if it keeps failing, the task fails instead of letting the playbook
    # go on to stop slurmd.
    - name: Wait until no jobs are running on target node
      ansible.builtin.command:
        argv:
          - squeue
          - "-h"
          - "-w"
          - "{{ target_node }}"
      register: jobs_on_node
      retries: "{{ decom_wait_retries_effective | int }}"
      delay: "{{ decom_wait_delay_effective | int }}"
      until: jobs_on_node is succeeded and (jobs_on_node.stdout | trim) == ""
      changed_when: false

    # Diagnostic only, best-effort like the first snapshot.
    - name: Show drained node state
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ target_node }} || true
        scontrol show node {{ target_node }} | grep -E "NodeName=|State=|Reason=" || true
      args:
        executable: /bin/bash
      register: node_state_drained
      changed_when: false

    - name: Print drained node state
      ansible.builtin.debug:
        var: node_state_drained.stdout_lines

- name: Stop Slurm worker service on target node
  hosts: "{{ target_node }}"
  become: true
  gather_facts: false
  tasks:
    # Only hosts in the slurm_compute or slurm_gpu groups are touched;
    # groups.get(..., []) keeps the condition valid when a group is absent.
    - name: Stop slurmd
      ansible.builtin.systemd:
        name: slurmd
        state: stopped
        enabled: false
      when: >-
        inventory_hostname in groups.get('slurm_compute', [])
        or inventory_hostname in groups.get('slurm_gpu', [])

    # Diagnostic only: stderr is discarded and failures are ignored.
    - name: Show slurmd state
      ansible.builtin.shell: |
        systemctl is-enabled slurmd 2>/dev/null || true
        systemctl is-active slurmd 2>/dev/null || true
      args:
        executable: /bin/bash
      register: slurmd_state_after
      changed_when: false

    - name: Print slurmd state
      ansible.builtin.debug:
        var: slurmd_state_after.stdout_lines

- name: Mark node down in Slurm controller
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    - name: Mark target node DOWN after service stop
      ansible.builtin.command:
        argv:
          - scontrol
          - update
          - "NodeName={{ target_node }}"
          - State=DOWN
          - Reason=decommissioned
      changed_when: true

    # Diagnostic only, best-effort.
    - name: Show final node state
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ target_node }} || true
        scontrol show node {{ target_node }} | grep -E "NodeName=|State=|Reason=" || true
      args:
        executable: /bin/bash
      register: final_node_state
      changed_when: false

    - name: Print final node state
      ansible.builtin.debug:
        var: final_node_state.stdout_lines