Files
portfolio/platform-projects/hpc-slurm-ai-cluster/playbooks/lifecycle/decommission-slurm-node.yml
T
2026-06-05 15:38:56 +00:00

127 lines
3.8 KiB
YAML

---
# Play: pre-flight check, run once on the control machine.
# Aborts the playbook unless the caller supplied `target_node` and that
# name is a member of the inventory's `all` group.
- name: Validate target_node variable
  gather_facts: false
  hosts: localhost
  tasks:
    # Print a usage hint when the extra var was not passed at all.
    - name: Require target_node
      when: target_node is undefined
      ansible.builtin.fail:
        msg: >-
          Use: ansible-playbook decommission-slurm-node.yml
          -e target_node=<hostname> [-e decom_reason='reason']

    # Refuse names that the inventory does not know about.
    - name: Ensure target_node is in inventory
      when: target_node not in groups['all']
      ansible.builtin.fail:
        msg: 'target_node={{ target_node }} is not in Ansible inventory'
# Play: on the Slurm controller, put the node into DRAIN and block until
# squeue reports no jobs left on it.
#
# Extra vars read by this play:
#   target_node        - required; node name passed to sinfo/scontrol/squeue.
#   decom_reason       - optional drain reason
#                        (default: 'decommission by Ansible').
#   decom_wait_retries - optional number of squeue polls (default: 120).
#   decom_wait_delay   - optional seconds between polls (default: 10).
- name: Drain target node and wait for jobs to leave
  hosts: slurm_controller
  become: true
  gather_facts: false
  vars:
    decom_reason_effective: "{{ decom_reason | default('decommission by Ansible') }}"
    decom_wait_retries_effective: "{{ decom_wait_retries | default(120) }}"
    decom_wait_delay_effective: "{{ decom_wait_delay | default(10) }}"
  tasks:
    # Informational only: each command is allowed to fail (`|| true`) so a
    # missing/unknown node does not abort before the drain is attempted.
    - name: Show current target node state
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          sinfo -N -n {{ target_node }} || true
          scontrol show node {{ target_node }} || true
        executable: /bin/bash
      register: node_state_before
      changed_when: false

    - name: Print current target node state
      ansible.builtin.debug:
        var: node_state_before.stdout_lines

    # argv is used instead of a single command string so the free-text
    # reason reaches scontrol as one argument even when it contains spaces
    # or quote characters.
    - name: Drain target node
      ansible.builtin.command:
        argv:
          - scontrol
          - update
          - "NodeName={{ target_node }}"
          - State=DRAIN
          - "Reason={{ decom_reason_effective }}"
      changed_when: true

    # Poll squeue until it SUCCEEDS and prints nothing. The exit code is
    # part of the condition on purpose: a failing squeue also produces
    # empty stdout, and must not be mistaken for "no jobs on the node".
    # Failed polls are retried; the task fails once retries are exhausted.
    - name: Wait until no jobs are running on target node
      ansible.builtin.command:
        argv:
          - squeue
          - "-h"
          - "-w"
          - "{{ target_node }}"
      register: jobs_on_node
      retries: "{{ decom_wait_retries_effective | int }}"
      delay: "{{ decom_wait_delay_effective | int }}"
      until: (jobs_on_node.rc | default(1)) == 0 and (jobs_on_node.stdout | default('') | trim == "")
      changed_when: false

    # Informational only: show the node's state/reason after draining.
    - name: Show drained node state
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          sinfo -N -n {{ target_node }} || true
          scontrol show node {{ target_node }} | grep -E "NodeName=|State=|Reason=" || true
        executable: /bin/bash
      register: node_state_drained
      changed_when: false

    - name: Print drained node state
      ansible.builtin.debug:
        var: node_state_drained.stdout_lines
# Play: runs on the node being retired.
# Stops and disables the slurmd unit when the host belongs to the
# slurm_compute or slurm_gpu inventory group, then prints what systemctl
# reports for the unit's enabled/active status.
- name: Stop Slurm worker service on target node
  hosts: "{{ target_node }}"
  gather_facts: false
  become: true
  tasks:
    # Skipped for hosts that are in neither worker group.
    - name: Stop slurmd
      when: >-
        inventory_hostname in groups.get('slurm_compute', [])
        or inventory_hostname in groups.get('slurm_gpu', [])
      ansible.builtin.systemd:
        name: slurmd
        enabled: false
        state: stopped

    # Informational only: systemctl errors are silenced and tolerated so
    # this task succeeds even when the unit does not exist.
    - name: Show slurmd state
      ansible.builtin.shell:
        cmd: |
          systemctl is-enabled slurmd 2>/dev/null || true
          systemctl is-active slurmd 2>/dev/null || true
        executable: /bin/bash
      changed_when: false
      register: slurmd_state_after

    - name: Print slurmd state
      ansible.builtin.debug:
        var: slurmd_state_after.stdout_lines
# Play: back on the Slurm controller.
# Sets the node to State=DOWN with the fixed reason "decommissioned" via
# scontrol, then prints the node's resulting state lines.
- name: Mark node down in Slurm controller
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    # Always reported as changed; scontrol's own exit code decides failure.
    - name: Mark target node DOWN after service stop
      changed_when: true
      ansible.builtin.command:
        cmd: scontrol update NodeName={{ target_node }} State=DOWN Reason="decommissioned"

    # Informational only: both lookups tolerate failure (`|| true`).
    - name: Show final node state
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          sinfo -N -n {{ target_node }} || true
          scontrol show node {{ target_node }} | grep -E "NodeName=|State=|Reason=" || true
        executable: /bin/bash
      changed_when: false
      register: final_node_state

    - name: Print final node state
      ansible.builtin.debug:
        var: final_node_state.stdout_lines