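# Ansible playbook: drain a Slurm node, stop slurmd on it, and mark it DOWN.
# Usage: ansible-playbook decommission-slurm-node.yml -e target_node=<hostname> [-e decom_reason='reason']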
---
# Play 1: sanity-check the target_node extra-var on localhost before any
# later play runs. Both checks abort the run via ansible.builtin.fail.
- name: Validate target_node variable
  gather_facts: false
  hosts: localhost

  tasks:
    # target_node has no default; it must be supplied with -e.
    - name: Require target_node
      when:
        - target_node is not defined
      ansible.builtin.fail:
        msg: "Use: ansible-playbook decommission-slurm-node.yml -e target_node=<hostname> [-e decom_reason='reason']"

    # A later play uses target_node as its hosts pattern, so it has to be a
    # host name known to the inventory.
    - name: Ensure target_node is in inventory
      when:
        - target_node not in groups['all']
      ansible.builtin.fail:
        msg: "target_node={{ target_node }} is not in Ansible inventory"

# Play 2: on the Slurm controller, set the target node to DRAIN and block
# until squeue reports no jobs left on it.
- name: Drain target node and wait for jobs to leave
  hosts: slurm_controller
  become: true
  gather_facts: false

  vars:
    # Optional extra-vars and their fallbacks:
    #   decom_reason       - text stored as the node's Reason while draining
    #   decom_wait_retries - how many times to poll squeue (default 120)
    #   decom_wait_delay   - seconds between polls (default 10)
    decom_reason_effective: "{{ decom_reason | default('decommission by Ansible') }}"
    decom_wait_retries_effective: "{{ decom_wait_retries | default(120) }}"
    decom_wait_delay_effective: "{{ decom_wait_delay | default(10) }}"

  tasks:
    # Informational snapshot only; each command is deliberately best-effort
    # (|| true) so a query failure never blocks the drain itself.
    - name: Show current target node state
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ target_node }} || true
        scontrol show node {{ target_node }} || true
      args:
        executable: /bin/bash
      register: node_state_before
      changed_when: false

    - name: Print current target node state
      ansible.builtin.debug:
        var: node_state_before.stdout_lines

    # argv keeps the whole reason in a single argument, so a reason that
    # contains quotes or other shell metacharacters cannot break splitting.
    - name: Drain target node
      ansible.builtin.command:
        argv:
          - scontrol
          - update
          - "NodeName={{ target_node }}"
          - State=DRAIN
          - "Reason={{ decom_reason_effective }}"
      changed_when: true

    # squeue errors are NOT masked here: an empty stdout from a failed squeue
    # must not be mistaken for "no jobs left". A non-zero rc is retried like a
    # non-empty job list, and the task fails once the retries are exhausted.
    - name: Wait until no jobs are running on target node
      ansible.builtin.command:
        argv:
          - squeue
          - "-h"
          - "-w"
          - "{{ target_node }}"
      register: jobs_on_node
      retries: "{{ decom_wait_retries_effective | int }}"
      delay: "{{ decom_wait_delay_effective | int }}"
      until: (jobs_on_node.rc | default(1)) == 0 and (jobs_on_node.stdout | default('') | trim) == ""
      changed_when: false

    # Informational: show only the NodeName/State/Reason lines after draining.
    - name: Show drained node state
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ target_node }} || true
        scontrol show node {{ target_node }} | grep -E "NodeName=|State=|Reason=" || true
      args:
        executable: /bin/bash
      register: node_state_drained
      changed_when: false

    - name: Print drained node state
      ansible.builtin.debug:
        var: node_state_drained.stdout_lines

# Play 3: runs on the target node itself; stops and disables slurmd, then
# reports what systemctl says about the unit.
- name: Stop Slurm worker service on target node
  gather_facts: false
  become: true
  hosts: "{{ target_node }}"

  tasks:
    # Only touch slurmd when the host belongs to slurm_compute or slurm_gpu;
    # a missing group is treated as empty.
    - name: Stop slurmd
      when:
        - inventory_hostname in (groups.get('slurm_compute', []) + groups.get('slurm_gpu', []))
      ansible.builtin.systemd:
        name: slurmd
        enabled: false
        state: stopped

    # Best-effort status query: stderr is discarded and a non-zero exit from
    # systemctl is ignored, so this task only collects output.
    - name: Show slurmd state
      changed_when: false
      register: slurmd_state_after
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          systemctl is-enabled slurmd 2>/dev/null || true
          systemctl is-active slurmd 2>/dev/null || true

    - name: Print slurmd state
      ansible.builtin.debug:
        var: slurmd_state_after.stdout_lines

# Play 4: back on the Slurm controller, set the node to DOWN with the fixed
# reason "decommissioned" and print the resulting state.
- name: Mark node down in Slurm controller
  gather_facts: false
  become: true
  hosts: slurm_controller

  tasks:
    # Always reported as changed: scontrol update is run unconditionally.
    - name: Mark target node DOWN after service stop
      changed_when: true
      ansible.builtin.command:
        cmd: scontrol update NodeName={{ target_node }} State=DOWN Reason="decommissioned"

    # Informational: both queries are best-effort (|| true); only the
    # NodeName/State/Reason lines of the scontrol output are kept.
    - name: Show final node state
      changed_when: false
      register: final_node_state
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          sinfo -N -n {{ target_node }} || true
          scontrol show node {{ target_node }} | grep -E "NodeName=|State=|Reason=" || true

    - name: Print final node state
      ansible.builtin.debug:
        var: final_node_state.stdout_lines