Add Slurm AI/HPC cluster platform project
This commit is contained in:
+126
@@ -0,0 +1,126 @@
|
||||
---
|
||||
# Pre-flight play: runs on localhost without gathering facts and aborts the
# run unless target_node was supplied and names a host in the inventory.
- name: Validate target_node variable
  gather_facts: false
  hosts: localhost

  tasks:
    # No target given at all: stop and print the usage line.
    - name: Require target_node
      when:
        - target_node is undefined
      ansible.builtin.fail:
        msg: >-
          Use: ansible-playbook decommission-slurm-node.yml
          -e target_node=<hostname> [-e decom_reason='reason']

    # A target was given, but it is not one of the hosts in groups['all'].
    - name: Ensure target_node is in inventory
      when:
        - not (target_node in groups['all'])
      ansible.builtin.fail:
        msg: >-
          target_node={{ target_node }} is not in Ansible inventory
|
||||
|
||||
|
||||
# Play: on the Slurm controller, drain the target node and block until squeue
# reports no jobs left on it.
#
# Extra vars read by this play:
#   target_node         node to drain (required)
#   decom_reason        text stored as the node's Reason
#                       (default: 'decommission by Ansible')
#   decom_wait_retries  number of squeue polls before giving up (default: 120)
#   decom_wait_delay    seconds between polls (default: 10)
- name: Drain target node and wait for jobs to leave
  hosts: slurm_controller
  become: true
  gather_facts: false

  vars:
    decom_reason_effective: "{{ decom_reason | default('decommission by Ansible') }}"
    decom_wait_retries_effective: "{{ decom_wait_retries | default(120) }}"
    decom_wait_delay_effective: "{{ decom_wait_delay | default(10) }}"

  tasks:
    # Informational snapshot only: "|| true" keeps a failing query from
    # aborting the play. The node name is shell-quoted before interpolation.
    - name: Show current target node state
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ target_node | quote }} || true
        scontrol show node {{ target_node | quote }} || true
      args:
        executable: /bin/bash
      register: node_state_before
      changed_when: false

    - name: Print current target node state
      ansible.builtin.debug:
        var: node_state_before.stdout_lines

    # argv passes each element as exactly one argument, so a reason that
    # contains spaces or quote characters cannot break or alter the command.
    - name: Drain target node
      ansible.builtin.command:
        argv:
          - scontrol
          - update
          - "NodeName={{ target_node }}"
          - "State=DRAIN"
          - "Reason={{ decom_reason_effective }}"
      changed_when: true

    # Safety gate for the later plays. squeue errors are NOT swallowed here:
    # an empty stdout from a failed squeue must not be mistaken for "no jobs
    # on the node". The loop keeps polling until squeue succeeds with empty
    # output; if that never happens within the retry budget the task fails.
    - name: Wait until no jobs are running on target node
      ansible.builtin.command:
        argv:
          - squeue
          - "-h"
          - "-w"
          - "{{ target_node }}"
      register: jobs_on_node
      retries: "{{ decom_wait_retries_effective | int }}"
      delay: "{{ decom_wait_delay_effective | int }}"
      until: jobs_on_node is succeeded and (jobs_on_node.stdout | trim) == ""
      changed_when: false

    # Informational snapshot only, same best-effort handling as above.
    - name: Show drained node state
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ target_node | quote }} || true
        scontrol show node {{ target_node | quote }} | grep -E "NodeName=|State=|Reason=" || true
      args:
        executable: /bin/bash
      register: node_state_drained
      changed_when: false

    - name: Print drained node state
      ansible.builtin.debug:
        var: node_state_drained.stdout_lines
|
||||
|
||||
|
||||
# Play: runs on the host named by target_node. slurmd is stopped and disabled
# only when that host is listed in the slurm_compute or slurm_gpu group; the
# resulting unit state is then printed.
- name: Stop Slurm worker service on target node
  hosts: "{{ target_node }}"
  gather_facts: false
  become: true

  tasks:
    - name: Stop slurmd
      when: >-
        inventory_hostname in groups.get('slurm_compute', [])
        or inventory_hostname in groups.get('slurm_gpu', [])
      ansible.builtin.systemd:
        name: slurmd
        enabled: false
        state: stopped

    # Read-only report: stderr is discarded and "|| true" keeps a non-zero
    # systemctl exit status from failing the task.
    - name: Show slurmd state
      ansible.builtin.shell:
        cmd: |
          systemctl is-enabled slurmd 2>/dev/null || true
          systemctl is-active slurmd 2>/dev/null || true
        executable: /bin/bash
      changed_when: false
      register: slurmd_state_after

    - name: Print slurmd state
      ansible.builtin.debug:
        var: slurmd_state_after.stdout_lines
|
||||
|
||||
|
||||
# Play: on the Slurm controller, set the target node to DOWN and print the
# resulting node state.
#
# Extra vars read by this play:
#   target_node   node to mark DOWN (required)
#   decom_reason  optional operator-supplied reason. When it is non-empty the
#                 node's Reason becomes "decommissioned: <decom_reason>";
#                 otherwise it is "decommissioned", as before.
- name: Mark node down in Slurm controller
  hosts: slurm_controller
  become: true
  gather_facts: false

  vars:
    # Keep the "decommissioned" marker but do not discard the operator's
    # reason when one was passed on the command line.
    decom_final_reason: >-
      {{ ('decommissioned: ' ~ decom_reason)
         if (decom_reason | default('') | string | trim)
         else 'decommissioned' }}

  tasks:
    # argv passes each element as exactly one argument, so a reason that
    # contains spaces or quote characters cannot break or alter the command.
    - name: Mark target node DOWN after service stop
      ansible.builtin.command:
        argv:
          - scontrol
          - update
          - "NodeName={{ target_node }}"
          - "State=DOWN"
          - "Reason={{ decom_final_reason }}"
      changed_when: true

    # Informational snapshot only: "|| true" keeps a failing query or an
    # empty grep match from failing the task. The node name is shell-quoted.
    - name: Show final node state
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ target_node | quote }} || true
        scontrol show node {{ target_node | quote }} | grep -E "NodeName=|State=|Reason=" || true
      args:
        executable: /bin/bash
      register: final_node_state
      changed_when: false

    - name: Print final node state
      ansible.builtin.debug:
        var: final_node_state.stdout_lines
|
||||
Reference in New Issue
Block a user