This commit is contained in:
+126
@@ -0,0 +1,126 @@
|
||||
---
|
||||
# Pre-flight check executed on the control machine: stops the run early when
# the target_node extra-var is missing or names a host unknown to the inventory.
- name: Validate target_node variable
  gather_facts: false
  hosts: localhost

  tasks:
    # Print a usage hint when -e target_node=... was not supplied.
    - name: Require target_node
      when: target_node is not defined
      ansible.builtin.fail:
        msg: "Use: ansible-playbook decommission-slurm-node.yml -e target_node=<hostname> [-e decom_reason='reason']"

    # Later plays use target_node as a host pattern, so it must be an inventory host.
    - name: Ensure target_node is in inventory
      when: target_node not in groups['all']
      ansible.builtin.fail:
        msg: "target_node={{ target_node }} is not in Ansible inventory"
|
||||
|
||||
|
||||
# Runs on the Slurm controller: records the node's current state, puts it into
# DRAIN, then polls squeue until no job is listed for the node.
#
# Optional extra-vars:
#   decom_reason        - text stored as the Slurm drain reason
#   decom_wait_retries  - number of squeue polls (default 120)
#   decom_wait_delay    - seconds between polls (default 10)
- name: Drain target node and wait for jobs to leave
  hosts: slurm_controller
  become: true
  gather_facts: false

  vars:
    decom_reason_effective: "{{ decom_reason | default('decommission by Ansible') }}"
    decom_wait_retries_effective: "{{ decom_wait_retries | default(120) }}"
    decom_wait_delay_effective: "{{ decom_wait_delay | default(10) }}"

  tasks:
    # Informational only: both commands are best-effort (|| true).
    - name: Show current target node state
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ target_node | quote }} || true
        scontrol show node {{ target_node | quote }} || true
      args:
        executable: /bin/bash
      register: node_state_before
      changed_when: false

    - name: Print current target node state
      ansible.builtin.debug:
        var: node_state_before.stdout_lines

    # argv passes every argument verbatim, so a reason containing quotes or
    # other shell-significant characters cannot break the command line.
    - name: Drain target node
      ansible.builtin.command:
        argv:
          - scontrol
          - update
          - "NodeName={{ target_node }}"
          - State=DRAIN
          - "Reason={{ decom_reason_effective }}"
      changed_when: true

    # A failing squeue must NOT be read as "no jobs left": empty stdout only
    # counts when the command itself succeeded. While squeue keeps failing the
    # task is retried and finally fails instead of letting the play continue.
    - name: Wait until no jobs are running on target node
      ansible.builtin.command:
        argv:
          - squeue
          - -h
          - -w
          - "{{ target_node }}"
      register: jobs_on_node
      retries: "{{ decom_wait_retries_effective | int }}"
      delay: "{{ decom_wait_delay_effective | int }}"
      until:
        - jobs_on_node.rc == 0
        - jobs_on_node.stdout | trim == ""
      changed_when: false

    # Informational only: best-effort summary of the drained node.
    - name: Show drained node state
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ target_node | quote }} || true
        scontrol show node {{ target_node | quote }} | grep -E "NodeName=|State=|Reason=" || true
      args:
        executable: /bin/bash
      register: node_state_drained
      changed_when: false

    - name: Print drained node state
      ansible.builtin.debug:
        var: node_state_drained.stdout_lines
|
||||
|
||||
|
||||
# Runs on the node named by target_node: stops and disables slurmd, then
# reports what systemd says about the unit.
- name: Stop Slurm worker service on target node
  hosts: "{{ target_node }}"
  gather_facts: false
  become: true

  tasks:
    # Only hosts in the slurm_compute or slurm_gpu groups are touched.
    - name: Stop slurmd
      when:
        - inventory_hostname in groups.get('slurm_compute', []) or inventory_hostname in groups.get('slurm_gpu', [])
      ansible.builtin.systemd:
        enabled: false
        state: stopped
        name: slurmd

    # Both queries are best-effort; stderr is discarded and failures ignored.
    - name: Show slurmd state
      ansible.builtin.shell:
        cmd: |
          systemctl is-enabled slurmd 2>/dev/null || true
          systemctl is-active slurmd 2>/dev/null || true
        executable: /bin/bash
      changed_when: false
      register: slurmd_state_after

    - name: Print slurmd state
      ansible.builtin.debug:
        var: slurmd_state_after.stdout_lines
|
||||
|
||||
|
||||
# Runs on the Slurm controller: sets the node to DOWN with the fixed reason
# "decommissioned" and prints a short summary of the resulting state.
- name: Mark node down in Slurm controller
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    - name: Mark target node DOWN after service stop
      changed_when: true
      ansible.builtin.command:
        cmd: scontrol update NodeName={{ target_node }} State=DOWN Reason="decommissioned"

    # Informational only: every command is best-effort (|| true).
    - name: Show final node state
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          sinfo -N -n {{ target_node }} || true
          scontrol show node {{ target_node }} | grep -E "NodeName=|State=|Reason=" || true
        executable: /bin/bash
      changed_when: false
      register: final_node_state

    - name: Print final node state
      ansible.builtin.debug:
        var: final_node_state.stdout_lines
|
||||
@@ -0,0 +1,246 @@
|
||||
---
|
||||
# Pre-flight check executed on the control machine: stops the run early when
# the target_node extra-var is missing or names a host unknown to the inventory.
- name: Validate target_node variable
  gather_facts: false
  hosts: localhost

  tasks:
    # Print a usage hint when -e target_node=... was not supplied.
    - name: Require target_node
      when: target_node is not defined
      ansible.builtin.fail:
        msg: "Use: ansible-playbook provision-slurm-node.yml -e target_node=<hostname>"

    # Later plays use target_node as a host pattern, so it must be an inventory host.
    - name: Ensure target_node is in inventory
      when: target_node not in groups['all']
      ansible.builtin.fail:
        msg: "target_node={{ target_node }} is not in Ansible inventory"
|
||||
|
||||
|
||||
# Runs on the node named by target_node: checks its group membership, installs
# the Munge/Slurm packages via apt and creates the directories they use.
- name: Prepare OS, packages and Slurm directories on target node
  hosts: "{{ target_node }}"
  gather_facts: true
  become: true

  tasks:
    # Both conditions must hold (list = AND): the host is in neither group.
    - name: Ensure target is a Slurm worker or GPU node
      when:
        - inventory_hostname not in groups.get('slurm_compute', [])
        - inventory_hostname not in groups.get('slurm_gpu', [])
      ansible.builtin.fail:
        msg: "{{ inventory_hostname }} must be in slurm_compute or slurm_gpu group"

    - name: Install Slurm worker packages
      ansible.builtin.apt:
        update_cache: true
        state: present
        name:
          - munge
          - libmunge2
          - slurm-client
          - slurmd
          - slurm-wlm-basic-plugins
          - slurm-wlm-plugins
          - slurm-wlm-mysql-plugin

    # slurm_config_dir is not defined in this play; it has to come from
    # inventory, group_vars or extra-vars.
    - name: Ensure Slurm config directory exists
      ansible.builtin.file:
        state: directory
        path: "{{ slurm_config_dir }}"
        mode: "0755"
        owner: root
        group: root

    - name: Ensure Slurm log directory exists
      ansible.builtin.file:
        state: directory
        path: /var/log/slurm
        mode: "0755"
        owner: slurm
        group: slurm

    - name: Ensure slurmd spool directory exists
      ansible.builtin.file:
        state: directory
        path: /var/spool/slurmd
        mode: "0755"
        owner: slurm
        group: slurm

    # One pass per directory; each entry carries its own permission bits.
    - name: Ensure munge dirs exist
      ansible.builtin.file:
        state: directory
        path: "{{ item.path }}"
        mode: "{{ item.mode }}"
        owner: munge
        group: munge
      loop:
        - path: /etc/munge
          mode: "0700"
        - path: /var/log/munge
          mode: "0755"
        - path: /var/lib/munge
          mode: "0711"
        - path: /run/munge
          mode: "0755"
|
||||
|
||||
|
||||
# Runs on the Slurm controller: reads /etc/munge/munge.key and keeps its
# base64 form as the host fact cluster_munge_key_b64.
#
# The key is a shared cluster secret, so both tasks set no_log to keep it out
# of verbose output, callback plugins and log files.
- name: Deploy Munge key from controller to target node
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # slurp returns the file content base64-encoded in .content.
    - name: Read controller munge.key
      ansible.builtin.slurp:
        src: /etc/munge/munge.key
      register: controller_munge_key_raw
      no_log: true

    - name: Store controller Munge key as fact
      ansible.builtin.set_fact:
        cluster_munge_key_b64: "{{ controller_munge_key_raw.content }}"
      no_log: true
|
||||
|
||||
|
||||
# Runs on the node named by target_node: installs the cluster Munge key, renders
# the Slurm configuration templates and makes sure munge and slurmd are running.
#
# Reads hostvars[<first slurm_controller host>].cluster_munge_key_b64, which
# must already be set when this play starts.
- name: Configure target node with Munge and Slurm files
  hosts: "{{ target_node }}"
  become: true
  gather_facts: false

  vars:
    controller_host: "{{ groups['slurm_controller'][0] }}"

  tasks:
    # no_log keeps the secret key out of task output and --diff listings.
    # NOTE(review): the key passes through b64decode into a text value; if the
    # key file contains non-UTF-8 bytes this may not round-trip byte-for-byte -
    # confirm by comparing checksums on controller and target.
    - name: Deploy shared munge.key
      ansible.builtin.copy:
        dest: /etc/munge/munge.key
        content: "{{ hostvars[controller_host].cluster_munge_key_b64 | b64decode }}"
        owner: munge
        group: munge
        mode: "0400"
      no_log: true
      notify:
        - Restart munge

    - name: Deploy managed slurm.conf
      ansible.builtin.template:
        src: ../../templates/slurm.conf.j2
        dest: "{{ slurm_config_dir }}/slurm.conf"
        owner: root
        group: root
        mode: "0644"
      notify:
        - Restart slurmd

    # Only rendered when slurm_enable_cgroup is truthy (defaults to false).
    - name: Deploy managed cgroup.conf
      ansible.builtin.template:
        src: ../../templates/cgroup.conf.j2
        dest: "{{ slurm_config_dir }}/cgroup.conf"
        owner: root
        group: root
        mode: "0644"
      when: slurm_enable_cgroup | default(false) | bool
      notify:
        - Restart slurmd

    - name: Deploy managed gres.conf on GPU nodes
      ansible.builtin.template:
        src: ../../templates/gres.conf.j2
        dest: "{{ slurm_config_dir }}/gres.conf"
        owner: root
        group: root
        mode: "0644"
      when: inventory_hostname in groups.get('slurm_gpu', [])
      notify:
        - Restart slurmd

    - name: Ensure munge is enabled and running
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: started

    # Run the notified restarts now rather than at the end of the play, so
    # munge is already using the deployed key before slurmd is started below.
    - name: Apply pending munge and slurmd restarts
      ansible.builtin.meta: flush_handlers

    - name: Ensure slurmd is enabled and running
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: started

  handlers:
    - name: Restart munge
      ansible.builtin.systemd:
        name: munge
        state: restarted

    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        state: restarted
|
||||
|
||||
|
||||
# Runs on every host in slurm_cluster: re-renders slurm.conf (and cgroup.conf
# when enabled) from the shared templates. No service is restarted or notified
# in this play.
- name: Deploy updated Slurm config to whole cluster and reconfigure controller
  hosts: slurm_cluster
  gather_facts: false
  become: true

  tasks:
    - name: Deploy managed slurm.conf to all nodes
      ansible.builtin.template:
        dest: "{{ slurm_config_dir }}/slurm.conf"
        src: ../../templates/slurm.conf.j2
        mode: "0644"
        owner: root
        group: root

    # Only rendered when slurm_enable_cgroup is truthy (defaults to false).
    - name: Deploy managed cgroup.conf to all nodes
      when: slurm_enable_cgroup | default(false) | bool
      ansible.builtin.template:
        dest: "{{ slurm_config_dir }}/cgroup.conf"
        src: ../../templates/cgroup.conf.j2
        mode: "0644"
        owner: root
        group: root
|
||||
|
||||
|
||||
# Runs on the Slurm controller: reloads and restarts slurmctld, asks Slurm to
# resume the target node and then polls until the node is reported healthy.
- name: Reconfigure Slurm and validate target node
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Reconfigure Slurm controller
      ansible.builtin.command:
        cmd: scontrol reconfigure
      changed_when: true

    - name: Restart Slurm controller after node reprovision
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted

    # Poll up to 15 times, 2 s apart, until scontrol ping exits 0.
    - name: Wait for Slurm controller after restart
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping_after_restart
      retries: 15
      delay: 2
      until: slurmctld_ping_after_restart.rc == 0
      changed_when: false

    # scontrol rejects State=RESUME with "Invalid node state specified" when
    # the node is not in a state that can be resumed (for example it already
    # registered as idle). That case is not an error for provisioning, so it
    # is tolerated here; the wait task below still verifies the node state.
    # Any other failure still fails the play.
    - name: Resume target node in Slurm
      ansible.builtin.command:
        cmd: scontrol update NodeName={{ target_node }} State=RESUME
      register: target_node_resume
      changed_when: target_node_resume.rc == 0
      failed_when:
        - target_node_resume.rc != 0
        - "'Invalid node state' not in (target_node_resume.stderr ~ target_node_resume.stdout)"

    # Poll up to 20 times, 3 s apart. The node counts as ready once both
    # commands succeed and the combined output contains none of "down",
    # "not_responding" or "idle*" (case-insensitive).
    - name: Wait until target node is visible and not down
      ansible.builtin.shell: |
        set -euo pipefail
        scontrol show node {{ target_node }}
        sinfo -N -n {{ target_node }}
      args:
        executable: /bin/bash
      register: target_node_state
      retries: 20
      delay: 3
      until:
        - target_node_state.rc == 0
        - "'down' not in target_node_state.stdout.lower()"
        - "'not_responding' not in target_node_state.stdout.lower()"
        - "'idle*' not in target_node_state.stdout.lower()"
      changed_when: false

    - name: Show target node state
      ansible.builtin.debug:
        var: target_node_state.stdout_lines
|
||||
@@ -0,0 +1,33 @@
|
||||
---
|
||||
# Read-only report, run on the Slurm controller: prints the sinfo, scontrol and
# squeue views of the node named by the target_node extra-var.
- name: Show Slurm node state
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # Print a usage hint when -e target_node=... was not supplied.
    - name: Require target_node
      ansible.builtin.fail:
        msg: "Use: ansible-playbook show-slurm-node.yml -e target_node=<hostname>"
      when: target_node is not defined

    # target_node is not checked against the inventory in this playbook and is
    # expanded inside a privileged shell script, so it is shell-quoted to keep
    # it a single literal argument. Every query is best-effort (|| true).
    - name: Show node state
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### sinfo"
        sinfo -N -n {{ target_node | quote }} || true

        echo
        echo "### scontrol"
        scontrol show node {{ target_node | quote }} || true

        echo
        echo "### jobs on node"
        squeue -w {{ target_node | quote }} || true
      args:
        executable: /bin/bash
      register: node_lifecycle_state
      changed_when: false

    - name: Print node lifecycle state
      ansible.builtin.debug:
        var: node_lifecycle_state.stdout_lines
|
||||
Reference in New Issue
Block a user