Add Slurm AI/HPC cluster platform project
lint / shell-yaml-ansible (push) Failing after 47s

This commit is contained in:
Mateusz Suski
2026-06-04 19:41:05 +00:00
parent e2624a7533
commit d300d490f5
49 changed files with 4777 additions and 0 deletions
@@ -0,0 +1,126 @@
---
# Pre-flight play: runs on the control node only and aborts the playbook
# unless target_node was supplied and names a host in the inventory.
- name: Validate target_node variable
  hosts: localhost
  gather_facts: false
  tasks:
    # Fails with the expected invocation when -e target_node=... is missing.
    - name: Require target_node
      when:
        - target_node is not defined
      ansible.builtin.fail:
        msg: >-
          Use: ansible-playbook decommission-slurm-node.yml
          -e target_node=<hostname> [-e decom_reason='reason']

    # Later plays use target_node as a host pattern, so it must be known.
    - name: Ensure target_node is in inventory
      when:
        - target_node not in groups['all']
      ansible.builtin.fail:
        msg: 'target_node={{ target_node }} is not in Ansible inventory'
# Runs on the controller: puts target_node into DRAIN and then polls squeue
# until nothing is listed for that node.
#
# Optional extra vars (defaults applied below):
#   decom_reason        reason text recorded on the node
#   decom_wait_retries  number of squeue polls before giving up
#   decom_wait_delay    seconds between polls
- name: Drain target node and wait for jobs to leave
  hosts: slurm_controller
  become: true
  gather_facts: false
  vars:
    decom_reason_effective: "{{ decom_reason | default('decommission by Ansible') }}"
    decom_wait_retries_effective: "{{ decom_wait_retries | default(120) }}"
    decom_wait_delay_effective: "{{ decom_wait_delay | default(10) }}"
  tasks:
    # Informational only: "|| true" keeps a failing query from aborting the play.
    - name: Show current target node state
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          sinfo -N -n {{ target_node }} || true
          scontrol show node {{ target_node }} || true
        executable: /bin/bash
      register: node_state_before
      changed_when: false

    - name: Print current target node state
      ansible.builtin.debug:
        var: node_state_before.stdout_lines

    # argv passes the reason as one argument, so spaces or quote characters
    # inside decom_reason cannot break command-line splitting.
    - name: Drain target node
      ansible.builtin.command:
        argv:
          - scontrol
          - update
          - "NodeName={{ target_node }}"
          - State=DRAIN
          - "Reason={{ decom_reason_effective }}"
      changed_when: true

    # A failed squeue call produces empty stdout; requiring rc == 0 as well
    # keeps an unreachable or erroring controller from being read as
    # "no jobs left". The task fails once the retries are exhausted.
    - name: Wait until no jobs are running on target node
      ansible.builtin.command:
        argv:
          - squeue
          - -h
          - -w
          - "{{ target_node }}"
      register: jobs_on_node
      retries: "{{ decom_wait_retries_effective | int }}"
      delay: "{{ decom_wait_delay_effective | int }}"
      until:
        - jobs_on_node.rc == 0
        - jobs_on_node.stdout | trim == ""
      changed_when: false

    # Informational only: prints the NodeName/State/Reason lines after draining.
    - name: Show drained node state
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          sinfo -N -n {{ target_node }} || true
          scontrol show node {{ target_node }} | grep -E "NodeName=|State=|Reason=" || true
        executable: /bin/bash
      register: node_state_drained
      changed_when: false

    - name: Print drained node state
      ansible.builtin.debug:
        var: node_state_drained.stdout_lines
# Runs on the node being decommissioned: stops and disables slurmd, then
# reports what systemd says about the unit.
- name: Stop Slurm worker service on target node
  hosts: "{{ target_node }}"
  gather_facts: false
  become: true
  tasks:
    # Only hosts listed in slurm_compute or slurm_gpu are touched; a missing
    # group is treated as empty.
    - name: Stop slurmd
      when: >-
        inventory_hostname in
        (groups.get('slurm_compute', []) + groups.get('slurm_gpu', []))
      ansible.builtin.systemd:
        name: slurmd
        enabled: false
        state: stopped

    # systemctl exits non-zero for disabled/inactive units, hence "|| true".
    - name: Show slurmd state
      ansible.builtin.shell:
        cmd: |
          systemctl is-enabled slurmd 2>/dev/null || true
          systemctl is-active slurmd 2>/dev/null || true
        executable: /bin/bash
      changed_when: false
      register: slurmd_state_after

    - name: Print slurmd state
      ansible.builtin.debug:
        var: slurmd_state_after.stdout_lines
# Final decommission step on the controller: record the node as DOWN with
# reason "decommissioned" and print the resulting state.
- name: Mark node down in Slurm controller
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    # The folded scalar below collapses into a single command line.
    - name: Mark target node DOWN after service stop
      ansible.builtin.command:
        cmd: >-
          scontrol update
          NodeName={{ target_node }}
          State=DOWN
          Reason="decommissioned"
      changed_when: true

    # Informational only: failures of sinfo/scontrol/grep are ignored.
    - name: Show final node state
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          sinfo -N -n {{ target_node }} || true
          scontrol show node {{ target_node }} | grep -E "NodeName=|State=|Reason=" || true
        executable: /bin/bash
      changed_when: false
      register: final_node_state

    - name: Print final node state
      ansible.builtin.debug:
        var: final_node_state.stdout_lines
@@ -0,0 +1,246 @@
---
# Pre-flight play: runs on the control node only and aborts the playbook
# unless target_node was supplied and names a host in the inventory.
- name: Validate target_node variable
  hosts: localhost
  gather_facts: false
  tasks:
    # Fails with the expected invocation when -e target_node=... is missing.
    - name: Require target_node
      when:
        - target_node is not defined
      ansible.builtin.fail:
        msg: >-
          Use: ansible-playbook provision-slurm-node.yml
          -e target_node=<hostname>

    # Later plays use target_node as a host pattern, so it must be known.
    - name: Ensure target_node is in inventory
      when:
        - target_node not in groups['all']
      ansible.builtin.fail:
        msg: 'target_node={{ target_node }} is not in Ansible inventory'
# Runs on the node being provisioned: checks its group membership, installs
# the Munge/Slurm packages via apt and creates the directories used by both.
# slurm_config_dir is expected to come from inventory/group vars.
- name: Prepare OS, packages and Slurm directories on target node
  hosts: "{{ target_node }}"
  gather_facts: true
  become: true
  tasks:
    # Refuse to continue for hosts that are in neither worker group; a missing
    # group is treated as empty.
    - name: Ensure target is a Slurm worker or GPU node
      when: >-
        inventory_hostname not in
        (groups.get('slurm_compute', []) + groups.get('slurm_gpu', []))
      ansible.builtin.fail:
        msg: '{{ inventory_hostname }} must be in slurm_compute or slurm_gpu group'

    - name: Install Slurm worker packages
      ansible.builtin.apt:
        state: present
        update_cache: true
        name:
          - munge
          - libmunge2
          - slurm-client
          - slurmd
          - slurm-wlm-basic-plugins
          - slurm-wlm-plugins
          - slurm-wlm-mysql-plugin

    - name: Ensure Slurm config directory exists
      ansible.builtin.file:
        state: directory
        path: "{{ slurm_config_dir }}"
        owner: root
        group: root
        mode: "0755"

    - name: Ensure Slurm log directory exists
      ansible.builtin.file:
        state: directory
        path: /var/log/slurm
        owner: slurm
        group: slurm
        mode: "0755"

    - name: Ensure slurmd spool directory exists
      ansible.builtin.file:
        state: directory
        path: /var/spool/slurmd
        owner: slurm
        group: slurm
        mode: "0755"

    # One pass per Munge directory, each with its own permission bits.
    - name: Ensure munge dirs exist
      ansible.builtin.file:
        state: directory
        path: "{{ item.path }}"
        owner: munge
        group: munge
        mode: "{{ item.mode }}"
      loop:
        - path: /etc/munge
          mode: "0700"
        - path: /var/log/munge
          mode: "0755"
        - path: /var/lib/munge
          mode: "0711"
        - path: /run/munge
          mode: "0755"
# Runs on the controller: reads /etc/munge/munge.key and keeps it
# (base64-encoded, as returned by slurp) in the host fact
# cluster_munge_key_b64 so a later play can read it through hostvars.
- name: Deploy Munge key from controller to target node
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # no_log keeps the shared secret out of task output and logs, including
    # verbose (-v) runs where registered results are printed.
    - name: Read controller munge.key
      ansible.builtin.slurp:
        src: /etc/munge/munge.key
      register: controller_munge_key_raw
      no_log: true

    # set_fact results echo the stored value, so this task is silenced too.
    - name: Store controller Munge key as fact
      ansible.builtin.set_fact:
        cluster_munge_key_b64: "{{ controller_munge_key_raw.content }}"
      no_log: true
# Runs on the node being provisioned: installs the controller's Munge key,
# renders the managed Slurm configuration files and makes sure munge and
# slurmd are enabled and running.
#
# Expects cluster_munge_key_b64 to have been set on the first host of the
# slurm_controller group, and slurm_config_dir to be defined for this host.
- name: Configure target node with Munge and Slurm files
  hosts: "{{ target_node }}"
  become: true
  gather_facts: false
  vars:
    controller_host: "{{ groups['slurm_controller'][0] }}"
  tasks:
    # no_log keeps the decoded key out of task output and --diff listings.
    # NOTE(review): b64decode yields text; if the key contains arbitrary
    # binary bytes, verify that the file written here is byte-identical to
    # the controller's copy (e.g. compare checksums).
    - name: Deploy shared munge.key
      ansible.builtin.copy:
        dest: /etc/munge/munge.key
        content: "{{ hostvars[controller_host].cluster_munge_key_b64 | b64decode }}"
        owner: munge
        group: munge
        mode: "0400"
      no_log: true
      notify:
        - Restart munge

    - name: Deploy managed slurm.conf
      ansible.builtin.template:
        src: ../../templates/slurm.conf.j2
        dest: "{{ slurm_config_dir }}/slurm.conf"
        owner: root
        group: root
        mode: "0644"
      notify:
        - Restart slurmd

    # Only rendered when slurm_enable_cgroup is truthy (defaults to false).
    - name: Deploy managed cgroup.conf
      ansible.builtin.template:
        src: ../../templates/cgroup.conf.j2
        dest: "{{ slurm_config_dir }}/cgroup.conf"
        owner: root
        group: root
        mode: "0644"
      when: slurm_enable_cgroup | default(false) | bool
      notify:
        - Restart slurmd

    - name: Deploy managed gres.conf on GPU nodes
      ansible.builtin.template:
        src: ../../templates/gres.conf.j2
        dest: "{{ slurm_config_dir }}/gres.conf"
        owner: root
        group: root
        mode: "0644"
      when: inventory_hostname in groups.get('slurm_gpu', [])
      notify:
        - Restart slurmd

    # Run pending restarts now (munge first, then slurmd, in handler order)
    # so slurmd is not started below while munge still runs with the key it
    # had before this play.
    - name: Apply pending Munge and slurmd restarts before service checks
      ansible.builtin.meta: flush_handlers

    - name: Ensure munge is enabled and running
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: started

    - name: Ensure slurmd is enabled and running
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: started

  handlers:
    - name: Restart munge
      ansible.builtin.systemd:
        name: munge
        state: restarted

    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        state: restarted
# Renders the managed Slurm configuration on every host of the slurm_cluster
# group. No handlers are notified here; the next play runs
# "scontrol reconfigure" on the controller.
- name: Deploy updated Slurm config to whole cluster and reconfigure controller
  hosts: slurm_cluster
  gather_facts: false
  become: true
  tasks:
    - name: Deploy managed slurm.conf to all nodes
      ansible.builtin.template:
        dest: '{{ slurm_config_dir }}/slurm.conf'
        src: ../../templates/slurm.conf.j2
        mode: '0644'
        owner: root
        group: root

    # Only rendered when slurm_enable_cgroup is truthy (defaults to false).
    - name: Deploy managed cgroup.conf to all nodes
      when:
        - slurm_enable_cgroup | default(false) | bool
      ansible.builtin.template:
        dest: '{{ slurm_config_dir }}/cgroup.conf'
        src: ../../templates/cgroup.conf.j2
        mode: '0644'
        owner: root
        group: root
# Runs on the controller after the configuration was distributed: reloads and
# restarts slurmctld, returns target_node to service and waits until the node
# no longer reports a down / not-responding state.
- name: Reconfigure Slurm and validate target node
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    - name: Reconfigure Slurm controller
      ansible.builtin.command:
        cmd: scontrol reconfigure
      changed_when: true

    - name: Restart Slurm controller after node reprovision
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted

    # Poll until the restarted controller answers (15 tries, 2 s apart).
    - name: Wait for Slurm controller after restart
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping_after_restart
      retries: 15
      delay: 2
      until: slurmctld_ping_after_restart.rc == 0
      changed_when: false

    # RESUME is only meaningful for a node that is currently DOWN or DRAINED.
    # For a node in any other state (for example a freshly registered one)
    # scontrol is expected to reject the request with "Invalid node state
    # specified"; that rejection is tolerated so provisioning can continue,
    # while every other error still fails the task.
    # NOTE(review): confirm the exact message against the deployed Slurm version.
    - name: Resume target node in Slurm
      ansible.builtin.command:
        cmd: scontrol update NodeName={{ target_node }} State=RESUME
      register: target_node_resume
      changed_when: target_node_resume.rc == 0
      failed_when:
        - target_node_resume.rc != 0
        - >-
          'Invalid node state' not in
          ((target_node_resume.stderr | default('')) ~ (target_node_resume.stdout | default('')))

    # Poll (20 tries, 3 s apart) until both commands succeed and their
    # combined output mentions none of: "down", "not_responding", "idle*".
    - name: Wait until target node is visible and not down
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          scontrol show node {{ target_node }}
          sinfo -N -n {{ target_node }}
        executable: /bin/bash
      register: target_node_state
      retries: 20
      delay: 3
      until:
        - target_node_state.rc == 0
        - "'down' not in target_node_state.stdout.lower()"
        - "'not_responding' not in target_node_state.stdout.lower()"
        - "'idle*' not in target_node_state.stdout.lower()"
      changed_when: false

    - name: Show target node state
      ansible.builtin.debug:
        var: target_node_state.stdout_lines
@@ -0,0 +1,33 @@
---
# Read-only helper play: prints the sinfo, scontrol and squeue views of
# target_node as seen from the controller.
- name: Show Slurm node state
  hosts: slurm_controller
  gather_facts: false
  become: true
  tasks:
    # Fails with the expected invocation when -e target_node=... is missing.
    - name: Require target_node
      when:
        - target_node is not defined
      ansible.builtin.fail:
        msg: 'Use: ansible-playbook show-slurm-node.yml -e target_node=<hostname>'

    # Each query is best-effort ("|| true"), so one failing command does not
    # hide the output of the others.
    - name: Show node state
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          echo "### sinfo"
          sinfo -N -n {{ target_node }} || true
          echo
          echo "### scontrol"
          scontrol show node {{ target_node }} || true
          echo
          echo "### jobs on node"
          squeue -w {{ target_node }} || true
        executable: /bin/bash
      changed_when: false
      register: node_lifecycle_state

    - name: Print node lifecycle state
      ansible.builtin.debug:
        var: node_lifecycle_state.stdout_lines