Add Slurm AI/HPC cluster platform project
lint / shell-yaml-ansible (push) Failing after 47s

This commit is contained in:
Mateusz Suski
2026-06-04 19:41:05 +00:00
parent e2624a7533
commit d300d490f5
49 changed files with 4777 additions and 0 deletions
@@ -0,0 +1,133 @@
---
# Play 1: read the Munge key that already exists on the controller and keep
# it (still base64-encoded) as a host fact so a later play can distribute it.
# The key is a shared secret, so tasks that carry it are marked no_log.
- name: Read Munge key from Slurm controller
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    - name: Check controller munge.key exists
      ansible.builtin.stat:
        path: /etc/munge/munge.key
      register: controller_munge_key

    # Stop here instead of letting later plays run without a source key.
    - name: Fail if controller munge.key is missing
      ansible.builtin.fail:
        msg: "/etc/munge/munge.key is missing on controller. Do not continue."
      when: not controller_munge_key.stat.exists

    # slurp returns the file content base64-encoded in `.content`.
    - name: Read controller munge.key
      ansible.builtin.slurp:
        src: /etc/munge/munge.key
      register: controller_munge_key_raw
      # Keep the secret out of task output, callbacks and logs.
      no_log: true

    # Stored per controller host; consumers read it through hostvars.
    - name: Store controller Munge key as fact
      ansible.builtin.set_fact:
        cluster_munge_key_b64: "{{ controller_munge_key_raw.content }}"
      no_log: true
# Play 2: install Munge on every cluster node and give each node a
# byte-identical copy of the key read from the first slurm_controller host.
- name: Deploy controller Munge key to all Slurm nodes
  hosts: slurm_cluster
  become: true
  gather_facts: false
  vars:
    # The first host of the slurm_controller group is the key source.
    controller_host: "{{ groups['slurm_controller'][0] }}"
  tasks:
    # Fail with a clear message when the fact was never set, e.g. because the
    # controller was excluded from this run.
    - name: Ensure controller Munge key fact is available
      ansible.builtin.assert:
        that:
          - hostvars[controller_host].cluster_munge_key_b64 is defined
          - hostvars[controller_host].cluster_munge_key_b64 | length > 0
        fail_msg: >-
          No Munge key was read from {{ controller_host }}; the controller
          play must run in the same invocation before the key is deployed.
        quiet: true

    - name: Ensure munge package is installed
      ansible.builtin.apt:
        name:
          - munge
          - libmunge2
        state: present
        update_cache: true

    - name: Ensure munge group exists
      ansible.builtin.group:
        name: munge
        system: true
        state: present

    - name: Ensure munge user exists
      ansible.builtin.user:
        name: munge
        group: munge
        system: true
        shell: /usr/sbin/nologin
        home: /nonexistent
        create_home: false
        state: present

    - name: Ensure /etc/munge exists
      ansible.builtin.file:
        path: /etc/munge
        state: directory
        owner: munge
        group: munge
        mode: "0700"

    # The key is arbitrary binary data. Passing it through the Jinja
    # `b64decode` filter turns it into text, which is not guaranteed to
    # survive byte-for-byte, so the base64 text is handed to the target on
    # stdin and decoded there. The file is only replaced (atomically, via a
    # temp file in the same directory) when its content actually differs.
    - name: Deploy shared munge.key from controller
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          umask 077
          new_key="$(mktemp /etc/munge/.munge.key.XXXXXX)"
          trap 'rm -f "$new_key"' EXIT
          base64 -d > "$new_key"
          if cmp -s "$new_key" /etc/munge/munge.key; then
            exit 0
          fi
          chown munge:munge "$new_key"
          chmod 0400 "$new_key"
          mv -f "$new_key" /etc/munge/munge.key
          echo "munge.key updated"
        executable: /bin/bash
        stdin: "{{ hostvars[controller_host].cluster_munge_key_b64 }}"
      register: munge_key_deploy
      changed_when: "'munge.key updated' in munge_key_deploy.stdout"
      # The task arguments contain the secret key.
      no_log: true
      notify:
        - Restart munge

    # Ownership and mode are enforced even when the content was already
    # correct and the task above left the file untouched.
    - name: Enforce munge.key ownership and mode
      ansible.builtin.file:
        path: /etc/munge/munge.key
        state: file
        owner: munge
        group: munge
        mode: "0400"
      notify:
        - Restart munge

    - name: Ensure /var/log/munge exists
      ansible.builtin.file:
        path: /var/log/munge
        state: directory
        owner: munge
        group: munge
        mode: "0755"

    - name: Ensure /var/lib/munge exists
      ansible.builtin.file:
        path: /var/lib/munge
        state: directory
        owner: munge
        group: munge
        mode: "0711"

    - name: Ensure /run/munge exists
      ansible.builtin.file:
        path: /run/munge
        state: directory
        owner: munge
        group: munge
        mode: "0755"

    - name: Ensure munge is enabled and running
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: started

  handlers:
    # Notified only when the key file changed; runs at the end of the play.
    - name: Restart munge
      ansible.builtin.systemd:
        name: munge
        state: restarted
# Play 3: on every cluster node, create a Munge credential and decode it
# again on the same host, then print what unmunge reported.
- name: Validate Munge locally on all nodes
  gather_facts: false
  become: true
  hosts: slurm_cluster
  tasks:
    # pipefail makes the task fail if either side of the pipe fails.
    - name: Test local munge encode/decode
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          munge -n | unmunge
        executable: /bin/bash
      # Read-only check: never report a change.
      changed_when: false
      register: munge_local_test

    - name: Show local Munge validation
      ansible.builtin.debug:
        var: munge_local_test.stdout_lines
@@ -0,0 +1,132 @@
---
# Play 1: create the directories Slurm needs before any config is written.
# Role-specific spool directories are selected by inventory group membership.
# NOTE(review): the `slurm` user and group are assumed to exist already;
# nothing in this play creates them.
- name: Prepare Slurm config directories and logs
  hosts: slurm_cluster
  become: true
  gather_facts: false
  tasks:
    - name: Ensure Slurm config directory exists
      ansible.builtin.file:
        path: "{{ slurm_config_dir }}"
        state: directory
        owner: root
        group: root
        mode: "0755"

    - name: Ensure Slurm log directory exists
      ansible.builtin.file:
        path: /var/log/slurm
        state: directory
        owner: slurm
        group: slurm
        mode: "0755"

    # `group_names` lists the groups of the current host, so the check stays
    # valid even when an inventory does not define one of these groups
    # (indexing `groups[...]` would raise an undefined-variable error).
    - name: Ensure slurmctld spool directory exists on controller
      ansible.builtin.file:
        path: /var/spool/slurmctld
        state: directory
        owner: slurm
        group: slurm
        mode: "0755"
      when: "'slurm_controller' in group_names"

    - name: Ensure slurmd spool directory exists on workers
      ansible.builtin.file:
        path: /var/spool/slurmd
        state: directory
        owner: slurm
        group: slurm
        mode: "0755"
      when: "'slurm_compute' in group_names or 'slurm_gpu' in group_names"
# Play 2: render the managed Slurm configuration onto every cluster node.
# A one-time backup of any pre-existing slurm.conf is kept next to it.
- name: Deploy Slurm config files
  hosts: slurm_cluster
  become: true
  gather_facts: false
  tasks:
    # A node without a slurm.conf has nothing to back up; copying a missing
    # remote source would fail the play, so look first.
    - name: Check whether a slurm.conf already exists
      ansible.builtin.stat:
        path: "{{ slurm_config_dir }}/slurm.conf"
      register: existing_slurm_conf

    # force: false keeps the first backup and never overwrites it later.
    - name: Backup current slurm.conf before managed deployment
      ansible.builtin.copy:
        src: "{{ slurm_config_dir }}/slurm.conf"
        dest: "{{ slurm_config_dir }}/slurm.conf.pre-ansible-managed"
        remote_src: true
        owner: root
        group: root
        mode: "0644"
        force: false
      when: existing_slurm_conf.stat.exists

    - name: Deploy managed slurm.conf
      ansible.builtin.template:
        src: ../../templates/slurm.conf.j2
        dest: "{{ slurm_config_dir }}/slurm.conf"
        owner: root
        group: root
        mode: "0644"
      notify:
        - Reconfigure slurmctld
        - Restart slurmd

    # Opt-in: only rendered when slurm_enable_cgroup is truthy.
    - name: Deploy managed cgroup.conf
      ansible.builtin.template:
        src: ../../templates/cgroup.conf.j2
        dest: "{{ slurm_config_dir }}/cgroup.conf"
        owner: root
        group: root
        mode: "0644"
      when: slurm_enable_cgroup | default(false) | bool
      notify:
        - Reconfigure slurmctld
        - Restart slurmd

    # `group_names` is used instead of `groups['slurm_gpu']` so inventories
    # without a GPU group skip this task rather than erroring.
    - name: Deploy managed gres.conf only on GPU nodes
      ansible.builtin.template:
        src: ../../templates/gres.conf.j2
        dest: "{{ slurm_config_dir }}/gres.conf"
        owner: root
        group: root
        mode: "0644"
      when: "'slurm_gpu' in group_names"
      notify:
        - Reconfigure slurmctld
        - Restart slurmd

  # Both handlers are notified on every host; their `when` clauses narrow
  # each one to the hosts where the service is expected to run.
  handlers:
    - name: Reconfigure slurmctld
      ansible.builtin.command:
        cmd: scontrol reconfigure
      when: "'slurm_controller' in group_names"
      changed_when: true

    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        state: restarted
      when: "'slurm_compute' in group_names or 'slurm_gpu' in group_names"
# Play 3: from the controller, re-read the configuration once more and then
# print controller reachability, partition summary and node details.
- name: Validate Slurm after config deployment
  gather_facts: false
  become: true
  hosts: slurm_controller
  tasks:
    # Runs on every invocation and always reports "changed".
    - name: Reconfigure controller
      changed_when: true
      ansible.builtin.command:
        cmd: scontrol reconfigure

    # `set -e` aborts the script at the first command that fails.
    - name: Validate cluster state
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          scontrol ping
          sinfo
          scontrol show nodes
        executable: /bin/bash
      changed_when: false
      register: slurm_config_validation

    - name: Show validation output
      ansible.builtin.debug:
        var: slurm_config_validation.stdout_lines
@@ -0,0 +1,103 @@
---
# Play 1: restart munge and then slurmctld on the controller, and wait until
# `scontrol ping` succeeds before the worker play starts.
- name: Restart Slurm controller safely
  gather_facts: false
  become: true
  hosts: slurm_controller
  tasks:
    - name: Restart munge on controller
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: restarted

    - name: Restart slurmctld on controller
      ansible.builtin.systemd:
        name: slurmctld
        enabled: true
        state: restarted

    # Polls up to 15 times, 2 seconds apart, until the command exits 0.
    - name: Wait for slurmctld to answer
      ansible.builtin.command:
        cmd: scontrol ping
      changed_when: false
      register: scontrol_ping
      until: scontrol_ping.rc == 0
      delay: 2
      retries: 15

    - name: Show controller ping
      ansible.builtin.debug:
        var: scontrol_ping.stdout_lines
# Play 2: rolling restart of the workers. `serial: 1` finishes all tasks on
# one host before the next host is touched.
- name: Restart Slurm workers safely one by one
  gather_facts: false
  become: true
  serial: 1
  hosts: slurm_compute:slurm_gpu
  tasks:
    - name: Restart munge on worker
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: restarted

    - name: Restart slurmd on worker
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: restarted

    # Polls up to 15 times, 2 seconds apart, until systemd reports "active".
    - name: Wait for slurmd to be active
      ansible.builtin.command:
        cmd: systemctl is-active slurmd
      changed_when: false
      register: slurmd_active
      until: slurmd_active.stdout == "active"
      delay: 2
      retries: 15

    # Executed on the first controller host. Succeeds as soon as scontrol
    # exits 0 for this node name; the reported node state is not inspected.
    # NOTE(review): assumes the inventory hostname equals the Slurm node
    # name - confirm against slurm.conf.
    - name: Wait until this node is visible in Slurm
      ansible.builtin.command:
        cmd: scontrol show node {{ inventory_hostname }}
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      changed_when: false
      register: node_visible
      until: node_visible.rc == 0
      delay: 2
      retries: 15
# Play 3: collect a labelled status report on the controller (ping, sinfo,
# nodes, partitions) and print it.
- name: Validate Slurm after restart
  gather_facts: false
  become: true
  hosts: slurm_controller
  tasks:
    # Each section is introduced by a "###" header line; `set -e` stops the
    # report at the first command that fails.
    - name: Validate Slurm cluster state
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          echo "### scontrol ping"
          scontrol ping
          echo
          echo "### sinfo"
          sinfo
          echo
          echo "### nodes"
          scontrol show nodes
          echo
          echo "### partitions"
          scontrol show partitions
        executable: /bin/bash
      changed_when: false
      register: slurm_validation

    - name: Show Slurm validation
      ansible.builtin.debug:
        var: slurm_validation.stdout_lines