This commit is contained in:
@@ -0,0 +1,133 @@
|
||||
---
|
||||
# Reads /etc/munge/munge.key on the Slurm controller and keeps it, still
# base64-encoded, as the host fact `cluster_munge_key_b64` on that host.
- name: Read Munge key from Slurm controller
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Check controller munge.key exists
      ansible.builtin.stat:
        path: /etc/munge/munge.key
      register: controller_munge_key

    # Stop here rather than continue without a source key.
    - name: Fail if controller munge.key is missing
      ansible.builtin.fail:
        msg: "/etc/munge/munge.key is missing on controller. Do not continue."
      when: not controller_munge_key.stat.exists

    # slurp returns the file content base64-encoded in `.content`.
    # no_log keeps the secret key out of verbose output and callback logs.
    - name: Read controller munge.key
      ansible.builtin.slurp:
        src: /etc/munge/munge.key
      register: controller_munge_key_raw
      no_log: true

    # The fact holds the same secret, so it is hidden from logs as well.
    - name: Store controller Munge key as fact
      ansible.builtin.set_fact:
        cluster_munge_key_b64: "{{ controller_munge_key_raw.content }}"
      no_log: true
|
||||
|
||||
|
||||
# Installs Munge on every host in `slurm_cluster`, creates its user, group and
# directories, and writes the key taken from the first host of the
# `slurm_controller` group. Munge is restarted only when the key file changes.
- name: Deploy controller Munge key to all Slurm nodes
  hosts: slurm_cluster
  become: true
  gather_facts: false

  vars:
    # Host whose `cluster_munge_key_b64` fact supplies the shared key.
    controller_host: "{{ groups['slurm_controller'][0] }}"

  tasks:
    - name: Ensure munge package is installed
      ansible.builtin.apt:
        name:
          - munge
          - libmunge2
        state: present
        update_cache: true

    - name: Ensure munge group exists
      ansible.builtin.group:
        name: munge
        system: true
        state: present

    - name: Ensure munge user exists
      ansible.builtin.user:
        name: munge
        group: munge
        system: true
        shell: /usr/sbin/nologin
        home: /nonexistent
        create_home: false
        state: present

    - name: Ensure /etc/munge exists
      ansible.builtin.file:
        path: /etc/munge
        state: directory
        owner: munge
        group: munge
        mode: "0700"

    # no_log: the task arguments contain the decoded secret key, which would
    # otherwise be visible in verbose or --diff output.
    # NOTE(review): b64decode produces text and Munge keys are usually random
    # bytes; confirm the deployed file is byte-identical to the controller's
    # key (e.g. compare checksums) on the Ansible version in use.
    - name: Deploy shared munge.key from controller
      ansible.builtin.copy:
        dest: /etc/munge/munge.key
        content: "{{ hostvars[controller_host].cluster_munge_key_b64 | b64decode }}"
        owner: munge
        group: munge
        mode: "0400"
      no_log: true
      notify:
        - Restart munge

    - name: Ensure /var/log/munge exists
      ansible.builtin.file:
        path: /var/log/munge
        state: directory
        owner: munge
        group: munge
        mode: "0755"

    - name: Ensure /var/lib/munge exists
      ansible.builtin.file:
        path: /var/lib/munge
        state: directory
        owner: munge
        group: munge
        mode: "0711"

    - name: Ensure /run/munge exists
      ansible.builtin.file:
        path: /run/munge
        state: directory
        owner: munge
        group: munge
        mode: "0755"

    - name: Ensure munge is enabled and running
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: started

  handlers:
    # Notified by the key deployment task when the key file changed.
    - name: Restart munge
      ansible.builtin.systemd:
        name: munge
        state: restarted
|
||||
|
||||
|
||||
# Smoke test on every host in `slurm_cluster`: encode a credential and decode
# it again on the same host, then print the result.
- name: Validate Munge locally on all nodes
  hosts: slurm_cluster
  gather_facts: false
  become: true

  tasks:
    # `munge -n` output is piped into `unmunge`; with pipefail a failure on
    # either side of the pipe fails the task. Read-only, so never "changed".
    - name: Test local munge encode/decode
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          munge -n | unmunge
        executable: /bin/bash
      changed_when: false
      register: munge_local_test

    - name: Show local Munge validation
      ansible.builtin.debug:
        var: munge_local_test.stdout_lines
|
||||
@@ -0,0 +1,132 @@
|
||||
---
|
||||
# Creates the Slurm configuration, log and spool directories on every host in
# `slurm_cluster`. Spool directories are created per role: slurmctld on
# controller hosts, slurmd on compute and GPU hosts.
# `slurm_config_dir` is not defined in this file; it is expected to come from
# inventory or group variables.
- name: Prepare Slurm config directories and logs
  hosts: slurm_cluster
  become: true
  gather_facts: false

  tasks:
    - name: Ensure Slurm config directory exists
      ansible.builtin.file:
        path: "{{ slurm_config_dir }}"
        state: directory
        owner: root
        group: root
        mode: "0755"

    # NOTE(review): requires the `slurm` user and group to exist already;
    # nothing in this play creates them.
    - name: Ensure Slurm log directory exists
      ansible.builtin.file:
        path: /var/log/slurm
        state: directory
        owner: slurm
        group: slurm
        mode: "0755"

    # Role checks use `group_names` (the groups this host belongs to) so an
    # inventory that does not define one of the groups does not raise a
    # templating error.
    - name: Ensure slurmctld spool directory exists on controller
      ansible.builtin.file:
        path: /var/spool/slurmctld
        state: directory
        owner: slurm
        group: slurm
        mode: "0755"
      when: "'slurm_controller' in group_names"

    - name: Ensure slurmd spool directory exists on workers
      ansible.builtin.file:
        path: /var/spool/slurmd
        state: directory
        owner: slurm
        group: slurm
        mode: "0755"
      when: "'slurm_compute' in group_names or 'slurm_gpu' in group_names"
|
||||
|
||||
|
||||
# Renders the managed slurm.conf, cgroup.conf and gres.conf templates into
# `slurm_config_dir` on every host in `slurm_cluster`. A one-time backup of a
# pre-existing slurm.conf is kept next to it. Changed files trigger a
# controller reconfigure and a slurmd restart through handlers.
# `slurm_config_dir` is not defined in this file; it is expected to come from
# inventory or group variables.
- name: Deploy Slurm config files
  hosts: slurm_cluster
  become: true
  gather_facts: false

  tasks:
    # A remote_src copy fails when the source file is missing, so check first;
    # a fresh node has no slurm.conf to back up.
    - name: Check whether a slurm.conf already exists
      ansible.builtin.stat:
        path: "{{ slurm_config_dir }}/slurm.conf"
      register: existing_slurm_conf

    # force: false leaves an existing backup alone, so only the first
    # pre-managed version is preserved.
    - name: Backup current slurm.conf before managed deployment
      ansible.builtin.copy:
        src: "{{ slurm_config_dir }}/slurm.conf"
        dest: "{{ slurm_config_dir }}/slurm.conf.pre-ansible-managed"
        remote_src: true
        owner: root
        group: root
        mode: "0644"
        force: false
      when: existing_slurm_conf.stat.exists

    - name: Deploy managed slurm.conf
      ansible.builtin.template:
        src: ../../templates/slurm.conf.j2
        dest: "{{ slurm_config_dir }}/slurm.conf"
        owner: root
        group: root
        mode: "0644"
      notify:
        - Reconfigure slurmctld
        - Restart slurmd

    # Opt-in: skipped unless slurm_enable_cgroup is set to a true value.
    - name: Deploy managed cgroup.conf
      ansible.builtin.template:
        src: ../../templates/cgroup.conf.j2
        dest: "{{ slurm_config_dir }}/cgroup.conf"
        owner: root
        group: root
        mode: "0644"
      when: slurm_enable_cgroup | default(false) | bool
      notify:
        - Reconfigure slurmctld
        - Restart slurmd

    # Role checks use `group_names` (the groups this host belongs to) so an
    # inventory without a `slurm_gpu` or `slurm_compute` group does not raise
    # a templating error.
    - name: Deploy managed gres.conf only on GPU nodes
      ansible.builtin.template:
        src: ../../templates/gres.conf.j2
        dest: "{{ slurm_config_dir }}/gres.conf"
        owner: root
        group: root
        mode: "0644"
      when: "'slurm_gpu' in group_names"
      notify:
        - Reconfigure slurmctld
        - Restart slurmd

  handlers:
    # Runs only on controller hosts that were notified.
    - name: Reconfigure slurmctld
      ansible.builtin.command:
        cmd: scontrol reconfigure
      when: "'slurm_controller' in group_names"
      changed_when: true

    # Runs only on compute and GPU hosts that were notified.
    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        state: restarted
      when: "'slurm_compute' in group_names or 'slurm_gpu' in group_names"
|
||||
|
||||
|
||||
# Runs on the controller group: issues a reconfigure, then runs a set of
# status commands and prints what they returned.
- name: Validate Slurm after config deployment
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    # Always reported as changed, because the command is issued on every run.
    - name: Reconfigure controller
      ansible.builtin.command:
        cmd: scontrol reconfigure
      changed_when: true

    # `set -e` stops at the first failing command and fails the task.
    # The commands only query state, hence changed_when: false.
    - name: Validate cluster state
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          scontrol ping
          sinfo
          scontrol show nodes
        executable: /bin/bash
      changed_when: false
      register: slurm_config_validation

    - name: Show validation output
      ansible.builtin.debug:
        var: slurm_config_validation.stdout_lines
|
||||
@@ -0,0 +1,103 @@
|
||||
---
|
||||
# Restarts munge and then slurmctld on the controller group, and waits until
# `scontrol ping` exits successfully before the playbook continues.
- name: Restart Slurm controller safely
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    - name: Restart munge on controller
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: restarted

    - name: Restart slurmctld on controller
      ansible.builtin.systemd:
        name: slurmctld
        enabled: true
        state: restarted

    # Polls up to 15 times, 2 seconds apart, until the command's exit code
    # is 0.
    - name: Wait for slurmctld to answer
      ansible.builtin.command:
        cmd: scontrol ping
      register: scontrol_ping
      until: scontrol_ping.rc == 0
      retries: 15
      delay: 2
      changed_when: false

    - name: Show controller ping
      ansible.builtin.debug:
        var: scontrol_ping.stdout_lines
|
||||
|
||||
|
||||
# Restarts munge and slurmd on compute and GPU hosts. `serial: 1` handles one
# host at a time, and each host must pass both wait checks before the next
# one starts.
- name: Restart Slurm workers safely one by one
  hosts: slurm_compute:slurm_gpu
  serial: 1
  gather_facts: false
  become: true

  tasks:
    - name: Restart munge on worker
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: restarted

    - name: Restart slurmd on worker
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: restarted

    # Polls up to 15 times, 2 seconds apart, until systemctl prints "active".
    - name: Wait for slurmd to be active
      ansible.builtin.command:
        cmd: systemctl is-active slurmd
      register: slurmd_active
      until: slurmd_active.stdout == "active"
      retries: 15
      delay: 2
      changed_when: false

    # Runs on the first controller host, not on the worker itself, and looks
    # the worker up by its inventory hostname.
    - name: Wait until this node is visible in Slurm
      ansible.builtin.command:
        cmd: scontrol show node {{ inventory_hostname }}
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: node_visible
      until: node_visible.rc == 0
      retries: 15
      delay: 2
      changed_when: false
|
||||
|
||||
|
||||
# Final check on the controller group: prints a labelled report of controller
# ping, sinfo, node details and partition details.
- name: Validate Slurm after restart
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    # `set -e` stops the report at the first failing command and fails the
    # task. The commands only query state, hence changed_when: false.
    - name: Validate Slurm cluster state
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          echo "### scontrol ping"
          scontrol ping

          echo
          echo "### sinfo"
          sinfo

          echo
          echo "### nodes"
          scontrol show nodes

          echo
          echo "### partitions"
          scontrol show partitions
        executable: /bin/bash
      changed_when: false
      register: slurm_validation

    - name: Show Slurm validation
      ansible.builtin.debug:
        var: slurm_validation.stdout_lines
|
||||
Reference in New Issue
Block a user