247 lines
6.6 KiB
YAML
247 lines
6.6 KiB
YAML
---
|
|
# Play 1: pre-flight check, executed on localhost only.
# Stops the run unless `target_node` was supplied (-e target_node=<hostname>)
# and names a host that exists in the inventory.
- name: Validate target_node variable
  gather_facts: false
  hosts: localhost

  tasks:
    # Later plays use target_node as their host pattern, so it must be set.
    - name: Require target_node
      when: target_node is not defined
      ansible.builtin.fail:
        msg: >-
          Use: ansible-playbook provision-slurm-node.yml -e target_node=<hostname>

    # Reject a value that is not one of the inventory host names.
    - name: Ensure target_node is in inventory
      when: target_node not in groups['all']
      ansible.builtin.fail:
        msg: 'target_node={{ target_node }} is not in Ansible inventory'
|
|
|
|
|
|
# Play 2: base preparation of the node being provisioned.
# Checks group membership, installs the Slurm/Munge packages and creates the
# directories those services use.
- name: Prepare OS, packages and Slurm directories on target node
  hosts: "{{ target_node }}"
  gather_facts: true
  become: true

  tasks:
    # Both conditions are AND-ed: the task fails only when the host is in
    # neither slurm_compute nor slurm_gpu.
    - name: Ensure target is a Slurm worker or GPU node
      when:
        - inventory_hostname not in groups.get('slurm_compute', [])
        - inventory_hostname not in groups.get('slurm_gpu', [])
      ansible.builtin.fail:
        msg: '{{ inventory_hostname }} must be in slurm_compute or slurm_gpu group'

    # Refreshes the apt cache, then makes sure every package is installed.
    - name: Install Slurm worker packages
      ansible.builtin.apt:
        update_cache: true
        state: present
        name:
          - munge
          - libmunge2
          - slurm-client
          - slurmd
          - slurm-wlm-basic-plugins
          - slurm-wlm-plugins
          - slurm-wlm-mysql-plugin

    # Root-owned directory that receives slurm.conf and related files.
    - name: Ensure Slurm config directory exists
      ansible.builtin.file:
        state: directory
        path: "{{ slurm_config_dir }}"
        owner: root
        group: root
        mode: '0755'

    - name: Ensure Slurm log directory exists
      ansible.builtin.file:
        state: directory
        path: /var/log/slurm
        owner: slurm
        group: slurm
        mode: '0755'

    - name: Ensure slurmd spool directory exists
      ansible.builtin.file:
        state: directory
        path: /var/spool/slurmd
        owner: slurm
        group: slurm
        mode: '0755'

    # One pass per directory; each entry carries its own permission bits.
    - name: Ensure munge dirs exist
      ansible.builtin.file:
        state: directory
        path: "{{ item.path }}"
        owner: munge
        group: munge
        mode: "{{ item.mode }}"
      loop:
        - path: /etc/munge
          mode: '0700'
        - path: /var/log/munge
          mode: '0755'
        - path: /var/lib/munge
          mode: '0711'
        - path: /run/munge
          mode: '0755'
|
|
|
|
|
|
# Play 3: runs on the controller group. Reads /etc/munge/munge.key and keeps
# it, still base64-encoded, in the host fact `cluster_munge_key_b64` so a
# later play can look it up through hostvars.
- name: Deploy Munge key from controller to target node
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # slurp returns the file content base64-encoded in `.content`.
    # no_log keeps the key out of verbose output and callback logs; the
    # trade-off is that a failure of this task is reported without details.
    - name: Read controller munge.key
      ansible.builtin.slurp:
        src: /etc/munge/munge.key
      register: controller_munge_key_raw
      no_log: true

    # The value stays base64 here; decoding happens where the key is written.
    - name: Store controller Munge key as fact
      ansible.builtin.set_fact:
        cluster_munge_key_b64: "{{ controller_munge_key_raw.content }}"
      no_log: true
|
|
|
|
|
|
# Play 4: runs on the node being provisioned. Installs the shared Munge key
# and the templated Slurm configuration files, then makes sure munge and
# slurmd are enabled and started. Changed files trigger the restart handlers
# defined at the bottom of the play.
- name: Configure target node with Munge and Slurm files
  hosts: "{{ target_node }}"
  become: true
  gather_facts: false

  vars:
    # First host of the slurm_controller group; the key is read from its facts.
    controller_host: "{{ groups['slurm_controller'][0] }}"

  tasks:
    # The key task below runs with no_log, which would hide an
    # undefined-variable error. Check here, where the message is visible,
    # that the fact exists (it is absent if no earlier play set it on
    # controller_host, e.g. when the run was narrowed with --limit).
    - name: Ensure controller Munge key fact is available
      ansible.builtin.assert:
        that:
          - hostvars[controller_host].cluster_munge_key_b64 is defined
        fail_msg: >-
          cluster_munge_key_b64 is not set on {{ controller_host }};
          the slurm_controller play must run before this one
        quiet: true

    # no_log keeps the decoded key out of --diff and verbose output.
    # NOTE(review): b64decode yields a text string; a munge.key made of
    # arbitrary binary bytes may not survive this round trip on every
    # Ansible/Python combination. Compare checksums with the controller's
    # key after the first run to confirm.
    - name: Deploy shared munge.key
      ansible.builtin.copy:
        dest: /etc/munge/munge.key
        content: "{{ hostvars[controller_host].cluster_munge_key_b64 | b64decode }}"
        owner: munge
        group: munge
        mode: "0400"
      no_log: true
      notify:
        - Restart munge

    - name: Deploy managed slurm.conf
      ansible.builtin.template:
        src: ../../templates/slurm.conf.j2
        dest: "{{ slurm_config_dir }}/slurm.conf"
        owner: root
        group: root
        mode: "0644"
      notify:
        - Restart slurmd

    # Only rendered when slurm_enable_cgroup is truthy (defaults to false).
    - name: Deploy managed cgroup.conf
      ansible.builtin.template:
        src: ../../templates/cgroup.conf.j2
        dest: "{{ slurm_config_dir }}/cgroup.conf"
        owner: root
        group: root
        mode: "0644"
      when: slurm_enable_cgroup | default(false) | bool
      notify:
        - Restart slurmd

    # Only hosts listed in the slurm_gpu group receive gres.conf.
    - name: Deploy managed gres.conf on GPU nodes
      ansible.builtin.template:
        src: ../../templates/gres.conf.j2
        dest: "{{ slurm_config_dir }}/gres.conf"
        owner: root
        group: root
        mode: "0644"
      when: inventory_hostname in groups.get('slurm_gpu', [])
      notify:
        - Restart slurmd

    - name: Ensure munge is enabled and running
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: started

    - name: Ensure slurmd is enabled and running
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: started

  # Handlers run once at the end of the play, in the order listed here,
  # and only if a task above notified them.
  handlers:
    - name: Restart munge
      ansible.builtin.systemd:
        name: munge
        state: restarted

    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        state: restarted
|
|
|
|
|
|
# Play 5: renders the managed Slurm configuration on every host of the
# slurm_cluster group. No handlers are notified from this play.
- name: Deploy updated Slurm config to whole cluster and reconfigure controller
  hosts: slurm_cluster
  gather_facts: false
  become: true

  tasks:
    - name: Deploy managed slurm.conf to all nodes
      ansible.builtin.template:
        dest: "{{ slurm_config_dir }}/slurm.conf"
        src: ../../templates/slurm.conf.j2
        mode: '0644'
        owner: root
        group: root

    # Skipped unless slurm_enable_cgroup is truthy (defaults to false).
    - name: Deploy managed cgroup.conf to all nodes
      when: slurm_enable_cgroup | default(false) | bool
      ansible.builtin.template:
        dest: "{{ slurm_config_dir }}/cgroup.conf"
        src: ../../templates/cgroup.conf.j2
        mode: '0644'
        owner: root
        group: root
|
|
|
|
|
|
# Play 6: runs on the controller group. Reloads and restarts slurmctld,
# resumes the provisioned node, then polls until Slurm reports it in a
# usable state and prints the final output.
- name: Reconfigure Slurm and validate target node
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
    # Always reported as changed: the command gives no signal either way.
    - name: Reconfigure Slurm controller
      changed_when: true
      ansible.builtin.command:
        cmd: scontrol reconfigure

    - name: Restart Slurm controller after node reprovision
      ansible.builtin.systemd:
        state: restarted
        name: slurmctld

    # Polls `scontrol ping` until it exits 0 (15 retries, 2 seconds apart).
    - name: Wait for Slurm controller after restart
      changed_when: false
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping_after_restart
      until: slurmctld_ping_after_restart.rc == 0
      retries: 15
      delay: 2

    # NOTE(review): scontrol may reject State=RESUME for a node that is not
    # in a DOWN/DRAIN-type state, which would fail this task. Confirm on the
    # deployed Slurm version.
    - name: Resume target node in Slurm
      changed_when: true
      ansible.builtin.command:
        cmd: scontrol update NodeName={{ target_node }} State=RESUME

    # Runs both queries under bash strict mode and retries (20 retries,
    # 3 seconds apart) until the script exits 0 and its combined output
    # contains none of the markers 'down', 'not_responding' or 'idle*'
    # (compared case-insensitively).
    - name: Wait until target node is visible and not down
      changed_when: false
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: |
          set -euo pipefail
          scontrol show node {{ target_node }}
          sinfo -N -n {{ target_node }}
      register: target_node_state
      until:
        - target_node_state.rc == 0
        - "'down' not in target_node_state.stdout.lower()"
        - "'not_responding' not in target_node_state.stdout.lower()"
        - "'idle*' not in target_node_state.stdout.lower()"
      retries: 20
      delay: 3

    # Prints the output captured by the last polling attempt.
    - name: Show target node state
      ansible.builtin.debug:
        var: target_node_state.stdout_lines
|