---
# Provision a single Slurm worker/GPU node selected with `-e target_node=...`.
#
# Plays, in order:
#   1. Validate the target_node extra var on localhost.
#   2. Install packages and create Slurm/Munge directories on the target node.
#   3. Read the Munge key from the controller.
#   4. Push the Munge key and Slurm config files to the target node.
#   5. Re-deploy slurm.conf / cgroup.conf to every host in slurm_cluster.
#   6. Reconfigure and restart slurmctld, resume the node and wait for it.
#
# Variables read but not defined here: target_node, slurm_config_dir,
# slurm_enable_cgroup (treated as false when undefined).

- name: Validate target_node variable
  hosts: localhost
  gather_facts: false
  tasks:
    - name: Require target_node
      ansible.builtin.fail:
        msg: "Use: ansible-playbook provision-slurm-node.yml -e target_node=<inventory_hostname>"
      when: target_node is not defined

    - name: Ensure target_node is in inventory
      ansible.builtin.fail:
        msg: "target_node={{ target_node }} is not in Ansible inventory"
      when: target_node not in groups['all']

- name: Prepare OS, packages and Slurm directories on target node
  hosts: "{{ target_node }}"
  become: true
  gather_facts: true
  tasks:
    # Both conditions must hold (a `when` list is AND-ed): the play aborts
    # only when the host is in neither group.
    - name: Ensure target is a Slurm worker or GPU node
      ansible.builtin.fail:
        msg: "{{ inventory_hostname }} must be in slurm_compute or slurm_gpu group"
      when:
        - inventory_hostname not in groups.get('slurm_compute', [])
        - inventory_hostname not in groups.get('slurm_gpu', [])

    - name: Install Slurm worker packages
      ansible.builtin.apt:
        name:
          - munge
          - libmunge2
          - slurm-client
          - slurmd
          - slurm-wlm-basic-plugins
          - slurm-wlm-plugins
          - slurm-wlm-mysql-plugin
        state: present
        update_cache: true

    - name: Ensure Slurm config directory exists
      ansible.builtin.file:
        path: "{{ slurm_config_dir }}"
        state: directory
        owner: root
        group: root
        mode: "0755"

    - name: Ensure Slurm log directory exists
      ansible.builtin.file:
        path: /var/log/slurm
        state: directory
        owner: slurm
        group: slurm
        mode: "0755"

    - name: Ensure slurmd spool directory exists
      ansible.builtin.file:
        path: /var/spool/slurmd
        state: directory
        owner: slurm
        group: slurm
        mode: "0755"

    - name: Ensure munge dirs exist
      ansible.builtin.file:
        path: "{{ item.path }}"
        state: directory
        owner: munge
        group: munge
        mode: "{{ item.mode }}"
      loop:
        - path: /etc/munge
          mode: "0700"
        - path: /var/log/munge
          mode: "0755"
        - path: /var/lib/munge
          mode: "0711"
        - path: /run/munge
          mode: "0755"

- name: Deploy Munge key from controller to target node
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # The key is a cluster-wide secret: keep it out of task output and logs.
    - name: Read controller munge.key
      ansible.builtin.slurp:
        src: /etc/munge/munge.key
      register: controller_munge_key_raw
      no_log: true

    # Stored as a host fact so the next play can read it through hostvars.
    - name: Store controller Munge key as fact
      ansible.builtin.set_fact:
        cluster_munge_key_b64: "{{ controller_munge_key_raw.content }}"
      no_log: true

- name: Configure target node with Munge and Slurm files
  hosts: "{{ target_node }}"
  become: true
  gather_facts: false
  vars:
    # Only the first host of the slurm_controller group is used as key source.
    controller_host: "{{ groups['slurm_controller'][0] }}"
  tasks:
    # NOTE(review): munge.key may be binary; confirm that the b64decode
    # round-trip preserves it byte-for-byte on the Ansible version in use
    # (e.g. compare checksums between controller and target).
    - name: Deploy shared munge.key
      ansible.builtin.copy:
        dest: /etc/munge/munge.key
        content: "{{ hostvars[controller_host].cluster_munge_key_b64 | b64decode }}"
        owner: munge
        group: munge
        mode: "0400"
      no_log: true
      notify:
        - Restart munge

    - name: Deploy managed slurm.conf
      ansible.builtin.template:
        src: ../../templates/slurm.conf.j2
        dest: "{{ slurm_config_dir }}/slurm.conf"
        owner: root
        group: root
        mode: "0644"
      notify:
        - Restart slurmd

    - name: Deploy managed cgroup.conf
      ansible.builtin.template:
        src: ../../templates/cgroup.conf.j2
        dest: "{{ slurm_config_dir }}/cgroup.conf"
        owner: root
        group: root
        mode: "0644"
      when: slurm_enable_cgroup | default(false) | bool
      notify:
        - Restart slurmd

    - name: Deploy managed gres.conf on GPU nodes
      ansible.builtin.template:
        src: ../../templates/gres.conf.j2
        dest: "{{ slurm_config_dir }}/gres.conf"
        owner: root
        group: root
        mode: "0644"
      when: inventory_hostname in groups.get('slurm_gpu', [])
      notify:
        - Restart slurmd

    - name: Ensure munge is enabled and running
      ansible.builtin.systemd:
        name: munge
        enabled: true
        state: started

    - name: Ensure slurmd is enabled and running
      ansible.builtin.systemd:
        name: slurmd
        enabled: true
        state: started

  # Notified handlers run once at the end of this play, in the order listed
  # here, so munge is restarted before slurmd.
  handlers:
    - name: Restart munge
      ansible.builtin.systemd:
        name: munge
        state: restarted

    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        state: restarted

- name: Deploy updated Slurm config to whole cluster and reconfigure controller
  hosts: slurm_cluster
  become: true
  gather_facts: false
  tasks:
    # No handlers are notified here; the next play reconfigures and restarts
    # the controller instead.
    - name: Deploy managed slurm.conf to all nodes
      ansible.builtin.template:
        src: ../../templates/slurm.conf.j2
        dest: "{{ slurm_config_dir }}/slurm.conf"
        owner: root
        group: root
        mode: "0644"

    - name: Deploy managed cgroup.conf to all nodes
      ansible.builtin.template:
        src: ../../templates/cgroup.conf.j2
        dest: "{{ slurm_config_dir }}/cgroup.conf"
        owner: root
        group: root
        mode: "0644"
      when: slurm_enable_cgroup | default(false) | bool

- name: Reconfigure Slurm and validate target node
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    - name: Reconfigure Slurm controller
      ansible.builtin.command:
        cmd: scontrol reconfigure
      changed_when: true

    - name: Restart Slurm controller after node reprovision
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted

    # Poll up to 15 times, 2 seconds apart, until `scontrol ping` exits 0.
    - name: Wait for Slurm controller after restart
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping_after_restart
      retries: 15
      delay: 2
      until: slurmctld_ping_after_restart.rc == 0
      changed_when: false

    # NOTE(review): scontrol is expected to reject RESUME with
    # "Invalid node state specified" when the node is not DOWN/DRAINED
    # (i.e. it is already usable) — confirm on the deployed Slurm version.
    # That case is tolerated so re-running the playbook against a healthy
    # node does not abort; any other non-zero exit still fails the task.
    - name: Resume target node in Slurm
      ansible.builtin.command:
        cmd: scontrol update NodeName={{ target_node }} State=RESUME
      register: target_node_resume
      changed_when: target_node_resume.rc == 0
      failed_when:
        - target_node_resume.rc != 0
        - >-
          'Invalid node state specified' not in
          (target_node_resume.stderr + target_node_resume.stdout)

    # Poll up to 20 times, 3 seconds apart. Every `until` entry must hold:
    # both commands succeed and the combined output contains none of the
    # listed markers (case-insensitive). The `idle*` check presumably targets
    # sinfo's trailing-asterisk marker for a non-responding node.
    - name: Wait until target node is visible and not down
      ansible.builtin.shell: |
        set -euo pipefail
        scontrol show node {{ target_node }}
        sinfo -N -n {{ target_node }}
      args:
        executable: /bin/bash
      register: target_node_state
      retries: 20
      delay: 3
      until:
        - target_node_state.rc == 0
        - "'down' not in target_node_state.stdout.lower()"
        - "'not_responding' not in target_node_state.stdout.lower()"
        - "'idle*' not in target_node_state.stdout.lower()"
      changed_when: false

    - name: Show target node state
      ansible.builtin.debug:
        var: target_node_state.stdout_lines