---
# Rolling OS upgrade of Slurm worker nodes, one node per batch (serial: 1).
#
# Per node the play:
#   1. optionally skips the canary node,
#   2. drains the node on the controller and waits for its jobs to finish,
#   3. runs an apt full-upgrade and reboots if /var/run/reboot-required exists,
#   4. restarts munge and slurmd and validates them locally,
#   5. restarts slurmctld, clears the drain state and waits until the node no
#      longer reports down/drain/not-responding.
#
# Extra vars read by the play:
#   canary_node  - host name to skip (default: slurm-c02)
#   skip_canary  - whether to skip that host (default: true)
- name: Rolling upgrade Slurm worker nodes
  hosts: slurm_compute:slurm_gpu
  become: true
  gather_facts: true
  serial: 1

  vars:
    skip_canary_node: "{{ canary_node | default('slurm-c02') }}"
    do_skip_canary: "{{ skip_canary | default(true) | bool }}"

  pre_tasks:
    - name: Skip canary node if requested
      ansible.builtin.meta: end_host
      when:
        - do_skip_canary
        - inventory_hostname == skip_canary_node

    # Slurm state changes are issued on the first host of slurm_controller.
    - name: Drain node before OS upgrade
      ansible.builtin.command:
        cmd: >-
          scontrol update NodeName={{ inventory_hostname }}
          State=DRAIN Reason="rolling OS upgrade"
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      changed_when: true

    # A failing squeue must not be mistaken for "no jobs": the loop only
    # finishes when squeue itself succeeded AND printed nothing. Polls every
    # 10 s, up to 120 attempts, then fails the host.
    - name: Wait until no jobs are running on this node
      ansible.builtin.command:
        cmd: squeue -h -w {{ inventory_hostname }}
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: jobs_on_node
      retries: 120
      delay: 10
      until:
        - jobs_on_node.rc == 0
        - jobs_on_node.stdout | trim == ""
      changed_when: false

  tasks:
    - name: Update apt cache
      ansible.builtin.apt:
        update_cache: true
        cache_valid_time: 1800

    - name: Full upgrade packages
      ansible.builtin.apt:
        upgrade: full
        autoremove: true
        autoclean: true
      register: apt_upgrade_result

    - name: Check if reboot is required
      ansible.builtin.stat:
        path: /var/run/reboot-required
      register: reboot_required

    - name: Show upgrade status
      ansible.builtin.debug:
        msg:
          - "Node: {{ inventory_hostname }}"
          - "Apt changed: {{ apt_upgrade_result.changed }}"
          - "Reboot required: {{ reboot_required.stat.exists }}"

    - name: Reboot node if required
      ansible.builtin.reboot:
        msg: "Reboot after rolling OS upgrade"
        reboot_timeout: 900
        connect_timeout: 20
        pre_reboot_delay: 5
        post_reboot_delay: 20
      when: reboot_required.stat.exists

    # Both services are restarted unconditionally, whether or not the node
    # rebooted.
    - name: Restart munge
      ansible.builtin.systemd:
        name: munge
        state: restarted
        enabled: true

    - name: Restart slurmd
      ansible.builtin.systemd:
        name: slurmd
        state: restarted
        enabled: true

    # Runs on the worker itself; any failing command fails the task (set -e).
    - name: Validate local slurm services
      ansible.builtin.shell: |
        set -euo pipefail
        systemctl is-active munge
        systemctl is-active slurmd
        munge -n | unmunge >/dev/null
        scontrol ping
      args:
        executable: /bin/bash
      changed_when: false

  post_tasks:
    # NOTE(review): with serial: 1 and run_once: false this restarts slurmctld
    # once for every upgraded node - confirm that is intended.
    - name: Restart controller to refresh state after node upgrade
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      run_once: false

    - name: Wait for controller after restart
      ansible.builtin.command:
        cmd: scontrol ping
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: slurmctld_ping
      retries: 15
      delay: 2
      until: slurmctld_ping.rc == 0
      changed_when: false

    # Errors from the three state updates are discarded (2>/dev/null || true),
    # so this task only fails if sinfo or "scontrol show node" fail. The
    # following task is what actually checks the resulting node state.
    - name: Clear upgraded node maintenance state
      ansible.builtin.shell: |
        set -euo pipefail
        scontrol update NodeName={{ inventory_hostname }} State=RESUME 2>/dev/null || true
        scontrol update NodeName={{ inventory_hostname }} State=UNDRAIN 2>/dev/null || true
        scontrol update NodeName={{ inventory_hostname }} State=IDLE 2>/dev/null || true
        sleep 3
        sinfo -N -n {{ inventory_hostname }}
        scontrol show node {{ inventory_hostname }}
      args:
        executable: /bin/bash
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: resume_node
      changed_when: true

    # The substring checks below run against the combined, lower-cased output
    # of sinfo and "scontrol show node". Polls every 5 s, up to 30 attempts.
    - name: Wait until node is healthy
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ inventory_hostname }}
        scontrol show node {{ inventory_hostname }}
      args:
        executable: /bin/bash
      delegate_to: "{{ groups['slurm_controller'][0] }}"
      register: upgraded_node_state
      retries: 30
      delay: 5
      until:
        - upgraded_node_state.rc == 0
        - "'not_responding' not in upgraded_node_state.stdout.lower()"
        - "'down' not in upgraded_node_state.stdout.lower()"
        - "'drain' not in upgraded_node_state.stdout.lower()"
        - "'idle*' not in upgraded_node_state.stdout.lower()"
      changed_when: false

    # NOTE(review): the rest of this task's script is not visible here; the
    # fragment below is reproduced unchanged and must stay joined to its
    # continuation.
    - name: Submit node-local post-upgrade test job
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$( sudo -iu slurmuser sbatch --parsable <