---
# Canary rolling-upgrade playbook for a Slurm worker node.
#
# Plays, in order:
#   1. Validate the canary_node variable against the inventory.
#   2. Drain the canary on the controller and wait for its jobs to finish.
#   3. Upgrade OS packages on the canary, rebooting if required.
#   4. Resume the canary on the controller and submit a test job.
#
# The canary defaults to 'slurm-c02'; set canary_node to override it.

- name: Validate canary node variable
  hosts: localhost
  gather_facts: false
  vars:
    canary_node_effective: "{{ canary_node | default('slurm-c02') }}"
  tasks:
    # Fail early if the chosen host is unknown to the inventory.
    - name: Ensure canary node is in inventory
      ansible.builtin.fail:
        msg: "canary_node={{ canary_node_effective }} is not in inventory"
      when: canary_node_effective not in groups['all']

    # Refuse to run the worker upgrade against a controller host.
    - name: Ensure canary node is not the controller
      ansible.builtin.fail:
        msg: "Do not use controller as canary for worker rolling upgrade"
      when: canary_node_effective in groups['slurm_controller']

- name: Drain canary node
  hosts: slurm_controller
  become: true
  gather_facts: false
  vars:
    canary_node_effective: "{{ canary_node | default('slurm-c02') }}"
  tasks:
    # Informational snapshot only: every command is allowed to fail.
    - name: Show canary state before drain
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ canary_node_effective }} || true
        scontrol show node {{ canary_node_effective }} || true
        squeue -w {{ canary_node_effective }} || true
      args:
        executable: /bin/bash
      register: canary_before
      changed_when: false

    - name: Print canary state before drain
      ansible.builtin.debug:
        var: canary_before.stdout_lines

    - name: Drain canary node
      ansible.builtin.command:
        cmd: >-
          scontrol update NodeName={{ canary_node_effective }}
          State=DRAIN Reason="canary OS upgrade"
      changed_when: true

    # Poll until squeue succeeds AND lists no jobs on the canary.
    # A failing squeue must not be mistaken for an empty job list, so the
    # return code is checked explicitly instead of being masked with
    # '|| true'; a failed query is retried like a non-empty one.
    - name: Wait until canary has no running jobs
      ansible.builtin.command:
        cmd: squeue -h -w {{ canary_node_effective }}
      register: canary_jobs
      retries: 120
      delay: 10
      until:
        - canary_jobs.rc == 0
        - canary_jobs.stdout | trim == ""
      changed_when: false

- name: Upgrade canary node OS packages
  hosts: "{{ canary_node | default('slurm-c02') }}"
  become: true
  gather_facts: true
  tasks:
    - name: Ensure apt cache is updated
      ansible.builtin.apt:
        update_cache: true
        cache_valid_time: 1800

    - name: Full upgrade packages
      ansible.builtin.apt:
        upgrade: full
        autoremove: true
        autoclean: true
      register: apt_upgrade_result

    # The existence of this file decides whether the reboot task runs.
    - name: Check if reboot is required
      ansible.builtin.stat:
        path: /var/run/reboot-required
      register: reboot_required

    - name: Show upgrade summary
      ansible.builtin.debug:
        msg:
          - "Host: {{ inventory_hostname }}"
          - "Apt changed: {{ apt_upgrade_result.changed }}"
          - "Reboot required: {{ reboot_required.stat.exists }}"

    - name: Reboot canary if required
      ansible.builtin.reboot:
        msg: "Reboot after canary OS upgrade"
        reboot_timeout: 900
        connect_timeout: 20
        pre_reboot_delay: 5
        post_reboot_delay: 20
      when: reboot_required.stat.exists

    # Both services are restarted unconditionally (state: restarted), so
    # these tasks always report a change.
    - name: Ensure munge is running
      ansible.builtin.systemd:
        name: munge
        state: restarted
        enabled: true

    - name: Ensure slurmd is running
      ansible.builtin.systemd:
        name: slurmd
        state: restarted
        enabled: true

    # Any failing command aborts the task (set -e / pipefail).
    - name: Validate local services
      ansible.builtin.shell: |
        set -euo pipefail
        systemctl is-active munge
        systemctl is-active slurmd
        munge -n | unmunge >/dev/null
        scontrol ping
      args:
        executable: /bin/bash
      changed_when: false

- name: Resume canary node and run canary job
  hosts: slurm_controller
  become: true
  gather_facts: false
  vars:
    canary_node_effective: "{{ canary_node | default('slurm-c02') }}"
  tasks:
    - name: Reconfigure controller
      ansible.builtin.command:
        cmd: scontrol reconfigure
      changed_when: true

    - name: Restart controller to refresh node state
      ansible.builtin.systemd:
        name: slurmctld
        state: restarted

    # Retry until 'scontrol ping' exits 0 after the restart above.
    - name: Wait for controller
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping
      retries: 15
      delay: 2
      until: slurmctld_ping.rc == 0
      changed_when: false

    # Each state change is best-effort: its stderr is discarded and its
    # failure ignored. The trailing sinfo/scontrol calls are not guarded,
    # so the task fails if either of them fails.
    - name: Clear canary node maintenance state
      ansible.builtin.shell: |
        set -euo pipefail
        scontrol update NodeName={{ canary_node_effective }} State=RESUME 2>/dev/null || true
        scontrol update NodeName={{ canary_node_effective }} State=UNDRAIN 2>/dev/null || true
        scontrol update NodeName={{ canary_node_effective }} State=IDLE 2>/dev/null || true
        sleep 3
        sinfo -N -n {{ canary_node_effective }}
        scontrol show node {{ canary_node_effective }}
      args:
        executable: /bin/bash
      register: resume_canary
      changed_when: true

    # Retry until both commands succeed and their combined output (compared
    # case-insensitively) contains none of the substrings listed below.
    - name: Wait until canary is IDLE and responding
      ansible.builtin.shell: |
        set -euo pipefail
        sinfo -N -n {{ canary_node_effective }}
        scontrol show node {{ canary_node_effective }}
      args:
        executable: /bin/bash
      register: canary_state
      retries: 30
      delay: 5
      until:
        - canary_state.rc == 0
        - "'not_responding' not in canary_state.stdout.lower()"
        - "'down' not in canary_state.stdout.lower()"
        - "'drain' not in canary_state.stdout.lower()"
        - "'idle*' not in canary_state.stdout.lower()"
      changed_when: false

    # NOTE(review): this task's script is cut off in the source at the sbatch
    # input redirection; the remaining lines are not visible here and must be
    # restored before this playbook can run.
    - name: Submit canary test job to upgraded node
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$(
          sudo -iu slurmuser sbatch --parsable <