---
# Upgrades the OS packages on the Slurm controller host(s), reboots when the
# package manager requests it, restarts the Slurm service stack in dependency
# order, and then validates that the controller responds.
- name: Upgrade Slurm controller OS safely
  hosts: slurm_controller
  become: true
  gather_facts: true

  tasks:
    # Read-only snapshot of the cluster before anything is touched.
    # munge and slurmctld must be active (the script aborts otherwise because
    # of `set -e`); slurmdbd and mariadb are allowed to be inactive/absent.
    - name: Show cluster state before controller upgrade
      ansible.builtin.shell: |
        set -euo pipefail
        scontrol ping
        sinfo
        squeue
        systemctl is-active munge
        systemctl is-active slurmctld
        systemctl is-active slurmdbd || true
        systemctl is-active mariadb || true
      args:
        executable: /bin/bash
      register: before_state
      changed_when: false

    - name: Print cluster state before controller upgrade
      ansible.builtin.debug:
        var: before_state.stdout_lines

    # Refresh the package index unless it was refreshed in the last 30 minutes.
    - name: Update apt cache
      ansible.builtin.apt:
        update_cache: true
        cache_valid_time: 1800

    - name: Full upgrade controller packages
      ansible.builtin.apt:
        upgrade: full
        autoremove: true
        autoclean: true
      register: controller_upgrade

    # The reboot decision below is driven solely by this marker file.
    - name: Check if reboot is required
      ansible.builtin.stat:
        path: /var/run/reboot-required
      register: controller_reboot_required

    - name: Show controller upgrade status
      ansible.builtin.debug:
        msg:
          - "Apt changed: {{ controller_upgrade.changed }}"
          - "Reboot required: {{ controller_reboot_required.stat.exists }}"

    - name: Reboot controller if required
      ansible.builtin.reboot:
        msg: "Reboot after controller OS upgrade"
        reboot_timeout: 900
        connect_timeout: 20
        pre_reboot_delay: 5
        post_reboot_delay: 30
      when: controller_reboot_required.stat.exists

    # Collected after the upgrade/reboot so the restart task can tell which
    # optional units are actually installed on this host.
    - name: Gather service facts
      ansible.builtin.service_facts:

    # Order matters: authentication (munge) first, then the accounting
    # database, the accounting daemon, and finally the controller daemon.
    # munge and slurmctld are mandatory. mariadb and slurmdbd are treated as
    # optional, mirroring the pre-upgrade check, and are skipped when the unit
    # is not installed locally instead of failing the play before slurmctld
    # has been restarted.
    - name: Restart controller services
      ansible.builtin.systemd:
        name: "{{ item.name }}"
        state: restarted
        enabled: true
      loop:
        - name: munge
          required: true
        - name: mariadb
          required: false
        - name: slurmdbd
          required: false
        - name: slurmctld
          required: true
      loop_control:
        label: "{{ item.name }}"
      when: >-
        item.required or
        (ansible_facts.services[item.name ~ '.service']
        | default({'status': 'not-found'})).status != 'not-found'

    # Poll for up to 20 attempts, 3 seconds apart, until the command succeeds.
    # NOTE(review): this relies on `scontrol ping` exiting non-zero while
    # slurmctld is DOWN - confirm for the deployed Slurm version.
    - name: Wait for slurmctld
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping
      retries: 20
      delay: 3
      until: slurmctld_ping.rc == 0
      changed_when: false

    # Read-only validation; any failing command (including a grep with no
    # match, because of pipefail) fails the task.
    - name: Validate controller after upgrade
      ansible.builtin.shell: |
        set -euo pipefail
        scontrol ping
        sinfo
        squeue
        scontrol show config | grep -E "AccountingStorage|JobAcctGather|TaskPlugin|ProctrackType"
        sacct -S today --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList | tail -20
      args:
        executable: /bin/bash
      register: controller_after
      changed_when: false

    - name: Print controller validation after upgrade
      ansible.builtin.debug:
        var: controller_after.stdout_lines