Files
portfolio/platform-projects/hpc-slurm-ai-cluster/playbooks/upgrade/upgrade-slurm-controller.yml
T
Mateusz Suski d300d490f5
lint / shell-yaml-ansible (push) Failing after 47s
Add Slurm AI/HPC cluster platform project
2026-06-04 19:42:45 +00:00

95 lines
2.6 KiB
YAML

---
# Upgrade the operating system packages on the Slurm controller.
#
# Flow: capture the cluster state, run a full apt upgrade, reboot when the
# OS asks for it, restart the controller-side daemons, wait until slurmctld
# answers, then print a post-upgrade validation report.
#
# Targets the `slurm_controller` inventory group and uses apt, so the hosts
# are expected to be Debian/Ubuntu based.
- name: Upgrade Slurm controller OS safely
  hosts: slurm_controller
  become: true
  gather_facts: true

  tasks:
    # `set -e` aborts the play before any package is touched when scontrol,
    # sinfo, squeue, munge or slurmctld is unhealthy. slurmdbd and mariadb
    # are reported but tolerated (`|| true`).
    - name: Show cluster state before controller upgrade
      ansible.builtin.shell: |
        set -euo pipefail
        scontrol ping
        sinfo
        squeue
        systemctl is-active munge
        systemctl is-active slurmctld
        systemctl is-active slurmdbd || true
        systemctl is-active mariadb || true
      args:
        executable: /bin/bash
      register: before_state
      changed_when: false

    - name: Print cluster state before controller upgrade
      ansible.builtin.debug:
        var: before_state.stdout_lines

    # The cache is only refreshed when it is older than 30 minutes.
    - name: Update apt cache
      ansible.builtin.apt:
        update_cache: true
        cache_valid_time: 1800

    - name: Full upgrade controller packages
      ansible.builtin.apt:
        upgrade: full
        autoremove: true
        autoclean: true
      register: controller_upgrade

    # The reboot task below is driven by the presence of this flag file.
    - name: Check if reboot is required
      ansible.builtin.stat:
        path: /var/run/reboot-required
      register: controller_reboot_required

    - name: Show controller upgrade status
      ansible.builtin.debug:
        msg:
          - "Apt changed: {{ controller_upgrade.changed }}"
          - "Reboot required: {{ controller_reboot_required.stat.exists }}"

    - name: Reboot controller if required
      ansible.builtin.reboot:
        msg: "Reboot after controller OS upgrade"
        reboot_timeout: 900
        connect_timeout: 20
        pre_reboot_delay: 5
        post_reboot_delay: 30
      when: controller_reboot_required.stat.exists

    # Collected after the upgrade/reboot so the unit list reflects the
    # post-upgrade system.
    - name: Gather service facts
      ansible.builtin.service_facts:

    # munge and slurmctld are mandatory on a controller. mariadb and slurmdbd
    # are restarted only when their units are installed on this host, which
    # matches the pre-check above that tolerates their absence. The loop order
    # is munge, then database, then slurmdbd, then slurmctld.
    # Runs even after a reboot: it also enforces `enabled: true`.
    - name: Restart controller services
      ansible.builtin.systemd:
        name: "{{ item }}"
        state: restarted
        enabled: true
      loop:
        - munge
        - mariadb
        - slurmdbd
        - slurmctld
      when: >-
        item in ['munge', 'slurmctld']
        or ansible_facts.services.get(item ~ '.service', {}).get('status', 'not-found')
        != 'not-found'

    # Up to 20 attempts, 3 seconds apart. Besides the exit status, the output
    # must report a controller as "is UP".
    # NOTE(review): some Slurm releases may exit 0 from `scontrol ping` while
    # the controller is still DOWN - confirm for the deployed version.
    - name: Wait for slurmctld
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping
      retries: 20
      delay: 3
      until:
        - slurmctld_ping.rc == 0
        - slurmctld_ping.stdout | default('') is search('is UP')
      changed_when: false

    # With `pipefail`, a failing sacct or a grep without matches fails the
    # task, so this validation requires working accounting on the cluster.
    - name: Validate controller after upgrade
      ansible.builtin.shell: |
        set -euo pipefail
        scontrol ping
        sinfo
        squeue
        scontrol show config | grep -E "AccountingStorage|JobAcctGather|TaskPlugin|ProctrackType"
        sacct -S today \
          --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList \
          | tail -20
      args:
        executable: /bin/bash
      register: controller_after
      changed_when: false

    - name: Print controller validation after upgrade
      ansible.builtin.debug:
        var: controller_after.stdout_lines