---
# Upgrade the operating system packages on the Slurm controller host.
#
# Sequence:
#   1. Record cluster and service state (scontrol/sinfo/squeue/systemctl).
#   2. Refresh the apt cache and run a full upgrade.
#   3. Clean the apt package cache.
#   4. Reboot only when /var/run/reboot-required exists.
#   5. Restart and enable munge, mariadb, slurmdbd and slurmctld.
#   6. Poll `scontrol ping` until it exits 0, then print a validation report.
- name: Upgrade Slurm controller OS safely
  hosts: slurm_controller
  become: true
  gather_facts: true

  tasks:
    # Read-only snapshot. `set -e` makes the task fail if scontrol, sinfo,
    # squeue, or the munge/slurmctld checks return non-zero; slurmdbd and
    # mariadb are reported but tolerated when inactive (`|| true`).
    - name: Show cluster state before controller upgrade
      ansible.builtin.shell: |
        set -euo pipefail
        scontrol ping
        sinfo
        squeue
        systemctl is-active munge
        systemctl is-active slurmctld
        systemctl is-active slurmdbd || true
        systemctl is-active mariadb || true
      args:
        executable: /bin/bash
      register: before_state
      changed_when: false

    - name: Print cluster state before controller upgrade
      ansible.builtin.debug:
        var: before_state.stdout_lines

    # The cache is refreshed only when it is older than 1800 seconds.
    - name: Update apt cache
      ansible.builtin.apt:
        update_cache: true
        cache_valid_time: 1800

    # `autoremove` stays on the upgrade task, as in the original play.
    # `controller_upgrade.changed` is reported by a later debug task.
    - name: Full upgrade controller packages
      ansible.builtin.apt:
        upgrade: full
        autoremove: true
      register: controller_upgrade

    # Kept as a separate task: when `upgrade` is set, the apt module appears
    # to finish right after the upgrade step, so an `autoclean` option on the
    # same task is not reliably executed.
    # NOTE(review): confirm against the ansible-core version in use.
    - name: Clean apt package cache
      ansible.builtin.apt:
        autoclean: true

    # The existence of this file is what gates the reboot task below.
    - name: Check if reboot is required
      ansible.builtin.stat:
        path: /var/run/reboot-required
      register: controller_reboot_required

    - name: Show controller upgrade status
      ansible.builtin.debug:
        msg:
          - "Apt changed: {{ controller_upgrade.changed }}"
          - "Reboot required: {{ controller_reboot_required.stat.exists }}"

    # Skipped entirely when the reboot-required file is absent.
    - name: Reboot controller if required
      ansible.builtin.reboot:
        msg: "Reboot after controller OS upgrade"
        reboot_timeout: 900
        connect_timeout: 20
        pre_reboot_delay: 5
        post_reboot_delay: 30
      when: controller_reboot_required.stat.exists

    # Runs whether or not a reboot happened; services are restarted in the
    # listed order (presumably dependency order - verify for this site).
    # NOTE(review): the pre-check tolerates slurmdbd/mariadb being inactive,
    # but this task requires both units to exist and start - confirm that is
    # intended for this controller.
    - name: Restart controller services
      ansible.builtin.systemd:
        name: "{{ item }}"
        state: restarted
        enabled: true
      loop:
        - munge
        - mariadb
        - slurmdbd
        - slurmctld

    # Up to 20 attempts, 3 seconds apart, until `scontrol ping` exits 0.
    - name: Wait for slurmctld
      ansible.builtin.command:
        cmd: scontrol ping
      register: slurmctld_ping
      retries: 20
      delay: 3
      until: slurmctld_ping.rc == 0
      changed_when: false

    # Read-only report. With `pipefail`, a failing scontrol/sacct or a grep
    # with no matching line fails the task.
    - name: Validate controller after upgrade
      ansible.builtin.shell: |
        set -euo pipefail
        scontrol ping
        sinfo
        squeue
        scontrol show config | grep -E "AccountingStorage|JobAcctGather|TaskPlugin|ProctrackType"
        sacct -S today --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList | tail -20
      args:
        executable: /bin/bash
      register: controller_after
      changed_when: false

    - name: Print controller validation after upgrade
      ansible.builtin.debug:
        var: controller_after.stdout_lines