Files
portfolio/platform-projects/hpc-slurm-ai-cluster/playbooks/backup/backup-slurm-state.yml
T

84 lines
2.6 KiB
YAML
Raw Normal View History

2026-06-04 19:41:05 +00:00
---
# Snapshot Slurm and Munge configuration, spool state, logs and live service
# diagnostics into a fresh timestamped directory on every host of the
# `slurm_cluster` group. Each host keeps its own copy locally under
# `backup_base_dir`; nothing is fetched back to the controller.
- name: Backup Slurm and Munge state on all cluster nodes
  hosts: slurm_cluster
  become: true
  gather_facts: true

  vars:
    # Root directory under which every run creates one timestamped
    # subdirectory.
    backup_base_dir: /var/backups/slurm

  tasks:
    - name: Create backup base directory
      ansible.builtin.file:
        path: "{{ backup_base_dir }}"
        state: directory
        owner: root
        group: root
        # Root-only access: the copied paths include /etc/munge.
        mode: "0700"

    # The timestamp is taken from the node's own clock, so directory names
    # may differ slightly between hosts within a single run.
    - name: Create timestamped backup directory
      ansible.builtin.shell: |
        set -euo pipefail
        # `quote` shell-escapes the templated path so whitespace or shell
        # metacharacters in backup_base_dir cannot alter the script.
        base_dir={{ backup_base_dir | quote }}
        ts="$(date +%F-%H%M%S)"
        dir="$base_dir/$ts"
        mkdir -p "$dir"
        # The directory path is the only stdout; the next task reads it.
        echo "$dir"
      args:
        executable: /bin/bash
      register: backup_dir_result
      # A new directory is created on every run.
      changed_when: true

    - name: Store backup directory fact
      ansible.builtin.set_fact:
        node_backup_dir: "{{ backup_dir_result.stdout }}"

    - name: Backup Slurm and Munge config/state if present
      ansible.builtin.shell: |
        set -euo pipefail
        backup_dir={{ node_backup_dir | quote }}

        # Copy every path that exists on this node. The source hierarchy is
        # recreated below the backup directory (e.g. <backup>/etc/slurm and
        # <backup>/var/log/slurm) because several sources share a basename
        # (/etc/slurm vs /var/log/slurm, /etc/slurm-llnl vs
        # /var/log/slurm-llnl); copying them all straight into one directory
        # would merge them and overwrite same-named files.
        for p in \
          /etc/slurm \
          /etc/slurm-llnl \
          /etc/munge \
          /var/spool/slurmctld \
          /var/spool/slurmd \
          /var/log/slurm \
          /var/log/slurm-llnl
        do
          if [ -e "$p" ]; then
            dest_parent="$backup_dir$(dirname "$p")"
            mkdir -p "$dest_parent"
            cp -a "$p" "$dest_parent/"
          fi
        done

        # Diagnostics are best-effort: a unit or command that is missing or
        # failing on this node must not abort the backup, hence `|| true`.
        systemctl status munge --no-pager > "$backup_dir/systemctl-munge.txt" 2>&1 || true
        systemctl status slurmctld --no-pager > "$backup_dir/systemctl-slurmctld.txt" 2>&1 || true
        systemctl status slurmd --no-pager > "$backup_dir/systemctl-slurmd.txt" 2>&1 || true

        journalctl -u munge -n 200 --no-pager > "$backup_dir/journal-munge.txt" 2>&1 || true
        journalctl -u slurmctld -n 200 --no-pager > "$backup_dir/journal-slurmctld.txt" 2>&1 || true
        journalctl -u slurmd -n 200 --no-pager > "$backup_dir/journal-slurmd.txt" 2>&1 || true

        if command -v sinfo >/dev/null 2>&1; then
          sinfo > "$backup_dir/sinfo.txt" 2>&1 || true
        fi

        if command -v scontrol >/dev/null 2>&1; then
          scontrol show config > "$backup_dir/scontrol-show-config.txt" 2>&1 || true
          scontrol show nodes > "$backup_dir/scontrol-show-nodes.txt" 2>&1 || true
          scontrol show partitions > "$backup_dir/scontrol-show-partitions.txt" 2>&1 || true
        fi

        # Emit a shallow listing of what was captured. Depth 3 reaches the
        # copied directories themselves (e.g. var/spool/slurmctld); the
        # parentheses make the file-or-directory test explicit.
        find "$backup_dir" -maxdepth 3 \( -type f -o -type d \)
      args:
        executable: /bin/bash
      # The listing is kept in `backup_content`; no task visible in this
      # play consumes it.
      register: backup_content
      # Files are written into the new backup directory on every run.
      changed_when: true

    - name: Show backup location on node
      ansible.builtin.debug:
        msg:
          - "Host: {{ inventory_hostname }}"
          - "Backup directory: {{ node_backup_dir }}"