This commit is contained in:
@@ -0,0 +1,83 @@
|
||||
---
# Snapshot Slurm and Munge configuration, spool state, logs and service
# diagnostics into a new timestamped directory on every host in the
# `slurm_cluster` group. Each run creates a fresh directory under
# `backup_base_dir`; nothing is rotated or deleted by this play.
- name: Backup Slurm and Munge state on all cluster nodes
  hosts: slurm_cluster
  become: true
  gather_facts: true

  vars:
    # Parent directory for all backups on the node; created root-only (0700)
    # below, and each run adds one timestamped child directory.
    backup_base_dir: /var/backups/slurm

  tasks:
    - name: Create backup base directory
      ansible.builtin.file:
        path: "{{ backup_base_dir }}"
        state: directory
        owner: root
        group: root
        mode: "0700"

    # The timestamp comes from `date` on the node itself, so every host gets
    # its own directory name. The created path is echoed on stdout so the
    # next task can record it as a fact.
    - name: Create timestamped backup directory
      ansible.builtin.shell: |
        set -euo pipefail
        ts="$(date +%F-%H%M%S)"
        # `quote` shell-escapes the templated path, so an overridden
        # backup_base_dir containing spaces, quotes or `$` stays one word.
        dir={{ backup_base_dir | quote }}/"$ts"
        mkdir -p "$dir"
        echo "$dir"
      args:
        executable: /bin/bash
      register: backup_dir_result
      # A new directory is created on every run, so always report "changed".
      changed_when: true

    - name: Store backup directory fact
      ansible.builtin.set_fact:
        node_backup_dir: "{{ backup_dir_result.stdout }}"

    # Copies whichever of the known Slurm/Munge paths exist on this node, then
    # captures best-effort diagnostics. Every diagnostic command ends in
    # `|| true` so a stopped or missing service does not fail the backup.
    - name: Backup Slurm and Munge config/state if present
      ansible.builtin.shell: |
        set -euo pipefail

        backup_dir={{ node_backup_dir | quote }}

        # Both the plain and the `-llnl` directory names are probed; paths
        # that do not exist on this node are skipped.
        for p in \
          /etc/slurm \
          /etc/slurm-llnl \
          /etc/munge \
          /var/spool/slurmctld \
          /var/spool/slurmd \
          /var/log/slurm \
          /var/log/slurm-llnl
        do
          if [ -e "$p" ]; then
            cp -a "$p" "$backup_dir/"
          fi
        done

        systemctl status munge --no-pager > "$backup_dir/systemctl-munge.txt" 2>&1 || true
        systemctl status slurmctld --no-pager > "$backup_dir/systemctl-slurmctld.txt" 2>&1 || true
        systemctl status slurmd --no-pager > "$backup_dir/systemctl-slurmd.txt" 2>&1 || true

        journalctl -u munge -n 200 --no-pager > "$backup_dir/journal-munge.txt" 2>&1 || true
        journalctl -u slurmctld -n 200 --no-pager > "$backup_dir/journal-slurmctld.txt" 2>&1 || true
        journalctl -u slurmd -n 200 --no-pager > "$backup_dir/journal-slurmd.txt" 2>&1 || true

        if command -v sinfo >/dev/null 2>&1; then
          sinfo > "$backup_dir/sinfo.txt" 2>&1 || true
        fi

        if command -v scontrol >/dev/null 2>&1; then
          scontrol show config > "$backup_dir/scontrol-show-config.txt" 2>&1 || true
          scontrol show nodes > "$backup_dir/scontrol-show-nodes.txt" 2>&1 || true
          scontrol show partitions > "$backup_dir/scontrol-show-partitions.txt" 2>&1 || true
        fi

        # List what was captured (files and directories, two levels deep).
        # The -o alternatives are grouped so the expression does not depend
        # on find's implicit operator precedence.
        find "$backup_dir" -maxdepth 2 \( -type f -o -type d \)
      args:
        executable: /bin/bash
      # The listing printed by `find` ends up in backup_content.stdout.
      register: backup_content
      changed_when: true

    - name: Show backup location on node
      ansible.builtin.debug:
        msg:
          - "Host: {{ inventory_hostname }}"
          - "Backup directory: {{ node_backup_dir }}"
|
||||
@@ -0,0 +1,46 @@
|
||||
---
# For every host in `slurm_cluster`: pick the most recently modified entry
# under `remote_backup_base`, pack it into a gzip-compressed tarball on the
# node, fetch that tarball into a per-host directory on the controller
# (localhost), and remove the temporary tarball from the node again.
- name: Fetch latest Slurm backups from nodes to pvef
  hosts: slurm_cluster
  become: true
  gather_facts: false

  vars:
    remote_backup_base: /var/backups/slurm
    local_backup_base: "{{ playbook_dir }}/../../artifacts/backups"
    # Temporary archive on the node. Defined once so the archive, fetch and
    # cleanup tasks cannot drift apart.
    remote_archive_path: "/tmp/{{ inventory_hostname }}-slurm-backup.tgz"

  tasks:
    # `ls -t` sorts newest first. `sed -n '1p'` prints the first line but,
    # unlike `head -n 1`, keeps reading until EOF, so `ls` is never killed by
    # SIGPIPE, which `set -o pipefail` would turn into a task failure.
    # If nothing matches the glob, `ls` exits non-zero and the task fails.
    - name: Find latest remote backup directory
      ansible.builtin.shell: |
        set -euo pipefail
        ls -1dt -- {{ remote_backup_base | quote }}/* | sed -n '1p'
      args:
        executable: /bin/bash
      register: latest_backup_dir
      changed_when: false

    - name: Create local backup directory on pvef
      ansible.builtin.file:
        path: "{{ local_backup_base }}/{{ inventory_hostname }}"
        state: directory
        mode: "0700"
      delegate_to: localhost
      become: false

    # Archive and fetch are wrapped in a block so the `always` section removes
    # the temporary archive even when one of them fails.
    - name: Archive and fetch the latest backup
      block:
        - name: Archive latest backup directory on remote node
          # Canonical FQCN: the archive module is shipped in community.general.
          community.general.archive:
            path: "{{ latest_backup_dir.stdout }}"
            dest: "{{ remote_archive_path }}"
            format: gz
            force_archive: true
            # The archive sits in world-accessible /tmp; keep it root-only.
            # NOTE(review): the backed-up tree presumably holds secrets
            # (e.g. Munge key material) - confirm.
            owner: root
            group: root
            mode: "0600"
          changed_when: true

        # With `flat: true` and a dest ending in "/", the file is stored
        # under its own basename directly inside the per-host directory.
        - name: Fetch archive to pvef
          ansible.builtin.fetch:
            src: "{{ remote_archive_path }}"
            dest: "{{ local_backup_base }}/{{ inventory_hostname }}/"
            flat: true

      always:
        - name: Remove temporary remote archive
          ansible.builtin.file:
            path: "{{ remote_archive_path }}"
            state: absent
|
||||
Reference in New Issue
Block a user