Add Slurm AI/HPC cluster platform project

This commit is contained in:
Mateusz Suski
2026-06-04 19:41:05 +00:00
parent e2624a7533
commit cd6830334b
47 changed files with 4727 additions and 0 deletions
@@ -0,0 +1,98 @@
---
# Restore-check playbook: verifies that the newest SlurmDBD dump can be
# imported by loading it into a scratch database on the controller and
# reporting what arrived.
#
# Variables:
#   slurmdbd_storage_loc - filename prefix of the dumps and base of the scratch
#                          database name; presumably the live SlurmDBD database
#                          name (StorageLoc) - confirm against the backup role.
#                          Must be supplied by inventory/group_vars.
#   slurmdbd_backup_dir  - directory containing <storage_loc>-*.sql.gz dumps.
#   restore_check_db     - scratch database; it is DROPPED and recreated.
- name: Restore-check latest SlurmDBD backup into test database
  hosts: slurm_controller
  become: true
  gather_facts: false
  vars:
    restore_check_db: "{{ slurmdbd_storage_loc }}_restorecheck"
    slurmdbd_backup_dir: /var/backups/slurmdbd
  tasks:
    # restore_check_db is overridable and is the target of DROP DATABASE, so
    # refuse to continue if it points at the source database or contains
    # characters that are unsafe to interpolate into shell/SQL below.
    - name: Validate restore-check database name is safe
      ansible.builtin.assert:
        that:
          - restore_check_db != slurmdbd_storage_loc
          - "restore_check_db is match('^[A-Za-z0-9_-]+$')"
        fail_msg: >-
          Refusing to run: restore_check_db ({{ restore_check_db }}) must
          differ from slurmdbd_storage_loc and contain only letters, digits,
          '_' or '-'.
        quiet: true

    - name: Validate MariaDB is running
      ansible.builtin.command:
        cmd: systemctl is-active mariadb
      changed_when: false

    # The find module returns path, size and mtime for every match, so the
    # newest dump can be chosen without parsing `ls` output, and an empty or
    # absent directory yields zero matches instead of a shell error.
    - name: Find latest SlurmDBD backup
      ansible.builtin.find:
        paths: "{{ slurmdbd_backup_dir }}"
        patterns: "{{ slurmdbd_storage_loc }}-*.sql.gz"
        file_type: file
      register: slurmdbd_backups

    - name: Fail if no SlurmDBD backup was found
      ansible.builtin.fail:
        msg: >-
          No SlurmDBD backup matching {{ slurmdbd_storage_loc }}-*.sql.gz
          found in {{ slurmdbd_backup_dir }}
      when: slurmdbd_backups.matched | int == 0

    # Newest by modification time, matching the ordering `ls -t` would give.
    - name: Select latest backup by modification time
      vars:
        newest_backup: "{{ slurmdbd_backups.files | sort(attribute='mtime') | last }}"
      ansible.builtin.set_fact:
        latest_backup_path: "{{ newest_backup.path }}"
        latest_backup_size: "{{ newest_backup.size }}"

    # A dump below 1 KiB is treated as empty/truncated.
    - name: Fail if latest backup is missing or empty
      ansible.builtin.fail:
        msg: >-
          Latest SlurmDBD backup is smaller than 1024 bytes and is treated
          as empty: {{ latest_backup_path }}
      when: latest_backup_size | int < 1024

    # The heredoc delimiter is quoted ('SQL') so bash performs no expansion
    # inside it; otherwise the backticks around the identifier would be run
    # as command substitution. Jinja is rendered before the shell sees this.
    - name: Recreate restore-check database
      ansible.builtin.shell: |
        set -euo pipefail
        mysql <<'SQL'
        DROP DATABASE IF EXISTS `{{ restore_check_db }}`;
        CREATE DATABASE `{{ restore_check_db }}`;
        SQL
      args:
        executable: /bin/bash
      changed_when: true

    # pipefail makes a corrupt archive (zcat failure) fail the task even
    # though mysql is the last command in the pipeline.
    # NOTE(review): this assumes the dump contains no `USE`/`CREATE DATABASE`
    # statements (i.e. it was not taken with --databases); such a dump would
    # be applied to the database it names rather than the scratch one.
    # Confirm against the backup job.
    - name: Import backup into restore-check database
      ansible.builtin.shell: |
        set -euo pipefail
        zcat {{ latest_backup_path | quote }} | mysql {{ restore_check_db | quote }}
      args:
        executable: /bin/bash
      changed_when: true

    # failed_when replaces the default return-code check, so rc is tested
    # explicitly alongside the "at least one table" requirement.
    - name: Validate restored table count
      ansible.builtin.shell: |
        set -euo pipefail
        mysql -N -B -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='{{ restore_check_db }}';"
      args:
        executable: /bin/bash
      register: restored_tables
      changed_when: false
      failed_when: >-
        restored_tables.rc != 0
        or (restored_tables.stdout | trim | int) < 1

    # Informational only: builds a human-readable summary for the final debug
    # task. table_rows is the figure reported by the storage engine and may be
    # an estimate rather than an exact count.
    - name: Validate restored row count sample
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### restored database"
        echo "{{ restore_check_db }}"
        echo
        echo "### table count"
        mysql -N -B -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='{{ restore_check_db }}';"
        echo
        echo "### largest tables"
        mysql -N -B -e "
        SELECT table_name, table_rows
        FROM information_schema.tables
        WHERE table_schema='{{ restore_check_db }}'
        ORDER BY table_rows DESC
        LIMIT 10;
        "
      args:
        executable: /bin/bash
      register: restore_check_summary
      changed_when: false

    - name: Show restore-check result
      ansible.builtin.debug:
        msg:
          - "Imported backup: {{ latest_backup_path }}"
          - "Restore-check DB: {{ restore_check_db }}"
          - "Restored tables: {{ restored_tables.stdout }}"
          - "Summary:"
          - "{{ restore_check_summary.stdout_lines }}"