Add Slurm AI/HPC cluster platform project

This commit is contained in:
Mateusz Suski
2026-06-04 19:41:05 +00:00
parent e2624a7533
commit cd6830334b
47 changed files with 4727 additions and 0 deletions
@@ -0,0 +1,90 @@
---
# Dumps the SlurmDBD accounting database on the Slurm controller, checks that
# the resulting archive is not trivially small, and copies it to the Ansible
# control node.
#
# Variables read but not defined here: slurmdbd_storage_loc (database name).
- name: Backup SlurmDBD MariaDB database
  hosts: slurm_controller
  become: true
  gather_facts: true
  vars:
    slurmdbd_backup_dir: /var/backups/slurmdbd
    local_fetch_dir: "{{ playbook_dir }}/../../artifacts/backups/slurmdbd"
  tasks:
    - name: Create remote backup directory
      ansible.builtin.file:
        path: "{{ slurmdbd_backup_dir }}"
        state: directory
        owner: root
        group: root
        mode: "0700"

    # Runs on the control node without privilege escalation, so the directory
    # is left owned by whoever invokes Ansible; forcing root ownership here
    # would fail for a non-root user.
    - name: Create local fetch directory on Ansible controller
      ansible.builtin.file:
        path: "{{ local_fetch_dir }}"
        state: directory
        mode: "0700"
      delegate_to: localhost
      become: false
      run_once: true

    - name: Validate MariaDB is running
      ansible.builtin.command:
        cmd: systemctl is-active mariadb
      changed_when: false

    - name: Validate SlurmDBD is running
      ansible.builtin.command:
        cmd: systemctl is-active slurmdbd
      changed_when: false

    - name: Validate Slurm accounting database exists
      ansible.builtin.shell: |
        set -euo pipefail
        mysql -N -B -e "SHOW DATABASES LIKE '{{ slurmdbd_storage_loc }}';" | grep -qx "{{ slurmdbd_storage_loc }}"
      args:
        executable: /bin/bash
      changed_when: false

    # The dump is written to a ".partial" file and renamed only after
    # mysqldump and gzip both succeed, so a failed run never leaves a
    # truncated file under the final *.sql.gz name. The last line printed is
    # the archive path, which later tasks read from db_dump.stdout.
    - name: Dump Slurm accounting database
      ansible.builtin.shell: |
        set -euo pipefail
        umask 077
        ts="$(date +%F-%H%M%S)"
        out="{{ slurmdbd_backup_dir }}/{{ slurmdbd_storage_loc }}-${ts}.sql.gz"
        tmp="${out}.partial"
        trap 'rm -f "$tmp"' EXIT
        mysqldump \
          --single-transaction \
          --routines \
          --events \
          --triggers \
          {{ slurmdbd_storage_loc | quote }} | gzip -9 > "$tmp"
        mv "$tmp" "$out"
        chmod 0600 "$out"
        echo "$out"
      args:
        executable: /bin/bash
      register: db_dump
      changed_when: true

    - name: Validate backup file is non-empty
      ansible.builtin.stat:
        path: "{{ db_dump.stdout }}"
      register: backup_file

    # 1024 bytes is a sanity floor, not an emptiness test: an archive smaller
    # than that is treated as a failed dump.
    - name: Fail if backup file is empty
      ansible.builtin.fail:
        msg: "Backup file is missing or smaller than 1024 bytes: {{ db_dump.stdout }}"
      when: not backup_file.stat.exists or backup_file.stat.size | int < 1024

    - name: Fetch DB backup to Ansible controller
      ansible.builtin.fetch:
        src: "{{ db_dump.stdout }}"
        dest: "{{ local_fetch_dir }}/"
        flat: true

    - name: Show DB backup result
      ansible.builtin.debug:
        msg:
          - "Remote backup: {{ db_dump.stdout }}"
          - "Backup size bytes: {{ backup_file.stat.size }}"
          - "Fetched to: {{ local_fetch_dir }}/"
@@ -0,0 +1,126 @@
---
# Creates the cluster, the lab account and the "slurmuser" association in the
# Slurm accounting database, printing the accounting state before and after.
#
# Variables read but not defined here: slurm_cluster_name, slurm_account_name,
# slurm_account_description, slurm_account_organization.
- name: Initialize Slurm accounting entities
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # sacctmgr only succeeds once slurmdbd answers, so retry for up to
    # 20 x 2 seconds before giving up.
    - name: Wait for sacctmgr connectivity
      ansible.builtin.command:
        cmd: sacctmgr -n list cluster
      register: sacctmgr_cluster_list
      retries: 20
      delay: 2
      until: sacctmgr_cluster_list.rc == 0
      changed_when: false

    - name: Show current accounting state before changes
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### clusters"
        sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC
        echo
        echo "### accounts"
        sacctmgr list account format=Account,Descr,Org
        echo
        echo "### users"
        sacctmgr list user format=User,DefaultAccount,Admin
        echo
        echo "### associations"
        sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS
      args:
        executable: /bin/bash
      register: accounting_state_before
      changed_when: false

    - name: Print current accounting state before changes
      ansible.builtin.debug:
        var: accounting_state_before.stdout_lines

    # The existence checks below use "-P" (parsable2, "|"-delimited) output:
    # the default column layout is fixed-width and truncates long names, which
    # would make the exact-match test fail and re-run "add" on every play.
    # awk consumes the whole stream (no early exit), so pipefail cannot trip
    # on SIGPIPE, and names are passed with -v instead of being spliced into
    # the awk program.
    - name: Ensure Slurm cluster exists in accounting DB
      ansible.builtin.shell: |
        set -euo pipefail
        if sacctmgr -nP list cluster format=Cluster \
            | awk -F'|' \
                -v cluster="{{ slurm_cluster_name }}" \
                '$1 == cluster {found=1} END {exit !found}'; then
          echo "Cluster {{ slurm_cluster_name }} already exists"
        else
          sacctmgr -i add cluster {{ slurm_cluster_name }}
        fi
      args:
        executable: /bin/bash
      register: ensure_cluster
      changed_when: "'Adding Cluster' in ensure_cluster.stdout"

    # An account-level association is the row whose User field is empty.
    - name: Ensure default lab account exists for cluster
      ansible.builtin.shell: |
        set -euo pipefail
        if sacctmgr -nP list assoc format=Cluster,Account,User \
            | awk -F'|' \
                -v cluster="{{ slurm_cluster_name }}" \
                -v account="{{ slurm_account_name }}" \
                '$1 == cluster && $2 == account && $3 == "" {found=1} END {exit !found}'; then
          echo "Account {{ slurm_account_name }} already associated with cluster {{ slurm_cluster_name }}"
        else
          sacctmgr -i add account {{ slurm_account_name }} \
            Cluster={{ slurm_cluster_name }} \
            Description="{{ slurm_account_description }}" \
            Organization="{{ slurm_account_organization }}"
        fi
      args:
        executable: /bin/bash
      register: ensure_account
      changed_when: "'Adding Account' in ensure_account.stdout"

    - name: Ensure slurmuser exists with lab account association
      ansible.builtin.shell: |
        set -euo pipefail
        if sacctmgr -nP list assoc format=Cluster,Account,User \
            | awk -F'|' \
                -v cluster="{{ slurm_cluster_name }}" \
                -v account="{{ slurm_account_name }}" \
                '$1 == cluster && $2 == account && $3 == "slurmuser" {found=1} END {exit !found}'; then
          echo "User slurmuser already associated with account {{ slurm_account_name }} on cluster {{ slurm_cluster_name }}"
        else
          sacctmgr -i add user slurmuser \
            Cluster={{ slurm_cluster_name }} \
            Account={{ slurm_account_name }} \
            DefaultAccount={{ slurm_account_name }}
        fi
      args:
        executable: /bin/bash
      register: ensure_user_assoc
      changed_when: "'Adding User' in ensure_user_assoc.stdout"

    # Runs unconditionally; "changed" is derived from sacctmgr's own output,
    # checked on both stdout and stderr.
    - name: Ensure slurmuser has default account set
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify user where name=slurmuser set DefaultAccount={{ slurm_account_name }}
      args:
        executable: /bin/bash
      register: set_default_account
      changed_when: "'Modified user' in (set_default_account.stdout + set_default_account.stderr)"

    - name: Show final accounting state
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### clusters"
        sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC
        echo
        echo "### accounts"
        sacctmgr list account format=Account,Descr,Org
        echo
        echo "### users"
        sacctmgr list user format=User,DefaultAccount,Admin
        echo
        echo "### associations"
        sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS
      args:
        executable: /bin/bash
      register: accounting_state_after
      changed_when: false

    - name: Print final accounting state
      ansible.builtin.debug:
        var: accounting_state_after.stdout_lines
@@ -0,0 +1,98 @@
---
# Imports the newest SlurmDBD dump into a scratch database
# ("<slurmdbd_storage_loc>_restorecheck") and reports table counts, to prove
# the backup can actually be restored. The scratch database is dropped and
# recreated on every run.
#
# Variables read but not defined here: slurmdbd_storage_loc (database name).
- name: Restore-check latest SlurmDBD backup into test database
  hosts: slurm_controller
  become: true
  gather_facts: false
  vars:
    restore_check_db: "{{ slurmdbd_storage_loc }}_restorecheck"
    slurmdbd_backup_dir: /var/backups/slurmdbd
  tasks:
    - name: Validate MariaDB is running
      ansible.builtin.command:
        cmd: systemctl is-active mariadb
      changed_when: false

    # The find module returns an empty list when nothing matches, so the
    # "no backup" case gets its own clear failure below instead of surfacing
    # as a raw "ls: cannot access" error.
    - name: Find latest SlurmDBD backup
      ansible.builtin.find:
        paths: "{{ slurmdbd_backup_dir }}"
        patterns: "{{ slurmdbd_storage_loc }}-*.sql.gz"
        file_type: file
      register: backup_candidates

    - name: Fail if no SlurmDBD backup exists
      ansible.builtin.fail:
        msg: "No SlurmDBD backup matching {{ slurmdbd_storage_loc }}-*.sql.gz found in {{ slurmdbd_backup_dir }}"
      when: backup_candidates.files | length == 0

    # Newest by modification time; each entry carries "path" and "size".
    - name: Select latest SlurmDBD backup
      ansible.builtin.set_fact:
        latest_backup_file: "{{ backup_candidates.files | sort(attribute='mtime') | last }}"

    # 1024 bytes is a sanity floor for a plausible compressed dump.
    - name: Fail if latest backup is missing or empty
      ansible.builtin.fail:
        msg: "Latest SlurmDBD backup is missing or empty: {{ latest_backup_file.path }}"
      when: latest_backup_file.size | int < 1024

    # The heredoc delimiter is quoted so bash passes the SQL through verbatim.
    - name: Recreate restore-check database
      ansible.builtin.shell: |
        set -euo pipefail
        mysql <<'SQL'
        DROP DATABASE IF EXISTS {{ restore_check_db }};
        CREATE DATABASE {{ restore_check_db }};
        SQL
      args:
        executable: /bin/bash
      changed_when: true

    - name: Import backup into restore-check database
      ansible.builtin.shell: |
        set -euo pipefail
        zcat "{{ latest_backup_file.path }}" | mysql {{ restore_check_db }}
      args:
        executable: /bin/bash
      changed_when: true

    # failed_when replaces the default rc check: a failed query yields empty
    # stdout, which converts to 0 and therefore still fails the task.
    - name: Validate restored table count
      ansible.builtin.shell: |
        set -euo pipefail
        mysql -N -B -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='{{ restore_check_db }}';"
      args:
        executable: /bin/bash
      register: restored_tables
      changed_when: false
      failed_when: restored_tables.stdout | int < 1

    - name: Validate restored row count sample
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### restored database"
        echo "{{ restore_check_db }}"
        echo
        echo "### table count"
        mysql -N -B -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='{{ restore_check_db }}';"
        echo
        echo "### largest tables"
        mysql -N -B -e "
        SELECT table_name, table_rows
        FROM information_schema.tables
        WHERE table_schema='{{ restore_check_db }}'
        ORDER BY table_rows DESC
        LIMIT 10;
        "
      args:
        executable: /bin/bash
      register: restore_check_summary
      changed_when: false

    - name: Show restore-check result
      ansible.builtin.debug:
        msg:
          - "Imported backup: {{ latest_backup_file.path }}"
          - "Restore-check DB: {{ restore_check_db }}"
          - "Restored tables: {{ restored_tables.stdout }}"
          - "Summary:"
          - "{{ restore_check_summary.stdout_lines }}"
@@ -0,0 +1,105 @@
---
# Installs MariaDB and slurmdbd on the Slurm controller, creates the
# accounting database and its DB user, deploys slurmdbd.conf, and verifies
# that slurmdbd is active and listening.
#
# Variables read but not defined here: slurmdbd_storage_loc,
# slurmdbd_storage_user, slurmdbd_storage_pass, slurmdbd_port.
- name: Install and configure MariaDB for SlurmDBD
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    - name: Install MariaDB and SlurmDBD packages
      ansible.builtin.apt:
        name:
          - mariadb-server
          - mariadb-client
          - slurmdbd
          - slurm-wlm-mysql-plugin
        state: present
        update_cache: true

    - name: Ensure MariaDB is enabled and running
      ansible.builtin.systemd:
        name: mariadb
        enabled: true
        state: started

    - name: Ensure Slurm log directory exists
      ansible.builtin.file:
        path: /var/log/slurm
        state: directory
        owner: slurm
        group: slurm
        mode: "0755"

    # The rendered script contains the DB password, so task output is
    # suppressed with no_log. The heredoc delimiter is quoted so bash does not
    # expand "$" or backticks that may occur in the password.
    - name: Create Slurm accounting database and DB user
      ansible.builtin.shell: |
        set -euo pipefail
        mysql <<'SQL'
        CREATE DATABASE IF NOT EXISTS {{ slurmdbd_storage_loc }};
        CREATE USER IF NOT EXISTS '{{ slurmdbd_storage_user }}'@'localhost' IDENTIFIED BY '{{ slurmdbd_storage_pass }}';
        CREATE USER IF NOT EXISTS '{{ slurmdbd_storage_user }}'@'127.0.0.1' IDENTIFIED BY '{{ slurmdbd_storage_pass }}';
        GRANT ALL PRIVILEGES ON {{ slurmdbd_storage_loc }}.* TO '{{ slurmdbd_storage_user }}'@'localhost';
        GRANT ALL PRIVILEGES ON {{ slurmdbd_storage_loc }}.* TO '{{ slurmdbd_storage_user }}'@'127.0.0.1';
        FLUSH PRIVILEGES;
        SQL
      args:
        executable: /bin/bash
      no_log: true
      changed_when: true

    - name: Ensure /etc/slurm exists
      ansible.builtin.file:
        path: /etc/slurm
        state: directory
        owner: root
        group: root
        mode: "0755"

    - name: Deploy slurmdbd.conf
      ansible.builtin.template:
        src: ../../templates/slurmdbd.conf.j2
        dest: /etc/slurm/slurmdbd.conf
        owner: slurm
        group: slurm
        mode: "0600"
      notify:
        - Restart slurmdbd

    - name: Ensure slurmdbd is enabled and running
      ansible.builtin.systemd:
        name: slurmdbd
        enabled: true
        state: started

    # Apply any pending restart now so the checks below see the service as
    # it runs with the freshly deployed configuration.
    - name: Flush handlers before validation
      ansible.builtin.meta: flush_handlers

    - name: Validate slurmdbd service is active
      ansible.builtin.command:
        cmd: systemctl is-active slurmdbd
      register: slurmdbd_active
      retries: 10
      delay: 2
      until: slurmdbd_active.stdout == "active"
      changed_when: false

    - name: Validate slurmdbd is listening on port
      ansible.builtin.shell: |
        set -euo pipefail
        ss -lntp | grep ':{{ slurmdbd_port }} '
      args:
        executable: /bin/bash
      register: slurmdbd_port_check
      retries: 10
      delay: 2
      until: slurmdbd_port_check.rc == 0
      changed_when: false

    - name: Show slurmdbd service validation
      ansible.builtin.debug:
        msg:
          - "slurmdbd is active"
          - "{{ slurmdbd_port_check.stdout_lines }}"

  handlers:
    - name: Restart slurmdbd
      ansible.builtin.systemd:
        name: slurmdbd
        state: restarted
@@ -0,0 +1,178 @@
---
# End-to-end check of the accounting stack on the Slurm controller: service
# state, runtime configuration, sacctmgr entities, a real test job submitted
# as "slurmuser", sacct/sreport queries, and a summary of the accounting
# database tables. All collected output is printed by the final task.
#
# Variables read but not defined here: slurmdbd_storage_loc; slurmdbd_port is
# used when defined and falls back to 6819 otherwise.
- name: Validate Slurm accounting production-like setup
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    - name: Validate accounting services
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### services"
        systemctl is-active mariadb
        systemctl is-active slurmdbd
        systemctl is-active slurmctld
        echo
        echo "### slurmdbd listener"
        ss -lntp | grep ':{{ slurmdbd_port | default(6819) }} '
      args:
        executable: /bin/bash
      register: service_check
      changed_when: false

    - name: Validate Slurm accounting runtime config
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### accounting config"
        scontrol show config | grep -E "AccountingStorage|JobAcctGather|ClusterName"
        echo
        echo "### priority / select / cgroup config"
        scontrol show config | grep -E "SelectType|TaskPlugin|ProctrackType"
      args:
        executable: /bin/bash
      register: config_check
      changed_when: false

    - name: Validate sacctmgr entities
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### clusters"
        sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC
        echo
        echo "### accounts"
        sacctmgr list account format=Account,Descr,Org
        echo
        echo "### users"
        sacctmgr list user format=User,DefaultAccount,Admin
        echo
        echo "### associations"
        sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS
      args:
        executable: /bin/bash
      register: entity_check
      changed_when: false

    # Submits a short batch job as slurmuser, polls squeue once per second
    # (at most 90 times) until the job leaves the queue, then prints its sacct
    # record and output file. The heredoc body and its terminator must stay at
    # the block's base indentation so they reach bash at column 0.
    - name: Submit accounting validation job
      ansible.builtin.shell: |
        set -euo pipefail
        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=acct-prodlike-test
        #SBATCH --partition=debug
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/acct-prodlike-test-%j.out
        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        date
        SBATCH
        )"
        # --parsable may print "jobid;cluster"; keep only the job id.
        job_id="${job_id%%;*}"
        echo "JOB_ID=$job_id"
        for _ in $(seq 1 90); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done
        # Fail explicitly if the job is still queued or running after the wait.
        if squeue -h -j "$job_id" | grep -q .; then
          echo "Job $job_id did not leave the queue within the wait period" >&2
          exit 1
        fi
        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
        echo "### output"
        cat "/shared/acct-prodlike-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: acct_job
      changed_when: true

    - name: Validate sacct can read recent jobs
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### recent jobs"
        sacct -S today --format=JobID,JobName,User,Account,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList | tail -30
      args:
        executable: /bin/bash
      register: sacct_recent
      changed_when: false

    # Each sreport call is best-effort ("|| true"): a failing report is shown
    # in the output but does not fail the play.
    - name: Validate sreport commands
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### cluster utilization"
        sreport cluster utilization start=today || true
        echo
        echo "### account utilization by user"
        sreport cluster AccountUtilizationByUser start=today || true
        echo
        echo "### user top"
        sreport user top start=today || true
      args:
        executable: /bin/bash
      register: sreport_check
      changed_when: false

    - name: Validate MariaDB table health summary
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### database exists"
        mysql -N -B -e "SHOW DATABASES LIKE '{{ slurmdbd_storage_loc }}';"
        echo
        echo "### table count"
        mysql -N -B -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='{{ slurmdbd_storage_loc }}';"
        echo
        echo "### largest tables"
        mysql -N -B -e "
        SELECT table_name, table_rows
        FROM information_schema.tables
        WHERE table_schema='{{ slurmdbd_storage_loc }}'
        ORDER BY table_rows DESC
        LIMIT 10;
        "
      args:
        executable: /bin/bash
      register: db_health
      changed_when: false

    - name: Print accounting validation
      ansible.builtin.debug:
        msg:
          - "### services"
          - "{{ service_check.stdout_lines }}"
          - "### runtime config"
          - "{{ config_check.stdout_lines }}"
          - "### accounting entities"
          - "{{ entity_check.stdout_lines }}"
          - "### accounting validation job"
          - "{{ acct_job.stdout_lines }}"
          - "### recent sacct data"
          - "{{ sacct_recent.stdout_lines }}"
          - "### sreport"
          - "{{ sreport_check.stdout_lines }}"
          - "### database health"
          - "{{ db_health.stdout_lines }}"