This commit is contained in:
@@ -0,0 +1,90 @@
|
||||
---
# Back up the SlurmDBD accounting database from the Slurm controller.
#
# The play dumps the database named by `slurmdbd_storage_loc` (not defined in
# this play; expected from inventory or group vars) into a timestamped,
# gzip-compressed file under `slurmdbd_backup_dir`, sanity-checks its size,
# and fetches a copy into `local_fetch_dir` on the Ansible controller.
- name: Backup SlurmDBD MariaDB database
  hosts: slurm_controller
  become: true
  gather_facts: true

  vars:
    slurmdbd_backup_dir: /var/backups/slurmdbd
    local_fetch_dir: "{{ playbook_dir }}/../../artifacts/backups/slurmdbd"

  tasks:
    - name: Create remote backup directory
      ansible.builtin.file:
        path: "{{ slurmdbd_backup_dir }}"
        state: directory
        owner: root
        group: root
        mode: "0700"

    # This task runs unprivileged on the controller (become: false), so the
    # directory is left owned by the invoking user. Forcing owner/group root
    # here would fail for any operator who is not already root.
    - name: Create local fetch directory on Ansible controller
      ansible.builtin.file:
        path: "{{ local_fetch_dir }}"
        state: directory
        mode: "0700"
      delegate_to: localhost
      become: false

    # `systemctl is-active` exits non-zero unless the unit is active, which
    # fails the task and stops the play before any dump is attempted.
    - name: Validate MariaDB is running
      ansible.builtin.command:
        cmd: systemctl is-active mariadb
      changed_when: false

    - name: Validate SlurmDBD is running
      ansible.builtin.command:
        cmd: systemctl is-active slurmdbd
      changed_when: false

    # grep -x requires a whole-line match, so a database whose name merely
    # contains the configured name does not satisfy the check.
    - name: Validate Slurm accounting database exists
      ansible.builtin.shell: |
        set -euo pipefail
        mysql -N -B -e "SHOW DATABASES LIKE '{{ slurmdbd_storage_loc }}';" | grep -qx "{{ slurmdbd_storage_loc }}"
      args:
        executable: /bin/bash
      changed_when: false

    # The dump is written to a ".partial" file and renamed only after the
    # whole mysqldump | gzip pipeline succeeded, so a failed run never leaves
    # a truncated file under the final "*.sql.gz" name. The final path is the
    # only thing printed on stdout; later tasks read it from db_dump.stdout.
    - name: Dump Slurm accounting database
      ansible.builtin.shell: |
        set -euo pipefail
        # Create the dump file private from the start, not only after chmod.
        umask 077

        ts="$(date +%F-%H%M%S)"
        out="{{ slurmdbd_backup_dir }}/{{ slurmdbd_storage_loc }}-${ts}.sql.gz"
        tmp="${out}.partial"
        # Remove the partial file on any exit; a no-op after a successful mv.
        trap 'rm -f "$tmp"' EXIT

        mysqldump \
          --single-transaction \
          --routines \
          --events \
          --triggers \
          {{ slurmdbd_storage_loc | quote }} | gzip -9 > "$tmp"

        chmod 0600 "$tmp"
        mv "$tmp" "$out"
        echo "$out"
      args:
        executable: /bin/bash
      register: db_dump
      changed_when: true

    - name: Validate backup file is non-empty
      ansible.builtin.stat:
        path: "{{ db_dump.stdout }}"
      register: backup_file

    # Anything below 1024 bytes is treated as an empty/failed dump.
    - name: Fail if backup file is empty
      ansible.builtin.fail:
        msg: "Backup file is empty: {{ db_dump.stdout }}"
      when: backup_file.stat.size | int < 1024

    # flat: true stores the file directly in local_fetch_dir instead of under
    # a per-host directory tree.
    - name: Fetch DB backup to Ansible controller
      ansible.builtin.fetch:
        src: "{{ db_dump.stdout }}"
        dest: "{{ local_fetch_dir }}/"
        flat: true

    - name: Show DB backup result
      ansible.builtin.debug:
        msg:
          - "Remote backup: {{ db_dump.stdout }}"
          - "Backup size bytes: {{ backup_file.stat.size }}"
          - "Fetched to: {{ local_fetch_dir }}/"
|
||||
+126
@@ -0,0 +1,126 @@
|
||||
---
# Bootstrap the Slurm accounting hierarchy on the controller: make sure the
# cluster, the lab account, and the `slurmuser` association exist, printing
# the accounting state before and after.
#
# Variables read but not defined in this play (expected from inventory or
# group vars): slurm_cluster_name, slurm_account_name,
# slurm_account_description, slurm_account_organization.
- name: Initialize Slurm accounting entities
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # Retries for up to 20 x 2 s until sacctmgr exits 0.
    - name: Wait for sacctmgr connectivity
      ansible.builtin.command:
        cmd: sacctmgr -n list cluster
      register: sacctmgr_cluster_list
      retries: 20
      delay: 2
      until: sacctmgr_cluster_list.rc == 0
      changed_when: false

    - name: Show current accounting state before changes
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### clusters"
        sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC

        echo
        echo "### accounts"
        sacctmgr list account format=Account,Descr,Org

        echo
        echo "### users"
        sacctmgr list user format=User,DefaultAccount,Admin

        echo
        echo "### associations"
        sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS
      args:
        executable: /bin/bash
      register: accounting_state_before
      changed_when: false

    - name: Print current accounting state before changes
      ansible.builtin.debug:
        var: accounting_state_before.stdout_lines

    # The existence checks below use `-P` (parsable, '|'-delimited) output.
    # The default tabular output is fixed-width and truncates long names, so
    # an exact comparison against it can miss an entity that already exists.
    - name: Ensure Slurm cluster exists in accounting DB
      ansible.builtin.shell: |
        set -euo pipefail

        if sacctmgr -n -P list cluster format=Cluster | grep -qxF -- {{ slurm_cluster_name | quote }}; then
          echo "Cluster {{ slurm_cluster_name }} already exists"
        else
          sacctmgr -i add cluster {{ slurm_cluster_name }}
        fi
      args:
        executable: /bin/bash
      register: ensure_cluster
      # Only the `add` branch prints sacctmgr's "Adding Cluster" banner.
      changed_when: "'Adding Cluster' in ensure_cluster.stdout"

    # An account-level association is the row with the matching cluster and
    # account and an empty User field.
    - name: Ensure default lab account exists for cluster
      ansible.builtin.shell: |
        set -euo pipefail

        if sacctmgr -n -P list assoc format=Cluster,Account,User \
            | awk -F'|' \
                -v cluster={{ slurm_cluster_name | quote }} \
                -v account={{ slurm_account_name | quote }} \
                '$1 == cluster && $2 == account && $3 == "" {found=1} END {exit !found}'; then
          echo "Account {{ slurm_account_name }} already associated with cluster {{ slurm_cluster_name }}"
        else
          sacctmgr -i add account {{ slurm_account_name }} \
            Cluster={{ slurm_cluster_name }} \
            Description="{{ slurm_account_description }}" \
            Organization="{{ slurm_account_organization }}"
        fi
      args:
        executable: /bin/bash
      register: ensure_account
      changed_when: "'Adding Account' in ensure_account.stdout"

    - name: Ensure slurmuser exists with lab account association
      ansible.builtin.shell: |
        set -euo pipefail

        if sacctmgr -n -P list assoc format=Cluster,Account,User \
            | awk -F'|' \
                -v cluster={{ slurm_cluster_name | quote }} \
                -v account={{ slurm_account_name | quote }} \
                '$1 == cluster && $2 == account && $3 == "slurmuser" {found=1} END {exit !found}'; then
          echo "User slurmuser already associated with account {{ slurm_account_name }} on cluster {{ slurm_cluster_name }}"
        else
          sacctmgr -i add user slurmuser \
            Cluster={{ slurm_cluster_name }} \
            Account={{ slurm_account_name }} \
            DefaultAccount={{ slurm_account_name }}
        fi
      args:
        executable: /bin/bash
      register: ensure_user_assoc
      changed_when: "'Adding User' in ensure_user_assoc.stdout"

    # Runs on every invocation; "changed" is reported only when sacctmgr says
    # it modified something (checked in both stdout and stderr).
    - name: Ensure slurmuser has default account set
      ansible.builtin.shell: |
        set -euo pipefail
        sacctmgr -i modify user where name=slurmuser set DefaultAccount={{ slurm_account_name }}
      args:
        executable: /bin/bash
      register: set_default_account
      changed_when: "'Modified user' in (set_default_account.stdout + set_default_account.stderr)"

    - name: Show final accounting state
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### clusters"
        sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC

        echo
        echo "### accounts"
        sacctmgr list account format=Account,Descr,Org

        echo
        echo "### users"
        sacctmgr list user format=User,DefaultAccount,Admin

        echo
        echo "### associations"
        sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS
      args:
        executable: /bin/bash
      register: accounting_state_after
      changed_when: false

    - name: Print final accounting state
      ansible.builtin.debug:
        var: accounting_state_after.stdout_lines
|
||||
+98
@@ -0,0 +1,98 @@
|
||||
---
# Prove that the newest SlurmDBD backup is restorable: locate the most recent
# dump in `slurmdbd_backup_dir`, import it into a scratch database
# (`restore_check_db`), and report what was restored.
#
# `slurmdbd_storage_loc` is not defined in this play; it is expected from
# inventory or group vars. The scratch database is dropped and recreated on
# every run and is left in place afterwards.
- name: Restore-check latest SlurmDBD backup into test database
  hosts: slurm_controller
  become: true
  gather_facts: false

  vars:
    restore_check_db: "{{ slurmdbd_storage_loc }}_restorecheck"
    slurmdbd_backup_dir: /var/backups/slurmdbd

  tasks:
    - name: Validate MariaDB is running
      ansible.builtin.command:
        cmd: systemctl is-active mariadb
      changed_when: false

    # The find module is used instead of parsing `ls -1t | head`: it yields an
    # empty list rather than a shell error when nothing matches, so the play
    # can report a clear message, and it already returns each file's size and
    # mtime.
    - name: Find SlurmDBD backups
      ansible.builtin.find:
        paths: "{{ slurmdbd_backup_dir }}"
        patterns: "{{ slurmdbd_storage_loc }}-*.sql.gz"
        file_type: file
      register: backup_candidates

    - name: Fail if no SlurmDBD backup exists
      ansible.builtin.fail:
        msg: "No SlurmDBD backup found matching {{ slurmdbd_backup_dir }}/{{ slurmdbd_storage_loc }}-*.sql.gz"
      when: backup_candidates.files | length == 0

    # Newest by modification time; latest_backup carries path, size and mtime.
    - name: Select latest SlurmDBD backup
      ansible.builtin.set_fact:
        latest_backup: "{{ backup_candidates.files | sort(attribute='mtime') | last }}"

    # Anything below 1024 bytes is treated as an empty/failed dump.
    - name: Fail if latest backup is missing or empty
      ansible.builtin.fail:
        msg: "Latest SlurmDBD backup is missing or empty: {{ latest_backup.path }}"
      when: latest_backup.size | int < 1024

    # Check the archive before touching the scratch database, so a corrupt
    # file is reported as such rather than as a half-finished import.
    - name: Verify backup archive integrity
      ansible.builtin.command:
        argv:
          - gzip
          - "-t"
          - "{{ latest_backup.path }}"
      changed_when: false

    - name: Recreate restore-check database
      ansible.builtin.shell: |
        set -euo pipefail
        mysql <<SQL
        DROP DATABASE IF EXISTS {{ restore_check_db }};
        CREATE DATABASE {{ restore_check_db }};
        SQL
      args:
        executable: /bin/bash
      changed_when: true

    - name: Import backup into restore-check database
      ansible.builtin.shell: |
        set -euo pipefail
        zcat {{ latest_backup.path | quote }} | mysql {{ restore_check_db | quote }}
      args:
        executable: /bin/bash
      changed_when: true

    # failed_when replaces the default rc check; if mysql prints nothing, the
    # empty stdout converts to 0 and the task still fails.
    - name: Validate restored table count
      ansible.builtin.shell: |
        set -euo pipefail
        mysql -N -B -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='{{ restore_check_db }}';"
      args:
        executable: /bin/bash
      register: restored_tables
      changed_when: false
      failed_when: restored_tables.stdout | int < 1

    # Informational summary only; row counts are read from
    # information_schema.tables.table_rows.
    - name: Validate restored row count sample
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### restored database"
        echo "{{ restore_check_db }}"

        echo
        echo "### table count"
        mysql -N -B -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='{{ restore_check_db }}';"

        echo
        echo "### largest tables"
        mysql -N -B -e "
        SELECT table_name, table_rows
        FROM information_schema.tables
        WHERE table_schema='{{ restore_check_db }}'
        ORDER BY table_rows DESC
        LIMIT 10;
        "
      args:
        executable: /bin/bash
      register: restore_check_summary
      changed_when: false

    - name: Show restore-check result
      ansible.builtin.debug:
        msg:
          - "Imported backup: {{ latest_backup.path }}"
          - "Restore-check DB: {{ restore_check_db }}"
          - "Restored tables: {{ restored_tables.stdout }}"
          - "Summary:"
          - "{{ restore_check_summary.stdout_lines }}"
|
||||
@@ -0,0 +1,105 @@
|
||||
---
# Install MariaDB and SlurmDBD on the Slurm controller, create the accounting
# database and its DB user, deploy slurmdbd.conf, and verify that slurmdbd is
# active and listening.
#
# Variables read but not defined in this play (expected from inventory or
# group vars): slurmdbd_storage_loc, slurmdbd_storage_user,
# slurmdbd_storage_pass, slurmdbd_port.
- name: Install and configure MariaDB for SlurmDBD
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    - name: Install MariaDB and SlurmDBD packages
      ansible.builtin.apt:
        name:
          - mariadb-server
          - mariadb-client
          - slurmdbd
          - slurm-wlm-mysql-plugin
        state: present
        update_cache: true

    - name: Ensure MariaDB is enabled and running
      ansible.builtin.systemd:
        name: mariadb
        enabled: true
        state: started

    - name: Ensure Slurm log directory exists
      ansible.builtin.file:
        path: /var/log/slurm
        state: directory
        owner: slurm
        group: slurm
        mode: "0755"

    # The script embeds the DB password, so no_log keeps the rendered command
    # and its result out of Ansible's output and logs.
    # The heredoc delimiter is quoted ('SQL') so bash passes the SQL through
    # verbatim; unquoted, any `$` or backtick in the password would be
    # expanded by the shell before mysql sees it. Jinja substitution still
    # happens beforehand, on the controller.
    # NOTE(review): a password containing a single quote or backslash would
    # still need SQL escaping - confirm the password policy rules these out.
    - name: Create Slurm accounting database and DB user
      ansible.builtin.shell: |
        set -euo pipefail
        mysql <<'SQL'
        CREATE DATABASE IF NOT EXISTS {{ slurmdbd_storage_loc }};
        CREATE USER IF NOT EXISTS '{{ slurmdbd_storage_user }}'@'localhost' IDENTIFIED BY '{{ slurmdbd_storage_pass }}';
        CREATE USER IF NOT EXISTS '{{ slurmdbd_storage_user }}'@'127.0.0.1' IDENTIFIED BY '{{ slurmdbd_storage_pass }}';
        GRANT ALL PRIVILEGES ON {{ slurmdbd_storage_loc }}.* TO '{{ slurmdbd_storage_user }}'@'localhost';
        GRANT ALL PRIVILEGES ON {{ slurmdbd_storage_loc }}.* TO '{{ slurmdbd_storage_user }}'@'127.0.0.1';
        FLUSH PRIVILEGES;
        SQL
      args:
        executable: /bin/bash
      changed_when: true
      no_log: true

    - name: Ensure /etc/slurm exists
      ansible.builtin.file:
        path: /etc/slurm
        state: directory
        owner: root
        group: root
        mode: "0755"

    # Mode 0600 and owner slurm: only the slurm user can read the file.
    - name: Deploy slurmdbd.conf
      ansible.builtin.template:
        src: ../../templates/slurmdbd.conf.j2
        dest: /etc/slurm/slurmdbd.conf
        owner: slurm
        group: slurm
        mode: "0600"
      notify:
        - Restart slurmdbd

    - name: Ensure slurmdbd is enabled and running
      ansible.builtin.systemd:
        name: slurmdbd
        enabled: true
        state: started

    # Run a pending "Restart slurmdbd" now so the checks below observe the
    # daemon with the freshly deployed configuration.
    - name: Flush handlers before validation
      ansible.builtin.meta: flush_handlers

    # Retries for up to 10 x 2 s until systemctl reports "active".
    - name: Validate slurmdbd service is active
      ansible.builtin.command:
        cmd: systemctl is-active slurmdbd
      register: slurmdbd_active
      retries: 10
      delay: 2
      until: slurmdbd_active.stdout == "active"
      changed_when: false

    # The trailing space in the grep pattern anchors the port number, so a
    # longer port that merely starts with the same digits does not match.
    - name: Validate slurmdbd is listening on port
      ansible.builtin.shell: |
        set -euo pipefail
        ss -lntp | grep ':{{ slurmdbd_port }} '
      args:
        executable: /bin/bash
      register: slurmdbd_port_check
      retries: 10
      delay: 2
      until: slurmdbd_port_check.rc == 0
      changed_when: false

    - name: Show slurmdbd service validation
      ansible.builtin.debug:
        msg:
          - "slurmdbd is active"
          - "{{ slurmdbd_port_check.stdout_lines }}"

  handlers:
    - name: Restart slurmdbd
      ansible.builtin.systemd:
        name: slurmdbd
        state: restarted
|
||||
+178
@@ -0,0 +1,178 @@
|
||||
---
# End-to-end validation of the Slurm accounting stack on the controller:
# service state, runtime configuration, sacctmgr entities, a real test job
# submitted as `slurmuser`, sacct/sreport queries, and a database summary.
# All collected output is printed by the final debug task.
#
# `slurmdbd_storage_loc` is not defined in this play; it is expected from
# inventory or group vars. `slurmdbd_port` is optional and defaults to 6819.
- name: Validate Slurm accounting production-like setup
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # The listener check follows `slurmdbd_port` when it is defined, falling
    # back to the previously hard-coded 6819, so a non-default port does not
    # fail this validation.
    - name: Validate accounting services
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### services"
        systemctl is-active mariadb
        systemctl is-active slurmdbd
        systemctl is-active slurmctld

        echo
        echo "### slurmdbd listener"
        ss -lntp | grep ':{{ slurmdbd_port | default(6819) }} '
      args:
        executable: /bin/bash
      register: service_check
      changed_when: false

    # Under pipefail, a grep that matches nothing fails the task.
    - name: Validate Slurm accounting runtime config
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### accounting config"
        scontrol show config | grep -E "AccountingStorage|JobAcctGather|ClusterName"

        echo
        echo "### priority / select / cgroup config"
        scontrol show config | grep -E "SelectType|TaskPlugin|ProctrackType"
      args:
        executable: /bin/bash
      register: config_check
      changed_when: false

    - name: Validate sacctmgr entities
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### clusters"
        sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC

        echo
        echo "### accounts"
        sacctmgr list account format=Account,Descr,Org

        echo
        echo "### users"
        sacctmgr list user format=User,DefaultAccount,Admin

        echo
        echo "### associations"
        sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS
      args:
        executable: /bin/bash
      register: entity_check
      changed_when: false

    # Submits a short batch job as slurmuser, polls squeue for up to 90
    # iterations (1 s sleep each) while it still lists the job, then prints
    # the sacct record and the job's output file. The heredoc body and its
    # terminator sit at the script's left margin: the terminator must start
    # its line, and the batch script must begin with the "#!" line.
    # NOTE(review): the loop does not fail on timeout and the job's final
    # State is not asserted; only a missing output file fails the task -
    # confirm this is intended.
    - name: Submit accounting validation job
      ansible.builtin.shell: |
        set -euo pipefail

        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=acct-prodlike-test
        #SBATCH --partition=debug
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/acct-prodlike-test-%j.out

        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        date
        SBATCH
        )"

        echo "JOB_ID=$job_id"

        for _ in $(seq 1 90); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done

        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList

        echo "### output"
        cat "/shared/acct-prodlike-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: acct_job
      changed_when: true

    - name: Validate sacct can read recent jobs
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### recent jobs"
        sacct -S today --format=JobID,JobName,User,Account,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList | tail -30
      args:
        executable: /bin/bash
      register: sacct_recent
      changed_when: false

    # Each sreport call is best-effort: `|| true` keeps a failing report from
    # failing the task.
    - name: Validate sreport commands
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### cluster utilization"
        sreport cluster utilization start=today || true

        echo
        echo "### account utilization by user"
        sreport cluster AccountUtilizationByUser start=today || true

        echo
        echo "### user top"
        sreport user top start=today || true
      args:
        executable: /bin/bash
      register: sreport_check
      changed_when: false

    # Informational summary only; row counts are read from
    # information_schema.tables.table_rows.
    - name: Validate MariaDB table health summary
      ansible.builtin.shell: |
        set -euo pipefail

        echo "### database exists"
        mysql -N -B -e "SHOW DATABASES LIKE '{{ slurmdbd_storage_loc }}';"

        echo
        echo "### table count"
        mysql -N -B -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='{{ slurmdbd_storage_loc }}';"

        echo
        echo "### largest tables"
        mysql -N -B -e "
        SELECT table_name, table_rows
        FROM information_schema.tables
        WHERE table_schema='{{ slurmdbd_storage_loc }}'
        ORDER BY table_rows DESC
        LIMIT 10;
        "
      args:
        executable: /bin/bash
      register: db_health
      changed_when: false

    - name: Print accounting validation
      ansible.builtin.debug:
        msg:
          - "### services"
          - "{{ service_check.stdout_lines }}"
          - "### runtime config"
          - "{{ config_check.stdout_lines }}"
          - "### accounting entities"
          - "{{ entity_check.stdout_lines }}"
          - "### accounting validation job"
          - "{{ acct_job.stdout_lines }}"
          - "### recent sacct data"
          - "{{ sacct_recent.stdout_lines }}"
          - "### sreport"
          - "{{ sreport_check.stdout_lines }}"
          - "### database health"
          - "{{ db_health.stdout_lines }}"
|
||||
Reference in New Issue
Block a user