Files

179 lines
5.1 KiB
YAML
Raw Permalink Normal View History

2026-06-04 19:41:05 +00:00
---
# Read-mostly smoke test for a Slurm accounting stack (MariaDB + slurmdbd +
# slurmctld) on the controller host. Every check is a bash script run with
# `set -euo pipefail`, so the first failing command fails the task.
#
# The only task that alters cluster state is the job submission, which submits
# one short batch job as `slurmuser` to the `debug` partition and writes its
# output under /shared.
#
# Required variable:
#   slurmdbd_storage_loc - name of the slurmdbd database schema to inspect.
- name: Validate Slurm accounting production-like setup
  hosts: slurm_controller
  become: true
  gather_facts: false
  tasks:
    # Fails if any of the three units is not active or nothing listens on
    # TCP port 6819.
    - name: Validate accounting services
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### services"
        systemctl is-active mariadb
        systemctl is-active slurmdbd
        systemctl is-active slurmctld
        echo
        echo "### slurmdbd listener"
        ss -lntp | grep ':6819 '
      args:
        executable: /bin/bash
      register: service_check
      changed_when: false

    # Prints the accounting-related keys of the running configuration. grep
    # exits non-zero when nothing matches, which fails the task.
    - name: Validate Slurm accounting runtime config
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### accounting config"
        scontrol show config | grep -E "AccountingStorage|JobAcctGather|ClusterName"
        echo
        echo "### priority / select / cgroup config"
        # PriorityType is included so the section shows the priority setting
        # its header announces.
        scontrol show config | grep -E "PriorityType|SelectType|TaskPlugin|ProctrackType"
      args:
        executable: /bin/bash
      register: config_check
      changed_when: false

    # Lists clusters, accounts, users and associations known to sacctmgr.
    - name: Validate sacctmgr entities
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### clusters"
        sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC
        echo
        echo "### accounts"
        sacctmgr list account format=Account,Descr,Org
        echo
        echo "### users"
        sacctmgr list user format=User,DefaultAccount,Admin
        echo
        echo "### associations"
        sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS
      args:
        executable: /bin/bash
      register: entity_check
      changed_when: false

    # Submits a tiny batch job, waits for it to finish, then requires that the
    # accounting database reports it as COMPLETED. Reported as "changed"
    # because a job is really submitted on every run.
    - name: Submit accounting validation job
      ansible.builtin.shell: |
        set -euo pipefail

        # Seconds to wait for the job to finish, and afterwards for its
        # accounting record to reach a final state.
        job_timeout=90
        acct_timeout=30

        # The heredoc body and its terminator must stay at the script's base
        # indentation: the batch script has to begin with "#!" and the
        # terminator is only recognised at the start of a line.
        job_id="$(
        sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=acct-prodlike-test
        #SBATCH --partition=debug
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/acct-prodlike-test-%j.out
        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        date
        SBATCH
        )"
        # --parsable may print "jobid;cluster"; keep only the job id.
        job_id="${job_id%%;*}"
        echo "JOB_ID=$job_id"

        # Poll the job state instead of mere presence in the squeue listing:
        # a finished job may still be listed until slurmctld purges it, and
        # squeue errors out once it has been purged. Both cases end the wait.
        job_state=""
        deadline=$((SECONDS + job_timeout))
        while :; do
          job_state="$(squeue -h -j "$job_id" -o '%T' 2>/dev/null || true)"
          case "$job_state" in
            ""|COMPLETED|FAILED|CANCELLED|TIMEOUT|NODE_FAIL|PREEMPTED|BOOT_FAIL|DEADLINE|OUT_OF_MEMORY)
              break
              ;;
          esac
          if (( SECONDS >= deadline )); then
            echo "ERROR: job $job_id is still $job_state after ${job_timeout}s" >&2
            exit 1
          fi
          echo "waiting: job $job_id state=$job_state"
          sleep 1
        done

        # Accounting records are delivered to slurmdbd asynchronously, so give
        # the record a short grace period to appear and settle.
        acct_state=""
        deadline=$((SECONDS + acct_timeout))
        while (( SECONDS < deadline )); do
          acct_state="$(sacct -j "$job_id" -X -n -P --format=State)"
          case "$acct_state" in
            ""|PENDING|RUNNING)
              sleep 1
              ;;
            *)
              break
              ;;
          esac
        done

        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
        echo "### output"
        cat "/shared/acct-prodlike-test-${job_id}.out"

        # The whole point of this task: the job must be present in accounting
        # and must have completed successfully.
        if [[ "$acct_state" != COMPLETED ]]; then
          echo "ERROR: accounting state for job $job_id is '${acct_state:-<no record>}', expected COMPLETED" >&2
          exit 1
        fi
      args:
        executable: /bin/bash
      register: acct_job
      changed_when: true

    # Shows the last 30 lines of today's accounting records.
    - name: Validate sacct can read recent jobs
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### recent jobs"
        sacct -S today --format=JobID,JobName,User,Account,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList | tail -30
      args:
        executable: /bin/bash
      register: sacct_recent
      changed_when: false

    # Best-effort: each report is allowed to fail (`|| true`) so that a
    # failing sreport call does not fail the validation run.
    - name: Validate sreport commands
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### cluster utilization"
        sreport cluster utilization start=today || true
        echo
        echo "### account utilization by user"
        sreport cluster AccountUtilizationByUser start=today || true
        echo
        echo "### user top"
        sreport user top start=today || true
      args:
        executable: /bin/bash
      register: sreport_check
      changed_when: false

    # Confirms that the accounting schema exists and summarises its tables.
    - name: Validate MariaDB table health summary
      ansible.builtin.shell: |
        set -euo pipefail
        # Render the templated schema name once, shell-quoted.
        db_name={{ slurmdbd_storage_loc | quote }}

        echo "### database exists"
        # Exact-match lookup; an empty result means the schema is missing and
        # must fail the task instead of passing silently.
        found_db="$(mysql -N -B -e "SELECT schema_name FROM information_schema.schemata WHERE schema_name='${db_name}';")"
        if [[ -z "$found_db" ]]; then
          echo "ERROR: database '${db_name}' not found" >&2
          exit 1
        fi
        echo "$found_db"
        echo
        echo "### table count"
        mysql -N -B -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='${db_name}';"
        echo
        echo "### largest tables"
        # NOTE(review): table_rows may be an estimate depending on the storage
        # engine - treat the ordering as indicative only.
        mysql -N -B -e "
          SELECT table_name, table_rows
          FROM information_schema.tables
          WHERE table_schema='${db_name}'
          ORDER BY table_rows DESC
          LIMIT 10;
        "
      args:
        executable: /bin/bash
      register: db_health
      changed_when: false

    # Collects the stdout of every check above into one report.
    - name: Print accounting validation
      ansible.builtin.debug:
        msg:
          - "### services"
          - "{{ service_check.stdout_lines }}"
          - "### runtime config"
          - "{{ config_check.stdout_lines }}"
          - "### accounting entities"
          - "{{ entity_check.stdout_lines }}"
          - "### accounting validation job"
          - "{{ acct_job.stdout_lines }}"
          - "### recent sacct data"
          - "{{ sacct_recent.stdout_lines }}"
          - "### sreport"
          - "{{ sreport_check.stdout_lines }}"
          - "### database health"
          - "{{ db_health.stdout_lines }}"