179 lines
5.1 KiB
YAML
179 lines
5.1 KiB
YAML
---
# Play: exercises the Slurm accounting stack on the controller host(s).
# Runs with privilege escalation; fact gathering is switched off because the
# tasks in this play only use registered results and inventory variables.
- name: Validate Slurm accounting production-like setup
  hosts: slurm_controller
  gather_facts: false
  become: true

  tasks:
- name: Validate accounting services
|
|
ansible.builtin.shell: |
|
|
set -euo pipefail
|
|
|
|
echo "### services"
|
|
systemctl is-active mariadb
|
|
systemctl is-active slurmdbd
|
|
systemctl is-active slurmctld
|
|
|
|
echo
|
|
echo "### slurmdbd listener"
|
|
ss -lntp | grep ':6819 '
|
|
args:
|
|
executable: /bin/bash
|
|
register: service_check
|
|
changed_when: false
|
|
|
|
- name: Validate Slurm accounting runtime config
|
|
ansible.builtin.shell: |
|
|
set -euo pipefail
|
|
|
|
echo "### accounting config"
|
|
scontrol show config | grep -E "AccountingStorage|JobAcctGather|ClusterName"
|
|
|
|
echo
|
|
echo "### priority / select / cgroup config"
|
|
scontrol show config | grep -E "SelectType|TaskPlugin|ProctrackType"
|
|
args:
|
|
executable: /bin/bash
|
|
register: config_check
|
|
changed_when: false
|
|
|
|
- name: Validate sacctmgr entities
|
|
ansible.builtin.shell: |
|
|
set -euo pipefail
|
|
|
|
echo "### clusters"
|
|
sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC
|
|
|
|
echo
|
|
echo "### accounts"
|
|
sacctmgr list account format=Account,Descr,Org
|
|
|
|
echo
|
|
echo "### users"
|
|
sacctmgr list user format=User,DefaultAccount,Admin
|
|
|
|
echo
|
|
echo "### associations"
|
|
sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS
|
|
args:
|
|
executable: /bin/bash
|
|
register: entity_check
|
|
changed_when: false
|
|
|
|
- name: Submit accounting validation job
|
|
ansible.builtin.shell: |
|
|
set -euo pipefail
|
|
|
|
job_id="$(
|
|
sudo -iu slurmuser sbatch --parsable <<'SBATCH'
|
|
#!/bin/bash
|
|
#SBATCH --job-name=acct-prodlike-test
|
|
#SBATCH --partition=debug
|
|
#SBATCH --cpus-per-task=1
|
|
#SBATCH --mem=256M
|
|
#SBATCH --time=00:02:00
|
|
#SBATCH --output=/shared/acct-prodlike-test-%j.out
|
|
|
|
echo "HOST=$(hostname)"
|
|
echo "USER=$(whoami)"
|
|
echo "SLURM_JOB_ID=$SLURM_JOB_ID"
|
|
echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
|
|
echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
|
|
date
|
|
SBATCH
|
|
)"
|
|
|
|
echo "JOB_ID=$job_id"
|
|
|
|
for i in $(seq 1 90); do
|
|
if squeue -h -j "$job_id" | grep -q .; then
|
|
squeue -j "$job_id"
|
|
sleep 1
|
|
else
|
|
break
|
|
fi
|
|
done
|
|
|
|
echo "### sacct"
|
|
sacct -j "$job_id" --format=JobID,JobName,User,Account,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
|
|
|
|
echo "### output"
|
|
cat "/shared/acct-prodlike-test-${job_id}.out"
|
|
args:
|
|
executable: /bin/bash
|
|
register: acct_job
|
|
changed_when: true
|
|
|
|
- name: Validate sacct can read recent jobs
|
|
ansible.builtin.shell: |
|
|
set -euo pipefail
|
|
|
|
echo "### recent jobs"
|
|
sacct -S today --format=JobID,JobName,User,Account,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList | tail -30
|
|
args:
|
|
executable: /bin/bash
|
|
register: sacct_recent
|
|
changed_when: false
|
|
|
|
- name: Validate sreport commands
|
|
ansible.builtin.shell: |
|
|
set -euo pipefail
|
|
|
|
echo "### cluster utilization"
|
|
sreport cluster utilization start=today || true
|
|
|
|
echo
|
|
echo "### account utilization by user"
|
|
sreport cluster AccountUtilizationByUser start=today || true
|
|
|
|
echo
|
|
echo "### user top"
|
|
sreport user top start=today || true
|
|
args:
|
|
executable: /bin/bash
|
|
register: sreport_check
|
|
changed_when: false
|
|
|
|
- name: Validate MariaDB table health summary
|
|
ansible.builtin.shell: |
|
|
set -euo pipefail
|
|
|
|
echo "### database exists"
|
|
mysql -N -B -e "SHOW DATABASES LIKE '{{ slurmdbd_storage_loc }}';"
|
|
|
|
echo
|
|
echo "### table count"
|
|
mysql -N -B -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='{{ slurmdbd_storage_loc }}';"
|
|
|
|
echo
|
|
echo "### largest tables"
|
|
mysql -N -B -e "
|
|
SELECT table_name, table_rows
|
|
FROM information_schema.tables
|
|
WHERE table_schema='{{ slurmdbd_storage_loc }}'
|
|
ORDER BY table_rows DESC
|
|
LIMIT 10;
|
|
"
|
|
args:
|
|
executable: /bin/bash
|
|
register: db_health
|
|
changed_when: false
|
|
|
|
- name: Print accounting validation
|
|
ansible.builtin.debug:
|
|
msg:
|
|
- "### services"
|
|
- "{{ service_check.stdout_lines }}"
|
|
- "### runtime config"
|
|
- "{{ config_check.stdout_lines }}"
|
|
- "### accounting entities"
|
|
- "{{ entity_check.stdout_lines }}"
|
|
- "### accounting validation job"
|
|
- "{{ acct_job.stdout_lines }}"
|
|
- "### recent sacct data"
|
|
- "{{ sacct_recent.stdout_lines }}"
|
|
- "### sreport"
|
|
- "{{ sreport_check.stdout_lines }}"
|
|
- "### database health"
|
|
- "{{ db_health.stdout_lines }}"
|