---
# Read-mostly validation play for a Slurm accounting stack
# (MariaDB + slurmdbd + slurmctld). Every task except the job submission
# only inspects state and therefore reports "ok" (changed_when: false).
# The final task prints the collected output of all checks.
#
# NOTE(review): `slurmdbd_storage_loc` is not defined in this play; it is
# presumably supplied by inventory/group vars -- confirm before running.
- name: Validate Slurm accounting production-like setup
  hosts: slurm_controller
  become: true
  gather_facts: false

  tasks:
    # Fails if any of the three units is not active, or if nothing is
    # listening on TCP port 6819 (`set -e` + `pipefail` propagate the
    # non-zero exit of systemctl / grep).
    - name: Validate accounting services
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### services"
        systemctl is-active mariadb
        systemctl is-active slurmdbd
        systemctl is-active slurmctld
        echo
        echo "### slurmdbd listener"
        ss -lntp | grep ':6819 '
      args:
        executable: /bin/bash
      register: service_check
      changed_when: false

    # Dumps the accounting-related and scheduling-related keys of the
    # running controller configuration; fails if grep matches nothing.
    - name: Validate Slurm accounting runtime config
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### accounting config"
        scontrol show config | grep -E "AccountingStorage|JobAcctGather|ClusterName"
        echo
        echo "### priority / select / cgroup config"
        scontrol show config | grep -E "SelectType|TaskPlugin|ProctrackType"
      args:
        executable: /bin/bash
      register: config_check
      changed_when: false

    # Lists clusters, accounts, users and associations known to sacctmgr.
    - name: Validate sacctmgr entities
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### clusters"
        sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC
        echo
        echo "### accounts"
        sacctmgr list account format=Account,Descr,Org
        echo
        echo "### users"
        sacctmgr list user format=User,DefaultAccount,Admin
        echo
        echo "### associations"
        sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS
      args:
        executable: /bin/bash
      register: entity_check
      changed_when: false

    # Submits a tiny batch job as `slurmuser`, waits for it to leave the
    # queue, then prints its sacct record and its output file.
    #
    # The heredoc body and its terminator must stay at the base indentation
    # of this block scalar so that, after YAML strips the common indent,
    # `#!/bin/bash` is the first line of the script and `SBATCH` sits at
    # column 0. The delimiter is quoted ('SBATCH') so that $(...) and $VAR
    # are expanded by the job, not by this wrapper shell.
    - name: Submit accounting validation job
      ansible.builtin.shell: |
        set -euo pipefail

        submit_out="$(
          sudo -iu slurmuser sbatch --parsable <<'SBATCH'
        #!/bin/bash
        #SBATCH --job-name=acct-prodlike-test
        #SBATCH --partition=debug
        #SBATCH --cpus-per-task=1
        #SBATCH --mem=256M
        #SBATCH --time=00:02:00
        #SBATCH --output=/shared/acct-prodlike-test-%j.out
        echo "HOST=$(hostname)"
        echo "USER=$(whoami)"
        echo "SLURM_JOB_ID=$SLURM_JOB_ID"
        echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
        echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)"
        date
        SBATCH
        )"

        # `sbatch --parsable` prints "jobid" or "jobid;cluster"; keep only
        # the id so squeue/sacct and the output path receive a clean value.
        job_id="${submit_out%%;*}"
        if [[ ! "$job_id" =~ ^[0-9]+$ ]]; then
          echo "ERROR: unexpected sbatch output: ${submit_out}" >&2
          exit 1
        fi
        echo "JOB_ID=$job_id"

        # Poll once per second, at most 90 times, while squeue still
        # prints a row for the job.
        for _ in $(seq 1 90); do
          if squeue -h -j "$job_id" | grep -q .; then
            squeue -j "$job_id"
            sleep 1
          else
            break
          fi
        done

        # If the job is still listed after the wait window, stop here with
        # a clear message instead of failing later on a missing output file.
        if squeue -h -j "$job_id" | grep -q .; then
          echo "ERROR: job ${job_id} did not leave the queue within the wait window" >&2
          squeue -j "$job_id" >&2 || true
          exit 1
        fi

        echo "### sacct"
        sacct -j "$job_id" --format=JobID,JobName,User,Account,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList
        echo "### output"
        cat "/shared/acct-prodlike-test-${job_id}.out"
      args:
        executable: /bin/bash
      register: acct_job
      # Submitting a job is a real side effect, so always report "changed".
      changed_when: true

    # Shows the last 30 lines of today's sacct listing.
    - name: Validate sacct can read recent jobs
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### recent jobs"
        sacct -S today --format=JobID,JobName,User,Account,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList | tail -30
      args:
        executable: /bin/bash
      register: sacct_recent
      changed_when: false

    # Best-effort: each sreport call is followed by `|| true`, so a failing
    # report is shown in the output but does not fail the task.
    - name: Validate sreport commands
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### cluster utilization"
        sreport cluster utilization start=today || true
        echo
        echo "### account utilization by user"
        sreport cluster AccountUtilizationByUser start=today || true
        echo
        echo "### user top"
        sreport user top start=today || true
      args:
        executable: /bin/bash
      register: sreport_check
      changed_when: false

    # Queries information_schema for the schema named by
    # `slurmdbd_storage_loc`: existence, table count, and the ten tables
    # with the highest table_rows value.
    - name: Validate MariaDB table health summary
      ansible.builtin.shell: |
        set -euo pipefail
        echo "### database exists"
        mysql -N -B -e "SHOW DATABASES LIKE '{{ slurmdbd_storage_loc }}';"
        echo
        echo "### table count"
        mysql -N -B -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='{{ slurmdbd_storage_loc }}';"
        echo
        echo "### largest tables"
        mysql -N -B -e "
          SELECT table_name, table_rows
          FROM information_schema.tables
          WHERE table_schema='{{ slurmdbd_storage_loc }}'
          ORDER BY table_rows DESC
          LIMIT 10;
        "
      args:
        executable: /bin/bash
      register: db_health
      changed_when: false

    # Prints the stdout of every check above, each under its own heading.
    - name: Print accounting validation
      ansible.builtin.debug:
        msg:
          - "### services"
          - "{{ service_check.stdout_lines }}"
          - "### runtime config"
          - "{{ config_check.stdout_lines }}"
          - "### accounting entities"
          - "{{ entity_check.stdout_lines }}"
          - "### accounting validation job"
          - "{{ acct_job.stdout_lines }}"
          - "### recent sacct data"
          - "{{ sacct_recent.stdout_lines }}"
          - "### sreport"
          - "{{ sreport_check.stdout_lines }}"
          - "### database health"
          - "{{ db_health.stdout_lines }}"