From d300d490f5064f323856f070167a89639768d349 Mon Sep 17 00:00:00 2001 From: Mateusz Suski Date: Thu, 4 Jun 2026 19:41:05 +0000 Subject: [PATCH] Add Slurm AI/HPC cluster platform project --- .../hpc-slurm-ai-cluster/README.md | 59 +++++ .../hpc-slurm-ai-cluster/ansible.cfg | 14 + .../hpc-slurm-ai-cluster/artifacts/README.md | 1 + .../docs/interview-cheatsheet.md | 22 ++ .../hpc-slurm-ai-cluster/docs/runbook.md | 62 +++++ .../docs/troubleshooting-cases.md | 28 ++ .../lab/group_vars/slurm_cluster.yml | 128 +++++++++ .../lab/group_vars/vault.example.yml | 5 + .../inventories/lab/inventory.yml | 24 ++ .../playbooks/accounting/backup-slurmdbd.yml | 90 +++++++ .../initialize-slurm-accounting.yml | 126 +++++++++ .../accounting/restore-check-slurmdbd.yml | 98 +++++++ .../playbooks/accounting/setup-slurmdbd.yml | 105 ++++++++ .../accounting/validate-slurm-accounting.yml | 178 +++++++++++++ .../playbooks/backup/backup-slurm-state.yml | 83 ++++++ .../playbooks/backup/fetch-slurm-backups.yml | 46 ++++ .../playbooks/bootstrap/bootstrap-ansible.yml | 58 +++++ .../playbooks/bootstrap/slurm-hosts.yml | 16 ++ .../bootstrap/slurmuser-ssh-mesh.yml | 218 ++++++++++++++++ .../bootstrap/slurmuser-sudoers-fix.yml | 112 ++++++++ .../playbooks/core/manage-munge.yml | 133 ++++++++++ .../playbooks/core/manage-slurm-config.yml | 132 ++++++++++ .../playbooks/core/restart-slurm-safe.yml | 103 ++++++++ .../discovery/discover-slurm-resources.yml | 40 +++ .../discovery/inspect-slurm-state.yml | 89 +++++++ .../health/auto-remediate-slurm-health.yml | 216 +++++++++++++++ .../playbooks/health/check-slurm-health.yml | 149 +++++++++++ .../playbooks/health/repair-slurm-node.yml | 217 +++++++++++++++ .../lifecycle/decommission-slurm-node.yml | 126 +++++++++ .../lifecycle/provision-slurm-node.yml | 246 ++++++++++++++++++ .../playbooks/lifecycle/show-slurm-node.yml | 33 +++ .../playbooks/qos/configure-slurm-qos.yml | 169 ++++++++++++ .../qos/validate-slurm-qos-priority.yml | 235 +++++++++++++++++ 
.../tests/test-cgroup-cpu-gpu-node.yml | 59 +++++ .../playbooks/tests/test-cpu-job.yml | 60 +++++ .../tests/test-gpu-deny-without-gres.yml | 58 +++++ .../playbooks/tests/test-gpu-job.yml | 70 +++++ .../playbooks/tests/test-specific-node.yml | 95 +++++++ .../playbooks/tests/test-sreport-usage.yml | 60 +++++ .../tests/validate-slurm-operator.yml | 140 ++++++++++ .../upgrade/canary-slurm-node-upgrade.yml | 236 +++++++++++++++++ .../upgrade/rolling-upgrade-slurm-workers.yml | 197 ++++++++++++++ .../upgrade/upgrade-slurm-controller.yml | 94 +++++++ .../upgrade/validate-after-os-upgrade.yml | 207 +++++++++++++++ .../prompts/codex/repo-documentation.md | 15 ++ .../templates/cgroup.conf.j2 | 16 ++ .../templates/gres.conf.j2 | 4 + .../templates/slurm.conf.j2 | 67 +++++ .../templates/slurmdbd.conf.j2 | 38 +++ 49 files changed, 4777 insertions(+) create mode 100644 platform-projects/hpc-slurm-ai-cluster/README.md create mode 100644 platform-projects/hpc-slurm-ai-cluster/ansible.cfg create mode 100644 platform-projects/hpc-slurm-ai-cluster/artifacts/README.md create mode 100644 platform-projects/hpc-slurm-ai-cluster/docs/interview-cheatsheet.md create mode 100644 platform-projects/hpc-slurm-ai-cluster/docs/runbook.md create mode 100644 platform-projects/hpc-slurm-ai-cluster/docs/troubleshooting-cases.md create mode 100644 platform-projects/hpc-slurm-ai-cluster/inventories/lab/group_vars/slurm_cluster.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/inventories/lab/group_vars/vault.example.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/inventories/lab/inventory.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/accounting/backup-slurmdbd.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/accounting/initialize-slurm-accounting.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/accounting/restore-check-slurmdbd.yml create mode 100644 
platform-projects/hpc-slurm-ai-cluster/playbooks/accounting/setup-slurmdbd.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/accounting/validate-slurm-accounting.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/backup/backup-slurm-state.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/backup/fetch-slurm-backups.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/bootstrap/bootstrap-ansible.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/bootstrap/slurm-hosts.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/bootstrap/slurmuser-ssh-mesh.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/bootstrap/slurmuser-sudoers-fix.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/core/manage-munge.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/core/manage-slurm-config.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/core/restart-slurm-safe.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/discovery/discover-slurm-resources.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/discovery/inspect-slurm-state.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/health/auto-remediate-slurm-health.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/health/check-slurm-health.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/health/repair-slurm-node.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/lifecycle/decommission-slurm-node.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/lifecycle/provision-slurm-node.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/lifecycle/show-slurm-node.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/qos/configure-slurm-qos.yml create mode 100644 
platform-projects/hpc-slurm-ai-cluster/playbooks/qos/validate-slurm-qos-priority.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-cgroup-cpu-gpu-node.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-cpu-job.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-gpu-deny-without-gres.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-gpu-job.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-specific-node.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-sreport-usage.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/tests/validate-slurm-operator.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/upgrade/canary-slurm-node-upgrade.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/upgrade/rolling-upgrade-slurm-workers.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/upgrade/upgrade-slurm-controller.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/playbooks/upgrade/validate-after-os-upgrade.yml create mode 100644 platform-projects/hpc-slurm-ai-cluster/prompts/codex/repo-documentation.md create mode 100644 platform-projects/hpc-slurm-ai-cluster/templates/cgroup.conf.j2 create mode 100644 platform-projects/hpc-slurm-ai-cluster/templates/gres.conf.j2 create mode 100644 platform-projects/hpc-slurm-ai-cluster/templates/slurm.conf.j2 create mode 100644 platform-projects/hpc-slurm-ai-cluster/templates/slurmdbd.conf.j2 diff --git a/platform-projects/hpc-slurm-ai-cluster/README.md b/platform-projects/hpc-slurm-ai-cluster/README.md new file mode 100644 index 0000000..61dab47 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/README.md @@ -0,0 +1,59 @@ +# Ansible Slurm AI/HPC Lab + +Ansible automation for a small Slurm AI/HPC lab with CPU nodes, a GPU node, Munge, cgroups, GRES, 
SlurmDBD accounting, QOS/fairshare, node lifecycle workflows, rolling OS upgrades and health remediation. + +This repository is sanitized for publication. Replace the example inventory values under `inventories/lab/` with your own hostnames, IP addresses and users before running it. + +## What this lab covers + +- Slurm controller and worker configuration +- Munge key distribution +- GPU GRES configuration +- cgroup CPU/GPU/device enforcement +- SlurmDBD + MariaDB accounting +- `sacct`, `sreport`, `sacctmgr` validation +- QOS, limits, fairshare and priority/multifactor +- Node provisioning and decommissioning +- Rolling OS upgrades with canary validation +- Health checks and node auto-remediation + +## Repository layout + +```text +inventories/lab/ Example inventory and group variables +templates/ Slurm, cgroup, gres and slurmdbd templates +playbooks/bootstrap/ Initial SSH, sudo and /etc/hosts setup +playbooks/core/ Munge, Slurm config and safe restart workflows +playbooks/accounting/ SlurmDBD, backup/restore-check and accounting validation +playbooks/qos/ QOS, fairshare and priority configuration +playbooks/lifecycle/ Provisioning and decommissioning nodes +playbooks/upgrade/ Rolling OS upgrade and canary workflow +playbooks/health/ Health checks and auto-remediation +playbooks/tests/ CPU/GPU/cgroup/accounting validation jobs +playbooks/backup/ Slurm config backup helpers +docs/ Runbooks and interview notes +prompts/codex/ Prompts for generating or expanding documentation +``` + +## Quick start + +1. Edit `inventories/lab/inventory.yml`. +2. Edit `inventories/lab/group_vars/slurm_cluster.yml`. +3. Create and encrypt a vault file for database credentials: + +```bash +cp inventories/lab/group_vars/vault.example.yml inventories/lab/group_vars/vault.yml +ansible-vault encrypt inventories/lab/group_vars/vault.yml +``` + +4. Run syntax checks: + +```bash +find playbooks -name '*.yml' -print0 | xargs -0 -n1 ansible-playbook --syntax-check +``` + +5. 
Run the bootstrap/core workflows in the order described in `docs/runbook.md`. + +## Security notes + +Do not commit real inventories, backup archives, SQL dumps, Munge keys, private SSH keys or Ansible Vault files. This repository intentionally excludes generated backup artifacts. diff --git a/platform-projects/hpc-slurm-ai-cluster/ansible.cfg b/platform-projects/hpc-slurm-ai-cluster/ansible.cfg new file mode 100644 index 0000000..7f03bd9 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/ansible.cfg @@ -0,0 +1,14 @@ +[defaults] +inventory = ./inventories/lab/inventory.yml +host_key_checking = False +retry_files_enabled = False +stdout_callback = default +result_format = yaml +interpreter_python = auto_silent +timeout = 30 +roles_path = ./roles +collections_path = ./collections + +[ssh_connection] +pipelining = True +ssh_args = -o ControlMaster=auto -o ControlPersist=60s diff --git a/platform-projects/hpc-slurm-ai-cluster/artifacts/README.md b/platform-projects/hpc-slurm-ai-cluster/artifacts/README.md new file mode 100644 index 0000000..63101e9 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/artifacts/README.md @@ -0,0 +1 @@ +Generated backups and reports can be stored here locally. This directory is ignored by git. diff --git a/platform-projects/hpc-slurm-ai-cluster/docs/interview-cheatsheet.md b/platform-projects/hpc-slurm-ai-cluster/docs/interview-cheatsheet.md new file mode 100644 index 0000000..0bb20f7 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/docs/interview-cheatsheet.md @@ -0,0 +1,22 @@ +# Interview Cheatsheet: Slurm AI/HPC Lab + +## One-minute summary + +I built an Ansible-managed Slurm AI/HPC lab with a controller, CPU compute nodes and a GPU node. The lab includes Munge authentication, cgroup-based CPU/GPU enforcement, GRES GPU scheduling, SlurmDBD accounting backed by MariaDB, QOS/fairshare/priority policies, rolling OS upgrades, node provisioning/decommissioning and health remediation workflows. 
+ +## Topics I can discuss + +- How Slurm schedules CPU and GPU workloads. +- Difference between GRES scheduling and cgroup device enforcement. +- Why Munge key consistency matters. +- How `slurmdbd`, `sacct`, `sacctmgr` and `sreport` fit together. +- How QOS, account associations, fairshare and multifactor priority work. +- Operational workflows: drain, decommission, provision, rolling upgrade, canary test and auto-remediation. + +## Real troubleshooting examples + +- `IDLE+NOT_RESPONDING` after node reprovisioning. +- Accounting delay where `sacct` temporarily showed `PENDING` while job output existed. +- Missing `gres/gpu` TRES before QOS GPU limits could be configured. +- `sacctmgr` idempotency issues such as `Nothing new added`. +- Slurm version differences around state transitions such as `RESUME`, `UNDRAIN` and `IDLE`. diff --git a/platform-projects/hpc-slurm-ai-cluster/docs/runbook.md b/platform-projects/hpc-slurm-ai-cluster/docs/runbook.md new file mode 100644 index 0000000..d6763af --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/docs/runbook.md @@ -0,0 +1,62 @@ +# Slurm AI/HPC Lab Runbook + +## Standard deployment order + +```bash +ansible-playbook playbooks/bootstrap/bootstrap-ansible.yml --ask-pass --ask-become-pass +ansible-playbook playbooks/bootstrap/slurm-hosts.yml +ansible-playbook playbooks/bootstrap/slurmuser-ssh-mesh.yml +ansible-playbook playbooks/bootstrap/slurmuser-sudoers-fix.yml + +ansible-playbook playbooks/core/manage-munge.yml +ansible-playbook playbooks/core/manage-slurm-config.yml --check --diff +ansible-playbook playbooks/core/manage-slurm-config.yml --diff +ansible-playbook playbooks/core/restart-slurm-safe.yml + +ansible-playbook playbooks/tests/validate-slurm-operator.yml +ansible-playbook playbooks/tests/test-cpu-job.yml +ansible-playbook playbooks/tests/test-gpu-job.yml +ansible-playbook playbooks/tests/test-gpu-deny-without-gres.yml + +ansible-playbook playbooks/accounting/setup-slurmdbd.yml +ansible-playbook 
playbooks/accounting/initialize-slurm-accounting.yml +ansible-playbook playbooks/accounting/backup-slurmdbd.yml +ansible-playbook playbooks/accounting/restore-check-slurmdbd.yml +ansible-playbook playbooks/accounting/validate-slurm-accounting.yml + +ansible-playbook playbooks/qos/configure-slurm-qos.yml +ansible-playbook playbooks/qos/validate-slurm-qos-priority.yml + +ansible-playbook playbooks/health/check-slurm-health.yml +``` + +## Node lifecycle + +Provision a node: + +```bash +ansible-playbook playbooks/lifecycle/provision-slurm-node.yml -e target_node=slurm-c02 +``` + +Decommission a node: + +```bash +ansible-playbook playbooks/lifecycle/decommission-slurm-node.yml -e target_node=slurm-c02 -e "decom_reason=planned maintenance" +``` + +Repair a node: + +```bash +ansible-playbook playbooks/health/repair-slurm-node.yml -e target_node=slurm-c02 +``` + +## Rolling OS upgrade + +```bash +ansible-playbook playbooks/upgrade/canary-slurm-node-upgrade.yml -e canary_node=slurm-c02 +ansible-playbook playbooks/upgrade/rolling-upgrade-slurm-workers.yml -e canary_node=slurm-c02 -e skip_canary=true +ansible-playbook playbooks/upgrade/upgrade-slurm-controller.yml +ansible-playbook playbooks/upgrade/validate-after-os-upgrade.yml +``` + +This repository ships `playbooks/upgrade/upgrade-slurm-controller.yml`. If it is not present in your checkout, create it from the documented controller upgrade workflow or keep controller upgrades manual. diff --git a/platform-projects/hpc-slurm-ai-cluster/docs/troubleshooting-cases.md b/platform-projects/hpc-slurm-ai-cluster/docs/troubleshooting-cases.md new file mode 100644 index 0000000..f4293c6 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/docs/troubleshooting-cases.md @@ -0,0 +1,28 @@ +# Troubleshooting Cases + +## `IDLE+NOT_RESPONDING` after node maintenance + +Symptoms: `sinfo` shows `idle*` or `scontrol show node` shows `IDLE+NOT_RESPONDING`. 
+ +Actions: + +```bash +systemctl restart munge +systemctl restart slurmd +systemctl restart slurmctld +scontrol update NodeName=<node> State=RESUME || true +scontrol update NodeName=<node> State=UNDRAIN || true +scontrol update NodeName=<node> State=IDLE || true +``` + +## Missing GPU TRES + +Symptoms: `sacctmgr` fails with `no TRES known by type gres/gpu`. + +Fix: add `AccountingStorageTRES=...,gres/gpu`, restart/reconfigure Slurm, run a GPU job and verify with `sacctmgr show tres`. + +## SlurmDBD objects already exist + +Symptoms: `sacctmgr` returns `Nothing new added` or `Already existing`. + +Fix: make Ansible tasks idempotent: attempt the change, tolerate known existing-object messages, then normalize state with `modify`. diff --git a/platform-projects/hpc-slurm-ai-cluster/inventories/lab/group_vars/slurm_cluster.yml b/platform-projects/hpc-slurm-ai-cluster/inventories/lab/group_vars/slurm_cluster.yml new file mode 100644 index 0000000..8e06d4e --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/inventories/lab/group_vars/slurm_cluster.yml @@ -0,0 +1,128 @@ +--- +# Example lab inventory variables. Replace addresses, users and node topology for your environment. 
+ +slurm_cluster_name: labcluster + +slurm_control_machine: slurm-ctl01 +slurm_control_addr: 10.10.10.11 + +slurm_config_dir: /etc/slurm +slurm_user: slurm +slurm_operator_user: slurmuser + +slurmctld_port: 6817 +slurmd_port: 6818 + +slurm_job_comp_type: jobcomp/none + +slurm_select_type: select/cons_tres +slurm_select_type_parameters: CR_Core_Memory + +slurm_return_to_service: 2 +slurm_default_mpi_type: none + +slurm_gres_types: gpu + +slurm_nodes: + - name: slurm-c01 + managed_state: present + addr: 10.10.10.12 + cpus: 2 + real_memory: 1800 + features: "" + gres: "" + topology: "" + - name: slurm-c02 + managed_state: present + addr: 10.10.10.13 + cpus: 2 + real_memory: 1800 + features: "" + gres: "" + topology: "" + - name: gpu01 + managed_state: present + addr: 10.10.10.14 + cpus: 12 + real_memory: 60000 + features: "gpu" + gres: "gpu:1" + gres_file: /dev/nvidia0 + topology: "Boards=1 SocketsPerBoard=1 CoresPerSocket=6 ThreadsPerCore=2" + +slurm_partitions: + - name: debug + managed_state: present + nodes: "slurm-c[01-02]" + default: "YES" + max_time: "INFINITE" + state: "UP" + - name: gpu + managed_state: present + nodes: "gpu01" + default: "NO" + max_time: "INFINITE" + state: "UP" + - name: all + managed_state: present + nodes: "slurm-c[01-02],gpu01" + default: "NO" + max_time: "INFINITE" + state: "UP" + +# Cgroup enforcement +slurm_enable_cgroup: true +slurm_task_plugin: task/cgroup,task/affinity +slurm_proctrack_type: proctrack/cgroup +slurm_job_acct_gather_type: jobacct_gather/cgroup + +# Slurm accounting / SlurmDBD +slurm_accounting_storage_type: accounting_storage/slurmdbd +slurm_accounting_storage_host: slurm-ctl01 +slurm_accounting_storage_port: 6819 +slurm_accounting_storage_enforce: associations,limits,qos +slurm_accounting_storage_tres: cpu,mem,energy,node,billing,fs/disk,pages,vmem,gres/gpu + +slurmdbd_host: slurm-ctl01 +slurmdbd_port: 6819 +slurmdbd_storage_type: accounting_storage/mysql +slurmdbd_storage_host: localhost +slurmdbd_storage_port: 
3306 +slurmdbd_storage_loc: slurm_acct_db +slurmdbd_storage_user: slurm +# Use Ansible Vault in real environments. See inventories/lab/group_vars/vault.example.yml +slurmdbd_storage_pass: "{{ vault_slurmdbd_storage_pass | default('CHANGE_ME_USE_ANSIBLE_VAULT') }}" + +slurm_account_name: lab +slurm_account_description: "AI/HPC Slurm lab account" +slurm_account_organization: "labcluster" + +# SlurmDBD purge / retention policy for lab +slurmdbd_commit_delay: 1 +slurmdbd_purge_event_after: 12months +slurmdbd_purge_job_after: 12months +slurmdbd_purge_resv_after: 12months +slurmdbd_purge_step_after: 3months +slurmdbd_purge_suspend_after: 3months +slurmdbd_purge_txn_after: 12months +slurmdbd_purge_usage_after: 24months + +# Archive is disabled for the lab; backup playbooks handle database dumps. +slurmdbd_archive_events: no +slurmdbd_archive_jobs: no +slurmdbd_archive_steps: no +slurmdbd_archive_suspend: no +slurmdbd_archive_txn: no +slurmdbd_archive_usage: no + +# Slurm priority / fairshare +slurm_priority_type: priority/multifactor +slurm_priority_decay_half_life: 7-0 +slurm_priority_calc_period: 5 +slurm_priority_favor_small: "NO" +slurm_priority_weight_age: 1000 +slurm_priority_weight_fairshare: 10000 +slurm_priority_weight_job_size: 1000 +slurm_priority_weight_partition: 1000 +slurm_priority_weight_qos: 10000 +slurm_priority_max_age: 1-0 diff --git a/platform-projects/hpc-slurm-ai-cluster/inventories/lab/group_vars/vault.example.yml b/platform-projects/hpc-slurm-ai-cluster/inventories/lab/group_vars/vault.example.yml new file mode 100644 index 0000000..78dbd8c --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/inventories/lab/group_vars/vault.example.yml @@ -0,0 +1,5 @@ +--- +# Copy this file to vault.yml and encrypt it with ansible-vault. 
+# ansible-vault encrypt inventories/lab/group_vars/vault.yml + +vault_slurmdbd_storage_pass: CHANGE_ME diff --git a/platform-projects/hpc-slurm-ai-cluster/inventories/lab/inventory.yml b/platform-projects/hpc-slurm-ai-cluster/inventories/lab/inventory.yml new file mode 100644 index 0000000..8281743 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/inventories/lab/inventory.yml @@ -0,0 +1,24 @@ +all: + vars: + ansible_ssh_common_args: '-o StrictHostKeyChecking=no' + children: + slurm_cluster: + children: + slurm_controller: + hosts: + slurm-ctl01: + ansible_host: 10.10.10.11 + ansible_user: ansible + slurm_compute: + hosts: + slurm-c01: + ansible_host: 10.10.10.12 + ansible_user: ansible + slurm-c02: + ansible_host: 10.10.10.13 + ansible_user: ansible + slurm_gpu: + hosts: + gpu01: + ansible_host: 10.10.10.14 + ansible_user: ansible diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/accounting/backup-slurmdbd.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/accounting/backup-slurmdbd.yml new file mode 100644 index 0000000..415179d --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/accounting/backup-slurmdbd.yml @@ -0,0 +1,90 @@ +--- +- name: Backup SlurmDBD MariaDB database + hosts: slurm_controller + become: true + gather_facts: true + + vars: + slurmdbd_backup_dir: /var/backups/slurmdbd + local_fetch_dir: "{{ playbook_dir }}/../../artifacts/backups/slurmdbd" + + tasks: + - name: Create remote backup directory + ansible.builtin.file: + path: "{{ slurmdbd_backup_dir }}" + state: directory + owner: root + group: root + mode: "0700" + + - name: Create local fetch directory on Ansible controller + ansible.builtin.file: + path: "{{ local_fetch_dir }}" + state: directory + # Runs unprivileged on the control node (become: false), so root ownership + # is not forced here; the invoking user must own the directory for fetch to write. 
+ mode: "0700" + delegate_to: localhost + become: false + + - name: Validate MariaDB is running + ansible.builtin.command: + cmd: systemctl is-active mariadb + changed_when: false + + - name: Validate SlurmDBD is running + 
ansible.builtin.command: + cmd: systemctl is-active slurmdbd + changed_when: false + + - name: Validate Slurm accounting database exists + ansible.builtin.shell: | + set -euo pipefail + mysql -N -B -e "SHOW DATABASES LIKE '{{ slurmdbd_storage_loc }}';" | grep -qx "{{ slurmdbd_storage_loc }}" + args: + executable: /bin/bash + changed_when: false + + - name: Dump Slurm accounting database + ansible.builtin.shell: | + set -euo pipefail + + ts="$(date +%F-%H%M%S)" + out="{{ slurmdbd_backup_dir }}/{{ slurmdbd_storage_loc }}-${ts}.sql.gz" + + mysqldump \ + --single-transaction \ + --routines \ + --events \ + --triggers \ + {{ slurmdbd_storage_loc }} | gzip -9 > "$out" + + chmod 0600 "$out" + echo "$out" + args: + executable: /bin/bash + register: db_dump + changed_when: true + + - name: Validate backup file is non-empty + ansible.builtin.stat: + path: "{{ db_dump.stdout }}" + register: backup_file + + - name: Fail if backup file is empty + ansible.builtin.fail: + msg: "Backup file is empty: {{ db_dump.stdout }}" + when: backup_file.stat.size | int < 1024 + + - name: Fetch DB backup to Ansible controller + ansible.builtin.fetch: + src: "{{ db_dump.stdout }}" + dest: "{{ local_fetch_dir }}/" + flat: true + + - name: Show DB backup result + ansible.builtin.debug: + msg: + - "Remote backup: {{ db_dump.stdout }}" + - "Backup size bytes: {{ backup_file.stat.size }}" + - "Fetched to: {{ local_fetch_dir }}/" diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/accounting/initialize-slurm-accounting.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/accounting/initialize-slurm-accounting.yml new file mode 100644 index 0000000..327e872 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/accounting/initialize-slurm-accounting.yml @@ -0,0 +1,126 @@ +--- +- name: Initialize Slurm accounting entities + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Wait for sacctmgr connectivity + ansible.builtin.command: + cmd: sacctmgr 
-n list cluster + register: sacctmgr_cluster_list + retries: 20 + delay: 2 + until: sacctmgr_cluster_list.rc == 0 + changed_when: false + + - name: Show current accounting state before changes + ansible.builtin.shell: | + set -euo pipefail + + echo "### clusters" + sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC + + echo + echo "### accounts" + sacctmgr list account format=Account,Descr,Org + + echo + echo "### users" + sacctmgr list user format=User,DefaultAccount,Admin + + echo + echo "### associations" + sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS + args: + executable: /bin/bash + register: accounting_state_before + changed_when: false + + - name: Print current accounting state before changes + ansible.builtin.debug: + var: accounting_state_before.stdout_lines + + - name: Ensure Slurm cluster exists in accounting DB + ansible.builtin.shell: | + set -euo pipefail + + if sacctmgr -n list cluster format=Cluster | awk '{print $1}' | grep -qx "{{ slurm_cluster_name }}"; then + echo "Cluster {{ slurm_cluster_name }} already exists" + else + sacctmgr -i add cluster {{ slurm_cluster_name }} + fi + args: + executable: /bin/bash + register: ensure_cluster + changed_when: "'Adding Cluster' in ensure_cluster.stdout" + + - name: Ensure default lab account exists for cluster + ansible.builtin.shell: | + set -euo pipefail + + if sacctmgr -n list assoc format=Cluster,Account,User | awk '$1=="{{ slurm_cluster_name }}" && $2=="{{ slurm_account_name }}" && $3=="" {found=1} END {exit !found}'; then + echo "Account {{ slurm_account_name }} already associated with cluster {{ slurm_cluster_name }}" + else + sacctmgr -i add account {{ slurm_account_name }} \ + Cluster={{ slurm_cluster_name }} \ + Description="{{ slurm_account_description }}" \ + Organization="{{ slurm_account_organization }}" + fi + args: + executable: /bin/bash + register: ensure_account + changed_when: "'Adding Account' in ensure_account.stdout" + + - name: Ensure 
slurmuser exists with lab account association + ansible.builtin.shell: | + set -euo pipefail + + if sacctmgr -n list assoc format=Cluster,Account,User | awk '$1=="{{ slurm_cluster_name }}" && $2=="{{ slurm_account_name }}" && $3=="{{ slurm_operator_user | default('slurmuser') }}" {found=1} END {exit !found}'; then + echo "User {{ slurm_operator_user | default('slurmuser') }} already associated with account {{ slurm_account_name }} on cluster {{ slurm_cluster_name }}" + else + sacctmgr -i add user {{ slurm_operator_user | default('slurmuser') }} \ + Cluster={{ slurm_cluster_name }} \ + Account={{ slurm_account_name }} \ + DefaultAccount={{ slurm_account_name }} + fi + args: + executable: /bin/bash + register: ensure_user_assoc + changed_when: "'Adding User' in ensure_user_assoc.stdout" + + - name: Ensure slurmuser has default account set + ansible.builtin.shell: | + set -euo pipefail + sacctmgr -i modify user where name={{ slurm_operator_user | default('slurmuser') }} set DefaultAccount={{ slurm_account_name }} + args: + executable: /bin/bash + register: set_default_account + changed_when: "'Modified user' in (set_default_account.stdout + set_default_account.stderr)" + + - name: Show final accounting state + ansible.builtin.shell: | + set -euo pipefail + + echo "### clusters" + sacctmgr list cluster format=Cluster,ControlHost,ControlPort,RPC + + echo + echo "### accounts" + sacctmgr list account format=Account,Descr,Org + + echo + echo "### users" + sacctmgr list user format=User,DefaultAccount,Admin + + echo + echo "### associations" + sacctmgr list assoc format=Cluster,Account,User,Partition,Share,QOS,DefaultQOS + args: + executable: /bin/bash + register: accounting_state_after + changed_when: false + + - name: Print final accounting state + ansible.builtin.debug: + var: accounting_state_after.stdout_lines diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/accounting/restore-check-slurmdbd.yml 
b/platform-projects/hpc-slurm-ai-cluster/playbooks/accounting/restore-check-slurmdbd.yml new file mode 100644 index 0000000..06a5762 --- /dev/null +++ 
b/platform-projects/hpc-slurm-ai-cluster/playbooks/accounting/restore-check-slurmdbd.yml @@ -0,0 +1,98 @@ +--- +- name: Restore-check latest SlurmDBD backup into test database + hosts: slurm_controller + become: true + gather_facts: false + + vars: + restore_check_db: "{{ slurmdbd_storage_loc }}_restorecheck" + slurmdbd_backup_dir: /var/backups/slurmdbd + + tasks: + - name: Validate MariaDB is running + ansible.builtin.command: + cmd: systemctl is-active mariadb + changed_when: false + + - name: Find latest SlurmDBD backup + ansible.builtin.shell: | + set -euo pipefail + ls -1t {{ slurmdbd_backup_dir }}/{{ slurmdbd_storage_loc }}-*.sql.gz | head -n 1 + args: + executable: /bin/bash + register: latest_backup + changed_when: false + + - name: Validate latest backup exists + ansible.builtin.stat: + path: "{{ latest_backup.stdout }}" + register: latest_backup_stat + + - name: Fail if latest backup is missing or empty + ansible.builtin.fail: + msg: "Latest SlurmDBD backup is missing or empty: {{ latest_backup.stdout }}" + when: + - not latest_backup_stat.stat.exists or latest_backup_stat.stat.size | int < 1024 + + - name: Recreate restore-check database + ansible.builtin.shell: | + set -euo pipefail + mysql < "$backup_dir/systemctl-munge.txt" 2>&1 || true + systemctl status slurmctld --no-pager > "$backup_dir/systemctl-slurmctld.txt" 2>&1 || true + systemctl status slurmd --no-pager > "$backup_dir/systemctl-slurmd.txt" 2>&1 || true + + journalctl -u munge -n 200 --no-pager > "$backup_dir/journal-munge.txt" 2>&1 || true + journalctl -u slurmctld -n 200 --no-pager > "$backup_dir/journal-slurmctld.txt" 2>&1 || true + journalctl -u slurmd -n 200 --no-pager > "$backup_dir/journal-slurmd.txt" 2>&1 || true + + if command -v sinfo >/dev/null 2>&1; then + sinfo > "$backup_dir/sinfo.txt" 2>&1 || true + fi + + if command -v scontrol >/dev/null 2>&1; then + scontrol show config > "$backup_dir/scontrol-show-config.txt" 2>&1 || true + scontrol show nodes > 
"$backup_dir/scontrol-show-nodes.txt" 2>&1 || true + scontrol show partitions > "$backup_dir/scontrol-show-partitions.txt" 2>&1 || true + fi + + find "$backup_dir" -maxdepth 2 -type f -o -type d + args: + executable: /bin/bash + register: backup_content + changed_when: true + + - name: Show backup location on node + ansible.builtin.debug: + msg: + - "Host: {{ inventory_hostname }}" + - "Backup directory: {{ node_backup_dir }}" diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/backup/fetch-slurm-backups.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/backup/fetch-slurm-backups.yml new file mode 100644 index 0000000..56a543e --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/backup/fetch-slurm-backups.yml @@ -0,0 +1,46 @@ +--- +- name: Fetch latest Slurm backups from nodes to pvef + hosts: slurm_cluster + become: true + gather_facts: false + + vars: + remote_backup_base: /var/backups/slurm + local_backup_base: "{{ playbook_dir }}/../../artifacts/backups" + + tasks: + - name: Find latest remote backup directory + ansible.builtin.shell: | + set -euo pipefail + ls -1dt {{ remote_backup_base }}/* | head -n 1 + args: + executable: /bin/bash + register: latest_backup_dir + changed_when: false + + - name: Create local backup directory on pvef + ansible.builtin.file: + path: "{{ local_backup_base }}/{{ inventory_hostname }}" + state: directory + mode: "0700" + delegate_to: localhost + become: false + + - name: Archive latest backup directory on remote node + ansible.builtin.archive: + path: "{{ latest_backup_dir.stdout }}" + dest: "/tmp/{{ inventory_hostname }}-slurm-backup.tgz" + format: gz + force_archive: true + changed_when: true + + - name: Fetch archive to pvef + ansible.builtin.fetch: + src: "/tmp/{{ inventory_hostname }}-slurm-backup.tgz" + dest: "{{ local_backup_base }}/{{ inventory_hostname }}/" + flat: true + + - name: Remove temporary remote archive + ansible.builtin.file: + path: "/tmp/{{ inventory_hostname 
}}-slurm-backup.tgz" + state: absent diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/bootstrap/bootstrap-ansible.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/bootstrap/bootstrap-ansible.yml new file mode 100644 index 0000000..5c67691 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/bootstrap/bootstrap-ansible.yml @@ -0,0 +1,58 @@ +--- +- name: Bootstrap Ansible SSH access from pvef to Slurm nodes + hosts: slurm_cluster + gather_facts: false + become: true + + vars: + ansible_controller_pubkey: "{{ lookup('file', lookup('env', 'HOME') + '/.ssh/id_ed25519.pub') }}" + + pre_tasks: + - name: Wait for SSH + ansible.builtin.wait_for_connection: + timeout: 30 + + - name: Install Python if missing - Debian/Ubuntu + ansible.builtin.raw: | + test -e /usr/bin/python3 || (apt-get update && apt-get install -y python3) + changed_when: false + + tasks: + - name: Ensure sudo is installed + ansible.builtin.apt: + name: + - sudo + - openssh-server + state: present + update_cache: true + + - name: Ensure SSH server is enabled and running + ansible.builtin.service: + name: ssh + state: started + enabled: true + + - name: Ensure .ssh directory exists for login user + ansible.builtin.file: + path: "/home/{{ ansible_user }}/.ssh" + state: directory + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" + mode: "0700" + + - name: Add pvef root public key to login user's authorized_keys + ansible.builtin.authorized_key: + user: "{{ ansible_user }}" + key: "{{ ansible_controller_pubkey }}" + state: present + manage_dir: true + + - name: Allow bootstrap login user passwordless sudo + ansible.builtin.copy: + dest: "/etc/sudoers.d/90-ansible-{{ ansible_user }}" + owner: root + group: root + mode: "0440" + content: | + {{ ansible_user }} ALL=(ALL) NOPASSWD:ALL + validate: "visudo -cf %s" diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/bootstrap/slurm-hosts.yml 
b/platform-projects/hpc-slurm-ai-cluster/playbooks/bootstrap/slurm-hosts.yml new file mode 100644 index 0000000..3371c66 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/bootstrap/slurm-hosts.yml @@ -0,0 +1,16 @@ +--- +- name: Configure /etc/hosts for Slurm cluster + hosts: slurm_cluster + become: true + gather_facts: false + + tasks: + - name: Add Slurm cluster hosts to /etc/hosts + ansible.builtin.blockinfile: + path: /etc/hosts + marker: "# {mark} ANSIBLE MANAGED SLURM CLUSTER HOSTS" + block: | + {{ slurm_control_addr }} {{ slurm_control_machine }} + {% for node in slurm_nodes if node.managed_state | default('present') == 'present' %} + {{ node.addr }} {{ node.name }} + {% endfor %} diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/bootstrap/slurmuser-ssh-mesh.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/bootstrap/slurmuser-ssh-mesh.yml new file mode 100644 index 0000000..c429d7e --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/bootstrap/slurmuser-ssh-mesh.yml @@ -0,0 +1,218 @@ +--- +- name: Create slurmuser and generate SSH keys on every Slurm node + hosts: slurm_cluster + become: true + gather_facts: true + + vars: + slurm_operator_user: slurmuser + slurm_operator_shell: /bin/bash + + tasks: + - name: Ensure useful packages are installed + ansible.builtin.apt: + name: + - sudo + - openssh-client + - openssh-server + - acl + state: present + update_cache: true + + - name: Ensure slurmuser exists + ansible.builtin.user: + name: "{{ slurm_operator_user }}" + shell: "{{ slurm_operator_shell }}" + create_home: true + state: present + + - name: Ensure .ssh directory exists for slurmuser + ansible.builtin.file: + path: "/home/{{ slurm_operator_user }}/.ssh" + state: directory + owner: "{{ slurm_operator_user }}" + group: "{{ slurm_operator_user }}" + mode: "0700" + + - name: Generate SSH key for slurmuser if missing + ansible.builtin.openssh_keypair: + path: "/home/{{ slurm_operator_user 
}}/.ssh/id_ed25519" + type: ed25519 + owner: "{{ slurm_operator_user }}" + group: "{{ slurm_operator_user }}" + mode: "0600" + comment: "{{ slurm_operator_user }}@{{ inventory_hostname }}" + force: false + + - name: Read public key from each node + ansible.builtin.slurp: + src: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519.pub" + register: slurmuser_pubkey_raw + + - name: Store decoded public key as host fact + ansible.builtin.set_fact: + slurmuser_pubkey: "{{ slurmuser_pubkey_raw.content | b64decode | trim }}" + + +- name: Exchange slurmuser SSH keys across all Slurm nodes + hosts: slurm_cluster + become: true + gather_facts: false + + vars: + slurm_operator_user: slurmuser + + tasks: + - name: Install all slurmuser public keys into authorized_keys on every node + ansible.builtin.authorized_key: + user: "{{ slurm_operator_user }}" + key: "{{ hostvars[item].slurmuser_pubkey }}" + state: present + manage_dir: true + loop: "{{ groups['slurm_cluster'] }}" + + - name: Build SSH known_hosts entries for all cluster nodes + ansible.builtin.shell: | + set -e + mkdir -p /home/{{ slurm_operator_user }}/.ssh + touch /home/{{ slurm_operator_user }}/.ssh/known_hosts + + {% for host in groups['slurm_cluster'] %} + ssh-keyscan {{ host }} {{ hostvars[host].ansible_host | default(host) }} 2>/dev/null >> /home/{{ slurm_operator_user }}/.ssh/known_hosts || true + {% endfor %} + + sort -u /home/{{ slurm_operator_user }}/.ssh/known_hosts -o /home/{{ slurm_operator_user }}/.ssh/known_hosts # entries are scanned unhashed on purpose: ssh-keyscan -H salts each entry randomly, so reruns could never be deduplicated here and the file would grow forever + chown {{ slurm_operator_user }}:{{ slurm_operator_user }} /home/{{ slurm_operator_user }}/.ssh/known_hosts + chmod 0644 /home/{{ slurm_operator_user }}/.ssh/known_hosts + args: + executable: /bin/bash + changed_when: true + + - name: Ensure SSH permissions are correct + ansible.builtin.file: + path: "/home/{{ slurm_operator_user }}/.ssh" + state: directory + owner: "{{ slurm_operator_user }}" + group: "{{ slurm_operator_user }}" + mode: "0700" + + - name: Ensure private key permissions are correct +
ansible.builtin.file: + path: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519" + owner: "{{ slurm_operator_user }}" + group: "{{ slurm_operator_user }}" + mode: "0600" + + - name: Ensure public key permissions are correct + ansible.builtin.file: + path: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519.pub" + owner: "{{ slurm_operator_user }}" + group: "{{ slurm_operator_user }}" + mode: "0644" + + +- name: Configure sudo permissions for slurmuser + hosts: slurm_cluster + become: true + gather_facts: false + + vars: + slurm_operator_user: slurmuser + + tasks: + - name: Configure sudoers for slurmuser on Slurm controller + ansible.builtin.copy: + dest: /etc/sudoers.d/91-slurmuser-slurm-controller + owner: root + group: root + mode: "0440" + content: | + # Managed by Ansible + # Operator access for Slurm controller node. + {{ slurm_operator_user }} ALL=(root) NOPASSWD: \ + /bin/systemctl status slurmctld, \ + /bin/systemctl restart slurmctld, \ + /bin/systemctl reload slurmctld, \ + /bin/systemctl stop slurmctld, \ + /bin/systemctl start slurmctld, \ + /bin/systemctl status slurmd, \ + /bin/systemctl restart slurmd, \ + /bin/systemctl reload slurmd, \ + /bin/systemctl stop slurmd, \ + /bin/systemctl start slurmd, \ + /bin/journalctl -u slurmctld, \ + /bin/journalctl -u slurmd, \ + /usr/bin/scontrol, \ + /usr/bin/sinfo, \ + /usr/bin/squeue, \ + /usr/bin/scancel, \ + /usr/bin/sacct, \ + /usr/bin/sacctmgr, \ + /usr/bin/sbatch, \ + /usr/bin/srun, \ + /usr/bin/salloc + validate: "visudo -cf %s" + when: inventory_hostname in groups['slurm_controller'] + + - name: Configure sudoers for slurmuser on Slurm compute and GPU nodes + ansible.builtin.copy: + dest: /etc/sudoers.d/91-slurmuser-slurm-compute + owner: root + group: root + mode: "0440" + content: | + # Managed by Ansible + # Operator access for Slurm worker/GPU nodes. 
+ {{ slurm_operator_user }} ALL=(root) NOPASSWD: \ + /bin/systemctl status slurmd, \ + /bin/systemctl restart slurmd, \ + /bin/systemctl reload slurmd, \ + /bin/systemctl stop slurmd, \ + /bin/systemctl start slurmd, \ + /bin/journalctl -u slurmd, \ + /usr/bin/scontrol, \ + /usr/bin/sinfo, \ + /usr/bin/squeue, \ + /usr/bin/scancel, \ + /usr/bin/sacct, \ + /usr/bin/sbatch, \ + /usr/bin/srun, \ + /usr/bin/salloc + validate: "visudo -cf %s" + when: inventory_hostname not in groups['slurm_controller'] + + +- name: Validate slurmuser SSH mesh and Slurm access + hosts: slurm_cluster + become: true + gather_facts: false + + vars: + slurm_operator_user: slurmuser + + tasks: + - name: Test local Slurm commands as slurmuser + ansible.builtin.command: "sudo -iu {{ slurm_operator_user }} sinfo" + register: sinfo_test + changed_when: false + failed_when: sinfo_test.rc != 0 + + - name: Show sinfo result + ansible.builtin.debug: + var: sinfo_test.stdout_lines + + - name: Test SSH from each node to every other node as slurmuser + ansible.builtin.shell: | + set -e + {% for host in groups['slurm_cluster'] %} + ssh -o BatchMode=yes -o ConnectTimeout=5 {{ host }} 'hostname' + {% endfor %} + args: + executable: /bin/bash + become_user: "{{ slurm_operator_user }}" + register: ssh_mesh_test + changed_when: false + + - name: Show SSH mesh test result + ansible.builtin.debug: + var: ssh_mesh_test.stdout_lines diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/bootstrap/slurmuser-sudoers-fix.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/bootstrap/slurmuser-sudoers-fix.yml new file mode 100644 index 0000000..86334a4 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/bootstrap/slurmuser-sudoers-fix.yml @@ -0,0 +1,112 @@ +--- +- name: Fix sudo permissions for slurmuser Slurm operations + hosts: slurm_cluster + become: true + gather_facts: false + + vars: + slurm_operator_user: slurmuser + + tasks: + - name: Configure sudoers for slurmuser on controller + 
ansible.builtin.copy: + dest: /etc/sudoers.d/91-slurmuser-slurm-controller + owner: root + group: root + mode: "0440" + content: | + # Managed by Ansible + + Cmnd_Alias SLURM_SYSTEMCTL_CONTROLLER = \ + /bin/systemctl status slurmctld, \ + /bin/systemctl status slurmctld *, \ + /bin/systemctl restart slurmctld, \ + /bin/systemctl reload slurmctld, \ + /bin/systemctl start slurmctld, \ + /bin/systemctl stop slurmctld, \ + /bin/systemctl status slurmd, \ + /bin/systemctl status slurmd *, \ + /bin/systemctl restart slurmd, \ + /bin/systemctl reload slurmd, \ + /bin/systemctl start slurmd, \ + /bin/systemctl stop slurmd, \ + /usr/bin/systemctl status slurmctld, \ + /usr/bin/systemctl status slurmctld *, \ + /usr/bin/systemctl restart slurmctld, \ + /usr/bin/systemctl reload slurmctld, \ + /usr/bin/systemctl start slurmctld, \ + /usr/bin/systemctl stop slurmctld, \ + /usr/bin/systemctl status slurmd, \ + /usr/bin/systemctl status slurmd *, \ + /usr/bin/systemctl restart slurmd, \ + /usr/bin/systemctl reload slurmd, \ + /usr/bin/systemctl start slurmd, \ + /usr/bin/systemctl stop slurmd + + Cmnd_Alias SLURM_JOURNAL_CONTROLLER = \ + /bin/journalctl -u slurmctld, \ + /bin/journalctl -u slurmctld *, \ + /bin/journalctl -u slurmd, \ + /bin/journalctl -u slurmd *, \ + /usr/bin/journalctl -u slurmctld, \ + /usr/bin/journalctl -u slurmctld *, \ + /usr/bin/journalctl -u slurmd, \ + /usr/bin/journalctl -u slurmd * + + Cmnd_Alias SLURM_COMMANDS = \ + /usr/bin/scontrol, /usr/bin/scontrol *, \ + /usr/bin/sinfo, /usr/bin/sinfo *, \ + /usr/bin/squeue, /usr/bin/squeue *, \ + /usr/bin/scancel, /usr/bin/scancel *, \ + /usr/bin/sacct, /usr/bin/sacct *, \ + /usr/bin/sacctmgr, /usr/bin/sacctmgr *, \ + /usr/bin/sbatch, /usr/bin/sbatch *, \ + /usr/bin/srun, /usr/bin/srun *, \ + /usr/bin/salloc, /usr/bin/salloc * + + {{ slurm_operator_user }} ALL=(root) NOPASSWD: SLURM_SYSTEMCTL_CONTROLLER, SLURM_JOURNAL_CONTROLLER, SLURM_COMMANDS + validate: "visudo -cf %s" + when: inventory_hostname in 
groups['slurm_controller'] + + - name: Configure sudoers for slurmuser on compute and GPU nodes + ansible.builtin.copy: + dest: /etc/sudoers.d/91-slurmuser-slurm-compute + owner: root + group: root + mode: "0440" + content: | + # Managed by Ansible + + Cmnd_Alias SLURM_SYSTEMCTL_COMPUTE = \ + /bin/systemctl status slurmd, \ + /bin/systemctl status slurmd *, \ + /bin/systemctl restart slurmd, \ + /bin/systemctl reload slurmd, \ + /bin/systemctl start slurmd, \ + /bin/systemctl stop slurmd, \ + /usr/bin/systemctl status slurmd, \ + /usr/bin/systemctl status slurmd *, \ + /usr/bin/systemctl restart slurmd, \ + /usr/bin/systemctl reload slurmd, \ + /usr/bin/systemctl start slurmd, \ + /usr/bin/systemctl stop slurmd + + Cmnd_Alias SLURM_JOURNAL_COMPUTE = \ + /bin/journalctl -u slurmd, \ + /bin/journalctl -u slurmd *, \ + /usr/bin/journalctl -u slurmd, \ + /usr/bin/journalctl -u slurmd * + + Cmnd_Alias SLURM_COMMANDS = \ + /usr/bin/scontrol, /usr/bin/scontrol *, \ + /usr/bin/sinfo, /usr/bin/sinfo *, \ + /usr/bin/squeue, /usr/bin/squeue *, \ + /usr/bin/scancel, /usr/bin/scancel *, \ + /usr/bin/sacct, /usr/bin/sacct *, \ + /usr/bin/sbatch, /usr/bin/sbatch *, \ + /usr/bin/srun, /usr/bin/srun *, \ + /usr/bin/salloc, /usr/bin/salloc * + + {{ slurm_operator_user }} ALL=(root) NOPASSWD: SLURM_SYSTEMCTL_COMPUTE, SLURM_JOURNAL_COMPUTE, SLURM_COMMANDS + validate: "visudo -cf %s" + when: inventory_hostname not in groups['slurm_controller'] diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/core/manage-munge.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/core/manage-munge.yml new file mode 100644 index 0000000..eefc094 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/core/manage-munge.yml @@ -0,0 +1,133 @@ +--- +- name: Read Munge key from Slurm controller + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Check controller munge.key exists + ansible.builtin.stat: + path: /etc/munge/munge.key + register: 
controller_munge_key + + - name: Fail if controller munge.key is missing + ansible.builtin.fail: + msg: "/etc/munge/munge.key is missing on controller. Do not continue." + when: not controller_munge_key.stat.exists + + - name: Read controller munge.key + ansible.builtin.slurp: + src: /etc/munge/munge.key + register: controller_munge_key_raw + + - name: Store controller Munge key as fact + ansible.builtin.set_fact: + cluster_munge_key_b64: "{{ controller_munge_key_raw.content }}" + + +- name: Deploy controller Munge key to all Slurm nodes + hosts: slurm_cluster + become: true + gather_facts: false + + vars: + controller_host: "{{ groups['slurm_controller'][0] }}" + + tasks: + - name: Ensure munge package is installed + ansible.builtin.apt: + name: + - munge + - libmunge2 + state: present + update_cache: true + + - name: Ensure munge group exists + ansible.builtin.group: + name: munge + system: true + state: present + + - name: Ensure munge user exists + ansible.builtin.user: + name: munge + group: munge + system: true + shell: /usr/sbin/nologin + home: /nonexistent + create_home: false + state: present + + - name: Ensure /etc/munge exists + ansible.builtin.file: + path: /etc/munge + state: directory + owner: munge + group: munge + mode: "0700" + + - name: Deploy shared munge.key from controller + ansible.builtin.shell: | + # The key is raw random bytes. Decode the slurped base64 on the target with base64(1) + # rather than the Jinja b64decode filter, which returns text and can garble binary data. + set -euo pipefail + umask 077 + new_key="$(mktemp /etc/munge/.munge.key.XXXXXX)" + trap 'rm -f "$new_key"' EXIT + base64 -d > "$new_key" + test -s "$new_key" # refuse to install an empty key + chown munge:munge "$new_key" + chmod 0400 "$new_key" + # Replace the key atomically, and only when it differs, so munge restarts on real changes only. + if ! cmp -s "$new_key" /etc/munge/munge.key; then + mv -f "$new_key" /etc/munge/munge.key + echo MUNGE_KEY_UPDATED + fi + # Enforce ownership and mode even when the key content was already correct. + chown munge:munge /etc/munge/munge.key + chmod 0400 /etc/munge/munge.key + args: + executable: /bin/bash + stdin: "{{ hostvars[controller_host].cluster_munge_key_b64 }}" + register: munge_key_deploy + changed_when: "'MUNGE_KEY_UPDATED' in munge_key_deploy.stdout" + no_log: true # the task input is the cluster authentication secret + notify: + - Restart munge + + - name: Ensure /var/log/munge exists + ansible.builtin.file: + path: /var/log/munge + state: directory + owner: munge + group: munge + mode: "0755" + + - name: Ensure /var/lib/munge exists + ansible.builtin.file: + path: /var/lib/munge + state: directory + owner: munge + group: munge + mode: "0711" + + - name: Ensure /run/munge exists + ansible.builtin.file: + path: /run/munge + state: directory + owner: munge + group: munge + mode: "0755" + + - name: Ensure munge is
enabled and running + ansible.builtin.systemd: + name: munge + enabled: true + state: started + + handlers: + - name: Restart munge + ansible.builtin.systemd: + name: munge + state: restarted + + +- name: Validate Munge locally on all nodes + hosts: slurm_cluster + become: true + gather_facts: false + + tasks: + - name: Test local munge encode/decode + ansible.builtin.shell: | + set -euo pipefail + munge -n | unmunge + args: + executable: /bin/bash + register: munge_local_test + changed_when: false + + - name: Show local Munge validation + ansible.builtin.debug: + var: munge_local_test.stdout_lines diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/core/manage-slurm-config.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/core/manage-slurm-config.yml new file mode 100644 index 0000000..af7dac4 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/core/manage-slurm-config.yml @@ -0,0 +1,132 @@ +--- +- name: Prepare Slurm config directories and logs + hosts: slurm_cluster + become: true + gather_facts: false + + tasks: + - name: Ensure Slurm config directory exists + ansible.builtin.file: + path: "{{ slurm_config_dir }}" + state: directory + owner: root + group: root + mode: "0755" + + - name: Ensure Slurm log directory exists + ansible.builtin.file: + path: /var/log/slurm + state: directory + owner: slurm + group: slurm + mode: "0755" + + - name: Ensure slurmctld spool directory exists on controller + ansible.builtin.file: + path: /var/spool/slurmctld + state: directory + owner: slurm + group: slurm + mode: "0755" + when: inventory_hostname in groups['slurm_controller'] + + - name: Ensure slurmd spool directory exists on workers + ansible.builtin.file: + path: /var/spool/slurmd + state: directory + owner: slurm + group: slurm + mode: "0755" + when: inventory_hostname in groups['slurm_compute'] or inventory_hostname in groups['slurm_gpu'] + + +- name: Deploy Slurm config files + hosts: slurm_cluster + become: true + gather_facts: false 
+ + tasks: + - name: Backup current slurm.conf before managed deployment + ansible.builtin.copy: + src: "{{ slurm_config_dir }}/slurm.conf" + dest: "{{ slurm_config_dir }}/slurm.conf.pre-ansible-managed" + remote_src: true + owner: root + group: root + mode: "0644" + force: false + + - name: Deploy managed slurm.conf + ansible.builtin.template: + src: ../../templates/slurm.conf.j2 + dest: "{{ slurm_config_dir }}/slurm.conf" + owner: root + group: root + mode: "0644" + notify: + - Reconfigure slurmctld + - Restart slurmd + + - name: Deploy managed cgroup.conf + ansible.builtin.template: + src: ../../templates/cgroup.conf.j2 + dest: "{{ slurm_config_dir }}/cgroup.conf" + owner: root + group: root + mode: "0644" + when: slurm_enable_cgroup | default(false) | bool + notify: + - Reconfigure slurmctld + - Restart slurmd + + - name: Deploy managed gres.conf only on GPU nodes + ansible.builtin.template: + src: ../../templates/gres.conf.j2 + dest: "{{ slurm_config_dir }}/gres.conf" + owner: root + group: root + mode: "0644" + when: inventory_hostname in groups['slurm_gpu'] + notify: + - Reconfigure slurmctld + - Restart slurmd + + handlers: + - name: Reconfigure slurmctld + ansible.builtin.command: + cmd: scontrol reconfigure + when: inventory_hostname in groups['slurm_controller'] + changed_when: true + + - name: Restart slurmd + ansible.builtin.systemd: + name: slurmd + state: restarted + when: inventory_hostname in groups['slurm_compute'] or inventory_hostname in groups['slurm_gpu'] + + +- name: Validate Slurm after config deployment + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Reconfigure controller + ansible.builtin.command: + cmd: scontrol reconfigure + changed_when: true + + - name: Validate cluster state + ansible.builtin.shell: | + set -euo pipefail + scontrol ping + sinfo + scontrol show nodes + args: + executable: /bin/bash + register: slurm_config_validation + changed_when: false + + - name: Show validation output + 
ansible.builtin.debug: + var: slurm_config_validation.stdout_lines diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/core/restart-slurm-safe.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/core/restart-slurm-safe.yml new file mode 100644 index 0000000..2044cc5 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/core/restart-slurm-safe.yml @@ -0,0 +1,103 @@ +--- +- name: Restart Slurm controller safely + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Restart munge on controller + ansible.builtin.systemd: + name: munge + state: restarted + enabled: true + + - name: Restart slurmctld on controller + ansible.builtin.systemd: + name: slurmctld + state: restarted + enabled: true + + - name: Wait for slurmctld to answer + ansible.builtin.command: + cmd: scontrol ping + register: scontrol_ping + retries: 15 + delay: 2 + until: scontrol_ping.rc == 0 + changed_when: false + + - name: Show controller ping + ansible.builtin.debug: + var: scontrol_ping.stdout_lines + + +- name: Restart Slurm workers safely one by one + hosts: slurm_compute:slurm_gpu + become: true + gather_facts: false + serial: 1 + + tasks: + - name: Restart munge on worker + ansible.builtin.systemd: + name: munge + state: restarted + enabled: true + + - name: Restart slurmd on worker + ansible.builtin.systemd: + name: slurmd + state: restarted + enabled: true + + - name: Wait for slurmd to be active + ansible.builtin.command: + cmd: systemctl is-active slurmd + register: slurmd_active + retries: 15 + delay: 2 + until: slurmd_active.stdout == "active" + changed_when: false + + - name: Wait until this node is visible in Slurm + ansible.builtin.command: + cmd: scontrol show node {{ inventory_hostname }} + delegate_to: "{{ groups['slurm_controller'][0] }}" + register: node_visible + retries: 15 + delay: 2 + until: node_visible.rc == 0 + changed_when: false + + +- name: Validate Slurm after restart + hosts: slurm_controller + become: true + 
gather_facts: false + + tasks: + - name: Validate Slurm cluster state + ansible.builtin.shell: | + set -euo pipefail + echo "### scontrol ping" + scontrol ping + + echo + echo "### sinfo" + sinfo + + echo + echo "### nodes" + scontrol show nodes + + echo + echo "### partitions" + scontrol show partitions + args: + executable: /bin/bash + register: slurm_validation + changed_when: false + + - name: Show Slurm validation + ansible.builtin.debug: + var: slurm_validation.stdout_lines diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/discovery/discover-slurm-resources.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/discovery/discover-slurm-resources.yml new file mode 100644 index 0000000..9aa75be --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/discovery/discover-slurm-resources.yml @@ -0,0 +1,40 @@ +--- +- name: Discover node resources for Slurm config + hosts: slurm_cluster + become: true + gather_facts: true + + tasks: + - name: Discover CPU and memory + ansible.builtin.shell: | + set -euo pipefail + echo "HOST={{ inventory_hostname }}" + echo "CPUS=$(nproc)" + echo "REAL_MEMORY_MB=$(awk '/MemTotal/ {print int($2/1024)}' /proc/meminfo)" + echo "SOCKETS=$(LC_ALL=C lscpu | awk -F: '/Socket\(s\)/ {gsub(/ /,"",$2); print $2}')" # literal block scalar: awk escapes are written once, and LC_ALL=C keeps the lscpu labels in English + echo "CORES_PER_SOCKET=$(LC_ALL=C lscpu | awk -F: '/Core\(s\) per socket/ {gsub(/ /,"",$2); print $2}')" + echo "THREADS_PER_CORE=$(LC_ALL=C lscpu | awk -F: '/Thread\(s\) per core/ {gsub(/ /,"",$2); print $2}')" + args: + executable: /bin/bash + register: cpu_mem + changed_when: false + + - name: Discover NVIDIA GPU if present + ansible.builtin.shell: | + set -euo pipefail + if command -v nvidia-smi >/dev/null 2>&1; then + nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader + else + echo "NO_NVIDIA_SMI" + fi + args: + executable: /bin/bash + register: gpu_info + changed_when: false + + - name: Show discovered resources + ansible.builtin.debug: + msg: + - "{{ cpu_mem.stdout_lines }}" + - "GPU:" + - "{{
gpu_info.stdout_lines }}" diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/discovery/inspect-slurm-state.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/discovery/inspect-slurm-state.yml new file mode 100644 index 0000000..23e9357 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/discovery/inspect-slurm-state.yml @@ -0,0 +1,89 @@ +--- +- name: Inspect current Slurm and Munge state + hosts: slurm_cluster + become: true + gather_facts: true + + tasks: + - name: Basic host info + ansible.builtin.shell: | + set -e + echo "HOST=$(hostname -f 2>/dev/null || hostname)" + echo "SHORT_HOST=$(hostname -s)" + echo "IP_ADDRESSES=$(hostname -I)" + echo "OS=$(lsb_release -ds 2>/dev/null || cat /etc/os-release | grep PRETTY_NAME || true)" + echo "KERNEL=$(uname -r)" + args: + executable: /bin/bash + register: host_info + changed_when: false + + - name: Slurm package info + ansible.builtin.shell: | + dpkg -l | grep -Ei 'slurm|munge' || true + args: + executable: /bin/bash + register: package_info + changed_when: false + + - name: Slurm config paths + ansible.builtin.shell: | + set -e + for p in /etc/slurm /etc/slurm-llnl /etc/munge; do + echo "### $p" + if [ -e "$p" ]; then + find "$p" -maxdepth 2 -type f -printf "%m %u %g %p\n" | sort + else + echo "MISSING" + fi + done + args: + executable: /bin/bash + register: config_paths + changed_when: false + + - name: Service state + ansible.builtin.shell: | + for s in munge slurmctld slurmd; do + echo "### $s" + systemctl is-enabled "$s" 2>/dev/null || true + systemctl is-active "$s" 2>/dev/null || true + done + args: + executable: /bin/bash + register: service_state + changed_when: false + + - name: Slurm commands + ansible.builtin.shell: | + echo "### which" + command -v sinfo || true + command -v scontrol || true + command -v sbatch || true + command -v srun || true + command -v munge || true + command -v unmunge || true + + echo "### sinfo" + sinfo 2>&1 || true + + echo "### scontrol ping" + 
scontrol ping 2>&1 || true + args: + executable: /bin/bash + register: slurm_commands + changed_when: false + + - name: Show inspection report + ansible.builtin.debug: + msg: + - "===== {{ inventory_hostname }} :: host_info =====" + - "{{ host_info.stdout_lines }}" + - "===== {{ inventory_hostname }} :: packages =====" + - "{{ package_info.stdout_lines }}" + - "===== {{ inventory_hostname }} :: config_paths =====" + - "{{ config_paths.stdout_lines }}" + - "===== {{ inventory_hostname }} :: services =====" + - "{{ service_state.stdout_lines }}" + - "===== {{ inventory_hostname }} :: slurm_commands =====" + - "{{ slurm_commands.stdout_lines }}" diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/health/auto-remediate-slurm-health.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/health/auto-remediate-slurm-health.yml new file mode 100644 index 0000000..04d03c2 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/health/auto-remediate-slurm-health.yml @@ -0,0 +1,216 @@ +--- +- name: Detect problematic Slurm nodes + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Detect nodes needing remediation + ansible.builtin.shell: | + set -euo pipefail + + sinfo -N -h -o "%N %T" | awk ' + tolower($2) ~ /down|drain|fail|unknown|not_responding|idle\*/ {print $1} + ' | sort -u + args: + executable: /bin/bash + register: bad_nodes_raw + changed_when: false + + - name: Store bad node list + ansible.builtin.set_fact: + bad_nodes: "{{ bad_nodes_raw.stdout_lines }}" + + - name: Show detected problematic nodes + ansible.builtin.debug: + var: bad_nodes + + +- name: Attempt auto-remediation on problematic nodes + hosts: slurm_compute:slurm_gpu + become: true + gather_facts: false + serial: 1 + + vars: + bad_nodes_from_controller: "{{ hostvars[groups['slurm_controller'][0]].bad_nodes | default([]) }}" + + tasks: + - name: Skip healthy nodes + ansible.builtin.meta: end_host + when: inventory_hostname not in 
bad_nodes_from_controller + + - name: Restart Munge + ansible.builtin.systemd: + name: munge + state: restarted + enabled: true + + - name: Restart slurmd + ansible.builtin.systemd: + name: slurmd + state: restarted + enabled: true + + - name: Validate local services after remediation attempt + ansible.builtin.shell: | + set -euo pipefail + + echo "HOST=$(hostname)" + + echo + echo "### services" + systemctl is-active munge + systemctl is-active slurmd + + echo + echo "### munge" + munge -n | unmunge >/dev/null + echo "munge OK" + + echo + echo "### controller ping" + scontrol ping + + echo + echo "### slurmd listener" + ss -lntp | grep ':6818 ' || true + + echo + echo "### recent slurmd logs" + journalctl -u slurmd -n 30 --no-pager || true + args: + executable: /bin/bash + register: local_repair_check + changed_when: false + + - name: Print local remediation result + ansible.builtin.debug: + var: local_repair_check.stdout_lines + + +- name: Refresh controller and validate remediated nodes + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Restart slurmctld to refresh node states + ansible.builtin.systemd: + name: slurmctld + state: restarted + + - name: Wait for controller + ansible.builtin.command: + cmd: scontrol ping + register: slurmctld_ping + retries: 15 + delay: 2 + until: slurmctld_ping.rc == 0 + changed_when: false + + - name: Clear maintenance state on previously bad nodes + ansible.builtin.shell: | + set -euo pipefail + + bad_nodes="{{ (bad_nodes | default([])) | join(' ') }}" + + if [ -z "$bad_nodes" ]; then + echo "No bad nodes detected. Nothing to clear." 
+ sinfo -N + exit 0 + fi + + for node in $bad_nodes; do + echo "### clearing state on $node" + scontrol update NodeName="$node" State=RESUME 2>/dev/null || true + scontrol update NodeName="$node" State=UNDRAIN 2>/dev/null || true + scontrol update NodeName="$node" State=IDLE 2>/dev/null || true + done + + sleep 5 + sinfo -N + args: + executable: /bin/bash + register: clear_result + changed_when: true + + - name: Print clear-state result + ansible.builtin.debug: + var: clear_result.stdout_lines + + - name: Detect nodes still unhealthy after remediation + ansible.builtin.shell: | + set -euo pipefail + + sinfo -N -h -o "%N %T" | awk ' + tolower($2) ~ /down|drain|fail|unknown|not_responding|idle\*/ {print $1} + ' | sort -u + args: + executable: /bin/bash + register: still_bad_nodes_raw + changed_when: false + + - name: Store still bad nodes + ansible.builtin.set_fact: + still_bad_nodes: "{{ still_bad_nodes_raw.stdout_lines }}" + + - name: Drain nodes that remain unhealthy + ansible.builtin.shell: | + set -euo pipefail + + unresolved_nodes="{{ still_bad_nodes | join(' ') }}" + + if [ -z "$unresolved_nodes" ]; then + echo "No unresolved unhealthy nodes." 
+ sinfo -N + exit 0 + fi + + for node in $unresolved_nodes; do + echo "### draining unresolved node $node" + scontrol update NodeName="$node" State=DRAIN Reason="auto-remediation failed" + done + + sinfo -N + args: + executable: /bin/bash + register: drain_unresolved + changed_when: still_bad_nodes | length > 0 + + - name: Show remediation summary + ansible.builtin.shell: | + set -euo pipefail + + echo "### initial bad nodes" + bad_nodes="{{ (bad_nodes | default([])) | join(' ') }}" + if [ -z "$bad_nodes" ]; then + echo "none" + else + printf '%s\n' $bad_nodes + fi + + echo + echo "### still bad nodes" + still_bad_nodes="{{ (still_bad_nodes | default([])) | join(' ') }}" + if [ -z "$still_bad_nodes" ]; then + echo "none" + else + printf '%s\n' $still_bad_nodes + fi + + echo + echo "### final sinfo" + sinfo -N + + echo + echo "### queue" + squeue + args: + executable: /bin/bash + register: remediation_summary + changed_when: false + + - name: Print remediation summary + ansible.builtin.debug: + var: remediation_summary.stdout_lines diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/health/check-slurm-health.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/health/check-slurm-health.yml new file mode 100644 index 0000000..3dbeea5 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/health/check-slurm-health.yml @@ -0,0 +1,149 @@ +--- +- name: Check Slurm controller health + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Check controller services and cluster state + ansible.builtin.shell: | + set -euo pipefail + + echo "### controller services" + systemctl is-active munge + systemctl is-active slurmctld + systemctl is-active slurmdbd || true + systemctl is-active mariadb || true + + echo + echo "### slurm ping" + scontrol ping + + echo + echo "### nodes" + sinfo -N + + echo + echo "### partitions" + sinfo + + echo + echo "### queue" + squeue + + echo + echo "### problematic nodes" + sinfo -N -h -o "%N 
%T %E" | awk '$2 !~ /idle|alloc|mix/ || $2 ~ /\*$/ {print}' || true + + echo + echo "### accounting" + sacctmgr -n list cluster || true + + echo + echo "### recent failed jobs" + sacct -S today --state=FAILED,CANCELLED,TIMEOUT,NODE_FAIL,OUT_OF_MEMORY \ + --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,NodeList | tail -30 || true + args: + executable: /bin/bash + register: controller_health + changed_when: false + + - name: Print controller health + ansible.builtin.debug: + var: controller_health.stdout_lines + + +- name: Check Slurm worker health + hosts: slurm_compute:slurm_gpu + become: true + gather_facts: true + + tasks: + - name: Check worker services, config and connectivity + ansible.builtin.shell: | + set -euo pipefail + + echo "HOST=$(hostname)" + echo "FQDN=$(hostname -f 2>/dev/null || hostname)" + echo "KERNEL=$(uname -r)" + echo "UPTIME=$(uptime -p)" + + echo + echo "### services" + systemctl is-active munge + systemctl is-active slurmd + + echo + echo "### munge local test" + munge -n | unmunge >/dev/null + echo "munge OK" + + echo + echo "### controller connectivity" + getent hosts slurm-ctl01 || true + scontrol ping + + echo + echo "### slurmd listener" + ss -lntp | grep ':6818 ' || true + + echo + echo "### config checksums" + sha256sum /etc/slurm/slurm.conf /etc/slurm/cgroup.conf 2>/dev/null || true + + echo + echo "### shared filesystem" + test -d /shared + touch /shared/.slurm-health-$(hostname) + ls -l /shared/.slurm-health-$(hostname) + rm -f /shared/.slurm-health-$(hostname) + + echo + echo "### cgroup" + mount | grep cgroup || true + + echo + echo "### gpu check" + if command -v nvidia-smi >/dev/null 2>&1; then + nvidia-smi --query-gpu=index,name,driver_version,memory.total,temperature.gpu,utilization.gpu --format=csv,noheader || true + else + echo "NO_NVIDIA_SMI" + fi + args: + executable: /bin/bash + register: worker_health + changed_when: false + + - name: Print worker health + ansible.builtin.debug: + var:
worker_health.stdout_lines + + +- name: Check Slurm-reported node state consistency + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Build Slurm node health summary + ansible.builtin.shell: | + set -euo pipefail + + echo "### node summary" + sinfo -N -o "%N %P %T %C %m %G %E" + + echo + echo "### full problematic node details" + for node in $(sinfo -N -h -o "%N %T" | awk '$2 ~ /down|drain|fail|unk|not_responding|idle\*/ {print $1}' | sort -u); do + echo + echo "### $node" + scontrol show node "$node" + done + args: + executable: /bin/bash + register: slurm_node_summary + changed_when: false + + - name: Print Slurm node summary + ansible.builtin.debug: + var: slurm_node_summary.stdout_lines diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/health/repair-slurm-node.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/health/repair-slurm-node.yml new file mode 100644 index 0000000..3994980 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/health/repair-slurm-node.yml @@ -0,0 +1,217 @@ +--- +- name: Validate target node + hosts: localhost + gather_facts: false + + tasks: + - name: Require target_node + ansible.builtin.fail: + msg: "Use: ansible-playbook repair-slurm-node.yml -e target_node=" + when: target_node is not defined + + - name: Ensure target_node is in inventory + ansible.builtin.fail: + msg: "target_node={{ target_node }} is not in Ansible inventory" + when: target_node not in groups['all'] + + +- name: Capture node state before repair + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Show target node state before repair + ansible.builtin.shell: | + set -euo pipefail + + echo "### sinfo" + sinfo -N -n {{ target_node }} || true + + echo + echo "### scontrol" + scontrol show node {{ target_node }} || true + + echo + echo "### jobs" + squeue -w {{ target_node }} || true + args: + executable: /bin/bash + register: node_state_before + changed_when: false + + -
name: Print target node state before repair + ansible.builtin.debug: + var: node_state_before.stdout_lines + + +- name: Repair local services on target node + hosts: "{{ target_node }}" + become: true + gather_facts: false + + tasks: + - name: Restart Munge + ansible.builtin.systemd: + name: munge + state: restarted + enabled: true + + - name: Restart slurmd + ansible.builtin.systemd: + name: slurmd + state: restarted + enabled: true + when: + - inventory_hostname in groups.get('slurm_compute', []) or inventory_hostname in groups.get('slurm_gpu', []) + + - name: Validate local repair + ansible.builtin.shell: | + set -euo pipefail + + echo "### services" + systemctl is-active munge + systemctl is-active slurmd + + echo + echo "### munge" + munge -n | unmunge >/dev/null + echo "munge OK" + + echo + echo "### controller ping" + scontrol ping + + echo + echo "### slurmd listener" + ss -lntp | grep ':6818 ' || true + + echo + echo "### recent slurmd logs" + journalctl -u slurmd -n 40 --no-pager || true + args: + executable: /bin/bash + register: local_repair_state + changed_when: false + + - name: Print local repair state + ansible.builtin.debug: + var: local_repair_state.stdout_lines + + +- name: Clear Slurm maintenance/down state after repair + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Restart controller to refresh node state + ansible.builtin.systemd: + name: slurmctld + state: restarted + + - name: Wait for controller + ansible.builtin.command: + cmd: scontrol ping + register: slurmctld_ping + retries: 15 + delay: 2 + until: slurmctld_ping.rc == 0 + changed_when: false + + - name: Clear target node state + ansible.builtin.shell: | + set -euo pipefail + + scontrol update NodeName={{ target_node }} State=RESUME 2>/dev/null || true + scontrol update NodeName={{ target_node }} State=UNDRAIN 2>/dev/null || true + scontrol update NodeName={{ target_node }} State=IDLE 2>/dev/null || true + + sleep 5 + + sinfo -N -n {{ target_node }} 
+ scontrol show node {{ target_node }} + args: + executable: /bin/bash + register: clear_state + changed_when: true + + - name: Wait until node is healthy + ansible.builtin.shell: | + set -euo pipefail + sinfo -N -n {{ target_node }} + scontrol show node {{ target_node }} + args: + executable: /bin/bash + register: node_health_after + retries: 30 + delay: 5 + until: + - node_health_after.rc == 0 + - "'not_responding' not in node_health_after.stdout.lower()" + - "'down' not in node_health_after.stdout.lower()" + - "'drain' not in node_health_after.stdout.lower()" + - "'idle*' not in node_health_after.stdout.lower()" + changed_when: false + + - name: Print node state after repair + ansible.builtin.debug: + var: node_health_after.stdout_lines + + +- name: Submit repair validation job + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Submit validation job to repaired node + ansible.builtin.shell: | + set -euo pipefail + + job_id="$( + sudo -iu slurmuser sbatch --parsable < [-e decom_reason='reason']" + when: target_node is not defined + + - name: Ensure target_node is in inventory + ansible.builtin.fail: + msg: "target_node={{ target_node }} is not in Ansible inventory" + when: target_node not in groups['all'] + + +- name: Drain target node and wait for jobs to leave + hosts: slurm_controller + become: true + gather_facts: false + + vars: + decom_reason_effective: "{{ decom_reason | default('decommission by Ansible') }}" + decom_wait_retries_effective: "{{ decom_wait_retries | default(120) }}" + decom_wait_delay_effective: "{{ decom_wait_delay | default(10) }}" + + tasks: + - name: Show current target node state + ansible.builtin.shell: | + set -euo pipefail + sinfo -N -n {{ target_node }} || true + scontrol show node {{ target_node }} || true + args: + executable: /bin/bash + register: node_state_before + changed_when: false + + - name: Print current target node state + ansible.builtin.debug: + var: node_state_before.stdout_lines + + 
- name: Drain target node + ansible.builtin.command: + cmd: scontrol update NodeName={{ target_node }} State=DRAIN Reason="{{ decom_reason_effective }}" + changed_when: true + + - name: Wait until no jobs are running on target node + ansible.builtin.shell: | + set -euo pipefail + squeue -h -w {{ target_node }} || true + args: + executable: /bin/bash + register: jobs_on_node + retries: "{{ decom_wait_retries_effective | int }}" + delay: "{{ decom_wait_delay_effective | int }}" + until: jobs_on_node.stdout | trim == "" + changed_when: false + + - name: Show drained node state + ansible.builtin.shell: | + set -euo pipefail + sinfo -N -n {{ target_node }} || true + scontrol show node {{ target_node }} | grep -E "NodeName=|State=|Reason=" || true + args: + executable: /bin/bash + register: node_state_drained + changed_when: false + + - name: Print drained node state + ansible.builtin.debug: + var: node_state_drained.stdout_lines + + +- name: Stop Slurm worker service on target node + hosts: "{{ target_node }}" + become: true + gather_facts: false + + tasks: + - name: Stop slurmd + ansible.builtin.systemd: + name: slurmd + state: stopped + enabled: false + when: + - inventory_hostname in groups.get('slurm_compute', []) or inventory_hostname in groups.get('slurm_gpu', []) + + - name: Show slurmd state + ansible.builtin.shell: | + systemctl is-enabled slurmd 2>/dev/null || true + systemctl is-active slurmd 2>/dev/null || true + args: + executable: /bin/bash + register: slurmd_state_after + changed_when: false + + - name: Print slurmd state + ansible.builtin.debug: + var: slurmd_state_after.stdout_lines + + +- name: Mark node down in Slurm controller + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Mark target node DOWN after service stop + ansible.builtin.command: + cmd: scontrol update NodeName={{ target_node }} State=DOWN Reason="decommissioned" + changed_when: true + + - name: Show final node state + ansible.builtin.shell: | + set -euo 
pipefail + sinfo -N -n {{ target_node }} || true + scontrol show node {{ target_node }} | grep -E "NodeName=|State=|Reason=" || true + args: + executable: /bin/bash + register: final_node_state + changed_when: false + + - name: Print final node state + ansible.builtin.debug: + var: final_node_state.stdout_lines diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/lifecycle/provision-slurm-node.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/lifecycle/provision-slurm-node.yml new file mode 100644 index 0000000..2cf8baa --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/lifecycle/provision-slurm-node.yml @@ -0,0 +1,246 @@ +--- +- name: Validate target_node variable + hosts: localhost + gather_facts: false + + tasks: + - name: Require target_node + ansible.builtin.fail: + msg: "Use: ansible-playbook provision-slurm-node.yml -e target_node=" + when: target_node is not defined + + - name: Ensure target_node is in inventory + ansible.builtin.fail: + msg: "target_node={{ target_node }} is not in Ansible inventory" + when: target_node not in groups['all'] + + +- name: Prepare OS, packages and Slurm directories on target node + hosts: "{{ target_node }}" + become: true + gather_facts: true + + tasks: + - name: Ensure target is a Slurm worker or GPU node + ansible.builtin.fail: + msg: "{{ inventory_hostname }} must be in slurm_compute or slurm_gpu group" + when: + - inventory_hostname not in groups.get('slurm_compute', []) + - inventory_hostname not in groups.get('slurm_gpu', []) + + - name: Install Slurm worker packages + ansible.builtin.apt: + name: + - munge + - libmunge2 + - slurm-client + - slurmd + - slurm-wlm-basic-plugins + - slurm-wlm-plugins + - slurm-wlm-mysql-plugin + state: present + update_cache: true + + - name: Ensure Slurm config directory exists + ansible.builtin.file: + path: "{{ slurm_config_dir }}" + state: directory + owner: root + group: root + mode: "0755" + + - name: Ensure Slurm log directory exists + 
ansible.builtin.file: + path: /var/log/slurm + state: directory + owner: slurm + group: slurm + mode: "0755" + + - name: Ensure slurmd spool directory exists + ansible.builtin.file: + path: /var/spool/slurmd + state: directory + owner: slurm + group: slurm + mode: "0755" + + - name: Ensure munge dirs exist + ansible.builtin.file: + path: "{{ item.path }}" + state: directory + owner: munge + group: munge + mode: "{{ item.mode }}" + loop: + - { path: /etc/munge, mode: "0700" } + - { path: /var/log/munge, mode: "0755" } + - { path: /var/lib/munge, mode: "0711" } + - { path: /run/munge, mode: "0755" } + + +- name: Deploy Munge key from controller to target node + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Read controller munge.key + ansible.builtin.slurp: + src: /etc/munge/munge.key + register: controller_munge_key_raw + + - name: Store controller Munge key as fact + ansible.builtin.set_fact: + cluster_munge_key_b64: "{{ controller_munge_key_raw.content }}" + + +- name: Configure target node with Munge and Slurm files + hosts: "{{ target_node }}" + become: true + gather_facts: false + + vars: + controller_host: "{{ groups['slurm_controller'][0] }}" + + tasks: + - name: Deploy shared munge.key + ansible.builtin.copy: + dest: /etc/munge/munge.key + content: "{{ hostvars[controller_host].cluster_munge_key_b64 | b64decode }}" + owner: munge + group: munge + mode: "0400" + notify: + - Restart munge + + - name: Deploy managed slurm.conf + ansible.builtin.template: + src: ../../templates/slurm.conf.j2 + dest: "{{ slurm_config_dir }}/slurm.conf" + owner: root + group: root + mode: "0644" + notify: + - Restart slurmd + + - name: Deploy managed cgroup.conf + ansible.builtin.template: + src: ../../templates/cgroup.conf.j2 + dest: "{{ slurm_config_dir }}/cgroup.conf" + owner: root + group: root + mode: "0644" + when: slurm_enable_cgroup | default(false) | bool + notify: + - Restart slurmd + + - name: Deploy managed gres.conf on GPU nodes + 
ansible.builtin.template: + src: ../../templates/gres.conf.j2 + dest: "{{ slurm_config_dir }}/gres.conf" + owner: root + group: root + mode: "0644" + when: inventory_hostname in groups.get('slurm_gpu', []) + notify: + - Restart slurmd + + - name: Ensure munge is enabled and running + ansible.builtin.systemd: + name: munge + enabled: true + state: started + + - name: Ensure slurmd is enabled and running + ansible.builtin.systemd: + name: slurmd + enabled: true + state: started + + handlers: + - name: Restart munge + ansible.builtin.systemd: + name: munge + state: restarted + + - name: Restart slurmd + ansible.builtin.systemd: + name: slurmd + state: restarted + + +- name: Deploy updated Slurm config to whole cluster and reconfigure controller + hosts: slurm_cluster + become: true + gather_facts: false + + tasks: + - name: Deploy managed slurm.conf to all nodes + ansible.builtin.template: + src: ../../templates/slurm.conf.j2 + dest: "{{ slurm_config_dir }}/slurm.conf" + owner: root + group: root + mode: "0644" + + - name: Deploy managed cgroup.conf to all nodes + ansible.builtin.template: + src: ../../templates/cgroup.conf.j2 + dest: "{{ slurm_config_dir }}/cgroup.conf" + owner: root + group: root + mode: "0644" + when: slurm_enable_cgroup | default(false) | bool + + +- name: Reconfigure Slurm and validate target node + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Reconfigure Slurm controller + ansible.builtin.command: + cmd: scontrol reconfigure + changed_when: true + + - name: Restart Slurm controller after node reprovision + ansible.builtin.systemd: + name: slurmctld + state: restarted + + - name: Wait for Slurm controller after restart + ansible.builtin.command: + cmd: scontrol ping + register: slurmctld_ping_after_restart + retries: 15 + delay: 2 + until: slurmctld_ping_after_restart.rc == 0 + changed_when: false + + - name: Resume target node in Slurm + ansible.builtin.command: + cmd: scontrol update NodeName={{ target_node 
}} State=RESUME + changed_when: true + + - name: Wait until target node is visible and not down + ansible.builtin.shell: | + set -euo pipefail + scontrol show node {{ target_node }} + sinfo -N -n {{ target_node }} + args: + executable: /bin/bash + register: target_node_state + retries: 20 + delay: 3 + until: + - target_node_state.rc == 0 + - "'down' not in target_node_state.stdout.lower()" + - "'not_responding' not in target_node_state.stdout.lower()" + - "'idle*' not in target_node_state.stdout.lower()" + changed_when: false + + - name: Show target node state + ansible.builtin.debug: + var: target_node_state.stdout_lines diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/lifecycle/show-slurm-node.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/lifecycle/show-slurm-node.yml new file mode 100644 index 0000000..69b0e9c --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/lifecycle/show-slurm-node.yml @@ -0,0 +1,33 @@ +--- +- name: Show Slurm node state + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Require target_node + ansible.builtin.fail: + msg: "Use: ansible-playbook show-slurm-node.yml -e target_node=" + when: target_node is not defined + + - name: Show node state + ansible.builtin.shell: | + set -euo pipefail + echo "### sinfo" + sinfo -N -n {{ target_node }} || true + + echo + echo "### scontrol" + scontrol show node {{ target_node }} || true + + echo + echo "### jobs on node" + squeue -w {{ target_node }} || true + args: + executable: /bin/bash + register: node_lifecycle_state + changed_when: false + + - name: Print node lifecycle state + ansible.builtin.debug: + var: node_lifecycle_state.stdout_lines diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/qos/configure-slurm-qos.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/qos/configure-slurm-qos.yml new file mode 100644 index 0000000..9a88356 --- /dev/null +++ 
b/platform-projects/hpc-slurm-ai-cluster/playbooks/qos/configure-slurm-qos.yml @@ -0,0 +1,169 @@ +--- +- name: Configure Slurm QOS, limits and fairshare + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Ensure sacctmgr is available + ansible.builtin.command: + cmd: sacctmgr -n list cluster + changed_when: false + + - name: Validate accounting GPU TRES exists + ansible.builtin.shell: | + set -euo pipefail + + echo "### configured AccountingStorageTRES" + scontrol show config | grep -E "AccountingStorageTRES|AccountingStorageType|AccountingStorageEnforce" + + echo + echo "### known TRES" + sacctmgr show tres + + echo + echo "### checking gres/gpu" + sacctmgr -n show tres format=Type,Name | awk '$1=="gres" && $2=="gpu" {found=1} END {exit !found}' + args: + executable: /bin/bash + register: gpu_tres_check + changed_when: false + + - name: Ensure normal QOS exists + ansible.builtin.shell: | + set -euo pipefail + sacctmgr -i add qos normal Priority=100 + args: + executable: /bin/bash + register: add_qos_normal + changed_when: "'Adding QOS' in (add_qos_normal.stdout + add_qos_normal.stderr)" + failed_when: > + add_qos_normal.rc != 0 and + 'Nothing new added' not in (add_qos_normal.stdout + add_qos_normal.stderr) and + 'already exists' not in (add_qos_normal.stdout + add_qos_normal.stderr) and + 'Already existing' not in (add_qos_normal.stdout + add_qos_normal.stderr) + + - name: Ensure debug-short QOS exists + ansible.builtin.shell: | + set -euo pipefail + sacctmgr -i add qos debug-short Priority=500 + args: + executable: /bin/bash + register: add_qos_debug + changed_when: "'Adding QOS' in (add_qos_debug.stdout + add_qos_debug.stderr)" + failed_when: > + add_qos_debug.rc != 0 and + 'Nothing new added' not in (add_qos_debug.stdout + add_qos_debug.stderr) and + 'already exists' not in (add_qos_debug.stdout + add_qos_debug.stderr) and + 'Already existing' not in (add_qos_debug.stdout + add_qos_debug.stderr) + + - name: Ensure gpu-short QOS
exists + ansible.builtin.shell: | + set -euo pipefail + sacctmgr -i add qos gpu-short Priority=1000 + args: + executable: /bin/bash + register: add_qos_gpu + changed_when: "'Adding QOS' in (add_qos_gpu.stdout + add_qos_gpu.stderr)" + failed_when: > + add_qos_gpu.rc != 0 and + 'Nothing new added' not in (add_qos_gpu.stdout + add_qos_gpu.stderr) and + 'already exists' not in (add_qos_gpu.stdout + add_qos_gpu.stderr) and + 'Already existing' not in (add_qos_gpu.stdout + add_qos_gpu.stderr) + + - name: Ensure maintenance QOS exists + ansible.builtin.shell: | + set -euo pipefail + sacctmgr -i add qos maintenance Priority=5000 + args: + executable: /bin/bash + register: add_qos_maintenance + changed_when: "'Adding QOS' in (add_qos_maintenance.stdout + add_qos_maintenance.stderr)" + failed_when: > + add_qos_maintenance.rc != 0 and + 'Nothing new added' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr) and + 'already exists' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr) and + 'Already existing' not in (add_qos_maintenance.stdout + add_qos_maintenance.stderr) + + - name: Normalize normal QOS settings + ansible.builtin.shell: | + set -euo pipefail + sacctmgr -i modify qos normal set Priority=100 + args: + executable: /bin/bash + changed_when: true + + - name: Normalize debug-short QOS settings + ansible.builtin.shell: | + set -euo pipefail + sacctmgr -i modify qos debug-short set Priority=500 MaxWall=00:10:00 MaxTRESPU=cpu=2 MaxJobsPU=4 + args: + executable: /bin/bash + changed_when: true + + - name: Normalize gpu-short QOS settings + ansible.builtin.shell: | + set -euo pipefail + sacctmgr -i modify qos gpu-short set Priority=1000 MaxWall=01:00:00 MaxTRESPU=gres/gpu=1,cpu=12 MaxJobsPU=2 + args: + executable: /bin/bash + changed_when: true + + - name: Normalize maintenance QOS settings + ansible.builtin.shell: | + set -euo pipefail + sacctmgr -i modify qos maintenance set Priority=5000 MaxWall=02:00:00 + args: + executable: /bin/bash + 
changed_when: true + + - name: Assign QOS set to lab account + ansible.builtin.shell: | + set -euo pipefail + sacctmgr -i modify account {{ slurm_account_name }} set QOS=normal,debug-short,gpu-short,maintenance DefaultQOS=normal Fairshare=100 + args: + executable: /bin/bash + changed_when: true + + - name: Assign default account to slurmuser + ansible.builtin.shell: | + set -euo pipefail + sacctmgr -i modify user where name=slurmuser set DefaultAccount={{ slurm_account_name }} + args: + executable: /bin/bash + changed_when: true + + - name: Assign QOS set to slurmuser association + ansible.builtin.shell: | + set -euo pipefail + sacctmgr -i modify user where name=slurmuser account={{ slurm_account_name }} set QOS=normal,debug-short,gpu-short,maintenance DefaultQOS=normal Fairshare=100 + args: + executable: /bin/bash + changed_when: true + + - name: Show configured QOS and associations + ansible.builtin.shell: | + set -euo pipefail + + echo "### TRES" + sacctmgr show tres + + echo + echo "### QOS" + sacctmgr show qos format=Name%20,Priority,MaxWall,MaxTRESPU%40,MaxJobsPU + + echo + echo "### Associations" + sacctmgr show assoc format=Cluster,Account,User,Share,QOS%60,DefaultQOS,Fairshare + + echo + echo "### Fairshare" + sshare -A {{ slurm_account_name }} || true + args: + executable: /bin/bash + register: qos_state + changed_when: false + + - name: Print QOS state + ansible.builtin.debug: + var: qos_state.stdout_lines diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/qos/validate-slurm-qos-priority.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/qos/validate-slurm-qos-priority.yml new file mode 100644 index 0000000..90c7ad7 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/qos/validate-slurm-qos-priority.yml @@ -0,0 +1,235 @@ +--- +- name: Validate Slurm QOS, fairshare and priority + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Validate priority runtime config + ansible.builtin.shell: | + set 
-euo pipefail + + echo "### priority config" + scontrol show config | grep -E "PriorityType|PriorityWeight|PriorityDecay|PriorityCalc|PriorityMaxAge|PriorityFavor" + + echo + echo "### accounting enforcement" + scontrol show config | grep -E "AccountingStorageType|AccountingStorageEnforce|AccountingStorageTRES" + + echo + echo "### QOS" + sacctmgr show qos format=Name%20,Priority,MaxWall,MaxTRESPU%50,MaxJobsPU + + echo + echo "### associations" + sacctmgr show assoc format=Cluster,Account,User,Share,QOS%80,DefaultQOS,Fairshare + + echo + echo "### fairshare" + sshare -A {{ slurm_account_name }} || true + args: + executable: /bin/bash + register: priority_state + changed_when: false + + - name: Submit debug-short QOS job + ansible.builtin.shell: | + set -euo pipefail + + job_id="$( + sudo -iu slurmuser sbatch --parsable <<'SBATCH' + #!/bin/bash + #SBATCH --job-name=qos-debug-test + #SBATCH --partition=debug + #SBATCH --qos=debug-short + #SBATCH --account={{ slurm_account_name }} + #SBATCH --cpus-per-task=1 + #SBATCH --mem=256M + #SBATCH --time=00:02:00 + #SBATCH --output=/shared/qos-debug-test-%j.out + + echo "HOST=$(hostname)" + echo "USER=$(whoami)" + echo "QOS=${SLURM_JOB_QOS:-}" + echo "ACCOUNT=${SLURM_JOB_ACCOUNT:-}" + echo "SLURM_JOB_ID=$SLURM_JOB_ID" + echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST" + echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)" + date + SBATCH + )" + + echo "JOB_ID=$job_id" + + for i in $(seq 1 90); do + if squeue -h -j "$job_id" | grep -q .; then + squeue -j "$job_id" + sleep 1 + else + break + fi + done + + echo "### sacct" + sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList + + echo "### output" + cat "/shared/qos-debug-test-${job_id}.out" + args: + executable: /bin/bash + register: debug_qos_job + changed_when: true + + - name: Submit gpu-short QOS job + ansible.builtin.shell: | + set -euo pipefail + + job_id="$( + sudo -iu slurmuser sbatch --parsable <<'SBATCH' +
#!/bin/bash + #SBATCH --job-name=qos-gpu-test + #SBATCH --partition=gpu + #SBATCH --qos=gpu-short + #SBATCH --account={{ slurm_account_name }} + #SBATCH --gres=gpu:1 + #SBATCH --cpus-per-task=2 + #SBATCH --mem=1G + #SBATCH --time=00:03:00 + #SBATCH --output=/shared/qos-gpu-test-%j.out + + echo "HOST=$(hostname)" + echo "USER=$(whoami)" + echo "QOS=${SLURM_JOB_QOS:-}" + echo "ACCOUNT=${SLURM_JOB_ACCOUNT:-}" + echo "SLURM_JOB_ID=$SLURM_JOB_ID" + echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST" + echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}" + echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}" + echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)" + echo + nvidia-smi + SBATCH + )" + + echo "JOB_ID=$job_id" + + for i in $(seq 1 120); do + if squeue -h -j "$job_id" | grep -q .; then + squeue -j "$job_id" + sleep 1 + else + break + fi + done + + echo "### sacct" + sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList + + echo "### output" + cat "/shared/qos-gpu-test-${job_id}.out" + args: + executable: /bin/bash + register: gpu_qos_job + changed_when: true + + - name: Validate debug-short walltime limit behavior + ansible.builtin.shell: | + set -euo pipefail + + set +e + output="$( + sudo -iu slurmuser sbatch --parsable <<'SBATCH' 2>&1 + #!/bin/bash + #SBATCH --job-name=qos-limit-fail + #SBATCH --partition=debug + #SBATCH --qos=debug-short + #SBATCH --account={{ slurm_account_name }} + #SBATCH --cpus-per-task=1 + #SBATCH --mem=256M + #SBATCH --time=00:30:00 + #SBATCH --output=/shared/qos-limit-fail-%j.out + + sleep 10 + SBATCH + )" + rc=$?
+ set -e + + echo "RC=$rc" + echo "$output" + + if [ "$rc" -ne 0 ]; then + echo "Limit rejection test passed at submit time" + exit 0 + fi + + job_id="$output" + echo "Submitted job despite expected limit check: $job_id" + + sleep 3 + + echo "### squeue" + squeue -j "$job_id" -o "%.18i %.9P %.20j %.8u %.2t %.10M %.6D %R" || true + + echo + echo "### job detail" + scontrol show job "$job_id" || true + + state="$(squeue -h -j "$job_id" -o "%T" || true)" + reason="$(squeue -h -j "$job_id" -o "%R" || true)" + + echo "STATE=$state" + echo "REASON=$reason" + + if echo "$state" | grep -qE "PENDING|CONFIGURING"; then + if echo "$reason" | grep -qiE "qos|limit|time|max|assoc"; then + echo "Limit enforcement test passed via pending reason" + scancel "$job_id" || true + exit 0 + fi + fi + + echo "Job was accepted without an obvious QOS/limit pending reason" + scancel "$job_id" || true + exit 1 + args: + executable: /bin/bash + register: limit_rejection + changed_when: false + + - name: Show priority and fairshare snapshot + ansible.builtin.shell: | + set -euo pipefail + + echo "### queue" + squeue || true + + echo + echo "### sprio" + sprio || true + + echo + echo "### sshare" + sshare -A {{ slurm_account_name }} || true + + echo + echo "### recent sacct" + sacct -S today --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,NodeList | tail -40 + args: + executable: /bin/bash + register: priority_snapshot + changed_when: false + + - name: Print validation result + ansible.builtin.debug: + msg: + - "### priority state" + - "{{ priority_state.stdout_lines }}" + - "### debug QOS job" + - "{{ debug_qos_job.stdout_lines }}" + - "### GPU QOS job" + - "{{ gpu_qos_job.stdout_lines }}" + - "### limit rejection" + - "{{ limit_rejection.stdout_lines }}" + - "### priority snapshot" + - "{{ priority_snapshot.stdout_lines }}" diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-cgroup-cpu-gpu-node.yml 
b/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-cgroup-cpu-gpu-node.yml new file mode 100644 index 0000000..40f75b1 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-cgroup-cpu-gpu-node.yml @@ -0,0 +1,59 @@ +--- +- name: Test CPU cgroup enforcement on gpu01 + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Submit cgroup CPU test to gpu01 + ansible.builtin.shell: | + set -euo pipefail + + job_id="$( + sudo -iu slurmuser sbatch --parsable <<'SBATCH' + #!/bin/bash + #SBATCH --job-name=cgroup-cpu-test + #SBATCH --partition=all + #SBATCH --nodelist=gpu01 + #SBATCH --cpus-per-task=2 + #SBATCH --mem=1G + #SBATCH --time=00:02:00 + #SBATCH --output=/shared/cgroup-cpu-test-%j.out + + echo "HOST=$(hostname)" + echo "SLURM_JOB_ID=$SLURM_JOB_ID" + echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST" + echo "SLURM_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK:-}" + echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)" + echo "MEM_ALLOWED=$(grep Mems_allowed_list /proc/self/status || true)" + echo + echo "### cgroup" + cat /proc/self/cgroup + echo + echo "### mounted cgroups" + mount | grep cgroup || true + sleep 5 + SBATCH + )" + + echo "JOB_ID=$job_id" + + for i in $(seq 1 60); do + if sudo -iu slurmuser squeue -h -j "$job_id" | grep -q .; then + sudo -iu slurmuser squeue -j "$job_id" + sleep 1 + else + break + fi + done + + echo "### output" + cat "/shared/cgroup-cpu-test-${job_id}.out" + args: + executable: /bin/bash + register: cgroup_cpu_result + changed_when: true + + - name: Show cgroup CPU result + ansible.builtin.debug: + var: cgroup_cpu_result.stdout_lines diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-cpu-job.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-cpu-job.yml new file mode 100644 index 0000000..d236ea8 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-cpu-job.yml @@ -0,0 +1,60 @@ +--- +- name: Submit CPU test job + 
hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Submit test job to debug partition + ansible.builtin.shell: | + set -euo pipefail + + job_id="$( + sudo -iu slurmuser sbatch --parsable <<'SBATCH' + #!/bin/bash + #SBATCH --job-name=cpu-test + #SBATCH --partition=debug + #SBATCH --cpus-per-task=1 + #SBATCH --mem=512M + #SBATCH --time=00:02:00 + #SBATCH --output=/shared/cpu-test-%j.out + + echo "HOST=$(hostname)" + echo "USER=$(whoami)" + echo "SLURM_JOB_ID=$SLURM_JOB_ID" + echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST" + echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)" + date + SBATCH + )" + + echo "JOB_ID=$job_id" + + for i in $(seq 1 60); do + if sudo -iu slurmuser squeue -h -j "$job_id" | grep -q .; then + sudo -iu slurmuser squeue -j "$job_id" + sleep 1 + else + break + fi + done + + echo "### sacct" + sudo -iu slurmuser sacct -j "$job_id" --format=JobID,JobName,Partition,State,ExitCode 2>/dev/null || true + + echo "### output" + if [ -f "/shared/cpu-test-${job_id}.out" ]; then + cat "/shared/cpu-test-${job_id}.out" + else + echo "Output file not found: /shared/cpu-test-${job_id}.out" + find /shared -maxdepth 1 -name "cpu-test-*.out" -ls | tail -5 || true + exit 1 + fi + args: + executable: /bin/bash + register: cpu_job_result + changed_when: true + + - name: Show CPU job result + ansible.builtin.debug: + var: cpu_job_result.stdout_lines diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-gpu-deny-without-gres.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-gpu-deny-without-gres.yml new file mode 100644 index 0000000..4b1a568 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-gpu-deny-without-gres.yml @@ -0,0 +1,58 @@ +--- +- name: Test GPU access without GRES allocation + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Submit job to gpu01 without requesting GPU + ansible.builtin.shell: | + set -euo pipefail + + 
job_id="$( + sudo -iu slurmuser sbatch --parsable <<'SBATCH' + #!/bin/bash + #SBATCH --job-name=gpu-deny-test + #SBATCH --partition=all + #SBATCH --nodelist=gpu01 + #SBATCH --cpus-per-task=1 + #SBATCH --mem=1G + #SBATCH --time=00:02:00 + #SBATCH --output=/shared/gpu-deny-test-%j.out + + echo "HOST=$(hostname)" + echo "SLURM_JOB_ID=$SLURM_JOB_ID" + echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST" + echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}" + echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}" + echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)" + echo + echo "### ls nvidia devices" + ls -l /dev/nvidia* 2>&1 || true + echo + echo "### nvidia-smi without GRES" + nvidia-smi 2>&1 || true + SBATCH + )" + + echo "JOB_ID=$job_id" + + for i in $(seq 1 60); do + if sudo -iu slurmuser squeue -h -j "$job_id" | grep -q .; then + sudo -iu slurmuser squeue -j "$job_id" + sleep 1 + else + break + fi + done + + echo "### output" + cat "/shared/gpu-deny-test-${job_id}.out" + args: + executable: /bin/bash + register: gpu_deny_result + changed_when: true + + - name: Show GPU deny test result + ansible.builtin.debug: + var: gpu_deny_result.stdout_lines diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-gpu-job.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-gpu-job.yml new file mode 100644 index 0000000..d768d2b --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-gpu-job.yml @@ -0,0 +1,70 @@ +--- +- name: Submit GPU test job + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Submit test job to gpu partition + ansible.builtin.shell: | + set -euo pipefail + + job_id="$( + sudo -iu slurmuser sbatch --parsable <<'SBATCH' + #!/bin/bash + #SBATCH --job-name=gpu-test + #SBATCH --partition=gpu + #SBATCH --gres=gpu:1 + #SBATCH --cpus-per-task=2 + #SBATCH --mem=2G + #SBATCH --time=00:03:00 + #SBATCH --output=/shared/gpu-test-%j.out + + echo "HOST=$(hostname)" + echo "USER=$(whoami)" + 
echo "SLURM_JOB_ID=$SLURM_JOB_ID" + echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST" + echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}" + echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}" + echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)" + echo + + echo "### nvidia-smi" + nvidia-smi + + echo + echo "### GPU process table" + nvidia-smi pmon -c 1 || true + SBATCH + )" + + echo "JOB_ID=$job_id" + + for i in $(seq 1 90); do + if sudo -iu slurmuser squeue -h -j "$job_id" | grep -q .; then + sudo -iu slurmuser squeue -j "$job_id" + sleep 1 + else + break + fi + done + + echo "### sacct" + sudo -iu slurmuser sacct -j "$job_id" --format=JobID,JobName,Partition,State,ExitCode 2>/dev/null || true + + echo "### output" + if [ -f "/shared/gpu-test-${job_id}.out" ]; then + cat "/shared/gpu-test-${job_id}.out" + else + echo "Output file not found: /shared/gpu-test-${job_id}.out" + find /shared -maxdepth 1 -name "gpu-test-*.out" -ls | tail -5 || true + exit 1 + fi + args: + executable: /bin/bash + register: gpu_job_result + changed_when: true + + - name: Show GPU job result + ansible.builtin.debug: + var: gpu_job_result.stdout_lines diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-specific-node.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-specific-node.yml new file mode 100644 index 0000000..8245fda --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-specific-node.yml @@ -0,0 +1,95 @@ +--- +- name: Submit job to specific Slurm node + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Require target_node + ansible.builtin.fail: + msg: "Use: ansible-playbook test-specific-node.yml -e target_node=" + when: target_node is not defined + + - name: Submit test job to target node + ansible.builtin.shell: | + set -euo pipefail + + job_id="$( + sudo -iu slurmuser sbatch --parsable </dev/null \ + | head -n 1 \ + | cut -d'|' -f1 \ + | awk '{print $1}' + )" + + if echo 
"$final_state" | grep -qE "COMPLETED|FAILED|CANCELLED|TIMEOUT|NODE_FAIL|OUT_OF_MEMORY"; then + break + fi + + sleep 1 + done + + echo "FINAL_STATE=${final_state:-UNKNOWN}" + + echo "### sacct" + sacct -j "$job_id" --format=JobID,JobName,User,Account,QOS,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList + + echo "### output" + cat "/shared/node-test-${job_id}.out" + + if [ "${final_state:-UNKNOWN}" != "COMPLETED" ]; then + echo "Job did not reach COMPLETED state according to sacct" + exit 1 + fi + args: + executable: /bin/bash + register: node_test + changed_when: true + + - name: Show node test result + ansible.builtin.debug: + var: node_test.stdout_lines diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-sreport-usage.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-sreport-usage.yml new file mode 100644 index 0000000..7061151 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/test-sreport-usage.yml @@ -0,0 +1,60 @@ +--- +- name: Generate measurable Slurm usage for sreport + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Submit CPU usage job + ansible.builtin.shell: | + set -euo pipefail + + job_id="$( + sudo -iu slurmuser sbatch --parsable <<'SBATCH' + #!/bin/bash + #SBATCH --job-name=sreport-usage + #SBATCH --partition=debug + #SBATCH --cpus-per-task=2 + #SBATCH --mem=512M + #SBATCH --time=00:03:00 + #SBATCH --output=/shared/sreport-usage-%j.out + + echo "HOST=$(hostname)" + echo "SLURM_JOB_ID=$SLURM_JOB_ID" + echo "SLURM_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK:-}" + echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)" + echo "Burning CPU for 90 seconds" + + timeout 90 bash -c 'while true; do :; done' & + timeout 90 bash -c 'while true; do :; done' & + wait + + echo "Done" + date + SBATCH + )" + + echo "JOB_ID=$job_id" + + for i in $(seq 1 150); do + if squeue -h -j "$job_id" | grep -q .; then + squeue -j "$job_id" + sleep 2 + else + break + fi + 
done + + echo "### sacct" + sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList + + echo "### output" + cat "/shared/sreport-usage-${job_id}.out" + args: + executable: /bin/bash + register: sreport_usage_job + changed_when: true + + - name: Show usage job result + ansible.builtin.debug: + var: sreport_usage_job.stdout_lines diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/validate-slurm-operator.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/validate-slurm-operator.yml new file mode 100644 index 0000000..ed4a119 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/tests/validate-slurm-operator.yml @@ -0,0 +1,140 @@ +--- +- name: Validate Slurm operator user and SSH mesh + hosts: slurm_cluster + become: true + gather_facts: false + + vars: + slurm_operator_user: slurmuser # literal default (same as the other plays); a self-referencing template here fails with "recursive loop detected"; override with -e slurm_operator_user=... + slurm_hosts: "{{ groups['slurm_cluster'] }}" + + tasks: + - name: Validate slurmuser exists + ansible.builtin.command: + cmd: id {{ slurm_operator_user }} + changed_when: false + + - name: Validate sinfo as slurmuser + ansible.builtin.command: + cmd: sudo -iu {{ slurm_operator_user }} sinfo + changed_when: false + + - name: Validate squeue as slurmuser + ansible.builtin.command: + cmd: sudo -iu {{ slurm_operator_user }} squeue + changed_when: false + + - name: Validate SSH mesh as slurmuser + ansible.builtin.shell: | + set -euo pipefail + for h in {{ slurm_hosts | join(' ') }}; do + echo "=== $h ===" + ssh -o BatchMode=yes -o ConnectTimeout=5 "$h" hostname + done + args: + executable: /bin/bash + become_user: "{{ slurm_operator_user }}" + changed_when: false + + +- name: Validate Slurm controller commands + hosts: slurm_controller + become: true + gather_facts: false + + vars: + slurm_operator_user: slurmuser + + tasks: + - name: Validate slurmctld status through sudo + ansible.builtin.command: + cmd: sudo -iu {{ slurm_operator_user }} sudo -n systemctl 
status slurmctld --no-pager + changed_when: false + + - name: Validate controller Slurm commands + ansible.builtin.shell: | + set -euo pipefail + sudo -iu {{ slurm_operator_user }} sinfo + sudo -iu {{ slurm_operator_user }} squeue + sudo -iu {{ slurm_operator_user }} scontrol show nodes + args: + executable: /bin/bash + changed_when: false + + +- name: Validate Slurm worker commands + hosts: slurm_compute:slurm_gpu + become: true + gather_facts: false + + vars: + slurm_operator_user: slurmuser + + tasks: + - name: Validate slurmd status through sudo + ansible.builtin.command: + cmd: sudo -iu {{ slurm_operator_user }} sudo -n systemctl status slurmd --no-pager + changed_when: false + + - name: Validate worker Slurm commands + ansible.builtin.shell: | + set -euo pipefail + sudo -iu {{ slurm_operator_user }} sinfo + sudo -iu {{ slurm_operator_user }} squeue + sudo -iu {{ slurm_operator_user }} scontrol show nodes + args: + executable: /bin/bash + changed_when: false + + +- name: Validate basic job submission + hosts: slurm_controller + become: true + gather_facts: false + + vars: + slurm_operator_user: slurmuser + + tasks: + - name: Submit simple Slurm test job as slurmuser + ansible.builtin.shell: | + set -euo pipefail + + job_id="$( + sudo -iu {{ slurm_operator_user }} sbatch --parsable <<'SBATCH' + #!/bin/bash + #SBATCH --job-name=ansible-validate + #SBATCH --partition=debug + #SBATCH --time=00:01:00 + #SBATCH --output=/tmp/ansible-validate-%j.out + + hostname + whoami + date + SBATCH + )" + + echo "$job_id" + + for i in $(seq 1 20); do + state="$(sudo -iu {{ slurm_operator_user }} squeue -h -j "$job_id" -o "%T" || true)" + if [ -z "$state" ]; then + break + fi + echo "job_state=$state" + sleep 1 + done + + sudo -iu {{ slurm_operator_user }} sacct -j "$job_id" --format=JobID,JobName,State,ExitCode 2>/dev/null || true + + if ls /tmp/ansible-validate-"$job_id".out >/dev/null 2>&1; then + cat /tmp/ansible-validate-"$job_id".out + fi + args: + executable: /bin/bash + 
register: slurm_job_test + changed_when: true + + - name: Show basic job submission result + ansible.builtin.debug: + var: slurm_job_test.stdout_lines diff --git a/platform-projects/hpc-slurm-ai-cluster/playbooks/upgrade/canary-slurm-node-upgrade.yml b/platform-projects/hpc-slurm-ai-cluster/playbooks/upgrade/canary-slurm-node-upgrade.yml new file mode 100644 index 0000000..7d87685 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/playbooks/upgrade/canary-slurm-node-upgrade.yml @@ -0,0 +1,236 @@ +--- +- name: Validate canary node variable + hosts: localhost + gather_facts: false + + vars: + canary_node_effective: "{{ canary_node | default('slurm-c02') }}" + + tasks: + - name: Ensure canary node is in inventory + ansible.builtin.fail: + msg: "canary_node={{ canary_node_effective }} is not in inventory" + when: canary_node_effective not in groups['all'] + + - name: Ensure canary node is not the controller + ansible.builtin.fail: + msg: "Do not use controller as canary for worker rolling upgrade" + when: canary_node_effective in groups['slurm_controller'] + + +- name: Drain canary node + hosts: slurm_controller + become: true + gather_facts: false + + vars: + canary_node_effective: "{{ canary_node | default('slurm-c02') }}" + + tasks: + - name: Show canary state before drain + ansible.builtin.shell: | + set -euo pipefail + sinfo -N -n {{ canary_node_effective }} || true + scontrol show node {{ canary_node_effective }} || true + squeue -w {{ canary_node_effective }} || true + args: + executable: /bin/bash + register: canary_before + changed_when: false + + - name: Print canary state before drain + ansible.builtin.debug: + var: canary_before.stdout_lines + + - name: Drain canary node + ansible.builtin.command: + cmd: scontrol update NodeName={{ canary_node_effective }} State=DRAIN Reason="canary OS upgrade" + changed_when: true + + - name: Wait until canary has no running jobs + ansible.builtin.shell: | + set -euo pipefail + squeue -h -w {{ canary_node_effective 
}} || true + args: + executable: /bin/bash + register: canary_jobs + retries: 120 + delay: 10 + until: canary_jobs.stdout | trim == "" + changed_when: false + + +- name: Upgrade canary node OS packages + hosts: "{{ canary_node | default('slurm-c02') }}" + become: true + gather_facts: true + + tasks: + - name: Ensure apt cache is updated + ansible.builtin.apt: + update_cache: true + cache_valid_time: 1800 + + - name: Full upgrade packages + ansible.builtin.apt: + upgrade: full + autoremove: true + autoclean: true + register: apt_upgrade_result + + - name: Check if reboot is required + ansible.builtin.stat: + path: /var/run/reboot-required + register: reboot_required + + - name: Show upgrade summary + ansible.builtin.debug: + msg: + - "Host: {{ inventory_hostname }}" + - "Apt changed: {{ apt_upgrade_result.changed }}" + - "Reboot required: {{ reboot_required.stat.exists }}" + + - name: Reboot canary if required + ansible.builtin.reboot: + msg: "Reboot after canary OS upgrade" + reboot_timeout: 900 + connect_timeout: 20 + pre_reboot_delay: 5 + post_reboot_delay: 20 + when: reboot_required.stat.exists + + - name: Ensure munge is running + ansible.builtin.systemd: + name: munge + state: restarted + enabled: true + + - name: Ensure slurmd is running + ansible.builtin.systemd: + name: slurmd + state: restarted + enabled: true + + - name: Validate local services + ansible.builtin.shell: | + set -euo pipefail + systemctl is-active munge + systemctl is-active slurmd + munge -n | unmunge >/dev/null + scontrol ping + args: + executable: /bin/bash + changed_when: false + + +- name: Resume canary node and run canary job + hosts: slurm_controller + become: true + gather_facts: false + + vars: + canary_node_effective: "{{ canary_node | default('slurm-c02') }}" + + tasks: + - name: Reconfigure controller + ansible.builtin.command: + cmd: scontrol reconfigure + changed_when: true + + - name: Restart controller to refresh node state + ansible.builtin.systemd: + name: slurmctld + 
state: restarted + + - name: Wait for controller + ansible.builtin.command: + cmd: scontrol ping + register: slurmctld_ping + retries: 15 + delay: 2 + until: slurmctld_ping.rc == 0 + changed_when: false + + - name: Clear canary node maintenance state + ansible.builtin.shell: | + set -euo pipefail + + scontrol update NodeName={{ canary_node_effective }} State=RESUME 2>/dev/null || true + scontrol update NodeName={{ canary_node_effective }} State=UNDRAIN 2>/dev/null || true + scontrol update NodeName={{ canary_node_effective }} State=IDLE 2>/dev/null || true + + sleep 3 + sinfo -N -n {{ canary_node_effective }} + scontrol show node {{ canary_node_effective }} + args: + executable: /bin/bash + register: resume_canary + changed_when: true + + - name: Wait until canary is IDLE and responding + ansible.builtin.shell: | + set -euo pipefail + sinfo -N -n {{ canary_node_effective }} + scontrol show node {{ canary_node_effective }} + args: + executable: /bin/bash + register: canary_state + retries: 30 + delay: 5 + until: + - canary_state.rc == 0 + - "'not_responding' not in canary_state.stdout.lower()" + - "'down' not in canary_state.stdout.lower()" + - "'drain' not in canary_state.stdout.lower()" + - "'idle*' not in canary_state.stdout.lower()" + changed_when: false + + - name: Submit canary test job to upgraded node + ansible.builtin.shell: | + set -euo pipefail + + job_id="$( + sudo -iu slurmuser sbatch --parsable </dev/null + scontrol ping + args: + executable: /bin/bash + changed_when: false + + post_tasks: + - name: Restart controller to refresh state after node upgrade + ansible.builtin.systemd: + name: slurmctld + state: restarted + delegate_to: "{{ groups['slurm_controller'][0] }}" + run_once: false + + - name: Wait for controller after restart + ansible.builtin.command: + cmd: scontrol ping + delegate_to: "{{ groups['slurm_controller'][0] }}" + register: slurmctld_ping + retries: 15 + delay: 2 + until: slurmctld_ping.rc == 0 + changed_when: false + + - name: Clear 
upgraded node maintenance state + ansible.builtin.shell: | + set -euo pipefail + + scontrol update NodeName={{ inventory_hostname }} State=RESUME 2>/dev/null || true + scontrol update NodeName={{ inventory_hostname }} State=UNDRAIN 2>/dev/null || true + scontrol update NodeName={{ inventory_hostname }} State=IDLE 2>/dev/null || true + + sleep 3 + sinfo -N -n {{ inventory_hostname }} + scontrol show node {{ inventory_hostname }} + args: + executable: /bin/bash + delegate_to: "{{ groups['slurm_controller'][0] }}" + register: resume_node + changed_when: true + + - name: Wait until node is healthy + ansible.builtin.shell: | + set -euo pipefail + sinfo -N -n {{ inventory_hostname }} + scontrol show node {{ inventory_hostname }} + args: + executable: /bin/bash + delegate_to: "{{ groups['slurm_controller'][0] }}" + register: upgraded_node_state + retries: 30 + delay: 5 + until: + - upgraded_node_state.rc == 0 + - "'not_responding' not in upgraded_node_state.stdout.lower()" + - "'down' not in upgraded_node_state.stdout.lower()" + - "'drain' not in upgraded_node_state.stdout.lower()" + - "'idle*' not in upgraded_node_state.stdout.lower()" + changed_when: false + + - name: Submit node-local post-upgrade test job + ansible.builtin.shell: | + set -euo pipefail + + job_id="$( + sudo -iu slurmuser sbatch --parsable </dev/null || hostname)" + echo "KERNEL=$(uname -r)" + echo "UPTIME=$(uptime -p)" + + echo + echo "### services" + systemctl is-active munge + systemctl is-active slurmd + + echo + echo "### munge local test" + munge -n | unmunge >/dev/null + echo "munge OK" + + echo + echo "### controller ping" + scontrol ping + + echo + echo "### local slurm.conf checksum" + sha256sum /etc/slurm/slurm.conf /etc/slurm/cgroup.conf 2>/dev/null || true + + echo + echo "### gpu check if present" + if command -v nvidia-smi >/dev/null 2>&1; then + nvidia-smi --query-gpu=index,name,driver_version,memory.total --format=csv,noheader || true + else + echo "NO_NVIDIA_SMI" + fi + args: + 
executable: /bin/bash + register: worker_state + changed_when: false + + - name: Print worker state + ansible.builtin.debug: + var: worker_state.stdout_lines + + +- name: Submit post-upgrade CPU validation job + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Submit CPU validation job to debug partition + ansible.builtin.shell: | + set -euo pipefail + + job_id="$( + sudo -iu slurmuser sbatch --parsable <<'SBATCH' + #!/bin/bash + #SBATCH --job-name=os-upgrade-cpu-test + #SBATCH --partition=debug + #SBATCH --cpus-per-task=1 + #SBATCH --mem=256M + #SBATCH --time=00:02:00 + #SBATCH --output=/shared/os-upgrade-cpu-test-%j.out + + echo "HOST=$(hostname)" + echo "USER=$(whoami)" + echo "SLURM_JOB_ID=$SLURM_JOB_ID" + echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST" + echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)" + echo "KERNEL=$(uname -r)" + date + SBATCH + )" + + echo "JOB_ID=$job_id" + + for i in $(seq 1 90); do + if squeue -h -j "$job_id" | grep -q .; then + squeue -j "$job_id" + sleep 1 + else + break + fi + done + + echo "### sacct" + sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList + + echo "### output" + cat "/shared/os-upgrade-cpu-test-${job_id}.out" + args: + executable: /bin/bash + register: cpu_validation_job + changed_when: true + + - name: Print CPU validation job + ansible.builtin.debug: + var: cpu_validation_job.stdout_lines + + +- name: Submit post-upgrade GPU validation job + hosts: slurm_controller + become: true + gather_facts: false + + tasks: + - name: Submit GPU validation job to gpu partition + ansible.builtin.shell: | + set -euo pipefail + + job_id="$( + sudo -iu slurmuser sbatch --parsable <<'SBATCH' + #!/bin/bash + #SBATCH --job-name=os-upgrade-gpu-test + #SBATCH --partition=gpu + #SBATCH --gres=gpu:1 + #SBATCH --cpus-per-task=2 + #SBATCH --mem=1G + #SBATCH --time=00:03:00 + #SBATCH --output=/shared/os-upgrade-gpu-test-%j.out + + echo 
"HOST=$(hostname)" + echo "USER=$(whoami)" + echo "SLURM_JOB_ID=$SLURM_JOB_ID" + echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST" + echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-}" + echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}" + echo "CPUS_ALLOWED=$(grep Cpus_allowed_list /proc/self/status)" + echo "KERNEL=$(uname -r)" + echo + nvidia-smi + SBATCH + )" + + echo "JOB_ID=$job_id" + + for i in $(seq 1 120); do + if squeue -h -j "$job_id" | grep -q .; then + squeue -j "$job_id" + sleep 1 + else + break + fi + done + + echo "### sacct" + sacct -j "$job_id" --format=JobID,JobName,User,Partition,State,ExitCode,Elapsed,AllocCPUS,ReqMem,NodeList + + echo "### output" + cat "/shared/os-upgrade-gpu-test-${job_id}.out" + args: + executable: /bin/bash + register: gpu_validation_job + changed_when: true + + - name: Print GPU validation job + ansible.builtin.debug: + var: gpu_validation_job.stdout_lines diff --git a/platform-projects/hpc-slurm-ai-cluster/prompts/codex/repo-documentation.md b/platform-projects/hpc-slurm-ai-cluster/prompts/codex/repo-documentation.md new file mode 100644 index 0000000..6a119f8 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/prompts/codex/repo-documentation.md @@ -0,0 +1,15 @@ +# Codex prompt: generate repository documentation + +You are working in an Ansible repository that automates a Slurm AI/HPC lab. + +Please review the repository and generate or improve documentation under `docs/` with the following goals: + +1. Explain the architecture and repository layout. +2. Document the end-to-end deployment sequence. +3. Document operational workflows: provisioning, decommissioning, rolling upgrades, health checks and auto-remediation. +4. Document SlurmDBD accounting, QOS, fairshare and priority workflows. +5. Add troubleshooting notes based on the playbooks and templates. +6. Avoid exposing secrets, real IP addresses, real hostnames, SQL dumps, backup archives, private keys or vault content. +7. Keep all text in English. 
+ +Output should be practical, operator-focused and suitable for a public Git repository. diff --git a/platform-projects/hpc-slurm-ai-cluster/templates/cgroup.conf.j2 b/platform-projects/hpc-slurm-ai-cluster/templates/cgroup.conf.j2 new file mode 100644 index 0000000..08d822e --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/templates/cgroup.conf.j2 @@ -0,0 +1,16 @@ +# Managed by Ansible +# Slurm cgroup configuration + +CgroupPlugin=autodetect + +ConstrainCores=yes +ConstrainRAMSpace=yes +ConstrainSwapSpace=no +ConstrainDevices=yes + +AllowedRAMSpace=100 +AllowedSwapSpace=0 +MaxRAMPercent=100 +MaxSwapPercent=0 + +MinRAMSpace=30 diff --git a/platform-projects/hpc-slurm-ai-cluster/templates/gres.conf.j2 b/platform-projects/hpc-slurm-ai-cluster/templates/gres.conf.j2 new file mode 100644 index 0000000..4a81231 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/templates/gres.conf.j2 @@ -0,0 +1,4 @@ +# Managed by Ansible +{% for node in slurm_nodes if node.managed_state | default('present') == 'present' and node.gres | default('') | length > 0 %} +NodeName={{ node.name }} Name=gpu File={{ node.gres_file | default('/dev/nvidia0') }} +{% endfor %} diff --git a/platform-projects/hpc-slurm-ai-cluster/templates/slurm.conf.j2 b/platform-projects/hpc-slurm-ai-cluster/templates/slurm.conf.j2 new file mode 100644 index 0000000..f8f67a4 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/templates/slurm.conf.j2 @@ -0,0 +1,67 @@ +# Managed by Ansible + +ClusterName={{ slurm_cluster_name }} +SlurmctldHost={{ slurm_control_machine }}({{ slurm_control_addr }}) + +SlurmUser={{ slurm_user }} +AuthType=auth/munge +StateSaveLocation=/var/spool/slurmctld +SlurmdSpoolDir=/var/spool/slurmd +SwitchType=switch/none +MpiDefault={{ slurm_default_mpi_type }} +ProctrackType={{ slurm_proctrack_type }} +ReturnToService={{ slurm_return_to_service }} +{% if slurm_gres_types is defined and slurm_gres_types | length > 0 %} +GresTypes={{ slurm_gres_types }} +{% endif %} + 
+SlurmctldPidFile=/run/slurmctld.pid +SlurmdPidFile=/run/slurmd.pid +SlurmctldPort={{ slurmctld_port }} +SlurmdPort={{ slurmd_port }} + +TaskPlugin={{ slurm_task_plugin }} +SelectType={{ slurm_select_type }} +SelectTypeParameters={{ slurm_select_type_parameters }} + +SchedulerType=sched/backfill +# Priority / fairshare +PriorityType={{ slurm_priority_type | default('priority/multifactor') }} +PriorityDecayHalfLife={{ slurm_priority_decay_half_life | default('7-0') }} +PriorityCalcPeriod={{ slurm_priority_calc_period | default(5) }} +PriorityFavorSmall={{ slurm_priority_favor_small | default('NO') }} +PriorityWeightAge={{ slurm_priority_weight_age | default(1000) }} +PriorityWeightFairshare={{ slurm_priority_weight_fairshare | default(10000) }} +PriorityWeightJobSize={{ slurm_priority_weight_job_size | default(1000) }} +PriorityWeightPartition={{ slurm_priority_weight_partition | default(1000) }} +PriorityWeightQOS={{ slurm_priority_weight_qos | default(10000) }} +PriorityMaxAge={{ slurm_priority_max_age | default('1-0') }} + +SlurmctldTimeout=120 +SlurmdTimeout=300 +InactiveLimit=0 +KillWait=30 +Waittime=0 + +AccountingStorageType={{ slurm_accounting_storage_type }} +{% if slurm_accounting_storage_type == "accounting_storage/slurmdbd" %} +AccountingStorageHost={{ slurm_accounting_storage_host }} +AccountingStoragePort={{ slurm_accounting_storage_port }} +AccountingStorageEnforce={{ slurm_accounting_storage_enforce | default('associations,limits,qos') }} +AccountingStorageTRES={{ slurm_accounting_storage_tres | default('cpu,mem,energy,node,billing,fs/disk,pages,vmem,gres/gpu') }} +{% endif %} +JobAcctGatherType={{ slurm_job_acct_gather_type | default('jobacct_gather/none') }} +JobCompType={{ slurm_job_comp_type }} + +SlurmctldDebug=info +SlurmdDebug=info +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmdLogFile=/var/log/slurm/slurmd.log + +{% for node in slurm_nodes if node.managed_state | default('present') == 'present' %} +NodeName={{ node.name }} NodeAddr={{ 
node.addr }} CPUs={{ node.cpus }}{% if node.topology | default('') | length > 0 %} {{ node.topology }}{% endif %} RealMemory={{ node.real_memory }}{% if node.gres | default('') | length > 0 %} Gres={{ node.gres }}{% endif %}{% if node.features | default('') | length > 0 %} Feature={{ node.features }}{% endif %} State=UNKNOWN +{% endfor %} + +{% for partition in slurm_partitions %} +PartitionName={{ partition.name }} Nodes={{ partition.nodes }} Default={{ partition.default }} MaxTime={{ partition.max_time }} State={{ partition.state }} +{% endfor %} diff --git a/platform-projects/hpc-slurm-ai-cluster/templates/slurmdbd.conf.j2 b/platform-projects/hpc-slurm-ai-cluster/templates/slurmdbd.conf.j2 new file mode 100644 index 0000000..2211ff6 --- /dev/null +++ b/platform-projects/hpc-slurm-ai-cluster/templates/slurmdbd.conf.j2 @@ -0,0 +1,38 @@ +# Managed by Ansible +# Slurm database daemon configuration + +AuthType=auth/munge + +DbdHost={{ slurmdbd_host }} +DbdPort={{ slurmdbd_port }} + +SlurmUser={{ slurm_user }} + +DebugLevel=info +LogFile=/var/log/slurm/slurmdbd.log +PidFile=/run/slurmdbd.pid + +CommitDelay={{ slurmdbd_commit_delay | default(1) }} + +StorageType={{ slurmdbd_storage_type }} +StorageHost={{ slurmdbd_storage_host }} +StoragePort={{ slurmdbd_storage_port }} +StorageLoc={{ slurmdbd_storage_loc }} +StorageUser={{ slurmdbd_storage_user }} +StoragePass={{ slurmdbd_storage_pass }} + +# Retention / purge policy +PurgeEventAfter={{ slurmdbd_purge_event_after | default('12months') }} +PurgeJobAfter={{ slurmdbd_purge_job_after | default('12months') }} +PurgeResvAfter={{ slurmdbd_purge_resv_after | default('12months') }} +PurgeStepAfter={{ slurmdbd_purge_step_after | default('3months') }} +PurgeSuspendAfter={{ slurmdbd_purge_suspend_after | default('3months') }} +PurgeTXNAfter={{ slurmdbd_purge_txn_after | default('12months') }} +PurgeUsageAfter={{ slurmdbd_purge_usage_after | default('24months') }} + +ArchiveEvents={{ slurmdbd_archive_events | default('no') }} 
+ArchiveJobs={{ slurmdbd_archive_jobs | default('no') }} +ArchiveSteps={{ slurmdbd_archive_steps | default('no') }} +ArchiveSuspend={{ slurmdbd_archive_suspend | default('no') }} +ArchiveTXN={{ slurmdbd_archive_txn | default('no') }} +ArchiveUsage={{ slurmdbd_archive_usage | default('no') }}