Add Slurm AI/HPC cluster platform project
lint / shell-yaml-ansible (push) Failing after 47s

This commit is contained in:
Mateusz Suski
2026-06-04 19:41:05 +00:00
parent e2624a7533
commit d300d490f5
49 changed files with 4777 additions and 0 deletions
@@ -0,0 +1,112 @@
---
# Installs a sudoers drop-in that lets the Slurm operator account
# (`slurm_operator_user`) run a fixed list of commands as root without a
# password: systemctl actions on the Slurm daemons, journalctl for their units,
# and the Slurm client tools.
#
# Hosts in the `slurm_controller` inventory group receive the controller
# drop-in (slurmctld + slurmd); every other host in `slurm_cluster` receives
# the compute drop-in (slurmd only, no sacctmgr).
#
# Design notes:
#   * Cmnd_Alias names are unique per drop-in. sudo reads every file in
#     /etc/sudoers.d together, while `visudo -cf %s` only checks the new file
#     in isolation, so a name shared by both files would not be caught by the
#     validation step if both files ever end up on the same host (for example
#     after a host changes role).
#   * systemctl and journalctl are tagged NOEXEC because they may start a pager
#     when run on a terminal, and a pager running as root can be used to get a
#     root shell. EXEC is switched back on for the Slurm client tools.
#   * visudo is called by absolute path so validation does not depend on the
#     PATH of the become session.
#
# NOTE(review): the SLURM_COMMANDS_* aliases allow srun/sbatch/salloc/scontrol
# with arbitrary arguments as root, which presumably amounts to running
# arbitrary commands as root on the cluster -- confirm this is intended or
# drop the job-launching commands from the sudo rule.
# NOTE(review): the trailing `*` on the `status` and `journalctl -u` entries
# accepts any extra options (for example other journalctl flags) -- confirm
# that this breadth is required.
- name: Fix sudo permissions for slurmuser Slurm operations
  hosts: slurm_cluster
  become: true
  gather_facts: false
  vars:
    # Account that receives the sudo rules below.
    slurm_operator_user: slurmuser
  tasks:
    - name: Configure sudoers for slurmuser on controller
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-controller
        owner: root
        group: root
        mode: "0440"
        content: |
          # Managed by Ansible
          Cmnd_Alias SLURM_SYSTEMCTL_CONTROLLER = \
              /bin/systemctl status slurmctld, \
              /bin/systemctl status slurmctld *, \
              /bin/systemctl restart slurmctld, \
              /bin/systemctl reload slurmctld, \
              /bin/systemctl start slurmctld, \
              /bin/systemctl stop slurmctld, \
              /bin/systemctl status slurmd, \
              /bin/systemctl status slurmd *, \
              /bin/systemctl restart slurmd, \
              /bin/systemctl reload slurmd, \
              /bin/systemctl start slurmd, \
              /bin/systemctl stop slurmd, \
              /usr/bin/systemctl status slurmctld, \
              /usr/bin/systemctl status slurmctld *, \
              /usr/bin/systemctl restart slurmctld, \
              /usr/bin/systemctl reload slurmctld, \
              /usr/bin/systemctl start slurmctld, \
              /usr/bin/systemctl stop slurmctld, \
              /usr/bin/systemctl status slurmd, \
              /usr/bin/systemctl status slurmd *, \
              /usr/bin/systemctl restart slurmd, \
              /usr/bin/systemctl reload slurmd, \
              /usr/bin/systemctl start slurmd, \
              /usr/bin/systemctl stop slurmd
          Cmnd_Alias SLURM_JOURNAL_CONTROLLER = \
              /bin/journalctl -u slurmctld, \
              /bin/journalctl -u slurmctld *, \
              /bin/journalctl -u slurmd, \
              /bin/journalctl -u slurmd *, \
              /usr/bin/journalctl -u slurmctld, \
              /usr/bin/journalctl -u slurmctld *, \
              /usr/bin/journalctl -u slurmd, \
              /usr/bin/journalctl -u slurmd *
          Cmnd_Alias SLURM_COMMANDS_CONTROLLER = \
              /usr/bin/scontrol, /usr/bin/scontrol *, \
              /usr/bin/sinfo, /usr/bin/sinfo *, \
              /usr/bin/squeue, /usr/bin/squeue *, \
              /usr/bin/scancel, /usr/bin/scancel *, \
              /usr/bin/sacct, /usr/bin/sacct *, \
              /usr/bin/sacctmgr, /usr/bin/sacctmgr *, \
              /usr/bin/sbatch, /usr/bin/sbatch *, \
              /usr/bin/srun, /usr/bin/srun *, \
              /usr/bin/salloc, /usr/bin/salloc *
          {{ slurm_operator_user }} ALL=(root) NOPASSWD: \
              NOEXEC: SLURM_SYSTEMCTL_CONTROLLER, SLURM_JOURNAL_CONTROLLER, \
              EXEC: SLURM_COMMANDS_CONTROLLER
        # Reject the file before it is installed if sudo cannot parse it.
        validate: "/usr/sbin/visudo -cf %s"
      when: inventory_hostname in groups['slurm_controller']

    - name: Configure sudoers for slurmuser on compute and GPU nodes
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-compute
        owner: root
        group: root
        mode: "0440"
        content: |
          # Managed by Ansible
          Cmnd_Alias SLURM_SYSTEMCTL_COMPUTE = \
              /bin/systemctl status slurmd, \
              /bin/systemctl status slurmd *, \
              /bin/systemctl restart slurmd, \
              /bin/systemctl reload slurmd, \
              /bin/systemctl start slurmd, \
              /bin/systemctl stop slurmd, \
              /usr/bin/systemctl status slurmd, \
              /usr/bin/systemctl status slurmd *, \
              /usr/bin/systemctl restart slurmd, \
              /usr/bin/systemctl reload slurmd, \
              /usr/bin/systemctl start slurmd, \
              /usr/bin/systemctl stop slurmd
          Cmnd_Alias SLURM_JOURNAL_COMPUTE = \
              /bin/journalctl -u slurmd, \
              /bin/journalctl -u slurmd *, \
              /usr/bin/journalctl -u slurmd, \
              /usr/bin/journalctl -u slurmd *
          Cmnd_Alias SLURM_COMMANDS_COMPUTE = \
              /usr/bin/scontrol, /usr/bin/scontrol *, \
              /usr/bin/sinfo, /usr/bin/sinfo *, \
              /usr/bin/squeue, /usr/bin/squeue *, \
              /usr/bin/scancel, /usr/bin/scancel *, \
              /usr/bin/sacct, /usr/bin/sacct *, \
              /usr/bin/sbatch, /usr/bin/sbatch *, \
              /usr/bin/srun, /usr/bin/srun *, \
              /usr/bin/salloc, /usr/bin/salloc *
          {{ slurm_operator_user }} ALL=(root) NOPASSWD: \
              NOEXEC: SLURM_SYSTEMCTL_COMPUTE, SLURM_JOURNAL_COMPUTE, \
              EXEC: SLURM_COMMANDS_COMPUTE
        # Reject the file before it is installed if sudo cannot parse it.
        validate: "/usr/sbin/visudo -cf %s"
      when: inventory_hostname not in groups['slurm_controller']