---
# Playbook: provision the Slurm operator account on every node of the
# `slurm_cluster` inventory group, build a full SSH mesh between the nodes for
# that account, grant it a restricted sudo rule set, and finally validate both
# Slurm access and the SSH mesh.
#
# Inventory groups referenced: `slurm_cluster`, `slurm_controller`.
# Collections required in addition to ansible-core:
#   - community.crypto (openssh_keypair)
#   - ansible.posix    (authorized_key)

- name: Create slurmuser and generate SSH keys on every Slurm node
  hosts: slurm_cluster
  become: true
  gather_facts: true
  vars:
    slurm_operator_user: slurmuser
    slurm_operator_shell: /bin/bash
  tasks:
    - name: Ensure useful packages are installed
      ansible.builtin.apt:
        name:
          - sudo
          - openssh-client
          - openssh-server
          - acl
        state: present
        update_cache: true

    - name: Ensure slurmuser exists
      ansible.builtin.user:
        name: "{{ slurm_operator_user }}"
        shell: "{{ slurm_operator_shell }}"
        create_home: true
        state: present

    - name: Ensure .ssh directory exists for slurmuser
      ansible.builtin.file:
        path: "/home/{{ slurm_operator_user }}/.ssh"
        state: directory
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0700"

    # openssh_keypair ships in the community.crypto collection; the canonical
    # name is used so the collection dependency is explicit.
    # `force: false` keeps an existing key pair untouched on re-runs.
    - name: Generate SSH key for slurmuser if missing
      community.crypto.openssh_keypair:
        path: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519"
        type: ed25519
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0600"
        comment: "{{ slurm_operator_user }}@{{ inventory_hostname }}"
        force: false

    - name: Read public key from each node
      ansible.builtin.slurp:
        src: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519.pub"
      register: slurmuser_pubkey_raw

    # Stored as a host fact so the next play can read every node's key through
    # `hostvars`.
    - name: Store decoded public key as host fact
      ansible.builtin.set_fact:
        slurmuser_pubkey: "{{ slurmuser_pubkey_raw.content | b64decode | trim }}"

- name: Exchange slurmuser SSH keys across all Slurm nodes
  hosts: slurm_cluster
  become: true
  gather_facts: false
  vars:
    slurm_operator_user: slurmuser
  tasks:
    # authorized_key ships in the ansible.posix collection (canonical name).
    # Hosts without a `slurmuser_pubkey` fact (e.g. excluded with --limit or
    # failed in the previous play) are skipped instead of aborting the loop
    # with an undefined-variable error.
    - name: Install all slurmuser public keys into authorized_keys on every node
      ansible.posix.authorized_key:
        user: "{{ slurm_operator_user }}"
        key: "{{ hostvars[item].slurmuser_pubkey }}"
        state: present
        manage_dir: true
      loop: "{{ groups['slurm_cluster'] }}"
      when: hostvars[item].slurmuser_pubkey is defined

    # Each node is scanned under its inventory name and, when set, its
    # `ansible_host` address (falling back to the inventory name otherwise).
    # A name is scanned only when `ssh-keygen -F` does not already find it in
    # known_hosts: `ssh-keyscan -H` salts every hashed entry randomly, so
    # `sort -u` alone cannot remove duplicates and the file would otherwise
    # grow on every run.
    # Consequence: a rotated host key is NOT refreshed automatically; remove
    # the stale entry with `ssh-keygen -R <name>` and re-run the play.
    # Scan failures stay best-effort (`|| true`), as unreachable nodes should
    # not abort the whole task.
    - name: Build SSH known_hosts entries for all cluster nodes
      ansible.builtin.shell: |
        set -e
        ssh_dir="/home/{{ slurm_operator_user }}/.ssh"
        known_hosts="${ssh_dir}/known_hosts"
        mkdir -p "${ssh_dir}"
        touch "${known_hosts}"
        before="$(cksum < "${known_hosts}")"
        {% for host in groups['slurm_cluster'] %}
        {% for target in [host, hostvars[host].ansible_host | default(host)] | unique %}
        if ! ssh-keygen -F {{ target | quote }} -f "${known_hosts}" > /dev/null 2>&1; then
          ssh-keyscan -H {{ target | quote }} 2> /dev/null >> "${known_hosts}" || true
        fi
        {% endfor %}
        {% endfor %}
        sort -u "${known_hosts}" -o "${known_hosts}"
        chown {{ slurm_operator_user }}:{{ slurm_operator_user }} "${known_hosts}"
        chmod 0644 "${known_hosts}"
        after="$(cksum < "${known_hosts}")"
        if [ "${before}" != "${after}" ]; then
          echo KNOWN_HOSTS_CHANGED
        fi
      args:
        executable: /bin/bash
      register: known_hosts_build
      # Report a change only when the file content actually differs.
      changed_when: "'KNOWN_HOSTS_CHANGED' in known_hosts_build.stdout"

    - name: Ensure SSH permissions are correct
      ansible.builtin.file:
        path: "/home/{{ slurm_operator_user }}/.ssh"
        state: directory
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0700"

    - name: Ensure private key permissions are correct
      ansible.builtin.file:
        path: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519"
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0600"

    - name: Ensure public key permissions are correct
      ansible.builtin.file:
        path: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519.pub"
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0644"

- name: Configure sudo permissions for slurmuser
  hosts: slurm_cluster
  become: true
  gather_facts: false
  vars:
    slurm_operator_user: slurmuser
  tasks:
    # Controller nodes get the slurmctld and slurmd service commands plus the
    # Slurm client tools. The file is checked with `visudo -cf` before it is
    # installed, so a syntax error cannot break sudo on the node.
    # NOTE(review): passwordless root for srun/sbatch/salloc presumably lets
    # the operator launch arbitrary commands as root - confirm this is
    # intended.
    - name: Configure sudoers for slurmuser on Slurm controller
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-controller
        owner: root
        group: root
        mode: "0440"
        content: |
          # Managed by Ansible
          # Operator access for Slurm controller node.
          {{ slurm_operator_user }} ALL=(root) NOPASSWD: \
            /bin/systemctl status slurmctld, \
            /bin/systemctl restart slurmctld, \
            /bin/systemctl reload slurmctld, \
            /bin/systemctl stop slurmctld, \
            /bin/systemctl start slurmctld, \
            /bin/systemctl status slurmd, \
            /bin/systemctl restart slurmd, \
            /bin/systemctl reload slurmd, \
            /bin/systemctl stop slurmd, \
            /bin/systemctl start slurmd, \
            /bin/journalctl -u slurmctld, \
            /bin/journalctl -u slurmd, \
            /usr/bin/scontrol, \
            /usr/bin/sinfo, \
            /usr/bin/squeue, \
            /usr/bin/scancel, \
            /usr/bin/sacct, \
            /usr/bin/sacctmgr, \
            /usr/bin/sbatch, \
            /usr/bin/srun, \
            /usr/bin/salloc
        validate: "visudo -cf %s"
      when: inventory_hostname in groups['slurm_controller']

    # Every node outside `slurm_controller` gets the slurmd-only rule set
    # (no slurmctld commands, no sacctmgr).
    - name: Configure sudoers for slurmuser on Slurm compute and GPU nodes
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-compute
        owner: root
        group: root
        mode: "0440"
        content: |
          # Managed by Ansible
          # Operator access for Slurm worker/GPU nodes.
          {{ slurm_operator_user }} ALL=(root) NOPASSWD: \
            /bin/systemctl status slurmd, \
            /bin/systemctl restart slurmd, \
            /bin/systemctl reload slurmd, \
            /bin/systemctl stop slurmd, \
            /bin/systemctl start slurmd, \
            /bin/journalctl -u slurmd, \
            /usr/bin/scontrol, \
            /usr/bin/sinfo, \
            /usr/bin/squeue, \
            /usr/bin/scancel, \
            /usr/bin/sacct, \
            /usr/bin/sbatch, \
            /usr/bin/srun, \
            /usr/bin/salloc
        validate: "visudo -cf %s"
      when: inventory_hostname not in groups['slurm_controller']

- name: Validate slurmuser SSH mesh and Slurm access
  hosts: slurm_cluster
  become: true
  gather_facts: false
  vars:
    slurm_operator_user: slurmuser
  tasks:
    # `sudo -iu` runs sinfo from a login shell of the operator account.
    - name: Test local Slurm commands as slurmuser
      ansible.builtin.command: "sudo -iu {{ slurm_operator_user }} sinfo"
      register: sinfo_test
      changed_when: false
      failed_when: sinfo_test.rc != 0

    - name: Show sinfo result
      ansible.builtin.debug:
        var: sinfo_test.stdout_lines

    # BatchMode=yes makes ssh fail instead of prompting, so a missing key or
    # known_hosts entry surfaces as a task failure (`set -e` stops at the
    # first unreachable peer).
    - name: Test SSH from each node to every other node as slurmuser
      ansible.builtin.shell: |
        set -e
        {% for host in groups['slurm_cluster'] %}
        ssh -o BatchMode=yes -o ConnectTimeout=5 {{ host }} 'hostname'
        {% endfor %}
      args:
        executable: /bin/bash
      become_user: "{{ slurm_operator_user }}"
      register: ssh_mesh_test
      changed_when: false

    - name: Show SSH mesh test result
      ansible.builtin.debug:
        var: ssh_mesh_test.stdout_lines