# File: portfolio/platform-projects/hpc-slurm-ai-cluster/playbooks/bootstrap/slurmuser-ssh-mesh.yml
# Listing metadata: 2026-06-05 15:38:56 +00:00, 219 lines, 7.0 KiB, YAML

---
# Provisions the operator account on every host in `slurm_cluster`, gives it
# an ed25519 key pair, and stores each node's public key in the host fact
# `slurmuser_pubkey` so that other hosts can read it through `hostvars`.
- name: Create slurmuser and generate SSH keys on every Slurm node
  hosts: slurm_cluster
  become: true
  gather_facts: true
  vars:
    slurm_operator_user: slurmuser
    slurm_operator_shell: /bin/bash
  tasks:
    # NOTE(review): `acl` is presumably installed so Ansible can become an
    # unprivileged user (setfacl on temp files) - confirm.
    - name: Ensure useful packages are installed
      ansible.builtin.apt:
        name:
          - sudo
          - openssh-client
          - openssh-server
          - acl
        state: present
        update_cache: true

    - name: Ensure slurmuser exists
      ansible.builtin.user:
        name: "{{ slurm_operator_user }}"
        shell: "{{ slurm_operator_shell }}"
        create_home: true
        state: present

    - name: Ensure .ssh directory exists for slurmuser
      ansible.builtin.file:
        path: "/home/{{ slurm_operator_user }}/.ssh"
        state: directory
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0700"

    # openssh_keypair ships in the community.crypto collection, not in
    # ansible-core; use its canonical FQCN instead of relying on the legacy
    # `ansible.builtin.*` redirect. `force: false` keeps an existing key.
    - name: Generate SSH key for slurmuser if missing
      community.crypto.openssh_keypair:
        path: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519"
        type: ed25519
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0600"
        comment: "{{ slurm_operator_user }}@{{ inventory_hostname }}"
        force: false

    # slurp returns the file base64-encoded in `.content`.
    - name: Read public key from each node
      ansible.builtin.slurp:
        src: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519.pub"
      register: slurmuser_pubkey_raw

    # Decode and strip surrounding whitespace so the fact holds one clean
    # key line.
    - name: Store decoded public key as host fact
      ansible.builtin.set_fact:
        slurmuser_pubkey: "{{ slurmuser_pubkey_raw.content | b64decode | trim }}"
# Builds the SSH trust mesh for the operator account: every node authorizes
# the public key of every node in `slurm_cluster` and records their host keys
# in the operator's known_hosts. Expects the host fact `slurmuser_pubkey` to
# be set on every host in the group before this play runs.
- name: Exchange slurmuser SSH keys across all Slurm nodes
  hosts: slurm_cluster
  become: true
  gather_facts: false
  vars:
    slurm_operator_user: slurmuser
  tasks:
    # authorized_key lives in the ansible.posix collection; use the canonical
    # FQCN rather than the legacy `ansible.builtin.*` redirect.
    - name: Install all slurmuser public keys into authorized_keys on every node
      ansible.posix.authorized_key:
        user: "{{ slurm_operator_user }}"
        key: "{{ hostvars[item].slurmuser_pubkey }}"
        state: present
        manage_dir: true
      loop: "{{ groups['slurm_cluster'] }}"

    # Host names are written unhashed on purpose: `ssh-keyscan -H` salts every
    # line randomly, so `sort -u` could never deduplicate and the file grew on
    # each run. Entries hashed by earlier runs are left untouched.
    # `ansible_host` falls back to the inventory name when it is not set.
    # Scan failures stay best-effort (`|| true`), as before.
    # A checksum taken before and after drives `changed_when`, so the task
    # reports "changed" only when the file content really differs.
    - name: Build SSH known_hosts entries for all cluster nodes
      ansible.builtin.shell: |
        set -e
        known_hosts=/home/{{ slurm_operator_user }}/.ssh/known_hosts
        mkdir -p /home/{{ slurm_operator_user }}/.ssh
        touch "$known_hosts"
        checksum_before="$(cksum < "$known_hosts")"
        {% for host in groups['slurm_cluster'] %}
        ssh-keyscan {{ host }} {{ hostvars[host].ansible_host | default(host) }} 2>/dev/null >> "$known_hosts" || true
        {% endfor %}
        sort -u "$known_hosts" -o "$known_hosts"
        chown {{ slurm_operator_user }}:{{ slurm_operator_user }} "$known_hosts"
        chmod 0644 "$known_hosts"
        checksum_after="$(cksum < "$known_hosts")"
        if [ "$checksum_before" != "$checksum_after" ]; then
          echo KNOWN_HOSTS_CHANGED
        fi
      args:
        executable: /bin/bash
      register: known_hosts_scan
      changed_when: "'KNOWN_HOSTS_CHANGED' in known_hosts_scan.stdout"

    # Re-assert ownership and modes after the shell task above touched ~/.ssh.
    - name: Ensure SSH permissions are correct
      ansible.builtin.file:
        path: "/home/{{ slurm_operator_user }}/.ssh"
        state: directory
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0700"

    - name: Ensure private key permissions are correct
      ansible.builtin.file:
        path: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519"
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0600"

    - name: Ensure public key permissions are correct
      ansible.builtin.file:
        path: "/home/{{ slurm_operator_user }}/.ssh/id_ed25519.pub"
        owner: "{{ slurm_operator_user }}"
        group: "{{ slurm_operator_user }}"
        mode: "0644"
# Installs one sudoers drop-in per node, chosen by membership in the
# `slurm_controller` inventory group. Each file is checked with `visudo`
# before it is put in place.
- name: Configure sudo permissions for slurmuser
  hosts: slurm_cluster
  gather_facts: false
  become: true
  vars:
    slurm_operator_user: slurmuser
  tasks:
    # Controller nodes: slurmctld and slurmd service control, their journals,
    # and the Slurm client commands listed below.
    - name: Configure sudoers for slurmuser on Slurm controller
      when: inventory_hostname in groups['slurm_controller']
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-controller
        content: |
          # Managed by Ansible
          # Operator access for Slurm controller node.
          {{ slurm_operator_user }} ALL=(root) NOPASSWD: \
          /bin/systemctl status slurmctld, \
          /bin/systemctl restart slurmctld, \
          /bin/systemctl reload slurmctld, \
          /bin/systemctl stop slurmctld, \
          /bin/systemctl start slurmctld, \
          /bin/systemctl status slurmd, \
          /bin/systemctl restart slurmd, \
          /bin/systemctl reload slurmd, \
          /bin/systemctl stop slurmd, \
          /bin/systemctl start slurmd, \
          /bin/journalctl -u slurmctld, \
          /bin/journalctl -u slurmd, \
          /usr/bin/scontrol, \
          /usr/bin/sinfo, \
          /usr/bin/squeue, \
          /usr/bin/scancel, \
          /usr/bin/sacct, \
          /usr/bin/sacctmgr, \
          /usr/bin/sbatch, \
          /usr/bin/srun, \
          /usr/bin/salloc
        owner: root
        group: root
        mode: '0440'
        validate: 'visudo -cf %s'

    # Every other node: slurmd service control, its journal, and the Slurm
    # client commands listed below (no slurmctld entries, no sacctmgr).
    - name: Configure sudoers for slurmuser on Slurm compute and GPU nodes
      when: inventory_hostname not in groups['slurm_controller']
      ansible.builtin.copy:
        dest: /etc/sudoers.d/91-slurmuser-slurm-compute
        content: |
          # Managed by Ansible
          # Operator access for Slurm worker/GPU nodes.
          {{ slurm_operator_user }} ALL=(root) NOPASSWD: \
          /bin/systemctl status slurmd, \
          /bin/systemctl restart slurmd, \
          /bin/systemctl reload slurmd, \
          /bin/systemctl stop slurmd, \
          /bin/systemctl start slurmd, \
          /bin/journalctl -u slurmd, \
          /usr/bin/scontrol, \
          /usr/bin/sinfo, \
          /usr/bin/squeue, \
          /usr/bin/scancel, \
          /usr/bin/sacct, \
          /usr/bin/sbatch, \
          /usr/bin/srun, \
          /usr/bin/salloc
        owner: root
        group: root
        mode: '0440'
        validate: 'visudo -cf %s'
# Smoke test run on every host in `slurm_cluster`: `sinfo` must succeed as the
# operator account, and the operator must reach every cluster node over SSH
# without prompting. Neither check modifies the hosts.
- name: Validate slurmuser SSH mesh and Slurm access
  hosts: slurm_cluster
  gather_facts: false
  become: true
  vars:
    slurm_operator_user: slurmuser
  tasks:
    # `sudo -iu` runs sinfo from a login shell of the operator account.
    - name: Test local Slurm commands as slurmuser
      ansible.builtin.command:
        cmd: "sudo -iu {{ slurm_operator_user }} sinfo"
      register: sinfo_test
      failed_when: sinfo_test.rc != 0
      changed_when: false

    - name: Show sinfo result
      ansible.builtin.debug:
        var: sinfo_test.stdout_lines

    # BatchMode=yes makes ssh fail instead of prompting; `set -e` aborts the
    # script on the first node that cannot be reached.
    - name: Test SSH from each node to every other node as slurmuser
      become_user: "{{ slurm_operator_user }}"
      ansible.builtin.shell:
        cmd: |
          set -e
          {% for host in groups['slurm_cluster'] %}
          ssh -o BatchMode=yes -o ConnectTimeout=5 {{ host }} 'hostname'
          {% endfor %}
        executable: /bin/bash
      register: ssh_mesh_test
      changed_when: false

    - name: Show SSH mesh test result
      ansible.builtin.debug:
        var: ssh_mesh_test.stdout_lines