Ansible

Inventory

Ansible's inventory file
inventory.ini
[all:vars]
ansible_connection=ssh
ansible_ssh_common_args='-o StrictHostKeyChecking=no'
ansible_user=guest

[cluster]
192.168.0.[103:116] ib_address=10.0.0.[3:16]
192.168.0.[119:124] ib_address=10.0.0.[19:24]
192.168.0.126 ib_address=10.0.0.26
192.168.0.130 ib_address=10.0.0.30
192.168.0.[132:135] ib_address=10.0.0.[32:35]
192.168.0.[137:143] ib_address=10.0.0.[37:43]
192.168.0.[145:149] ib_address=10.0.0.[45:49]
192.168.0.151 ib_address=10.0.0.51

[login_nodes]
192.168.0.102 ib_address=10.0.0.2

[controller_nodes]
192.168.0.104 ib_address=10.0.0.4
192.168.0.115 ib_address=10.0.0.15

[db_nodes]
192.168.0.105 ib_address=10.0.0.5

[compute_nodes]
192.168.0.103 ib_address=10.0.0.3
192.168.0.[106:114] ib_address=10.0.0.[6:14]
192.168.0.116 ib_address=10.0.0.16
192.168.0.[119:124] ib_address=10.0.0.[19:24]
192.168.0.126 ib_address=10.0.0.26
192.168.0.130 ib_address=10.0.0.30
192.168.0.[132:135] ib_address=10.0.0.[32:35]
192.168.0.[137:143] ib_address=10.0.0.[37:43]
192.168.0.[145:149] ib_address=10.0.0.[45:49]
192.168.0.151 ib_address=10.0.0.51

[nodes_without_gpu]
192.168.0.[103:109] ib_address=10.0.0.[3:9]
192.168.0.[114:116] ib_address=10.0.0.[14:16]
192.168.0.121 ib_address=10.0.0.21
192.168.0.130 ib_address=10.0.0.30
192.168.0.[132:135] ib_address=10.0.0.[32:35]
192.168.0.[137:143] ib_address=10.0.0.[37:43]
192.168.0.[145:149] ib_address=10.0.0.[45:49]
192.168.0.151 ib_address=10.0.0.51

[nodes_with_1_gpu]
192.168.0.110 ib_address=10.0.0.10
192.168.0.122 ib_address=10.0.0.22

[nodes_with_2_gpus]
192.168.0.[111:113] ib_address=10.0.0.[11:13]
192.168.0.120 ib_address=10.0.0.20
192.168.0.[123:124] ib_address=10.0.0.[23:24]
192.168.0.126 ib_address=10.0.0.26

[nodes_with_fpga]
192.168.0.119 ib_address=10.0.0.19

[admin_partition]
192.168.0.121 ib_address=10.0.0.21

[department_only_partition]
192.168.0.103 ib_address=10.0.0.3
192.168.0.[112:114] ib_address=10.0.0.[12:14]
192.168.0.116 ib_address=10.0.0.16
192.168.0.120 ib_address=10.0.0.20
192.168.0.[123:124] ib_address=10.0.0.[23:24]
192.168.0.126 ib_address=10.0.0.26
192.168.0.130 ib_address=10.0.0.30
192.168.0.[132:135] ib_address=10.0.0.[32:35]
192.168.0.[137:143] ib_address=10.0.0.[37:43]

[multicore_partition]
192.168.0.[106:107] ib_address=10.0.0.[6:7]

[students_partition]
192.168.0.[108:111] ib_address=10.0.0.[8:11]
192.168.0.122 ib_address=10.0.0.22
192.168.0.[145:149] ib_address=10.0.0.[45:49]
192.168.0.151 ib_address=10.0.0.51
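
Several of these groups mirror the Slurm partitions defined further below, and each host carries an ib_address host variable with its InfiniBand address. Any of the playbooks in the next sections can be run against this inventory with a standard invocation; the --limit value below is only an example:

ansible-playbook -i inventory.ini ansible/setup_env.yaml --limit compute_nodes

Inside a play, the per-host value is available as {{ ib_address }} (or through hostvars when another host's address is needed).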

Environment

Sets environment variables and installs essential packages on all nodes
ansible/setup_env.yaml
- name: Setup the environment
  hosts:
  - controller_nodes
  - login_nodes
  - db_nodes
  - compute_nodes
  become: true
  become_user: root
  tasks:
  - name: Update, upgrade, and clean apt packages
    shell:
      cmd: |
        apt clean -y
        apt update -y
        apt upgrade -y
        apt dist-upgrade -y
        apt autoremove -y
  - name: Install required packages
    apt:
      name:
        - zip
        - snapd
        - golang-go
      state: latest
      autoremove: true
  - name: Initialize nvidia-smi
    shell:
      cmd: nvidia-smi
    ignore_errors: true
  - name: Creates the /home/.shared folder
    file:
      path: /home/.shared
      state: directory
      mode: u=rwx,g=rx,o=rx


  # SINGULARITY
  # - name: Creates the /home/guest/.shared/singularity_cache folder
  #   file:
  #     path: /home/guest/.shared/singularity_cache
  #     state: directory
  #     mode: u=rwx,g=rwx,o=rwx
  #     recurse: true
  # - name: Sets the Singularity cache directory
  #   lineinfile:
  #     path: "/etc/environment"
  #     state: present
  #     regexp: "SINGULARITY_CACHEDIR="
  #     line: ""
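
The apt maintenance above goes through the shell module; a minimal, more idempotent sketch of the same step using the ansible.builtin.apt module (the options shown are standard module parameters) would be:

- name: Update, upgrade, and autoremove apt packages
  apt:
    update_cache: true
    upgrade: dist
    autoremove: true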
Install OpenMPI
ansible/install_openmpi.yaml
- name: Install OpenMPI on all nodes
  hosts:
  - controller_nodes
  - login_nodes
  - db_nodes
  - compute_nodes
  become: true
  become_user: root
  tasks:
  # - name: Install pre-requisites
  #   apt:
  #     name:
  #       - openmpi-bin
  #   ignore_errors: true

  - name: Checks if OpenMPI is already installed
    shell:
      cmd: ompi_info
    register: ompi_info
    ignore_errors: true
  - name: Download OpenMPI
    get_url:
      url: https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.7.tar.bz2
      dest: /tmp/openmpi-5.0.7.tar.bz2
      force: true
    when: ompi_info.rc != 0
  - name: Unpacks OpenMPI
    shell:
      cmd: tar xf openmpi-5.0.7.tar.bz2
      chdir: /tmp
    when: ompi_info.rc != 0
  - name: Installs OpenMPI
    shell:
      cmd: |
        ./configure --prefix=/usr/local --with-slurm
        make -j 16 all
        make install
      chdir: /tmp/openmpi-5.0.7
    when: ompi_info.rc != 0
  - name: Install post-requisites
    apt:
      name:
        - hcoll 
        - ucx
    ignore_errors: true
    when: ompi_info.rc != 0
  - name: Prints ompi_info to check whether everything is OK
    shell:
      cmd: ompi_info
    register: ompi_info_post
    failed_when: ompi_info_post.rc != 0
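
Once the play has finished, the build can be smoke-tested from any node, for example:

mpirun --version
srun --mpi=list

mpirun --version should report 5.0.7 if /usr/local/bin is resolved first on the PATH, and srun --mpi=list shows the MPI plugin types Slurm can launch with.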
Install Anaconda on all nodes
ansible/install_anaconda.yaml
- name: Install Anaconda on all nodes
  hosts:
  - controller_nodes
  - login_nodes
  - db_nodes
  - compute_nodes
  become: true
  become_user: root
  tasks:
  - name: Checks if Anaconda is already installed
    shell:
      cmd: export PATH=/etc/anaconda3/bin:$PATH && conda list
    register: conda_list
    ignore_errors: true

  - name: Download Anaconda
    get_url:
      url: https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh
      dest: /tmp/Anaconda3-2024.10-1-Linux-x86_64.sh
      force: true
    when: conda_list.rc != 0
  - name: Install Anaconda
    shell:
      cmd: bash /tmp/Anaconda3-2024.10-1-Linux-x86_64.sh -bu -p /etc/anaconda3
    when: conda_list.rc != 0
  - name: Clean-up the installer
    shell:
      cmd: rm -rf /tmp/Anaconda*
    when: conda_list.rc != 0
  - name: Enable read for all
    file:
      path: /etc/anaconda3
      mode: +r
      recurse: yes
    when: conda_list.rc != 0
  - name: Enable read and write for shared package cache for all
    file:
      path: /home/guest/conda_pkgs
      mode: u+rwx,g+rwx,o+rwx
      recurse: yes
      state: directory
    # when: conda_list.rc != 0
  - name: Enable execution for all
    file:
      path: /etc/anaconda3/bin
      mode: +x
      recurse: yes
    when: conda_list.rc != 0
  - name: Initialize conda
    shell:
      cmd: export PATH=/etc/anaconda3/bin:$PATH && conda init --all --system

  - name: Activate Anaconda for the next login
    lineinfile:
      path: /etc/profile
      line: "{{ item }}"
      state: present
    loop:
      - export PATH=/etc/anaconda3/bin:$PATH
      - export PIP_CACHE_DIR=/home/guest/conda_pkgs
      - printf "pkgs_dirs:\n- /home/guest/conda_pkgs\n" > ~/.condarc
  - name: Clean-up /etc/profile
    lineinfile:
      path: /etc/profile
      regexp: "{{ item }}"
      state: absent
    loop:
      - echo*
      - printf "export*
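
After the next login, the shared installation can be checked per user, for example:

conda --version
conda config --show pkgs_dirs

The second command should report /home/guest/conda_pkgs, matching the ~/.condarc generated by the profile line above.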
Install Visual Studio Code Tunnel on all nodes
ansible/install_code.yaml
- name: Install Visual Studio Code Tunnel on all nodes
  hosts:
  - controller_nodes
  - login_nodes
  - db_nodes
  - compute_nodes
  become: true
  become_user: root
  tasks:
  - name: Checks if Visual Studio Code is already installed
    shell:
      cmd: code --version
    register: code_version
    ignore_errors: true

  - name: Download the installer
    get_url:
      url: https://code.visualstudio.com/sha/download?build=stable&os=cli-alpine-x64
      dest: /tmp/vscode_cli.tar.gz
      force: true
    when: code_version.rc != 0
  - name: Extract the content
    unarchive:
      src: /tmp/vscode_cli.tar.gz
      dest: /bin
      remote_src: true
    when: code_version.rc != 0
  - name: Clean-up the installer
    file:
      path: /tmp/vscode_cli.tar.gz
      state: absent

  - name: Checks again if Visual Studio Code has been installed
    shell:
      cmd: code --version
    register: code_post
    failed_when: code_post.rc != 0
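
With the CLI in place, a tunnel can then be opened on any node and attached to from a local VS Code or vscode.dev session; the tunnel name below is only an example:

code tunnel --accept-server-license-terms --name node103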

Slurm

Slurm's configuration file
slurm.conf
# slurm.conf file generated by configurator easy.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ClusterName=di_hpc_salaria
SlurmctldHost=node115(10.0.0.15)
SlurmctldHost=node104(10.0.0.4)
SlurmctldParameters=enable_configless
#
#MailProg=/bin/mail
#MpiDefault=
#MpiParams=ports=#-#
ProctrackType=proctrack/cgroup
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
#SlurmdPort=6818
SlurmdSpoolDir=/var/slurmproc/slurmd
SlurmUser=slurm
#SlurmdUser=root
StateSaveLocation=/var/slurmproc/slurmctld
#SwitchType=
TaskPlugin=task/affinity,task/cgroup
AuthType=auth/munge
#
#
# TIMERS
#KillWait=30
#MinJobAge=300
#SlurmctldTimeout=120
#SlurmdTimeout=300
#
#
#
#
# LOGGING AND ACCOUNTING
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=10.0.0.5
AccountingStorageUser=slurm
AccountingStoragePort=6819
#JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/cgroup
#SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurmctld.log
#SlurmdDebug=info
SlurmdLogFile=/var/log/slurmd.log
#
# PRIORITIES and FAIRSHARE
PriorityType=priority/multifactor
PriorityWeightFairshare=100000       
PriorityWeightAge=1000             
PriorityWeightQOS=50000           
PriorityWeightJobSize=500             
PriorityWeightPartition=10000           
PriorityDecayHalfLife=7-0           
PriorityCalcPeriod=5:00           
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_Core

# COMPUTE NODES
GresTypes=gpu

NodeName=node103 NodeAddr=10.0.0.3 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566
NodeName=node106 NodeAddr=10.0.0.6 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566
NodeName=node107 NodeAddr=10.0.0.7 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566
NodeName=node108 NodeAddr=10.0.0.8 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566
NodeName=node109 NodeAddr=10.0.0.9 CPUs=32 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=1 RealMemory=257566
NodeName=node110 NodeAddr=10.0.0.10 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566 Gres=gpu:quadro_rtx_6000:1
NodeName=node111 NodeAddr=10.0.0.11 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566 Gres=gpu:quadro_rtx_6000:2
NodeName=node112 NodeAddr=10.0.0.12 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566 Gres=gpu:quadro_rtx_6000:2
NodeName=node113 NodeAddr=10.0.0.13 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566 Gres=gpu:quadro_rtx_6000:2
NodeName=node114 NodeAddr=10.0.0.14 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=1027676
NodeName=node116 NodeAddr=10.0.0.16 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566
NodeName=node118 NodeAddr=10.0.0.18 CPUs=32 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=1 RealMemory=257566
NodeName=node119 NodeAddr=10.0.0.19 CPUs=32 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=1 RealMemory=257566
NodeName=node120 NodeAddr=10.0.0.20 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566 Gres=gpu:quadro_rtx_6000:2
NodeName=node121 NodeAddr=10.0.0.21 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566
NodeName=node122 NodeAddr=10.0.0.22 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566 Gres=gpu:quadro_rtx_6000:1
NodeName=node123 NodeAddr=10.0.0.23 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566 Gres=gpu:quadro_rtx_6000:2
NodeName=node124 NodeAddr=10.0.0.24 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566 Gres=gpu:quadro_rtx_6000:2
NodeName=node125 NodeAddr=192.168.0.125 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566 Gres=gpu:quadro_rtx_6000:2
NodeName=node126 NodeAddr=10.0.0.26 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566 Gres=gpu:quadro_rtx_6000:2
NodeName=node130 NodeAddr=10.0.0.30 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566
NodeName=node132 NodeAddr=10.0.0.32 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566
NodeName=node133 NodeAddr=10.0.0.33 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566
NodeName=node134 NodeAddr=10.0.0.34 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566
NodeName=node135 NodeAddr=10.0.0.35 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566
NodeName=node137 NodeAddr=10.0.0.37 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566
NodeName=node138 NodeAddr=10.0.0.38 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566
NodeName=node139 NodeAddr=10.0.0.39 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566
NodeName=node140 NodeAddr=10.0.0.40 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566
NodeName=node141 NodeAddr=10.0.0.41 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=1027678
NodeName=node142 NodeAddr=10.0.0.42 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=1027678
NodeName=node143 NodeAddr=10.0.0.43 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=1027678
NodeName=node145 NodeAddr=10.0.0.45 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=1027678
NodeName=node146 NodeAddr=10.0.0.46 CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=257566
NodeName=node147 NodeAddr=10.0.0.47 CPUs=32 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=1 RealMemory=257566
NodeName=node148 NodeAddr=10.0.0.48 CPUs=32 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=1 RealMemory=257566
NodeName=node149 NodeAddr=10.0.0.49 CPUs=32 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=1 RealMemory=257566
NodeName=node151 NodeAddr=10.0.0.51 CPUs=32 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=1 RealMemory=257566

PartitionName=admin Nodes="" MaxTime=INFINITE State=UP AllowGroups=sudo DefMemPerNode=8192
PartitionName=department_only Nodes=ALL MaxTime=3-0 State=UP AllowGroups=sudo,department,group_leaders DefMemPerNode=8192
PartitionName=multicore Nodes=node[106-107] State=UP MaxTime=0-6 DefMemPerNode=1024 MaxNodes=1
PartitionName=fpga Nodes=node[119] State=UP MaxTime=1-0 MaxNodes=1 ExclusiveUser=YES OverSubscribe=EXCLUSIVE DefMemPerNode=257565
PartitionName=students Nodes=node[106-111,118,122,145-149,151] State=UP Default=TRUE MaxTime=1-0 DefMemPerNode=1024 MaxMemPerNode=32768 MaxCPUsPerNode=8 MaxNodes=1
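
When adding or changing node entries, the hardware part of each line can be cross-checked on the node itself and the resulting partition layout inspected from a controller, for example:

slurmd -C
sinfo -o "%P %l %D %N"

slurmd -C prints the NodeName/CPUs/Sockets/Memory line detected on the local machine, and the sinfo format string lists each partition with its time limit, node count, and node list.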
Propagate Slurm's configuration file from controller to all nodes
propagate_slurm_conf.yaml
- name: Propagate slurm.conf
  hosts:
  - controller_nodes
  - login_nodes
  - db_nodes
  - compute_nodes
  become: true
  become_user: root
  tasks:
  - name: Creates /etc/slurm if it does not exist
    file:
      path: /etc/slurm
      state: directory
  - name: Copy slurm.conf from central node to other nodes
    copy:
      src: /etc/slurm/slurm.conf
      dest: /etc/slurm/slurm.conf
  - name: Copy gres.conf from central node to other nodes
    copy:
      src: /etc/slurm/gres.conf
      dest: /etc/slurm/gres.conf
  - name: Ensure there is no oci.conf
    file:
      path: /etc/slurm/oci.conf
      state: absent
  - name: Ensure the 'slurm' and 'users' groups exist
    group:
      name: "{{ item }}"
      state: present
    loop:
    - slurm
    - users
  - name: Set slurm user uid correctly
    shell:
      cmd: usermod -u 1217 slurm
  # - name: Reconfigure Slurm for changes to take effect
  #   shell:
  #     cmd: scontrol reconfigure
  #   delegate_to: "{{ ansible_hostname }}"
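
The final reconfigure step is left commented out; after the copy has completed it can also be run manually from one of the controller nodes, and the cluster state checked afterwards:

scontrol reconfigure
sinfo -R

sinfo -R lists any nodes that remain down or drained together with the recorded reason.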
Reboot department_only partition nodes
reboot_department_only_nodes.yaml
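A minimal sketch of the reboot play; the department_only_partition group name comes from the inventory above, while serial and reboot_timeout are assumed values, not taken from the original file:

- name: Reboot department_only partition nodes
  hosts:
  - department_only_partition
  become: true
  become_user: root
  serial: 4                  # assumption: reboot a few nodes at a time
  tasks:
  - name: Reboot the node and wait for it to come back online
    reboot:
      reboot_timeout: 1800   # assumption: allow up to 30 minutes per node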