User Tools

Site Tools


cluster:213


Back

New Head Node

We're embarking on a transition to a new head/login node named cottontail2. This server will be running the Rocky 8 operating system. Early design ideas can be found at Cottontail2, all pre-pandemic. We are staying with a 1G ethernet network as we could not find 10G switches. Maybe in the near term we can upgrade.

Two new compute nodes (n100, n101) will be set up in a test queue. They each have four RTX5000 gpus which have the same architecture as our other gpus so all compiled software should work. These gpus have a 16G memory footprint (twice as large as other gpus we have).

OpenHPC will be deployed next and I'll make some notes. We will move to the Slurm scheduler. (Slurm Test Env for users and Slurm Test Env techie page). Any old hardware that can be reimaged with Rocky 8 will be migrated to Slurm using Warewulf. But that all will take some time.

Some pictures below.

Config Recipe

Steps. “Ala n37” … so the RTX nodes are similar to the K20 nodes and we can put the local software in place. See K20 Redo page and exx96 Recipe for CentOS 7

New recipe for n100-n101 sporting Rocky 8.5 on cottontail2
Put the node on the internet first, though…

  • Vanilla Backups using Warewulf and plain rsync (--exclude of proc/, sys/, run/)
# login as root and sanity-check the hardware first...
free -g          # total memory in GB
nvidia-smi # if gpus
cat /proc/cpuinfo

# check and set local time zone
mv /etc/localtime /etc/localtime.backup
ln -s /usr/share/zoneinfo/America/New_York /etc/localtime

# change passwords for root and vendor account
passwd
passwd microway
# set hostname (cottontail2 on the head node; n100/n101 on compute nodes)
hostnamectl set-hostname cottontail2

# root: sync cottontail's master and known_hosts (tails+stores)
ssh-keygen -t rsa
scp 10.10.102.253:/root/.ssh/authorized_keys /root/.ssh/
# NOTE: the next line is a reminder, not a command -- edit sshd_config,
# set PermitRootLogin as desired, then restart sshd
/etc/ssh/sshd_config (PermitRootLogin)

# Put the warewulf cluster key in authorized_keys
# Put eth0 fingerprints in cottontail/greentail52 known hosts
# add to relevant known_hosts_servername file

# configure private subnets and ping file server
cd /etc/sysconfig/network-scripts/
vi ifcfg-eth0 # 192.168.102.x
vi ifcfg-eth1 # 10.10.102.x   # 'uuidgen eth1' to get uuid
vi ifcfg-eth3 # 129.133.52.x
scp 192.168.102.112:/etc/rc.d/rc.local /etc/rc.d/  # check +x, edit ib0, start

# NOTE(review): Rocky 8 defaults to NetworkManager; 'systemctl restart network'
# only exists if the legacy network-scripts package is installed -- otherwise
# use 'systemctl restart NetworkManager' (or nmcli) -- confirm on this host
systemctl restart network
ping -c 3 192.168.102.42
ping -c 3 10.10.102.42

# make internet connection for yum

# iptables
dnf install -y iptables-services
vi /etc/sysconfig/iptables
# add 'local allow' ports  --dport 0:65535
systemctl start iptables # and enable (systemctl enable iptables)
iptables -L


# eth3 for ctt2 or eth1 for n100-101
dnf install bind-utils
dig google.com   # verify outbound DNS resolution works
iptables -L # check!



# other configs
vi /etc/selinux/config # disabled, do not mistype, kernel will not boot!
# relocate /home onto the larger local disk, keep /home and /share as symlinks
mv /home /usr/local/
cd /;ln -s /usr/local/home 
cd /; ln -s /home /share
# NOTE: reminder, not a command -- edit /etc/passwd and fix the $HOME
# entries for the exx and dockeruser accounts
vi /etc/passwd (exx, dockeruser $HOME)


#exx96
# create the scratch areas: world-writable with the sticky bit so
# users can only delete their own files
mkdir /sanscratch /home/localscratch
chmod ugo+rwx /sanscratch /home/localscratch
chmod o+t /sanscratch /home/localscratch 
# exx96
# link localscratch in 1.4T /home to /

cd /home 
ln -s /zfshomes/apps
ln -s /zfshomes/tmp
ln -s /zfshomes/csmith06
ls -l

# append the staged fstab entries, then create and mount everything
cat /sanscratch/tmp/fstab.tmp >> /etc/fstab; mkdir /astrostore; mount -a; df -h; cd /smithlab/;  ln -s /smithlab/home/opt/rhel08 opt; ls -l

# fstab file mounts -- create the mount points first
mkdir -p /zfshomes /home66 /home33 /mindstore /opt/ohpc/pub /opt/intel
mkdir -p /smithlab/home;cd /smithlab;ln -s /smithlab/home/opt/rhel08 opt; ls -l
# cottontail2 = greentail52
# n100-n101 = n79


# on head node /etc/chrony.conf (the file is chrony.conf; chronyc is the CLI)
allow 192.168.0.0/16
# compute nodes /etc/chrony.conf -- comment out the public pool and point
# at the head nodes instead (the next three lines are file content)
#pool 2.pool.ntp.org iburst
Server 192.168.102.250
Server 192.168.102.251
# check
systemctl restart chronyd
chronyc sources

# Rocky8
# https://docs.fedoraproject.org/en-US/epel/#Quickstart
# powertools provides many of the -devel packages installed below
dnf config-manager --set-enabled powertools -y
dnf install epel-release -y
dnf install netcdf netcdf-devel -y
dnf install yum-utils # yumdownloader -y
dnf install ddd grace gnuplot alpine -y # pico

yum groupinstall "Server" # server for compute nodes "Server with GUI"


# on head node install from epel repo
### yum install slurm-openlava
# error on conflicting libs, too bad!


# add packages and update
yum install epel-release -y
yum install flex bison -y 
yum install tcl tcl-devel dmtcp dmtcp-devel -y
yum install net-snmp net-snmp-libs net-tools net-snmp-utils -y
yum install freeglut-devel libXi-devel libXmu-devel -y
yum install blas blas-devel lapack lapack-devel boost boost-devel -y
yum install lm_sensors lm_sensors-libs -y
yum install zlib-devel bzip2-devel -y
yum install openmpi openmpi-devel perl-ExtUtils-MakeMaker -y
yum install cmake -y
yum install libjpeg libjpeg-devel libjpeg-turbo-devel -y

#easybuild needs the infiniband verbs libraries
yum install libibverbs libibverbs-devel

# amber20 cmake readline error fix needs these locally staged rpms
yum install ncurses-c++-libs-6.1-9.20180224.el8.x86_64.rpm \
            ncurses-devel-6.1-9.20180224.el8.x86_64.rpm \
            readline-devel-7.0-10.el8.x86_64.rpm

# amber20 build prerequisites
yum -y install tcsh make \
               gcc gcc-gfortran gcc-c++ \
               which flex bison patch bc \
               libXt-devel libXext-devel \
               perl perl-ExtUtils-MakeMaker util-linux wget \
               bzip2 bzip2-devel zlib-devel tar 

# CENTOS7 pick the kernel vendor used for now
grep ^menuentry /etc/grub2.cfg
grub2-set-default 1
# detect boot mode so the right grub.cfg gets regenerated below
ls -d /sys/firmware/efi && echo "EFI" || echo "Legacy"
#grub2-mkconfig -o /boot/grub2/grub.cfg          # legacy
#grub2-mkconfig -o /boot/efi/EFI/centos/grub.cfg # efi

# compute nodes old level 3 (text console, no graphical target)
systemctl set-default multi-user.target


# postfix
dnf install postfix
dnf install mailx
systemctl enable postfix
# route all outbound mail through the relay host
echo "relayhost = 192.168.102.251" >> /etc/postfix/main.cf


# edit /etc/snmp/snmpd.conf, enable and start -- the next two
# lines are snmpd.conf content, not commands
rocommunity public
dontLogTCPWrappersConnects yes
# enable, start, add to zenoss 


# compute nodes only
# leave old cuda versions behind (9.2 | 10.2)
cd /usr/local/   # was 'cd usr/local/' -- missing leading slash, relative path fails
# scp from n79:/usr/local/ the locally built packages (listing, not a command):
#   amber16/  amber20/  fsl-5.0.10/  gromacs-2018/  lammps-22Aug18/

# compute nodes only /usr/local/bin/
# copy scripts: gpu-free, gpu-info, gpu-process
# copy 10.10.102.89:/usr/local/bin/n37.openmpi.wrapper /usr/local/bin/
# done

# FINISH native vanilla installs
# R version 4.1.2 (2021-11-01) -- "Bird Hippie"
yum install R R-devel
# openjdk version "1.8.0_322"
rpm -qa | grep ^java  # check
yum install java-1.8.0-openjdk java-1.8.0-openjdk-devel \
java-1.8.0-openjdk-headless javapackages-filesystem
# python v 3.9
yum install python39 python39-devel
# NOTE(review): on Rocky 8 'alternatives --set python /usr/bin/python3.9' is
# the supported way; the raw symlink below works but rpm will not track it
ln -s /usr/bin/python3.9 /usr/bin/python
# fftw 3.3.5-11.el8
yum install fftw fftw-devel
#gnu scientific libraries
yum install gsl gsl-devel
# ruby 2.5.9-109.module+el8.5.0
yum install ruby ruby-devel
# obabel chem file formats
yum install openbabel openbabel-devel




yum clean all
# eth3 onboot=no, private networks only
systemctl disable iptables


# now make it an ohpc compute node
# DO THIS on compute nodes BEFORE mounting ctt2:/opt
# pulls in a newer version, a potential problem later on
  yum repolist
  rpm -ivh ohpc-release-2-1.el8.x86_64.rpm 
  yum install singularity-ohpc
  yum  install ohpc-base-compute --nobest
    yum  install ohpc-slurm-client
  
  scp cottontail2:/etc/resolv.conf /etc/resolv.conf

  
  # check status of service munge
  rpm -ivh /sanscratch/tmp/rpms/munge-devel-0.5.13-2.el8.x86_64.rpm
  systemctl enable munge
  systemctl start munge
  # munge key must be identical cluster-wide for slurm authentication
  scp cottontail2:/etc/munge/munge.key /etc/munge/munge.key
  # configless slurmd: fetch slurm.conf from the slurmctld at this address
  echo SLURMD_OPTIONS="--conf-server 192.168.102.250" > /etc/sysconfig/slurmd
  yum  install --allowerasing lmod-ohpc
  # find which /var paths slurm.conf expects, then create them
  grep '/var' /etc/slurm/slurm.conf
  mkdir /var/log/slurm 
  chown slurm:munge /var/log/slurm 
  mkdir /var/spool/slurm 
  chown slurm:munge /var/spool/slurm 
  scp cottontail2:/etc/slurm/slurm.conf /etc/slurm/slurm.conf
  scp cottontail2:/etc/slurm/gres.conf /etc/slurm/gres.conf
  scp cottontail2:/etc/profile.d/lmod.sh /etc/profile.d/
  
# /etc/bashrc add
# ohpc lmod gcc mpicc
export PATH=/usr/local/slurm/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/slurm/lib:$LD_LIBRARY_PATH

  
# /var/[log|spool|run] need to be removed from
# the warewulf chroot template below (path reminder, not a command):
/usr/libexec/warewulf/wwmkchroot/gold-template

#test slurmd in the foreground first and watch for errors
  /usr/sbin/slurmd -D 
  
# start via rc.local (already copied)
#chmod +x /etc/rc.d/rc.local
#timing issue with munge
#sleep 15
#/usr/sbin/slurmd

# firewalld off (iptables-services manages the firewall here)
systemctl stop firewalld
systemctl disable firewalld

systemctl  disable dnf-makecache.timer
systemctl stop dnf-makecache.timer

# silence cockpit's login banner and motd advertisements
 mv /etc/issue.d/cockpit.issue /root/etc_issue.d_cockpit.issue
 mv /etc/motd.d/cockpit /root/etc_motd.d_cockpit


## edit passwd, shadow, group, hosts files ##
## make -orig backups and stage in /home/tmp/global
## cottontail2 = greentail52 sections
chown -R munge:munge /etc/munge /var/log/munge /var/lib/munge /var/run/munge
chown -R slurm:munge /var/log/slurm /var/spool/slurm


  
# slurmd ??? -- ldd output below: slurmd links hwloc out of the ohpc tree,
# so /opt/ohpc must be present before slurmd can start
	libhwloc.so.15 => /opt/ohpc/pub/libs/hwloc/lib/libhwloc.so.15 (0x00007fd6e5684000)

# crontab entries to add (crontab -e)

# ionice gaussian
0,15,30,45 * * * * /share/apps/scripts/ionice_lexes.sh  > /dev/null 2>&1

# cpu temps
40 * * * * /share/apps/scripts/lm_sensors.sh > /dev/null 2>&1

# reminder: the next two lines are limits.conf content, not commands
on compute node /etc/security/limits.conf
*                -       memlock         270039400


# file date_ctt2.sh

# ctt /etc/pdsh

# ctt:/root/scripts

# ctt2:/usr/local/bin/rslurm2022.sh

Pics

My data center robot thingie and node n100's gpus





Amber20

OpenHPC

# First install **all the necessary packages** (yum install...)

# shell history transcript -- the leading numbers are history line
# numbers, not part of the commands
 988  tar xvfj ../AmberTools21.tar.bz2 
  989  tar xvfj ../Amber20.tar.bz2 
  993  cd amber20_src/
  994  cd build/
  996  vi run_cmake

#  Assume this is Linux:

# serial, do on head node, with miniconda true, compile, install
  cmake $AMBER_PREFIX/amber20_src \
    -DCMAKE_INSTALL_PREFIX=/share/apps/CENTOS8/ohpc/software/amber/20 \
    -DCOMPILER=GNU  \
    -DMPI=FALSE -DCUDA=FALSE -DINSTALL_TESTS=TRUE \
    -DDOWNLOAD_MINICONDA=TRUE -DMINICONDA_USE_PY3=TRUE \
    2>&1 | tee  cmake.log

# Env

[hmeij@n100 ~]$ module load cuda/11.6

[hmeij@n100 ~]$ echo $CUDA_HOME
/usr/local/cuda

[hmeij@n100 ~]$ which nvcc mpicc gcc
/usr/local/cuda/bin/nvcc
/opt/ohpc/pub/mpi/openmpi4-gnu9/4.1.1/bin/mpicc
/opt/ohpc/pub/compiler/gcc/9.4.0/bin/gcc

# [FIXED] cmake error on conda install, set to FALSE
# OS native python, install on n[100-101]
-- Python version 3.9 -- OK
-- Found PythonLibs: /usr/lib64/libpython3.9.so (found version "3.9.6") 
-- Checking for Python package numpy -- not found
-- Checking for Python package scipy -- not found
-- Checking for Python package matplotlib -- not found
-- Checking for Python package setuptools -- found
[END FIXED]

# mpi & cuda FALSE builds serial
./run_cmake
make install
# lots and lots of warnings

# then, load the amber environment just built
source /share/apps/CENTOS8/ohpc/software/amber/20/amber.sh

# on n100 now, parallel, set miniconda flags to FALSE
# (the flag line below is an edit to run_cmake, not a command)
-MPI=TRUE
./run_cmake
make install

# on n100 just change cuda flag (again an edit to run_cmake)
-CUDA=TRUE
./run_cmake
make install

#tests
cd $AMBERHOME
make test.serial
export DO_PARALLEL="mpirun -np 6"
make test.parallel
export CUDA_VISIBLE_DEVICES=0
make test.cuda.serial
make test.cuda.parallel

Amber22

OpenHPC

# First install **all the necessary packages** (yum install...)

# shell history transcript; leading numbers stripped so lines are runnable
tar xvfj ../AmberTools22.tar.bz2
tar xvfj ../Amber22.tar.bz2
cd amber22_src/   # was 'amber20_src' -- Amber22 unpacks into amber22_src
cd build/
vi run_cmake

#  Assume this is Linux:

# serial, do on head node, with miniconda true, compile, install
  cmake $AMBER_PREFIX/amber22_src \
    -DCMAKE_INSTALL_PREFIX=/share/apps/CENTOS8/ohpc/software/amber/22 \
    -DCOMPILER=GNU  \
    -DMPI=FALSE -DCUDA=FALSE -DINSTALL_TESTS=TRUE \
    -DDOWNLOAD_MINICONDA=TRUE \
    2>&1 | tee  cmake.log
./run_cmake
make install


# Note !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
The OpenMPI and MPICH system installations provided by CentOS 
(i.e., through yum install) 
are known to be somehow incompatible with Amber22.
# OUCH !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


# GO TO node n100

# copy head node's amber22_src/ to n100:/usr/local/src/tmp/


source /share/apps/CENTOS8/ohpc/software/amber/22/amber.sh
echo $AMBERHOME

# install latest openmpi version (the CentOS-packaged MPI is incompatible
# with Amber22, see the note above)
cd amber22_src/AmberTools/src   # was 'amber_src/Ambertools/src' -- wrong directory name
tar xvfj ../../../../openmpi-4.1.4.tar.bz2 

./configure_openmpi gnu # openhpc gcc/gfortran 


# on n100 now, parallel: edit run_cmake and set these flags
-MPI=TRUE
-DDOWNLOAD_MINICONDA=FALSE
./run_cmake
make install

# on n100 just change cuda flag
# (session transcript -- verify toolchain paths before building)
[hmeij@n100 build]$ module load cuda/11.6
[hmeij@n100 build]$ which gcc mpicc nvcc
/opt/ohpc/pub/compiler/gcc/9.4.0/bin/gcc
/share/apps/CENTOS8/ohpc/software/amber/22/bin/mpicc
/usr/local/cuda/bin/nvcc
[hmeij@n100 ~]$ echo $CUDA_HOME
/usr/local/cuda

# edit run_cmake flags for the mpi+cuda build:
-MPI=TRUE
-CUDA=TRUE
-DDOWNLOAD_MINICONDA=FALSE
./run_cmake
make install


[hmeij@n100 ~]$ which nvcc mpicc gcc
/usr/local/cuda/bin/nvcc
/opt/ohpc/pub/mpi/openmpi4-gnu9/4.1.1/bin/mpicc
/opt/ohpc/pub/compiler/gcc/9.4.0/bin/gcc

#tests
cd $AMBERHOME
make test.serial
export DO_PARALLEL="mpirun -np 6"
make test.parallel
export CUDA_VISIBLE_DEVICES=0
make test.cuda.serial
make test.cuda.parallel

Back

cluster/213.txt · Last modified: 2024/01/12 15:09 by hmeij07