This is an old revision of the document!
We're embarking on a transition to a new head/login node named cottontail2. This server will be running the Rocky 8 operating system. Early design ideas can be found on the Cottontail2 page, all pre-pandemic. We are staying with a 1G ethernet network as we could not find 10G switches. Maybe in the near term we can upgrade.
Two new compute nodes (n100, n101) will be set up in a test queue. They each have four RTX5000 GPUs, which have the same architecture as our other GPUs, so all compiled software should work. These GPUs have a 16G memory footprint (twice as large as the other GPUs we have).
OpenHPC will be deployed next and I'll make some notes. We will move to the Slurm scheduler (see the Slurm Test Env page for users and the Slurm Test Env techie page). Any old hardware that can be reimaged with Rocky 8 will be migrated to Slurm using Warewulf. But that all will take some time.
Some pictures below.
Steps, “a la n37” … the RTX nodes are similar to the K20 nodes, so we can put the local software in place the same way. See the K20 Redo page and the exx96 Recipe for CentOS 7 page.
New recipe for n100-n101 sporting Rocky 8.5 on cottontail2
Put node on internet…first though
# login as root check some things out... free -g nvidia-smi # if gpus cat /proc/cpuinfo # check and set local time zone mv /etc/localtime /etc/localtime.backup ln -s /usr/share/zoneinfo/America/New_York /etc/localtime # change passwords for root and vendor account passwd passwd microway # set hostname hostnamectl set-hostname cottontail2 # root: sync cottontail's master and known_hosts (tails+stores) ssh-keygen -t rsa scp 10.10.102.253:/root/.ssh/authorized_keys /root/.ssh/ /etc/ssh/sshd_config (PermitRootLogin) # Put the warewulf cluster key in authorized_keys # Put eth0 fingerprints in cottontail/greentail52 known hosts # add to relevant known_hosts_servername file # configure private subnets and ping file server cd /etc/sysconfig/network-scripts/ vi ifcfg-eth0 # 192.168.102.x vi ifcfg-eth1 # 10.10.102.x vi ifcfg-eth3 # 129.133.52.x systemctl restart network ping -c 3 192.168.102.42 ping -c 3 10.10.102.42 # make internet connection for yum # iptables dnf install -y iptables-services vi /etc/sysconfig/iptables # add 'local allow' ports --dport 0:65535 systemctl start iptables # and enable iptables -L systemctl stop firewalld systemctl disable firewalld # eth3 for ctt2 or eth1 for n100-101 dnf install bind-utils dig google.com iptables -L # check! # Rocky8 # https://docs.fedoraproject.org/en-US/epel/#Quickstart dnf config-manager --set-enabled powertools dnf install epel-release dnf install netcdf netcdf-devel dnf install yum-utils # yumdownloader dnf install ddd dnf install grace dnf install gnuplot dnf install alpine # pico yum groupinstall "Server" # server for compute nodes "Server with GUI" # other configs vi /etc/selinux/config # disabled, do not mistype, kernel will not boot! 
mv /home /usr/local/ cd /;ln -s /usr/local/home cd /; ln -s /home /share vi /etc/passwd (exx, dockeruser $HOME) ## edit passwd, shadow, group, hosts files ## ## make -orig backups and stage in /home/tmp/global ## cottontail2 = greentail52 sections #exx96 mkdir /sanscratch /home/localscratch chmod ugo+rwx /sanscratch /home/localscratch chmod o+t /sanscratch /home/localscratch # exx96 # link localscratch in 1.4T /home to / cd /home ln -s /zfshomes/apps ln -s /zfshomes/tmp ln -s /zfshomes/csmith06 # fstab file mounts mkdir -p /zfshomes /home66 /home33 /mindstore /opt/ohpc/pub /opt/intel # cottontail2 = greentail52 # n100-n101 = n79 # add to zenoss edit /etc/snmp/snmpd.conf, enable and start rocommunity public dontLogTCPWrappersConnects yes # on head node /etc/chronyc.conf allow 192.168.0.0/16 # compute nodes /etc/chronyc.conf #pool 2.pool.ntp.org iburst Server 192.168.102.250 Server 192.168.102.251 # check chronyc sources # on head node install from epel repo yum install slurm-openlava # error on conflicting libs, too bad! 
# --- Add packages and update ---
yum install epel-release -y
yum install flex bison -y
yum install tcl tcl-devel dmtcp -y
yum install net-snmp net-snmp-libs net-tools net-snmp-utils -y
yum install freeglut-devel libXi-devel libXmu-devel -y
yum install blas blas-devel lapack lapack-devel boost boost-devel -y
yum install lm_sensors lm_sensors-libs -y
yum install zlib-devel bzip2-devel -y
yum install openmpi openmpi-devel perl-ExtUtils-MakeMaker -y
yum install cmake -y
yum install libjpeg libjpeg-devel libjpeg-turbo-devel -y   # easybuild
yum install libibverbs libibverbs-devel

# amber20 cmake readline error fix needs
yum install ncurses-c++-libs-6.1-9.20180224.el8.x86_64.rpm \
    ncurses-devel-6.1-9.20180224.el8.x86_64.rpm \
    readline-devel-7.0-10.el8.x86_64.rpm

# amber20
yum -y install tcsh make \
    gcc gcc-gfortran gcc-c++ \
    which flex bison patch bc \
    libXt-devel libXext-devel \
    perl perl-ExtUtils-MakeMaker util-linux wget \
    bzip2 bzip2-devel zlib-devel tar

# CENTOS7: pick the kernel vendor used for now.
grep ^menuentry /etc/grub2.cfg
grub2-set-default 1
ls -d /sys/firmware/efi && echo "EFI" || echo "Legacy"
#grub2-mkconfig -o /boot/grub2/grub.cfg          # legacy
#grub2-mkconfig -o /boot/efi/EFI/centos/grub.cfg # efi

# compute nodes: old runlevel 3
systemctl set-default multi-user.target

# postfix
dnf install postfix
dnf install mailx
systemctl enable postfix
echo "relayhost = 192.168.102.251" >> /etc/postfix/main.cf   # compute nodes only

# Leave old cuda versions behind (9.2 | 10.2).
cd /usr/local/   # FIX: was 'cd usr/local/' -- relative path, only works from /
# scp from n79:/usr/local/ -- amber16/ amber20/ fsl-5.0.10/ gromacs-2018/ lammps-22Aug18/

# compute nodes only: /usr/local/bin/
# copy scripts: gpu-free, gpu-info, gpu-process
# copy 10.10.102.89:/usr/local/bin/n37.openmpi.wrapper /usr/local/bin/
# done

# --- FINISH native vanilla installs ---
# R version 4.1.2 (2021-11-01) -- "Bird Hippie"
yum install R R-devel
# openjdk version "1.8.0_322"
rpm -qa | grep ^java   # check
yum install java-1.8.0-openjdk java-1.8.0-openjdk-devel \
    java-1.8.0-openjdk-headless \
    javapackages-filesystem
# python v 3.9
yum install python39 python39-devel
ln -s /usr/bin/python3.9 /usr/bin/python
# fftw 3.3.5-11.el8
yum install fftw fftw-devel
# gnu scientific libraries
yum install gsl gsl-devel
# ruby 2.5.9-109.module+el8.5.0
yum install ruby ruby-devel
# obabel chem file formats
yum install openbabel openbabel-devel
# dmtcp
yum install dmtcp dmtcp-devel
# check status of service munge
yum clean all
# eth3 onboot=no, private networks only
systemctl disable iptables
reboot

# --- Now make it an ohpc compute node ---
yum repolist
yum install ohpc-base-compute
scp cottontail2:/etc/resolv.conf /etc/resolv.conf
yum install ohpc-slurm-client
systemctl enable munge
systemctl start munge
scp cottontail2:/etc/munge/munge.key /etc/munge/munge.key
# FIX: single-quote the whole assignment so the inner double quotes survive
# into the file; the original unquoted echo wrote an invalid two-word value.
echo 'SLURMD_OPTIONS="--conf-server 192.168.102.250"' > /etc/sysconfig/slurmd
yum install --allowerasing lmod-ohpc
grep '/var' /etc/slurm/slurm.conf
mkdir /var/log/slurm
chown slurm:munge /var/log/slurm
mkdir /var/spool/slurm
chown slurm:munge /var/spool/slurm
scp cottontail2:/etc/slurm/slurm.conf /etc/slurm/slurm.conf
scp cottontail2:/etc/slurm/gres.conf /etc/slurm/gres.conf
scp cottontail2:/etc/profile.d/lmod.sh /etc/profile.d/
# /var/[log|spool|run] need to be removed from
# /usr/libexec/warewulf/wwmkchroot/gold-template

# test
/usr/sbin/slurmd -D
# start via rc.local
chmod +x /etc/rc.d/rc.local
# timing issue with munge
sleep 15
/usr/sbin/slurmd
# slurmd dependency check:
#   libhwloc.so.15 => /opt/ohpc/pub/libs/hwloc/lib/libhwloc.so.15 (0x00007fd6e5684000)
OpenHPC
# First **all the necessary packages ** (yum install...) 988 tar xvfj ../AmberTools21.tar.bz2 989 tar xvfj ../Amber20.tar.bz2 993 cd amber20_src/ 994 cd build/ 996 vi run_cmake # Assume this is Linux: # serial, do on head node, with miniconda true, compile, install cmake $AMBER_PREFIX/amber20_src \ -DCMAKE_INSTALL_PREFIX=/share/apps/CENTOS8/ohpc/software/amber/20 \ -DCOMPILER=GNU \ -DMPI=FALSE -DCUDA=FALSE -DINSTALL_TESTS=TRUE \ -DDOWNLOAD_MINICONDA=TRUE -DMINICONDA_USE_PY3=TRUE \ 2>&1 | tee cmake.log # Env [hmeij@n100 ~]$ module load cuda/11.6 [hmeij@n100 ~]$ echo $CUDA_HOME /usr/local/cuda [hmeij@n100 ~]$ which nvcc mpicc gcc /usr/local/cuda/bin/nvcc /opt/ohpc/pub/mpi/openmpi4-gnu9/4.1.1/bin/mpicc /opt/ohpc/pub/compiler/gcc/9.4.0/bin/gcc # [FIXED} cmake error on conda install, set to FALSE # OS native python, install on n[100-101] -- Python version 3.9 -- OK -- Found PythonLibs: /usr/lib64/libpython3.9.so (found version "3.9.6") -- Checking for Python package numpy -- not found -- Checking for Python package scipy -- not found -- Checking for Python package matplotlib -- not found -- Checking for Python package setuptools -- found [END FIXED] # mpi & cuda FALSE builds serial ./run_cmake make install # lots and lots of warnings # then source /share/apps/CENTOS8/ohpc/software/amber/20/amber.sh # on n100 now, parallel, set miniconda flags to FALSE -MPI=TRUE ./run_cmake make install # on n100 just change cuda flag -CUDA=TRUE ./run_cmake make install #tests cd $AMBERHOME make test.serial export DO_PARALLEL="mpirun -np 6" make test.parallel export CUDA_VISIBLE_DEVICES=0 make test.cuda.serial make test.cuda.parallel
OpenHPC
# First **all the necessary packages ** (yum install...) 988 tar xvfj ../AmberTools22.tar.bz2 989 tar xvfj ../Amber22.tar.bz2 993 cd amber20_src/ 994 cd build/ 996 vi run_cmake # Assume this is Linux: # serial, do on head node, with miniconda true, compile, install cmake $AMBER_PREFIX/amber22_src \ -DCMAKE_INSTALL_PREFIX=/share/apps/CENTOS8/ohpc/software/amber/22 \ -DCOMPILER=GNU \ -DMPI=FALSE -DCUDA=FALSE -DINSTALL_TESTS=TRUE \ -DDOWNLOAD_MINICONDA=TRUE \ 2>&1 | tee cmake.log ./run_cmake make install # Note !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! The OpenMPI and MPICH system installations provided by CentOS (i.e., through yum install) are known to be somehow incompatible with Amber22. # OUCH !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # GO TO node n100 # copy head node's amber22_src/ to n100:/usr/local/src/tmp/ source /share/apps/CENTOS8/ohpc/software/amber/22/amber.sh echo $AMBERHOME # install latest openmpi version cd amber_src/Ambertools/src tar xvfj ../../../../openmpi-4.1.4.tar.bz2 ./configure_openmpi gnu # openhpc gcc/gfortran # on n100 now, parallel, set -MPI=TRUE -DDOWNLOAD_MINICONDA=FALSE ./run_cmake make install # on n100 just change cuda flag [hmeij@n100 build]$ module load cuda/11.6 [hmeij@n100 build]$ which gcc mpicc nvcc /opt/ohpc/pub/compiler/gcc/9.4.0/bin/gcc /share/apps/CENTOS8/ohpc/software/amber/22/bin/mpicc /usr/local/cuda/bin/nvcc [hmeij@n100 ~]$ echo $CUDA_HOME /usr/local/cuda -MPI=TRUE -CUDA=TRUE -DDOWNLOAD_MINICONDA=FALSE ./run_cmake make install [hmeij@n100 ~]$ which nvcc mpicc gcc /usr/local/cuda/bin/nvcc /opt/ohpc/pub/mpi/openmpi4-gnu9/4.1.1/bin/mpicc /opt/ohpc/pub/compiler/gcc/9.4.0/bin/gcc #tests cd $AMBERHOME make test.serial export DO_PARALLEL="mpirun -np 6" make test.parallel export CUDA_VISIBLE_DEVICES=0 make test.cuda.serial make test.cuda.parallel