User Tools

Site Tools


cluster:213

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Next revision
Previous revision
Next revision Both sides next revision
cluster:213 [2022/03/16 13:27]
hmeij07
cluster:213 [2023/02/08 14:31]
hmeij07
Line 42: Line 42:
 scp 10.10.102.253:/root/.ssh/authorized_keys /root/.ssh/ scp 10.10.102.253:/root/.ssh/authorized_keys /root/.ssh/
 /etc/ssh/sshd_config (PermitRootLogin) /etc/ssh/sshd_config (PermitRootLogin)
 +
 +# Put the warewulf cluster key in authorized_keys
 +# Put eth0 fingerprints in cottontail/greentail52 known hosts
 +# add to relevant known_hosts_servername file
  
 # configure private subnets and ping file server # configure private subnets and ping file server
Line 53: Line 57:
  
 # make internet connection for yum # make internet connection for yum
-# eth3 for ctt2 or eth1 for n100-101 
-dnf install bind-utils 
-dig google.com 
- 
-#rocky8 
-# https://docs.fedoraproject.org/en-US/epel/#Quickstart 
-dnf config-manager --set-enabled powertools 
-dnf install epel-release 
-dnf install netcdf netcdf-devel 
-dnf install yum-utils # yumdownloader 
-dnf install ddd  
-dnf install grace 
-dnf install gnuplot 
-dnf install alpine # pico 
  
 # iptables # iptables
Line 76: Line 66:
 systemctl stop firewalld systemctl stop firewalld
 systemctl disable firewalld systemctl disable firewalld
 +
 +
 +# eth3 for ctt2 or eth1 for n100-101
 +dnf install bind-utils
 +dig google.com
 +iptables -L # check!
 +
 +
  
 # other configs # other configs
 vi /etc/selinux/config # disabled, do not mistype, kernel will not boot! vi /etc/selinux/config # disabled, do not mistype, kernel will not boot!
 mv /home /usr/local/ mv /home /usr/local/
 +cd /;ln -s /usr/local/home 
 +cd /; ln -s /home /share
 vi /etc/passwd (exx, dockeruser $HOME) vi /etc/passwd (exx, dockeruser $HOME)
  
-## edit passwd, shadow, group, hosts files ## 
-## make -orig backups and stage in /home/tmp/global 
-## cottontail2 = greentail52 sections 
  
 +#exx96
 mkdir /sanscratch /home/localscratch mkdir /sanscratch /home/localscratch
 chmod ugo+rwx /sanscratch /home/localscratch chmod ugo+rwx /sanscratch /home/localscratch
 chmod o+t /sanscratch /home/localscratch  chmod o+t /sanscratch /home/localscratch 
 +# exx96
 # link localscratch in 1.4T /home to / # link localscratch in 1.4T /home to /
-mkdir /home  + 
-cd /home # local dir+cd /home 
 ln -s /zfshomes/apps ln -s /zfshomes/apps
 ln -s /zfshomes/tmp ln -s /zfshomes/tmp
 ln -s /zfshomes/csmith06 ln -s /zfshomes/csmith06
-ln -s /zfshomes /share+ 
 +cat /sanscratch/tmp/fstab.tmp >> /etc/fstab; mkdir /astrostore; mount -a; df -h; cd /smithlab/;  ln -s /smithlab/home/opt/rhel08 opt; ls -l
  
 # fstab file mounts # fstab file mounts
 +mkdir -p /zfshomes /home66 /home33 /mindstore /opt/ohpc/pub /opt/intel
 # cottontail2 = greentail52 # cottontail2 = greentail52
 # n100-n101 = n79 # n100-n101 = n79
  
-# postfix 
-dnf install postfix 
-dnf install mailx 
-systemctl enable postfix 
-echo "relayhost = 192.168.102.251" >> /etc/postfix/main.cf 
  
 +# on head node /etc/chrony.conf
 +allow 192.168.0.0/16
 # compute nodes /etc/chrony.conf # compute nodes /etc/chrony.conf
 #pool 2.pool.ntp.org iburst #pool 2.pool.ntp.org iburst
 Server 192.168.102.250 Server 192.168.102.250
 Server 192.168.102.251 Server 192.168.102.251
 +# check
 +chronyc sources
 +
 +# Rocky8
 +# https://docs.fedoraproject.org/en-US/epel/#Quickstart
 +dnf config-manager --set-enabled powertools
 +dnf install epel-release
 +dnf install netcdf netcdf-devel
 +dnf install yum-utils # yumdownloader
 +dnf install ddd 
 +dnf install grace
 +dnf install gnuplot
 +dnf install alpine # pico
 +yum groupinstall "Server" # server for compute nodes "Server with GUI"
 +
 +
 +# on head node install from epel repo
 +### yum install slurm-openlava
 +# error on conflicting libs, too bad!
  
  
Line 125: Line 142:
 yum install cmake -y yum install cmake -y
 yum install libjpeg libjpeg-devel libjpeg-turbo-devel -y yum install libjpeg libjpeg-devel libjpeg-turbo-devel -y
-amber+ 
 +#easybuild 
 +yum install libibverbs libibverbs-devel 
 + 
 +# amber20 cmake readline error fix needs 
 +yum install ncurses-c++-libs-6.1-9.20180224.el8.x86_64.rpm \ 
 +            ncurses-devel-6.1-9.20180224.el8.x86_64.rpm \ 
 +            readline-devel-7.0-10.el8.x86_64.rpm 
 + 
 +# amber20
 yum -y install tcsh make \ yum -y install tcsh make \
                gcc gcc-gfortran gcc-c++ \                gcc gcc-gfortran gcc-c++ \
Line 132: Line 158:
                perl perl-ExtUtils-MakeMaker util-linux wget \                perl perl-ExtUtils-MakeMaker util-linux wget \
                bzip2 bzip2-devel zlib-devel tar                 bzip2 bzip2-devel zlib-devel tar 
-yum update -y 
-yum clean all 
  
 # CENTOS7 pick the kernel vendor used for now # CENTOS7 pick the kernel vendor used for now
Line 144: Line 168:
 # compute nodes old level 3 # compute nodes old level 3
 systemctl set-default multi-user.target systemctl set-default multi-user.target
-# remove internet, bring private back up + 
-reboot+ 
 +# postfix 
 +dnf install postfix 
 +dnf install mailx 
 +systemctl enable postfix 
 +echo "relayhost = 192.168.102.251" >> /etc/postfix/main.cf 
 + 
 +yum install net-snmp-utils net-snmp-libs net-snmp net-snmp-agent-libs 
 + 
 +# edit /etc/snmp/snmpd.conf, enable and start 
 +rocommunity public 
 +dontLogTCPWrappersConnects yes 
 +# enable, start, add to zenoss  
  
 # compute nodes only # compute nodes only
Line 163: Line 200:
 # openjdk version "1.8.0_322" # openjdk version "1.8.0_322"
 rpm -qa | grep ^java  # check rpm -qa | grep ^java  # check
 +yum install java-1.8.0-openjdk java-1.8.0-openjdk-devel \
 +java-1.8.0-openjdk-headless javapackages-filesystem
 # python v 3.9 # python v 3.9
 yum install python39 python39-devel yum install python39 python39-devel
 +ln -s /usr/bin/python3.9 /usr/bin/python
 # fftw 3.3.5-11.el8 # fftw 3.3.5-11.el8
 yum install fftw fftw-devel yum install fftw fftw-devel
Line 175: Line 215:
 # dmtcp # dmtcp
 yum install dmtcp dmtcp-devel yum install dmtcp dmtcp-devel
 +
 +
  
 yum clean all yum clean all
 +# eth3 onboot=no, private networks only
 +systemctl disable iptables
 reboot reboot
 +
 +# now make it an ohpc compute node
 +  yum repolist
 +  yum install singularity-ohpc
 +  yum  install ohpc-base-compute --nobest
 +  
 +  scp cottontail2:/etc/resolv.conf /etc/resolv.conf
 +  yum  install ohpc-slurm-client
 +  # check status of service munge
 +  systemctl enable munge
 +  systemctl start munge
 +  scp cottontail2:/etc/munge/munge.key /etc/munge/munge.key
 +  echo SLURMD_OPTIONS="--conf-server 192.168.102.250" > /etc/sysconfig/slurmd
 +  yum  install --allowerasing lmod-ohpc
 +  grep '/var' /etc/slurm/slurm.conf
 +  mkdir /var/log/slurm 
 +  chown slurm:munge /var/log/slurm 
 +  mkdir /var/spool/slurm 
 +  chown slurm:munge /var/spool/slurm 
 +  scp cottontail2:/etc/slurm/slurm.conf /etc/slurm/slurm.conf
 +  scp cottontail2:/etc/slurm/gres.conf /etc/slurm/gres.conf
 +  scp cottontail2:/etc/profile.d/lmod.sh /etc/profile.d/
 +  
 +# /etc/bashrc add
 +# ohpc lmod gcc mpicc
 +export PATH=/usr/local/slurm/bin:$PATH
 +export LD_LIBRARY_PATH=/usr/local/slurm/lib:$LD_LIBRARY_PATH
 +
 +  
 +# /var/[log|spool|run] need to be removed from
 +/usr/libexec/warewulf/wwmkchroot/gold-template
 +
 +#test
 +  /usr/sbin/slurmd -D 
 +  
 +# start via rc.local
 +chmod +x /etc/rc.d/rc.local
 +#timing issue with munge
 +sleep 15
 +/usr/sbin/slurmd
 +
 +
 +## edit passwd, shadow, group, hosts files ##
 +## make -orig backups and stage in /home/tmp/global
 +## cottontail2 = greentail52 sections
 +
 +  
 +# slurmd ???
 + libhwloc.so.15 => /opt/ohpc/pub/libs/hwloc/lib/libhwloc.so.15 (0x00007fd6e5684000)
 +
  
  
 </code> </code>
-==== Configure Recipe ==== 
  
-Steps. "Ala n37" ... so the RTX nodes are similar to the K20 nodes and we can put the local software in place. See [[cluster:172|K20 Redo]] page and [[cluster:192|exx96]] Recipe for CentOS 7+==== Pics ====
  
-New recipe for n100-n101 sporting Rocky 8.5 on ''cottontail2''\\ 
-Put node on internet...first though 
  
-  * ** Vanilla Backups** using Warewulf and plain rsync (--exclude=[proc/,sys/,run/]+My data center robot thingie and node n100's gpus\\ 
 + 
 +\\ 
 + 
 +{{:cluster:dcrobot.jpg?400|}} 
 +\\ 
 +{{:cluster:n100.jpg?400|}}\\ 
 +\\ 
 + 
 +==== Amber20 ==== 
 + 
 +OpenHPC
  
 <code> <code>
  
-login as root check some things out... +First **all the necessary packages ** (yum install...)
-free -g +
-nvidia-smi # if gpus +
-cat /proc/cpuinfo+
  
-# check and set local time zone + 988  tar xvfj ../AmberTools21.tar.bz2  
-mv /etc/localtime /etc/localtime.backup +  989  tar xvfj ../Amber20.tar.bz2  
-ln -s /usr/share/zoneinfo/America/New_York /etc/localtime+  993  cd amber20_src/ 
 +  994  cd build/ 
 +  996  vi run_cmake
  
-change passwords for root and vendor account + Assume this is Linux:
-passwd +
-passwd microway +
-# set hostname +
-hostnamectl set-hostname cottontail2+
  
-root: sync cottontail's master and known_hosts (tails+stores) +serial, do on head node, with miniconda true, compile, install 
-ssh-keygen -t rsa +  cmake $AMBER_PREFIX/amber20_src \ 
-scp 10.10.102.253:/root/.ssh/authorized_keys /root/.ssh/ +    -DCMAKE_INSTALL_PREFIX=/share/apps/CENTOS8/ohpc/software/amber/20 \ 
-/etc/ssh/sshd_config (PermitRootLogin)+    -DCOMPILER=GNU \ 
 +    -DMPI=FALSE -DCUDA=FALSE -DINSTALL_TESTS=TRUE \ 
 +    -DDOWNLOAD_MINICONDA=TRUE -DMINICONDA_USE_PY3=TRUE \ 
 +    2>&1 | tee  cmake.log
  
-configure private subnets and ping file server +Env
-cd /etc/sysconfig/network-scripts/ +
-vi ifcfg-eth0 # 192.168.102.x +
-vi ifcfg-eth1 # 10.10.102.x +
-vi ifcfg-eth3 # 129.133.52.x +
-systemctl restart network +
-ping -c 3 192.168.102.42 +
-ping -c 3 10.10.102.42+
  
-# make internet connection for yum +[hmeij@n100 ~]$ module load cuda/11.6
-# eth3 for ctt2 or eth1 for n100-101 +
-dnf install bind-utils +
-dig google.com+
  
-#rocky8 +[hmeij@n100 ~]$ echo $CUDA_HOME 
-# https://docs.fedoraproject.org/en-US/epel/#Quickstart +/usr/local/cuda
-dnf config-manager --set-enabled powertools +
-dnf install epel-release +
-dnf install netcdf netcdf-devel +
-dnf install yum-utils # yumdownloader +
-dnf install ddd  +
-dnf install grace +
-dnf install gnuplot +
-dnf install alpine # pico+
  
-# iptables +[hmeij@n100 ~]$ which nvcc mpicc gcc 
-dnf install -y iptables-services +/usr/local/cuda/bin/nvcc 
-vi /etc/sysconfig/iptables +/opt/ohpc/pub/mpi/openmpi4-gnu9/4.1.1/bin/mpicc 
-# add 'local allow' ports  --dport 0:65535 +/opt/ohpc/pub/compiler/gcc/9.4.0/bin/gcc
-systemctl start iptables # and enable +
-iptables -L +
-systemctl stop firewalld +
-systemctl disable firewalld+
  
-other configs +[FIXED] cmake error on conda install, set to FALSE 
-vi /etc/selinux/config # disabled, do not mistype, kernel will not boot! +OS native python, install on n[100-101] 
-mv /home /usr/local/ +-- Python version 3.9 -- OK 
-vi /etc/passwd (exx, dockeruser $HOME)+-- Found PythonLibs: /usr/lib64/libpython3.9.so (found version "3.9.6" 
 +-- Checking for Python package numpy -- not found 
 +-- Checking for Python package scipy -- not found 
 +-- Checking for Python package matplotlib -- not found 
 +-- Checking for Python package setuptools -- found 
 +[END FIXED]
  
-## edit passwd, shadow, group, hosts files ## +mpi & cuda FALSE builds serial 
-## make -orig backups and stage in /home/tmp/global +./run_cmake 
-## cottontail2 = greentail52 sections+make install 
 +lots and lots of warnings
  
-mkdir /sanscratch /home/localscratch +then 
-chmod ugo+rwx /sanscratch /home/localscratch +source /share/apps/CENTOS8/ohpc/software/amber/20/amber.sh
-chmod o+t /sanscratch /home/localscratch  +
-link localscratch in 1.4T /home to / +
-mkdir /home  +
-cd /home # local dir +
-ln -s /zfshomes/apps +
-ln -s /zfshomes/tmp +
-ln -s /zfshomes/csmith06 +
-ln -s /zfshomes /share+
  
-fstab file mounts +on n100 now, parallel, set miniconda flags to FALSE 
-# cottontail2 = greentail52 +-DMPI=TRUE 
-n100-n101 n79+./run_cmake 
 +make install
  
-postfix +on n100 just change cuda flag 
-dnf install postfix +-DCUDA=TRUE 
-dnf install mailx +./run_cmake 
-systemctl enable postfix +make install
-echo "relayhost = 192.168.102.42" >> /etc/postfix/main.cf+
  
-compute nodes /etc/chronyc.conf +#tests 
-#pool 2.pool.ntp.org iburst +cd $AMBERHOME 
-Server 192.168.102.250 +make test.serial 
-Server 192.168.102.251+export DO_PARALLEL="mpirun -np 6" 
 +make test.parallel 
 +export CUDA_VISIBLE_DEVICES=0 
 +make test.cuda.serial 
 +make test.cuda.parallel
  
 +</code>
  
-# add packages and update +==== Amber22 ====
-yum install epel-release -y +
-yum install flex bison -y  +
-yum install tcl tcl-devel dmtcp -y +
-yum install net-snmp net-snmp-libs net-tools net-snmp-utils -y +
-yum install freeglut-devel libXi-devel libXmu-devel -y +
-yum install blas blas-devel lapack lapack-devel boost boost-devel -y +
-yum install lm_sensors lm_sensors-libs -y +
-yum install zlib-devel bzip2-devel -y +
-yum install openmpi openmpi-devel perl-ExtUtils-MakeMaker -y +
-yum install cmake -y +
-yum install libjpeg libjpeg-devel libjpeg-turbo-devel -y +
-# amber +
-yum -y install tcsh make \ +
-               gcc gcc-gfortran gcc-c++ \ +
-               which flex bison patch bc \ +
-               libXt-devel libXext-devel \ +
-               perl perl-ExtUtils-MakeMaker util-linux wget \ +
-               bzip2 bzip2-devel zlib-devel tar  +
-yum update -y +
-yum clean all+
  
-# CENTOS7 pick the kernel vendor used for now +OpenHPC
-grep ^menuentry /etc/grub2.cfg +
-grub2-set-default 1 +
-ls -d /sys/firmware/efi && echo "EFI" || echo "Legacy" +
-#grub2-mkconfig -o /boot/grub2/grub.cfg          # legacy +
-#grub2-mkconfig -o /boot/efi/EFI/centos/grub.cfg # efi+
  
-# compute nodes old level 3 +<code>
-systemctl set-default multi-user.target +
-# remove internet, bring private back up +
-reboot+
  
-compute nodes only +First **all the necessary packages ** (yum install...)
-# leave old cuda versions behind (9.2 | 10.2) +
-cd usr/local/ +
-# scp from n79:/usr/local/ +
-amber16/  amber20/ fsl-5.0.10/ gromacs-2018/ lammps-22Aug18/+
  
-# compute nodes only /usr/local/bin+ 988  tar xvfj ../AmberTools22.tar.bz2  
-# copy scripts: gpu-free, gpu-info, gpu-process +  989  tar xvfj ../Amber22.tar.bz2  
-# copy 10.10.102.89:/usr/local/bin/n37.openmpi.wrapper /usr/local/bin+  993  cd amber22_src/ 
-# done+  994  cd build
 +  996  vi run_cmake
  
-FINISH native vanilla installs +#  Assume this is Linux:
-# R version 4.1.2 (2021-11-01) -- "Bird Hippie" +
-yum install R R-devel +
-# openjdk version "1.8.0_322" +
-rpm -qa | grep ^java  # check +
-# python v 3.9 +
-yum install python39 python39-devel +
-# fftw 3.3.5-11.el8 +
-yum install fftw fftw-devel +
-#gnu scientific libraries +
-yum install gsl gsl-devel +
-# ruby 2.5.9-109.module+el8.5.0 +
-yum install ruby ruby-devel +
-# obabel chem file formats +
-yum install openbabel openbabel-devel +
-# dmtcp +
-yum install dmtcp dmtcp-devel+
  
-yum clean all +# serial, do on head node, with miniconda true, compile, install 
-reboot+  cmake $AMBER_PREFIX/amber22_src \ 
 +    -DCMAKE_INSTALL_PREFIX=/share/apps/CENTOS8/ohpc/software/amber/22 \ 
 +    -DCOMPILER=GNU \ 
 +    -DMPI=FALSE -DCUDA=FALSE -DINSTALL_TESTS=TRUE \ 
 +    -DDOWNLOAD_MINICONDA=TRUE \ 
 +    2>&1 | tee  cmake.log 
 +./run_cmake 
 +make install
  
 +
 +# Note !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 +The OpenMPI and MPICH system installations provided by CentOS 
 +(i.e., through yum install) 
 +are known to be somehow incompatible with Amber22.
 +# OUCH !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 +
 +
 +# GO TO node n100
 +
 +# copy head node's amber22_src/ to n100:/usr/local/src/tmp/
 +
 +
 +source /share/apps/CENTOS8/ohpc/software/amber/22/amber.sh
 +echo $AMBERHOME
 +
 +# install latest openmpi version
 +cd amber22_src/AmberTools/src
 +tar xvfj ../../../../openmpi-4.1.4.tar.bz2 
 +
 +./configure_openmpi gnu # openhpc gcc/gfortran 
 +
 +
 +# on n100 now, parallel, set 
 +-DMPI=TRUE
 +-DDOWNLOAD_MINICONDA=FALSE
 +./run_cmake
 +make install
 +
 +# on n100 just change cuda flag
 +
 +[hmeij@n100 build]$ module load cuda/11.6
 +[hmeij@n100 build]$ which gcc mpicc nvcc
 +/opt/ohpc/pub/compiler/gcc/9.4.0/bin/gcc
 +/share/apps/CENTOS8/ohpc/software/amber/22/bin/mpicc
 +/usr/local/cuda/bin/nvcc
 +[hmeij@n100 ~]$ echo $CUDA_HOME
 +/usr/local/cuda
 +
 +-DMPI=TRUE
 +-DCUDA=TRUE
 +-DDOWNLOAD_MINICONDA=FALSE
 +./run_cmake
 +make install
 +
 +
 +[hmeij@n100 ~]$ which nvcc mpicc gcc
 +/usr/local/cuda/bin/nvcc
 +/opt/ohpc/pub/mpi/openmpi4-gnu9/4.1.1/bin/mpicc
 +/opt/ohpc/pub/compiler/gcc/9.4.0/bin/gcc
 +
 +#tests
 +cd $AMBERHOME
 +make test.serial
 +export DO_PARALLEL="mpirun -np 6"
 +make test.parallel
 +export CUDA_VISIBLE_DEVICES=0
 +make test.cuda.serial
 +make test.cuda.parallel
  
 </code> </code>
  
-==== Pics ==== 
  
- 
-\\ 
 **[[cluster:0|Back]]** **[[cluster:0|Back]]**
  
cluster/213.txt · Last modified: 2024/01/12 15:09 by hmeij07