User Tools

Site Tools


cluster:213

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Next revision
Previous revision
cluster:213 [2022/03/16 13:33]
hmeij07 [Pics]
cluster:213 [2024/01/12 15:09]
hmeij07
Line 42: Line 42:
 scp 10.10.102.253:/root/.ssh/authorized_keys /root/.ssh/ scp 10.10.102.253:/root/.ssh/authorized_keys /root/.ssh/
 /etc/ssh/sshd_config (PermitRootLogin) /etc/ssh/sshd_config (PermitRootLogin)
 +
 +# Put the warewulf cluster key in authorized_keys
 +# Put eth0 fingerprints in cottontail/greentail52 known hosts
 +# add to relevant known_hosts_servername file
  
 # configure private subnets and ping file server # configure private subnets and ping file server
 cd /etc/sysconfig/network-scripts/ cd /etc/sysconfig/network-scripts/
 vi ifcfg-eth0 # 192.168.102.x vi ifcfg-eth0 # 192.168.102.x
-vi ifcfg-eth1 # 10.10.102.x+vi ifcfg-eth1 # 10.10.102.x   # 'uuidgen eth1' to get uuid
 vi ifcfg-eth3 # 129.133.52.x vi ifcfg-eth3 # 129.133.52.x
 +scp 192.168.102.112:/etc/rc.d/rc.local /etc/rc.d/  # check +x, edit ib0, start
 +
 systemctl restart network systemctl restart network
 ping -c 3 192.168.102.42 ping -c 3 192.168.102.42
Line 53: Line 59:
  
 # make internet connection for yum # make internet connection for yum
-# eth3 for ctt2 or eth1 for n100-101 
-dnf install bind-utils 
-dig google.com 
- 
-#rocky8 
-# https://docs.fedoraproject.org/en-US/epel/#Quickstart 
-dnf config-manager --set-enabled powertools 
-dnf install epel-release 
-dnf install netcdf netcdf-devel 
-dnf install yum-utils # yumdownloader 
-dnf install ddd  
-dnf install grace 
-dnf install gnuplot 
-dnf install alpine # pico 
  
 # iptables # iptables
Line 74: Line 66:
 systemctl start iptables # and enable systemctl start iptables # and enable
 iptables -L iptables -L
-systemctl stop firewalld + 
-systemctl disable firewalld+ 
 +# eth3 for ctt2 or eth1 for n100-101 
 +dnf install bind-utils 
 +dig google.com 
 +iptables -L # check! 
 + 
  
 # other configs # other configs
 vi /etc/selinux/config # disabled, do not mistype, kernel will not boot! vi /etc/selinux/config # disabled, do not mistype, kernel will not boot!
 mv /home /usr/local/ mv /home /usr/local/
 +cd /;ln -s /usr/local/home 
 +cd /; ln -s /home /share
 vi /etc/passwd (exx, dockeruser $HOME) vi /etc/passwd (exx, dockeruser $HOME)
  
-## edit passwd, shadow, group, hosts files ## 
-## make -orig backups and stage in /home/tmp/global 
-## cottontail2 = greentail52 sections 
  
 +#exx96
 mkdir /sanscratch /home/localscratch mkdir /sanscratch /home/localscratch
 chmod ugo+rwx /sanscratch /home/localscratch chmod ugo+rwx /sanscratch /home/localscratch
 chmod o+t /sanscratch /home/localscratch  chmod o+t /sanscratch /home/localscratch 
 +# exx96
 # link localscratch in 1.4T /home to / # link localscratch in 1.4T /home to /
-mkdir /home  + 
-cd /home # local dir+cd /home 
 ln -s /zfshomes/apps ln -s /zfshomes/apps
 ln -s /zfshomes/tmp ln -s /zfshomes/tmp
 ln -s /zfshomes/csmith06 ln -s /zfshomes/csmith06
-ln -s /zfshomes /share+ls -l 
 + 
 +cat /sanscratch/tmp/fstab.tmp >> /etc/fstab; mkdir /astrostore; mount -a; df -h; cd /smithlab/;  ln -s /smithlab/home/opt/rhel08 opt; ls -l
  
 # fstab file mounts # fstab file mounts
 +mkdir -p /zfshomes /home66 /home33 /mindstore /opt/ohpc/pub /opt/intel
 +mkdir -p /smithlab/home;cd /smithlab;ln -s /smithlab/home/opt/rhel08 opt; ls -l
 # cottontail2 = greentail52 # cottontail2 = greentail52
 # n100-n101 = n79 # n100-n101 = n79
  
-# postfix 
-dnf install postfix 
-dnf install mailx 
-systemctl enable postfix 
-echo "relayhost = 192.168.102.251" >> /etc/postfix/main.cf 
  
 +# on head node /etc/chrony.conf
 +allow 192.168.0.0/16
 # compute nodes /etc/chrony.conf # compute nodes /etc/chrony.conf
 #pool 2.pool.ntp.org iburst #pool 2.pool.ntp.org iburst
 Server 192.168.102.250 Server 192.168.102.250
 Server 192.168.102.251 Server 192.168.102.251
 +# check
 +systemctl restart chronyd
 +chronyc sources
 +
 +# Rocky8
 +# https://docs.fedoraproject.org/en-US/epel/#Quickstart
 +dnf config-manager --set-enabled powertools -y
 +dnf install epel-release -y
 +dnf install netcdf netcdf-devel -y
 +dnf install yum-utils -y # yumdownloader
 +dnf install ddd grace gnuplot alpine -y # pico
 +
 +yum groupinstall "Server" # server for compute nodes "Server with GUI"
 +
 +
 +# on head node install from epel repo
 +### yum install slurm-openlava
 +# error on conflicting libs, too bad!
  
  
Line 116: Line 134:
 yum install epel-release -y yum install epel-release -y
 yum install flex bison -y  yum install flex bison -y 
-yum install tcl tcl-devel dmtcp -y+yum install tcl tcl-devel dmtcp dmtcp-devel -y
 yum install net-snmp net-snmp-libs net-tools net-snmp-utils -y yum install net-snmp net-snmp-libs net-tools net-snmp-utils -y
 yum install freeglut-devel libXi-devel libXmu-devel -y yum install freeglut-devel libXi-devel libXmu-devel -y
Line 125: Line 143:
 yum install cmake -y yum install cmake -y
 yum install libjpeg libjpeg-devel libjpeg-turbo-devel -y yum install libjpeg libjpeg-devel libjpeg-turbo-devel -y
-amber+ 
 +#easybuild 
 +yum install libibverbs libibverbs-devel 
 + 
 +# amber20 cmake readline error fix needs 
 +yum install ncurses-c++-libs-6.1-9.20180224.el8.x86_64.rpm \ 
 +            ncurses-devel-6.1-9.20180224.el8.x86_64.rpm \ 
 +            readline-devel-7.0-10.el8.x86_64.rpm 
 + 
 +# amber20
 yum -y install tcsh make \ yum -y install tcsh make \
                gcc gcc-gfortran gcc-c++ \                gcc gcc-gfortran gcc-c++ \
Line 132: Line 159:
                perl perl-ExtUtils-MakeMaker util-linux wget \                perl perl-ExtUtils-MakeMaker util-linux wget \
                bzip2 bzip2-devel zlib-devel tar                 bzip2 bzip2-devel zlib-devel tar 
-yum update -y 
-yum clean all 
  
 # CENTOS7 pick the kernel vendor used for now # CENTOS7 pick the kernel vendor used for now
Line 144: Line 169:
 # compute nodes old level 3 # compute nodes old level 3
 systemctl set-default multi-user.target systemctl set-default multi-user.target
-remove internetbring private back up + 
-reboot+ 
 +# postfix
 +dnf install postfix 
 +dnf install mailx 
 +systemctl enable postfix 
 +echo "relayhost = 192.168.102.251" >> /etc/postfix/main.cf 
 + 
 + 
 +# edit /etc/snmp/snmpd.conf, enable and start 
 +rocommunity public 
 +dontLogTCPWrappersConnects yes 
 +# enable, start, add to zenoss
  
 # compute nodes only # compute nodes only
Line 163: Line 200:
 # openjdk version "1.8.0_322" # openjdk version "1.8.0_322"
 rpm -qa | grep ^java  # check rpm -qa | grep ^java  # check
 +yum install java-1.8.0-openjdk java-1.8.0-openjdk-devel \
 +java-1.8.0-openjdk-headless javapackages-filesystem
 # python v 3.9 # python v 3.9
 yum install python39 python39-devel yum install python39 python39-devel
 +ln -s /usr/bin/python3.9 /usr/bin/python
 # fftw 3.3.5-11.el8 # fftw 3.3.5-11.el8
 yum install fftw fftw-devel yum install fftw fftw-devel
Line 173: Line 213:
 # obabel chem file formats # obabel chem file formats
 yum install openbabel openbabel-devel yum install openbabel openbabel-devel
-# dmtcp + 
-yum install dmtcp dmtcp-devel+ 
  
 yum clean all yum clean all
-reboot+# eth3 onboot=no, private networks only 
 +systemctl disable iptables
  
 +
 +# now make it an ohpc compute node
 +# DO THIS on compute nodes BEFORE mounting ctt2:/opt
 +# pulls in newer version, potential problem later on
 +  yum repolist
 +  rpm -ivh ohpc-release-2-1.el8.x86_64.rpm 
 +  yum install singularity-ohpc
 +  yum  install ohpc-base-compute --nobest
 +    yum  install ohpc-slurm-client
 +  
 +  scp cottontail2:/etc/resolv.conf /etc/resolv.conf
 +
 +  
 +  # check status of service munge
 +  rpm -ivh /sanscratch/tmp/rpms/munge-devel-0.5.13-2.el8.x86_64.rpm
 +  systemctl enable munge
 +  systemctl start munge
 +  scp cottontail2:/etc/munge/munge.key /etc/munge/munge.key
 +  echo SLURMD_OPTIONS="--conf-server 192.168.102.250" > /etc/sysconfig/slurmd
 +  yum  install --allowerasing lmod-ohpc
 +  grep '/var' /etc/slurm/slurm.conf
 +  mkdir /var/log/slurm 
 +  chown slurm:munge /var/log/slurm 
 +  mkdir /var/spool/slurm 
 +  chown slurm:munge /var/spool/slurm 
 +  scp cottontail2:/etc/slurm/slurm.conf /etc/slurm/slurm.conf
 +  scp cottontail2:/etc/slurm/gres.conf /etc/slurm/gres.conf
 +  scp cottontail2:/etc/profile.d/lmod.sh /etc/profile.d/
 +  
 +# /etc/bashrc add
 +# ohpc lmod gcc mpicc
 +export PATH=/usr/local/slurm/bin:$PATH
 +export LD_LIBRARY_PATH=/usr/local/slurm/lib:$LD_LIBRARY_PATH
 +
 +  
 +# /var/[log|spool|run] need to be removed from
 +/usr/libexec/warewulf/wwmkchroot/gold-template
 +
 +#test
 +  /usr/sbin/slurmd -D 
 +  
 +# start via rc.local (already copied)
 +#chmod +x /etc/rc.d/rc.local
 +#timing issue with munge
 +#sleep 15
 +#/usr/sbin/slurmd
 +
 +systemctl stop firewalld
 +systemctl disable firewalld
 +
 +systemctl  disable dnf-makecache.timer
 +systemctl stop dnf-makecache.timer
 +
 + mv /etc/issue.d/cockpit.issue /root/etc_issue.d_cockpit.issue
 + mv /etc/motd.d/cockpit /root/etc_motd.d_cockpit
 +
 +
 +## edit passwd, shadow, group, hosts files ##
 +## make -orig backups and stage in /home/tmp/global
 +## cottontail2 = greentail52 sections
 +chown -R munge:munge /etc/munge /var/log/munge /var/lib/munge /var/run/munge
 +chown -R slurm:munge /var/log/slurm /var/spool/slurm
 +
 +
 +  
 +# slurmd ???
 + libhwloc.so.15 => /opt/ohpc/pub/libs/hwloc/lib/libhwloc.so.15 (0x00007fd6e5684000)
 +
 +# crontab
 +
 +# ionice gaussian
 +0,15,30,45 * * * * /share/apps/scripts/ionice_lexes.sh  > /dev/null 2>&1
 +
 +# cpu temps
 +40 * * * * /share/apps/scripts/lm_sensors.sh > /dev/null 2>&1
 +
 +# on compute node /etc/security/limits.conf
 +*                -       memlock         270039400
 +
 +
 +# file date_ctt2.sh
 +
 +# ctt /etc/pdsh
 +
 +# ctt:/root/scripts
 +
 +# ctt2:/usr/local/bin/rslurm2022.sh
  
 </code> </code>
Line 184: Line 313:
 ==== Pics ==== ==== Pics ====
  
-My data center robot thingie... 
  
-{{:cluster:dcrobot.jpg?400 |}}+My data center robot thingie and node n100's gpus\\
  
-Node n100's gpus...+\\
  
-{{:cluster:n100.jpg?400 |}}+{{:cluster:dcrobot.jpg?400|}} 
 +\\ 
 +{{:cluster:n100.jpg?400|}}\\ 
 +\\ 
 + 
 +==== Amber20 ==== 
 + 
 +OpenHPC 
 + 
 +<code> 
 + 
 +# First install all the necessary packages (yum install ...)
 + 
 + 988  tar xvfj ../AmberTools21.tar.bz2  
 +  989  tar xvfj ../Amber20.tar.bz2  
 +  993  cd amber20_src/ 
 +  994  cd build/ 
 +  996  vi run_cmake 
 + 
 +#  Assume this is Linux: 
 + 
 +# serial, do on head node, with miniconda true, compile, install 
 +  cmake $AMBER_PREFIX/amber20_src \ 
 +    -DCMAKE_INSTALL_PREFIX=/share/apps/CENTOS8/ohpc/software/amber/20 \ 
 +    -DCOMPILER=GNU \
 +    -DMPI=FALSE -DCUDA=FALSE -DINSTALL_TESTS=TRUE \ 
 +    -DDOWNLOAD_MINICONDA=TRUE -DMINICONDA_USE_PY3=TRUE \ 
 +    2>&1 | tee  cmake.log 
 + 
 +# Env 
 + 
 +[hmeij@n100 ~]$ module load cuda/11.6 
 + 
 +[hmeij@n100 ~]$ echo $CUDA_HOME 
 +/usr/local/cuda 
 + 
 +[hmeij@n100 ~]$ which nvcc mpicc gcc 
 +/usr/local/cuda/bin/nvcc 
 +/opt/ohpc/pub/mpi/openmpi4-gnu9/4.1.1/bin/mpicc 
 +/opt/ohpc/pub/compiler/gcc/9.4.0/bin/gcc 
 + 
 +# [FIXED] cmake error on conda install, set to FALSE
 +# OS native python, install on n[100-101] 
 +-- Python version 3.9 -- OK 
 +-- Found PythonLibs: /usr/lib64/libpython3.9.so (found version "3.9.6")  
 +-- Checking for Python package numpy -- not found 
 +-- Checking for Python package scipy -- not found 
 +-- Checking for Python package matplotlib -- not found 
 +-- Checking for Python package setuptools -- found 
 +[END FIXED] 
 + 
 +# mpi & cuda FALSE builds serial 
 +./run_cmake 
 +make install 
 +# lots and lots of warnings 
 + 
 +# then 
 +source /share/apps/CENTOS8/ohpc/software/amber/20/amber.sh 
 + 
 +# on n100 now, parallel, set miniconda flags to FALSE 
 +-DMPI=TRUE
 +./run_cmake 
 +make install 
 + 
 +# on n100 just change cuda flag 
 +-DCUDA=TRUE
 +./run_cmake 
 +make install 
 + 
 +#tests 
 +cd $AMBERHOME 
 +make test.serial 
 +export DO_PARALLEL="mpirun -np 6" 
 +make test.parallel 
 +export CUDA_VISIBLE_DEVICES=0 
 +make test.cuda.serial 
 +make test.cuda.parallel 
 + 
 +</code> 
 + 
 +==== Amber22 ==== 
 + 
 +OpenHPC 
 + 
 +<code> 
 + 
 +# First install all the necessary packages (yum install ...)
 + 
 + 988  tar xvfj ../AmberTools22.tar.bz2  
 +  989  tar xvfj ../Amber22.tar.bz2  
 +  993  cd amber22_src/
 +  994  cd build/ 
 +  996  vi run_cmake 
 + 
 +#  Assume this is Linux: 
 + 
 +# serial, do on head node, with miniconda true, compile, install 
 +  cmake $AMBER_PREFIX/amber22_src \ 
 +    -DCMAKE_INSTALL_PREFIX=/share/apps/CENTOS8/ohpc/software/amber/22 \ 
 +    -DCOMPILER=GNU \
 +    -DMPI=FALSE -DCUDA=FALSE -DINSTALL_TESTS=TRUE \ 
 +    -DDOWNLOAD_MINICONDA=TRUE \ 
 +    2>&1 | tee  cmake.log 
 +./run_cmake 
 +make install 
 + 
 + 
 +# Note !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
 +The OpenMPI and MPICH system installations provided by CentOS  
 +(i.e., through yum install)  
 +are known to be somehow incompatible with Amber22. 
 +# OUCH !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
 + 
 + 
 +# GO TO node n100 
 + 
 +# copy head node's amber22_src/ to n100:/usr/local/src/tmp/ 
 + 
 + 
 +source /share/apps/CENTOS8/ohpc/software/amber/22/amber.sh 
 +echo $AMBERHOME 
 + 
 +# install latest openmpi version 
 +cd amber22_src/AmberTools/src
 +tar xvfj ../../../../openmpi-4.1.4.tar.bz2  
 + 
 +./configure_openmpi gnu # openhpc gcc/gfortran  
 + 
 + 
 +# on n100 now, parallel, set  
 +-DMPI=TRUE
 +-DDOWNLOAD_MINICONDA=FALSE 
 +./run_cmake 
 +make install 
 + 
 +# on n100 just change cuda flag 
 + 
 +[hmeij@n100 build]$ module load cuda/11.6 
 +[hmeij@n100 build]$ which gcc mpicc nvcc 
 +/opt/ohpc/pub/compiler/gcc/9.4.0/bin/gcc 
 +/share/apps/CENTOS8/ohpc/software/amber/22/bin/mpicc 
 +/usr/local/cuda/bin/nvcc 
 +[hmeij@n100 ~]$ echo $CUDA_HOME 
 +/usr/local/cuda 
 + 
 +-DMPI=TRUE
 +-DCUDA=TRUE
 +-DDOWNLOAD_MINICONDA=FALSE 
 +./run_cmake 
 +make install 
 + 
 + 
 +[hmeij@n100 ~]$ which nvcc mpicc gcc 
 +/usr/local/cuda/bin/nvcc 
 +/opt/ohpc/pub/mpi/openmpi4-gnu9/4.1.1/bin/mpicc 
 +/opt/ohpc/pub/compiler/gcc/9.4.0/bin/gcc 
 + 
 +#tests 
 +cd $AMBERHOME 
 +make test.serial 
 +export DO_PARALLEL="mpirun -np 6" 
 +make test.parallel 
 +export CUDA_VISIBLE_DEVICES=0 
 +make test.cuda.serial 
 +make test.cuda.parallel 
 + 
 +</code>
  
  
-\\ 
 **[[cluster:0|Back]]** **[[cluster:0|Back]]**
  
cluster/213.txt · Last modified: 2024/01/12 15:09 by hmeij07