This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision Next revision Both sides next revision | ||
cluster:213 [2022/03/16 13:09] hmeij07 |
cluster:213 [2023/03/03 18:11] hmeij07 |
||
---|---|---|---|
Line 2: | Line 2: | ||
**[[cluster: | **[[cluster: | ||
- | ===== New Head Node ===== | + | ==== New Head Node ==== |
We're embarking on a transition to a new head/login node name '' | We're embarking on a transition to a new head/login node name '' | ||
Line 8: | Line 8: | ||
Two new compute nodes (n100, n101) will be set up in a test queue. They each have four RTX5000 gpus which have the same architecture as our other gpus so all compiled software should work. These gpus have a 16 GB memory footprint (twice as large as other gpus we have). | Two new compute nodes (n100, n101) will be set up in a test queue. They each have four RTX5000 gpus which have the same architecture as our other gpus so all compiled software should work. These gpus have a 16 GB memory footprint (twice as large as other gpus we have). | ||
- | OpenHPC will be deployed next and I'll make some notes. We will switch to the Slurm scheduler. ([[cluster:207|Slurm Test Env]] and [[cluster: | + | OpenHPC will be deployed next and I'll make some notes. We will switch to the Slurm scheduler. ([[cluster:208|Slurm Test Env]] for users and [[cluster: |
- | ====== Config Recipe | + | Some pictures below. |
+ | |||
+ | ==== Config Recipe ==== | ||
Steps. "Ala n37" ... so the RTX nodes are similar to the K20 nodes and we can put the local software in place. See [[cluster: | Steps. "Ala n37" ... so the RTX nodes are similar to the K20 nodes and we can put the local software in place. See [[cluster: | ||
Line 40: | Line 42: | ||
scp 10.10.102.253:/ | scp 10.10.102.253:/ | ||
/ | / | ||
+ | |||
+ | # Put the warewulf cluster key in authorized_keys | ||
+ | # Put eth0 fingerprints in cottontail/ | ||
+ | # add to relevant known_hosts_servername file | ||
# configure private subnets and ping file server | # configure private subnets and ping file server | ||
cd / | cd / | ||
vi ifcfg-eth0 # 192.168.102.x | vi ifcfg-eth0 # 192.168.102.x | ||
- | vi ifcfg-eth1 # 10.10.102.x | + | vi ifcfg-eth1 # 10.10.102.x |
vi ifcfg-eth3 # 129.133.52.x | vi ifcfg-eth3 # 129.133.52.x | ||
+ | scp 192.168.102.112:/ | ||
+ | |||
systemctl restart network | systemctl restart network | ||
ping -c 3 192.168.102.42 | ping -c 3 192.168.102.42 | ||
Line 51: | Line 59: | ||
# make internet connection for yum | # make internet connection for yum | ||
- | # eth3 for ctt2 or eth1 for n100-101 | ||
- | dnf install bind-utils | ||
- | dig google.com | ||
- | |||
- | #rocky8 | ||
- | # https:// | ||
- | dnf config-manager --set-enabled powertools | ||
- | dnf install epel-release | ||
- | dnf install netcdf netcdf-devel | ||
- | dnf install yum-utils # yumdownloader | ||
- | dnf install ddd | ||
- | dnf install grace | ||
- | dnf install gnuplot | ||
- | dnf install alpine # pico | ||
# iptables | # iptables | ||
Line 72: | Line 66: | ||
systemctl start iptables # and enable | systemctl start iptables # and enable | ||
iptables -L | iptables -L | ||
- | systemctl stop firewalld | + | |
- | systemctl disable firewalld | + | |
+ | # eth3 for ctt2 or eth1 for n100-101 | ||
+ | dnf install bind-utils | ||
+ | dig google.com | ||
+ | iptables -L # check! | ||
+ | |||
# other configs | # other configs | ||
vi / | vi / | ||
mv /home /usr/local/ | mv /home /usr/local/ | ||
+ | cd /;ln -s / | ||
+ | cd /; ln -s /home /share | ||
vi /etc/passwd (exx, dockeruser $HOME) | vi /etc/passwd (exx, dockeruser $HOME) | ||
- | ## edit passwd, shadow, group, hosts files ## | ||
- | ## make -orig backups and stage in / | ||
- | ## cottontail2 = greentail52 sections | ||
+ | #exx96 | ||
mkdir /sanscratch / | mkdir /sanscratch / | ||
chmod ugo+rwx /sanscratch / | chmod ugo+rwx /sanscratch / | ||
chmod o+t /sanscratch / | chmod o+t /sanscratch / | ||
+ | # exx96 | ||
# link localscratch in 1.4T /home to / | # link localscratch in 1.4T /home to / | ||
- | mkdir /home | + | |
- | cd /home # local dir | + | cd /home |
ln -s / | ln -s / | ||
ln -s / | ln -s / | ||
ln -s / | ln -s / | ||
- | ln -s /zfshomes | + | ls -l |
+ | |||
+ | cat / | ||
# fstab file mounts | # fstab file mounts | ||
+ | mkdir -p /zfshomes /home66 /home33 /mindstore / | ||
+ | mkdir -p / | ||
# cottontail2 = greentail52 | # cottontail2 = greentail52 | ||
# n100-n101 = n79 | # n100-n101 = n79 | ||
- | # postfix | ||
- | dnf install postfix | ||
- | dnf install mailx | ||
- | systemctl enable postfix | ||
- | echo " | ||
+ | # on head node / | ||
+ | allow 192.168.0.0/ | ||
# compute nodes / | # compute nodes / | ||
#pool 2.pool.ntp.org iburst | #pool 2.pool.ntp.org iburst | ||
Server 192.168.102.250 | Server 192.168.102.250 | ||
Server 192.168.102.251 | Server 192.168.102.251 | ||
+ | # check | ||
+ | systemctl restart chronyd | ||
+ | chronyc sources | ||
+ | |||
+ | # Rocky8 | ||
+ | # https:// | ||
+ | dnf config-manager --set-enabled powertools -y | ||
+ | dnf install epel-release -y | ||
+ | dnf install netcdf netcdf-devel -y | ||
+ | dnf install yum-utils -y # yumdownloader | ||
+ | dnf install ddd grace gnuplot alpine -y # pico | ||
+ | |||
+ | yum groupinstall " | ||
+ | |||
+ | |||
+ | # on head node install from epel repo | ||
+ | ### yum install slurm-openlava | ||
+ | # error on conflicting libs, too bad! | ||
Line 114: | Line 134: | ||
yum install epel-release -y | yum install epel-release -y | ||
yum install flex bison -y | yum install flex bison -y | ||
- | yum install tcl tcl-devel dmtcp -y | + | yum install tcl tcl-devel |
yum install net-snmp net-snmp-libs net-tools net-snmp-utils -y | yum install net-snmp net-snmp-libs net-tools net-snmp-utils -y | ||
yum install freeglut-devel libXi-devel libXmu-devel -y | yum install freeglut-devel libXi-devel libXmu-devel -y | ||
Line 123: | Line 143: | ||
yum install cmake -y | yum install cmake -y | ||
yum install libjpeg libjpeg-devel libjpeg-turbo-devel -y | yum install libjpeg libjpeg-devel libjpeg-turbo-devel -y | ||
- | # amber | + | |
+ | #easybuild | ||
+ | yum install libibverbs libibverbs-devel | ||
+ | |||
+ | # amber20 cmake readline error fix needs | ||
+ | yum install ncurses-c++-libs-6.1-9.20180224.el8.x86_64.rpm \ | ||
+ | ncurses-devel-6.1-9.20180224.el8.x86_64.rpm \ | ||
+ | readline-devel-7.0-10.el8.x86_64.rpm | ||
+ | |||
+ | # amber20 | ||
yum -y install tcsh make \ | yum -y install tcsh make \ | ||
gcc gcc-gfortran gcc-c++ \ | gcc gcc-gfortran gcc-c++ \ | ||
Line 130: | Line 159: | ||
perl perl-ExtUtils-MakeMaker util-linux wget \ | perl perl-ExtUtils-MakeMaker util-linux wget \ | ||
bzip2 bzip2-devel zlib-devel tar | bzip2 bzip2-devel zlib-devel tar | ||
- | yum update -y | ||
- | yum clean all | ||
# CENTOS7 pick the kernel vendor used for now | # CENTOS7 pick the kernel vendor used for now | ||
Line 142: | Line 169: | ||
# compute nodes old level 3 | # compute nodes old level 3 | ||
systemctl set-default multi-user.target | systemctl set-default multi-user.target | ||
- | # remove internet, bring private back up | + | |
- | reboot | + | |
+ | # postfix | ||
+ | dnf install postfix | ||
+ | dnf install mailx | ||
+ | systemctl enable postfix | ||
+ | echo " | ||
+ | |||
+ | |||
+ | # edit / | ||
+ | rocommunity public | ||
+ | dontLogTCPWrappersConnects yes | ||
+ | # enable, start, add to zenoss | ||
# compute nodes only | # compute nodes only | ||
Line 161: | Line 200: | ||
# openjdk version " | # openjdk version " | ||
rpm -qa | grep ^java # check | rpm -qa | grep ^java # check | ||
+ | yum install java-1.8.0-openjdk java-1.8.0-openjdk-devel \ | ||
+ | java-1.8.0-openjdk-headless javapackages-filesystem | ||
# python v 3.9 | # python v 3.9 | ||
yum install python39 python39-devel | yum install python39 python39-devel | ||
+ | ln -s / | ||
# fftw 3.3.5-11.el8 | # fftw 3.3.5-11.el8 | ||
yum install fftw fftw-devel | yum install fftw fftw-devel | ||
Line 171: | Line 213: | ||
# obabel chem file formats | # obabel chem file formats | ||
yum install openbabel openbabel-devel | yum install openbabel openbabel-devel | ||
- | # dmtcp | + | |
- | yum install dmtcp dmtcp-devel | + | |
yum clean all | yum clean all | ||
- | reboot | + | # eth3 onboot=no, private networks only |
+ | systemctl disable iptables | ||
+ | |||
+ | # now make it an ohpc compute node | ||
+ | yum repolist | ||
+ | yum install singularity-ohpc | ||
+ | yum install ohpc-base-compute --nobest | ||
+ | | ||
+ | scp cottontail2:/ | ||
+ | yum install ohpc-slurm-client | ||
+ | # check status of service munge | ||
+ | rpm -ivh / | ||
+ | systemctl enable munge | ||
+ | systemctl start munge | ||
+ | scp cottontail2:/ | ||
+ | echo SLURMD_OPTIONS=" | ||
+ | yum install --allowerasing lmod-ohpc | ||
+ | grep '/ | ||
+ | mkdir / | ||
+ | chown slurm:munge / | ||
+ | mkdir / | ||
+ | chown slurm:munge / | ||
+ | scp cottontail2:/ | ||
+ | scp cottontail2:/ | ||
+ | scp cottontail2:/ | ||
+ | | ||
+ | # /etc/bashrc add | ||
+ | # ohpc lmod gcc mpicc | ||
+ | export PATH=/ | ||
+ | export LD_LIBRARY_PATH=/ | ||
+ | |||
+ | | ||
+ | # / | ||
+ | / | ||
+ | |||
+ | #test | ||
+ | / | ||
+ | | ||
+ | # start via rc.local (already copied) | ||
+ | #chmod +x / | ||
+ | #timing issue with munge | ||
+ | #sleep 15 | ||
+ | #/ | ||
+ | |||
+ | systemctl stop firewalld | ||
+ | systemctl disable firewalld | ||
+ | |||
+ | systemctl | ||
+ | systemctl stop dnf-makecache.timer | ||
+ | |||
+ | mv / | ||
+ | mv / | ||
+ | |||
+ | |||
+ | ## edit passwd, shadow, group, hosts files ## | ||
+ | ## make -orig backups and stage in / | ||
+ | ## cottontail2 = greentail52 sections | ||
+ | chown -R munge:munge /etc/munge / | ||
+ | chown -R slurm:munge / | ||
+ | |||
+ | |||
+ | | ||
+ | # slurmd ??? | ||
+ | libhwloc.so.15 => / | ||
+ | |||
+ | # crontab | ||
+ | |||
+ | # ionice gaussian | ||
+ | 0,15,30,45 * * * * / | ||
+ | |||
+ | # cpu temps | ||
+ | 40 * * * * / | ||
+ | |||
+ | on compute node / | ||
+ | * - | ||
+ | |||
+ | |||
+ | # file date_ctt2.sh | ||
+ | |||
+ | # ctt /etc/pdsh | ||
+ | |||
+ | # ctt:/ | ||
+ | |||
+ | # ctt2:/ | ||
</ | </ | ||
+ | ==== Pics ==== | ||
+ | |||
+ | |||
+ | My data center robot thingie and node n100's gpus\\ | ||
\\ | \\ | ||
+ | |||
+ | {{: | ||
+ | \\ | ||
+ | {{: | ||
+ | \\ | ||
+ | |||
+ | ==== Amber20 ==== | ||
+ | |||
+ | OpenHPC | ||
+ | |||
+ | < | ||
+ | |||
+ | # First install **all the necessary packages** (yum install...) | ||
+ | |||
+ | | ||
+ | 989 tar xvfj ../ | ||
+ | 993 cd amber20_src/ | ||
+ | 994 cd build/ | ||
+ | 996 vi run_cmake | ||
+ | |||
+ | # Assume this is Linux: | ||
+ | |||
+ | # serial, do on head node, with miniconda true, compile, install | ||
+ | cmake $AMBER_PREFIX/ | ||
+ | -DCMAKE_INSTALL_PREFIX=/ | ||
+ | -DCOMPILER=GNU | ||
+ | -DMPI=FALSE -DCUDA=FALSE -DINSTALL_TESTS=TRUE \ | ||
+ | -DDOWNLOAD_MINICONDA=TRUE -DMINICONDA_USE_PY3=TRUE \ | ||
+ | 2>&1 | tee cmake.log | ||
+ | |||
+ | # Env | ||
+ | |||
+ | [hmeij@n100 ~]$ module load cuda/11.6 | ||
+ | |||
+ | [hmeij@n100 ~]$ echo $CUDA_HOME | ||
+ | / | ||
+ | |||
+ | [hmeij@n100 ~]$ which nvcc mpicc gcc | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | |||
+ | # [FIXED] cmake error on conda install, set to FALSE | ||
+ | # OS native python, install on n[100-101] | ||
+ | -- Python version 3.9 -- OK | ||
+ | -- Found PythonLibs: / | ||
+ | -- Checking for Python package numpy -- not found | ||
+ | -- Checking for Python package scipy -- not found | ||
+ | -- Checking for Python package matplotlib -- not found | ||
+ | -- Checking for Python package setuptools -- found | ||
+ | [END FIXED] | ||
+ | |||
+ | # mpi & cuda FALSE builds serial | ||
+ | ./run_cmake | ||
+ | make install | ||
+ | # lots and lots of warnings | ||
+ | |||
+ | # then | ||
+ | source / | ||
+ | |||
+ | # on n100 now, parallel, set miniconda flags to FALSE | ||
+ | -MPI=TRUE | ||
+ | ./run_cmake | ||
+ | make install | ||
+ | |||
+ | # on n100 just change cuda flag | ||
+ | -CUDA=TRUE | ||
+ | ./run_cmake | ||
+ | make install | ||
+ | |||
+ | #tests | ||
+ | cd $AMBERHOME | ||
+ | make test.serial | ||
+ | export DO_PARALLEL=" | ||
+ | make test.parallel | ||
+ | export CUDA_VISIBLE_DEVICES=0 | ||
+ | make test.cuda.serial | ||
+ | make test.cuda.parallel | ||
+ | |||
+ | </ | ||
+ | |||
+ | ==== Amber22 ==== | ||
+ | |||
+ | OpenHPC | ||
+ | |||
+ | < | ||
+ | |||
+ | # First install **all the necessary packages** (yum install...) | ||
+ | |||
+ | | ||
+ | 989 tar xvfj ../ | ||
+ | 993 cd amber22_src/ | ||
+ | 994 cd build/ | ||
+ | 996 vi run_cmake | ||
+ | |||
+ | # Assume this is Linux: | ||
+ | |||
+ | # serial, do on head node, with miniconda true, compile, install | ||
+ | cmake $AMBER_PREFIX/ | ||
+ | -DCMAKE_INSTALL_PREFIX=/ | ||
+ | -DCOMPILER=GNU | ||
+ | -DMPI=FALSE -DCUDA=FALSE -DINSTALL_TESTS=TRUE \ | ||
+ | -DDOWNLOAD_MINICONDA=TRUE \ | ||
+ | 2>&1 | tee cmake.log | ||
+ | ./run_cmake | ||
+ | make install | ||
+ | |||
+ | |||
+ | # Note !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | ||
+ | The OpenMPI and MPICH system installations provided by CentOS | ||
+ | (i.e., through yum install) | ||
+ | are known to be somehow incompatible with Amber22. | ||
+ | # OUCH !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | ||
+ | |||
+ | |||
+ | # GO TO node n100 | ||
+ | |||
+ | # copy head node's amber22_src/ | ||
+ | |||
+ | |||
+ | source / | ||
+ | echo $AMBERHOME | ||
+ | |||
+ | # install latest openmpi version | ||
+ | cd amber_src/ | ||
+ | tar xvfj ../ | ||
+ | |||
+ | ./ | ||
+ | |||
+ | |||
+ | # on n100 now, parallel, set | ||
+ | -MPI=TRUE | ||
+ | -DDOWNLOAD_MINICONDA=FALSE | ||
+ | ./run_cmake | ||
+ | make install | ||
+ | |||
+ | # on n100 just change cuda flag | ||
+ | |||
+ | [hmeij@n100 build]$ module load cuda/11.6 | ||
+ | [hmeij@n100 build]$ which gcc mpicc nvcc | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | [hmeij@n100 ~]$ echo $CUDA_HOME | ||
+ | / | ||
+ | |||
+ | -MPI=TRUE | ||
+ | -CUDA=TRUE | ||
+ | -DDOWNLOAD_MINICONDA=FALSE | ||
+ | ./run_cmake | ||
+ | make install | ||
+ | |||
+ | |||
+ | [hmeij@n100 ~]$ which nvcc mpicc gcc | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | |||
+ | #tests | ||
+ | cd $AMBERHOME | ||
+ | make test.serial | ||
+ | export DO_PARALLEL=" | ||
+ | make test.parallel | ||
+ | export CUDA_VISIBLE_DEVICES=0 | ||
+ | make test.cuda.serial | ||
+ | make test.cuda.parallel | ||
+ | |||
+ | </ | ||
+ | |||
+ | |||
**[[cluster: | **[[cluster: | ||