This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision Next revision Both sides next revision | ||
cluster:213 [2022/03/16 13:43] hmeij07 [Pics] |
cluster:213 [2022/05/10 19:18] hmeij07 |
||
---|---|---|---|
Line 42: | Line 42: | ||
scp 10.10.102.253:/ | scp 10.10.102.253:/ | ||
/ | / | ||
+ | |||
+ | # Put the warewulf cluster key in authorized_keys | ||
+ | # Put eth0 fingerprints in cottontail/ | ||
+ | # add to relevant known_hosts_servername file | ||
# configure private subnets and ping file server | # configure private subnets and ping file server | ||
Line 53: | Line 57: | ||
# make internet connection for yum | # make internet connection for yum | ||
+ | |||
+ | # iptables | ||
+ | dnf install -y iptables-services | ||
+ | vi / | ||
+ | # add 'local allow' ports --dport 0:65535 | ||
+ | systemctl start iptables # and enable | ||
+ | iptables -L | ||
+ | systemctl stop firewalld | ||
+ | systemctl disable firewalld | ||
+ | |||
+ | |||
# eth3 for ctt2 or eth1 for n100-101 | # eth3 for ctt2 or eth1 for n100-101 | ||
dnf install bind-utils | dnf install bind-utils | ||
dig google.com | dig google.com | ||
+ | iptables -L # check! | ||
- | #rocky8 | + | # Rocky8 |
# https:// | # https:// | ||
dnf config-manager --set-enabled powertools | dnf config-manager --set-enabled powertools | ||
Line 67: | Line 83: | ||
dnf install gnuplot | dnf install gnuplot | ||
dnf install alpine # pico | dnf install alpine # pico | ||
- | + | yum groupinstall " | |
- | # iptables | + | |
- | dnf install -y iptables-services | + | |
- | vi / | + | |
- | # add 'local allow' ports --dport 0:65535 | + | |
- | systemctl start iptables # and enable | + | |
- | iptables -L | + | |
- | systemctl stop firewalld | + | |
- | systemctl disable firewalld | + | |
# other configs | # other configs | ||
Line 107: | Line 115: | ||
echo " | echo " | ||
+ | # on head node / | ||
+ | allow 192.168.0.0/ | ||
# compute nodes / | # compute nodes / | ||
#pool 2.pool.ntp.org iburst | #pool 2.pool.ntp.org iburst | ||
Server 192.168.102.250 | Server 192.168.102.250 | ||
Server 192.168.102.251 | Server 192.168.102.251 | ||
+ | # check | ||
+ | chronyc sources | ||
+ | |||
+ | |||
+ | # on head node install from epel repo | ||
+ | yum install slurm-openlava | ||
+ | # error on conflicting libs, too bad! | ||
Line 125: | Line 142: | ||
yum install cmake -y | yum install cmake -y | ||
yum install libjpeg libjpeg-devel libjpeg-turbo-devel -y | yum install libjpeg libjpeg-devel libjpeg-turbo-devel -y | ||
- | # amber | + | |
+ | #easybuild | ||
+ | yum install libibverbs libibverbs-devel | ||
+ | |||
+ | # amber20 cmake readline error fix needs | ||
+ | yum install ncurses-c++-libs-6.1-9.20180224.el8.x86_64.rpm \ | ||
+ | ncurses-devel-6.1-9.20180224.el8.x86_64.rpm \ | ||
+ | readline-devel-7.0-10.el8.x86_64.rpm | ||
+ | |||
+ | # amber20 | ||
yum -y install tcsh make \ | yum -y install tcsh make \ | ||
gcc gcc-gfortran gcc-c++ \ | gcc gcc-gfortran gcc-c++ \ | ||
Line 132: | Line 158: | ||
perl perl-ExtUtils-MakeMaker util-linux wget \ | perl perl-ExtUtils-MakeMaker util-linux wget \ | ||
bzip2 bzip2-devel zlib-devel tar | bzip2 bzip2-devel zlib-devel tar | ||
- | yum update -y | ||
- | yum clean all | ||
# CENTOS7 pick the kernel vendor used for now | # CENTOS7 pick the kernel vendor used for now | ||
Line 144: | Line 168: | ||
# compute nodes old level 3 | # compute nodes old level 3 | ||
systemctl set-default multi-user.target | systemctl set-default multi-user.target | ||
- | # remove internet, bring private back up | ||
- | reboot | ||
# compute nodes only | # compute nodes only | ||
Line 163: | Line 185: | ||
# openjdk version " | # openjdk version " | ||
rpm -qa | grep ^java # check | rpm -qa | grep ^java # check | ||
+ | yum install java-1.8.0-openjdk java-1.8.0-openjdk-devel \ | ||
+ | java-1.8.0-openjdk-headless javapackages-filesystem | ||
# python v 3.9 | # python v 3.9 | ||
yum install python39 python39-devel | yum install python39 python39-devel | ||
+ | ln -s / | ||
# fftw 3.3.5-11.el8 | # fftw 3.3.5-11.el8 | ||
yum install fftw fftw-devel | yum install fftw fftw-devel | ||
Line 175: | Line 200: | ||
# dmtcp | # dmtcp | ||
yum install dmtcp dmtcp-devel | yum install dmtcp dmtcp-devel | ||
+ | |||
+ | # check status of service munge | ||
yum clean all | yum clean all | ||
+ | # eth3 onboot=no, private networks only | ||
+ | systemctl disable iptables | ||
reboot | reboot | ||
+ | # now make it an ohpc compute node | ||
+ | yum repolist | ||
+ | yum install ohpc-base-compute | ||
+ | | ||
+ | scp cottontail2:/ | ||
+ | yum install ohpc-slurm-client | ||
+ | systemctl enable munge | ||
+ | systemctl start munge | ||
+ | scp cottontail2:/ | ||
+ | echo SLURMD_OPTIONS=" | ||
+ | yum install --allowerasing lmod-ohpc | ||
+ | grep '/ | ||
+ | mkdir / | ||
+ | chown slurm:munge / | ||
+ | mkdir / | ||
+ | chown slurm:munge / | ||
+ | scp cottontail2:/ | ||
+ | scp cottontail2:/ | ||
+ | scp cottontail2:/ | ||
+ | |||
+ | #test | ||
+ | / | ||
+ | | ||
+ | # start via rc.local | ||
+ | chmod +x / | ||
+ | #timing issue with munge | ||
+ | sleep 15 | ||
+ | / | ||
+ | | ||
+ | # slurmd ??? | ||
+ | libhwloc.so.15 => / | ||
+ | |||
+ | # add to zenoss edit / | ||
+ | rocommunity public | ||
+ | dontLogTCPWrappersConnects yes | ||
</ | </ | ||
Line 185: | Line 249: | ||
- | My data center robot thingie...\\ | + | My data center robot thingie |
- | {{: | + | \\ |
- | Node n100's gpus...\\ | + | {{: |
+ | \\ | ||
+ | {{:cluster:n100.jpg? | ||
+ | \\ | ||
- | {{:cluster:n100.jpg?400 |}} | + | ==== Amber20 ==== |
+ | |||
+ | OpenHPC | ||
+ | |||
+ | < | ||
+ | |||
+ | | ||
+ | 989 tar xvfj ../ | ||
+ | 993 cd amber20_src/ | ||
+ | 994 cd build/ | ||
+ | 996 vi run_cmake | ||
+ | |||
+ | # Assume this is Linux: | ||
+ | |||
+ | # serial, do on head node, pull down miniconda, compile, install | ||
+ | cmake $AMBER_PREFIX/ | ||
+ | -DCMAKE_INSTALL_PREFIX=/ | ||
+ | -DCOMPILER=GNU | ||
+ | -DMPI=FALSE -DCUDA=FALSE -DINSTALL_TESTS=TRUE \ | ||
+ | -DDOWNLOAD_MINICONDA=TRUE -DMINICONDA_USE_PY3=TRUE \ | ||
+ | 2>&1 | tee cmake.log | ||
+ | |||
+ | # Env | ||
+ | |||
+ | [hmeij@n100 ~]$ module load cuda/11.6 | ||
+ | |||
+ | [hmeij@n100 ~]$ echo $CUDA_HOME | ||
+ | / | ||
+ | |||
+ | [hmeij@n100 ~]$ which nvcc mpicc gcc | ||
+ | / | ||
+ | / | ||
+ | / | ||
+ | |||
+ | # [FIXED] cmake error on conda install, set to FALSE | ||
+ | # OS native python, install on n[100-101] | ||
+ | -- Python version 3.9 -- OK | ||
+ | -- Found PythonLibs: / | ||
+ | -- Checking for Python package numpy -- not found | ||
+ | -- Checking for Python package scipy -- not found | ||
+ | -- Checking for Python package matplotlib -- not found | ||
+ | -- Checking for Python package setuptools -- found | ||
+ | [END FIXED] | ||
+ | |||
+ | # mpi & cuda FALSE builds serial | ||
+ | ./ | ||
+ | make install | ||
+ | # lots and lots of warnings | ||
+ | |||
+ | # then | ||
+ | source / | ||
+ | |||
+ | # on n100 now, parallel, set miniconda flags to FALSE | ||
+ | -DMPI=TRUE | ||
+ | ./ | ||
+ | make install | ||
+ | |||
+ | # on n100 just change cuda flag | ||
+ | -DCUDA=TRUE | ||
+ | ./ | ||
+ | make install | ||
+ | |||
+ | #tests | ||
+ | cd $AMBERHOME | ||
+ | make test.serial | ||
+ | export DO_PARALLEL=" | ||
+ | make test.parallel | ||
+ | export CUDA_VISIBLE_DEVICES=0 | ||
+ | make test.cuda.serial | ||
+ | make test.cuda.parallel | ||
+ | |||
+ | </ | ||
**[[cluster: | **[[cluster: | ||