User Tools

Site Tools


cluster:211

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Next revision
Previous revision
cluster:211 [2022/02/28 15:47]
hmeij07 [DMTCP CRAC]
cluster:211 [2022/03/01 16:50] (current)
hmeij07 [DMTCP CRAC]
Line 17: Line 17:
  
 # env on node n79 CRAC-early-development-master.zip # env on node n79 CRAC-early-development-master.zip
 +
 +# download
 +wget https://github.com/DMTCP-CRAC/CRAC-early-development/archive/master.zip
 +
 +# unzip
 +unzip ../dmtcp-crac-master.zip
 +mv CRAC-early-development-master /share/apps/CENTOS7/dmtcp/3.0.0.b
 +
 +# gcc 
 + export PATH=/share/apps/CENTOS7/gcc/9.2.0/bin:$PATH
 + export LD_LIBRARY_PATH=/share/apps/CENTOS7/gcc/9.2.0/lib64:\
 +/share/apps/CENTOS7/gcc/9.2.0/libexec:\
 +/share/apps/CENTOS7/gcc/9.2.0/lib:\
 +$LD_LIBRARY_PATH
 +
  
  export PATH=/share/apps/CENTOS7/openmpi/4.0.4/bin:$PATH  export PATH=/share/apps/CENTOS7/openmpi/4.0.4/bin:$PATH
Line 32: Line 47:
 cd /share/apps/CENTOS7/dmtcp/3.0.0.b/ cd /share/apps/CENTOS7/dmtcp/3.0.0.b/
 ./configure ./configure
-make+make  # no errors
 $ ls bin $ ls bin
 dmtcp_command      dmtcp_discover_rm  dmtcp_nocheckpoint  dmtcp_rm_loclaunch  dmtcp_ssh   mtcp_restart dmtcp_command      dmtcp_discover_rm  dmtcp_nocheckpoint  dmtcp_rm_loclaunch  dmtcp_ssh   mtcp_restart
 dmtcp_coordinator  dmtcp_launch       dmtcp_restart       dmtcp_srun_helper   dmtcp_sshd dmtcp_coordinator  dmtcp_launch       dmtcp_restart       dmtcp_srun_helper   dmtcp_sshd
  
-make check +make check  # all failed, msg: checkpoint error ??? 
-make check2 +make check2 # /bin/sh: -c: line 11: syntax error near unexpected token `&' 
-make check3+make check3 # /bin/sh: -c: line 11: syntax error near unexpected token `&' 
 + 
 +cd contrib/split-cuda 
 +# edit the Makefile: set the compilers to the gcc/g++ found in PATH 
 +make # no errors, but missing lib 
 + 
 +$ ls              
 +libdmtcp_split-cuda.so 
 +kernel-loader.exe   
 +libcuda_wrappers.so  
 + 
 +# -lcuda -lcusparse -lcusolver -lcublas 
 +# my 10.2 toolkit does not have cublas v11 
 +# so linking against lowest version in hpc_sdk 
 + 
 +# seems to have worked 
 +$ ldd kernel-loader.exe  
 + libcublas.so.11 => not found 
 +# now 
 + libcublas.so.11 => /usr/local/cuda/lib64/libcublas.so.11 (0x00007fc3b877a000) 
  
 </code> </code>
  
 +Next, cobble together a GPU program such as LAMMPS or Amber and test it on a GPU. Otherwise you may have to wait for the new compute nodes to arrive with the latest toolkit and redo this. The libcublas links need to be made on each node in the queue that ''n79'' belongs to.
  
 +<code>
  
 +# oh well, nice try
 +cudaGetDeviceCount failed CUDA driver version is insufficient for CUDA runtime version
 +
 +# manual with amber20 example
 +
 +dmtcp_launch --new-coordinator \
 +--coord-port 0 --port-file /sanscratch/checkpoints/111/port.txt \
 +--ckptdir /sanscratch/checkpoints/111 --interval 600   pmemd.cuda \
 +-O -o mdout.$LSB_JOBID -inf mdinfo.1K10 -x mdcrd.1K10 -r restrt.1K10 -ref inpcrd
 +
 +
 +</code>
  
 \\ \\
cluster/211.1646063225.txt.gz · Last modified: 2022/02/28 15:47 by hmeij07