User Tools

Site Tools




This is a new DMTCP plugin to checkpoint-restart CUDA applications with a novel split-process architecture.

CRAC consists of the plugin on top of DMTCP.
This software runs in the original directory

Compilation needs gcc version 8 or later (using 9.2.0 on CentOS 7, compute node n79)

# env on node n79

# download

# unzip
unzip ../
mv CRAC-early-development-master /share/apps/CENTOS7/dmtcp/3.0.0.b

# gcc 
 export PATH=/share/apps/CENTOS7/gcc/9.2.0/bin:$PATH
 export LD_LIBRARY_PATH=/share/apps/CENTOS7/gcc/9.2.0/lib64:$LD_LIBRARY_PATH

 export PATH=/share/apps/CENTOS7/openmpi/4.0.4/bin:$PATH
 export LD_LIBRARY_PATH=/share/apps/CENTOS7/openmpi/4.0.4/lib:$LD_LIBRARY_PATH
 export PATH=/share/apps/CENTOS7/python/3.8.3/bin:$PATH
 export LD_LIBRARY_PATH=/share/apps/CENTOS7/python/3.8.3/lib:$LD_LIBRARY_PATH

 export CUDA_HOME=/usr/local/cuda
 export PATH=/usr/local/cuda/bin:$PATH
 export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=/usr/local/cuda/lib:$LD_LIBRARY_PATH

# make in place
cd /share/apps/CENTOS7/dmtcp/3.0.0.b/
make  # no errors
$ ls bin
dmtcp_command      dmtcp_discover_rm  dmtcp_nocheckpoint  dmtcp_rm_loclaunch  dmtcp_ssh   mtcp_restart
dmtcp_coordinator  dmtcp_launch       dmtcp_restart       dmtcp_srun_helper   dmtcp_sshd

make check  # all failed, msg: checkpoint error ???
make check2 # /bin/sh: -c: line 11: syntax error near unexpected token `&'
make check3 # /bin/sh: -c: line 11: syntax error near unexpected token `&'

cd contrib/split-cuda
# edit Makefile set to gcc/g++ in PATH
make # no errors, but missing lib

$ ls    

# -lcuda -lcusparse -lcusolver -lcublas
# my 10.2 toolkit does not have cublas v11
# so linking against lowest version in hpc_sdk

# seems to have worked
$ ldd kernel-loader.exe => not found
# now => /usr/local/cuda/lib64/ (0x00007fc3b877a000)

Next, cobble together a GPU program like LAMMPS/Amber and test it on a GPU. Or you may have to wait for new compute nodes to arrive with the latest toolkit and redo this. The libcublas links need to be made on each node in the queue that n79 belongs to.

# oh well, nice try
cudaGetDeviceCount failed CUDA driver version is insufficient for CUDA runtime version

# manual with amber20 example

dmtcp_launch --new-coordinator \
--coord-port 0 --port-file /sanscratch/checkpoints/111/port.txt \
--ckptdir /sanscratch/checkpoints/111 --interval 600   pmemd.cuda \
-O -o mdout.$LSB_JOBID -inf mdinfo.1K10 -x mdcrd.1K10 -r restrt.1K10 -ref inpcrd


cluster/211.txt · Last modified: 2022/03/01 11:50 by hmeij07