This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision | ||
cluster:148 [2016/04/07 15:27] hmeij07 |
cluster:148 [2020/01/24 18:36] (current) hmeij07 |
||
---|---|---|---|
Line 3: | Line 3: | ||
==== BLCR Checkpoint in OL3 ==== | ==== BLCR Checkpoint in OL3 ==== | ||
+ | |||
+ | **Deprecated since we did [[cluster: | ||
+ | We will replace it with [[cluster: | ||
+ | --- // | ||
* This page concerns PARALLEL mpirun jobs only; there are some restrictions | * This page concerns PARALLEL mpirun jobs only; there are some restrictions | ||
Line 207: | Line 211: | ||
</ | </ | ||
- | ==== Parallel Wrapper ==== | + | |
+ | ==== Parallel Wrapper | ||
+ | |||
+ | A bit more verbose, with added error handling. Also the blcr_wrapper or the cr_checkpoint loop code can now terminate the job. | ||
+ | |||
+ | < | ||
+ | |||
+ | #!/bin/bash -x | ||
+ | rm -f err out | ||
+ | # work dir and cwd | ||
+ | export MYSANSCRATCH=/ | ||
+ | cd $MYSANSCRATCH | ||
+ | |||
+ | # at job finish, all content in / | ||
+ | # will be copied to / | ||
+ | # content older than 3 months will be removed | ||
+ | |||
+ | # SCHEDULER set queue name in next TWO lines | ||
+ | queue=hp12 | ||
+ | #BSUB -q hp12 | ||
+ | #BSUB -n 6 | ||
+ | #BSUB -J test | ||
+ | #BSUB -o out | ||
+ | #BSUB -e err | ||
+ | # next required for mpirun checkpoint to work | ||
+ | # restarts must use same node (not sure why) | ||
+ | #BSUB -R " | ||
+ | #BSUB -m n5 | ||
+ | |||
+ | # CHECK POINT TIME INTERVAL: 10m (debug) 6h 12h 18h 1d | ||
+ | cpti=15m | ||
+ | |||
+ | # COPY APPLICATION TO WORK DIR $MYSANSCRATCH (cwd) | ||
+ | # always stage the application (and data if needed) | ||
+ | # if mpirun save_exec=" | ||
+ | save_exec=" | ||
+ | pre_cmd=" | ||
+ | $HOME/ | ||
+ | $HOME/ | ||
+ | $HOME/ | ||
+ | $HOME/ | ||
+ | $HOME/ | ||
+ | $HOME/ | ||
+ | $HOME/ | ||
+ | $HOME/ | ||
+ | $HOME/ | ||
+ | $HOME/ | ||
+ | post_cmd=" | ||
+ | |||
+ | |||
+ | # IF START OF JOB, UNCOMMENT | ||
+ | # its either start or restart block | ||
+ | # | ||
+ | #cmd=" python mpi_run_models.py / | ||
+ | |||
+ | # IF RESTART OF JOB, UNCOMMENT, MUST BE RUN ON SAME NODE | ||
+ | # you must have pwd.JOBPID and chk.JOBPID in $orgjobpid/ | ||
+ | mode=restart | ||
+ | orgjobpid=636341 | ||
+ | |||
+ | # user environment | ||
+ | export PYTHONHOME=/ | ||
+ | export PYTHONPATH=/ | ||
+ | export PATH=$PYTHONHOME/ | ||
+ | . / | ||
+ | export PATH=$MIRBIN: | ||
+ | which python | ||
+ | |||
+ | |||
+ | ############### | ||
+ | |||
+ | |||
+ | |||
+ | # checkpoints | ||
+ | checkpoints=/ | ||
+ | |||
+ | # kernel modules | ||
+ | mods=`/ | ||
+ | if [ $mods -ne 2 ]; then | ||
+ | echo " | ||
+ | kill $$ | ||
+ | fi | ||
+ | |||
+ | # blcr setup | ||
+ | restore_options="" | ||
+ | # | ||
+ | if [ $save_exec == " | ||
+ | # | ||
+ | save_options=" | ||
+ | else | ||
+ | save_options=" | ||
+ | fi | ||
+ | |||
+ | # environment | ||
+ | export PATH=/ | ||
+ | export LD_LIBRARY_PATH=/ | ||
+ | |||
+ | export PATH=/ | ||
+ | export LD_LIBRARY_PATH=/ | ||
+ | |||
+ | which mpirun cr_mpirun ompi-checkpoint ompi-restart cr_checkpoint cr_restart | ||
+ | |||
+ | # setup checkpoints dir | ||
+ | if [ ! -d $checkpoints/ | ||
+ | mkdir -p $checkpoints/ | ||
+ | else | ||
+ | echo " | ||
+ | kill $$ | ||
+ | fi | ||
+ | |||
+ | # save process id and path and start application | ||
+ | if [ " | ||
+ | # hostfile | ||
+ | echo " | ||
+ | tr '\/ ' ' | ||
+ | c=`wc -l $HOME/ | ||
+ | for i in `seq 1 $c`; do echo ' | ||
+ | $pre_cmd | ||
+ | # why | ||
+ | rm -f / | ||
+ | cr_mpirun -v -am ft-enable-cr --gmca snapc_base_global_snapshot_dir $checkpoints/ | ||
+ | -x LD_LIBRARY_PATH --hostfile $HOME/ | ||
+ | pid=$! | ||
+ | pwd > $checkpoints/ | ||
+ | orgjobpid=0 | ||
+ | |||
+ | # otherwise restart the job | ||
+ | elif [ " | ||
+ | orgpid=`ls $checkpoints/ | ||
+ | orgpwd=`cat $checkpoints/ | ||
+ | if [ " | ||
+ | echo " | ||
+ | exit | ||
+ | fi | ||
+ | # cleanup old if present | ||
+ | rm -rf / | ||
+ | rm -f $HOME/ | ||
+ | # why | ||
+ | rm -f / | ||
+ | # stage old | ||
+ | scp $checkpoints/ | ||
+ | scp -r $checkpoints/ | ||
+ | ln -s $MYSANSCRATCH / | ||
+ | scp $checkpoints/ | ||
+ | scp -r $checkpoints/ | ||
+ | # why | ||
+ | scp $checkpoints/ | ||
+ | ln -s / | ||
+ | c=`wc -l $HOME/ | ||
+ | for i in `seq 1 $c`; do echo ' | ||
+ | cr_restart --kmsg-warning $restore_options --relocate $orgpwd=$MYSANSCRATCH --cont \ | ||
+ | $MYSANSCRATCH/ | ||
+ | pid=$! | ||
+ | started=`ps -u $USER | awk ' | ||
+ | if [ $started -ne 1 ]; then | ||
+ | echo " | ||
+ | kill $$ | ||
+ | fi | ||
+ | pwd > $checkpoints/ | ||
+ | |||
+ | # obviously | ||
+ | else | ||
+ | echo " | ||
+ | kill $$ | ||
+ | fi | ||
+ | |||
+ | # if $cmd disappears during $pcit, terminate wrapper | ||
+ | export POST_CMD=" | ||
+ | blcr_watcher $pid $$ $LSB_JOBID $orgjobpid & | ||
+ | bw_pid=$! | ||
+ | |||
+ | # always run this block | ||
+ | while [ true ]; do | ||
+ | # checkpoint time interval | ||
+ | sleep $cpti | ||
+ | # finished? | ||
+ | no_pid=`ps -u $USER | grep $pid | awk ' | ||
+ | if [ " | ||
+ | # save output | ||
+ | scp -rp $MYSANSCRATCH/ | ||
+ | $POST_CMD | ||
+ | kill $bw_pid | ||
+ | rm -f $HOME/ | ||
+ | exit | ||
+ | fi | ||
+ | # checkpoint file outside of sanscratch | ||
+ | scp -r $MYSANSCRATCH/ | ||
+ | scp -r / | ||
+ | chmod u+w $checkpoints/ | ||
+ | # why | ||
+ | scp / | ||
+ | cr_checkpoint -v --tree --cont $save_options -f $checkpoints/ | ||
+ | 2>> | ||
+ | scp $HOME/ | ||
+ | scp $HOME/ | ||
+ | scp -r / | ||
+ | # why | ||
+ | scp / | ||
+ | date >> $checkpoints/ | ||
+ | done | ||
+ | |||
+ | |||
+ | </ | ||
+ | |||
+ | ==== Parallel Wrapper v1 ==== | ||
< | < |