User Tools

Site Tools


cluster:148

Warning: Undefined array key -1 in /usr/share/dokuwiki/inc/html.php on line 1458

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Next revision
Previous revision
cluster:148 [2016/04/07 11:27]
hmeij07
cluster:148 [2020/01/24 13:36] (current)
hmeij07
Line 3: Line 3:
  
 ==== BLCR Checkpoint in OL3 ==== ==== BLCR Checkpoint in OL3 ====
 +
 +**Deprecated since we did [[cluster:185|OS Update]] \\
 +We will replace it with [[cluster:190|DMTCP]] ** \\
 + --- //[[hmeij@wesleyan.edu|Henk]] 2020/01/14 14:31//
  
   * This page concerns PARALLEL mpirun jobs only; there are some restrictions   * This page concerns PARALLEL mpirun jobs only; there are some restrictions
Line 207: Line 211:
 </code> </code>
  
-==== Parallel Wrapper ====+ 
 +==== Parallel Wrapper v2 ==== 
 + 
 +A bit more verbose, with more error handling. Also, either the blcr_wrapper or the cr_checkpoint loop code can now terminate the job. 
 + 
 +<code> 
 + 
 +#!/bin/bash -x  
 +# LSF batch wrapper: stages an application into the per-job scratch directory 
 +# and starts (or restarts) it under BLCR checkpointing. Only the settings above 
 +# the "NOTHING TO EDIT BELOW THIS LINE" marker are meant to be changed. 
 +# (-x on the shebang makes bash trace every command it runs) 
 +# remove stale err/out files (the names given to #BSUB -e/-o below) 
 +# from the directory the script starts in 
 +rm -f err out 
 +# work dir and cwd 
 +export MYSANSCRATCH=/sanscratch/$LSB_JOBID 
 +cd $MYSANSCRATCH 
 + 
 +# at job finish, all content in /sanscratch/JOBPID 
 +# will be copied to /sanscratch/checkpoints/JOBPID 
 +# content older than 3 months will be removed 
 + 
 +# SCHEDULER set queue name in next TWO lines 
 +# ($queue is also used further down to pick the BLCR install directory) 
 +queue=hp12 
 +#BSUB -q hp12 
 +#BSUB -n 6 
 +#BSUB -J test 
 +#BSUB -o out 
 +#BSUB -e err 
 +# next required for mpirun checkpoint to work 
 +# restarts must use same node (not sure why) 
 +#BSUB -R "span[hosts=1]" 
 +#BSUB -m n5 
 + 
 +# CHECK POINT TIME INTERVAL: 10m (debug) 6h 12h 18h 1d  
 +# (the value is handed to sleep between checkpoints) 
 +cpti=15m 
 + 
 +# COPY APPLICATION TO WORK DIR $MYSANSCRATCH (cwd) 
 +# always stage the application (and data if needed) 
 +# if mpirun save_exec="n" (default) 
 +# ("n" selects cr_checkpoint --save-none, any other value --save-all) 
 +save_exec="n" 
 +# pre_cmd is expanded unquoted in start mode, so the line breaks 
 +# inside the string only separate the scp arguments 
 +pre_cmd=" scp -r 
 +$HOME/python/kflaherty/data/HD163296.CO32.regridded.cen15.vis 
 +$HOME/python/kflaherty/data/HD163296.CO32.regridded.cen15.vis.fits 
 +$HOME/python/kflaherty/data/lowres_ALMA_weights_calc.sav 
 +$HOME/python/kflaherty/co.dat 
 +$HOME/python/kflaherty/disk_other.py 
 +$HOME/python/kflaherty/disk.py 
 +$HOME/python/kflaherty/mol_dat.py 
 +$HOME/python/kflaherty/mpi_run_models.py 
 +$HOME/python/kflaherty/sample_co32.sh 
 +$HOME/python/kflaherty/single_model.py . " 
 +# post_cmd is exported as POST_CMD and run once the application has finished 
 +post_cmd=" scp $MYSANSCRATCH/chain*.dat $HOME/tmp/" 
 + 
 + 
 +# IF START OF JOB, UNCOMMENT 
 +# it's either the start or the restart block, never both 
 +#mode=start 
 +#cmd=" python mpi_run_models.py /sanscratch/$LSB_JOBID > /sanscratch/$LSB_JOBID/test2.out " 
 + 
 +# IF RESTART OF JOB, UNCOMMENT, MUST BE RUN ON SAME NODE 
 +# you must have pwd.JOBPID and chk.JOBPID in $orgjobpid/ 
 +# (orgjobpid is the job id of the earlier run whose checkpoint is restored) 
 +mode=restart 
 +orgjobpid=636341 
 + 
 +# user environment 
 +export PYTHONHOME=/share/apps/CENTOS6/blcr_soft/python/2.7.10 
 +export PYTHONPATH=/home/apps/CENTOS6/blcr_soft/python/2.7.10/lib/python2.7/site-packages 
 +export PATH=$PYTHONHOME/bin:$PATH 
 +# presumably MIRRC.sh defines MIRBIN, which the next line relies on -- confirm 
 +. /home/apps/miriad/MIRRC.sh 
 +export PATH=$MIRBIN:$PATH 
 +# record in the job output which python was picked up 
 +which python 
 + 
 + 
 +############### NOTHING TO EDIT BELOW THIS LINE ################## 
 + 
 + 
 + 
 +# checkpoints 
 +checkpoints=/sanscratch/checkpoints 
 + 
 +# kernel modules 
 +mods=`/sbin/lsmod | grep ^blcr | wc -l` 
 +if [ $mods -ne 2 ]; then 
 +        echo "Error: BLCR modules not loaded on `hostname`" 
 +        kill $$ 
 +fi 
 + 
 +# blcr setup 
 +restore_options="" 
 +#restore_options="--no-restore-pid --no-restore-pgid --no-restore-sid" 
 +if [ $save_exec == "n" ]; then 
 +        #save_options="--save-private --save-shared" 
 +        save_options="--save-none" 
 +else 
 +        save_options="--save-all" 
 +fi 
 + 
 +# environment  
 +export PATH=/share/apps/CENTOS6/openmpi/1.6.5.cr/bin:$PATH 
 +export LD_LIBRARY_PATH=/share/apps/CENTOS6/openmpi/1.6.5.cr/lib:$LD_LIBRARY_PATH 
 + 
 +export PATH=/share/apps/blcr/0.8.5/${queue}/bin:$PATH 
 +export LD_LIBRARY_PATH=/share/apps/blcr/0.8.5/${queue}/lib:$LD_LIBRARY_PATH 
 + 
 +which mpirun cr_mpirun ompi-checkpoint ompi-restart cr_checkpoint cr_restart 
 + 
 +# setup checkpoints dir 
 +if [ ! -d $checkpoints/$LSB_JOBID ]; then 
 +        mkdir -p $checkpoints/$LSB_JOBID  
 +else 
 +        echo "Error: $checkpoints/$LSB_JOBID already exists, exciting" 
 +        kill $$ 
 +fi 
 + 
 +# save process id and path and start application 
 +if [ "$mode" == "start" ];  then 
 +        # hostfile 
 +        echo "${LSB_HOSTS}" > $HOME/.lsbatch/hostfile.tmp.$LSB_JOBID 
 +        tr '\/ ' '\r\n' < $HOME/.lsbatch/hostfile.tmp.$LSB_JOBID > $HOME/.lsbatch/hostfile.$LSB_JOBID 
 +        c=`wc -l $HOME/.lsbatch/hostfile.$LSB_JOBID | awk '{print $1}'
 +        for i in `seq 1 $c`; do echo '127.0.0.1' >> $HOME/.lsbatch/localhost.$LSB_JOBID; done 
 +        $pre_cmd 
 +        # why 
 +        rm -f /tmp/tmp??????  
 +        cr_mpirun -v -am ft-enable-cr --gmca snapc_base_global_snapshot_dir $checkpoints/$LSB_JOBID \ 
 +        -x LD_LIBRARY_PATH --hostfile $HOME/.lsbatch/localhost.$LSB_JOBID $cmd 2>>$checkpoints/$LSB_JOBID/cr_mpirun.err & 
 +        pid=$! 
 +        pwd > $checkpoints/$LSB_JOBID/pwd.$pid 
 +        orgjobpid=0 
 + 
 +# otherwise restart the job 
 +elif [ "$mode" == "restart" ]; then 
 +        orgpid=`ls $checkpoints/$orgjobpid/pwd.* | awk -F\. '{print $2}'
 +        orgpwd=`cat $checkpoints/$orgjobpid/pwd.$orgpid` 
 +        if [ "X$orgpwd" == "X" ]; then 
 +                echo "Error: orgpwd problem, check error log" 
 +                exit 
 +        fi 
 +        # cleanup old if present 
 +        rm -rf /sanscratch/$orgjobpid /localscratch/$orgjobpid  
 +        rm -f $HOME/.lsbatch/*.$orgjobpid  
 +        # why 
 +        rm -f /tmp/tmp??????  
 +        # stage old 
 +        scp $checkpoints/$orgjobpid/*.$orgjobpid.err $checkpoints/$orgjobpid/*.$orgjobpid.out $HOME/.lsbatch/ 
 +        scp -r $checkpoints/$orgjobpid/* $MYSANSCRATCH 
 +        ln -s $MYSANSCRATCH /sanscratch/$orgjobpid 
 +        scp $checkpoints/$orgjobpid/hostfile.$orgjobpid $HOME/.lsbatch/ 
 +        scp -r $checkpoints/$orgjobpid/$orgjobpid/* /localscratch/$LSB_JOBID 
 +        # why 
 +        scp $checkpoints/$orgjobpid/$orgjobpid/tmp?????? /tmp/ 
 +        ln -s /localscratch/$LSB_JOBID /localscratch/$orgjobpid 
 +        c=`wc -l $HOME/.lsbatch/hostfile.$orgjobpid | awk '{print $1}'
 +        for i in `seq 1 $c`; do echo '127.0.0.1' >> $HOME/.lsbatch/localhost.$orgjobpid; done 
 +        cr_restart --kmsg-warning $restore_options --relocate $orgpwd=$MYSANSCRATCH --cont \ 
 +        $MYSANSCRATCH/chk.$orgpid 2>>$checkpoints/$LSB_JOBID/cr_restart.err & 
 +        pid=$! 
 +        started=`ps -u $USER | awk '{print $1}' | grep $pid | wc -l` 
 +        if [ $started -ne 1 ]; then 
 +                echo "Error: cr_restart failed, check error log" 
 +                kill $$ 
 +        fi 
 +        pwd > $checkpoints/$LSB_JOBID/pwd.$pid 
 + 
 +# obviously 
 +else 
 +        echo "Error: startup mode not defined correctly" 
 +        kill $$ 
 +fi 
 + 
 +# if $cmd disappears during $pcit, terminate wrapper 
 +export POST_CMD="$post_cmd" 
 +blcr_watcher $pid $$ $LSB_JOBID $orgjobpid & 
 +bw_pid=$! 
 + 
 +# always run this block 
 +while [ true ]; do 
 +        # checkpoint time interval 
 +        sleep $cpti 
 +        # finished? 
 +        no_pid=`ps -u $USER | grep $pid | awk '{print $1}'
 +        if [ "${no_pid}x" == 'x' ]; then 
 +                # save output 
 +                scp -rp $MYSANSCRATCH/* $checkpoints/$LSB_JOBID/ 
 +                $POST_CMD 
 +                kill $bw_pid 
 +                rm -f $HOME/.lsbatch/*${orgjobpid}* 
 +                exit 
 +        fi 
 +        # checkpoint file outside of sanscratch 
 +        scp -r $MYSANSCRATCH/* $checkpoints/$LSB_JOBID/ 
 +        scp -r /localscratch/$LSB_JOBID $checkpoints/$LSB_JOBID/ 
 +        chmod u+w $checkpoints/$LSB_JOBID/chk.* /sanscratch/$LSB_JOBID/chk.* 
 +        # why 
 +        scp /tmp/tmp?????? $checkpoints/$LSB_JOBID/$LSB_JOBID/ 
 +        cr_checkpoint -v --tree --cont $save_options -f $checkpoints/$LSB_JOBID/chk.$pid $pid \ 
 +        2>>$checkpoints/$LSB_JOBID/cr_checkpoint.err 
 +        scp $HOME/.lsbatch/*.$LSB_JOBID.err $HOME/.lsbatch/*.$LSB_JOBID.out $checkpoints/$LSB_JOBID/ 
 +        scp $HOME/.lsbatch/hostfile.$LSB_JOBID $checkpoints/$LSB_JOBID/ 
 +        scp -r /localscratch/$LSB_JOBID $checkpoints/$LSB_JOBID/ 
 +        # why 
 +        scp /tmp/tmp?????? $checkpoints/$LSB_JOBID/$LSB_JOBID/ 
 +        date >> $checkpoints/$LSB_JOBID/cr_checkpoint.err 
 +done 
 + 
 + 
 +</code> 
 + 
 +==== Parallel Wrapper v1 ====
  
 <code> <code>
cluster/148.1460042828.txt.gz · Last modified: 2016/04/07 11:27 by hmeij07