cluster:148
Differences
This shows you the differences between two versions of the page.
| Both sides previous revision / Previous revision / Next revision | Previous revision | ||
| cluster:148 [2016/03/30 18:49] – hmeij07 | cluster:148 [2020/01/24 18:36] (current) – hmeij07 | ||
|---|---|---|---|
| Line 3: | Line 3: | ||
| ==== BLCR Checkpoint in OL3 ==== | ==== BLCR Checkpoint in OL3 ==== | ||
| + | |||
| + | **Deprecated since we did [[cluster: | ||
| + | We will replace it with [[cluster: | ||
| + | --- // | ||
| * This page concerns PARALLEL mpirun jobs only; there are some restrictions | * This page concerns PARALLEL mpirun jobs only; there are some restrictions | ||
| Line 14: | Line 18: | ||
| * Users Guide [[https:// | * Users Guide [[https:// | ||
| - | Checkpointing parallel jobs is a bit more complex than a serial job. MPI workers (the -n) are fired off by worker 0 of '' | + | Checkpointing parallel jobs is a bit more complex than a serial job. MPI workers (the -n) are fired off by worker 0 of '' |
| The '' | The '' | ||
| Line 207: | Line 211: | ||
| </ | </ | ||
| - | ==== Parallel Wrapper ==== | + | |
| + | ==== Parallel Wrapper | ||
| + | |||
| + | A bit more verbose, with more error handling. Also, the blcr_wrapper or the cr_checkpoint loop code can now terminate the job. | ||
| + | |||
| + | < | ||
| + | |||
| + | #!/bin/bash -x | ||
| + | rm -f err out | ||
| + | # work dir and cwd | ||
| + | export MYSANSCRATCH=/ | ||
| + | cd $MYSANSCRATCH | ||
| + | |||
| + | # at job finish, all content in / | ||
| + | # will be copied to / | ||
| + | # content older than 3 months will be removed | ||
| + | |||
| + | # SCHEDULER set queue name in next TWO lines | ||
| + | queue=hp12 | ||
| + | #BSUB -q hp12 | ||
| + | #BSUB -n 6 | ||
| + | #BSUB -J test | ||
| + | #BSUB -o out | ||
| + | #BSUB -e err | ||
| + | # next required for mpirun checkpoint to work | ||
| + | # restarts must use same node (not sure why) | ||
| + | #BSUB -R " | ||
| + | #BSUB -m n5 | ||
| + | |||
| + | # CHECK POINT TIME INTERVAL: 10m (debug) 6h 12h 18h 1d | ||
| + | cpti=15m | ||
| + | |||
| + | # COPY APPLICATION TO WORK DIR $MYSANSCRATCH (cwd) | ||
| + | # always stage the application (and data if needed) | ||
| + | # if mpirun save_exec=" | ||
| + | save_exec=" | ||
| + | pre_cmd=" | ||
| + | $HOME/ | ||
| + | $HOME/ | ||
| + | $HOME/ | ||
| + | $HOME/ | ||
| + | $HOME/ | ||
| + | $HOME/ | ||
| + | $HOME/ | ||
| + | $HOME/ | ||
| + | $HOME/ | ||
| + | $HOME/ | ||
| + | post_cmd=" | ||
| + | |||
| + | |||
| + | # IF START OF JOB, UNCOMMENT | ||
| + | # it's either the start or the restart block | ||
| + | # | ||
| + | #cmd=" python mpi_run_models.py / | ||
| + | |||
| + | # IF RESTART OF JOB, UNCOMMENT, MUST BE RUN ON SAME NODE | ||
| + | # you must have pwd.JOBPID and chk.JOBPID in $orgjobpid/ | ||
| + | mode=restart | ||
| + | orgjobpid=636341 | ||
| + | |||
| + | # user environment | ||
| + | export PYTHONHOME=/ | ||
| + | export PYTHONPATH=/ | ||
| + | export PATH=$PYTHONHOME/ | ||
| + | . / | ||
| + | export PATH=$MIRBIN: | ||
| + | which python | ||
| + | |||
| + | |||
| + | ############### | ||
| + | |||
| + | |||
| + | |||
| + | # checkpoints | ||
| + | checkpoints=/ | ||
| + | |||
| + | # kernel modules | ||
| + | mods=`/ | ||
| + | if [ $mods -ne 2 ]; then | ||
| + | echo " | ||
| + | kill $$ | ||
| + | fi | ||
| + | |||
| + | # blcr setup | ||
| + | restore_options="" | ||
| + | # | ||
| + | if [ $save_exec == " | ||
| + | # | ||
| + | save_options=" | ||
| + | else | ||
| + | save_options=" | ||
| + | fi | ||
| + | |||
| + | # environment | ||
| + | export PATH=/ | ||
| + | export LD_LIBRARY_PATH=/ | ||
| + | |||
| + | export PATH=/ | ||
| + | export LD_LIBRARY_PATH=/ | ||
| + | |||
| + | which mpirun cr_mpirun ompi-checkpoint ompi-restart cr_checkpoint cr_restart | ||
| + | |||
| + | # setup checkpoints dir | ||
| + | if [ ! -d $checkpoints/ | ||
| + | mkdir -p $checkpoints/ | ||
| + | else | ||
| + | echo " | ||
| + | kill $$ | ||
| + | fi | ||
| + | |||
| + | # save process id and path and start application | ||
| + | if [ " | ||
| + | # hostfile | ||
| + | echo " | ||
| + | tr '\/ ' ' | ||
| + | c=`wc -l $HOME/ | ||
| + | for i in `seq 1 $c`; do echo ' | ||
| + | $pre_cmd | ||
| + | # why | ||
| + | rm -f / | ||
| + | cr_mpirun -v -am ft-enable-cr --gmca snapc_base_global_snapshot_dir $checkpoints/ | ||
| + | -x LD_LIBRARY_PATH --hostfile $HOME/ | ||
| + | pid=$! | ||
| + | pwd > $checkpoints/ | ||
| + | orgjobpid=0 | ||
| + | |||
| + | # otherwise restart the job | ||
| + | elif [ " | ||
| + | orgpid=`ls $checkpoints/ | ||
| + | orgpwd=`cat $checkpoints/ | ||
| + | if [ " | ||
| + | echo " | ||
| + | exit | ||
| + | fi | ||
| + | # cleanup old if present | ||
| + | rm -rf / | ||
| + | rm -f $HOME/ | ||
| + | # why | ||
| + | rm -f / | ||
| + | # stage old | ||
| + | scp $checkpoints/ | ||
| + | scp -r $checkpoints/ | ||
| + | ln -s $MYSANSCRATCH / | ||
| + | scp $checkpoints/ | ||
| + | scp -r $checkpoints/ | ||
| + | # why | ||
| + | scp $checkpoints/ | ||
| + | ln -s / | ||
| + | c=`wc -l $HOME/ | ||
| + | for i in `seq 1 $c`; do echo ' | ||
| + | cr_restart --kmsg-warning $restore_options --relocate $orgpwd=$MYSANSCRATCH --cont \ | ||
| + | $MYSANSCRATCH/ | ||
| + | pid=$! | ||
| + | started=`ps -u $USER | awk ' | ||
| + | if [ $started -ne 1 ]; then | ||
| + | echo " | ||
| + | kill $$ | ||
| + | fi | ||
| + | pwd > $checkpoints/ | ||
| + | |||
| + | # obviously | ||
| + | else | ||
| + | echo " | ||
| + | kill $$ | ||
| + | fi | ||
| + | |||
| + | # if $cmd disappears during $cpti, terminate wrapper | ||
| + | export POST_CMD=" | ||
| + | blcr_watcher $pid $$ $LSB_JOBID $orgjobpid & | ||
| + | bw_pid=$! | ||
| + | |||
| + | # always run this block | ||
| + | while [ true ]; do | ||
| + | # checkpoint time interval | ||
| + | sleep $cpti | ||
| + | # finished? | ||
| + | no_pid=`ps -u $USER | grep $pid | awk ' | ||
| + | if [ " | ||
| + | # save output | ||
| + | scp -rp $MYSANSCRATCH/ | ||
| + | $POST_CMD | ||
| + | kill $bw_pid | ||
| + | rm -f $HOME/ | ||
| + | exit | ||
| + | fi | ||
| + | # checkpoint file outside of sanscratch | ||
| + | scp -r $MYSANSCRATCH/ | ||
| + | scp -r / | ||
| + | chmod u+w $checkpoints/ | ||
| + | # why | ||
| + | scp / | ||
| + | cr_checkpoint -v --tree --cont $save_options -f $checkpoints/ | ||
| + | 2>> | ||
| + | scp $HOME/ | ||
| + | scp $HOME/ | ||
| + | scp -r / | ||
| + | # why | ||
| + | scp / | ||
| + | date >> $checkpoints/ | ||
| + | done | ||
| + | |||
| + | |||
| + | </ | ||
| + | |||
| + | ==== Parallel Wrapper v1 ==== | ||
| < | < | ||
| Line 228: | Line 436: | ||
| #BSUB -e err | #BSUB -e err | ||
| # next required for mpirun checkpoint to work | # next required for mpirun checkpoint to work | ||
| - | # restarts must use same node (not sure why) | + | # restarts must use same node in test queue (not sure why, others can restart anywhere) |
| #BSUB -R " | #BSUB -R " | ||
cluster/148.1459363761.txt.gz · Last modified: (external edit)
