==== BLCR Checkpoint in OL3 ====
**Deprecated since we did [[cluster:
We will replace it with [[cluster:
 --- //

  * all MPI threads need to be confined to one node (see the sketch after this list)
  * restarted jobs must use the same node (not sure why)
  * For SERIAL jobs go here [[cluster:
  * Users Guide [[https://
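
Confining all MPI workers to one node is expressed through the scheduler's resource request. A minimal sketch (the core count is just an example; ''span[hosts=1]'' is the standard LSF/OpenLava syntax for placing all slots on a single host):

<code>
#BSUB -n 4
# place all 4 MPI workers on a single node, required for BLCR to checkpoint them
#BSUB -R "span[hosts=1]"
</code>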

Checkpointing parallel jobs is a bit more complex than for a serial job. MPI workers (the -n) are fired off by worker 0 of ''

The ''
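
To make the mechanics concrete before the admin details: a minimal sketch of the checkpoint/restart cycle done by hand (the PID and file name below are hypothetical examples; the wrappers further down automate exactly this):

<code>
# 12345 is the process id of cr_mpirun; --tree checkpoints its whole process tree
cr_checkpoint --tree -f chk.12345 12345
# later, after the job died or was killed, resume from the checkpoint file
cr_restart chk.12345
</code>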
Here is the admin stuff.
<code>
# from eric at lbl, configure openmpi, I chose 1.6.5 (version needs to be < 1.7)
./configure \
--enable-ft-thread \
--with-ft=cr \
--enable-opal-multi-threads \
--with-blcr=/
--without-tm \
--prefix=/
make
make install

# next download cr_mpirun from LBL
https://

# configure and test cr_mpirun
export PATH=/
export LD_LIBRARY_PATH=/

./configure --with-blcr=/
make
make check

============================================================================
Testsuite summary for cr_mpirun 295
============================================================================
# TOTAL: 3
# PASS:  3
# SKIP:  0
# XFAIL: 0
# FAIL:  0
# XPASS: 0
# ERROR: 0
============================================================================
make[1]: Leaving directory `/

# I copied cr_mpirun into /
# cr_mpirun needs access to all these in $PATH
# mpirun cr_mpirun ompi-checkpoint ompi-restart cr_checkpoint cr_restart

# next compile your parallel software using mpicc/
</code>
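
Before wiring anything into the scheduler, it can help to sanity-check the rebuilt stack end to end. A sketch, assuming the BLCR-enabled OpenMPI is first in $PATH (''hello.c'' is a throwaway test program, not part of the setup):

<code>
# tiny MPI test program
cat > hello.c <<'EOF'
#include <stdio.h>
#include <mpi.h>
int main(int argc, char **argv) {
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    printf("hello from rank %d of %d\n", rank, size);
    MPI_Finalize();
    return 0;
}
EOF
mpicc -o hello hello.c
# run with checkpoint/restart support enabled
cr_mpirun -am ft-enable-cr -n 4 ./hello
</code>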

Here is what a sample run using the OpenLava scheduler looks like:

<code>

# submit as usual after editing the top of the file, see comments in that wrapper file
[hmeij@cottontail lammps]$ bsub < blcr_wrapper_parallel
Job <681> is submitted to queue <

# cr_mpirun job
[hmeij@cottontail lammps]$ bjobs
JOBID

# wrapper stores BLCR checkpoint file (chk.PID) in this location
# and it calls the openmpi snapshot tools and stores that in
# ompi_global_snapshot_SOME-PID.ckpt, also in same location
[hmeij@cottontail lammps]$ ll /
total 30572
drwx------ 3 hmeij its 46 Mar 29 13:28 ompi_global_snapshot_9134.ckpt
-rw-r--r-- 1 hmeij its 16 Mar 29 13:23 pwd.9127

# the processes running
[hmeij@cottontail lammps]$ ssh petaltail ps -u hmeij
  PID TTY          TIME CMD
 9113 ?        00:00:00 1459272204.681.
 9127 ?        00:00:00 cr_mpirun
 9128 ?        00:00:00 blcr_watcher
 9133 ?        00:00:00 cr_mpirun
 9134 ?        00:00:00 mpirun
 9370 ?        00:00:00 ps
18559 pts/2    00:00:00 bash

# how far did the job progress?
[hmeij@cottontail lammps]$ tail ~/

# simulate crash
[hmeij@cottontail lammps]$ ssh petaltail kill 9133
# edit the file and prep for a restart, submit again
[hmeij@cottontail lammps]$ bsub < blcr_wrapper
Job <684> is submitted to queue <

# so job 684 is restarting job 681, wrapper preps files
[hmeij@cottontail lammps]$ ll ../
total 172
-rw------- 1 hmeij its   53 Mar 29 13:48 1459273700.684.out
-rwxr--r-- 1 hmeij its 4270 Mar 29 13:48 1459273700.684.shell
-rwxr--r--
-rw-r--r-- 1 hmeij its   40 Mar 29 13:48 hostfile.684
-rw-r--r-- 1 hmeij its   40 Mar 29 13:48 hostfile.tmp.684

[hmeij@cottontail lammps]$ ssh petaltail ps -u hmeij
  PID TTY          TIME CMD
10002 ?        00:00:00 1459273700.684
10005 ?        00:00:00 1459273700.684.
10039 ?        00:00:00 cr_restart
10051 ?        00:00:00 cr_mpirun
10052 ?        00:00:00 mpirun
18559 pts/2    00:00:00 bash

# and now you can watch the output picking up from the last checkpoint file
[hmeij@cottontail lammps]$ tail -20 ../

# let job finish
</code>

==== Parallel Wrapper v2 ====

A bit more verbose, with more error handling. Also, either the blcr_wrapper itself or the cr_checkpoint loop code can now terminate the job.

<code>

#!/bin/bash -x
rm -f err out
# work dir and cwd
export MYSANSCRATCH=/
cd $MYSANSCRATCH

# at job finish, all content in /
# will be copied to /
# content older than 3 months will be removed

# SCHEDULER set queue name in next TWO lines
queue=hp12
#BSUB -q hp12
#BSUB -n 6
#BSUB -J test
#BSUB -o out
#BSUB -e err
# next required for mpirun checkpoint to work
# restarts must use same node (not sure why)
#BSUB -R "
#BSUB -m n5

# CHECKPOINT TIME INTERVAL: 10m (debug) 6h 12h 18h 1d
cpti=15m

# COPY APPLICATION TO WORK DIR $MYSANSCRATCH (cwd)
# always stage the application (and data if needed)
# if mpirun save_exec="
save_exec="
pre_cmd="
$HOME/
$HOME/
$HOME/
$HOME/
$HOME/
$HOME/
$HOME/
$HOME/
$HOME/
$HOME/
post_cmd="

# IF START OF JOB, UNCOMMENT
# it's either the start or the restart block
#mode=start
#cmd=" python mpi_run_models.py /

# IF RESTART OF JOB, UNCOMMENT, MUST BE RUN ON SAME NODE
# you must have pwd.JOBPID and chk.JOBPID in $orgjobpid/
mode=restart
orgjobpid=636341

# user environment
export PYTHONHOME=/
export PYTHONPATH=/
export PATH=$PYTHONHOME/
. /
export PATH=$MIRBIN:
which python

###############

# checkpoints
checkpoints=/

# kernel modules
mods=`/
if [ $mods -ne 2 ]; then
echo "
kill $$
fi

# blcr setup
restore_options=""
#
if [ $save_exec == "
#
save_options="
else
save_options="
fi

# environment
export PATH=/
export LD_LIBRARY_PATH=/

export PATH=/
export LD_LIBRARY_PATH=/

which mpirun cr_mpirun ompi-checkpoint ompi-restart cr_checkpoint cr_restart

# setup checkpoints dir
if [ ! -d $checkpoints/
mkdir -p $checkpoints/
else
echo "
kill $$
fi

# save process id and path and start application
if [ "
# hostfile
echo "
tr '\/ ' '
c=`wc -l $HOME/
for i in `seq 1 $c`; do echo '
$pre_cmd
# why
rm -f /
cr_mpirun -v -am ft-enable-cr --gmca snapc_base_global_snapshot_dir $checkpoints/
-x LD_LIBRARY_PATH --hostfile $HOME/
pid=$!
pwd > $checkpoints/
orgjobpid=0

# otherwise restart the job
elif [ "
orgpid=`ls $checkpoints/
orgpwd=`cat $checkpoints/
if [ "
echo "
exit
fi
# cleanup old if present
rm -rf /
rm -f $HOME/
# why
rm -f /
# stage old
scp $checkpoints/
scp -r $checkpoints/
ln -s $MYSANSCRATCH /
scp $checkpoints/
scp -r $checkpoints/
# why
scp $checkpoints/
ln -s /
c=`wc -l $HOME/
for i in `seq 1 $c`; do echo '
cr_restart --kmsg-warning $restore_options --relocate $orgpwd=$MYSANSCRATCH --cont \
$MYSANSCRATCH/
pid=$!
started=`ps -u $USER | awk '
if [ $started -ne 1 ]; then
echo "
kill $$
fi
pwd > $checkpoints/

# obviously
else
echo "
kill $$
fi

# if $cmd disappears during $cpti, terminate wrapper
export POST_CMD="
blcr_watcher $pid $$ $LSB_JOBID $orgjobpid &
bw_pid=$!

# always run this block
while [ true ]; do
# checkpoint time interval
sleep $cpti
# finished?
no_pid=`ps -u $USER | grep $pid | awk '
if [ "
# save output
scp -rp $MYSANSCRATCH/
$POST_CMD
kill $bw_pid
rm -f $HOME/
exit
fi
# checkpoint file outside of sanscratch
scp -r $MYSANSCRATCH/
scp -r /
chmod u+w $checkpoints/
# why
scp /
cr_checkpoint -v --tree --cont $save_options -f $checkpoints/
2>>
scp $HOME/
scp $HOME/
scp -r /
# why
scp /
date >> $checkpoints/
done

</code>
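
The wrappers fork ''blcr_watcher'', which is not listed on this page. A minimal sketch of what such a watcher could look like, inferred from the call ''blcr_watcher $pid $$ $LSB_JOBID $orgjobpid'' and the comments above (the argument meanings and the poll interval are assumptions, not the actual script):

<code>
#!/bin/bash
# hypothetical blcr_watcher sketch; the production script may differ
app_pid=$1      # pid of cr_mpirun or cr_restart (assumed)
wrapper_pid=$2  # pid of the wrapper itself, passed as $$ (assumed)
jobid=$3        # scheduler job id, $LSB_JOBID (assumed, for logging/cleanup)
orgjobpid=$4    # original job pid on a restart, 0 on a fresh start (assumed)

while true; do
  sleep 60
  # if the application process vanished, save output and terminate the wrapper
  if ! ps -p $app_pid > /dev/null 2>&1; then
    $POST_CMD            # exported by the wrapper before forking the watcher
    kill $wrapper_pid
    exit
  fi
done
</code>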

==== Parallel Wrapper v1 ====

<code>

#!/bin/bash
rm -f err out
# work dir and cwd
export MYSANSCRATCH=/
cd $MYSANSCRATCH

# at job finish, all content in /
# will be copied to /
# content older than 3 months will be removed

# SCHEDULER
#BSUB -q test
#BSUB -J test
#BSUB -n 4
#BSUB -o out
#BSUB -e err
# next required for mpirun checkpoint to work
# restarts must use same node in test queue (not sure why, others can restart anywhere)
#BSUB -R "

# CHECKPOINT TIME INTERVAL: 10m (debug) 6h 12h 18h 1d
cpti=1d

# COPY APPLICATION TO WORK DIR $MYSANSCRATCH (cwd)
# always stage the application (and data if needed)
# if mpirun save_exec="
save_exec="
pre_cmd="

post_cmd="

# IF START OF JOB, UNCOMMENT
# it's either the start or the restart block
mode=start
queue=test
cmd=" lmp_mpi -c off -var GPUIDX 0 -in au.inp -l auout "

# IF RESTART OF JOB, UNCOMMENT
# you must have pwd.JOBPID and chk.JOBPID in $orgjobpid/
#
#queue=test
#

# user environment
export PATH=/
export LD_LIBRARY_PATH=/
#which lmp_mpi

###############

# checkpoints
checkpoints=/

# kernel modules
mods=`/
if [ $mods -ne 2 ]; then
echo "
kill $$
fi

# blcr setup
restore_options=""
#
if [ $save_exec == "
#
save_options="
else
save_options="
fi

# environment
export PATH=/
export LD_LIBRARY_PATH=/

export PATH=/
export LD_LIBRARY_PATH=/

#which mpirun cr_mpirun ompi-checkpoint ompi-restart cr_checkpoint cr_restart

# setup checkpoints dir
if [ ! -d $checkpoints/
mkdir -p $checkpoints/
else
echo "
kill $$
fi

# save process id and path and start application
if [ "
# hostfile
echo "
tr '\/ ' '
$pre_cmd
cr_mpirun -am ft-enable-cr --gmca snapc_base_global_snapshot_dir $checkpoints/
--hostfile $HOME/
pid=$!
pwd > $checkpoints/
orgjobpid=0

# otherwise restart the job
elif [ "
orgpid=`ls $checkpoints/
orgpwd=`cat $checkpoints/
# cleanup old
rm -rf /
# stage old
scp $checkpoints/
scp -r $checkpoints/
ln -s $MYSANSCRATCH /
scp $checkpoints/
cr_restart --kmsg-warning $restore_options --relocate $orgpwd=$MYSANSCRATCH $MYSANSCRATCH/
pid=$!
started=`ps -u hmeij | awk '
if [ $started -ne 1 ]; then
echo "
kill $$
fi
pwd > $checkpoints/

# obviously
else
echo "
kill $$
fi

# if $cmd disappears during $cpti, terminate wrapper
export POST_CMD="
blcr_watcher $pid $$ $LSB_JOBID $orgjobpid &

# always run this block
while [ true ]; do
# checkpoint time interval
sleep $cpti
# checkpoint file outside of sanscratch
scp -r $MYSANSCRATCH/
cr_checkpoint --tree $save_options -f $checkpoints/
scp $HOME/
scp $HOME/
done

</code>
\\
**[[cluster: