cluster:124
Differences
This shows you the differences between two versions of the page.
| Both sides previous revision · Previous revision · Next revision | Previous revision | ||
| cluster:124 [2013/10/31 18:44] – hmeij | cluster:124 [2016/03/11 20:14] (current) – hmeij07 | ||
|---|---|---|---|
| Line 1: | Line 1: | ||
| \\ | \\ | ||
| **[[cluster: | **[[cluster: | ||
| + | |||
| + | Queue '' | ||
| + | --- // | ||
| + | |||
| + | Adjust your PATH and LD_LIBRARY_PATH accordingly. | ||
| ==== BLCR ==== | ==== BLCR ==== | ||
| Line 142: | Line 147: | ||
| * The restart job may end up on another node but will have the same process_id | * The restart job may end up on another node but will have the same process_id | ||
| - | After you have restarted, you can observe the tool starting from the checkpoint file you are pointing to. To simulate a crash, while your first submission is running with '' | + | After you have restarted, you can observe the tool starting from the checkpoint file you are pointing to. To simulate a crash, while your first submission is running with '' |
| It would be even sweeter if the scheduler could be told to do all the checkpointing at intervals. | It would be even sweeter if the scheduler could be told to do all the checkpointing at intervals. | ||
| Line 154: | Line 159: | ||
| # submit via 'bsub < run.serial' | # submit via 'bsub < run.serial' | ||
| rm -f *err *out *shell | rm -f *err *out *shell | ||
| - | #BSUB -q mw256chkpnt | + | #BSUB -q test |
| #BSUB -n 1 | #BSUB -n 1 | ||
| #BSUB -J test | #BSUB -J test | ||
| Line 160: | Line 165: | ||
| #BSUB -e err | #BSUB -e err | ||
| - | export PATH=/ | + | export PATH=/ |
| - | export LD_LIBRARY_PATH=/ | + | export LD_LIBRARY_PATH=/ |
| - | # checkpoint | + | # checkpoint |
| MYSANSCRATCH=/ | MYSANSCRATCH=/ | ||
| MYLOCALSCRATCH=/ | MYLOCALSCRATCH=/ | ||
| Line 172: | Line 177: | ||
| cp -rp ~/ | cp -rp ~/ | ||
| - | # start the application | + | # on first start of application, remember |
| - | cr_run ./ | + | # save some stuff for checking later and restart |
| - | process_id=`ps -u hmeij | grep t-20001030-01 | grep -v grep | awk ' | + | #cr_run ./ |
| - | pwd > pwd.$process_id | + | #sleep 60 |
| + | #process_id=`ps -u hmeij | grep t-20001030-01 | grep -v grep | awk ' | ||
| + | #pwd > pwd.$process_id | ||
| + | #cp -p pwd* *.shell *.out *.err ~/blcr/ | ||
| # on restart, give cr_restart some time to set up | # on restart, give cr_restart some time to set up | ||
| # WARNING: it will overwrite the checkpoint file, save it | # WARNING: it will overwrite the checkpoint file, save it | ||
| # you need to find the process_id and supply it | # you need to find the process_id and supply it | ||
| - | #process_id=9089 | + | process_id=4711 |
| - | #cp -p ~/ | + | cp -p ~/ |
| - | #mv ~/ | + | mv ~/ |
| - | #ln -s $MYSANSCRATCH `cat ~/ | + | ln -s $MYSANSCRATCH `cat ~/ |
| - | #cr_restart ~/ | + | cr_restart ~/ |
| - | #sleep 60 | + | sleep 60 |
| + | # always uncommented | ||
| echo " | echo " | ||
| while [ $process_id -gt 0 ]; do | while [ $process_id -gt 0 ]; do | ||
| # checkpoint time interval, make it very large (small for testing) | # checkpoint time interval, make it very large (small for testing) | ||
| sleep 120 | sleep 120 | ||
| - | # save the checkpoint file outside of /sanscratch | + | # save the checkpoint file outside of sanscratch |
| cr_checkpoint -f ~/ | cr_checkpoint -f ~/ | ||
| - | # if the application has crashed, exit | + | |
| + | | ||
| process_id=`ps -u hmeij | grep t-20001030-01 | grep -v grep | awk ' | process_id=`ps -u hmeij | grep t-20001030-01 | grep -v grep | awk ' | ||
| if [ " | if [ " | ||
| - | # save some stuff for checking later | ||
| - | cp -p pwd* *.shell *.out *.err context ~/blcr/ | ||
| rm -f `cat ~/ | rm -f `cat ~/ | ||
| exit; | exit; | ||
| fi | fi | ||
| done | done | ||
| + | |||
| + | |||
| </ | </ | ||
cluster/124.1383245072.txt.gz · Last modified: by hmeij
