This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision Last revision Both sides next revision | ||
cluster:124 [2013/10/31 14:53] hmeij |
cluster:124 [2016/03/03 13:58] hmeij07 |
||
---|---|---|---|
Line 1: | Line 1: | ||
\\ | \\ | ||
**[[cluster: | **[[cluster: | ||
+ | |||
+ | Queue '' | ||
+ | --- // | ||
+ | |||
+ | Adjust your PATH and LD_LIBRARY_PATH accordingly. | ||
==== BLCR ==== | ==== BLCR ==== | ||
Line 142: | Line 147: | ||
* The restart job may end up on another node but will have the same process_id | * The restart job may end up on another node but will have the same process_id | ||
- | After you have restarted, you can observe the tool starting from the checkpoint file you are pointing to. To simulate a crash, while your first submission is running with '' | + | After you have restarted, you can observe the tool starting from the checkpoint file you are pointing to. To simulate a crash, while your first submission is running with '' |
It would be even sweeter if the scheduler could be told to do all the checkpointing at intervals. | It would be even sweeter if the scheduler could be told to do all the checkpointing at intervals. | ||
Line 163: | Line 168: | ||
export LD_LIBRARY_PATH=/ | export LD_LIBRARY_PATH=/ | ||
- | # checkpoint | + | # checkpoint |
MYSANSCRATCH=/ | MYSANSCRATCH=/ | ||
MYLOCALSCRATCH=/ | MYLOCALSCRATCH=/ | ||
Line 172: | Line 177: | ||
cp -rp ~/ | cp -rp ~/ | ||
- | # start the application and remeber | + | # start the application and remember |
+ | # save some stuff for checking later and restart | ||
cr_run ./ | cr_run ./ | ||
process_id=`ps -u hmeij | grep t-20001030-01 | grep -v grep | awk ' | process_id=`ps -u hmeij | grep t-20001030-01 | grep -v grep | awk ' | ||
pwd > pwd.$process_id | pwd > pwd.$process_id | ||
+ | cp -p pwd* *.shell *.out *.err ~/blcr/ | ||
# on restart, give cr_restart some time to set up | # on restart, give cr_restart some time to set up | ||
Line 191: | Line 198: | ||
# checkpoint time interval, make it very large (small for testing) | # checkpoint time interval, make it very large (small for testing) | ||
sleep 120 | sleep 120 | ||
- | # save the checkpoint file outside of /sanscratch | + | # save the checkpoint file outside of sanscratch |
cr_checkpoint -f ~/ | cr_checkpoint -f ~/ | ||
+ | cp -p context ~/blcr/ | ||
# if the application has crashed, exit | # if the application has crashed, exit | ||
process_id=`ps -u hmeij | grep t-20001030-01 | grep -v grep | awk ' | process_id=`ps -u hmeij | grep t-20001030-01 | grep -v grep | awk ' | ||
if [ " | if [ " | ||
- | # save some stuff for checking later | ||
- | cp -p pwd* *.shell *.out *.err context ~/blcr/ | ||
rm -f `cat ~/ | rm -f `cat ~/ | ||
exit; | exit; | ||
fi | fi | ||
done | done | ||
+ | |||
+ | |||
</ | </ |