This shows you the differences between two versions of the page.
Next revision | Previous revision Next revision Both sides next revision | ||
cluster:148 [2016/03/29 17:38] hmeij07 created |
cluster:148 [2016/03/30 18:00] hmeij07 |
||
---|---|---|---|
Line 4: | Line 4: | ||
==== BLCR Checkpoint in OL3 ==== | ==== BLCR Checkpoint in OL3 ==== | ||
- | * This page concerns PARALLEL mpirun jobs only | + | * This page concerns PARALLEL mpirun jobs only; there are some restrictions |
+ | * all MPI threads need to be confined to one node | ||
+ | * restarted jobs must use the same node (not sure why) | ||
* For SERIAL jobs go here [[cluster: | * For SERIAL jobs go here [[cluster: | ||
Line 12: | Line 14: | ||
* Users Guide [[https:// | * Users Guide [[https:// | ||
+ | Checkpointing parallel jobs is a bit more complex than checkpointing a serial job. MPI jobs are fired off by worker 0 of '' | ||
+ | The '' | ||
+ | |||
+ | < | ||
+ | |||
+ | # from eric at lbl | ||
+ | ./configure \ | ||
+ | --enable-ft-thread \ | ||
+ | --with-ft=cr \ | ||
+ | --enable-opal-multi-threads \ | ||
+ | --with-blcr=/ | ||
+ | --without-tm \ | ||
+ | --prefix=/ | ||
+ | |||
+ | # next download cr_mpirun | ||
+ | https:// | ||
+ | |||
+ | # configure and test | ||
+ | |||
+ | export PATH=/ | ||
+ | export LD_LIBRARY_PATH=/ | ||
+ | |||
+ | ./configure --with-blcr=/ | ||
+ | |||
+ | ============================================================================ | ||
+ | Testsuite summary for cr_mpirun 295 | ||
+ | ============================================================================ | ||
+ | # TOTAL: 3 | ||
+ | # PASS: 3 | ||
+ | # SKIP: 0 | ||
+ | # XFAIL: 0 | ||
+ | # FAIL: 0 | ||
+ | # XPASS: 0 | ||
+ | # ERROR: 0 | ||
+ | ============================================================================ | ||
+ | make[1]: Leaving directory `/ | ||
+ | |||
+ | # I copied cr_runmpi into / | ||
+ | # cr_runmpi needs access to all these in $PATH | ||
+ | # mpirun cr_mpirun ompi-checkpoint ompi-restart cr_checkpoint cr_restart | ||
+ | |||
+ | # next compile your parallel software using mpicc/ | ||
+ | |||
+ | </ | ||
+ | |||
+ | |||
+ | |||
+ | < | ||
+ | |||
+ | [hmeij@cottontail lammps]$ bsub < blcr_wrapper | ||
+ | Job <681> is submitted to queue < | ||
+ | [hmeij@cottontail lammps]$ bjobs | ||
+ | JOBID | ||
+ | 681 | ||
+ | [hmeij@cottontail lammps]$ bjobs | ||
+ | JOBID | ||
+ | 681 | ||
+ | | ||
+ | | ||
+ | | ||
+ | |||
+ | [hmeij@cottontail lammps]$ tail ~/ | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | [hmeij@cottontail lammps]$ ll / | ||
+ | total 30572 | ||
+ | -rw------- 1 hmeij its 8704 Mar 29 13:28 1459272204.681.err | ||
+ | -rw------- 1 hmeij its 5686 Mar 29 13:28 1459272204.681.out | ||
+ | -rw-r--r-- 1 hmeij its 2652 Mar 29 13:28 au.inp | ||
+ | -rw-r--r-- 1 hmeij its 0 Mar 29 13:28 auout | ||
+ | -rw-r--r-- 1 hmeij its 38310 Mar 29 13:28 auu3 | ||
+ | -r-------- 1 hmeij its | ||
+ | -rw-r--r-- 1 hmeij its 21342187 Mar 29 13:28 data.Big11AuSAMInitial | ||
+ | -rw-r--r-- 1 hmeij its 9598629 Mar 29 13:28 henz.dump | ||
+ | drwx------ 3 hmeij its 46 Mar 29 13:28 ompi_global_snapshot_9134.ckpt | ||
+ | -rw-r--r-- 1 hmeij its 16 Mar 29 13:23 pwd.9127 | ||
+ | [hmeij@cottontail lammps]$ ssh petaltail ps -u hmeij | ||
+ | PID TTY TIME CMD | ||
+ | 5762 ? 00:00:00 sshd | ||
+ | 5763 pts/1 00:00:00 bash | ||
+ | 9104 ? 00:00:00 res | ||
+ | 9110 ? 00:00:00 1459272204.681 | ||
+ | 9113 ? 00:00:00 1459272204.681. | ||
+ | 9127 ? 00:00:00 cr_mpirun | ||
+ | 9128 ? 00:00:00 blcr_watcher | ||
+ | 9133 ? 00:00:00 cr_mpirun | ||
+ | 9134 ? 00:00:00 mpirun | ||
+ | 9135 ? 00:00:00 sleep | ||
+ | 9136 ? 00:05:55 lmp_mpi | ||
+ | 9137 ? 00:06:07 lmp_mpi | ||
+ | 9138 ? 00:05:52 lmp_mpi | ||
+ | 9139 ? 00:05:53 lmp_mpi | ||
+ | 9347 ? 00:00:00 sleep | ||
+ | 9369 ? 00:00:00 sshd | ||
+ | 9370 ? 00:00:00 ps | ||
+ | 18559 pts/2 00:00:00 bash | ||
+ | [hmeij@cottontail lammps]$ tail ~/ | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | [hmeij@cottontail lammps]$ ssh petaltail kill 9133 | ||
+ | |||
+ | [hmeij@cottontail lammps]$ bsub < blcr_wrapper | ||
+ | Job <684> is submitted to queue < | ||
+ | [hmeij@cottontail lammps]$ rm -f ../ | ||
+ | [hmeij@cottontail lammps]$ bjobs | ||
+ | JOBID | ||
+ | 684 | ||
+ | | ||
+ | | ||
+ | | ||
+ | [hmeij@cottontail lammps]$ ll ../ | ||
+ | total 172 | ||
+ | -rw------- 1 hmeij its 8589 Mar 29 13:48 1459272204.681.err | ||
+ | -rw------- 1 hmeij its 5686 Mar 29 13:48 1459272204.681.out | ||
+ | -rwx------ 1 hmeij its 4609 Mar 29 13:48 1459273700.684 | ||
+ | -rw------- 1 hmeij its 9054 Mar 29 13:48 1459273700.684.err | ||
+ | -rw------- 1 hmeij its 53 Mar 29 13:48 1459273700.684.out | ||
+ | -rwxr--r-- 1 hmeij its 4270 Mar 29 13:48 1459273700.684.shell | ||
+ | lrwxrwxrwx 1 hmeij its 33 Mar 29 13:48 hostfile.681 -> / | ||
+ | -rw-r--r-- 1 hmeij its 40 Mar 29 13:48 hostfile.684 | ||
+ | -rw-r--r-- 1 hmeij its 40 Mar 29 13:48 hostfile.tmp.684 | ||
+ | [hmeij@cottontail lammps]$ less ../ | ||
+ | [hmeij@cottontail lammps]$ ssh petaltail ps -u hmeij | ||
+ | PID TTY TIME CMD | ||
+ | 5762 ? 00:00:00 sshd | ||
+ | 5763 pts/1 00:00:00 bash | ||
+ | 9127 ? 00:00:00 cr_mpirun | ||
+ | 9136 ? 00:00:34 lmp_mpi | ||
+ | 9137 ? 00:00:34 lmp_mpi | ||
+ | 9138 ? 00:00:34 lmp_mpi | ||
+ | 9139 ? 00:00:34 lmp_mpi | ||
+ | 9994 ? 00:00:00 res | ||
+ | 10002 ? 00:00:00 1459273700.684 | ||
+ | 10005 ? 00:00:00 1459273700.684. | ||
+ | 10039 ? 00:00:00 cr_restart | ||
+ | 10051 ? 00:00:00 cr_mpirun | ||
+ | 10052 ? 00:00:00 mpirun | ||
+ | 10053 ? 00:00:00 blcr_watcher | ||
+ | 10054 ? 00:00:00 sleep | ||
+ | 10055 ? 00:00:00 sleep | ||
+ | 10056 ? 00:00:01 cr_restart | ||
+ | 10057 ? 00:00:01 cr_restart | ||
+ | 10058 ? 00:00:02 cr_restart | ||
+ | 10059 ? 00:00:02 cr_restart | ||
+ | 10151 ? 00:00:00 sshd | ||
+ | 10152 ? 00:00:00 ps | ||
+ | 18559 pts/2 00:00:00 bash | ||
+ | |||
+ | [hmeij@cottontail lammps]$ tail -20 ../ | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | |||
+ | </ | ||
\\ | \\ | ||
**[[cluster: | **[[cluster: |