cluster:148
This is an old revision of the document!
BLCR Checkpoint in OL3
- This page concerns PARALLEL mpirun jobs only
- For SERIAL jobs, see: BLCR Checkpoint in OL3
- For installation and a description of what it does, see: BLCR
[hmeij@cottontail lammps]$ bsub < blcr_wrapper
Job <681> is submitted to queue <test>.
[hmeij@cottontail lammps]$ bjobs
JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME SUBMIT_TIME
681 hmeij PEND test cottontail test Mar 29 13:23
[hmeij@cottontail lammps]$ bjobs
JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME SUBMIT_TIME
681 hmeij RUN test cottontail petaltail test Mar 29 13:23
petaltail
petaltail
petaltail
[hmeij@cottontail lammps]$ tail ~/.lsbatch/1459272204.681.out
160 4062132.4 -4439564.1 1.4618689e+08 2.8818933e+08 17552.743
170 4395340.9 -4440084.6 1.3417499e+08 2.8818925e+08 19150.394
180 4711438.8 -4440426.5 1.2277977e+08 2.8818918e+08 20665.317
190 5007151.9 -4440573.2 1.1211925e+08 2.8818913e+08 22081.756
200 5279740.3 -4440516 1.0229219e+08 2.8818909e+08 23386.523
210 5527023.5 -4440257.5 93377214 2.8818906e+08 24569.109
220 5747387.7 -4439813.3 85432510 2.8818904e+08 25621.734
230 5939773.3 -4439214.4 78496309 2.8818904e+08 26539.282
240 6103647.2 -4438507.6 72587871 2.8818905e+08 27319.145
250 6238961.8 -4437755.5 67708974 2.8818907e+08 27961.064
[hmeij@cottontail lammps]$ ll /sanscratch/checkpoints/681
total 30572
-rw------- 1 hmeij its 8704 Mar 29 13:28 1459272204.681.err
-rw------- 1 hmeij its 5686 Mar 29 13:28 1459272204.681.out
-rw-r--r-- 1 hmeij its 2652 Mar 29 13:28 au.inp
-rw-r--r-- 1 hmeij its 0 Mar 29 13:28 auout
-rw-r--r-- 1 hmeij its 38310 Mar 29 13:28 auu3
-r-------- 1 hmeij its 289714 Mar 29 13:28 chk.9127
-rw-r--r-- 1 hmeij its 21342187 Mar 29 13:28 data.Big11AuSAMInitial
-rw-r--r-- 1 hmeij its 9598629 Mar 29 13:28 henz.dump
drwx------ 3 hmeij its 46 Mar 29 13:28 ompi_global_snapshot_9134.ckpt
-rw-r--r-- 1 hmeij its 16 Mar 29 13:23 pwd.9127
[hmeij@cottontail lammps]$ ssh petaltail ps -u hmeij
PID TTY TIME CMD
5762 ? 00:00:00 sshd
5763 pts/1 00:00:00 bash
9104 ? 00:00:00 res
9110 ? 00:00:00 1459272204.681
9113 ? 00:00:00 1459272204.681.
9127 ? 00:00:00 cr_mpirun
9128 ? 00:00:00 blcr_watcher
9133 ? 00:00:00 cr_mpirun
9134 ? 00:00:00 mpirun
9135 ? 00:00:00 sleep
9136 ? 00:05:55 lmp_mpi
9137 ? 00:06:07 lmp_mpi
9138 ? 00:05:52 lmp_mpi
9139 ? 00:05:53 lmp_mpi
9347 ? 00:00:00 sleep
9369 ? 00:00:00 sshd
9370 ? 00:00:00 ps
18559 pts/2 00:00:00 bash
[hmeij@cottontail lammps]$ tail ~/.lsbatch/1459272204.681.out
190 5007151.9 -4440573.2 1.1211925e+08 2.8818913e+08 22081.756
200 5279740.3 -4440516 1.0229219e+08 2.8818909e+08 23386.523
210 5527023.5 -4440257.5 93377214 2.8818906e+08 24569.109
220 5747387.7 -4439813.3 85432510 2.8818904e+08 25621.734
230 5939773.3 -4439214.4 78496309 2.8818904e+08 26539.282
240 6103647.2 -4438507.6 72587871 2.8818905e+08 27319.145
250 6238961.8 -4437755.5 67708974 2.8818907e+08 27961.064
260 6346104.5 -4437033.7 63845731 2.881891e+08 28466.852
270 6425840.1 -4436426.7 60970646 2.8818913e+08 28840.129
280 6479251.9 -4436021.1 59044759 2.8818917e+08 29086.075
[hmeij@cottontail lammps]$ ssh petaltail kill 9133
[hmeij@cottontail lammps]$ bsub < blcr_wrapper
Job <684> is submitted to queue <test>.
[hmeij@cottontail lammps]$ rm -f ../.lsbatch/*^C
[hmeij@cottontail lammps]$ bjobs
JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME SUBMIT_TIME
684 hmeij RUN test cottontail petaltail test Mar 29 13:48
petaltail
petaltail
petaltail
[hmeij@cottontail lammps]$ ll ../.lsbatch/
total 172
-rw------- 1 hmeij its 8589 Mar 29 13:48 1459272204.681.err
-rw------- 1 hmeij its 5686 Mar 29 13:48 1459272204.681.out
-rwx------ 1 hmeij its 4609 Mar 29 13:48 1459273700.684
-rw------- 1 hmeij its 9054 Mar 29 13:48 1459273700.684.err
-rw------- 1 hmeij its 53 Mar 29 13:48 1459273700.684.out
-rwxr--r-- 1 hmeij its 4270 Mar 29 13:48 1459273700.684.shell
lrwxrwxrwx 1 hmeij its 33 Mar 29 13:48 hostfile.681 -> /home/hmeij/.lsbatch/hostfile.684
-rw-r--r-- 1 hmeij its 40 Mar 29 13:48 hostfile.684
-rw-r--r-- 1 hmeij its 40 Mar 29 13:48 hostfile.tmp.684
[hmeij@cottontail lammps]$ less ../.lsbatch/*684.err
[hmeij@cottontail lammps]$ ssh petaltail ps -u hmeij
PID TTY TIME CMD
5762 ? 00:00:00 sshd
5763 pts/1 00:00:00 bash
9127 ? 00:00:00 cr_mpirun
9136 ? 00:00:34 lmp_mpi
9137 ? 00:00:34 lmp_mpi
9138 ? 00:00:34 lmp_mpi
9139 ? 00:00:34 lmp_mpi
9994 ? 00:00:00 res
10002 ? 00:00:00 1459273700.684
10005 ? 00:00:00 1459273700.684.
10039 ? 00:00:00 cr_restart
10051 ? 00:00:00 cr_mpirun
10052 ? 00:00:00 mpirun
10053 ? 00:00:00 blcr_watcher
10054 ? 00:00:00 sleep
10055 ? 00:00:00 sleep
10056 ? 00:00:01 cr_restart
10057 ? 00:00:01 cr_restart
10058 ? 00:00:02 cr_restart
10059 ? 00:00:02 cr_restart
10151 ? 00:00:00 sshd
10152 ? 00:00:00 ps
18559 pts/2 00:00:00 bash
[hmeij@cottontail lammps]$ tail -20 ../.lsbatch/1459272204.681.out
210 5527023.5 -4440257.5 93377214 2.8818906e+08 24569.109
220 5747387.7 -4439813.3 85432510 2.8818904e+08 25621.734
230 5939773.3 -4439214.4 78496309 2.8818904e+08 26539.282
240 6103647.2 -4438507.6 72587871 2.8818905e+08 27319.145
250 6238961.8 -4437755.5 67708974 2.8818907e+08 27961.064
260 6346104.5 -4437033.7 63845731 2.881891e+08 28466.852
270 6425840.1 -4436426.7 60970646 2.8818913e+08 28840.129
280 6479251.9 -4436021.1 59044759 2.8818917e+08 29086.075
290 6507681 -4435898.2 58019799 2.8818922e+08 29211.089
300 6512669 -4436124.7 57840251 2.8818927e+08 29222.575
310 6495904.7 -4436745.3 58445285 2.8818932e+08 29128.647
320 6459174.9 -4437776.1 59770495 2.8818937e+08 28937.93
330 6404322.4 -4439201.5 61749434 2.8818942e+08 28659.348
340 6333209 -4440973.8 64314930 2.8818947e+08 28301.927
350 6247685.4 -4443016.1 67400192 2.8818951e+08 27874.684
360 6149565.9 -4445228.2 70939709 2.8818956e+08 27386.465
370 6040609.2 -4447492.8 74869965 2.8818961e+08 26845.871
380 5922503.2 -4449683.5 79129981 2.8818965e+08 26261.166
390 5796854.1 -4451671.6 83661722 2.8818969e+08 25640.235
400 5665179.3 -4453332 88410367 2.8818972e+08 24990.519
cluster/148.1459281610.txt.gz · Last modified: by hmeij07
