User Tools

Site Tools


cluster:148

This is an old revision of the document!



Back

BLCR Checkpoint in OL3

  • This page concerns PARALLEL mpirun jobs only
  • For installation details and a description of what it does, see BLCR
[hmeij@cottontail lammps]$ bsub < blcr_wrapper
Job <681> is submitted to queue <test>.
[hmeij@cottontail lammps]$ bjobs
JOBID   USER    STAT  QUEUE      FROM_HOST   EXEC_HOST   JOB_NAME   SUBMIT_TIME
681     hmeij   PEND  test       cottontail              test       Mar 29 13:23
[hmeij@cottontail lammps]$ bjobs
JOBID   USER    STAT  QUEUE      FROM_HOST   EXEC_HOST   JOB_NAME   SUBMIT_TIME
681     hmeij   RUN   test       cottontail  petaltail   test       Mar 29 13:23
                                             petaltail
                                             petaltail
                                             petaltail

[hmeij@cottontail lammps]$ tail ~/.lsbatch/1459272204.681.out                   
     160    4062132.4   -4439564.1 1.4618689e+08 2.8818933e+08    17552.743     
     170    4395340.9   -4440084.6 1.3417499e+08 2.8818925e+08    19150.394     
     180    4711438.8   -4440426.5 1.2277977e+08 2.8818918e+08    20665.317     
     190    5007151.9   -4440573.2 1.1211925e+08 2.8818913e+08    22081.756     
     200    5279740.3     -4440516 1.0229219e+08 2.8818909e+08    23386.523     
     210    5527023.5   -4440257.5     93377214 2.8818906e+08    24569.109      
     220    5747387.7   -4439813.3     85432510 2.8818904e+08    25621.734      
     230    5939773.3   -4439214.4     78496309 2.8818904e+08    26539.282      
     240    6103647.2   -4438507.6     72587871 2.8818905e+08    27319.145      
     250    6238961.8   -4437755.5     67708974 2.8818907e+08    27961.064      
[hmeij@cottontail lammps]$ ll /sanscratch/checkpoints/681
total 30572                                              
-rw------- 1 hmeij its     8704 Mar 29 13:28 1459272204.681.err
-rw------- 1 hmeij its     5686 Mar 29 13:28 1459272204.681.out
-rw-r--r-- 1 hmeij its     2652 Mar 29 13:28 au.inp
-rw-r--r-- 1 hmeij its        0 Mar 29 13:28 auout
-rw-r--r-- 1 hmeij its    38310 Mar 29 13:28 auu3
-r-------- 1 hmeij its   289714 Mar 29 13:28 chk.9127
-rw-r--r-- 1 hmeij its 21342187 Mar 29 13:28 data.Big11AuSAMInitial
-rw-r--r-- 1 hmeij its  9598629 Mar 29 13:28 henz.dump
drwx------ 3 hmeij its       46 Mar 29 13:28 ompi_global_snapshot_9134.ckpt
-rw-r--r-- 1 hmeij its       16 Mar 29 13:23 pwd.9127
[hmeij@cottontail lammps]$ ssh petaltail ps -u hmeij
  PID TTY          TIME CMD
 5762 ?        00:00:00 sshd
 5763 pts/1    00:00:00 bash
 9104 ?        00:00:00 res
 9110 ?        00:00:00 1459272204.681
 9113 ?        00:00:00 1459272204.681.
 9127 ?        00:00:00 cr_mpirun
 9128 ?        00:00:00 blcr_watcher
 9133 ?        00:00:00 cr_mpirun
 9134 ?        00:00:00 mpirun
 9135 ?        00:00:00 sleep
 9136 ?        00:05:55 lmp_mpi
 9137 ?        00:06:07 lmp_mpi
 9138 ?        00:05:52 lmp_mpi
 9139 ?        00:05:53 lmp_mpi
 9347 ?        00:00:00 sleep
 9369 ?        00:00:00 sshd
 9370 ?        00:00:00 ps
18559 pts/2    00:00:00 bash
[hmeij@cottontail lammps]$ tail ~/.lsbatch/1459272204.681.out
     190    5007151.9   -4440573.2 1.1211925e+08 2.8818913e+08    22081.756
     200    5279740.3     -4440516 1.0229219e+08 2.8818909e+08    23386.523
     210    5527023.5   -4440257.5     93377214 2.8818906e+08    24569.109
     220    5747387.7   -4439813.3     85432510 2.8818904e+08    25621.734
     230    5939773.3   -4439214.4     78496309 2.8818904e+08    26539.282
     240    6103647.2   -4438507.6     72587871 2.8818905e+08    27319.145
     250    6238961.8   -4437755.5     67708974 2.8818907e+08    27961.064
     260    6346104.5   -4437033.7     63845731 2.881891e+08    28466.852
     270    6425840.1   -4436426.7     60970646 2.8818913e+08    28840.129
     280    6479251.9   -4436021.1     59044759 2.8818917e+08    29086.075
[hmeij@cottontail lammps]$ ssh petaltail kill 9133

[hmeij@cottontail lammps]$ bsub < blcr_wrapper
Job <684> is submitted to queue <test>.       
[hmeij@cottontail lammps]$ rm -f ../.lsbatch/*^C
[hmeij@cottontail lammps]$ bjobs                
JOBID   USER    STAT  QUEUE      FROM_HOST   EXEC_HOST   JOB_NAME   SUBMIT_TIME
684     hmeij   RUN   test       cottontail  petaltail   test       Mar 29 13:48
                                             petaltail                          
                                             petaltail                          
                                             petaltail                          
[hmeij@cottontail lammps]$ ll ../.lsbatch/                                      
total 172                                                                       
-rw------- 1 hmeij its 8589 Mar 29 13:48 1459272204.681.err                     
-rw------- 1 hmeij its 5686 Mar 29 13:48 1459272204.681.out                     
-rwx------ 1 hmeij its 4609 Mar 29 13:48 1459273700.684                         
-rw------- 1 hmeij its 9054 Mar 29 13:48 1459273700.684.err                     
-rw------- 1 hmeij its   53 Mar 29 13:48 1459273700.684.out                     
-rwxr--r-- 1 hmeij its 4270 Mar 29 13:48 1459273700.684.shell                   
lrwxrwxrwx 1 hmeij its   33 Mar 29 13:48 hostfile.681 -> /home/hmeij/.lsbatch/hostfile.684
-rw-r--r-- 1 hmeij its   40 Mar 29 13:48 hostfile.684                                     
-rw-r--r-- 1 hmeij its   40 Mar 29 13:48 hostfile.tmp.684                                 
[hmeij@cottontail lammps]$ less ../.lsbatch/*684.err
[hmeij@cottontail lammps]$ ssh petaltail ps -u hmeij
  PID TTY          TIME CMD                         
 5762 ?        00:00:00 sshd                        
 5763 pts/1    00:00:00 bash                        
 9127 ?        00:00:00 cr_mpirun                   
 9136 ?        00:00:34 lmp_mpi                     
 9137 ?        00:00:34 lmp_mpi                     
 9138 ?        00:00:34 lmp_mpi                     
 9139 ?        00:00:34 lmp_mpi                     
 9994 ?        00:00:00 res                         
10002 ?        00:00:00 1459273700.684              
10005 ?        00:00:00 1459273700.684.             
10039 ?        00:00:00 cr_restart                  
10051 ?        00:00:00 cr_mpirun                   
10052 ?        00:00:00 mpirun                      
10053 ?        00:00:00 blcr_watcher                
10054 ?        00:00:00 sleep                       
10055 ?        00:00:00 sleep                       
10056 ?        00:00:01 cr_restart                  
10057 ?        00:00:01 cr_restart                  
10058 ?        00:00:02 cr_restart                  
10059 ?        00:00:02 cr_restart                  
10151 ?        00:00:00 sshd                        
10152 ?        00:00:00 ps                          
18559 pts/2    00:00:00 bash                        

[hmeij@cottontail lammps]$ tail -20 ../.lsbatch/1459272204.681.out
     210    5527023.5   -4440257.5     93377214 2.8818906e+08    24569.109
     220    5747387.7   -4439813.3     85432510 2.8818904e+08    25621.734
     230    5939773.3   -4439214.4     78496309 2.8818904e+08    26539.282
     240    6103647.2   -4438507.6     72587871 2.8818905e+08    27319.145
     250    6238961.8   -4437755.5     67708974 2.8818907e+08    27961.064
     260    6346104.5   -4437033.7     63845731 2.881891e+08    28466.852
     270    6425840.1   -4436426.7     60970646 2.8818913e+08    28840.129
     280    6479251.9   -4436021.1     59044759 2.8818917e+08    29086.075
     290      6507681   -4435898.2     58019799 2.8818922e+08    29211.089
     300      6512669   -4436124.7     57840251 2.8818927e+08    29222.575
     310    6495904.7   -4436745.3     58445285 2.8818932e+08    29128.647
     320    6459174.9   -4437776.1     59770495 2.8818937e+08     28937.93
     330    6404322.4   -4439201.5     61749434 2.8818942e+08    28659.348
     340      6333209   -4440973.8     64314930 2.8818947e+08    28301.927
     350    6247685.4   -4443016.1     67400192 2.8818951e+08    27874.684
     360    6149565.9   -4445228.2     70939709 2.8818956e+08    27386.465
     370    6040609.2   -4447492.8     74869965 2.8818961e+08    26845.871
     380    5922503.2   -4449683.5     79129981 2.8818965e+08    26261.166
     390    5796854.1   -4451671.6     83661722 2.8818969e+08    25640.235
     400    5665179.3     -4453332     88410367 2.8818972e+08    24990.519


Back

cluster/148.1459281610.txt.gz · Last modified: 2016/03/29 16:00 by hmeij07