**[[cluster:0|Back]]**
  
==== OpenHPC page 2 ====
  
Additional tools for the OpenHPC environment. First add these two lines to SMS and all compute nodes. Patch CHROOT as well.
  
  * Skipping SSH restrictions for users (a sketch of the passwordless setup is below)
    * Set up passwordless logins: ''ssh-keygen -t rsa''
    * ''cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys''
    * Collect all server fingerprints and make a global ''known_hosts'' file in ''~/.ssh/''
  * Skipping Lustre installation
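
A minimal sketch of that passwordless setup for root, assuming RSA keys and that n29 and n31 are the only compute nodes:

<code>

# generate a key pair (accept the default path, empty passphrase)
ssh-keygen -t rsa

# authorize the public key for logins on any node sharing this home directory
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 600 ~/.ssh/authorized_keys

# collect the compute node host keys into a shared known_hosts file
ssh-keyscan n29 n31 >> ~/.ssh/known_hosts

</code>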
  
<code>
  
yum -y groupinstall ohpc-nagios
yum -y --installroot=/data/ohpc/images/centos7.2 install nagios-plugins-all-ohpc nrpe-ohpc
chroot /data/ohpc/images/centos7.2 systemctl enable nrpe
perl -pi -e "s/^allowed_hosts=/# allowed_hosts=/" /data/ohpc/images/centos7.2/etc/nagios/nrpe.cfg
echo "nrpe 5666/tcp # NRPE" >> /data/ohpc/images/centos7.2/etc/services
echo "nrpe : 192.168.1.249 : ALLOW" >> /data/ohpc/images/centos7.2/etc/hosts.allow
echo "nrpe : ALL : DENY" >> /data/ohpc/images/centos7.2/etc/hosts.allow
chroot /data/ohpc/images/centos7.2 /usr/sbin/useradd -c "NRPE user for the NRPE service" \
       -d /var/run/nrpe -r -g nrpe -s /sbin/nologin nrpe
mv /etc/nagios/conf.d/services.cfg.example /etc/nagios/conf.d/services.cfg
mv /etc/nagios/conf.d/hosts.cfg.example /etc/nagios/conf.d/hosts.cfg
perl -pi -e "s/HOSTNAME1/n29/ || s/HOST1_IP/192.168.102.38/" /etc/nagios/conf.d/hosts.cfg
perl -pi -e "s/HOSTNAME2/n31/ || s/HOST2_IP/192.168.102.40/" /etc/nagios/conf.d/hosts.cfg
perl -pi -e "s/ \/bin\/mail/\/usr\/bin\/mailx/g" /etc/nagios/objects/commands.cfg
perl -pi -e "s/nagios\@localhost/root\@ohpc0-test/" /etc/nagios/objects/contacts.cfg
chkconfig nagios on
systemctl start nagios
chmod u+s `which ping`
echo "relayhost = 192.168.102.42" >> /etc/postfix/main.cf
echo "root:           hmeij@wes..." >> /etc/aliases
newaliases
systemctl restart postfix
  
# recreate vnfs and reimage nodes, see page1
wwvnfs -y --chroot /data/ohpc/images/centos7.2
/root/deploy.sh
  
</code>
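
A quick sanity check from the SMS once nagios is running; this is only a sketch, and the plugin path may differ slightly under the OHPC packaging:

<code>

# verify the nagios object configuration parses cleanly
/usr/sbin/nagios -v /etc/nagios/nagios.cfg

# ask the nrpe daemon on a compute node for its version (proves port 5666 and hosts.allow are right)
/usr/lib64/nagios/plugins/check_nrpe -H n29

</code>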
  * Open port 80 in iptables but restrict it severely (the Nagios web login uses plain-text passwords); a sketch follows below
  * http://localhost/nagios
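
A sketch of the kind of iptables restriction meant above; the 10.10.10.10 source address is a hypothetical admin workstation and must be replaced with the real one:

<code>

# allow the Nagios web interface only from one trusted workstation, drop everyone else
iptables -A INPUT -p tcp -s 10.10.10.10 --dport 80 -j ACCEPT
iptables -A INPUT -p tcp --dport 80 -j DROP

</code>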

  * On to Ganglia

<code>

yum -y groupinstall ohpc-ganglia
yum -y --installroot=/data/ohpc/images/centos7.2 install ganglia-gmond-ohpc
# import passwd, shadow and group files for new user account ganglia
mv /etc/ganglia/gmond.conf /etc/ganglia/gmond.conf-orig
cp /opt/ohpc/pub/examples/ganglia/gmond.conf /etc/ganglia/
# use provision IP
perl -pi -e "s/<sms>/192.168.1.249/" /etc/ganglia/gmond.conf
cp /etc/ganglia/gmond.conf /data/ohpc/images/centos7.2/etc/ganglia/
echo "gridname MySite" >> /etc/ganglia/gmetad.conf
systemctl enable gmond
systemctl enable gmetad
systemctl start gmond
systemctl start gmetad
systemctl restart httpd

chroot /data/ohpc/images/centos7.2 systemctl enable gmond

# recreate vnfs and reimage nodes, see page1
wwvnfs -y --chroot /data/ohpc/images/centos7.2
/root/deploy.sh

</code>

  * http://localhost/ganglia

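If the Ganglia graphs stay empty, the daemons can be polled directly, assuming the default ports and that ''nc'' is installed; both should answer with an XML dump of metrics:

<code>

# gmond listens on 8649, gmetad serves its aggregated XML on 8651
nc localhost 8649 | head
nc localhost 8651 | head

</code>
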
  * Not installing ClusterShell; pdsh is already installed
    * add the compute hostnames to /etc/hosts.pdsh
    * ''echo export WCOLL=/etc/hosts.pdsh >> /root/.bashrc''

<code>

[root@ohpc0-test ~]# pdsh uptime
n31:  10:44:25 up 19:14,  1 user,  load average: 0.00, 0.01, 0.05
n29:  10:44:25 up 19:19,  0 users,  load average: 0.00, 0.01, 0.05

</code>

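pdsh can also take an explicit host list with ''-w'' instead of relying on WCOLL, which is handy for one-off checks; for example, verifying that gmond is active on the freshly reimaged nodes:

<code>

# run a command on just the named nodes
pdsh -w n29,n31 date

# check a service set up earlier on this page
pdsh -w n29,n31 "systemctl is-active gmond"

</code>
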
  * Skip ''mrsh'' installation
  * Skip ''Genders'' installation
  * Skip ''ConMan'' installation (IPMI serial consoles)
  * Skip ''rsyslog'' forwarding of compute node logs to the SMS
  * Redefine ''ControlMachine'' in /etc/slurm/slurm.conf
    * use eth0, not the public address on eth1
    * and in CHROOT/etc/slurm/slurm.conf
    * import the file back into the database (see the sketch below)
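
If slurm.conf is provisioned through the Warewulf file store, re-importing it and restarting the controller might look like the sketch below; the exact ''wwsh'' subcommand is an assumption and worth checking against the Warewulf docs.

<code>

# refresh the datastore copy of slurm.conf from the edited file (subcommand name assumed)
wwsh file resync slurm.conf

# restart the controller on the SMS so the new ControlMachine/ControlAddr take effect
systemctl restart slurmctld

</code>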

Ran into a Slurm config problem here on the compute nodes. When issuing ''systemctl status slurm'' the output revealed a failed start: it reported that it could not find /var/run/slurmctl.pid ... that is the wrong PID file, since compute nodes should only start slurmd. This is how it finally got fixed:

<code>

# ON COMPUTE NODES, that is in CHROOT

# Removed file /etc/init.d/slurm (it references the slurmctld pid file; compute nodes only run slurmd)
mv /etc/init.d/slurm /root/

# Made the following link so systemd starts slurmd
ln -s /usr/lib/systemd/system/slurmd.service /etc/systemd/system/multi-user.target.wants/slurmd.service
[root@n31 ~]# ls -l /etc/systemd/system/multi-user.target.wants/slurmd.service
lrwxrwxrwx 1 root root 38 Mar 30 14:05 /etc/systemd/system/multi-user.target.wants/slurmd.service -> /usr/lib/systemd/system/slurmd.service

# now it starts properly
Mar 31 12:41:05 n31.localdomain systemd[1]: Starting Slurm node daemon...
Mar 31 12:41:05 n31.localdomain systemd[1]: PID file /var/run/slurmd.pid not readable (yet?) after start.
Mar 31 12:41:05 n31.localdomain systemd[1]: Started Slurm node daemon.

</code>

  * Recreate the vnfs and reimage the whole kaboodle

Link to my previous eval of Slurm and job throughput testing: [[cluster:134|Slurm]]. Next, submit some test jobs (a minimal example follows below); they are explained on that page too.

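A minimal smoke test, assuming the ''test'' partition shown below and that munge and slurmd are running on n29 and n31:

<code>

# both nodes should report as idle in the test partition
sinfo

# run a trivial command on two nodes
srun -p test -N 2 hostname

# or submit a short batch job and watch the queue
sbatch -p test --wrap="hostname; sleep 30"
squeue

</code>
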
Here are my current settings in slurm.conf for OpenHPC.

 +<code>
 +ClusterName=linux                                                                                        
 +ControlMachine=ohpc0-slurm                                                                               
 +ControlAddr=192.168.1.249                                                                                
 +SlurmUser=slurm                                                                                          
 +SlurmctldPort=6815-6817        <---                                                                          
 +SlurmdPort=6818                                                                                          
 +AuthType=auth/munge                                                                                      
 +StateSaveLocation=/etc/slurm/state    <---                                                                   
 +SlurmdSpoolDir=/etc/slurm/spool      <---
 +SwitchType=switch/none
 +MpiDefault=none
 +SlurmctldPidFile=/var/run/slurmctld.pid
 +SlurmdPidFile=/var/run/slurmd.pid
 +ProctrackType=proctrack/pgid
 +FirstJobId=101               <---
 +MaxJobCount=999999           <---
 +SlurmctldTimeout=300
 +SlurmdTimeout=300
 +InactiveLimit=0
 +MinJobAge=300
 +KillWait=30
 +Waittime=0
 +SchedulerType=sched/builtin <---
 +SchedulerPort=7321          <---
 +SelectType=select/linear
 +FastSchedule=1
 +SlurmctldDebug=3
 +SlurmdDebug=3
 +JobCompType=jobcomp/none
 +PropagateResourceLimitsExcept=MEMLOCK
 +SlurmdLogFile=/var/log/slurm.log
 +SlurmctldLogFile=/var/log/slurmctld.log
 +Epilog=/etc/slurm/slurm.epilog.clean
 +ReturnToService=1
 +NodeName=ohpc0-slurm NodeAddr=192.168.1.249 
 +NodeName=n29 NodeAddr=192.168.102.38
 +NodeName=n31 NodeAddr=192.168.102.40 
 +PartitionName=test Nodes=n29,n31 Default=YES MaxTime=INFINITE STATE=UP
 +
 +</code>

Define CPUs, Cores, ThreadsPerCore, etc. later; run with Slurm's self-discovered values for now. A hypothetical explicit node line is sketched below.
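
For later reference, an explicit node definition could look like the line below; the socket, core, thread and memory numbers are made up for illustration and must match the real hardware (''slurmd -C'' on a node prints the discovered values).

<code>

NodeName=n29 NodeAddr=192.168.102.38 Sockets=2 CoresPerSocket=4 ThreadsPerCore=1 RealMemory=16000 State=UNKNOWN

</code>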

[[cluster:154|OpenHPC page 1]] - page 2 - [[cluster:156|OpenHPC page 3]] - [[cluster:160|OpenHPC page 4]]

\\
**[[cluster:0|Back]]**