User Tools

Site Tools


cluster:216


Back

Warewulf, ohpc 2.4

There are other pages to view but this is my latest …

stateless

First we create the templates network.ww and ifcfg-eth1.ww

This node n59 is bare metal with just a 16G usb stick attached to system board (DOM) to hold operating system. Legacy boot.

# network.ww
# short node names
NETWORK=yes
HOSTNAME=%{NODENAME}

# ifcfg-eth1.ww
DEVICE=eth1
BOOTPROTO=static
ONBOOT=yes
HWADDR=%{NETDEVS::ETH1::HWADDR}
IPADDR=%{NETDEVS::ETH1::IPADDR}
NETMASK=%{NETDEVS::ETH1::NETMASK}
NETWORK=%{NETDEVS::ETH1::NETWORK}

# import those templates

# Register both templates with Warewulf. The --name given here is the
# handle the deploy script later passes to --fileadd; --path is the
# destination path recorded for the file.
tmpl_dir=/opt/ohpc/admin/images/rocky8.5/root/wwtemplates

wwsh file import "${tmpl_dir}/network.ww" \
--name=network.ww --path=/etc/sysconfig/network

wwsh file import "${tmpl_dir}/ifcfg-eth1.ww" \
--name=ifcfg-eth1.ww --path=/etc/sysconfig/network-scripts/ifcfg-eth1

Next we build a deploy script, first the input file

# deploy.txt
# add nodes to image: nodename hwaddr-eth0 ipaddr-eth0 hwaddr-eth1 ipaddr-eth1
n59 0C:C4:7A:4F:0B:7C 192.168.102.69 0C:C4:7A:4F:0B:7D 10.10.102.69

And the script to deploy

#!/bin/bash
#
# deploy.sh - (re)register one node with Warewulf and prepare it for PXE boot.
#
# Usage: ./deploy.sh node hwaddr0 ipaddr0 hwaddr1 ipaddr1
#   node     node name (e.g. n59)
#   hwaddr0  hardware (MAC) address of eth0
#   ipaddr0  IP address of eth0
#   hwaddr1  hardware (MAC) address of eth1
#   ipaddr1  IP address of eth1
# The five values are the fields of one line in deploy.txt.
#
# Exactly one of the three provisioning sections below (stateless, stateful,
# golden image) should be active; the other two stay commented out.
#
# FIX vnfs & bootstrap for appropriate node
# CHECK disk to format sda?
#
# deploy a chroot server via PXE golden image transfer
# templates are always in stateless CHROOT/rocky8.5/root/wwtemplates
# look at header deploy.txt

# Validate the argument count before touching any Warewulf object. A usage
# error is reported on stderr with a non-zero status so callers can detect it.
if [ "$#" -ne 5 ]; then
	echo "missing args: node hwaddr0 ipaddr0 hwaddr1 ipaddr1 " >&2
	exit 1
fi

node=$1
hwaddr0=$2
ipaddr0=$3
hwaddr1=$4
ipaddr1=$5

# Start from a clean slate: drop any existing object with this name.
# The result is deliberately not checked; presumably the delete reports an
# error on a first deploy when the node does not exist yet.
wwsh object delete "$node" -y
# NOTE(review): the reason for the pause is not documented; presumably it
# lets the delete settle before the node is recreated.
sleep 3

# NOTE(review): --network is given the same value as --netmask on both
# interfaces. It looks like it should be the network address instead -
# confirm before changing, the ifcfg-eth1.ww template consumes this value.
wwsh node new "$node" --netdev=eth0 \
--hwaddr="$hwaddr0" --ipaddr="$ipaddr0" \
--netmask=255.255.0.0  --network=255.255.0.0 -y

wwsh node set "$node" --netdev=eth1 \
--hwaddr="$hwaddr1" --ipaddr="$ipaddr1" \
--netmask=255.255.0.0  --network=255.255.0.0 -y

# Files delivered to the node; network.ww and ifcfg-eth1.ww are the
# templates imported earlier with "wwsh file import".
wwsh provision set "$node" --fileadd hosts,munge.key -y
wwsh provision set "$node" --fileadd passwd,shadow,group -y
wwsh provision set "$node" --fileadd network.ww,ifcfg-eth1.ww -y


# stateless, comment out for golden image
# wwsh provision set "$node" --bootstrap=4.18.0-348.12.2.el8_5.x86_64 -y
# wwsh provision set "$node" --vnfs=rocky8.5 -y

# stateful, comment out for golden image and stateless
# install grub2 in $CHROOT first, rebuild vnfs
# wwsh provision set --filesystem=gpt-n59  "$node" -y
# wwsh provision set --bootloader=sda  "$node" -y

# uncomment for golden image, comment out stateless and stateful
wwsh provision set "$node" --bootstrap=4.18.0-348.12.2.el8_5.x86_64 -y
wwsh provision set "$node" --vnfs=n59.chroot -y
wwsh provision set --filesystem=gpt-n59  "$node" -y
wwsh provision set --bootloader=sda  "$node" -y


# Clear bootlocal for this deployment; the reminder below tells the operator
# to set it to normal once the first boot has completed.
wwsh provision set --bootlocal=UNDEF "$node" -y
echo "for stateful or golden image, after first boot issue"
echo "wwsh provision set --bootlocal=normal $node -y"

# Regenerate PXE and DHCP configuration, then restart the services
# involved in provisioning.
wwsh pxe update
wwsh dhcp update
systemctl restart dhcpd
systemctl restart httpd
systemctl restart tftp.socket
# crontab will shutdown these services at 5pm


# execute the script
./deploy.sh n59 ...

Next PXE boot the node and we'll observe a stateless launch (ie no hard disk)

[root@n59 ~]# cat /etc/redhat-release 
Rocky Linux release 8.5 (Green Obsidian)

[root@n59 ~]# df -h
Filesystem                     Size  Used Avail Use% Mounted on
tmpfs                           16G  1.3G   15G   9% /
devtmpfs                        16G     0   16G   0% /dev
tmpfs                           16G     0   16G   0% /dev/shm
tmpfs                           16G  9.8M   16G   1% /run
tmpfs                           16G     0   16G   0% /sys/fs/cgroup
192.168.102.250:/opt/intel     708G  209G  499G  30% /opt/intel
192.168.102.250:/opt/ohpc/pub  708G  209G  499G  30% /opt/ohpc/pub
tmpfs                          3.2G     0  3.2G   0% /run/user/0

stateful

To go stateful we need grub installed

# grub2 has to be present inside the image before going stateful
yum --installroot=/opt/ohpc/admin/images/rocky8.5 install grub2 
# marker file, used to recognise this image revision on the node
touch /opt/ohpc/admin/images/rocky8.5/root/VNFS-TEST-WITH-GRUB2

# build out stateful if desired
# CHROOT is not defined anywhere in these notes. Keep a value already set in
# this shell, otherwise fall back to the image used by the other commands.
# NOTE(review): assumes the build-out targets that same rocky8.5 image - confirm.
CHROOT=${CHROOT:-/opt/ohpc/admin/images/rocky8.5}
dnf --installroot "$CHROOT" install yum
dnf --installroot "$CHROOT" groupinstall "Server with GUI"
dnf --installroot "$CHROOT" install iptables-services
dnf --installroot "$CHROOT" clean all

# rebuild vnfs
wwvnfs --chroot /opt/ohpc/admin/images/rocky8.5

# partition: start from the GPT example under /etc/warewulf/filesystem/examples
cp /etc/warewulf/filesystem/examples/gpt_example.cmds \
/etc/warewulf/filesystem/gpt.cmds

# customize: per-node copy; absolute paths so this works from any directory
cp /etc/warewulf/filesystem/gpt.cmds \
/etc/warewulf/filesystem/gpt-n59.cmds

# edit the file and change swap to
mkpart primary linux-swap 513MiB 1025MiB
mkpart primary ext4 1025MiB 100%

# edit deploy script and change (name only, without the .cmds suffix)
--filesystem=gpt-n59

Re-execute the script so services start and dhcp/pxe files are updated.
PXE boot the node again.
Upon boot we view stateful partitions, then set bootlocal to normal.

[root@n59 ~]# df -h
Filesystem                     Size  Used Avail Use% Mounted on
/dev/sda4                      7.3G  1.3G  5.7G  19% /
devtmpfs                        16G     0   16G   0% /dev
tmpfs                           16G     0   16G   0% /dev/shm
tmpfs                           16G  9.8M   16G   1% /run
tmpfs                           16G     0   16G   0% /sys/fs/cgroup
/dev/sda2                      486M   55M  402M  13% /boot
192.168.102.250:/opt/intel     708G  209G  499G  30% /opt/intel
192.168.102.250:/opt/ohpc/pub  708G  209G  499G  30% /opt/ohpc/pub
tmpfs                          3.2G     0  3.2G   0% /run/user/0

[root@n59 ~]# fdisk -l
Device       Start      End  Sectors  Size Type
/dev/sda1     2048     6143     4096    2M BIOS boot
/dev/sda2     6144  1050623  1044480  510M EFI System
/dev/sda3  1050624  2099199  1048576  512M Linux swap
/dev/sda4  2099200 31277055 29177856 13.9G Linux filesystem


[root@n59 ~]# cat /etc/redhat-release 
Rocky Linux release 8.5 (Green Obsidian)

[root@n59 ~]# wwsh provision set --bootlocal=normal n59 -y
[root@n59 ~]# touch BOOTLOCAL=NORMAL
[root@n59 ~]# reboot

# observe that new file

golden image

After stateful imaging we touch another file on imaged server then build a golden image. The touching of this new file represents customizing and testing the node prior to creating golden image. So for complex designs we might put the node temporarily on the internet and install nvidia drivers and toolkit for example. And perhaps install software that will optimize itself based on resources found (like gromacs/lammps probing gpu models for proper architecture). Then we build a golden image when everything works as expected. Hard to do in a CHROOT environment.

[root@n59 ~]# touch VNFS-TEST-WITH-GRUB2-GOLDEN-IMAGE

[root@n59 ~]# ll
total 4
-rw-r--r-- 1 root root    0 Apr 20 14:22 VNFS-TEST
-rw-r--r-- 1 root root    0 Apr 28 08:09 VNFS-TEST-WITH-GRUB2
-rw-r--r-- 1 root root    0 Apr 28 15:55 VNFS-TEST-WITH-GRUB2-GOLDEN-IMAGE
drwxr-xr-x 2 root root 4096 Apr 28 08:06 wwtemplates

# install software in /usr/local or /opt/ohpc/pub
# customize and test functionality on node, then

# on master node
cd /var/chroots  # or where you keep images

# be sure to edit /usr/libexec/warewulf/wwmkchroot/golden-system.tmpl
# add any excludes necessary (like any NFS mounts present, or umount)
# view /etc/warewulf/vnfs.conf
# the HYBRIDIZE section is commented out

# /var/[log|spool|run] need to be removed from
# /usr/libexec/warewulf/wwmkchroot/golden-system.tmpl

# try on compute nodes
systemctl enable slurmd

# Build the golden-image chroot at /var/chroots/n59.chroot with the
# golden-system template. SOURCEADDR=n59 is passed in the environment
# (presumably the host the template copies the filesystem from - confirm
# in golden-system.tmpl). stdout is also saved to /var/chroots/n59.log.
SOURCEADDR=n59 wwmkchroot golden-system \
/var/chroots/n59.chroot | tee /var/chroots/n59.log

# for large images you might want to exclude say /usr/share (leave locale)
# create a tar archive with vnfs image to overlay after imaging

# rebuild
[root@master]# touch /var/chroots/n59.chroot/root/VNFS-TEST-WITH-GRUB2-GOLDEN-IMAGE-CHROOT
[root@master]# wwvnfs --chroot /var/chroots/n59.chroot

[root@master]# wwsh vnfs list
VNFS NAME            SIZE (M)   ARCH       CHROOT LOCATION
n59.chroot           571.1      x86_64     /var/chroots/n59.chroot
rocky8.5             553.7      x86_64     /opt/ohpc/admin/images/rocky8.5

# bootstrap remains the same so edit deploy script
# uncomment for golden image, comment out stateless and stateful
 wwsh provision set $node --bootstrap=4.18.0-348.12.2.el8_5.x86_64 -y
 wwsh provision set $node --vnfs=n59.chroot -y
 wwsh provision set --filesystem=gpt-n59  $node -y
 wwsh provision set --bootloader=sda  $node -y

# execute deploy script, pxe boot node
# and there is the golden image deployed

[root@n59 ~]# ll
total 4
-rw-r--r-- 1 root root    0 Apr 20 14:22 VNFS-TEST
-rw-r--r-- 1 root root    0 Apr 28 08:09 VNFS-TEST-WITH-GRUB2
-rw-r--r-- 1 root root    0 Apr 28 15:55 VNFS-TEST-WITH-GRUB2-GOLDEN-IMAGE
-rw-r--r-- 1 root root    0 Apr 29 10:03 VNFS-TEST-WITH-GRUB2-GOLDEN-IMAGE-CHROOT
drwxr-xr-x 2 root root 4096 Apr 28 08:06 wwtemplates


# future boots from local disk
wwsh provision set --bootlocal=normal n59 -y

Awesome. You also have a backup now. Image away. And no need for a dhcp server to always be at the ready. Linux will fix journal file system errors 99% of the time if rebooted from say a utility power loss.
Thank you Warewulf team.

I also see there are EFI and EFI + NVME filesystem examples in /etc/warewulf/filesystem/examples

logger

For some reason, after the vnfs has been compiled and deployed, /dev/log is a socket file that generates permission denied errors. There is a manual fix to apply; maybe put it in /etc/rc.local in the future.

# Keep the existing /dev/log entry under a new name, then make /dev/log a
# symlink to /run/systemd/journal/dev-log; the "logger test" below checks
# that messages reach the journal afterwards.
cd /dev
mv log log-orig
ln -s /run/systemd/journal/dev-log log

logger test
journalctl --since=-1m
-- Logs begin at Thu 2022-05-12 10:46:49 EDT, end at Thu 2022-05-12 10:52:17 EDT. --
May 12 10:52:17 n59 root[3748]: test

queues left

Nodes in these queues will not be imaged

  • hp12 n[1-n32] Too old and failing fast, centos 6
  • mwgpu n[33-n37] K20 gpus EOL, no cuda driver updates anymore, centos7
  • mw256fd n[38-n45] When warewulf starts imaging we disappear in a loop of “disks not ready”, centos 6


Back

cluster/216.txt · Last modified: 2022/06/07 16:07 by hmeij07