#!/bin/tcsh -f
# JLdL 28Jan14.
#
# Copyright (C) 2005-2014 by Jorge L. deLyra <delyra@fma.if.usp.br>.
# Modified by Sybele G. P. Groff <syb@fma.if.usp.br>.
# This program may be copied and/or distributed freely. See the
# _ terms and conditions in /usr/share/doc/<package>/copyright.
#
# This program correctly installs the root crontab on all the nodes
# _ of a cluster, using chroot; it may either just fix the crontabs
# _ which are already there or edit and install a new global one on
# _ all the nodes, overwriting the ones that are already there.
#
# NOTE: this program does not handle error messages
# _ in languages other than English.
#
# Arrange for an interrupt to branch to the clean-up code, so that
# _ the filesystems of the node being processed get unmounted.
onintr cleanexit
#
# Remember the name under which this script was invoked;
# _ it is used as a prefix in all the messages.
set name = `basename $0`
#
# Mode flags and their default values:
# _ bindmnt: 1 selects local bind mounts, 0 selects loopback NFS mounts;
# _ editflg: 1 selects the editing of a global crontab for all nodes,
# _ 0 selects just fixing the crontabs which are already there.
set bindmnt = 1
set editflg = 0
#
# State for the configuration file: its default location, and a flag
# _ which is up while the argument of the -C option is still pending.
set confile = "/etc/cluster.conf"
set conflag = 0
#
# Process the command-line arguments; the :q modifier hands over each
# _ argument as a single word, so that arguments containing blanks or
# _ wildcard characters are neither split nor expanded again.
foreach cla ( $argv:q )
    #
    # Detect options, that is, arguments starting with a dash; this uses
    # _ the built-in pattern match rather than an external pipeline, and
    # _ the argument is quoted so that something like -C is taken as a
    # _ plain string, not as a file inquiry operator.
    if ( "$cla" =~ -* ) then
	#
	# If we got here with the argument flag up, then the previous
	# _ option was -C and it did not get its argument: an error.
	if ( $conflag == 1 ) then
	    echo "${name}: ERROR: option -C requires an argument"
	    exit 1
	endif
	#
	# Now process the options.
	switch ( "$cla" )
	case "-h":
	case "--help":
	    #
	    # Print a usage message.
	    echo "usage: $name [-C <config>] [-B|-N] [-F|-E]"
	    echo "       -C: use alternate configuration file <config>"
	    echo "       -B: use local 'bind' mounts for the node filesystems"
	    echo "       -N: use loopback NFS mounts for the node filesystems"
	    echo "       -F: simply fix the existing crontabs at each node"
	    echo "       -E: edit a global crontab and install it on all nodes"
	    echo "       correctly install the root crontab on all the"
	    echo "       nodes of a cluster, using chroot to access"
	    echo "       the node filesystems; in order to get the"
	    echo "       details run 'man $name'"
	    exit 0
	    breaksw
	case "-C":
	case "--Config-file":
	    #
	    # Raise the flag; the next argument is the configuration file.
	    set conflag = 1
	    breaksw
	case "-B":
	case "--bind-mounts":
	    #
	    # Raise the bind-mounts flag.
	    set bindmnt = 1
	    breaksw
	case "-N":
	case "--NFS-mounts":
	    #
	    # Lower the bind-mounts flag.
	    set bindmnt = 0
	    breaksw
	case "-F":
	case "--fix-crontabs":
	    #
	    # Lower the edit flag.
	    set editflg = 0
	    breaksw
	case "-E":
	case "--edit-globally":
	    #
	    # Raise the edit flag.
	    set editflg = 1
	    breaksw
	default:
	    #
	    # Print an error message.
	    echo "${name}: ERROR: unknown option $cla; try -h to get help"
	    exit 1
	    breaksw
	endsw
    #
    # Process non-option arguments.
    else
	#
	# Get the arguments of options.
	if ( $conflag == 1 ) then
	    #
	    # Set the configuration file; the quotes keep a name
	    # _ containing blanks from being split into several words.
	    set confile = "$cla"
	    #
	    # Lower the flag.
	    set conflag = 0
	#
	# This script takes no arguments other than the one of -C.
	else
	    #
	    # Print an error message.
	    echo "${name}: ERROR: this program takes no arguments"
	    exit 1
	endif
    endif
end
#
# If we got here with the argument flag up, then -C was the
# _ last thing on the command line: there is an error.
if ( $conflag == 1 ) then
    echo "${name}: ERROR: option -C requires an argument"
    exit 1
endif
#
# Read in the configuration file, which must define the variables
# _ nick_name, virt_node, cluster_root and mount_points; give up
# _ right away if the file cannot be read.
if ( ! -r $confile ) then
    echo "${name}: ERROR: cannot read configuration file $confile"
    exit 1
endif
source $confile
#
# Sanity check: refuse to go on unless each one of the mandatory
# _ variables was in fact defined by the configuration file.
if ( ${?nick_name} == 0 ) then
    echo "${name}: ERROR: nick_name not defined in configuration file"
    exit 1
endif
if ( ${?virt_node} == 0 ) then
    echo "${name}: ERROR: virt_node not defined in configuration file"
    exit 1
endif
if ( ${?cluster_root} == 0 ) then
    echo "${name}: ERROR: cluster_root not defined in configuration file"
    exit 1
endif
if ( ${?mount_points} == 0 ) then
    echo "${name}: ERROR: mount_points not defined in configuration file"
    exit 1
endif
#
# Fill in the optional variables which the configuration file left
# _ undefined: the server defaults to the short name of this host,
# _ the number of mount retries to 12 and the retry timeout to 10.
if ( ${?cluster_server} == 0 ) then
    set cluster_server = `hostname -s`
endif
if ( ${?mount_retries} == 0 ) set mount_retries = 12
if ( ${?retry_timeout} == 0 ) set retry_timeout = 10
#
# Count the digits used in the node numbers, taking
# _ the number of the virtual node as the model.
set ndig = `echo -n $virt_node | wc -c`
#
# Assemble a pattern with one [0-9] class per digit; it is used
# _ further on both as a wildcard, to list the node directories,
# _ and as part of the regular expression for the mount checks.
set node_digs = "[0-9]"
@ idig = 1
while ( $ndig > $idig )
    set node_digs = "${node_digs}[0-9]"
    @ idig++
end
#
# The separator line used in the progress reports.
set sep = "--------------------------------------------------------------------------------"
#
# Where the library is, as seen from the server.
set libdir = /usr/lib/cluster
#
# Where the library is, as seen from within the nodes.
set cldir = /lib/cluster
#
# The current time, in seconds; it goes into the names of temporary
# _ files and is handed over to the anchor script run in the nodes.
set nsecs = `date +%s`
#
# Build the extended regular expression which picks, from the contents
# _ of /proc/mounts, any mounts made under the node directories; each
# _ type of mount adds one alternative to the expression.
#
# Start with the loopback NFS system mounts; do not use the hostname
# _ of the server here, in private networks the hostname associated
# _ to the other interface may be shown instead.
set etarg = ":${cluster_root}/[^ ]* $cluster_root/$node_digs/[^ ]* nfs"
#
# This is for the case of the use of local bind mounts; note that this
# _ assumes that ext2, ext3 or ext4 filesystems are being used; this
# _ alternative must be appended to the previous one, not replace it,
# _ or else leftover NFS mounts would go by undetected.
set etarg = "$etarg|^/dev/[^ ]* $cluster_root/$node_digs/[^ ]* ext"
#
# The proc filesystem mount must be treated separately, since it is
# _ of a different type and will not be picked by the regexps above.
set etarg = "$etarg|^proc $cluster_root/$node_digs/proc proc"
#
# The sys filesystem mount must be treated separately, since it is of
# _ a different type and will not be picked by the regexps above.
set etarg = "$etarg|^sysfs $cluster_root/$node_digs/sys sysfs"
#
# The pts filesystem mount must be treated separately, since it is of
# _ a different type and will not be picked by the regexps above.
set etarg = "$etarg|^devpts $cluster_root/$node_digs/dev/pts devpts"
#
# Check for spurious mounts before starting; if there are any, list
# _ them and give up, since it is not safe to go on over them.
grep -E -q "$etarg" /proc/mounts
if ( $status == 0 ) then
    echo "${name}: ERROR: there are spurious chroot mounts:"
    grep -E "$etarg" /proc/mounts
    exit 1
endif
#
# Go to the root of the cluster; the relative paths used from
# _ here on are all meant to be resolved from this directory.
cd $cluster_root
#
# Edit a temporary master crontab file, taken from the virtual
# _ node, and distribute it globally to all the nodes.
if ( $editflg ) then
    #
    # Define the user to be acted on.
    set theuser = root
    #
    # Define the crontab file of the virtual node, used as the master.
    set master = var/$virt_node/spool/cron/crontabs/$theuser
    #
    # Refuse to go on without a readable master crontab, or else an
    # _ empty crontab would end up being installed on all the nodes.
    if ( ! -r $master ) then
	echo "${name}: ERROR: cannot read master crontab $cluster_root/$master"
	exit 1
    endif
    #
    # Check the editor before creating any temporary files, so
    # _ that nothing is left behind if it turns out to be unusable.
    if ( ! $?EDITOR ) then
	echo "${name}: ERROR: EDITOR environment variable not set"
	exit 1
    endif
    #
    # Get the name of the editing command from the environment, without
    # _ any options, and verify that it is available for execution.
    set ecomm = `echo "$EDITOR" | cut -d' ' -f1`
    which "$ecomm" >& /dev/null
    if ( $status != 0 ) then
	echo "${name}: ERROR: environment editor not usable: $EDITOR"
	exit 1
    endif
    #
    # Create a private temporary directory to work in; since this runs
    # _ as root, writing to predictable file names directly in /tmp
    # _ would allow other users to redirect the output via symlinks.
    set tmpdir = `mktemp -d /tmp/root.crontab.XXXXXXXXXX`
    if ( "$tmpdir" == "" ) then
	echo "${name}: ERROR: cannot create a temporary directory in /tmp"
	exit 1
    endif
    #
    # Define the name of the temporary master crontab file.
    set tmpfile = $tmpdir/crontab
    #
    # Split the master crontab into a head, with the first three lines,
    # _ and a body, with the rest; only the body is offered for editing
    # _ (presumably the head is the header which crontab writes by
    # _ itself at the top of the file; TODO confirm).
    head -3 $master >! $tmpfile.HEAD
    tail -n +4 $master >! $tmpfile.BODY
    #
    # Now edit the body of the temporary master crontab file.
    $EDITOR $tmpfile.BODY
    #
    # Reconstruct the whole master crontab file.
    cat $tmpfile.HEAD $tmpfile.BODY >! $tmpfile
    #
    # Loop over the nodes.
    foreach node ( $node_digs )
	#
	# Distribute the new file to all the nodes.
	cp -pf $tmpfile var/$node/spool/cron/crontabs/$theuser
    end
    #
    # Remove the temporary directory with everything in it, which
    # _ includes any backup files that the editor may have created.
    rm -rf $tmpdir
endif
#
# The list of the masking mounts to be made over each node:
# _ /var/run: to avoid daemons dying off on the server;
# _ /var/yp/binding: to avoid spurious errors with NIS.
#
# IMPORTANT WARNING: the entries here must _never_ have ending slashes,
# _ or this script may break the server's system, by covering the
# _ server's /run directory with a masking mount.
#
set mask_mounts = ( var/run var/yp/binding )
#
# Make sure that the masking directories for /var/run and for
# _ /var/yp/binding exist at the root of the cluster; if one of them
# _ is missing it is created, with a link to its README file in the
# _ library, and anything else found under the same name is first
# _ moved out of the way.
foreach fakedir ( fake-var-run fake-var-yp-binding )
    if ( ! -d $fakedir ) then
	echo "${name}: WARNING: missing $cwd/$fakedir directory, making one"
	if ( -e $fakedir ) mv -f $fakedir $fakedir.WRONG
	mkdir $fakedir
	ln -s $libdir/$fakedir.README $fakedir/README
    endif
end
#
# Main loop over the nodes: the node-number pattern is expanded as a
# _ wildcard in the current directory, which is the root of the cluster,
# _ so that there is one pass for each node directory found; each pass
# _ mounts the filesystems of the node (if bind-mounts is in effect),
# _ runs the anchor script inside a chroot to the node, and finally
# _ unmounts whatever was mounted.
#
# NOTE(review): the cleanexit label, which is the target both of the
# _ interrupt handler and of the mount-failure goto, lives inside this
# _ loop; it looks like the loop then just carries on with the next
# _ node after the clean-up, rather than exiting, and a jump to the
# _ label from before the loop would find $node undefined; TODO confirm
# _ that this is the intended behavior.
foreach node ( $node_digs )
    #
    # Print out a progress report separator.
    echo $sep
    echo current node is: $nick_name$node
    #
    # If bind-mounts is in effect, then mount all local filesystems
    # _ here, rather than within the anchoring program.
    if ( $bindmnt ) then
	#
	# Preparation: mount filesystems for the node.
	echo -n "  Mounting filesystems for the node:\n "
	#
	# Loop over the filesystems to be mounted; note the inclusion
	# _ of the /proc, /sys and /dev/pts filesystems; the masking
	# _ mounts come last, so that they go over the system mounts.
	foreach fs ( proc sys dev/pts $mount_points $mask_mounts )
	    #
	    # Write out some progress report.
	    echo -n " /$fs"
	    #
	    # Start an error counter, which counts the failed
	    # _ attempts at mounting this one filesystem.
	    @ ec = 0
	    #
	    # An error-handling label; failed mounts are retried from here.
	    again1:
	    #
	    # Mount the filesystems, taking care of each type in turn; in all
	    # _ cases the output of mount, including its error output, is kept
	    # _ in the error variable, with any parentheses turned into
	    # _ underscores (presumably so that the text cannot upset the
	    # _ expressions in which it is used below; TODO confirm); an empty
	    # _ error variable is what signals success.
	    #
	    # Take care of the /proc, /sys and /dev/pts filesystems: it is faster
	    # _ to make bind mounts, rather than fresh proc, sys or pts mounts.
	    if ( $fs == proc || $fs == sys || $fs == dev/pts ) then
		set error = `mount -n --bind /$fs $cluster_root/$node/$fs |& \
				sed -e 's|[()]|_|g' | cat`
	    #
	    # Take care of the system mounts; the pipeline gives $fs back only
	    # _ if it is exactly one of the words in mount_points; the source
	    # _ of the bind mount is the directory of the node under the
	    # _ corresponding filesystem at the root of the cluster.
	    else if ( $fs == `echo "$mount_points" | tr ' ' '\n' | grep "^$fs"'$'` ) then
		set error = `mount -n --bind $cluster_root/$fs/$node $cluster_root/$node/$fs |& \
				sed -e 's|[()]|_|g' | cat`
	    #
	    # Take care of the masking mounts.
	    else
		#
		# IMPORTANT WARNING: there must be no ending slashes here;
		# _ must make sure that this mount point is not a symlink;
		# _ the -L prefix makes the directory test apply to the
		# _ mount point itself, rather than to the target of a link.
		if ( -Ld $cluster_root/$node/$fs ) then
		    echo -n "(mask)"
		    #
		    # The masking directory has the name of the mount point
		    # _ with dashes for slashes, so var/run gives fake-var-run.
		    set fn = `echo -n $fs | tr '/' '-'`
		    set error = `mount -n --bind $cluster_root/fake-$fn $cluster_root/$node/$fs |& \
				    sed -e 's|[()]|_|g' | cat`
		else
		    #
		    # Not a real directory: report and skip this masking mount.
		    echo -n "(nope)"
		    set error = ""
		endif
	    endif
	    #
	    # Handle mount errors.
	    if ( "$error" != "" ) then
		#
		# On the first failure, end the progress-report line.
		if ( $ec == 0 ) echo ""
		echo -n "  WARNING: cannot mount filesystem $cluster_root/$node/${fs}: "
		#
		# Increment the error counter.
		@ ec = $ec + 1
		#
		# Try again up to mount_retries times at retry_timeout-second intervals.
		if ( $ec <= $mount_retries ) then
		    echo "  trying again in $retry_timeout seconds..."
		    echo "  (error message was: $error)"
		    sleep $retry_timeout
		    goto again1
		else
		    #
		    # Out of retries: skip the chroot for this node and branch
		    # _ to the code which unmounts what was already mounted.
		    echo "  WARNING: failed $mount_retries times, quitting..."
		    goto cleanexit
		endif
	    endif
	end
	#
	# End the progress-report line.
	echo ""
    endif
    #
    # Do a chroot to the node and execute the anchor script, passing
    # _ all the necessary command-line arguments; note the inclusion
    # _ of the /proc, /sys and /dev/pts filesystems in the case when
    # _ bind-mounts is not in effect; when it is in effect, empty lists
    # _ of system mounts and of masking mounts are passed instead, since
    # _ the mounts were already made above.
    if ( $bindmnt ) then
	chroot $cluster_root/$node $cldir/multi-crontab-chroot.anchor \
	    $cluster_server $cluster_root "" "" \
	    $mount_retries $retry_timeout $nsecs
    else
	chroot $cluster_root/$node $cldir/multi-crontab-chroot.anchor \
	    $cluster_server $cluster_root "proc sys dev/pts $mount_points" "$mask_mounts" \
	    $mount_retries $retry_timeout $nsecs
    endif
    #
    # The clean-exit label; this is reached by falling through from
    # _ above, by the goto in the mount error handling and by the
    # _ interrupt handler set up at the top of the script.
    cleanexit:
    #
    # If bind-mounts is in effect, then unmount here all the
    # _ local filesystems which were mounted before.
    if ( $bindmnt ) then
	#
	# Finalization: unmount filesystems for the node.
	echo -n "  Unmounting filesystems for the node:\n "
	#
	# Loop over the mounted filesystems, in the
	# _ reverse of the order used for mounting.
	foreach fs ( $mask_mounts $mount_points dev/pts sys proc )
	    #
	    # Write out some progress report.
	    echo -n " /$fs"
	    #
	    # Start an error counter, which counts the failed
	    # _ attempts at unmounting this one filesystem.
	    @ ec = 0
	    #
	    # An error-handling label; failed unmounts are retried from here.
	    again2:
	    #
	    # Take care of the masking mounts; the pipeline gives $fs back
	    # _ only if it is exactly one of the words in mask_mounts.
	    if ( $fs == `echo "$mask_mounts" | tr ' ' '\n' | grep "^$fs"'$'` ) then
		#
		# IMPORTANT WARNING: there must be no ending slashes here;
		# _ must make sure that this mount point is not a symlink;
		# _ this is the same test used when mounting, so that only
		# _ the masking mounts which were attempted are unmounted.
		if ( -Ld $cluster_root/$node/$fs ) then
		    #
		    # Try a straight unmount.
		    set error = `umount -n $cluster_root/$node/$fs |& sed -e 's|[()]|_|g' | cat`
		else
		    set error = ""
		endif
	    else
		#
		# Try a straight unmount.
		set error = `umount -n $cluster_root/$node/$fs |& sed -e 's|[()]|_|g' | cat`
	    endif
	    #
	    # Handle unmount errors; note the _reversed_ grep searches in order to
	    # _ filter out some error messages which require no further action,
	    # _ namely those telling that the mount point was not found or was
	    # _ not mounted; this depends on the English text of the messages.
	    #
	    # NOTE(review): the grep patterns are within single quotes, inside
	    # _ backquotes, inside double quotes; confirm that the variables in
	    # _ them get expanded as intended in this context.
	    if ( "$error" != "" && \
		"`echo $error | grep -v 'umount: $cluster_root/$node/${fs}: not found'`" != "" && \
		"`echo $error | grep -v 'umount: $cluster_root/$node/${fs}: not mounted'`" != "" ) then
		#
		# On the first failure, end the progress-report line.
		if ( $ec == 0 ) echo ""
		echo -n "  WARNING: cannot unmount filesystem $cluster_root/$node/${fs}: "
		#
		# Increment the error counter.
		@ ec = $ec + 1
		#
		# Try again up to mount_retries times at retry_timeout-second intervals.
		if ( $ec <= $mount_retries ) then
		    echo "  trying again in $retry_timeout seconds..."
		    echo "  (error message was: $error)"
		    sleep $retry_timeout
		    goto again2
		else
		    #
		    # If waiting does not work, do a lazy unmount,
		    # _ and then go on to the next filesystem.
		    echo "  failed $mount_retries times, giving up."
		    echo "  WARNING: doing a lazy unmount for $cluster_root/$node/$fs..."
		    umount -nl $cluster_root/$node/$fs
		endif
	    endif
	end
	#
	# End the progress-report line.
	echo ""
    endif
    #
    # Check whether there are any stray pid files left in the masking
    # _ directory for /var/run, printing a warning for each one found.
    find $cluster_root/fake-var-run/ -name \*.pid -exec \
	echo "${name}: WARNING: stray PID file found:" \{\} \;
end
#
# Print a final separator.
echo $sep
#
# Go back to the original directory.
cd -
#
# Check and warn about spurious mounts at the end; this only lists
# _ any chroot mounts which were left over and does not change the
# _ exit status; grep -E reads /proc/mounts directly, which avoids
# _ both the extra cat process and the obsolescent egrep command.
grep -E -q "$etarg" /proc/mounts
if ( $status == 0 ) then
    echo "${name}: WARNING: there are chroot mounts left over:"
    grep -E "$etarg" /proc/mounts
endif
