#!/bin/sh
#
#ident "@(#)net_pingnet.shf   1.26     97/03/18 SMI"
#
#	Copyright 03/18/97 Sun Microsystems, Inc.  All Rights Reserved.
#

# Usage: net_pingnet network
#
# The purpose of this script is to determine whether or not it is
# possible to establish contact with one or more remote hosts on a
# given network.
#
# The 'network' parameter should be the broadcast address for
# a subnet that this host is connected to.  This script assumes
# that said parameter is a broadcast address.  To ping an
# ordinary host, use the script 'net_pinghost' instead.
#
# This actually requires that at least TWO replies be detected
# within the timeout period.  (The timeouts are fixed inside this
# script; the command line takes only the network address.)  Since a
# healthy localhost will always respond to a self-initiated broadcast
# through its own loopback interface, at least one other reply
# is required in order to ensure that a response is detected
# over the wire itself.  The same problem occurs with broadcast requests
# routed to other subnets through one or more gateways.  That is, the
# destination interface on the last gateway machine in the route
# may respond for itself through its own loopback interface.  Unless
# a second response is heard from another host on that subnet, it is
# not a valid test of whether or not contact has been established with
# a host actually attached to that net.
#
# As a side effect of the dual reply requirement, attempts to use this
# script to ping a single host address will fail.  This is nice, since
# the purpose of the script is to "ping a network", not a host.
# 
# This script is implemented by using the ping(1M) command with the
# -s option and a ridiculously long (essentially infinite) interval
# between requests.  Each ping is run under a timeout that is fixed
# inside this script; a ping that does not complete within that
# timeout is killed.  The output of every attempt is parsed and the
# number of distinct replying hosts is counted.  If no attempt, out of
# a fixed number of attempts, collects enough replies, the script fails.
#
##############################################



##############################################
#
# Boilerplate
#
##############################################


# BEGIN boiler_fault_sub
#ident "@(#)boiler_fault_sub   1.8     96/04/22 SMI"
# Copyright 04/22/96 Sun Microsystems, Inc.  All Rights Reserved.
# This boilerplate is source-include'ed at "compile-time".
# This boilerplate is the same in every fault probe script that is called
# as a subroutine from some other fault probe script or cluster
# transition script.  Such scripts assume that the caller has already
# set up the HA_ENV environment from the HA_ENV file.
# Actually, if the HA_ENV is NOT set up, this script will take care
# of doing it, see below -- some scripts depend upon this!
# XXX CAUTION: This means that this boilerplate knows the location of the
# HA_ENV file, so it cannot be moved without editing this boilerplate
# and "recompiling"!
# The major difference between this boilerplate and the other boilerplate
# file, boiler_fault_top, is that boiler_fault_top is for when the 
# surrounding script takes the name of the HA_ENV file as its first argument.


# Name of this script without its directory; used as a prefix in every
# log message below.  $0 is quoted so that a path containing whitespace
# is handed to basename as a single argument.
argv0=`basename "$0"`
LOGGER=/usr/bin/logger

# If the HA_ENV environment does not seem to have been set up,
# set it up anyway using the defaults.  This behavior is
# relied upon by some scripts, including:
#   haload hastat_net hastat_nfs fdl_fault_suspend
#   plus programs run via rsh including:
#   net_hosts_check fdl_consider_takeover nfs_islockedfs

if [ -z "$HA_CLUSTER" ]; then
	HA_CLUSTER=hadf
	# Use existing or default value of HA_ENV:
	HA_ENV=${HA_ENV:-/var/opt/SUNWhadf/${HA_CLUSTER}/ha_env}
	export HA_ENV

	if [ ! -r "$HA_ENV" ]; then
		# NOTE(review): the tag "nfs" differs from the tags used by the
		# other messages in this boilerplate -- confirm it is intended.
		$LOGGER -p local7.err -t "nfs" "$argv0:  Cannot determine correct HA environment"
		exit 1
	fi
	# Pull the HA environment settings into this shell.
	. "$HA_ENV"
	if [ $? -ne 0 ]; then
		$LOGGER -p local7.err -t "$HA_CLUSTER" "$argv0: Cannot include ha_env file $HA_ENV"
		exit 1
	fi

fi

# include hafmconfig parameters:
# (HA_FILES and HA_SLOGTAG are not set in this file; presumably they come
# from the ha_env file or from the caller's environment -- verify.)
if [ ! -r "$HA_FILES/hafmconfig" ]; then
	$LOGGER -p local7.err -t "$HA_SLOGTAG" "$argv0: Cannot read hafmconfig file $HA_FILES/hafmconfig"
	exit 1
fi
. "$HA_FILES/hafmconfig"
if [ $? -ne 0 ]; then
	$LOGGER -p local7.err -t "$HA_SLOGTAG" "$argv0: Cannot include hafmconfig file $HA_FILES/hafmconfig"
	exit 1
fi

# include HA utilities library
# (the name has no slash, so the shell locates it through PATH)
. utilities
if [ $? -ne 0 ]; then
	$LOGGER -p local7.err -t "$HA_SLOGTAG" "$argv0:  Cannot include HA utilities library!"
	exit 1
fi

# Some scripts depend upon this:
PROG=$argv0

# END boiler_fault_sub
#include_boiler_fault_sub


##############################################
#
# cleanup
#
##############################################
# Remove every temporary file this process created, i.e. every file in
# $HA_TMP named <PROG>.<anything>.<pid of this shell>.
# Globals read: HA_TMP, PROG.
# The variable parts are quoted so that a directory or script name
# containing whitespace is not split into separate words; the ".*."
# in the middle is left unquoted on purpose so that it still globs.
# rm -f keeps this silent when no such file exists.
cleanup()
{
	rm -f "$HA_TMP/${PROG}".*."$$"
}


##############################################
#
# main program
#
##############################################

# Script name, used in the usage message and in the temp-file names.
# $0 is quoted so a path containing whitespace stays one argument.
PROG=`basename "$0"`
USAGE="usage: $PROG network"
PKTSIZE=56

#
# arrange for cleanup on signals
#
establish_cleanup_handler

#
# check args: the network (broadcast) address is required and must not
# be empty.  An empty address would otherwise disappear from the ping
# command line and the packet size would be taken as the host.
#
if [ $# -lt 1 ] || [ -z "$1" ]; then
	logerr "$USAGE"
	cleanup
	exit 1
fi

NETWORK=$1

#
# start ping of the network under a timeout, using fdl_timedrun.
# The -n switch is to avoid delay/hang from translating ip addrs to hostnames.
# The -I switch is giving the interval between sending ping REQUEST packets,
# and we have set this interval to be essentially infinite, relative
# to the timeout that ping is being run under.  The
# net effect of that is that only one REQUEST packet is being sent out.
# If we were to send out more than one REQUEST packet, we would have to
# worry about this host itself responding more than once.
# Ping is going to wait until at least $MINREPLIES
# replies have been received.
# Some controllers (e.g., an hme) will cause the sending host to
# respond to itself more than once when logical network interfaces
# are configured up.  Other controllers do not have that behavior
# (e.g., le).  Therefore, we must set $MINREPLIES to be greater
# than the number of logical hosts plus 1 for the physical host.
# If fdl_timedrun times out the child ping process, it will kill the ping
# process with an INT signal.  ping catches the INT signal and will
# write out its statistics on stdout and then exit.
#
# XXX2.0 >2 hosts MINREPLIES must increase once we have more
# than two logical hosts, to be at least bigger than the
# number of logical hosts plus one for the physical host.
MINREPLIES=4
INTERVAL=32000
TIMEOUT=5
GIVEUP=60
TOTALATTEMPTS=6
J=1
while [ $J -le $TOTALATTEMPTS ]; do
	# Attempt number J: ping output goes to its own per-attempt file.
	# The address and the file name are quoted so that they always
	# reach ping and the shell as exactly one word each.
	fdl_timedrun -a $TIMEOUT fdl_timedrun -k INT $GIVEUP \
	    ping -ns -I $INTERVAL "$NETWORK" $PKTSIZE $MINREPLIES \
	    > "$HA_TMP/${PROG}.${J}.$$" 2>/dev/null
	# Count the number of replies that came from different hosts:
	# we need at least two, one from this host itself and the other
	# from any other host on the subnetwork.  Because some
	# controllers might cause this host itself to respond more
	# than once, we must eliminate duplicates (sort -u).
	# Look at all of the previous iterations too,
	# not just at this one.
	# NOTE(review): each earlier file is re-examined on its own;
	# presumably an earlier ping can still be running (up to $GIVEUP
	# seconds) and appending replies after its $TIMEOUT elapsed --
	# confirm against fdl_timedrun's -a option.
	K=1
	while [ $K -le $J ]; do
	    REPLYFILE="$HA_TMP/${PROG}.${K}.$$"
	    # Field 4 of a "bytes from" line is taken as the replier's address.
	    NUMREPLIES=`grep -i 'bytes from' "$REPLYFILE" |
	        awk '{print $4}' | sort -u | wc -l`
	    # $NUMREPLIES is intentionally unquoted in the numeric test:
	    # wc may pad its output with blanks, and word splitting
	    # strips that padding.
	    if [ -n "$NUMREPLIES" ] && [ $NUMREPLIES -ge 2 ]; then
		logdeb "OK - Able to contact network $NETWORK"
		cleanup
		exit 0
	    fi
	    K=`expr $K + 1`
	done

	J=`expr $J + 1`
done

# No attempt produced replies from two distinct hosts.
logerr "Unable to contact network $NETWORK"
cleanup
exit 1
