#! /bin/sh
#
#ident "@(#)tivoli_probe.shi   1.11     01/03/27 SMI"
#
#	Copyright 03/27/01 Sun Microsystems, Inc.  All Rights Reserved.
#

#
# Usage: tivoli_probe <instance name> <inst_runs_remotely>
#
# Started up in the background via pmfd in tivoli_fm_start
# during reconfiguration only if a) instance is running locally and
# local probes are configured on, or if b) instance is running remotely
# and remote probes are configured on.  If instance is running remotely,
# $2 is set to 'y'.  We need to know this so we can get the wping binary
# from another location.

#
# The probe runs outside the method context, so the framework binary
# directories must be appended to PATH by hand.
#
PATH="${PATH}:/opt/SUNWhadf/bin:/opt/SUNWhadf/fault_progs"

# Command line: $1 is the instance name; $2 is 'y' when the instance is
# running on the sibling node (see the header comment above).
INST_NAME="$1"
INST_RUNS_REMOTELY="$2"

#
#pragma ident "@(#)ds_boiler	1.3	98/09/15 SMI"
#
# common boiler for HA data services
#
#


# Logging boilerplate shared by the HA data services: script name,
# logger command, and the syslog facility/tag used by the log helpers.
ARGV0=`basename $0`
LOGGER=logger
HA_SLOGFACILITY=`haget -f syslog_facility`
HA_SLOGTAG=hadf
prog_path=`dirname $0`

# source in ha-services common utilities -- presumably this defines
# logerr, lognotice, source_env_file, get_config_param, set_inst_name,
# is_member, ha_svc_not_running used below (TODO confirm).  NOTE: the
# file is found via PATH, which at this point does NOT yet include
# $prog_path -- keep the ordering of these statements.
. hads_utilities

# add the ha-service directory to the path
PATH=${prog_path}:${PATH}

#
# for use by subsequent hactl command, get hostnames of local and remote hosts
#
LOCALHOST=`uname -n`

#! /bin/sh 
#
#	Copyright 04/30/99 Sun Microsystems, Inc.  All Rights Reserved.
#
#pragma ident       "@(#)do_service 1.12     00/08/25 SMI"
#
#

# Prefix for every syslog message id logged by this script.
SYSLOG_PREFIX="SUNWcluster.ha.tivoli"

# HA data-service configuration file; replicated on both servers.
HATIV_CONFIG_FILE=/etc/opt/SUNWsctiv/hadsconf

#
# Hand the config file to the parser.  Bail out early when the file is
# missing, since nothing below can work without it.
#
[ -f "$HATIV_CONFIG_FILE" ] || {
	logerr "${SYSLOG_PREFIX}.4027" `gettext "$HATIV_CONFIG_FILE doesn't exist"`
	exit 1
}

# source_env_file logs its own error message on failure, so there is no
# need to log another one here before exiting.
source_env_file $HATIV_CONFIG_FILE || exit 1

#
# Time to wait for SIGTERM to stop a process.
# This should be in the config file.
#
STOP_TIMEOUT=15
#
# bundle_do_svc <action>
#
# is called for each instance
#
# We must start oserv clients after the oserv server has started,
# and we must stop oserv clients before the oserv server is stopped.
# Therefore we start/stop the server and start_net/stop_net the clients.
#
bundle_do_svc ()
{
	# $1: method name -- start, start_net, stop, stop_net, abort,
	# abort_net, fm_start, fm_stop or fm_check.  Called once per
	# instance; note that every arm below exits the whole script
	# rather than returning to the caller.
	action=$1
	prefix="$SYSLOG_PREFIX.$action"

	# oserv and CLI commands are influenced by the WLOCALHOST
	# environment variable.  Set this variable to the Managed Node
	# name of this instance of the oserv daemon (the CONF_DIR .db
	# directory name with the suffix stripped).

	WLOCALHOST=`basename $_INST_CONF_DIR | sed 's/.db$//'`
	export WLOCALHOST

# Put the instance's private Tivoli libraries in front of the system
# locations, preserving any inherited LD_LIBRARY_PATH at the end.
LD_LIBRARY_PATH="${_INST_PRIV_TIV_LIB}/solaris2:/usr/openwin/lib:/usr/lib:/usr/ucblib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" 
export LD_LIBRARY_PATH

	case $action in

	'start' | 'start_net')

		# start the oserv server in start method first
		# start the oserv clients in start_net method

		if [ "${_INST_PRIV_TIV_OSERV_TYPE}" = "server" -a \
		     "$action" = "start_net" ]; then
			# if this is an oserv server in start_net script, no-op
			exit 0
		fi

		if [ "${_INST_PRIV_TIV_OSERV_TYPE}" = "client" -a \
		     "$action" = "start" ]; then
			# if this is an oserv client in the start method, no-op
			# (clients are started later, by start_net)
			exit 0
		fi

		# The start script is specified in the hadsconf file.

		if [ ! -x ${_INST_START} ]; then
			logerr "$prefix.4000" \
				`gettext "<${_INST_START}> is not executable."`
			exit 1
		fi

		# All three instance directories must exist before starting.
		if [ ! -d "$_INST_PRIV_TIV_BIN" -o \
		     ! -d "$_INST_PRIV_TIV_LIB" -o \
		     ! -d "$_INST_CONF_DIR" ] ; then
			logerr "$prefix.4028" `gettext "Instance ${_INST_NAME} is not configured properly.  Check the path for CONF_DIR, PRIV_TIV_BIN and PRIV_TIV_LIB"`
			exit 1
		fi

		# The process monitor facility calls the start program,
		# passing to it the instance-specific information it needs.
		# Note that we're using pmf to start/stop, but not to probe.

		pmfadm -c ${_INST_NAME} \
		    /bin/sh -c "${_INST_START} ${_INST_PRIV_TIV_BIN} ${_INST_PRIV_TIV_LIB} ${_INST_LOGICAL_HOST} ${_INST_PORT} ${_INST_CONF_DIR} >/dev/null 2>&1"

		if [ $? -ne 0 ]; then
			logerr "$prefix.4001" \
	`gettext "pmfadm failed to start Tivoli instance ${_INST_NAME}"`
			exit 1
		else
			lognotice "$prefix.2000" \
			`gettext "Started Tivoli instance ${_INST_NAME}"`
		fi

	;;

	'stop' | 'abort')

		# if this is an oserv server, stop/abort it
		# if this is an oserv client, no-op (clients are stopped
		# earlier, by stop_net/abort_net)

		if [ "${_INST_PRIV_TIV_OSERV_TYPE}" = "client" ]; then
			# no-op
			exit 0
		fi

		# remove oserv server from pmfd's queue and then kill it:
		# try SIGTERM with a grace period first, SIGKILL if that
		# fails.
		pmfadm -s ${_INST_NAME} -w  ${STOP_TIMEOUT} TERM || \
		     pmfadm -s ${_INST_NAME} KILL
		if [ $? -ne 0 ]; then
			logerr "$prefix.4002" \
		`gettext "pmfadm failed to stop oserv instance ${_INST_NAME}"`
			exit 1
		else
			lognotice "$prefix.2001" \
			    `gettext "Stopped oserv instance ${_INST_NAME}"`
		fi

		# XXX Do we need to kill the probe oserv in abort method?
	;;

	'stop_net' | 'abort_net')

		# if this is an oserv client, stop/abort it
		# if this is an oserv server, no-op (servers are stopped
		# later, by stop/abort)

		if [ "${_INST_PRIV_TIV_OSERV_TYPE}" = "server" ]; then
			# no-op
			exit 0
		fi

		# delete from queue, but don't kill
		pmfadm -s ${_INST_NAME}
		if [ $? -ne 0 ]; then
			logerr "$prefix.4003" \
		`gettext "pmfadm failed to delete ${_INST_NAME} from queue"`
			exit 1
		fi

		# WLOCALHOST must be set to the Managed Node name of this
		# instance of the oserv daemon.

		# Ask the oserv for a clean shutdown first.
		${_INST_PRIV_TIV_BIN}/solaris2/bin/odadmin shutdown
		res=$?
		# If normal shutdown fails try killing by alternate methods.
		# NOTE(review): the first command here uses "pmfadm -k"
		# while every other stop path uses "pmfadm -s" -- confirm
		# against pmfadm(1M) whether this difference is intended.
		if [ $res -ne 0 ]; then
			pmfadm -k ${_INST_NAME} -w  ${STOP_TIMEOUT} TERM || \
					pmfadm -s ${_INST_NAME} KILL
			res=$?
		fi

		if [ $res -ne 0 ]; then
			logerr "$prefix.4004" \
		`gettext "odadmin failed to stop instance ${_INST_NAME}."`
			exit 1
		else
			lognotice "$prefix.2002" \
			    `gettext "Stopped Tivoli instance ${_INST_NAME}"`
		fi
	;;

	'fm_start')

		# pmf starts tivoli_probe
		# tivoli_probe runs until tivoli_fm_stop kills it.
		# Don't start probe if diskset is in maintenance mode.

		# If this Tivoli instance's diskset is in maint mode, exit now.
		MAINT=`haget -f is_maint -h ${_INST_LOGICAL_HOST}`
		if [ "$MAINT" = "1" ]; then
			exit 0
		fi

		# start the probe oserv if REMOTE=y and it is not running.
		# The probe oserv is a locally-running oserv used to wping
		# a remote instance (see the fm_start comments below).
		if [ "$_INST_PROBE_REMOTE_1" = "y" ] ; then
			if [ -z "${_INST_PRIV_TIV_PROBE_DIR}" ] ; then
				logerr "$prefix.4005" \
     `gettext "the probe information is not correct for instance ${_INST_NAME}"`
			else
				TIV_DB=`ls -d ${_INST_PRIV_TIV_PROBE_DIR}/*.db`
				# "already running" check: look for an oserv
				# process whose arguments end in this .db path.
				if [ -z "`ps -ef | grep -w oserv | grep \" $TIV_DB$\"`" ] ; then
					WLOCALHOST=`basename $TIV_DB | sed 's/.db$//'`
					export WLOCALHOST
					${_INST_START} \
					${_INST_PRIV_TIV_PROBE_DIR}/bin \
					${_INST_PRIV_TIV_PROBE_DIR}/lib \
					"`uname -n`" ${_INST_PORT} \
					${TIV_DB}
					if [ $? -ne 0 ]; then
						# NOTE(review): "Tivolo" typo
						# in the message below; left
						# as-is pending a catalog fix.
						logerr "$prefix.4006" \
   `gettext "Failed to start the probe oserv for Tivolo instance ${_INST_NAME}"`
					else
						lognotice "$prefix.2003" \
	`gettext "Started the probe oserv for Tivoli instance ${_INST_NAME}"`
					fi
				fi
			fi
		fi

		# Start probe program only if a) instance is running
		# locally and local probes are configured on, or if
		# b) instance is running remotely and remote probes are
		# configured on.  Otherwise just exit.
		# If instance is running remotely, 2nd arg to probe
		# program is 'y'; otherwise it's 'n'.  Probe program
		# needs to know this so it can figure out where to get
		# the wping binary.

		INST_RUNS_REMOTELY=n
		is_member "${_INST_LOGICAL_HOST}" "$MASTERED_LOGICAL_HOSTS"
		if [ $? -ne 0 ]; then
			# Server is not running locally.
			INST_RUNS_REMOTELY=y
		fi

		if [ "$INST_RUNS_REMOTELY" = "n"  -a  \
		     "$_INST_PROBE_LOCAL_1" != "y" ]; then
			# no work to do
			exit 0
		fi
		if [ "$INST_RUNS_REMOTELY" = "y"  -a  \
		     "$_INST_PROBE_REMOTE_1" != "y" ]; then
			# no work to do
			exit 0
		fi

		# Launch the probe under pmf with its own nametag,
		# "<instance>.probe".
		pmfadm -c ${_INST_NAME}.probe \
			/bin/sh -c "${_INST_PROBE_PROG_1} ${_INST_NAME} $INST_RUNS_REMOTELY >/dev/null 2>&1"

		if [ $? -ne 0 ]; then
			logerr "$prefix.4007" \
`gettext "pmfadm failed to start Tivoli probe for instance ${_INST_NAME}"`
			exit 1
		else
			lognotice "$prefix.2004" \
		`gettext "Started Tivoli probe instance ${_INST_NAME}.probe"`
		fi
	;;

	'fm_stop')

		# If probe not running, do nothing
		ha_svc_not_running ${_INST_NAME}.probe && exit 0

		# pmf kills tivoli_probe: SIGTERM with a grace period
		# first, SIGKILL if that fails.
		pmfadm -s ${_INST_NAME}.probe -w ${STOP_TIMEOUT} TERM || \
			pmfadm -s ${_INST_NAME}.probe KILL
		if [ $? -ne 0 ]; then
			logerr "$prefix.4008" \
`gettext "pmfadm failed to stop Tivoli probe instance ${_INST_NAME}.probe"`
			exit 1
		else
			lognotice "$prefix.2005" \
		`gettext "Stopped Tivoli probe instance ${_INST_NAME}.probe"`
		fi
	;;

	'fm_check')

		# If the oserv server is running on this machine and
		# we're taking over a logical host with an oserv client,
		# then we could do a quick wping here of the server to
		# make sure it's OK.
		#
		# However if we're taking over a logical host with the
		# oserv server, there's not a whole lot we can check here.
		#
		# Note: Our current probing mechanism (wping) depends on
		# the oserv server being up.
		#
		# Just exit 0.

		exit 0
	;;

	esac

	# Unrecognized actions fall through here and succeed silently.
	exit 0
}
#include_boiler

# Used for error message logging
set_inst_name ${INST_NAME}

prefix="$SYSLOG_PREFIX.probe"

# Both command-line arguments are mandatory (see header comment).
if [ -z "$INST_NAME"  -o  -z "$INST_RUNS_REMOTELY" ]; then
	logerr "$prefix.4009" \
		`gettext "Usage: $ARGV0 <instance> <inst_runs_remotely>"`
	exit 1
fi

# Logical hosts currently mastered by this node; used to decide whether
# the instance runs locally and by the restart helpers in the main loop.
MASTERED_LOGICAL_HOSTS="`haget -f mastered`"

# Fetch the per-instance settings from the parsed config file.  Each
# required value aborts the probe with its own message id when missing.

# required parameter: port of the oserv daemon
TIV_PORT=`get_config_param $INST_NAME PORT`
[ -n "$TIV_PORT" ] || {
	logerr "$prefix.4010" \
	    `gettext "PORT value not set for instance $INST_NAME"`
	exit 1
}

# required parameter: path of the instance's .db directory
TIV_DB=`get_config_param $INST_NAME CONF_DIR`
[ -n "$TIV_DB" ] || {
	logerr "$prefix.4011" \
    `gettext "CONF_DIR value not set for instance $INST_NAME"`
	exit 1
}

# The Managed Node name is the .db directory name minus its suffix.
MANAGED_NODE=`basename $TIV_DB | sed 's/.db$//'`

# required parameter: "server" or "client"
TIV_TYPE=`get_config_param $INST_NAME PRIV_TIV_OSERV_TYPE`
[ -n "$TIV_TYPE" ] || {
	logerr "$prefix.4012" \
    `gettext "TIV_OSERV_TYPE value not set for instance $INST_NAME"`
	exit 1
}

TIV_HOST=`get_config_param $INST_NAME LOGICAL_HOST`
# parser requires this to be set
# parser requires this to be set

#
# Locate the Tivoli binaries, libraries and oserv database used for
# probing.  A remotely running instance is probed through the local
# "probe" oserv under PRIV_TIV_PROBE_DIR; a locally running instance is
# probed with the binaries from its own diskset.
#
if [ "$INST_RUNS_REMOTELY" = "y" ]; then

	# required parameter if REMOTE=y
	TIV_DIR=`get_config_param $INST_NAME PRIV_TIV_PROBE_DIR`
	if [ ! -d "$TIV_DIR" ]; then
		logerr "$prefix.4013" \
	    `gettext "TIV_PROBE_DIR value not set correctly for instance $INST_NAME"`
		exit 1
	fi

	TIV_BIN=$TIV_DIR/bin
	if [ ! -d "$TIV_BIN" ]; then
		logerr "$prefix.4014" \
	    `gettext "$TIV_BIN doesn't exist; can't start fault monitor for instance $INST_NAME"`
		exit 1
	fi

	TIV_LIB=$TIV_DIR/lib
	if [ ! -d "$TIV_LIB" ]; then
		logerr "$prefix.4015" \
	    `gettext "$TIV_LIB doesn't exist; can't start fault monitor for instance $INST_NAME"`
		exit 1
	fi

	# There must be exactly one <managed-node>.db directory under the
	# probe directory; it identifies the probe oserv's database.
	TIV_DB=`ls -d $TIV_DIR/*.db`
	if [ `echo $TIV_DB | wc -w` -gt 1 ] ; then
		logerr "$prefix.4016" \
	    `gettext "there should be only one .db directory under $TIV_DIR"`
		exit 1
	fi

else

	# Instance is running locally.  We get wping binary from diskset
	# for this logical host, and we set WLOCALHOST to the Managed Node
	# name of the oserv daemon binding to this logical host.

	TIV_BIN=`get_config_param $INST_NAME PRIV_TIV_BIN`
	# required parameter
	if [ ! -d "$TIV_BIN" ]; then
		logerr "$prefix.4017" \
	    `gettext "TIV_BIN value not set correctly for instance $INST_NAME"`
		exit 1
	fi

	TIV_LIB=`get_config_param $INST_NAME PRIV_TIV_LIB`
	# required parameter
	if [ ! -d "$TIV_LIB" ]; then
		logerr "$prefix.4018" \
	    `gettext "TIV_LIB value not set correctly for instance $INST_NAME"`
		exit 1
	fi

fi

# oserv and the CLI commands key off WLOCALHOST: the Managed Node name,
# i.e. the .db directory name with its suffix stripped.
WLOCALHOST=`basename $TIV_DB | sed 's/.db$//'`
export WLOCALHOST

# Tivoli's shared objects come first, then the usual system locations;
# any pre-existing LD_LIBRARY_PATH is preserved at the end.
LD_LIBRARY_PATH="${TIV_LIB}/solaris2:/usr/openwin/lib:/usr/lib:/usr/ucblib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export LD_LIBRARY_PATH

TIV_REMOTE_PROBE=`get_config_param $INST_NAME PROBE_1_REMOTE`
TIV_START=`get_config_param $INST_NAME START`

TIV_PROBE_INTERVAL=`get_config_param $INST_NAME PROBE_1_INTERVAL`
# parser requires this to be set, but doesn't check for negative values.
# Guard against an empty value too: the "-lt" test would fail with a
# syntax error on an empty operand, leaving TIV_PROBE_INTERVAL empty and
# the main loop calling "sleep" with no argument (a busy loop).  Fall
# back to 60 seconds in both cases, matching the TIMEOUT handling below.
if [ -z "$TIV_PROBE_INTERVAL" ]; then
	lognotice "$prefix.2006" \
`gettext "INTERVAL value not set for instance $INST_NAME; using 60 seconds"`
	TIV_PROBE_INTERVAL=60
elif [ "$TIV_PROBE_INTERVAL" -lt 0 ]; then
	lognotice "$prefix.2006" \
`gettext "INTERVAL value is negative for instance $INST_NAME; using 60 seconds"`
	TIV_PROBE_INTERVAL=60
fi

# PROBE_1_TIMEOUT is optional and the parser doesn't reject values <= 0,
# so normalize both cases to a 60-second default here.
TIV_PROBE_TIMEOUT=`get_config_param $INST_NAME PROBE_1_TIMEOUT`
case "$TIV_PROBE_TIMEOUT" in
'')
	lognotice "$prefix.2007" \
`gettext "TIMEOUT value not set for instance $INST_NAME; using 60 seconds"`
	TIV_PROBE_TIMEOUT=60
	;;
esac
# what timeout value is too low?
if [ $TIV_PROBE_TIMEOUT -le 0 ]; then
	lognotice "$prefix.2008" \
`gettext "TIMEOUT is <= zero for instance $INST_NAME; resetting to 60 seconds"`
	TIV_PROBE_TIMEOUT=60
fi

# PROBE_1_TAKEOVER is optional as well; default is to allow takeover.
TIV_TAKEOVER=`get_config_param $INST_NAME PROBE_1_TAKEOVER`
case "$TIV_TAKEOVER" in
'')
	lognotice "$prefix.2009" \
	   `gettext "TAKEOVER value not set for instance $INST_NAME; using 'y'"`
	TIV_TAKEOVER=y
	;;
esac

# The probe cannot work without the wping program; bail out if missing.
if [ ! -x "$TIV_BIN/solaris2/bin/wping" ]; then
	logerr "$prefix.4019" \
	   `gettext "$TIV_BIN/solaris2/bin/wping is not executable"`
	# Exit non-zero so this failure is not mistaken for a clean
	# shutdown (a bare "exit" here returned status 0).
	exit 1
fi


# tcp_close_wait_interval for Solaris is 240 seconds.  Tivoli fault monitor
# should wait for 240 seconds for Solaris releasing the port when restarting
# tivoli instance.  Pre-compute how much extra waiting (beyond the normal
# probe interval) a restart needs.
if [ $TIV_PROBE_INTERVAL -lt 240 ] ; then
	TCP_CLOSE_WAIT_INTERVAL=`expr 240 - $TIV_PROBE_INTERVAL`
else
	TCP_CLOSE_WAIT_INTERVAL=0
fi


# Probe-loop state:
#   TIVGRACE     - 1 after the first remote failure; gives the sibling one
#                  interval to restart before we trigger a takeover
#   TIVRESTART   - 1 after we have already restarted the instance once
#   TIVPROBEFILE - capture file for wping output, parsed to classify errors
#   RETRY        - count of restarts, reported in log messages
TIVGRACE=0
TIVRESTART=0
TIVPROBEFILE=/var/opt/SUNWcluster/run/.tivoli_probe.$INST_NAME
RETRY=0

# Main probe loop: sleep, wping the managed node, and classify any
# failure.  Runs until tivoli_fm_stop removes this process via pmf.
while : ; do

	sleep $TIV_PROBE_INTERVAL

	# Ping the oserv daemon using the managed node name specified
	# in the config file.
	#
	# To wping a remote oserv instance requires that a local oserv is
	# running.  The hadsconf file specifies the bin, lib, and
	# wlocalhost values of this locally running instance.  If there
	# is no oserv that permanently runs locally (such as an oserv
	# instance binding to the physical IP address), then the system
	# administrator must configure the system for local probes only.

	# wping of a remote oserv can fail for three reasons:
	# 	1. remote oserv is dead
	# 	2. local oserv is dead
	#	3. proper TMR roles have not been set for administrator

	# Bound the wping with hatimerun; capture its output for the
	# error classification below.
	hatimerun -t $TIV_PROBE_TIMEOUT $TIV_BIN/solaris2/bin/wping $MANAGED_NODE \
	    > $TIVPROBEFILE 2>&1
	if [ $? -ne 0 ]; then
		fgrep \
	"System Exception: no permission for attempting operation:" \
		    $TIVPROBEFILE > /dev/null 2>&1
		if [ $? -eq 0 ]; then
			# wping failed for reason #3, above; nothing we can
			# fix from here, so just log and keep probing
			logerr "$prefix.4020" \
		`gettext "insufficient authorization to probe $INST_NAME;"`
			logerr "$prefix.4021" \
		`gettext "please set proper TMR roles for administrator"`
			continue
		fi

		# If running locally, restart it.
		if [ "$INST_RUNS_REMOTELY" = "n" ]; then
			fgrep "destination dispatcher unavailable" \
				$TIVPROBEFILE > /dev/null 2>&1
			if [ $? -eq 0 ]; then
				# wping failed because the server oserv died;
				# its own fault monitor restarts it
				lognotice "$prefix.2012" \
		`gettext "oserv on TME server is not running, wait for restart"`
				continue
			fi

			logerr "$prefix.4022" \
			   `gettext "tivoli instance $INST_NAME failed locally"`

			# First failure (or remote probes also active):
			# restart in place.  If both remote and local fault
			# probes are running, let the remote probe trigger
			# the takeover instead of us.
			if [ $TIVRESTART -eq 0 -o \
			     "$TIV_REMOTE_PROBE" = "y" ] ; then
				TIVRESTART=1
				RETRY=`expr $RETRY + 1`
				logerr "$prefix.4023" \
	`gettext "restarting tivoli instance $INST_NAME; restart number $RETRY"`
				if [ "$TIV_TYPE" = "server" ]; then
					# Since tcp_close_wait_interval is set
					# to 240 seconds, server oserv need to
					# wait until port is released.  Sleep
					# 240 seconds is to prevent from 
					# second restart which may trigger a
					# failover.
					tivoli_svc_start "$MASTERED_LOGICAL_HOSTS" ""
					sleep $TCP_CLOSE_WAIT_INTERVAL
				else
					tivoli_svc_start_net "$MASTERED_LOGICAL_HOSTS" ""
				fi
				continue
			fi

			# Second consecutive failure: give up the logical
			# host, or keep restarting if takeover is disabled.
			# NOTE(review): message id 4023 is also used for the
			# restart message above; this one was probably meant
			# to be 4024.
			if [ "$TIV_TAKEOVER" = "y" ]; then
				logerr "$prefix.4023" \
		`gettext "second restart of $INST_NAME; trigger a failover"`
				hactl -g -s tivoli -l $TIV_HOST
			else
				# continue restarting oserv
				TIVRESTART=0
			fi
			continue
		fi


		# If running remotely, find out which oserv failed.
		# If it fails on the local oserv, restart it once.
		fgrep "bad handle specified" $TIVPROBEFILE > /dev/null 2>&1
		if [ $? -eq 0 ]; then
			# wping failed for reason #2, above
			lognotice "$prefix.2013" \
			    `gettext "local oserv $WLOCALHOST not running"`
			if [ $TIVRESTART -eq 0 ] ; then
				TIVRESTART=1
				lognotice "$prefix.2014" \
			       `gettext "restarting local oserv $WLOCALHOST..."`
				$TIV_START $TIV_BIN $TIV_LIB $LOCALHOST \
					$TIV_PORT $TIV_DB
			else
				logerr "$prefix.4025" \
      `gettext "cannot restart local oserv $WLOCALHOST; need user intervention"`
			fi
		else
			# wping failed for reason #1: the remote oserv died
			logerr "$prefix.4026" \
		`gettext "tivoli instance $INST_NAME failed on sibling"`
			# give sibling chance to restart tivoli
			# before doing a takeover
			if [ $TIVGRACE -eq 0 ]; then
				TIVGRACE=1
				if [ "$TIV_TYPE" = "server" ]; then
					sleep $TCP_CLOSE_WAIT_INTERVAL
				fi
				continue
			fi
			if [ "$TIV_TAKEOVER" = "y" ]; then
				hactl -t -s tivoli -l $TIV_HOST
			fi
		fi
	else
		# Successful probe: clear all failure state.
		TIVGRACE=0
		TIVRESTART=0
		RETRY=0
	fi

done
