#! /usr/bin/ksh
#
# ident	"@(#)hasap_probe.shi	1.10	98/07/24 SMI"
#
# Copyright (c) 1997 by Sun Microsystems, Inc.
# All rights reserved.
#

# Usage: sap_probe <instance name>
# Started up in the background via pmfd in sap_fm_start during reconfiguration.

#
# Add the path to framework binaries, since the probe is not called in the
# context of the methods
#

# Instance to probe: first (and only) command-line argument.
INST_NAME=$1

#
#	Copyright 11/18/96 Sun Microsystems, Inc.  All Rights Reserved.
#
#
#pragma ident "@(#)ds_boiler	1.1 97/06/12 SMI"
#
#ident "@(#)ds_boiler		1.7	96/11/18 SMI"
#
# Common boilerplate for HA Internet Pro data services: sets up the
# logging variables, extends PATH with the service's program
# directories and works out the local and remote host names.
#


# Script name/location and syslog settings.
ARGV0=$(basename $0)
LOGGER=logger
HA_SLOGFACILITY=$(haget -f syslog_facility)
HA_SLOGTAG=hadf
prog_path=$(dirname $0)

# Pull in the common ha-services utilities.
. ds_utilities

# Put the service-specific clust_progs directory on PATH.  When we are
# already running out of a clust_progs directory, only that directory
# is added.
case "$prog_path" in
*/clust_progs*)
	PATH=${prog_path}:${PATH}
	;;
*)
	PATH=${prog_path}:${prog_path}/../clust_progs:${PATH}
	;;
esac

# Same again for the service-specific fault_progs directory.
case "$prog_path" in
*/fault_progs*)
    PATH=${prog_path}:${PATH}
    ;;
*)
    PATH=${prog_path}:${prog_path}/../fault_progs:${PATH}
    ;;
esac

#
# Host names of the local node and of every other node, for use by
# subsequent hactl commands.
#
REMOTEHOSTS=
LOCALHOST=$(uname -n) || {
	logerr $(gettext "Cannot obtain name of local host")
	exit 1
}

# Every physical host that is not the local one is a remote host.
PHYS_HOSTS="$(haget -f all_physical_hosts)"
for i in $PHYS_HOSTS; do
	[ "$i" = "$LOCALHOST" ] && continue
	REMOTEHOSTS="$REMOTEHOSTS $i"
done
#! /bin/sh 
#
# pragma ident "@(#)do_service	1.5 98/01/14 SMI"
#
#

# Positional arguments as seen by the do_service boilerplate, plus the
# prefix used for the syslog message tags.
ARG_MASTERED=$1
ARG_NOT_MASTERED=$2
SYSLOG_PREFIX="SUNWcluster.ha.sap"

# Turn the comma-separated arguments into blank-separated word lists.
MASTERED="$(echo $ARG_MASTERED | tr ',' ' ')"
NOT_MASTERED="$(echo $ARG_NOT_MASTERED | tr ',' ' ')"

HASAP_CONFIG_FILE=/etc/opt/SUNWscsap/hadsconf

#
# The configuration file must exist before the parser is called on it.
#
[ -f $HASAP_CONFIG_FILE ] || {
	logerr $(gettext "$HASAP_CONFIG_FILE doesn't exist")
	exit 1
}

. ds_utilities.1.3

# source_env_file logs its own error message when it fails, so there
# is no need to log another one here; just exit.
source_env_file $HASAP_CONFIG_FILE || exit 1

#
# Seconds to wait for SIGTERM to stop a process.
# This should be in the config file.
#
STOP_TIMEOUT=15
#
# bundle_do_svc <action>
#
# Performs <action> for one SAP instance; it is called once per instance.
# The instance is described by the _INST_* variables (name, base
# directory, logical host, port, retry and probe settings), which are
# expected to be set by the caller.
#
# Actions:
#	start			start the instance under pmfadm
#	stop | abort		take the instance out of the pmf queue, run
#				its stop script, kill whatever is left
#	fm_start		start the probe program under pmfadm
#	fm_stop			stop the probe program
#	fm_check_this_host_ok	one-shot connection check of the instance
# Any other action is silently ignored.
#
# NOTE: this function never returns to its caller.  Every path ends in
# "exit": 0 on success or when there is nothing to do, 1 on failure.
#
bundle_do_svc ()
{
	action=$1

	SAP_START=${_INST_BASE_DIR}/hasap_start_net
	SAP_STOP=${_INST_BASE_DIR}/hasap_stop_net

	# Every syslog message is tagged <prefix>.<message number>.
	prefix="$SYSLOG_PREFIX.$action"

	case $action in

	'start')

		# First do some error checking.

		if [ ! -x "$SAP_START" ]; then
			logerr "$prefix.4000" \
				"$(gettext "<$SAP_START> is not executable.")"
			exit 1
		fi

		# The process monitor facility calls the start program,
		# passing to it the instance-specific information it needs.
		# Note that we're using pmf to start/stop, but not to probe.

		# Quoted so that an unset _INST_RETRY takes the retry
		# branch instead of being a test syntax error.
		if [ "${_INST_RETRY}" = "n" ]; then
			pmfadm -c ${_INST_NAME} $SAP_START
		else
			pmfadm -c ${_INST_NAME}          \
			       -n ${_INST_RETRY_TIMES}    \
			       -t ${_INST_RETRY_INTERVAL} \
			       -a ${_INST_PROBE_CALLBACK_1} \
			       $SAP_START
		fi

		if [ $? -ne 0 ]; then
			logerr "$prefix.4001" \
				"$(gettext "pmfadm failed to start SAP instance ${_INST_NAME}")"
			exit 1
		else
			lognotice "$prefix.2000" \
				"$(gettext "Started SAP instance ${_INST_NAME}")"

		fi
	;;

	'stop' | 'abort')

		# delete from queue, but don't kill
		pmfadm -s ${_INST_NAME}
		if [ $? -ne 0 ]; then
			logerr "$prefix.4002" \
				"$(gettext "pmfadm failed to delete ${_INST_NAME} from queue")"
			exit 1
		fi

		# use the instance's own stop script to stop SAP
		if [ ! -x "$SAP_STOP" ]; then
			logerr "$prefix.4003" \
				"$(gettext "<$SAP_STOP> is not executable")"
			exit 1
		fi

		$SAP_STOP

		# A failing stop script is logged but is not fatal: the
		# leftover processes are dealt with below.
		if [ $? -ne 0 ]; then
			logerr "$prefix.4004" \
				"$(gettext "$SAP_STOP execution failed.")"
		else
			lognotice "$prefix.2001" \
				"$(gettext "Stopped SAP instance ${_INST_NAME}")"
		fi

		ha_svc_not_running ${_INST_NAME}

		if [ $? -ne 0 ]; then
			# Now kill any processes left out
			pmfadm -s ${_INST_NAME} KILL
			# Save the status right away: $? is overwritten
			# by the test below, so expanding it inside the
			# message would always report 0.
			kill_rc=$?
			if [ $kill_rc -ne 0 ]; then
				logerr "$prefix.4009" \
					"$(gettext "pmfadm failed to kill ${_INST_NAME}'s process and its sub-processes (if any) : pmfadm returned $kill_rc")"
				exit 1
			fi
		fi

	;;

	'fm_start')

		# XXX
		need_to_run_probe ${_INST_LOGICAL_HOST} ${LOCALHOST}

		if [ $? -ne 0 ]; then
			exit 0
		fi

		# pmf starts the probe program, which then runs until
		# fm_stop kills it.
		# Don't start probe if diskset is in maintenance mode.

		# If this SAP instance's diskset is in maint mode, exit now.
		MAINT=`haget -f is_maint -h ${_INST_LOGICAL_HOST}`
		if [ "$MAINT" = "1" ]; then
			exit 0
		fi

		pmfadm -c ${_INST_NAME}.probe ${_INST_PROBE_PROG_1} \
		    ${_INST_NAME}

		if [ $? -ne 0 ]; then
			logerr "$prefix.4005" \
				"$(gettext "pmfadm failed to start SAP probe for instance ${_INST_NAME}")"
			exit 1
		else
			lognotice "$prefix.2002" \
				"$(gettext "Started SAP probe instance ${_INST_NAME}.probe")"
		fi
	;;

	'fm_stop')

		# If probe not running, do nothing
		ha_svc_not_running ${_INST_NAME}.probe && exit 0

		# pmf stops the probe: TERM first (waiting up to
		# STOP_TIMEOUT seconds), KILL if that fails.
		pmfadm -s ${_INST_NAME}.probe -w ${STOP_TIMEOUT} TERM || \
			pmfadm -s ${_INST_NAME}.probe KILL
		if [ $? -ne 0 ]; then
			logerr "$prefix.4006" \
				"$(gettext "pmfadm failed to stop SAP probe instance ${_INST_NAME}.probe")"
			exit 1
		else
			lognotice "$prefix.2003" \
				"$(gettext "Stopped SAP probe instance ${_INST_NAME}.probe")"
		fi
	;;

	'fm_check_this_host_ok')

		# If the HA-SAP logical host for this instance
		# is not currently mastered by this machine, exit now.

		is_member "${_INST_LOGICAL_HOST}" "$MASTERED"
		if [ $? -ne 0 ]; then
			exit 0
		fi

		# Otherwise, probe the SAP service now by connecting to
		# its port.  If dead, request will time out in
		# ${_INST_PROBE_TIMEOUT_1} secs.

		SAPPROBEFILE=/tmp/${_INST_NAME}.probe.$$

		hatimerun -t ${_INST_PROBE_TIMEOUT_1} \
		    /usr/bin/telnet ${_INST_LOGICAL_HOST} \
		    ${_INST_PORT} <<EOF > "$SAPPROBEFILE" 2>&1

EOF
		probe_rc=$?
		if [ $probe_rc -eq 99 ]; then
			# timeout
			rm -f "$SAPPROBEFILE"
			logerr "$prefix.4007" \
				"$(gettext "This server is supposed to be providing SAP service for instance <${_INST_NAME}>, but request timed out")"
			exit 1
		fi

		# Look for a refused connection in the captured output,
		# then remove the capture file on every path so that
		# repeated checks do not pile up files in /tmp.
		grep refused "$SAPPROBEFILE" > /dev/null 2>&1
		refused_rc=$?
		rm -f "$SAPPROBEFILE"
		if [ $refused_rc -eq 0 ]; then
			logerr "$prefix.4008" \
				"$(gettext "This server is supposed to be providing SAP service for instance <${_INST_NAME}>, but isn't")"
			exit 1
		fi
	;;

	esac

	exit 0
}
#include_boiler


# Probe-specific setup: script identity, syslog facility, SAP
# configuration file and local host name.
ARGV0=`basename $0`
PATH0=`dirname $0`
SYSLOG=`haget -f syslog_facility`
SAPCONF=/etc/opt/SUNWscsap/hadsconf
MYNAME=`uname -n`
HA_DEBUG=1

#
# take this out before fcs
#
if [ $HA_DEBUG -ne 0 ];then
  logger -p ${SYSLOG}.info \
	 "$0" "$1" "$2"
fi

prog_path=`dirname $0`


# source in ha-services common utilities
. ds_utilities.1.3

source_env_file $SAPCONF

if [ $? -ne 0 ]; then
	# source_env_file logs an error message if it fails.  Without
	# the configuration none of the variables below would be set,
	# so stop here rather than probe with empty values.
	exit 1
fi

#
# SAP variables
#
NOTAVAIL=${_INST_1_PRIV_NOT_AVAIL}
SAPSID=${_INST_1_PRIV_YOUR_SAP_SID}
CI_INSTANCE_ID=${_INST_1_PRIV_CI_INSTANCE_ID}
CI_STARTSAP_RETRY_CNT=${_INST_1_PRIV_CI_STARTSAP_RETRY_CNT}
CI_STARTSAP_RETRY_INTERVAL=${_INST_1_PRIV_CI_STARTSAP_RETRY_INTERVAL}
CI=${_INST_1_PRIV_CI_LOGICAL_HOSTNAME}
DB=${_INST_1_PRIV_DB_LOGICAL_HOSTNAME}
NFS=${_INST_1_PRIV_NFS_LOGICAL_HOSTNAME}
SAPADM=${_INST_1_PRIV_SAP_ADMIN_LOGIN_NAME}
DBADM=${_INST_1_PRIV_DB_ADMIN_LOGIN_NAME}
SAPMNT1=${_INST_1_PRIV_SAP_MOUNT_POINT_A}
SAPMNT2=${_INST_1_PRIV_SAP_MOUNT_POINT_B}
AUTO_NFSMOUNT=${_INST_1_PRIV_AUTO_NFS_MOUNT}
DS_STATE=${_INST_1_DS_STATE}


#
# for use by subsequent hactl command, get hostnames of local and sibling hosts
#
MYNAME=`uname -n`

# Physical host that currently masters the CI logical host.
ci_physhost=`haget -f master -h "$CI"`

if [ $? -ne 0 ]; then
	# The failing command is the master lookup, so say so.
	logerr "$(gettext "Cannot obtain name of host mastering $CI")"
	exit 1
fi

if [ -n "$ci_physhost" ] && [ "$ci_physhost" != "$MYNAME" ]; then
	#
	# I don't master CI, no need to probe
	#
	lognotice "$(gettext "CI not mastered on this host, not starting probe")"
	exit 0
fi

# compute hostname of sibling
PHYS_HOSTS="`/opt/SUNWhadf/bin/haget -f all_physical_hosts`"
# $PHYS_HOSTS is deliberately unquoted: it is a word list to be counted.
if [ "$(count_items $PHYS_HOSTS)" -lt 2 ]; then
	logerr "$(gettext "Cannot compute hostname of sibling")"
	exit 1
fi
for i in $PHYS_HOSTS; do
	if [ "$i" != "$MYNAME" ]; then
		REMOTEHOST=$i
	fi
done


if [ -z "$INST_NAME" ]; then
	logerr "$(gettext "Usage: $ARGV0 <instance>")"
	exit 1
fi

MASTERED_LOGICAL_HOSTS="`haget -f mastered`"

SAP_HOST=`get_config_param $INST_NAME LOGICAL_HOST`
# parser requires this to be set

SAP_PROBE_INTERVAL=`get_config_param $INST_NAME PROBE_1_INTERVAL`
# The parser is supposed to require this, but an empty value would be a
# test syntax error below and would later reach "sleep" with no
# operand, so fall back to a default here as well.
if [ -z "$SAP_PROBE_INTERVAL" ]; then
	lognotice \
"$(gettext "INTERVAL value not set for instance $INST_NAME; using 60 seconds")"
	SAP_PROBE_INTERVAL=60
fi
# the parser doesn't check for negative values
if [ "$SAP_PROBE_INTERVAL" -lt 0 ]; then
	lognotice \
"$(gettext "INTERVAL value is negative for instance $INST_NAME; using 60 seconds")"
	SAP_PROBE_INTERVAL=60
fi

SAP_PROBE_TIMEOUT=`get_config_param $INST_NAME PROBE_1_TIMEOUT`
# optional parameter, parser doesn't check for <= 0 values
if [ -z "$SAP_PROBE_TIMEOUT" ]; then
	lognotice \
"$(gettext "TIMEOUT value not set for instance $INST_NAME; using 60 seconds")"
	SAP_PROBE_TIMEOUT=60
fi
# what timeout value is too low?
if [ "$SAP_PROBE_TIMEOUT" -le 0 ]; then
	lognotice \
"$(gettext "TIMEOUT is <= zero for instance $INST_NAME; resetting to 60 seconds")"
	SAP_PROBE_TIMEOUT=60
fi

SAP_TAKEOVER=`get_config_param $INST_NAME PROBE_1_TAKEOVER`
# optional parameter
if [ -z "$SAP_TAKEOVER" ]; then
	lognotice \
	   "$(gettext "TAKEOVER value not set for instance $INST_NAME; using 'y'")"
	SAP_TAKEOVER=y
fi

# LOCAL says whether the instance's logical host is mastered here.
LOCAL=no
is_member "$SAP_HOST" "$MASTERED_LOGICAL_HOSTS"
if [ $? -eq 0 ]; then
	# SAP_HOST is running locally
	LOCAL=yes
fi


# Per-probe call counters, used to log an "OK" message only on every
# 10th successful probe.
cicnt=0
dbcnt=0
proccnt=0

# ######################################################################
#
# Probing starts here:
#
# Uncomment doprobe to start the probing
#
# ######################################################################

#
# procprobe [grace]
#
# Runs "sapmon proc" from the probe's own directory (PATH0).
#
# Returns 0 when the check passes.  When it fails:
#   - in grace mode ("grace" is the only argument) the sapmon exit
#     status is returned to the caller and nothing else is done;
#   - otherwise the failed processes are logged, a giveup of logical
#     host ${CI} is requested through hactl and the probe exits 1.
# Exits 1 as well when sapmon is not executable.
#
# Sets the globals grace_mode, probe_status and proccnt.
#
procprobe()
{
	grace_mode=0
	if [ $# -eq 1 ]; then
		if [ ${1} = "grace" ]; then
			grace_mode=1
		fi
	fi

	# Nothing can be probed without the monitor program.
	[ -x ${PATH0}/sapmon ] || {
		logerr $(gettext "${PATH0}/sapmon executable not found!")
		exit 1
	}

	${PATH0}/sapmon proc > /dev/null 2>&1
	probe_status=$?

	if [ $probe_status -ne 0 ]; then

		#
		# Grace mode: no takeover, only tell the caller what
		# is failing.
		#
		if [ ${grace_mode} -eq 1 ]; then
			return $probe_status
		fi

		#
		# The "sapmon proc" status is treated as a bit mask;
		# collect the name belonging to every bit that is set.
		#
		procerr=
		for i in 1 2 4; do
			bitstat=$(( $probe_status & $i ))
			[ $bitstat -gt 0 ] || continue
			case $i in
			1)	procerr=$procerr"[ms.sap]" ;;
			2)	procerr=$procerr"[dw.sap]" ;;
			4)	procerr=$procerr"[dispatcher]" ;;
			esac
		done

		logerr $(gettext "SAP process failure in instance ${SAPSID} has been detected!")
		logerr $(gettext "${procerr} process died!")

		hactl -g -s sap  -l ${CI}

		logerr $(gettext "HA-SAP has issued a giveup request! Probe exiting")

		exit 1
	fi

	#
	# Success: log an OK message on the first of every 10 probes.
	#
	if [ $proccnt -eq 0 ]; then
		lognotice $(gettext "Probing SAP processes $SAPSID OK.")
	fi
	proccnt=$(( $proccnt + 1 ))
	if [ $proccnt -ge 10 ]; then
		proccnt=0
	fi

	return 0
}

#
# diaprobe [grace]
#
# Runs "sapmon dia" from the probe's own directory (PATH0).
#
# Returns 0 when the check passes.  When it fails:
#   - in grace mode ("grace" is the only argument) 1 is returned to the
#     caller and nothing else is done;
#   - otherwise the failure is logged, a giveup of logical host ${CI}
#     is requested through hactl and the probe exits 1.
# Exits 1 as well when sapmon is not executable.
#
# Sets the globals grace_mode, probe_status and cicnt.
#
diaprobe() {
	grace_mode=0
	if [ $# -eq 1 ]; then
		if [ ${1} = "grace" ]; then
			grace_mode=1
		fi
	fi

	# Nothing can be probed without the monitor program.
	[ -x ${PATH0}/sapmon ] || {
		logerr $(gettext "${PATH0}/sapmon executable not found!")
		exit 1
	}

	${PATH0}/sapmon dia > /dev/null 2>&1
	probe_status=$?

	if [ $probe_status -ne 0 ]; then
		#
		# Grace mode: no takeover, only tell the caller that
		# this test is still failing.
		#
		[ ${grace_mode} -eq 1 ] && return 1

		#
		# The dialog check failed for real.
		#
		logerr $(gettext "SAP failure of $SAPSID has been detected!")
		logerr $(gettext "SAP dialog died!")

		hactl -g -s sap  -l ${CI}

		logerr $(gettext "HA-SAP has issued a giveup request! Probe exiting")
		exit 1
	fi

	#
	# Success: log an OK message on the first of every 10 probes.
	#
	if [ $cicnt -eq 0 ]; then
		lognotice $(gettext "Probing SAP instance $SAPSID OK.")
	fi
	cicnt=$(( $cicnt + 1 ))
	if [ $cicnt -ge 10 ]; then
		cicnt=0
	fi

	return 0
}

#
# dbprobe [grace]
#
# Runs "sapmon db" from the probe's own directory (PATH0).
#
# Returns 0 when the check passes.  When it fails:
#   - in grace mode ("grace" is the only argument) 1 is returned to the
#     caller and nothing else is done;
#   - otherwise the failure is logged, a giveup of logical host ${CI}
#     is requested through hactl and the probe exits 1.
# Exits 1 as well when sapmon is not executable, exactly as procprobe
# and diaprobe do; a missing monitor must not be reported as a healthy
# database.
#
# Sets the globals grace_mode, probe_status and dbcnt.
#
dbprobe() {
	if [ $# -eq 1 ] && [ "${1}" = "grace" ]; then
		grace_mode=1
	else
		grace_mode=0
	fi

	if [ ! -x "${PATH0}/sapmon" ]; then
		logerr "$(gettext "${PATH0}/sapmon executable not found!")"
		exit 1
	fi

	"${PATH0}/sapmon" db > /dev/null 2>&1
	probe_status=$?
	if [ $probe_status -ne 0 ]; then
		#
		# if we are in grace mode, do not issue takeover, instead
		# inform caller that this test is still failing
		#
		if [ ${grace_mode} -eq 1 ]; then
			return 1
		fi

		#
		# some problem encountered in the database
		#
		logerr "$(gettext "SAP failure of $SAPSID has been detected!")"
		logerr "$(gettext "SAP DB failed probing!")"

		hactl -g -s sap -l "${CI}"

		logerr "$(gettext "HA-SAP has issued a giveup request! Probe exiting")"
		exit 1
	fi

	#
	# put out message every 10th probe
	#
	if [ "$dbcnt" -eq 0 ]; then
		lognotice "$(gettext "Probing SAP DBMS $SAPSID OK.")"
	fi
	dbcnt=$(( $dbcnt + 1 ))
	if [ "$dbcnt" -ge 10 ]; then
		dbcnt=0
	fi

	#
	# return okay status
	#
	return 0
}


#
# Grace mode: test whether START_NET has completed.  If it has, drop
# into normal probing.  If it has not, keep testing until the START_NET
# retries are exhausted, then drop into normal probing anyway and let it
# do the takeover.
#
# Grace mode is left early only once all tests have passed.
#
lognotice "$(gettext "Starting probe in grace mode")"

# Grace mode: up to CI_STARTSAP_RETRY_CNT passes, one every
# CI_STARTSAP_RETRY_INTERVAL seconds, with the probes run in "grace"
# mode so that a failure is only reported back, never acted upon.
cur_retry=1
while [ "${cur_retry}" -le "${CI_STARTSAP_RETRY_CNT}" ]; do

	lognotice "$(gettext "Grace mode (retry# ${cur_retry}/max ${CI_STARTSAP_RETRY_CNT})")"

	# Start every pass from "not passed".  The probes only run when
	# the CI is on this host; without these defaults the test below
	# would expand to "[ -eq 0 ]", a syntax error, on every pass.
	procprobe_rc=1
	diaprobe_rc=1

	if [ "$ci_physhost" = "$LOCALHOST" ]; then
		procprobe grace
		procprobe_rc=$?

		diaprobe grace
		diaprobe_rc=$?

		# we'll let HA-DBMS monitor this
		#dbprobe grace
		#dbprobe_rc=$?
	fi

	if [ "${procprobe_rc}" -eq 0 ] &&
	   [ "${diaprobe_rc}" -eq 0 ]; then
	   #[ ${dbprobe_rc} -eq 0 ]; then
		lognotice "$(gettext "Passed all probes, exiting grace mode into normal mode.")"
		break
	else
		#lognotice `gettext "SAP startup needs more time, continue grace mode (procprobe=${procprobe_rc},diaprobe=${diaprobe_rc},dbprobe=${dbprobe_rc})"`
		lognotice "$(gettext "SAP startup needs more time, continue grace mode (procprobe=${procprobe_rc},diaprobe=${diaprobe_rc})")"
	fi

	cur_retry=$(( $cur_retry + 1 ))
	sleep "${CI_STARTSAP_RETRY_INTERVAL}"
done


#
# Normal SAP probing
#
lognotice "$(gettext "Starting normal probe")"

# Runs until the probe is killed or one of the probe functions exits
# after requesting a giveup.
while : ; do
	#
	# Now do the CI probing if there is a CI running
	#
	sleep "$SAP_PROBE_INTERVAL"
	if [ "$ci_physhost" = "$LOCALHOST" ]; then
		#
		# check the SAP processes
		#
		procprobe

		#
		# probe SAP itself
		#
		diaprobe
	fi

	#
	# Now do the DB probing if there is DB on this host
	#
	# DL 7/7/98 we'll let HA-DBMS monitor this
	#if [ "$db_physhost" = "$LOCALHOST" ]; then
	#	#
	#	# probe SAP DB using sql
	#	#
	#	dbprobe
	#fi
done
