#! /bin/sh
#
# ident "@(#)nsldap_probe.shi 1.20     00/06/26 SMI"
#
#	Copyright 1996-1997 Sun Microsystems, Inc.  All Rights Reserved.
#

#
# Usage: nsldap_probe <instance_name> <local_flag>
#
# This nsldap probe is called once per instance; it does significant work
# only in the local probe case. When it detects a failure, it tries
# several local restarts before giving up and attempting a failover
# (if allowed for this instance). The local restart mechanism uses a
# sliding time window algorithm, so that a failover can happen only when
# the number of retries has been exhausted within that window.
# 

#
# Add the path to framework binaries, since the probe is not called in the
# context of the methods
#
HA_BIN=/opt/SUNWcluster/bin

# Tag prefix for syslog messages. The logerr/lognotice/logwarn calls in the
# probe pass "$prefix.<message number>" as their first argument.
SYSLOG_PREFIX="SUNWcluster.ha.nsldap"
prefix=$SYSLOG_PREFIX.probe

# The framework directory and the system directories are appended, so
# anything already on the inherited PATH takes precedence.
PATH=${PATH}:$HA_BIN:/usr/bin:/usr/sbin
export PATH


# On signal 1, 2 or 15 run cleanup_exit (defined further down in this
# file). The trap command string is only evaluated when a signal is
# delivered, so the function does not need to exist yet at this point.
trap "cleanup_exit 1 " 1 2 15

#
#pragma ident "@(#)ds_boiler	1.3	98/09/15 SMI"
#
# common boiler for HA data services
#
#


# Program name and directory of this script, derived from $0.
# LOGGER, HA_SLOGFACILITY and HA_SLOGTAG are not referenced again in the
# visible part of this file; presumably they are consumed by the logging
# helpers in hads_utilities -- TODO confirm.
ARGV0=`basename $0`
LOGGER=logger
HA_SLOGFACILITY=`haget -f syslog_facility`
HA_SLOGTAG=hadf
prog_path=`dirname $0`

# source in ha-services common utilities
# (no path is given, so the shell locates hads_utilities through PATH)
. hads_utilities

# add the ha-service directory to the path
# (prepended, so programs next to this script win over same-named ones
# elsewhere on PATH)
PATH=${prog_path}:${PATH}

#
# for use by subsequent hactl command, get the hostname of the local host
#
LOCALHOST=`uname -n`

#
#	Copyright 06/24/99 Sun Microsystems, Inc.  All Rights Reserved.
#
#ident "@(#)do_service		1.13	99/06/24 SMI"
#
# do_service 
# Standard-name file that defines the standard-name routine bundle_do_svc(), 
# which implements the HA framework-defined methods for this data service.
# The _boiler file in this directory sources this file when the data
# service method scripts execute.
#


# Base tag for syslog messages; bundle_do_svc() appends ".<action>" to it
# to build the per-message prefix.
SYSLOG_PREFIX="SUNWcluster.ha.nsldap"

# Read in and interpret the data service environment configuration file.
# Give up immediately if that fails.
source_env_file /etc/opt/SUNWscnsl/hadsconf || exit 1
# Passed as the -w argument of "pmfadm -s ... TERM" when fm_stop stops the
# probe (presumably a wait time in seconds -- TODO confirm); if that
# command fails, fm_stop falls back to KILL.
STOP_PROBE_TIMEOUT=15

#
# bundle_do_svc <action>
#
# Implements the HA framework methods for one nsldap instance; called
# for each instance.
#
# Arguments:
#	$1 - action: start, start_net, stop, stop_net, abort, abort_net,
#	     fm_init, fm_start, fm_stop or fm_check_this_host_ok.
#	     Any other value matches no case arm and does nothing.
# Globals read:
#	_INST_NAME, SYSLOG_PREFIX, STOP_PROBE_TIMEOUT, and for fm_start
#	MASTERED_LOGICAL_HOSTS and _INST_PROBE_PROG_1
# Globals written:
#	action, prefix, LOGICAL_HOST, BASE_DIR, LDAP_START, LDAP_STOP
#	(plus maint, local and method_timeout in some actions)
# Exit status:
#	Never returns to the caller: exits 0 on success, 1 on failure.
#
bundle_do_svc ()
{
	action=$1
	prefix="$SYSLOG_PREFIX.$action"

	LOGICAL_HOST=`get_config_param $_INST_NAME LOGICAL_HOST`
	BASE_DIR=`get_config_param $_INST_NAME BASE_DIR`

	#
	# Use the start-slapd/stop-slapd scripts when the instance directory
	# has them, otherwise fall back to the plain start/stop scripts.
	#
	if [ -f $BASE_DIR/start-slapd ]; then
		LDAP_START=${BASE_DIR}/start-slapd
		LDAP_STOP=${BASE_DIR}/stop-slapd
	else
		LDAP_START=${BASE_DIR}/start
		LDAP_STOP=${BASE_DIR}/stop
	fi

	case $action in
	'start')

		if [ ! -x $LDAP_START ]; then
			# Report the script that was actually selected above,
			# which is not necessarily $BASE_DIR/start.
			logerr "$prefix.4040" `gettext "Unable to execute $LDAP_START"`
			exit 1
		fi

		#
		# launch the daemon using the process monitor
		# (only if it is not already running)
		#
		ha_svc_not_running $_INST_NAME
		if [ $? -eq 0 ]; then
			pmfadm -c $_INST_NAME $LDAP_START
			if [ $? -ne 0 ]; then
				logerr "$prefix.4000" `gettext "failed to start Netscape Directory Server instance $_INST_NAME"`
				exit 1
			else
				lognotice "$prefix.2000" `gettext "Started Netscape Directory Server instance $_INST_NAME"`
			fi
		fi
		;;

	'start_net')
		#
		# START_NET is always called after START, so at this point
		# the daemon should be running (we're doing the actual start
		# at START time). The bundles framework, as part of the idempotency
		# check, will notice that the daemon for this instance is running,
		# and will not call bundle_do_svc(), so this code should never
		# be executed.
		#
		lognotice "$prefix.2010" `gettext "start_net method called for instance $_INST_NAME"`
		ha_svc_not_running $_INST_NAME
		if [ $? -eq 0 ]; then
			logerr "$prefix.3002" `gettext "start_net: instance $_INST_NAME is down"`
			exit 1
		fi
		;;

	'stop' )
		#
		# Nothing to do unless the instance is running.
		#
		ha_svc_not_running $_INST_NAME
		if [ $? -ne 0 ]; then
			# Take the instance out of the process monitor first,
			# then run the stop script.
			pmfadm -s $_INST_NAME
			if [ $? -ne 0 ]; then
				logerr "$prefix.4001" `gettext "pmfadm failed to delete  $_INST_NAME from queue"`
				exit 1
			fi
			if [ ! -x $LDAP_STOP ]; then
				logerr "$prefix.4002" `gettext "$LDAP_STOP is not executable"`
				exit 1
			fi

			# Run the stop script under the timeout registered for
			# the stop method.
			method_timeout=`hareg -q nsldap -T stop`
			hatimerun -t $method_timeout $LDAP_STOP
			if [ $? -ne 0 ]; then
				# NOTE(review): the message blames pmfadm, but the
				# command that failed here is the stop script run
				# under hatimerun.
				logerr "$prefix.4003" `gettext "pmfadm failed to kill ${_INST_NAME}'s process"`
				exit 1
			else
				lognotice "$prefix.2030" `gettext "Stopped LDAP instance ${_INST_NAME}"`
			fi
		fi
		;;

	'stop_net' )
		#
		# Nothing to do.
		#
		;;

	'abort' | 'abort_net')
		pmfadm -s $_INST_NAME KILL
		if [ $? -ne 0 ]; then
			logerr "$prefix.4024" `gettext "failed to abort/net instance $_INST_NAME"`
			exit 1
		fi
		;;

	'fm_init')
		lognotice "$prefix.2005" `gettext "fm_init not implemented "`
		;;

	'fm_start')
		#
		# Check if we are about to probe a logical host we master (local
		# probe),  or one which another host masters (remote probe), or
		# one in maintenance mode (don't probe)
		#
		maint=`haget -f is_maint -h $LOGICAL_HOST`
		if [ $? -ne 0 ]; then
			logerr "$prefix.4030" `gettext "haget(1M) failed; could be an error in logical host name $LOGICAL_HOST"`
			exit 1
		elif  [ $maint -eq 0 ]; then
			if is_member "$LOGICAL_HOST" "$MASTERED_LOGICAL_HOSTS" ; then
				local=yes
			else
				local=no
			fi
			#
			# Launch a probe using the process monitor.
			# We are using the process monitor just to start and tag
			# the probe, without the retry feature of the process monitor.
			#
			pmfadm -c ${_INST_NAME}.probe \
				/bin/sh -c "$_INST_PROBE_PROG_1 $_INST_NAME $local >/dev/null 2>&1"
			if [ $? -ne 0 ]; then
				logerr "$prefix.4020" `gettext "failed to start probe $_INST_PROBE_PROG_1 for instance $_INST_NAME"`
				exit 1
			fi
		fi
		;;

	'fm_stop')
		#
		# Kill the probe that is associated with this instance.
		#
		# If probe not running, do nothing
		ha_svc_not_running ${_INST_NAME}.probe && exit 0

		# PMF kills nsldap_probe: TERM first, KILL if that fails
		pmfadm -s ${_INST_NAME}.probe -w ${STOP_PROBE_TIMEOUT} TERM || pmfadm -s ${_INST_NAME}.probe KILL
		if [ $? -ne 0 ]; then
			logerr "$prefix.4006" `gettext "pmfadm failed to stop LDAP probe instance ${_INST_NAME}.probe"`
			exit 1
		else
			lognotice "$prefix.2020" `gettext "Stopped probe for instance $_INST_NAME"`
		fi
		;;

	'fm_check_this_host_ok')
		#
		# Yap, we are OK
		#
		lognotice "$prefix.2021" `gettext "This host is OK for instance $_INST_NAME"`
		;;

	esac

	exit 0
}

#include_boiler

# XXX
#set -x

# Command line arguments (see Usage at the top of this file):
#   $1 - name of the instance to probe
#   $2 - local flag; compared against "yes" below to select the local probe
INST_NAME=$1
LOCAL=$2

# Used for error message logging
set_inst_name ${INST_NAME}

#
# cleanup_exit [status]
#
# Signal handler for the probe: the trap near the top of this file runs
# it on signal 1, 2 or 15. It terminates the probe.
#
# Arguments:
#	$1 - exit status to use (optional, defaults to 1)
#
# The halogmsg call below is disabled. As written it would have logged a
# NOTICE event for the instance with state UNKNOWN and the message
# "Fault Monitor Exit".
#
cleanup_exit()
{
	# send an event when the fault monitor exits
	# on signal 1 2 15
	#$CT_PROGS/halogmsg \
	 #-p NOTICE \
	 #-n "$INST_NAME" \
	 #-t DS_INSTANCE \
	 #-c UNKNOWN \
	 #-s "Unknown"\
	 #-h "$LHOST" \
	 #-d "nsldap" \
	 #-m `gettext "Fault Monitor Exit"` \
	 #-N

	# Honor the status handed in by the caller instead of ignoring it.
	exit ${1:-1}
}


#
# Restart an instance with the process monitor
#
# do_retry
#
# Only a local probe (LOCAL = "yes") ever restarts anything; for any
# other value of LOCAL this function does nothing.
#
# Globals read:
#	LOCAL, INST_NAME, retries, prefix
# Globals written:
#	candorestart, MASTERED_LOGICAL_HOSTS, NOT_MASTERED_LOGICAL_HOSTS
#
do_retry ()
{
	candorestart=0
	if [ "$LOCAL" = "yes" ]; then
		# Check to see if the server process is there or not
		MASTERED_LOGICAL_HOSTS="`haget -f mastered`"
		NOT_MASTERED_LOGICAL_HOSTS="`haget -f not_mastered`"
		pmfadm -q $INST_NAME > /dev/null 2>&1
		if [ $? -eq 0 ]; then
			# Hmmm... the server process is actually up
			# It must be stuck somewhere. Do not kill if it is the
			# very first failure of the probe. But if there have been
			# some probe failures recently, Kill and then restart afresh

			if [ $retries -gt 1 ]; then
				lognotice "$prefix.3010" `gettext "Terminating instance $INST_NAME before restart"`
				# (an earlier version stopped it directly with
				# pmfadm -s ... TERM / KILL)
				nsldap_svc_stop "${NOT_MASTERED_LOGICAL_HOSTS}" "${MASTERED_LOGICAL_HOSTS}" ""
				candorestart=1
			fi
		else
			# The process monitor does not know the instance:
			# restart it right away.
			candorestart=1
		fi
	fi

	if [ $candorestart -ne 0 ]; then
		lognotice "$prefix.3011" `gettext "Restarting instance $INST_NAME, attempt #${retries}"`
		nsldap_svc_start "${MASTERED_LOGICAL_HOSTS}" "" ""
		if [ $? -ne 0 ]; then
			# Use INST_NAME like every other message in the probe.
			logerr "$prefix.4010" `gettext "Failed to restart instance $INST_NAME"`
		fi
	fi
}


# evaluate_instance_migration()
#
# Called when the retry logic reaches its wit's end.
# This routine checks if takeover is permitted for this instance,
# if this is a local/remote probe and tries to do the best it can.
#
# Globals read:
#	CLUST_COOKIE, TAKEOVER, LOCAL, LHOST, INST_NAME, prefix
# Globals written:
#	NEW_CLUST_COOKIE
#
# In any case, the probe is at the end of its rope here, so it exits:
# with status 1 if the cluster key changed since CLUST_COOKIE was
# recorded, with status 0 otherwise.
#
evaluate_instance_migration()
{
	# First check if there has been a cluster reconfiguration
	# which we somehow missed.
	#
	# The keys are compared as strings. With a numeric test (-ne) a key
	# that is not a plain integer makes test(1) fail instead of report
	# "different", and the changed key would go unnoticed.

	NEW_CLUST_COOKIE=`hactl -f cluster_key`
	if [ "$NEW_CLUST_COOKIE" != "$CLUST_COOKIE" ]; then
		logerr "$prefix.4011" `gettext "nsldap_probe: Missed cluster reconfiguration. Exiting"`
		exit 1
	fi

	# Check if takeover is permitted for this instance, and if it is,
	# ask the framework to attempt a takeover.
	if [ "$TAKEOVER" != "n" -a "$LOCAL" = "no" ]; then
		lognotice "$prefix.3012" `gettext "Attempting to take ownership of Netscape Directory Access Server instance $INST_NAME"`
		hactl -t -s nsldap -l $LHOST -L soft

	# Check if failover is permitted for this instance, and if it is,
	# ask the framework to attempt a failover.
	elif [ "$TAKEOVER" != "n" -a "$LOCAL" = "yes" ]; then
		lognotice "$prefix.3013" `gettext "Local probe for $INST_NAME: Giving up ownership"`
		hactl -g -s nsldap -l $LHOST -L soft

	# Migration is disabled for this instance: just report it.
	elif [ "$TAKEOVER" = "n" -a "$LOCAL" = "yes" ]; then
		logwarn "$prefix.3014" `gettext "Failover not permitted for instance $INST_NAME. Giving up"`
	elif [ "$TAKEOVER" = "n" -a "$LOCAL" = "no" ]; then
		logwarn "$prefix.3014" `gettext "Takeover not allowed for instance $INST_NAME. Giving up"`
	fi

	exit 0
}



#
# Get the probe interval for this instance
#
# The main loop hands this value to sleep(1) between two probes, so
# anything that is_numeric rejects is fatal.
#
PROBE_INTERVAL=`get_config_param $INST_NAME PROBE_1_INTERVAL`
is_numeric $PROBE_INTERVAL || {
	logerr "$prefix.4013" `gettext "Value of probe interval for instance $INST_NAME is not numeric" `
	exit 1
}

#
# Get the retry time interval (the time window in minutes)
#
# NOTE(review): a blank setting becomes 0. The main loop treats the window
# as elapsed when "time_diff -ge RETRY_INTERVAL", which is always true for
# 0, so the retry counter is reset on every failure -- confirm that this
# is the intended meaning of an "indefinite" window.
#
RETRY_INTERVAL=`get_config_param $INST_NAME RETRY_INTERVAL`
case "$RETRY_INTERVAL" in
'')
	# Left blank in the configuration
	RETRY_INTERVAL=0
	;;
*)
	is_numeric $RETRY_INTERVAL || {
		logerr "$prefix.4014" `gettext "Invalid value of RETRY_INTERVAL for instance $INST_NAME"`
		exit 1
	}
	;;
esac
# Minutes -> seconds
RETRY_INTERVAL=`expr $RETRY_INTERVAL \* 60`

#
# Get the takeover/failover flag for this instance
# (evaluate_instance_migration() only distinguishes "n" from anything else)
#
TAKEOVER=`get_config_param $INST_NAME PROBE_1_TAKEOVER`

#
# Get the retry number for this instance and validate it
#
RETRY_TIMES=`get_config_param $INST_NAME RETRY_TIMES`
case "$RETRY_TIMES" in
'')
	# Left blank in the configuration: assume no retries
	RETRY_TIMES=0
	;;
*)
	is_numeric $RETRY_TIMES || {
		logerr "$prefix.4015" `gettext "Invalid value of RETRY_TIMES for instance $INST_NAME"`
		exit 1
	}
	;;
esac

#
# Get the start script/program for this instance
#
# Two directory layouts are handled. If BASE_DIR holds a start-slapd
# script, the start-slapd/stop-slapd pair is used and ldapsearch is
# expected in ../shared/bin. Otherwise the plain start/stop scripts are
# used and ldapsearch is expected in ../bin/slapd/server.
# INST_STOP is set here but not referenced in the visible part of this file.
#
BASE_DIR=`get_config_param $INST_NAME BASE_DIR`
if test -f $BASE_DIR/start-slapd
then
	INST_MON_DIR=$BASE_DIR/../shared/bin
	INST_START=${BASE_DIR}/start-slapd
	INST_STOP=${BASE_DIR}/stop-slapd
else
	INST_MON_DIR=$BASE_DIR/../bin/slapd/server
	INST_START=${BASE_DIR}/start
	INST_STOP=${BASE_DIR}/stop
fi

# Program used by the local probe to query the server
INST_MON=$INST_MON_DIR/ldapsearch

#
# Get the logical host associated with this instance that
# the probe is supposed to check, the port to probe, and the
# time limit for a single probe.
#
LHOST=`get_config_param $INST_NAME LOGICAL_HOST`
LPORT=`get_config_param $INST_NAME PORT`
PROBE_TIMEOUT=`get_config_param $INST_NAME PROBE_1_TIMEOUT`
#
# PROBE_TIMEOUT is handed to "hatimerun -t" for every probe. Validate it
# the same way as the probe interval: with a blank or malformed value the
# probe command itself would presumably fail each time, and the loop
# would then report a healthy instance as dead.
#
is_numeric $PROBE_TIMEOUT
if [ $? -ne 0 ]; then
	logerr "$prefix.4017" `gettext "Value of probe timeout for instance $INST_NAME is not numeric"`
	exit 1
fi
# Only referenced by a commented-out pmfadm call in do_retry().
STOP_TIMEOUT=10

#
# A local probe needs the start script and the monitor program to be
# executable, and runs from the monitor program's directory (which is
# also appended to PATH). A remote probe needs none of that.
#
REMLOC=remote
case "$LOCAL" in
yes)
	REMLOC=local

	if test ! -x $INST_START
	then
		logerr "$prefix.4041" `gettext "Unable to execute $INST_START"`
		exit 1
	fi

	if test ! -x $INST_MON
	then
		logerr "$prefix.4042" `gettext "Unable to execute $INST_MON"`
		exit 1
	fi

	PATH=$PATH:$INST_MON_DIR
	export PATH
	cd $INST_MON_DIR
	;;
esac

lognotice "$prefix.2010" `gettext "Starting a $REMLOC probe of instance $INST_NAME on logical host $LHOST"`

#
# Probe state:
#   retries    - restart attempts counted in the current time window
#   wasdead    - 1 after a failed probe, 0 after a successful one; used to
#                log an "on-line now" notice on the first success that
#                follows a failure
#   start_time - time of the first failure of the current window (set in
#                the loop before it is first read)
#
retries=0
wasdead=0

# Remember the cluster key at probe start. evaluate_instance_migration()
# compares it with a fresh value before asking for a takeover/failover.
CLUST_COOKIE=`hactl -f cluster_key`

# Probe forever. The loop is only left through the exit calls in
# evaluate_instance_migration() or through the signal trap.
while : ; do

	#
	# Take a nap here
	# In the first iteration, this has the advantage of allowing time
	# for slapd to start.
	#
	sleep $PROBE_INTERVAL

	#
	#  Remote probe, just try to connect to slapd
	#  Local probe, connect to slapd with "cn=monitor"
	#
	#  Both commands run under hatimerun with PROBE_TIMEOUT and with
	#  their output discarded; only the exit status is used.
	#
	#  For the remote probe, telnet's escape character is set to 'e'
	#  and the here-document feeds it the line "equit" -- presumably the
	#  leading 'e' switches telnet to command mode so that "quit" closes
	#  the connection (TODO confirm). The here-document body and its
	#  TELEND terminator must keep their exact layout.

	if [ "$LOCAL" = "yes" ]; then
		hatimerun -t $PROBE_TIMEOUT $INST_MON -h $LHOST -p $LPORT -b "cn=monitor" -s base "objectClass=*" > /dev/null 2>&1
	else
		hatimerun -t $PROBE_TIMEOUT telnet -e'e' $LHOST $LPORT << TELEND > /dev/null 2>&1
	equit

TELEND
	fi

	# The exit status of the if/else above is that of the probe command
	# it ran, so $? still holds the probe result here. Do not insert any
	# command between the "fi" and this test.
	if [ $? -ne 0 ]; then
		logerr "$prefix.4016" `gettext "Probe detected that instance $INST_NAME is dead"`
		wasdead=1
		if [ $RETRY_TIMES -eq 0 ]; then
			#
			# This instance was configured not to do any retries.
			# Do the failover check.
			# (evaluate_instance_migration does not return)
			#
			evaluate_instance_migration
		elif [ $retries -eq 0 ]; then
			#
			# We are about to do the first retry in the time window.
			# Latch the time.
			#
			start_time=`fdl_timesecs`
			retries=1
			do_retry
		else
			#
			# This is not the first failure. First find
			# the time span from the first failure (in the
			# time window).
			#
			cur_time=`fdl_timesecs`
			time_diff=`expr $cur_time - $start_time`
			if [ $time_diff -ge $RETRY_INTERVAL ]; then
				#
				# This failure happened after the time window
				# elapsed, so we reset the retries counter,
				# slide the window, and do a retry.
				#
				retries=1
				start_time=$cur_time
				do_retry
			elif [ $retries -ge $RETRY_TIMES ]; then
				#
				# We are still within the time window,
				# and the retry counter expired, so if
				# failover is permitted, initiate one.
				#
				retries=0
				evaluate_instance_migration
			else
				#
				# We are still within the time window,
				# and the retry counter has not expired,
				# so do another retry.
				#
				retries=`expr $retries + 1`
				do_retry
			fi
		fi
	else    # The instance is doing OK
	    if [ $wasdead -eq 1 ]; then
		lognotice "$prefix.2017" `gettext "Instance $INST_NAME is on-line now"`

	    fi
	    wasdead=0

		# The success of this probe should compensate the retries
		# we have done so far in the current sliding window
		# (the counter is never lowered below 1 here)
		if [ $retries -gt 1 ]; then
			retries=`expr $retries - 1`
		fi
	fi
done

# XXX
#set +x

