#! /bin/ksh
#
#
#pragma ident "@(#)nsnews_probe.shi 1.7     01/03/28 SMI"
#
#
#	Copyright 02/03/97 Sun Microsystems, Inc.  All Rights Reserved.
#

# Usage: nsnews_probe <instance name>
# Started up in the background via pmfd in nsnews_fm_start
# during reconfiguration.
# Connects to innd daemon to verify its health.
# Does not currently monitor newstime.

#
# The probe is not invoked in the context of the HA methods, so the
# framework binary directories are appended to PATH here.
#
PATH="${PATH}:/opt/SUNWcluster/bin:/opt/SUNWcluster/ha/nsnews"

# Instance to probe: the first command-line argument.
INST_NAME="$1"

# Prefix of the message ids this probe passes to logerr/lognotice.
typeset prefix="SUNWcluster.ha.nsnews.probe"

#
#	Copyright 11/18/96 Sun Microsystems, Inc.  All Rights Reserved.
#
#
#pragma ident "@(#)ds_boiler	1.1 97/06/12 SMI"
#
#ident "@(#)ds_boiler		1.7	96/11/18 SMI"
#
# common boiler for HA Internet Pro data services
#
#


# Program name, logger command and syslog facility/tag settings.
# (Presumably consumed by the logging helpers in ds_utilities.)
ARGV0=$(basename $0)
LOGGER=logger
HA_SLOGFACILITY=$(haget -f syslog_facility)
HA_SLOGTAG=hadf
prog_path=$(dirname $0)

# Pull in the utilities common to the HA data services.
. ds_utilities

# Make the service-specific clust_progs directory reachable: if this
# program already lives under a clust_progs directory only that
# directory is prepended, otherwise the sibling ../clust_progs is
# prepended as well.
if expr "$prog_path" : '.*/clust_progs' >/dev/null 2>&1; then
	PATH=${prog_path}:${PATH}
else
	PATH=${prog_path}:${prog_path}/../clust_progs:${PATH}
fi

# Same treatment for the service-specific fault_progs directory.
if expr "$prog_path" : '.*/fault_progs' >/dev/null 2>&1; then
	PATH=${prog_path}:${PATH}
else
	PATH=${prog_path}:${prog_path}/../fault_progs:${PATH}
fi

#
# Host names of the local node and of every other physical host,
# for use by a later hactl command.
#
REMOTEHOSTS=
LOCALHOST=$(uname -n) || {
	logerr $(gettext "Cannot obtain name of local host")
	exit 1
}

# Every physical host except the local one is a remote host.
PHYS_HOSTS="$(haget -f all_physical_hosts)"
for i in $PHYS_HOSTS; do
	[ "$i" = "$LOCALHOST" ] && continue
	REMOTEHOSTS="$REMOTEHOSTS $i"
done
#! /bin/sh 
#
#	Copyright 12/20/96 Sun Microsystems, Inc.  All Rights Reserved.
#
#
#pragma ident "@(#)do_service	1.6 98/04/02 SMI"
#
#ident "@(#)do_service		1.14	96/12/20 SMI"
#
#

# First two positional arguments: comma-separated lists of logical hosts.
ARG_MASTERED=$1
ARG_NOT_MASTERED=$2
SYSLOG_PREFIX="SUNWcluster.ha.nsnews"

# Turn each comma-separated list into a space-separated sh word list.
MASTERED="$(echo $ARG_MASTERED | tr ',' ' ')"
NOT_MASTERED="$(echo $ARG_NOT_MASTERED | tr ',' ' ')"

# source_env is expected to log its own error message on failure,
# so there is nothing more to report here; just give up.
source_env NSNEWS || exit 1

#
# Time to wait for SIGTERM to stop a process.
# This should be in the config file.
#
STOP_TIMEOUT=15

#
# bundle_do_svc <action>
#
# is called for each instance
#
# Carries out <action> for the nsnews instance described by the _INST_*
# variables, which must already be set when this function is called
# (they are not set anywhere in this function).
#
# Arguments:
#   $1 - one of: start, stop, abort, fm_start, fm_stop,
#        fm_check_this_host_ok.  Any other value falls through the
#        case statement and is treated as success.
#
# Variables read:
#   _INST_BASE_DIR, _INST_NAME, _INST_RETRY, _INST_LOGICAL_HOST,
#   _INST_PROBE_PROG_1, _INST_PROBE_TIMEOUT_1, _INST_PORT,
#   SYSLOG_PREFIX, STOP_TIMEOUT, MASTERED, LOCALHOST
#
# Variables written (not local): action, NEWS_CONFIG, NEWS_START,
#   NEWS_STOP, MAINT
#
# Exit status:
#   This function never returns to its caller: every path ends in
#   "exit", which terminates the calling shell with 0 on success (or
#   nothing to do) and 1 on failure.
#
bundle_do_svc ()
{
	typeset method_timeout
	typeset wait_time
	typeset kill_status

	action=$1

	# The following values work for Netscape News Server 1.1,
	# News 2.0 and Collabra 3.0. The base dir should be specified as
	# the server's root.
	# A different version of the News server may require different values.
	# They are "hard-coded" here for ease-of-use in configuration.
	# (User only has to specify basedir and port parameters.)

	NEWS_CONFIG=${_INST_BASE_DIR}/config
	# NEWS_BIN=${_INST_BASE_DIR}/bin/news
	NEWS_START=${_INST_BASE_DIR}/start
	NEWS_STOP=${_INST_BASE_DIR}/stop

	# Message ids logged from here are "<SYSLOG_PREFIX>.<action>.<number>".
	typeset prefix
	prefix="$SYSLOG_PREFIX.$action"

	case $action in

	'start')

		# First do some error checking.

		if [ ! -d $NEWS_CONFIG ]; then
			logerr "$prefix.4000" \
`gettext "missing nsnews config directory <$NEWS_CONFIG> for instance <${_INST_NAME}>"`
			exit 1
		fi

	#if [ ! -d $NEWS_BIN ]; then
	#	logerr "$prefix.4001" \
	# `gettext "missing nsnews bin directory <$NEWS_BIN> for instance <${_INST_NAME}>"`
  	#			exit 1
 	#	fi

		if [ ! -x $NEWS_START ]; then
			logerr "$prefix.4002" \
				`gettext "$NEWS_START is not executable."`
			exit 1
		fi

		# The process monitor facility calls the start program,
		# passing to it the instance-specific information it needs.
		# Note that we're using pmf to start/stop, but not to probe.

		# NOTE(review): when _INST_RETRY is anything other than "n"
		# nothing is started here, yet the status test below sees 0
		# (the exit status of an "if" that ran no branch) and the
		# instance is logged as started.  Confirm whether a retry
		# variant of the pmfadm call is missing.
		if [ "${_INST_RETRY}" = "n" ]; then
			pmfadm -c ${_INST_NAME} $NEWS_START
		fi

		if [ $? -ne 0 ]; then
			logerr "$prefix.4003" \
		`gettext "pmfadm failed to start NEWS instance ${_INST_NAME}"`
			exit 1
		else
			lognotice "$prefix.2000" \
			    `gettext "Started NEWS instance ${_INST_NAME}"`

		fi
	;;

	'stop' | 'abort')

		# delete from queue, but don't kill
		pmfadm -s ${_INST_NAME}
		if [ $? -ne 0 ]; then
			logerr "$prefix.4004" \
		`gettext "pmfadm failed to delete ${_INST_NAME} from queue"`
			exit 1
		fi

		# use Netscape's stop script to stop nsnews instance
		if [ ! -x $NEWS_STOP ]; then
			logerr "$prefix.4005" `gettext "$NEWS_STOP is not executable."`
			exit 1
		fi

		# Run the stop script under a timer 5 seconds shorter than
		# the value hareg reports for the stop method.
		method_timeout=`hareg -q nsnews -T stop`
		wait_time=`expr $method_timeout - 5`

		hatimerun -t $wait_time $NEWS_STOP

		ha_svc_not_running ${_INST_NAME}

		if [[ $? -ne 0 ]]; then
		    # Now kill any processes left out
		    pmfadm -s ${_INST_NAME} KILL
		    # Save the status immediately: any later command,
		    # including the test itself, overwrites $?.
		    kill_status=$?
		    if [ $kill_status -ne 0 ]; then
			logerr "$prefix.4010" \
			    `gettext "pmfadm failed to kill ${_INST_NAME}'s process and its sub-processes (if any) : pmfadm returned $kill_status"`
			exit 1
		    fi
		fi

		;;

	'fm_start')

        	# XXX
        	need_to_run_probe ${_INST_LOGICAL_HOST} ${LOCALHOST}

        	if [ $? -ne 0 ]; then
            		exit 0
        	fi

		# pmf starts nsnews_probe
		# nsnews_probe runs until nsnews_fm_stop kills it.
		# Don't start probe if diskset is in maintenance mode.

		# If this NEWS instance's diskset is in maint mode, exit now.
		MAINT=`haget -f is_maint -h ${_INST_LOGICAL_HOST}`
		if [ "$MAINT" = "1" ]; then
			exit 0
		fi

		pmfadm -c ${_INST_NAME}.probe \
		    ${_INST_PROBE_PROG_1} ${_INST_NAME}

		if [ $? -ne 0 ]; then
			logerr "$prefix.4007" \
`gettext "pmfadm failed to start NEWS probe instance <${_INST_NAME}.probe>"`
			exit 1
		else
			lognotice "$prefix.2002" \
		`gettext "Started NEWS probe instance ${_INST_NAME}.probe"`
		fi
	;;

	'fm_stop')

		# If probe not running, do nothing
		ha_svc_not_running ${_INST_NAME}.probe && exit 0

		# pmf kills nsnews_probe: TERM first, KILL only if that fails.
		pmfadm -s ${_INST_NAME}.probe -w ${STOP_TIMEOUT} TERM || \
			pmfadm -s ${_INST_NAME}.probe KILL
		if [ $? -ne 0 ]; then
			logerr "$prefix.4008" \
`gettext "pmfadm failed to stop NEWS probe instance ${_INST_NAME}.probe"`
		else
			lognotice "$prefix.2003" \
		`gettext "Stopped NEWS probe instance ${_INST_NAME}.probe"`
		fi
	;;

	'fm_check_this_host_ok')

		# If the HA-NEWS logical host for this instance
		# is not currently mastered by this machine, exit now.

		is_member "${_INST_LOGICAL_HOST}" "$MASTERED"
		if [ $? -ne 0 ]; then
			exit 0
		fi

		# Otherwise, probe News service now.
		# If dead, request will time out in
		# ${_INST_PROBE_TIMEOUT_1} secs.
		#
		# The commands are passed as the two words "date quit",
		# the same form the nsnews_probe loop uses.  The former
		# unquoted date\nquit\n reached inndprobe as the single
		# word "datenquitn" because the shell strips the
		# backslashes.

		hatimerun -t ${_INST_PROBE_TIMEOUT_1} \
		    /opt/SUNWcluster/ha/nsnews/inndprobe ${_INST_LOGICAL_HOST} \
		    ${_INST_PORT} date quit > /dev/null 2>&1
		if [ $? -ne 0 ]; then
logerr "$prefix.4009" `gettext "This server is supposed to be providing NEWS service for instance <${_INST_NAME}>, but isn't"`
			exit 1
		fi
	;;

	esac

	exit 0
}
#include_boiler

# The probe needs an instance name; validate it before it is handed to
# set_inst_name or used in any configuration lookup.
if [ -z "$INST_NAME" ]; then
	logerr "$prefix.4010" `gettext "Usage: $ARGV0 <instance>"`
	exit 1
fi

set_inst_name "${INST_NAME}"

# Logical hosts reported by haget as mastered at probe start-up.
MASTERED_LOGICAL_HOSTS="`haget -f mastered`"

NEWS_PORT=`get_config_param "$INST_NAME" PORT`
# required parameter
if [ -z "$NEWS_PORT" ]; then
	logerr "$prefix.4011" \
	    `gettext "NEWS_PORT value not set for instance $INST_NAME"`
	exit 1
fi

NEWS_HOST=`get_config_param "$INST_NAME" LOGICAL_HOST`
# parser requires this to be set

NEWS_PROBE_INTERVAL=`get_config_param "$INST_NAME" PROBE_1_INTERVAL`
# The parser requires this to be set, but doesn't check for negative
# values.  An empty value is guarded against as well: it would make the
# numeric test a syntax error and leave "sleep" without an argument,
# so the probe loop would spin without pausing.
if [ -z "$NEWS_PROBE_INTERVAL" ]; then
	lognotice "$prefix.2009" \
`gettext "INTERVAL value not set for instance $INST_NAME; using 60 seconds"`
	NEWS_PROBE_INTERVAL=60
elif [ "$NEWS_PROBE_INTERVAL" -lt 0 ]; then
	lognotice "$prefix.2004" \
`gettext "INTERVAL value is negative for instance $INST_NAME; using 60 seconds"`
	NEWS_PROBE_INTERVAL=60
fi

NEWS_PROBE_TIMEOUT=`get_config_param "$INST_NAME" PROBE_1_TIMEOUT`
# optional parameter, parser doesn't check for <= 0 values
if [ -z "$NEWS_PROBE_TIMEOUT" ]; then
	lognotice "$prefix.2005" \
`gettext "TIMEOUT value not set for instance $INST_NAME; using 60 seconds"`
	NEWS_PROBE_TIMEOUT=60
fi
# what timeout value is too low?
if [ "$NEWS_PROBE_TIMEOUT" -le 0 ]; then
	lognotice "$prefix.2006" \
`gettext "TIMEOUT is <= zero for instance $INST_NAME; resetting to 60 seconds"`
	NEWS_PROBE_TIMEOUT=60
fi

NEWS_TAKEOVER=`get_config_param "$INST_NAME" PROBE_1_TAKEOVER`
# optional parameter
if [ -z "$NEWS_TAKEOVER" ]; then
	lognotice "$prefix.2007" \
	   `gettext "TAKEOVER value not set for instance $INST_NAME; using 'y'"`
	NEWS_TAKEOVER=y
fi

# LOCAL=yes when the instance's logical host is in the mastered list
# obtained above.  It is computed once and not refreshed afterwards.
LOCAL=no
is_member "$NEWS_HOST" "$MASTERED_LOGICAL_HOSTS"
if [ $? -eq 0 ]; then
	# NEWS_HOST is running locally
	LOCAL=yes
fi

# Probe loop state:
#   NEWSGRACE - set to 1 once a sibling failure has been seen
#   RETRY     - number of local restarts attempted so far
#   FAIL      - 1 while the most recent probe failed
NEWSGRACE=0
NEWSPROBEFILE=/var/opt/SUNWcluster/run/.nsnews_probe
RETRY=0

#
# Time to wait for SIGTERM to stop a process.
# This should be in the config file.
#
STOP_TIMEOUT=15
FAIL=0

# Main probe loop: runs until the process is killed from outside.
while : ; do

	# Sleep first rather than last: at start-up this gives the
	# server more time to initialize itself.
	sleep $NEWS_PROBE_INTERVAL

	if hatimerun -t $NEWS_PROBE_TIMEOUT \
	    /opt/SUNWcluster/ha/nsnews/inndprobe $NEWS_HOST $NEWS_PORT \
	    date quit  > /dev/null 2>&1
	then
		# Probe succeeded.  Announce the recovery if the previous
		# probe had failed, then clear the failure state.
		if [ $FAIL -eq 1 ]; then
			lognotice "$prefix.2008" \
			    $(gettext "News instance ${INST_NAME} is up and running")
		fi
		FAIL=0
		NEWSGRACE=0
		continue
	fi

	# Probe failed.
	FAIL=1

	if [ $LOCAL = "yes" ]; then
		# The instance runs on this node: restart it here.
		logerr "$prefix.5000" \
		    $(gettext "nsnews instance $INST_NAME failed locally")
		RETRY=$((RETRY + 1))
		logerr "$prefix.5001" \
		    $(gettext "restarting nsnews instance $INST_NAME; restart number $RETRY")

		# innd is either dead or sick.  If pmf still knows about
		# the instance it must be removed from pmf's queue first;
		# otherwise the idempotency check will prevent the
		# restart from succeeding.
		ha_svc_not_running $INST_NAME
		if [ $? -eq 1 ]; then
			# Kill off newstime and innd processes and start
			# fresh: TERM first, KILL if that fails.
			pmfadm -s $INST_NAME -w $STOP_TIMEOUT TERM || \
				pmfadm -s $INST_NAME KILL
		fi
		nsnews_svc_start "$MASTERED_LOGICAL_HOSTS" ""

		# The sleep at the top of the loop gives the instance time
		# to come up before the next probe.  If the restart does
		# not succeed, the sibling will eventually take over when
		# TAKEOVER is 'y'.
		continue
	fi

	# The instance runs on the sibling node.
	logerr "$prefix.5002" \
	    $(gettext "nsnews instance $INST_NAME failed on sibling")

	# Give the sibling one chance to restart nsnews itself before
	# attempting a takeover.
	if [ $NEWSGRACE -eq 0 ]; then
		NEWSGRACE=1
		continue
	fi

	if [ "$NEWS_TAKEOVER" = "y" ]; then
		# Current master of the logical host.  Only the disabled
		# hactl -p form below referred to it:
		## hactl -t -s nsnews -p $CURRENT_MASTER
		CURRENT_MASTER="$(haget -f master -h $NEWS_HOST)"

		pmfadm -c ${INST_NAME}.hactl hactl -t -s nsnews \
		    -l $NEWS_HOST
	fi

done
