#! /bin/sh
#
#ident "@(#)nsmail_probe.shi   1.7     97/07/25 SMI"
#
#	Copyright 07/25/97 Sun Microsystems, Inc.  All Rights Reserved.
#
# Usage: nsmail_probe <instance_name> <local_flag>
# 

#
# Add the path to framework binaries, since the probe is not called in the
# context of the methods
#
PATH="${PATH}:/opt/SUNWhadf/bin:/opt/SUNWhadf/fault_progs"

#
#	Copyright 11/18/96 Sun Microsystems, Inc.  All Rights Reserved.
#
#ident "@(#)ds_boiler		1.7	96/11/18 SMI"
#
# common boiler for HA Internet Pro data services
#
#


# Script identity and syslog settings (presumably consumed by the logging
# helpers sourced from ds_utilities -- confirm there).
ARGV0=`basename $0`
LOGGER=logger
HA_SLOGFACILITY=`haget -f syslog_facility`
HA_SLOGTAG=hadf
prog_path=`dirname $0`
# The data service's own clust_progs directory does not co-reside with the
# framework's.  The script's directory always goes first on PATH; when that
# directory is not itself under clust_progs (e.g. a fault monitor method),
# the sibling clust_progs directory is added as well.
case "$prog_path" in
*/clust_progs*)
	PATH=${prog_path}:${PATH}
	;;
*)
	PATH=${prog_path}:${prog_path}/../clust_progs:${PATH}
	;;
esac

# source in ha-services common utilities
. ds_utilities

#
# for use by subsequent hactl command, get hostnames of local and sibling hosts
#
# Name of the host this script is running on.
LOCALHOST=`uname -n` || {
	logerr `gettext "Cannot obtain name of local host"`
	exit 1
}
# The sibling is whichever of the two physical hosts is not the local one;
# anything other than exactly two physical hosts is treated as an error.
PHYS_HOSTS="`/opt/SUNWhadf/bin/haget -f all_physical_hosts`"
if [ `count_items $PHYS_HOSTS` -ne 2 ]; then
	logerr `gettext "Cannot compute hostname of sibling"`
	exit 1
fi
for i in $PHYS_HOSTS; do
	[ "$i" = "$LOCALHOST" ] || REMOTEHOST=$i
done

#
#	Copyright 11/20/96 Sun Microsystems, Inc.  All Rights Reserved.
#
#ident "@(#)do_service		1.7	96/11/20 SMI"
#
# do_service 
# Standard-name file that defines the standard-name routine bundle_do_svc(), 
# which implements the HA framework-defined methods for this data service.
# The _boiler file in this directory sources this file when the data
# service method scripts execute.
#

# Read in and interpret the data service environment configuration file
HADSCONF=/etc/opt/SUNWhansm/hadsconf
# Nothing below can work without the environment file, so give up if the
# helper reports failure.
if source_env_file $HADSCONF; then
	:
else
	exit 1
fi

#
# Global variables
#
# TCP ports probed for each mail protocol.
SMTP_PORT=25
POP3_PORT=110
IMAP4_PORT=143

# Probe timeout used by proto_connect when the instance does not define one.
DEF_PROBE_TIMEOUT=120

#
# After installation and setup, sendmail is a symlink, and sendmail.bak
# is Sun's sendmail.
#
SENDMAIL=/usr/lib/sendmail
SENDMAIL_SUN=/usr/lib/sendmail.bak

#
# Which channels to probe.  These could later move to the private section
# of the config file to make the choice configurable.
#
CHECK_SMTP=y
CHECK_POP3=y
CHECK_IMAP4=y

#
# Place holder for a future parameter that tunes how long to wait for the
# daemon to stop.  It is currently not used in the config file, so the
# value falls back to 15 seconds.
# NOTE(review): $INST_NAME is expanded here before any assignment to it
# that is visible in this file -- confirm the lookup is meant to happen
# this early.
#
STOP_TIMEOUT=`get_config_param $INST_NAME PRIV_STOP_TIMEOUT`
[ -n "$STOP_TIMEOUT" ] || STOP_TIMEOUT=15

#
# proto_connect  <host> <port>
#
# Attempt to connect to the mail server using telnet to a specified port.
# Will run under the probe time out.
#
# Return: 0 - success, 1 - failure.
#
proto_connect ()
{
	#
	# $1 - host to connect to, $2 - TCP port.
	# Sets the globals PROBE_TIMEOUT and conn_err; returns conn_err
	# (0 - connection accepted, 1 - refused or timed out).
	#
	PROBE_TIMEOUT=`get_config_param $INST_NAME PROBE_1_TIMEOUT`
	[ -z "$PROBE_TIMEOUT" ] && PROBE_TIMEOUT=$DEF_PROBE_TIMEOUT

	#
	# telnet is fed a single empty line on stdin.  Its output is kept
	# in a variable rather than in a file with a predictable name
	# under /tmp, so nothing can be planted there for the probe to
	# write through.
	#
	conn_out=`echo "" | hatimerun -t "$PROBE_TIMEOUT" /usr/bin/telnet "$1" "$2" 2>&1`
	if [ $? -eq 99 ]; then
		#
		# telnet had timed out
		#
		conn_err=1
	else
		#
		# we don't check the return code from telnet
		# since it returns 1 even when the connection succeeded.
		#
		case "$conn_out" in
		*refused*)
			conn_err=1
			;;
		*)
			conn_err=0
			;;
		esac
	fi

	return $conn_err
}

#
# check_ns_file
#
# Check if we have access to the Netscape config file and
# if it can be sourced in
#
# Can be called by any host in the cluster, since the conf file
# is on the private disk
#
check_ns_file ()
{
	#
	# The hadsconf entry CONF_FILE names the Netscape Mail config
	# file, which is a collection of shell variable assignments.
	# It is located, checked for existence and then sourced into the
	# current shell.  Any failure is logged and ends the script with
	# exit status 1; otherwise the function returns 0.
	#
	CONF_FILE=`get_config_param $INST_NAME CONF_FILE`

	[ -n "$CONF_FILE" ] || {
		logerr "Configuration file is missing entry for CONF_FILE"
		exit 1
	}

	[ -f "$CONF_FILE" ] || {
		logerr "Couldn't find $CONF_FILE file. Maybe Netscape Mail was not installed properly, or you didn't run hadssetup(1m)"
		exit 1
	}

	# Pull the Netscape variables into this shell
	. $CONF_FILE || {
		logerr "Error in Netscape Mail configuration file, $CONF_FILE"
		exit 1
	}

	return 0
}

#
# verify_conf 
#
# Verify the configuration of Netscape mail.
# Check the existence of the Netscape config file, and verify its contents.
#
# It should be called only from a host that owns the mail server logical
# diskset, since it is trying to access directories and files on this diskset.
#
verify_conf ()
{
	#
	# Source the Netscape config file (check_ns_file exits on failure),
	# then make sure each directory it names really exists.
	# Logs an error and exits 1 on the first problem; returns 0 when
	# everything checks out.
	#
	check_ns_file

	for ns_var in PostOffice ProgramDir MailboxDir; do
		eval ns_dir=\"\$$ns_var\"

		#
		# An empty value must be caught explicitly: an unquoted
		# "[ ! -d $var ]" collapses to "[ ! -d ]", which is false,
		# so a variable missing from the config file would
		# otherwise pass the check.
		#
		if [ -z "$ns_dir" ]; then
			logerr "Netscape Mail improperly installed, $ns_var is not set in $CONF_FILE"
			exit 1
		fi

		if [ ! -d "$ns_dir" ]; then
			logerr "Netscape Mail improperly installed, missing $ns_dir"
			exit 1
		fi
	done

	return 0

}

#
# do_symlink <to_sun>
#
# If to_sun is 0, create a symbolic link from sendmail to sendmail.bak,
# otherwise, create a symbolic link to NscpMail
#
do_symlink ()
{
	#
	# $1 (to_sun): 0 - point $SENDMAIL at Sun's sendmail ($SENDMAIL_SUN),
	#              otherwise - point it at Netscape's sendmail under
	#              $ProgramDir and give the link the mail user's group.
	# Returns 0 when the link is (now) as requested, 1 on any error.
	#
	to_sun=$1
	NS_SENDMAIL=$ProgramDir/bin/sendmail

	if [ ! -h "$SENDMAIL" ]; then
		logerr "$SENDMAIL should be a symbolic link. The system is not properly configured"
		return 1
	fi

	if [ ! -f "$SENDMAIL_SUN" ]; then
		logerr "Could not find the file $SENDMAIL_SUN"
		return 1
	fi

	#
	# Extract where the link points now.  Comparing the target itself,
	# rather than grepping the whole "ls -l" line for a path, avoids
	# false matches against the link's own name and against regular
	# expression characters (such as ".") in the path.
	#
	cur_target=`ls -l "$SENDMAIL" | sed -e 's/^.* -> //'`

	if [ "$to_sun" -eq 0 ]; then
		[ "$cur_target" = "$SENDMAIL_SUN" ] && return 0

		rm -f "$SENDMAIL" >/dev/null 2>&1
		if ln -s "$SENDMAIL_SUN" "$SENDMAIL"; then
			:
		else
			logerr "Could not link $SENDMAIL to $SENDMAIL_SUN"
			return 1
		fi
	else
		if [ -z "$ProgramDir" ]; then
			logerr "ProgramDir is not set; cannot locate the Netscape sendmail"
			return 1
		fi

		# Any target below $ProgramDir counts as already linked
		case "$cur_target" in
		"$ProgramDir"/*)
			return 0
			;;
		esac

		rm -f "$SENDMAIL" >/dev/null 2>&1
		if ln -s "$NS_SENDMAIL" "$SENDMAIL"; then
			:
		else
			logerr "Could not link $SENDMAIL to $NS_SENDMAIL"
			return 1
		fi
		mtagid=`/usr/xpg4/bin/id -g "$MailUserName" 2>/dev/null`
		if [ -z "$mtagid" ]; then
			logerr "Couldn't get the group id of \"$MailUserName\" "
			return 1
		fi
		chgrp -h "$mtagid" "$SENDMAIL"
	fi

	return 0

}

#
# start_nsmail
#
# Start the Netscape mail daemon
#
start_nsmail ()
{
	#
	# Make sure the Netscape mail config file matches reality
	# (verify_conf exits the script when it does not).
	#
	verify_conf

	NSMAIL_PROG=$ProgramDir/NscpMail

	#
	# Point sendmail at NscpMail if it is not already
	#
	do_symlink 1 || return 1

	#
	# Idempotency check: nothing to do unless the helper reports the
	# instance as not running.
	#
	ha_svc_not_running $INST_NAME || return 0

	#
	# Launch the daemon using the process monitor.
	# XXX If pmfadm failed because the instance was already there,
	# XXX ignore the error. Otherwise, log an error message.
	# XXX currently, pmfadm doesn't support different error codes.
	#
	if pmfadm -c $INST_NAME $NSMAIL_PROG >/dev/null 2>&1; then
		start_code=0
		lognotice "Started Netscape Mail instance $INST_NAME"
	else
		start_code=$?
		logerr "Failed to start Netscape Mail instance $INST_NAME"
	fi

	return $start_code
}

#
# stop_nsmail
#
# Stop Netscape Mail. We do so by sending a TERM signal to the mail daemon
# and waiting for it to exit, using the process monitor. If after the
# waiting period it did not exit, we'll send it a KILL signal.
# 
stop_nsmail ()
{
	#
	# Source in the Netscape config file (exits the script on failure)
	#
	check_ns_file

	#
	# Point sendmail back at sendmail.bak if it is not already
	#
	do_symlink 0 || return 1

	# Nothing to stop when the helper reports the instance as not running
	if ha_svc_not_running $INST_NAME; then
		return 0
	fi

	#
	# Ask the process monitor to send TERM and wait up to
	# $STOP_TIMEOUT for the daemon to exit; only when that fails is
	# KILL sent.
	#
	if pmfadm -w $STOP_TIMEOUT -s $INST_NAME TERM; then
		stop_code=0
	else
		pmfadm -s $INST_NAME KILL
		stop_code=$?
	fi

	if [ $stop_code -eq 0 ]; then
		lognotice "Stopped Netscape Mail instance $INST_NAME"
	else
		logerr "Failed to stop Netscape Mail instance $INST_NAME"
	fi

	return $stop_code
}



#
# bundle_do_svc <action>
#
# called for each instance
#
bundle_do_svc ()
{
	#
	# $1 (action): the framework method to carry out for the instance
	# named by $_INST_NAME.  Actions without a branch below, and the
	# *_net / fm_init actions, do nothing.
	# Returns 0 on success and non-zero on failure.  Helpers called
	# from here (check_ns_file, verify_conf) may exit the script.
	#
	action=$1
	# XXX
	INST_NAME=$_INST_NAME

	LOGICAL_HOST=`get_config_param $INST_NAME LOGICAL_HOST`

	ret_code=0

	case "$action" in
	'start')
		start_nsmail
		ret_code=$?
		;;

	'start_net')
		;;

	'stop' )
		stop_nsmail
		ret_code=$?
		;;

	'stop_net' )
		;;

	'abort' )
		#
		# source in the Netscape config file
		#
		check_ns_file

		#
		# If needed, create symbolic link from sendmail to sendmail.bak
		#
		do_symlink 0
		pmfadm -s $INST_NAME KILL >/dev/null 2>&1
		# XXX If pmfadm failed because the instance was not there,
		# XXX ignore the error. Otherwise, log an error message.
		# XXX currently, pmfadm doesn't support different error codes.
		;;

	'abort_net')
		;;

	'fm_init')
		;;

	'fm_start')
		#
		# Check if we are about to probe a logical host we master (local
		# probe),  or one which another host masters (remote probe), or
		# one in maintenance mode (don't probe)
		#
		maint=`haget -f is_maint -h $LOGICAL_HOST`
		if [ $? -ne 0 ]; then
			# Log the failure; the message used to be run as a command
			logerr "haget(1M) failed for logical host $LOGICAL_HOST"
			ret_code=1
		elif [ "$maint" -eq 0 ]; then
			if is_member "$LOGICAL_HOST" "$MASTERED_LOGICAL_HOSTS" ; then
				local=y
			else
				local=n
			fi
			#
			# Launch a probe using the process monitor.
			# We are using the process monitor just to start and tag
			# the probe, without the retry feature of the process monitor.
			#
			PROBE_PROG=`get_config_param $INST_NAME PROBE_1_PROG`
			pmfadm -c ${INST_NAME}_probe_1 \
				$PROBE_PROG $INST_NAME $local >/dev/null 2>&1
			if [ $? -ne 0 ]; then
				logerr \
	"Failed to start Netscape Mail probe instance ${INST_NAME}_probe_1"
				ret_code=1
			else
				lognotice \
	"Started Netscape Mail probe instance ${INST_NAME}_probe_1"
			fi
		fi
		;;

	'fm_stop')
		#
		# If probe not running, do nothing.
		# NOTE(review): this exits the whole script rather than
		# returning from the function -- confirm that is intended
		# when several instances are handled in one run.
		#
		ha_svc_not_running ${INST_NAME}_probe_1 && exit 0
		#
		# Kill the probe that is associated with this instance.
		#
		pmfadm -s ${INST_NAME}_probe_1 -w $STOP_TIMEOUT TERM || \
			pmfadm -s ${INST_NAME}_probe_1 KILL
		if [ $? -ne 0 ]; then
			logerr "Failed to stop Netscape Mail probe instance ${INST_NAME}_probe_1"
			ret_code=1
		else
			lognotice "Stopped Netscape Mail probe instance ${INST_NAME}_probe_1"
		fi

		;;

	'fm_check_this_host_ok')
		#
		# Verify that we have access to the netscape config file
		#
		check_ns_file
		ret_code=$?
		;;

	esac

	return $ret_code

}

#include_boiler

# XXX
#set -x

# Positional arguments (see the Usage line at the top of the file):
#   $1 - name of the instance to probe
#   $2 - local flag; the code below compares it against "y" and "n"
INST_NAME=$1
LOCAL=$2


#
# check_restart
#
# Check if the need_restart flag was raised.
# If it was, restart the daemon.
#
check_restart ()
{
	# Nothing to do unless a probe raised the need_restart flag
	[ $need_restart -ne 0 ] || return $?

	# Only the local probe may restart the daemon
	case "$LOCAL" in
	n)
		logerr "Internal error: local restart requested from a remote probe"
		exit 1
		;;
	esac

	# Clear the flag before starting the daemon
	need_restart=0

	#
	# Make sure the Netscape mail config file matches reality
	# (verify_conf exits the script when it does not).
	#
	verify_conf

	NSMAIL_PROG=$ProgramDir/NscpMail

	#
	# Launch the daemon using the process monitor.
	# XXX If pmfadm failed because the instance was already there,
	# XXX ignore the error. Otherwise, log an error message.
	# XXX currently, pmfadm doesn't support different error codes.
	#
	lognotice "Starting Netscape Mail instance $INST_NAME"
	pmfadm -c $INST_NAME $NSMAIL_PROG >/dev/null 2>&1 && return 0
	logerr "Failed to start Netscape Mail"
}


#
# Check if takeover is permitted for this instance, and if it is,
# ask the framework to attempt a takeover. 
#
# We count here on the remote probe to initiate a takeover, since
# the local probe does not detect any special failures that the 
# remote probe will miss.
#
# In any case, the probe is at the end of its rope here, so it exits.
#
check_takeover ()
{
	#
	# A takeover is requested only by a remote probe (LOCAL is "n")
	# whose takeover flag is not "n"; in every other case a warning is
	# logged instead.  Either way the probe then exits with status 0.
	#
	if [ "$TAKEOVER" = "n" ] || [ "$LOCAL" != "n" ]; then
		logwarn "Giving up on instance $INST_NAME. "
	else
		lognotice "Attempting to take ownership of Netscape Mail instance $INST_NAME"
		# $REMOTEHOST was set in ds_boiler
		hactl -t -s nsmail -p $REMOTEHOST
	fi

	exit 0
}

smtp_probe ()
{
	#
	# $1 - port to probe.  Runs a "helo ... quit" exchange against
	# $LHOST under the probe timeout.  On failure the globals failed
	# and smtp_failed are set to 1; a local probe also sets
	# need_restart when "pmfadm -l" fails for the instance.
	#
	port=$1

	if hatimerun -t  $PROBE_TIMEOUT tcpclnt $LHOST $port "helo $HOSTNAME" quit >/dev/null 2>&1
	then
		return 0
	fi

	failed=1
	smtp_failed=1

	# Only the local probe decides about restarts
	[ "$LOCAL" = "y" ] || return 0

	pmfadm -l $INST_NAME >/dev/null 2>&1 || need_restart=1
	return 0
}

pop3_probe ()
{
	#
	# $1 - port to probe.  Sends "quit" to $LHOST under the probe
	# timeout.  On failure the globals failed and pop3_failed are set
	# to 1; a local probe also sets need_restart when "pmfadm -l"
	# fails for the instance.
	#
	port=$1

	if hatimerun -t  $PROBE_TIMEOUT tcpclnt $LHOST $port quit >/dev/null 2>&1
	then
		return 0
	fi

	failed=1
	pop3_failed=1

	# Only the local probe decides about restarts
	[ "$LOCAL" = "y" ] || return 0

	pmfadm -l $INST_NAME >/dev/null 2>&1 || need_restart=1
	return 0
}

imap_probe ()
{
	#
	# $1 - port to probe.  Sends CAPABILITY followed by LOGOUT to
	# $LHOST under the probe timeout.  On failure the globals failed
	# and imap_failed are set to 1; a local probe also sets
	# need_restart when "pmfadm -l" fails for the instance.
	#
	port=$1

	if hatimerun -t  $PROBE_TIMEOUT tcpclnt $LHOST $port "A00000 CAPABILITY" "A00001 LOGOUT"  >/dev/null 2>&1
	then
		return 0
	fi

	failed=1
	imap_failed=1

	# Only the local probe decides about restarts
	[ "$LOCAL" = "y" ] || return 0

	pmfadm -l $INST_NAME >/dev/null 2>&1 || need_restart=1
	return 0
}

################################################################################
#
# Main program
#
################################################################################

#
# Get the probe interval for this instance
#
#
# Every numeric parameter is validated up front.  An empty or non-numeric
# value would otherwise only show up later as failing "expr" and "[ ... ]"
# commands inside the probe loop, silently disabling the retry/takeover
# logic.
#

# Probe interval for this instance (used as the sleep between probe rounds)
PROBE_INTERVAL=`get_config_param $INST_NAME PROBE_1_INTERVAL`
[ -n "$PROBE_INTERVAL" ] && is_numeric "$PROBE_INTERVAL"
if [ $? -ne 0 ]; then
	logerr "Value of probe interval for instance $INST_NAME is not numeric" 
	exit 1
fi

# Probe timeout (passed to hatimerun by the per-protocol probes)
PROBE_TIMEOUT=`get_config_param $INST_NAME PROBE_1_TIMEOUT`
[ -n "$PROBE_TIMEOUT" ] && is_numeric "$PROBE_TIMEOUT"
if [ $? -ne 0 ]; then
	logerr "Value of probe timeout for instance $INST_NAME is not numeric" 
	exit 1
fi

# Retry time interval (the time window in minutes)
RETRY_INTERVAL=`get_config_param $INST_NAME RETRY_INTERVAL`
[ -n "$RETRY_INTERVAL" ] && is_numeric "$RETRY_INTERVAL"
if [ $? -ne 0 ]; then
	logerr "Value of retry interval for instance $INST_NAME is not numeric"
	exit 1
fi

# Convert to seconds
RETRY_INTERVAL=`expr $RETRY_INTERVAL \* 60`

# Takeover/failover flag for this instance
TAKEOVER=`get_config_param $INST_NAME PROBE_1_TAKEOVER`

# Number of retries allowed within the time window
RETRY_TIMES=`get_config_param $INST_NAME RETRY_TIMES`
[ -n "$RETRY_TIMES" ] && is_numeric "$RETRY_TIMES"
if [ $? -ne 0 ]; then
	logerr "Value of retry times for instance $INST_NAME is not numeric"
	exit 1
fi

#
# Logical host associated with this instance that the probe is
# supposed to check.
#
LHOST=`get_config_param $INST_NAME LOGICAL_HOST`

HOSTNAME=`uname -n`


# Probe state: retries counts the restarts attempted in the current time
# window; the *_failed flags and need_restart are raised by the probes.
retries=0
need_restart=0
failed=0
smtp_failed=0
pop3_failed=0
imap_failed=0


while : ; do
	#
	# Wait out the probe interval before each round
	#
	sleep $PROBE_INTERVAL

	#
	# Probe the enabled protocols in order.  The round is cut short as
	# soon as a probe asks for a restart.
	#
	for proto in SMTP POP3 IMAP4 ; do
		case $proto in
		SMTP)
			[ "$CHECK_SMTP" = "y" ] || continue
			smtp_probe $SMTP_PORT
			if [ $smtp_failed -ne 0 ]; then
				logwarn "Failed to connect to SMTP port"
			fi
			if [ $need_restart -ne 0 ]; then
				break
			fi
			;;

		POP3)
			[ "$CHECK_POP3" = "y" ] || continue
			pop3_probe $POP3_PORT
			if [ $pop3_failed -ne 0 ]; then
				logwarn "Failed to connect to POP3 port"
			fi
			if [ $need_restart -ne 0 ]; then
				break
			fi
			;;

		IMAP4)
			[ "$CHECK_IMAP4" = "y" ] || continue
			imap_probe $IMAP4_PORT
			if [ $imap_failed -ne 0 ]; then
				logwarn "Failed to connect to IMAP4 port"
			fi
			if [ $need_restart -ne 0 ]; then
				break
			fi
			;;
		esac
	done

	smtp_failed=0
	pop3_failed=0
	imap_failed=0

	# A clean round needs no further action
	[ $failed -eq 1 ] || continue
	failed=0

	#
	# Decide between another retry and a takeover check.
	#
	next_step=restart
	if [ $RETRY_TIMES -eq 0 ]; then
		#
		# This instance was configured not to do any retries.
		#
		next_step=takeover
	elif [ $retries -eq 0 ]; then
		#
		# First retry in the time window: latch the time.
		#
		start_time=`get_timesecs`
		retries=`expr $retries + 1`
	else
		#
		# Not the first failure: measure the time elapsed since the
		# first failure in the time window.
		#
		cur_time=`get_timesecs`
		time_diff=`expr $cur_time - $start_time`
		if [ $time_diff -ge $RETRY_INTERVAL ]; then
			#
			# The window has elapsed: reset the retries counter,
			# slide the window, and retry.
			#
			retries=1
			start_time=$cur_time
		elif [ $retries -ge $RETRY_TIMES ]; then
			#
			# Still within the window and out of retries: if
			# takeover is permitted, initiate one.
			#
			retries=0
			next_step=takeover
		else
			#
			# Still within the window with retries left.
			#
			retries=`expr $retries + 1`
		fi
	fi

	case $next_step in
	takeover)
		check_takeover
		;;
	restart)
		check_restart
		;;
	esac
done

# XXX
#set +x
