#!/bin/ksh
#
# @(#)pdbadmin 1.43 98/06/15 SMI
#
# 	Copyright (c) 1994 Sun Microsystems, Inc.
#
# scadmin - SPARCcluster SC administration script.
#

prog=`basename $0`	# script name (NOTE(review): appears unused below; usage prints $0)

# installation layout and tool paths
B=/opt/SUNWcluster/bin		# cluster binaries (cdbmatch, scccd, clustm, ...)
E=/etc/opt/SUNWcluster		# cluster configuration area (conf/<cluster>.cdb)
V=/var/opt/SUNWcluster		# cluster state area (NOTE(review): unused in this file)
PKGINFO=/bin/pkginfo		# NOTE(review): PKGINFO/GREP/SED look unused in this file
GREP=/bin/grep
SED=/bin/sed

# print usage and exit
# Print the command synopsis on stdout and terminate with status 2.
usage() {
	cat <<EOF
Usage:
	$0 [-a] [-f] startcluster localnodename clustname
	$0 [-a] [-f] startnode [clustname]
	$0 [-a] stopnode [clustname]
	$0 abortpartition localnodename clustname
	$0 continuepartition localnodename clustname
	$0 reldisks [clustname]
	$0 resdisks [clustname]
	$0 switch   clustname [-m] logical-hosts ...
	$0 switch   clustname dest-host logical-hosts ...
EOF
	exit 2
}

# lookup a value in the configuration file
# enmatch key [key ...]
#
# Look up a key in the cluster configuration database ${cdbfile} and
# print the value on stdout (whatever cdbmatch emits).  On lookup
# failure a diagnostic is written to stderr and the function returns 1.
# Relies on globals: B, cdbfile.
#
# Fixes: arguments are now passed as "$@" and ${cdbfile} is quoted, so
# keys/paths containing whitespace survive; the error path uses a { }
# command group instead of a subshell whose "return" only worked by
# accident of subshell-exit semantics.
enmatch() {
	${B}/cdbmatch "$@" "${cdbfile}" || {
		echo "enmatch" "cdbmatch $* ${cdbfile} failed" 1>&2
		return 1
	}
}

# find the nodeid of the local node
# Determine the cluster node id (0-3) of the local host by matching
# `uname -n` against the cluster.node.N.hostname entries in the CDB.
# Sets the globals ${myuname} and ${myid}.  If the local host is not
# configured as a cluster member, prints a message and exits 1.
get_nodeids () {
	myuname=`uname -n`
	myid=""
	for _nid in 0 1 2 3
	do
		if [ "`enmatch cluster.node.${_nid}.hostname`" = "${myuname}" ]
		then
			myid=${_nid}
			break
		fi
	done
	if [ -z "${myid}" ]
	then
		echo "Host ${myuname} is not a cluster member"
		exit 1
	fi
}

# switchover_cmd [-m | dest-host] logical-host ...
#
# Switch mastership of the named logical hosts.  With -m each host is
# put into maintenance mode (mastered by no node); otherwise each host
# is re-mastered on dest-host, which must be a current cluster member.
# Per-host problems are reported and the host is skipped; if the CCD
# cannot be returned to a sane state, the affected node is aborted
# (clustm abort) and the function exits 1.
# Relies on globals: B, clustname, cdbfile; calls set_loghost_mstate.
function switchover_cmd
{
typeset m_mode		# 1 when -m (maintenance mode) was given, else 0
typeset dest_host	# destination host (empty in maintenance mode)
typeset l lhlist	# logical host list and iterator
typeset row		# row retrieved from the CCD
typeset n curr_members	# current cluster membership and iterator
typeset nodename curr_nodes curr_nodeid dest_nodeid
typeset nodenames
integer found
typeset node
# NOTE(review): curr_master is assigned below but never typeset, so it
# leaks to global scope, unlike the other locals here.

# build the list of hostnames currently in the cluster membership
curr_nodes=""
curr_members=$(${B}/get_node_status | /bin/grep membership \
		| sed -e 's/membership: //')
if [[ "${curr_members}" = "unknown" ]]; then
  print "This node does not appear to be in the cluster membership."
  print "This command cannot be run from this node."
  exit 1
else
  for n in ${curr_members}; do
    nodename=$(${B}/cdbmatch cluster.node.${n}.hostname ${cdbfile})
    curr_nodes="${curr_nodes} ${nodename}"
  done
fi

# first argument is either -m or the destination host
dest_host=""
let m_mode=0
if [[ "$1" = "-m" ]]; then
  let m_mode=1
else
  dest_host=$1
  # check whether destination host is a cluster member
  let found=0
  for n in ${curr_nodes}; do
    if [[ "${n}" = "${dest_host}" ]]; then
      let found=1
      break
    fi
  done
  if (( found == 0 )); then
    print "Destination host ${dest_host} is not a cluster member."
    exit 1
  fi
fi

# the remaining arguments are the logical hosts to switch
shift
lhlist=$*

for l in ${lhlist}; do

  # check for the validity of the logical host
  row=$(${B}/scccd ${clustname} LOGHOST query lname ${l})
  if [[ -z "${row}" ]]; then
    print "The logical host ${l} is not defined in the Cluster Configuration Database."
    print "Ignoring ${l}."
    continue
  fi

  # Get nnodelist and check if the destination node exists
  # only if we are not in maintenance mode.
  if [ ${m_mode} -eq 0 ]; then
        # field 3 of the LOGHOST row is the comma-separated list of
        # nodes configured to master this logical host
        nodenames=`echo ${row} | /usr/bin/awk -F: '{ print $3}' | \
                          /usr/bin/tr ',' ' ' `
        found=0
        for node in ${nodenames}
        do
                if [ "${node}"  = "${dest_host}" ]; then
                        found=1
                        break; 
                fi
        done
        if [ ${found} = 0 ]; then
           print "The specified destination host - ${dest_host} - is not configured to master "
           print "this logical host ${l}."
           continue
        fi
  fi 

  # query for the current master of the logical host
  row=$(${B}/scccd ${clustname} LOGHOST_CM query lname ${l})
  # strip the first two ':'-separated fields; the rest is the master name
  curr_master=${row#*:*:}

  if [[ -n "${curr_master}" ]]; then

    if [[ -n "${dest_host}" && "${curr_master}" = "${dest_host}" ]]; then
      print "The specified destination host - ${dest_host} - is the current"
      print "master of logical host ${l}. Ignoring ${l}."
      continue
    fi

    # entering maintenance mode: record mstate 0 before tearing down
    if [ ${m_mode} -eq 1 ]; then
         set_loghost_mstate ${l} 0
    fi

    # find the node id of the current master
    # (dest_nodeid is picked up in the same pass for later abort use)

    for n in ${curr_members}; do
      nodename=$(${B}/cdbmatch cluster.node.${n}.hostname ${cdbfile})
      if [[ "${nodename}" = "${curr_master}" ]]; then
	curr_nodeid=${n}
      fi
      if [[ "${nodename}" = "${dest_host}" ]]; then
	dest_nodeid=${n}
      fi
    done

    # removing the LOGHOST_CM row brings the host down on its master
    ${B}/scccd ${clustname} LOGHOST_CM remove lname ${l}

    if [[ "$?" -ne 0 ]]; then
      #
      # Add it back so that the Logical Host
      # is brought to sane state again..
      #
      print "Unable to bring down logical host ${l} on ${curr_master}."
      print "Check system console logs on ${curr_master} for detailed"
      print "error messages. Trying to re-master it on the original"
      print "node"

      if [ ${m_mode} -eq 1 ]; then
         set_loghost_mstate ${l} 1
      fi

      ${B}/scccd ${clustname} LOGHOST_CM add "lname:curr_master" \
		"${l}:${curr_master}"

      if [[ "$?" -ne 0 ]]; then
	print "Unable to re-master the logical host ${l} on ${curr_master}"
	print "Aborting node to prevent data service inconsistencies."
        ${B}/clustm abort ${clustname} ${curr_nodeid}
        exit 1
      fi

      # we got the logical host re-mastered and hence continue
      continue
    fi
  fi

  if [[ -n "${dest_host}" ]]; then

    # leaving maintenance mode: record mstate 1 before mastering
    if [ ${m_mode} -eq 0 ]; then
        set_loghost_mstate ${l} 1
    fi

    # adding the LOGHOST_CM row brings the host up on dest_host
    ${B}/scccd ${clustname} LOGHOST_CM add "lname:curr_master" \
		"${l}:${dest_host}"

    if [[ "$?" -ne 0 ]]; then
      #
      # clean up the stuff if required, so that logical host is
      # not mastered anywhere.
      #

      print "Unable to bring up logical host ${l} on ${dest_host}."
      print "Check system console logs on ${dest_host} for detailed"
      print "error messages."

      ${B}/scccd ${clustname} LOGHOST_CM remove lname ${l}

      if [[ "$?" -ne 0 ]]; then

	 print "Unable to clean up logical host ${l} on ${dest_host}."
	 print "Aborting node to prevent data service inconsistencies."

         ${B}/clustm abort ${clustname} ${dest_nodeid}
         exit 1

      else

	#
	# we try to re-master the logical host back to its
	# original node where it was being mastered before.
	# If this fails , do a clean up and exit  the command.
	#

	if [[ ${m_mode} -eq 0  && -n "${curr_master}" ]]; then

     	    ${B}/scccd ${clustname} LOGHOST_CM add "lname:curr_master" \
		"${l}:${curr_master}"

       	    if [[ "$?" -ne 0 ]]; then
		print "Unable to Re-Master the Logical Host ${l} on ${curr_master}"
		print "Logical Host ${l} will not be mastered anywhere"

      		${B}/scccd ${clustname} LOGHOST_CM remove lname ${l}

		if [[ "$?" -ne 0 ]]; then
         	    ${B}/clustm abort ${clustname} ${curr_nodeid}
         	    exit 1
		fi
	    fi
	fi
      fi
    fi
  fi
done
}


# set_loghost_mstate logname value
#
# Record the maintenance-mode state of logical host <logname> in the
# LOGHOST_MSTATE section of the CCD.  <value> is 0 or 1.  No-op when
# the recorded state already matches.  On a failed add, a warning is
# printed but the function still returns.
# Relies on globals: B, clustname.
#
# Fixes: mstate and mode are now typeset like the other locals (they
# previously leaked into global scope), and the test operands are
# quoted.  An empty/missing row still behaves as before: the numeric
# comparison errors out, the condition is false, and no update is made.
set_loghost_mstate()
{
    typeset logname value mstate mode

    logname=$1
    value=$2

    # row looks like "<lname>:<field>:<mode>"; strip the first two
    # ':'-separated fields to recover the recorded mode
    mstate=$(${B}/scccd ${clustname} LOGHOST_MSTATE query lname ${logname})
    mode=${mstate#*:*:}
    if [ "${mode}" -ne "${value}" ]; then

        # the CCD has no update primitive: remove the old row, add the new
        ${B}/scccd ${clustname} LOGHOST_MSTATE remove lname ${logname}

        ${B}/scccd ${clustname} LOGHOST_MSTATE add "lname:mmode" "${logname}:${value}"

        if [[ "$?" -ne 0 ]]; then
                print "Unable to set the State of Logical Host to ${value}"
                print "Check system console logs for detailed error messages."
        fi
    fi
}



# get program options
# NOTE(review): this is the legacy getopt(1), which mangles arguments
# containing whitespace; the simple flags (-a, -f) and command words
# used here survive it, so it is left as-is.
set -- `getopt af $*`
if [ $? != 0 ]
then
        usage
fi
for i in  $*
do
        case $i in
        -a) async="-a"; shift ;;	# pass -a (async) through to reconf_ener
        -f) forcestart="-f"; shift ;;	# force start despite lost quorum
        --) shift; break;;		# end of options
        esac
done

# a sub-command is mandatory
if [ $# -lt 1 ]; then
 echo "Missing required parameter!"
 usage
 exit 2		# not reached: usage already exits 2
fi

# validate the sub-command word
# NOTE(review): ${cmd} is unquoted; an empty or multi-word value makes
# this test error out (treated as false) so the message is skipped —
# but the earlier $# check guarantees at least one argument here.
cmd=$1
if [  ${cmd} != "startnode" -a ${cmd} != "stopnode" -a ${cmd} != "reldisks" -a\
      ${cmd} != "startcluster" -a ${cmd} != "continuepartition" -a \
      ${cmd} != "abortpartition" -a ${cmd} != "resdisks" -a\
      ${cmd} != "switch" ]
then
	echo "invalid command: ${cmd}"
	usage
	exit 2
fi

# rjw +
# startcluster/abortpartition/continuepartition require an explicit
# local node name plus a cluster name; the other commands take only
# an optional cluster name.
nodename=""
if [ ${cmd} = "continuepartition" -o ${cmd} = "abortpartition" -o\
		${cmd} = "startcluster" ]; then
	nodename="$2"
	clustname="$3"
	if [ -z "${nodename}" -o -z "${clustname}" ]; then
		usage
	else
		echo "Node specified is ${nodename}"
		echo "Cluster specified is ${clustname}"
	fi
else
	clustname=$2
fi
# rjw -

# fall back to the configured default cluster name when none was given
if [ -z "${clustname}" ]; then
	if [ -f ${E}/conf/default_clustername ]; then
		clustname=`cat ${E}/conf/default_clustername`
		echo "Assuming a default cluster name of ${clustname}"
	else
		usage
	fi
fi


# every cluster has a configuration database file under ${E}/conf;
# its absence means the cluster name is wrong
cdbfile=${E}/conf/${clustname}.cdb

if [ ! -f $cdbfile ]; then
	echo "Error: Invalid cluster name: $clustname"
	echo "       The file \"$cdbfile\" was not found"
	usage
fi

# set variables related to node ids (get_nodeids sets ${myid})
get_nodeids
mynode=`eval enmatch cluster.node.${myid}.hostname`
### this is a throwback to the 2 node cluster days.  needs to be cleaned up

# check if nodename is the name of my node  rjw
# (the partition/startcluster commands may only act on the local node)
if [ -n "${nodename}" ]; then
 	if [ ${nodename} != ${mynode} ]; then
		usage
	fi
fi


# -f option is legal only with the 'startnode' command
# NOTE(review): the message and test also allow 'startcluster'; the
# one-line comment above predates that addition.
if [ "${forcestart}" = -f -a ${cmd} != "startnode" -a ${cmd} != "startcluster" ] ; then
	echo "Error: The -f option is only legal when used with the 'startcluster'"
	echo "or 'startnode' sub-command"
	usage
fi

# warn operator about the consequences of the -f option
# (-f brings the node up even when the other nodes and shared quorum
# devices cannot be reached, so demand explicit confirmation)
if [ "${forcestart}" = -f ] ; then

	# XXX - allow running a customized script here

	echo "=========================== WARNING ================================="
	echo "=      Multiple Failures have been detected in this cluster         ="
	echo "====================================================================="
	echo
	echo "You are attempting to start up the cluster node '${mynode}' using"
	echo "the -f option.  The -f option allows '${mynode}' to come online when"
	echo "other node(s) and the quorum devices shared with them are not reachable."
	echo "This action could corrupt the database and/or otherwise compromise"
	echo "cluster integrity if used incorrectly. Please read the following"
	echo "instructions carefully and refer to XXX documentation."
	echo 
	echo "Before you proceed, you must verify that:"
	echo "	(1) that the other node(s) are offline (i.e. halted)"
	echo "	(2) the quorum device(s) is offline (i.e. power is off)"
	echo
	echo "Note that you *must* bring the node '${mynode}' offline before"
	echo "restarting the quorum device(s) or any of the other nodes'." \
		" This can be"
	echo "done by executing the command"
	echo
	echo "	/opt/SUNWcluster/bin/scadmin stopnode ${clustname}"
	echo
	echo "Please enter \"yes\" only after you have verified (1) and (2)."

	# ckyorn prompts yes/no with default "no"; any reply other than
	# n/no falls through the case and the script continues
	reply=`ckyorn -Q -d "no" -p " Do you want to continue"`
	case $reply in
			[Nn]|[Nn][Oo])
				echo "exiting..." >& 2 ; exit 0 ;;
	esac
fi

# rjw+
# warn operator about the consequences of the startcluster command
# (starting a new single-node cluster while another cluster is active
# risks data corruption, so demand explicit confirmation on stderr)
if [ "${cmd}" = startcluster ] ; then
	# XXX - allow running a customized script here


	echo >& 2
	echo "=========================== WARNING =================================" >& 2
	echo "=                     Creating a new cluster                        =" >& 2
	echo "=====================================================================" >& 2
	echo >& 2
	echo "You are attempting to start up the cluster node '${mynode}' as the" >& 2
	echo "only node in a new cluster.  It is important that no other cluster" >& 2
	echo "nodes be active at this time.  If this node hears from other cluster" >& 2
	echo "nodes, this node will abort.  Other nodes may only join after this" >& 2
	echo "command has completed successfully.  Data corruption may occur if" >& 2
	echo "more than one cluster is active." >& 2
	echo >& 2

	# ckyorn prompts yes/no with default "no"; any reply other than
	# n/no falls through the case and the script continues
	reply=`ckyorn -Q -d "no" -p " Do you want to continue"`
	case $reply in
			[Nn]|[Nn][Oo])
				echo "exiting..." >& 2 ; exit 0 ;;
	esac
fi

# change current working directory to /opt/SUNWcluster/bin to avoid
# all kinds of problems with the 'rm(1)' command when filesystems
# disappear from underneath it or root does not have privileges to
# open the current working directory.

cd /opt/SUNWcluster/bin

# rjw-

# dispatch into reconf_ener

# The "|| (echo ...; exit 1) || exit 1" idiom: the subshell prints the
# error and terminates itself with 1 (the inner "exit 1" only leaves
# the subshell); the trailing "|| exit 1" then exits this script.
case ${cmd} in
	startnode | stopnode)
		${B}/reconf_ener ${forcestart} ${async} ${cmd} \
		       ${clustname} \
		      || (echo "$0: errors encountered."; \
			exit 1) || exit 1;;
# rjw+
	startcluster)
		# NOTE(review): startcluster is delegated as a startnode
		# with the -n flag; -n semantics are defined by reconf_ener
		${B}/reconf_ener ${forcestart} ${async} -n startnode ${clustname} \
		      || (echo "$0: errors encountered."; \
			exit 1) || exit 1;;
	continuepartition)
		echo "*** Node ${mynode} will continue participating cluster ${clustname}"
		${B}/clustm continuepartition ${clustname} || \
			(echo "$0: errors encountered."; \
			exit 1) || exit 1;;
	abortpartition)
		echo "*** Node ${mynode} will abort from cluster ${clustname}"
		${B}/clustm abortpartition ${clustname} || \
			(echo "$0: errors encountered."; \
			exit 1) || exit 1;;
	resdisks | reldisks)
		${B}/reconf_ener ${cmd} ${clustname} || \
			(echo "$0: errors encountered."; \
			exit 1) || exit 1;;
	switch)
		if (( $# < 2 )); then
			print "$0: Required parameters missing."
			usage; exit 1
		else
			# drop "switch" and clustname; the remainder is
			# [-m | dest-host] logical-hosts ...
			shift 2
			# NOTE(review): with job control (set -m) the
			# switchover runs as a background job in its own
			# process group, and this script falls through to
			# "exit 0" immediately — the inner "exit 1" cannot
			# change this script's exit status.
			set -m
			( switchover_cmd $* || \
			  (echo "$0: errors encountered."; exit 1) || exit 1)&
		fi
		;;
# rjw-
	*)
		usage; exit 1 ;;
esac
exit 0

