#!/bin/sh
#
# Quick and dirty watchdog script that manages VMware services.
# Launches the specified process and respawns it after it exits.
# Gives up after recording the specified number of 'quick failures'
# in succession, or after recording the specified total number of
# failures (over any length of time).
#

# Signal handler used to cleanly exit the watchdog.
#
# Logs the event, removes the PID file when this invocation is the watchdog
# itself (START = 1), and exits with status 0.
#
# Globals read: START, PIDFILE.
# The trap that calls this handler is installed before either variable is
# assigned, so START falls back to 0 and an empty PIDFILE is never passed
# to rm. PIDFILE is quoted so a tag containing whitespace still removes the
# right file.
cleanup () {
   log daemon.info "[$$] Signal received: exiting the watchdog"
   if [ "${START:-0}" = 1 ] && [ -n "$PIDFILE" ] ; then
       rm -rf -- "$PIDFILE"
   fi
   exit 0
}

# Route the signals listed below (given by number) to cleanup so the watchdog
# exits through its handler. Signal 9 cannot be trapped and is left out; the
# list also omits 4, 5, 11, 12, 17-23, 28 and 29, which keep their default
# disposition.
trap cleanup \
    1 2 3 6 7 8 10 \
    13 14 15 16 \
    24 25 26 27 30 31

# Print the command-line synopsis on stdout and exit with status 1.
# Lists every option present in the getopts string, including -c (the
# command run after each exit of <command>).
usage () {
    printf '%s\n' \
        "Usage: $0 [-n] -s|-k|-r <tag> [(options)] <command>" \
        "   Start the watchdog: $0 -s <tag> [-u <min_uptime>] [-q <max_quick_failures>] [-t <max_total_failures>] [-c <cleanup_command>] <command>" \
        "   Kill a running watchdog: $0 -k <tag>" \
        "   Query whether a watchdog is running: $0 -r <tag>" \
        "   Suppress logging: -n"
    exit 1
}

# Every valid invocation needs an action flag plus its tag, i.e. at least
# two arguments; anything shorter gets the usage text (usage exits).
[ $# -ge 2 ] || usage

# Send a message to syslog through logger, tagged with LABEL.
#
# Usage: log <priority> <message> [<logger option>...]
#   <priority>       facility.level handed to logger -p (e.g. daemon.info)
#   <message>        text to log
#   <logger option>  extra options forwarded to logger (callers pass -s)
#
# Does nothing when QUIET is non-zero. With fewer than two arguments a
# "Bad log usage." entry is logged at daemon.err instead.
#
# Globals read: QUIET, LABEL. Both may still be unset when the signal
# handler logs early, so QUIET defaults to 0 and LABEL to "watchdog";
# an empty, unquoted tag would make -t swallow the message.
# Globals written: _loglevel, _logmsg.
log () {
    if [ "${QUIET:-0}" -ne 0 ] ; then
        return 0
    fi

    if [ $# -lt 2 ] ; then
        logger -p daemon.err -t "${LABEL:-watchdog}" "Bad log usage."
    else
        _loglevel=$1
        _logmsg=$2
        shift 2
        # Options go before the message operand so they are honoured even
        # by a logger that does not permute its arguments; "--" keeps a
        # message starting with "-" from being parsed as an option.
        logger -p "$_loglevel" -t "${LABEL:-watchdog}" "$@" -- "$_logmsg"
    fi
}

# Run the watchdog loop: launch CMD, wait for it to exit, and respawn it
# until a failure limit is exceeded. Never returns; exits with status 0.
#
# Globals read:    CMD, CLEANUP_CMD, MIN_UPTIME, MAX_QUICK_FAILURES,
#                  MAX_TOTAL_FAILURES, PIDFILE
# Globals written: MSG, LAST, NOW, UPTIME, LOG_MESSAGE,
#                  QUICK_FAILURES, TOTAL_FAILURES
#
# The PID of this shell is stored in PIDFILE for the duration of the loop
# and the file is removed when the loop ends.
start () {
    MSG="[$$] Begin '$CMD', min-uptime = $MIN_UPTIME, max-quick-failures = "
    MSG="$MSG$MAX_QUICK_FAILURES, max-total-failures = $MAX_TOTAL_FAILURES"
    # Quoted so whitespace and glob characters in CMD are printed verbatim.
    printf '%s\n' "$MSG"
    log daemon.info "$MSG"
    echo $$ > "$PIDFILE"

    # Keep going while neither counter has exceeded its maximum.
    while [ "$QUICK_FAILURES" -le "$MAX_QUICK_FAILURES" ] &&
          [ "$TOTAL_FAILURES" -le "$MAX_TOTAL_FAILURES" ]
    do
        log daemon.info "Executing '$CMD'"
        LAST=$(date +%s)
        # Launch the command through setsid, detached from this shell's
        # session, which helps with signal handling. CMD is deliberately
        # unquoted: it is a flat string that must be split into words.
        setsid $CMD &
        wait
        TOTAL_FAILURES=$((TOTAL_FAILURES + 1))
        NOW=$(date +%s)
        UPTIME=$((NOW - LAST))
        if [ "$UPTIME" -lt "$MIN_UPTIME" ] ; then
            # Exited sooner than MIN_UPTIME seconds: count a quick failure.
            QUICK_FAILURES=$((QUICK_FAILURES + 1))
            LOG_MESSAGE="'$CMD' exited after $UPTIME seconds (quick failure $QUICK_FAILURES)"
            log daemon.err "$LOG_MESSAGE"
        else
            # A long enough run resets the consecutive quick-failure count.
            QUICK_FAILURES=0
            log daemon.err "'$CMD' exited after $UPTIME seconds"
        fi
        if [ -n "$CLEANUP_CMD" ] ; then
            log daemon.info "Executing cleanup command '$CLEANUP_CMD'"
            # Unquoted on purpose, for the same reason as CMD.
            setsid $CLEANUP_CMD > /dev/null 2>&1
        fi
    done

    log daemon.err "End '$CMD', failure limit reached"
    rm -rf "$PIDFILE"
    exit 0
}

# Terminate the watchdog found by query.
#
# When query locates a watchdog, its PID (left in the global PID by query)
# is logged and sent SIGHUP. When it does not, the failure is logged and the
# script exits. MSG is overwritten in both cases.
stop () {
    if query > /dev/null ; then
        MSG="Terminating watchdog with PID $PID"
        log daemon.info "$MSG" -s
        kill -HUP $PID
    else
        MSG="Unable to terminate watchdog: Can't find process"
        log daemon.info "$MSG" -s
        exit
    fi
}

# Find the PID of the watchdog running for TAG.
#
# The PID file is consulted first. If it is missing or empty, the process
# list is searched as a last resort (not foolproof, but it should work most
# of the time).
#
# Globals read:    PIDFILE, TAG
# Globals written: PID
# Output:  the PID on stdout when one is found
# Returns: 0 when a PID was found, 1 otherwise
#
# Only POSIX `[` tests are used so the function works under any /bin/sh.
query () {
    PID=$(cat "$PIDFILE" 2> /dev/null)

    if [ -z "$PID" ] ; then
        # This is a measure of last resort in case the pid file has been
        # deleted somehow.
        log daemon.info "PID file $PIDFILE not found" -s

        if [ "$(uname)" = "VMkernel" ] ; then
            # The awk selects the lines that list a watchdog run with the
            # specified TAG, skips lines containing our own PID, and
            # extracts the first column.
            PID=$(ps -cu 2> /dev/null | awk "!/$$/ && /$(basename "$0")/ && /-s ${TAG}/ { print \$1 }")
        else
            # The awk selects the watchdog that is running the specified
            # TAG, skips lines containing our own PID, and extracts the
            # first column.
            PID=$(ps hw -C "$(basename "$0")" | awk "!/$$/ && /-s ${TAG}/ { print \$1 }")
        fi

        if [ -n "$PID" ] ; then
            log daemon.info "Found match watchdog with pid $PID" -s
        fi
    fi

    # Any PID found through the process list excludes lines matching our own.
    if [ -n "$PID" ] ; then
        # Unquoted on purpose: several matches are flattened onto one line.
        echo $PID
        return 0
    fi
    return 1
}

# Give these variables default values.
# These can be overridden with command line arguments.
# 1,000,000 here is basically meant to be infinity.
MAX_QUICK_FAILURES=5
MAX_TOTAL_FAILURES=1000000
MIN_UPTIME=60

# Optional command run after each exit of the watched command (-c), and the
# tag naming this watchdog (-s/-k/-r). Both are reset explicitly so that
# values inherited from the environment are never used; an inherited
# CLEANUP_CMD would otherwise be executed by the watchdog loop.
CLEANUP_CMD=""
TAG=""

# These variables tell us which action we will be performing.
START=0
STOP=0
QUERY=0

# Whether the execution should be quiet.
QUIET=0

# Walk the command-line options and record them in the matching variables:
#   -n            silence logging (QUIET)
#   -s/-k/-r tag  select the start / kill / query action and store the tag
#   -u/-q/-t n    override MIN_UPTIME / MAX_QUICK_FAILURES / MAX_TOTAL_FAILURES
#   -c cmd        store the cleanup command
# Anything else prints the usage text. Option values are taken as given;
# nothing is validated here.
while getopts "ns:k:r:u:q:t:c:" option ; do
    case "$option" in
        n)
            QUIET=1
            ;;
        s)
            START=1
            TAG=$OPTARG
            ;;
        k)
            STOP=1
            TAG=$OPTARG
            ;;
        r)
            QUERY=1
            TAG=$OPTARG
            ;;
        u)
            MIN_UPTIME=$OPTARG
            ;;
        q)
            MAX_QUICK_FAILURES=$OPTARG
            ;;
        t)
            MAX_TOTAL_FAILURES=$OPTARG
            ;;
        c)
            CLEANUP_CMD=$OPTARG
            ;;
        *)
            usage
            ;;
    esac
done

# Discard the options already consumed by getopts; the words that remain
# form the command to supervise, kept as one space-joined string.
shift $((OPTIND - 1))
CMD=$*

# Failure counters, both starting from zero.
TOTAL_FAILURES=0
QUICK_FAILURES=0
    
# State shared between invocations lives in a per-tag PID file, which is
# what lets a later run query or kill a watchdog started earlier.
# LABEL also serves as the syslog tag.
PIDDIR="/var/run/vmware"
LABEL="watchdog-${TAG}"
PIDFILE="${PIDDIR}/${LABEL}.PID"

# Create the PID directory if it is not there yet.
mkdir -p "$PIDDIR"

# Run the requested action. The three flags are each 0 or 1; when more than
# one is set, start wins over stop and stop wins over query. With none set
# the usage text is shown.
case "$START/$STOP/$QUERY" in
    1/*)   start ;;
    ?/1/*) stop ;;
    ?/?/1) query ;;
    *)     usage ;;
esac
