#!/bin/ksh
#ident "@(#)pt7764j/xs_solaris/CMS/scripts/autopri 1.5P1.2"
#ident "@(#)xs_solaris/xs_solaris/CMS/scripts/autopri	1.5 Last changed 97/04/02"
#/* Copyright (C) 97/04/02 Integrated Micro Products PLC. */
#/*============================================================================
#
# autopri - automatic processor reintegration module
#
# SYNOPSIS
# --------
#
# autopri -a [module_name] [module_no] [serial_no] [yy/mm/dd] [hh:mm:ss] [interval] [bucketsize]
# 
# autopri -d [module_name] [serial_no]
#
# DESCRIPTION
# -----------
#
# autopri is a statistical module that facilitates the automatic reintegration
# of modules that suffer transient errors which are not detrimental to the 
# availability of the system. 
#
# In essence, a module is automatically reintegrated if the rate of failures of
# this type is within acceptable parameters for the module type e.g 1 day
# per 30 day interval. This is determined by the 'leaky bucket' 
# algorithm which defines the following parameters:
#
#	bucket capacity,
#	fill quantity,
#	leakage rate.
#
# received event
# pour 'fill quantity' units into bucket
# if bucket overflowed
#	module faulty, do not reintegrate
# otherwise
# 	error rate is acceptable, reintegrate module
#
# determining whether the bucket has overflowed or not is achieved with the
# following formula:
#
#		  n
# #events * FQ - sum (Te - Ti) * LR
# 		 i=0
#
# where:
#
# Te - time stamp of the most recent event
# Ti - time stamp of event 'i' in the event history file
# FQ - is the fill quantity
# LR - is the leakage rate ( 1/interval ) 
# 
# Reintegration occurs if the result of the formula is less than the bucket 
# capacity. 
#
# At present auto-reintegration applies only to cpuset modules in the ftSPARC, 
# which can generate correctable memory errors or signature differences, but the
# infrastructure for supporting other modules such as disks and IOSETS has also
# been provided.
#
# All correctable memory error or signature difference events are date and
# time stamped and written to a module event history log file. When an event
# reaches a certain age it is deemed as stale and removed from the file. 
#
# OPTIONS
# -------
#
# -a - add an event to the history log and determine whether a module should
#	   be reintegrated or not.
#
# -d - delete the history event for a particular module.
#
# NOTES
# -----
#
# The module name passed in is of the format as stored in the EEPROM e.g
#
#	CPUSET-2-32-40
#	IOSET-2000
#
# and is parsed to extract the module type and a particular attribute. In the
# case of the cpuset module, the attribute is the memory size (since there is a
# dependency between the rate of correctable memory events and memory size);
# the script takes it from the fourth dash-separated field of the name, and
# from the second field for an IOSET. These are then used in conjunction with
# the serial number to generate a unique history log file under BASEDIR, e.g.
#
#	/etc/cms/.autopri/CPUSET/40/3930.log
#	/etc/cms/.autopri/IOSET/2000/5629.log
#
# The module number passed in is used to extract the autopri attributes for
# this module which are stored in the CMS.
#
# When calculating the days elapsed between events, the date and time for each
# event is converted to seconds and the difference is then divided by the number
# of seconds per day.
#----------------------------------------------------------------------------*/
#/*HIDE*/

# On signal 1, 2 or 15 (HUP, INT, TERM) report the interruption on stderr and
# exit with status 0.
# NOTE(review): status 0 is also the status the main section uses to tell the
# CMS to reintegrate the module - confirm that an interrupted run is meant to
# be reported that way.
trap 'echo "$0: interrupted - quiting..." 1>&2; exit 0' 1 2 15

# Extend the search path. The helper programs 'elapsed' and 'bucket' are run
# by bare name further down - presumably they are installed in one of these
# directories; confirm.
PATH=$PATH:/etc/cms/bin:/etc

# output files for error logging. STATLOG receives the "limit exceeded"
# message written by the main section; CONSLOG is not referenced in the part
# of the script visible here.

STATLOG="/dev/FTlog:status"
CONSLOG="/dev/FTlog:console"

# Root of all knowledge. This directory contains module directories each 
# containing their respective error event log files, laid out as
# $BASEDIR/<module>/<attribute>/<serial#>.log

BASEDIR="/etc/cms/.autopri"

# unit of water that is poured into the bucket when an error event occurs.
# Passed unchanged to the 'bucket' helper by overflow().

FILLQUANTITY=1

# low water mark for the number of events in the log file. autopri rather 
# optimistically tells the CMS to reintegrate a module if its event file
# has at most two entries (overflow() returns success without consulting the
# bucket when the entry count is <= MINENTRIES).

MINENTRIES=2

# the stale value indicates the number of days that must elapse before an event
# entry in the log file is deemed as stale. Stale events are removed from the
# event log file. STALEVALUE is derived from the error rate expectancy on a
# per-module basis: the value below is only a placeholder, cpuset(), ioset()
# and disk() overwrite it with three times the interval before stale() runs.

STALEVALUE=0

# 
# count - count the number of entries in a log file
#
# inputs  - module event log file
#
# returns - the number of lines in the file as the exit status, or 0 if the
#	    file does not exist. An exit status cannot be larger than 255,
#	    so bigger counts are reported as 255 instead of being allowed to
#	    wrap round (256 entries would otherwise be reported as 0, i.e.
#	    as an empty history). The exact count is also left in the global
#	    num_entries whenever the file exists.
#
count()
{
	if [ ! -f "$1" ]
	then
		return 0
	fi

	# wc may pad its output with blanks; the arithmetic expansion reduces
	# it to a bare number (and to 0 if the file could not be read).
	num_entries=`wc -l < "$1"`
	num_entries=$(($num_entries + 0))

	if [ $num_entries -gt 255 ]
	then
		return 255
	fi
	return $num_entries
}

#
# create - make sure the event log directory for a module/attribute pair
#	   exists
#
# inputs  - module name
#	    module attribute i.e memory size/storage capacity etc.
#
# returns - 0 if the directory was missing and has been created, which the
#	    callers take to mean that this is the first event registered
#	    1 if the directory already existed
#
create()
{
	if [ -d "$BASEDIR/$1/$2" ]
	then
		return 1
	fi

	# mkdir -p builds BASEDIR and the module directory as needed. It also
	# covers a new attribute under a module directory that already
	# exists; without that the caller would have no directory to write
	# its log file into.
	mkdir -p "$BASEDIR/$1/$2"
	return 0
}

#
# stale - removes at most one stale event (the oldest) from the log file
#
# inputs  - module name
#	    module attribute i.e memory/size
#	    serial number
#	    recent event date
#	    recent event time
#
# returns - 0 if the oldest entry was stale and has been removed; the caller
#	    should call again to check for more stale entries
#	    1 if nothing was removed: the log holds no more than MINENTRIES
#	    entries, the oldest entry is younger than STALEVALUE, 'elapsed'
#	    failed, or the log could not be rewritten
#
stale()
{
	stalelog=$BASEDIR/$1/$2/$3.log

	# compare count's exit status with MINENTRIES; handing the comparison
	# to count as extra arguments would only test whether the log is
	# empty.
	count "$stalelog"
	if [ $? -le $MINENTRIES ]
	then
		return 1	# not enough entries
	fi
	recentdate=$4
	recentime=$5

	# the oldest event is the first line of the log: "date time"
	read -r firstdate firstime rest < "$stalelog"

	val=`elapsed $recentdate $recentime $firstdate $firstime`
	if [ $? -ne 0 ] || [ -z "$val" ]
	then 
		return 1	# We also return if the autopri utility progs  
				# registers a fault.
	elif [ "$val" -lt $STALEVALUE ]
	then
		return 1	# no more stale entries. 
	fi

	# drop the first line. The scratch file carries the process id so
	# that concurrent runs do not share it, and it is always removed.
	# A failed rewrite must not be reported as success: the callers loop
	# for as long as stale returns 0.
	staletmp=/tmp/$3.$$.tmp
	if sed 1d "$stalelog" > "$staletmp" && cp "$staletmp" "$stalelog"
	then
		trimmed=0	# check for more stale entries
	else
		trimmed=1
	fi
	rm -f "$staletmp"
	return $trimmed
}

#
# overflow - indicate whether the rate of transient failures is too high.  
#
# inputs   - 	module name
#		module number (accepted for the callers' sake, not used)
#		memory size/storage capacity
#		serial number 
#		date
#		time
#		interval
#		bucketsize
#
# returns  -	0 if the module may be reintegrated: the history holds no
#		more than MINENTRIES entries, or 'bucket' did not answer
#		"overflowed"
#		1 if the bucket overflowed, or if 'elapsed' or 'bucket'
#		failed
#
# The history file is read as it stands when overflow is called.
#
overflow()
{
	histfile=$BASEDIR/$1/$3/$4.log

	# check that we have a sensible history size. The lines are counted
	# here rather than taken from an exit status, which cannot hold a
	# count above 255.

	num_entries=0
	if [ -f "$histfile" ]
	then
		num_entries=`wc -l < "$histfile"`
		num_entries=$(($num_entries + 0))
	fi
	if [ $num_entries -le $MINENTRIES ]
	then
		return 0
	fi

	# get the total number of days elapsed upto this error occurence,
	# summed over every event in the history

	cumtotal=0
	while read -r lastdate lasttime
	do
		daydiff=`elapsed $5 $6 $lastdate $lasttime`
		if [ $? -ne 0 ]
		then
			return 1	# the program 'elapsed' has failed. 
					# do not give cpuset benefit of doubt.
		fi
		cumtotal=$(($cumtotal + $daydiff))
	done < "$histfile"

	interval=$7
	bucketsize=$8

	# have we overflowed our bucket?

	mess=`bucket $num_entries $interval $bucketsize $FILLQUANTITY $cumtotal`
	if [ $? -ne 0 ]
	then
		return 1	# an error occured during execution of bucket
				# do not reintegrate
	elif [ "$mess" = "overflowed" ]
	then
		return 1	# bucket overflowed 
	else
		return 0
	fi
}


#
# cpuset - handle the addition of a new entry into the cpuset event log
#
# inputs  - module name; the module type is its first dash-separated field
#	    and the attribute used for the log directory its fourth
#	    module number
#	    serial#
#	    date
#	    time
#	    interval
#	    bucketsize
#
# returns - 0 if the module is to be reintegrated (first event registered,
#	    or overflow reported success)
#	    1 if overflow reported failure
#
# The event is appended to the log in every case. STALEVALUE is set to three
# times the interval before stale events are discarded.
#
cpuset()
{
	module=`echo "$1" | nawk 'BEGIN {FS="-"}{print $1}'`
	memory=`echo "$1" | nawk 'BEGIN {FS="-"}{print $4}'`
	moduleno=$2
	serialno=$3
	newdate=$4
	newtime=$5
	interval=$6
	bucketsize=$7
	logfile=$BASEDIR/$module/$memory/$serialno.log

	# is this going to be the first event registered?

	if create "$module" "$memory"
	then
		# copy the event to the file and enforce Auto Pri
		# by returning success down the call chain

		echo "$newdate $newtime" >> "$logfile"
		return 0
	fi

	# determine whether the rate of failure is too high. overflow is
	# asked before the new event is written, so it sees the history
	# without it. The arguments are quoted so that an empty one cannot
	# shift the others out of position.

	if overflow "$module" "$moduleno" "$memory" "$serialno" "$newdate" "$newtime" "$interval" "$bucketsize"
	then
		retval=0
	else
		retval=1
	fi
	echo "$newdate $newtime" >> "$logfile"

	# update the STALEVALUE for log file entries according to the error
	# expectancy for this type of module.

	STALEVALUE=$(($interval * 3))

	# discard stale events from the event log. stale removes one entry
	# per successful call, so the loop condition does all the work.

	while stale "$module" "$memory" "$serialno" "$newdate" "$newtime"
	do
		:
	done

	return $retval
}

#
# ioset - handle the addition of a new entry into the ioset event log
#
# inputs  - module name; the module type is its first dash-separated field
#	    and the attribute used for the log directory its second
#	    module number
#	    serial#
#	    date
#	    time
# 	    interval
#	    bucketsize
#
# returns - 0 if the module is to be reintegrated (first event registered,
#	    or overflow reported success)
#	    1 if overflow reported failure
#
# The event is appended to the log in every case. STALEVALUE is set to three
# times the interval before stale events are discarded.
#
ioset()
{
	module=`echo "$1" | nawk 'BEGIN {FS="-"}{print $1}'`
	iosetsize=`echo "$1" | nawk 'BEGIN {FS="-"}{print $2}'`
	moduleno=$2
	serialno=$3
	newdate=$4
	newtime=$5
	interval=$6
	bucketsize=$7
	logfile=$BASEDIR/$module/$iosetsize/$serialno.log

	# is this going to be the first event registered?

	if create "$module" "$iosetsize"
	then
		# copy the event to the file and enforce Auto Pri
		# by returning success down the call chain

		echo "$newdate $newtime" >> "$logfile"
		return 0
	fi

	# determine whether the rate of failure is too high. overflow is
	# asked before the new event is written, so it sees the history
	# without it. The arguments are quoted so that an empty one cannot
	# shift the others out of position.

	if overflow "$module" "$moduleno" "$iosetsize" "$serialno" "$newdate" "$newtime" "$interval" "$bucketsize"
	then
		retval=0
	else
		retval=1
	fi
	echo "$newdate $newtime" >> "$logfile"

	# update the STALEVALUE for log file entries according to the error
	# expectancy for this type of module.

	STALEVALUE=$(($interval * 3))

	# discard stale events from the event log. stale removes one entry
	# per successful call, so the loop condition does all the work.

	while stale "$module" "$iosetsize" "$serialno" "$newdate" "$newtime"
	do
		:
	done

	return $retval
}

#
# disk - handle the addition of a new event entry into the disk event log
#
# inputs  - module name; the module type is its first dash-separated field
#	    and the attribute used for the log directory its fourth
#	    module number
#	    serial#
#	    date
#	    time
#	    interval
#	    bucketsize
#
# returns - 0 if the module is to be reintegrated (first event registered,
#	    or overflow reported success)
#	    1 if overflow reported failure
#
# The event is appended to the log in every case. STALEVALUE is set to three
# times the interval before stale events are discarded.
#
# NOTE(review): delete() takes the DISK attribute from the second field of
# the name, not the fourth, so it looks in a different directory from the one
# written here. The DISK name format is not shown in this file - confirm which
# field is right.
#
disk()
{
	module=`echo "$1" | nawk 'BEGIN {FS="-"}{print $1}'`
	disksize=`echo "$1" | nawk 'BEGIN {FS="-"}{print $4}'`
	moduleno=$2
	serialno=$3
	newdate=$4
	newtime=$5
	interval=$6
	bucketsize=$7
	logfile=$BASEDIR/$module/$disksize/$serialno.log

	# is this going to be the first event registered?

	if create "$module" "$disksize"
	then
		# copy the event to the file and enforce Auto Pri
		# by returning success down the call chain

		echo "$newdate $newtime" >> "$logfile"
		return 0
	fi

	# determine whether the rate of failure is too high. overflow is
	# asked before the new event is written, so it sees the history
	# without it. The arguments are quoted so that an empty one cannot
	# shift the others out of position.

	if overflow "$module" "$moduleno" "$disksize" "$serialno" "$newdate" "$newtime" "$interval" "$bucketsize"
	then
		retval=0
	else
		retval=1
	fi
	echo "$newdate $newtime" >> "$logfile"

	# update the STALEVALUE for log file entries according to the error
	# expectancy for this type of module.

	STALEVALUE=$(($interval * 3))

	# discard stale events from the event log. stale removes one entry
	# per successful call, so the loop condition does all the work.

	while stale "$module" "$disksize" "$serialno" "$newdate" "$newtime"
	do
		:
	done

	return $retval
}

#
# delete - deletes the log file relating to a particular module
#
# inputs  - modulename; the module type is its first dash-separated field.
#	    The attribute naming the log directory is the fourth field for
#	    a CPUSET and the second for an IOSET or DISK
#	    module serial#
#
# returns - 0 if the log file was removed
#	    1 if it does not exist or could not be removed
#	    Exits the script with status 2 if the module type is not one
#	    of CPUSET, IOSET or DISK.
#
# NOTE(review): disk() files DISK events under the fourth field of the name,
# whereas the second field is used here. The DISK name format is not shown in
# this file - confirm which field is right.
#
delete()
{
	module=`echo "$1" | nawk 'BEGIN {FS="-"}{print $1}'`
	case "$module" in
		CPUSET)
			attribute=`echo "$1" | nawk 'BEGIN {FS="-"}{print $4}'` ;;
		IOSET | DISK)
			attribute=`echo "$1" | nawk 'BEGIN {FS="-"}{print $2}'` ;;
		*)
			# catch-all: a '\?' pattern would only match a module
			# that is literally named "?"
			echo "$0: $module not supported" 1>&2
			exit 2 ;;
	esac
	serialno=$2
	logfile=$BASEDIR/$module/$attribute/$serialno.log

	if [ ! -f "$logfile" ]
	then
		echo "$logfile" 'not found'
		return 1
	fi

	# map any rm failure onto 1, the only failure status main looks for
	if rm "$logfile"
	then
		return 0
	else
		return 1
	fi
}

#
# add   - add another reintegratable-out-of-sync event to the module log file
#
# inputs  - module name
#	    module number
#	    serial#
#	    date
#	    time
#	    interval
#	    bucketsize
#
# returns - 0 if the handler for the module type succeeded, 1 if it failed.
#	    A module type other than CPUSET, IOSET or DISK has no handler;
#	    nothing is logged for it and 0 is returned.
#
add()
{

	# extract module name and call the appropriate function. All seven
	# arguments are handed on quoted, so an empty one keeps its place
	# instead of shifting the ones after it.

	module=`echo "$1" | nawk 'BEGIN {FS="-"}{print $1}'`
	case "$module" in
		CPUSET)	handler=cpuset ;;
		IOSET)	handler=ioset ;;
		DISK)	handler=disk ;;
		*)	return 0 ;;
	esac

	if $handler "$1" "$2" "$3" "$4" "$5" "$6" "$7"
	then
		return 0
	else
		return 1
	fi
}

#
# usage - prints the two ways of invoking autopri on standard output, one
#	  line for the -a (add event) form and one for the -d (delete log)
#	  form
#
usage()
{
	echo Usage: $0 "-a name no serial# date time interval bucketsize"
	echo Usage: $0 "-d name serial#"
}

#
# main section that parses the command line options and calls the corresponding
# function.
#
# inputs - module name
#	   module number	-- not used with -d option
#	   serial number	
#	   event date		-- ditto 
#	   event time		-- ditto 
#	   interval		-- ditto
#	   bucketsize		-- ditto
#
# exit status - 0 the operation succeeded; for -a this tells the CMS to
#		  reintegrate the module
#		1 too few arguments, or the operation returned 1; for -a
#		  this tells the CMS not to reintegrate the module
#		2 unknown option, or neither -a nor -d was given
#

if [ $# -lt 3 ]
then
	usage
	exit 1
fi

# interpret the option passed via the command line and call the appropriate 
# function with the required parameters. option is cleared first so that a
# value inherited from the environment cannot select an operation.

option=
while getopts aqd opt   
do
	case $opt in
		a | d)	option=$opt ;;
		q)	;;	# accepted, but has no effect in this script
		\?)	usage 
			exit 2 ;;
	esac
done

shift $(($OPTIND - 1))
case "$option" in
	a)	add "$1" "$2" "$3" "$4" "$5" "$6" "$7"
		stat=$? ;;
	d)	delete "$1" "$2"
		stat=$? ;;
	*)	# neither -a nor -d: there is nothing to do and no status
		# to report
		usage
		exit 2 ;;
esac


# determine the result of the operation required and exit accordingly
# in that case of event addition, the status return is used to indicate to
# the CMS whether the module is to be reintegrated or not.

if [ "$stat" -eq 1 ]
then
	# the message only describes a refused reintegration; a failed -d
	# has already been reported by delete itself
	if [ "$option" = "a" ]
	then
		echo 'Auto-reintegration limit exceeded - not auto reintegrating module' > "$STATLOG"
	fi
	exit 1	# do not reintegrate the faulty module
else
	exit 0
fi

