#!/bin/bash

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#   NOTE: This script should be run as root. It will raise
#   the priority of the process so that it can run even when
#   the system is overloaded.
#
#   PURPOSE
#   This script will record system information that is useful for
#   problem analysis. The system information that is captured by this
#   script currently consists of:
#        *  process information from "ps auxf"
#        *  connectivity information from "netstat -an"
#        *  memory utilization from "cat /proc/meminfo"
#
#   The script will run at a specific interval that can either be set
#   when invoked or read from the configuration file.
#
#   The configuration file defines parameters that can be used to
#   control the operation of the script and can be used to dynamically
#   alter the behavior of the script. The following variables
#   can be modified by the configuration file:
#       *  interval: defines the system monitoring interval
#       *  logfilecount: defines the number of log files that are kept
#
#   The user can force the monitoring process to read the
#   configuration file and dynamically alter the behavior of the
#   monitoring.
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

#================================================================
#  Name: usage
#  Purpose: prints an optional error message followed by the command
#           synopsis and the option descriptions, then terminates
#           the script
#  Arguments
#      $* - optional error text; when the first argument is
#           non-empty the text is printed first, prefixed with
#           "ERROR: "
#  Global Variables Used
#      _argv0
#      DEFAULT_DIR, DEFAULT_CFG, DEFAULT_INTERVAL,
#      DEFAULT_LOG_COUNT, DEFAULT_LOG
#  Global Variables Modified
#      none
#  Exit Status
#      always exits the shell with status 1
#================================================================
function usage () {
    if [[ -n "$1" ]]; then
        echo "ERROR: $*"
    fi

    echo "USAGE: $_argv0 [-c] [-d directory] [-h] [-i minutes] [-f file] [-k number] [-l log file]"

    # printf keeps the tab escape in the format string only, so a
    # backslash inside one of the default values is printed literally
    printf '\tc: clean directory before starting execution\n'
    printf '\td: log directory (default: %s)\n' "$DEFAULT_DIR"
    printf '\tf: configuration file (default: %s)\n' "$DEFAULT_CFG"
    printf '\th: usage message\n'
    printf '\ti: interval between process listing (default: %s minutes)\n' "$DEFAULT_INTERVAL"
    printf '\tk: number of process logs to keep (default: %s logs)\n' "$DEFAULT_LOG_COUNT"
    printf '\tl: log file (default: %s)\n' "$DEFAULT_LOG"

    exit 1
}

#================================================================
#  Name: _purgeOldest
#  Purpose: helper for purgeLogs - removes every file matching a
#           glob pattern except the most recently modified ones
#  Arguments
#      $1 - glob pattern, expanded in the current directory
#      $2 - number of newest files to keep
#  Global Variables Used
#      none
#  Global Variables Modified
#      none
#================================================================
function _purgeOldest() {
    typeset _pattern=$1
    typeset -i _keep=$2
    typeset _log

    # "ls -1t" prints one name per line, newest first. The names are
    # taken directly instead of from a column of "ls -l", whose layout
    # changes with the locale and TIME_STYLE. Ordering is by
    # modification time because the MMDD based names wrap at new year.
    # $_pattern is intentionally unquoted so the shell expands the glob;
    # stderr is dropped so an unmatched pattern is not reported as an
    # error on every pass.
    for _log in $(/bin/ls -1t -- $_pattern 2>/dev/null); do
        if (( _keep > 0 )); then
            (( _keep -= 1 ))
        else
            rm -f -- "$_log"
        fi
    done
}

#================================================================
#  Name: purgeLogs
#  Purpose: removes old log files from the current directory,
#           keeping the newest _logCount netstat logs, the newest
#           _logCount process logs and the newest 2 meminfo logs
#  Global Variables Used
#      HEADER
#      _logCount
#      _netstatFiles
#      _procFiles
#      _meminfoFiles
#  Global Variables Modified
#      none
#================================================================
function purgeLogs() {
    echo "$HEADER"
    echo "Purge Logs: $(date)"

    # the patterns are passed quoted so that they are expanded only
    # inside the helper
    _purgeOldest "$_netstatFiles" "$_logCount"
    _purgeOldest "$_procFiles" "$_logCount"
    _purgeOldest "$_meminfoFiles" 2   # only keep 2 days of meminfo
}

#================================================================
#  Name: readConfigFile
#  Purpose: reads "name = value" settings from the configuration
#           file, applies the valid ones to the running monitor and
#           prints a summary of the resulting parameters
#  File Format
#      one setting per line; '=' and/or white space separate the
#      name from the value; anything after '#' is a comment; names
#      are not case sensitive; lines that do not split into exactly
#      two fields are ignored
#          interval      monitoring interval in minutes
#          logfilecount  number of log files that are kept
#      A value that is not a positive whole number is reported and
#      ignored, leaving the current setting in effect.
#  Global Variables Used
#      _cfgFile
#      HEADER
#  Global Variables Modified
#      _logCount
#      _delay
#================================================================
function readConfigFile() {
    typeset cfgInterval
    typeset cfgLogCount
    typeset line str var
    typeset -a fields
    # positive whole number, leading zeros allowed
    typeset posInt='^0*[1-9][0-9]*$'

    if [[ ! -f "$_cfgFile" ]]; then
        echo "configuration file NOT found - using default values"
    else
        # the "|| [[ -n ... ]]" part keeps a last line that has no
        # trailing newline
        while IFS= read -r line || [[ -n "$line" ]]; do
            # strip comments from line
            str=${line%%#*}
            # split on '=' and white space; IFS is changed for this one
            # read only, and read performs no pathname expansion
            IFS=$'= \t\n' read -r -a fields <<< "$str"
            if [[ ${#fields[@]} -eq 2 ]]; then
                # convert variable to lower case
                var=$(echo "${fields[0]}" | tr '[:upper:]' '[:lower:]')
                case $var in
                    interval) cfgInterval=${fields[1]};;
                    logfilecount) cfgLogCount=${fields[1]};;
                esac
            fi
        done < "$_cfgFile"
    fi

    echo "$HEADER"
    echo "$(date)"
    echo "System Monitor Parameters"
    if [[ "$cfgInterval" =~ $posInt ]]; then
        # force base 10 so a value such as 08 is not taken as octal
        cfgInterval=$((10#$cfgInterval))
        echo "Interval: $cfgInterval"
        # convert delay to seconds
        ((_delay = cfgInterval * 60))
    else
        if [[ -n "$cfgInterval" ]]; then
            echo "ignoring invalid interval in configuration file: $cfgInterval"
        fi
        echo "using current/default interval count:  $((_delay / 60))"
    fi
    if [[ "$cfgLogCount" =~ $posInt ]]; then
        _logCount=$((10#$cfgLogCount))
        echo "Log Files Kept: $_logCount"
    else
        if [[ -n "$cfgLogCount" ]]; then
            echo "ignoring invalid log file count in configuration file: $cfgLogCount"
        fi
        echo "using current/default log file count:  $_logCount"
    fi
    echo "$HEADER"

}


#================================================================
#  Name: createLogging
#  Purpose: redirects stdout and stderr of the current shell to the
#           defined log file (the file is truncated) and writes a
#           start time stamp to it
#  Global Variables Used
#      _logFile
#  Global Variables Modified
#      none
#================================================================
function createLogging() {
    # the path is quoted so that a log file name containing white
    # space is not rejected as an ambiguous redirect
    exec 1> "$_logFile"
    exec 2>&1

    echo "Starting: $(date)"
}


#================================================================
#  Name: monitor
#  Purpose: endless loop that takes one snapshot of the system per
#           interval: a process listing and a netstat listing go to
#           new files named after the current month, day, hour and
#           minute; a dated copy of /proc/meminfo is appended to the
#           meminfo file of the current day; old logs are then
#           purged before sleeping
#  Global Variables Used
#      _delay
#      PROC_FILE, NETSTAT_FILE, MEMINFO_FILE
#      HEADER
#  Global Variables Modified
#      none
#================================================================
function monitor() {
    typeset _stamp
    typeset _procSnap _netSnap _memSnap

    # CMVC keyword expansion rewrites an H placed between two percent
    # signs into a date, so the percent sign is kept in a variable and
    # the date format is only assembled at run time
    typeset -r _pct="%"
    typeset -r _stampFmt="+${_pct}m${_pct}d${_pct}H${_pct}M"

    while true; do
        _stamp=$(date "$_stampFmt")

        _procSnap="${PROC_FILE}${_stamp}"
        ps auxf > "$_procSnap"

        _netSnap="${NETSTAT_FILE}${_stamp}"
        netstat -an > "$_netSnap"

        # month and day only: one meminfo file per day
        _memSnap="${MEMINFO_FILE}${_stamp:0:4}"
        {
            echo $(date)
            cat /proc/meminfo
            echo "$HEADER"
        } >> "$_memSnap"

        purgeLogs
        sleep "$_delay"
    done
}

#================================================================
#  Name: monitorCCFW
#  Purpose: Monitor processes owned by the ccfw userid and
#           perform the appropriate actions (e.g. take dumps,
#           recycle the daemon, etc). The monitoring itself is done
#           by a Java program that is started in the background.
#  Side Effects
#           installs stopCCFWMonitor as the EXIT trap of this shell
#           creates /tmp/hmc and gives it to user and group ccfw
#  Global Variables Used
#           none
#  Global Variables Modified
#           none
#================================================================
function monitorCCFW() {

    echo "Starting CCFW Monitoring"

    # catch these signals so we can stop CCFW monitoring
    #trap "stopCCFWMonitor" 1 2 3 15

    # cleanup the CCFW Monitoring code when this script exits
    trap "stopCCFWMonitor" EXIT

    mkdir -p /tmp/hmc
    # "owner:group" is the standard separator; the older "owner.group"
    # spelling is obsolete and newer GNU chown versions warn about it
    chown -R ccfw:ccfw /tmp/hmc

    # start CCFW monitoring
    /usr/websm/bin/wjava com.ibm.hwmca.base.monitor.MonitorMgr /opt/hsc/data/monitor.xml &

}


#================================================================
#  Name: stopCCFWMonitor
#  Purpose: Stop the monitoring of processes owned by the ccfw
#           userid by killing every java process whose command line
#           names the CCFW MonitorMgr class.
#  Global Variables Used
#           HEADER
#  Global Variables Modified
#           none
#================================================================
function stopCCFWMonitor() {

    typeset _monitorPids
    typeset _target

    echo "Stopping CCFW Monitoring"

    # signals are not ignored here (the trap below is disabled)
    #trap "" 1 2 3 15

    # collect the PIDs of all java processes running the CCFW monitor;
    # the two grep stages are kept separate so that neither of them
    # matches its own command line
    _monitorPids=$(
        ps -eo pid,cmd -www \
            | grep 'com.ibm.hwmca.base.monitor.MonitorMgr' \
            | grep java \
            | awk '{print $1}'
    )

    [[ -n "$_monitorPids" ]] && echo "killing PIDs : " $_monitorPids

    # an empty list simply skips the loop
    for _target in $_monitorPids; do
        killPid $_target
    done

    echo "$HEADER"
}


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#   Name: killPid
#   Purpose: Ensures that a pid is killed upon exit of the function.
#            SIGTERM is sent first; the process is then polled and, if
#            it is still present after two 2 second waits, SIGKILL is
#            sent.
#   Input: pid number of process
#   Returns: 1 when the argument is not a positive decimal pid,
#            otherwise 0
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
function killPid () {
    typeset _killPid=$1
    typeset _cnt
    typeset _pidPattern='^[1-9][0-9]*$'

    # refuse anything that is not a plain positive pid: an empty value
    # makes kill fail, and 0 or a negative number addresses a whole
    # process group instead of a single process
    [[ "$_killPid" =~ $_pidPattern ]] || return 1

    kill -15 "$_killPid" >/dev/null 2>&1

    # make sure process exits; "kill -0" sends no signal and only
    # reports whether the process can still be signalled
    for _cnt in 1 2; do
        kill -0 "$_killPid" >/dev/null 2>&1 || return 0
        sleep 2
    done

    # still there after the grace period - force it
    if kill -0 "$_killPid" >/dev/null 2>&1; then
        kill -9 "$_killPid" >/dev/null 2>&1
    fi
    return 0
}


#===============================================================
#    main logic - start by defining the constants and defaults,
#    the command options are parsed next
#

# separator line written to the logs
typeset -r HEADER="****************************************************"

# default locations and settings
typeset -r DEFAULT_DIR=/dump/HMCMonitor
typeset -r DEFAULT_CFG=${DEFAULT_DIR}/monitor.cfg
typeset -r DEFAULT_LOG=${DEFAULT_DIR}/monitor.log
typeset -r DEFAULT_INTERVAL=5      # minutes
typeset -r DEFAULT_LOG_COUNT=40

# log file name prefixes
typeset -r PROC_FILE=procs_
typeset -r MEMINFO_FILE=meminfo_
typeset -r NETSTAT_FILE=netstat_

# glob patterns matching the generated log files: the prefix followed
# by month, day, hour and minute digits (month and day only for the
# meminfo files)
typeset -r _procFiles="${PROC_FILE}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9]"
typeset -r _netstatFiles="${NETSTAT_FILE}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9]"
typeset -r _meminfoFiles="${MEMINFO_FILE}[0-1][0-9][0-3][0-9]"

# script name for the usage message; parameter expansion is used so
# that a script path containing white space is handled correctly
typeset -r _argv0=${0##*/}

# run time settings
typeset -i _logCount=$DEFAULT_LOG_COUNT
typeset -i _delay=$((DEFAULT_INTERVAL * 60))   # seconds
typeset _dir
typeset _cfgFile=$DEFAULT_CFG
typeset _logFile=$DEFAULT_LOG

# parse the command options
#   -i and -k must be plain decimal numbers; they are checked before
#   being stored because an integer-typed assignment would otherwise
#   evaluate the text as an arithmetic expression (a word becomes 0,
#   a value such as 08 is an error). The values are stored without the
#   readonly attribute so that a repeated option simply replaces the
#   earlier one.
_numericArg='^[0-9]+$'
while getopts ":cd:f:hi:k:l:" _arg; do
    case $_arg in
        c) _clean=1;;
        d) _dir=$OPTARG;;
        f) _cfgFile=$OPTARG;;
        h) usage;;
        i) [[ "$OPTARG" =~ $_numericArg ]] || usage "interval must be a number: $OPTARG"
           # force base 10 so leading zeros are not taken as octal
           typeset -i _parmInterval=$((10#$OPTARG));;
        k) [[ "$OPTARG" =~ $_numericArg ]] || usage "log count must be a number: $OPTARG"
           typeset -i _parmLogCount=$((10#$OPTARG));;
        l) _logFile=$OPTARG;;
        :) usage "option -$OPTARG requires an argument";;
        *) usage "unknown option -$OPTARG";;
    esac
done

#+++++++++++++++++++++++++++++++++++++++++++++
#  set directory to log directory
#  (created when missing; the script stops with a usage message that
#  names the directory when it cannot be created or entered)
: "${_dir:=$DEFAULT_DIR}"
if [[ ! -d "$_dir" ]]; then
    mkdir -- "$_dir" || usage "unable to create log directory: $_dir"
fi
cd -- "$_dir" || usage "unable to change to log directory: $_dir"


#+++++++++++++++++++++++++++++++++++++++++++++
# setup log file
createLogging


#+++++++++++++++++++++++++++++++++++++++++++++++++++
# clean log directory if requested
if [[ -n "$_clean" ]]; then
    # the patterns are deliberately unquoted so that the shell expands
    # them to the existing log files
    rm -f -- $_procFiles $_netstatFiles $_meminfoFiles
fi


#+++++++++++++++++++++++++++++++++++++++++++++++++++
# read configuration file
# (fall back to the default file when -f was given an empty name)
: "${_cfgFile:=$DEFAULT_CFG}"
readConfigFile


#+++++++++++++++++++++++++++++++++++++++++++++
#  override interval if set on command line
#  (no fallback is needed otherwise: _delay is an integer variable
#  that already holds the default or the configuration file value)
if [[ -n "$_parmInterval" ]]; then
    ((_delay = 60 * _parmInterval))
    echo "monitor interval set from parameter: $_parmInterval"
fi


#+++++++++++++++++++++++++++++++++++++++++++++
#  override log count if set on command line
#  (_logCount likewise always holds a value at this point)
if [[ -n "$_parmLogCount" ]]; then
    _logCount=$_parmLogCount
    echo "monitor log count set from parameter: " $_parmLogCount
fi

#+++++++++++++++++++++++++++++++++++++++++++++++++++
# make sure logCount and delay are > 0
(( _logCount > 0 )) || usage "invalid number of process logs"
(( _delay > 0 )) || usage "invalid interval"


#+++++++++++++++++++++++++++++++++++++++++++++++++++
# signal handling: the signals are given by name because the numeric
# values of SIGUSR1 and SIGUSR2 differ between architectures
trap "readConfigFile" USR1  # re-read the configuration file
trap "purgeLogs" USR2       # purge old logs now

#   DO NOT START CCFW monitoring at higher priority
monitorCCFW

# raise the priority of this process
renice -10 -p $$ >/dev/null 2>&1
monitor
