#!/bin/bash
# usage: alarms_service [start/stop/status]
# purpose: monitors system resource utilisation.
# config file: /opt/Avaya/scripts/config_files/alarms_config
# When a configured threshold is reached, an alarm level is computed.
# If the current alarm level differs from the previously recorded one,
# it is written to the matching alarm file in the
# /opt/Avaya/alarms/ directory.

###############################################################################
# Endless monitoring loop.
# Every cycle it samples CPU load, CPU I/O wait, memory usage (one vmstat run
# of $IPOFFICE_ALARM_INTERVAL seconds) and the HDD health, turns the samples
# into alarm levels and records level changes in per-resource files under
# /opt/Avaya/alarms.
#
# Globals read (never assigned in this file; presumably provided by the
# alarms_config file through the caller's environment - TODO confirm):
#   IPOFFICE_ALARM_INTERVAL
#   CPU_RAISE_WARNING_LEVEL     CPU_RAISE_CRITICAL_LEVEL
#   CPU_DISCARD_WARNING_LEVEL   CPU_DISCARD_CRITICAL_LEVEL
#   CPUIO_RAISE_WARNING_LEVEL   CPUIO_RAISE_CRITICAL_LEVEL
#   CPUIO_DISCARD_WARNING_LEVEL CPUIO_DISCARD_CRITICAL_LEVEL
#   MEM_RAISE_WARNING_LEVEL     MEM_RAISE_CRITICAL_LEVEL
#   MEM_DISCARD_WARNING_LEVEL   MEM_DISCARD_CRITICAL_LEVEL
# Globals written:
#   script_path, cpu_alarm_file, cpuio_alarm_file, mem_alarm_file,
#   systemhdd_alarm_file, additionalhdd_alarm_file, MEMINFO_SRECLAIMABLE,
#   vmstat_output, PID_process_script
# Does not return; exits the script if IPOFFICE_ALARM_INTERVAL is unset/empty.
###############################################################################
main()
{
    # log alarms under the following directory
    local path="/opt/Avaya/alarms"
    if [ ! -d "$path" ]; then
        # -p: also create missing parent directories instead of failing
        mkdir -p "$path"
    fi

    script_path="/opt/Avaya/scripts"

    # one alarm file per monitored resource
    cpu_alarm_file="${path}/cpu_alarm"
    cpuio_alarm_file="${path}/cpuio_alarm"
    mem_alarm_file="${path}/mem_alarm"
    systemhdd_alarm_file="${path}/systemhdd_alarm"
    additionalhdd_alarm_file="${path}/additionalhdd_alarm"

    local -i total_RAM_kB=0
    get_RAM_total_kB total_RAM_kB

    # last recorded alarm level per resource
    local -i cpu_last_alarm=0
    local -i cpuio_last_alarm=0
    local -i mem_last_alarm=0
    local -i systemhdd_last_alarm=0     # active
    local -i additionalhdd_last_alarm=0 # start as active

    local ops
    ops=$(/opt/Avaya/scripts/get_HDDs.sh get_options)
    if [ -z "$ops" ]; then
        additionalhdd_last_alarm=2 # not in the system
    fi

    # per-cycle working values (reset at the top of every iteration)
    local -i var_cpu_use var_cpu_io var_mem_non_free var_mem_free
    local -i cpu_raise_alarm_level cpu_discard_alarm_level
    local -i cpuio_raise_alarm_level cpuio_discard_alarm_level
    local -i mem_raise_alarm_level mem_discard_alarm_level
    local helper_pids

    # Without an interval the vmstat call below would degrade to "vmstat 2",
    # which never terminates and would hang the loop forever; fail fast instead.
    : "${IPOFFICE_ALARM_INTERVAL:?IPOFFICE_ALARM_INTERVAL is not set}"

    while true; do
        var_cpu_use=0
        var_cpu_io=0
        var_mem_non_free=0
        var_mem_free=0

        MEMINFO_SRECLAIMABLE=$(awk '$1 == "SReclaimable:" {print $2}' /proc/meminfo)

        # run vmstat. it waits for $IPOFFICE_ALARM_INTERVAL seconds; the last
        # line is the sample taken over that interval.
        vmstat_output=$(vmstat "$IPOFFICE_ALARM_INTERVAL" 2 | tail -n 1)

        compute_cpu_load "$vmstat_output" var_cpu_use var_cpu_io

        compute_mem_free "$vmstat_output" "$total_RAM_kB" var_mem_free "$MEMINFO_SRECLAIMABLE"
        var_mem_non_free=$((100-var_mem_free))

        # results are delivered through the globals system_hdds_status and
        # additional_hdds_status
        compute_hdds_status system_hdds_status additional_hdds_status

        cpu_raise_alarm_level=0
        cpu_discard_alarm_level=0
        cpuio_raise_alarm_level=0
        cpuio_discard_alarm_level=0
        mem_raise_alarm_level=0
        mem_discard_alarm_level=0

        # level the current sample reaches against the "raise" thresholds
        compute_resource_alarm_level "$var_cpu_use" "$CPU_RAISE_WARNING_LEVEL" "$CPU_RAISE_CRITICAL_LEVEL" cpu_raise_alarm_level
        compute_resource_alarm_level "$var_cpu_io" "$CPUIO_RAISE_WARNING_LEVEL" "$CPUIO_RAISE_CRITICAL_LEVEL" cpuio_raise_alarm_level
        compute_resource_alarm_level "$var_mem_non_free" "$MEM_RAISE_WARNING_LEVEL" "$MEM_RAISE_CRITICAL_LEVEL" mem_raise_alarm_level

        # level the current sample reaches against the "discard" thresholds
        compute_resource_alarm_level "$var_cpu_use" "$CPU_DISCARD_WARNING_LEVEL" "$CPU_DISCARD_CRITICAL_LEVEL" cpu_discard_alarm_level
        compute_resource_alarm_level "$var_cpu_io" "$CPUIO_DISCARD_WARNING_LEVEL" "$CPUIO_DISCARD_CRITICAL_LEVEL" cpuio_discard_alarm_level
        compute_resource_alarm_level "$var_mem_non_free" "$MEM_DISCARD_WARNING_LEVEL" "$MEM_DISCARD_CRITICAL_LEVEL" mem_discard_alarm_level

        manage_alarm cpu_last_alarm "$cpu_raise_alarm_level" "$cpu_discard_alarm_level" "$cpu_alarm_file"

        # While CPU usage is at or above the critical raise threshold, keep
        # get_processes_with_high_cpu.sh running in the background to track
        # the high-CPU processes; once usage is below the threshold again,
        # send SIGUSR1 to any running instance.
        if [ "$var_cpu_use" -ge "$CPU_RAISE_CRITICAL_LEVEL" ]; then
            if ! pgrep -u root -f "get_processes_with_high_cpu.sh" > /dev/null; then
                "${script_path}/get_processes_with_high_cpu.sh" > /dev/null 2>&1 &
                PID_process_script=$!
            fi
        elif [ "$var_cpu_use" -lt "$CPU_RAISE_CRITICAL_LEVEL" ]; then
            # single pgrep: no window in which the helper can exit between
            # the existence check and the PID lookup
            helper_pids=$(pgrep -u root -f "get_processes_with_high_cpu.sh")
            if [ -n "$helper_pids" ]; then
                # intentionally unquoted: pgrep may report several PIDs,
                # one per line, and each must become its own argument
                kill -SIGUSR1 $helper_pids
                PID_process_script=0
            fi
        fi

        manage_alarm cpuio_last_alarm "$cpuio_raise_alarm_level" "$cpuio_discard_alarm_level" "$cpuio_alarm_file"
        manage_alarm mem_last_alarm "$mem_raise_alarm_level" "$mem_discard_alarm_level" "$mem_alarm_file"

        # NOTE(review): compute_hdds_status returns without setting the status
        # globals when /opt/Avaya/.amazon exists, so they can be empty here.
        # Quoting keeps each value in its own argument position.
        manage_hdd_alarm "$systemhdd_last_alarm" "$system_hdds_status" "$systemhdd_alarm_file"
        manage_hdd_alarm "$additionalhdd_last_alarm" "$additional_hdds_status" "$additionalhdd_alarm_file"
        systemhdd_last_alarm=$system_hdds_status
        additionalhdd_last_alarm=$additional_hdds_status
    done
}

# get the health status of the hdds
#
# Results are delivered through two globals (any arguments are ignored):
#   system_hdds_status     - status of the disk that holds the system
#   additional_hdds_status - status of the other disk(s)
# Status values: 0 = active, 1 = failing, 2 = removed or inactive.
# Other variables assigned here (list_hdds, temp_part, hdd, part, j,
# is_system, temp, temp0, fd) are not declared local and therefore also
# end up in the global scope.
function compute_hdds_status(){
    # IPOFFICE-143643 - AWS Server Edition - Persistent SSA additional Hard Drive removed alarm
    # When the marker file exists nothing is evaluated and the two status
    # globals are left exactly as they were (unset on the very first call).
    if [ -f /opt/Avaya/.amazon ]; then
        return
    fi
    # device names reported by smartctl (first field, comment lines skipped)
    list_hdds=`smartctl --scan | grep -v "^#" | cut -d' ' -f1`
    # Unless the /opt/Avaya/.ova marker exists, collect the 4th space-separated
    # field of every "PV" line printed by lvm pvscan
    # (presumably the physical volume's device path - TODO confirm).
    if [ ! -f /opt/Avaya/.ova ]; then
        temp_part=`lvm pvscan | grep "PV" | cut -d' ' -f4`
    else
        temp_part=""
    fi
    system_hdds_status=2 # default: removed (impossible for system hdd)
    additional_hdds_status=2 # default: removed or inactive
    temp_part+=" "
    # for virtualization get / mount point
    for j in `mount | grep " / " | cut -d' ' -f1`; do
        temp_part+="$j "
    done
    # no disk reported by smartctl at all: report the system disk as active
    if [ -z "$list_hdds" ]; then
        system_hdds_status=0 # active
    fi
    for hdd in $list_hdds; do
        # a disk counts as the system disk when its fdisk listing mentions one
        # of the partitions/devices collected in temp_part
        is_system=false
        for part in $temp_part; do
            if fdisk -l $hdd | grep "$part" &> /dev/null ; then
                is_system=true
                # a system disk was already evaluated earlier: keep that result
                if [ "$system_hdds_status" != "2" ]; then
                    break
                fi
                system_hdds_status=0 # active
                # non-empty when the smartctl health report contains OK or PASSED
                temp=`smartctl -H $hdd | egrep "OK|PASSED"`
                fd=`fdisk -l $hdd`
                # empty fdisk listing with a passing health report -> failing;
                # empty fdisk listing and no passing report -> removed
                if [ -z "$fd" -a ! -z "$temp" ];then
                    system_hdds_status=1 #failing
                elif [ "$fd" == "" -a "$temp" == "" ]; then
                    system_hdds_status=2
                fi
                break
            fi
        done
        # Every non-system disk overwrites additional_hdds_status, so the value
        # left after the loop is the one of the last additional disk listed.
        if ! $is_system; then
            additional_hdds_status=0 # active
            # smartctl output suggesting the megaraid device type for this disk
            temp0=`smartctl -H $hdd | grep "please try adding" | grep "megaraid"`
            temp=""

            if [ "$temp0" == "" ]; then
                temp=`smartctl -H $hdd | egrep "OK|PASSED"`
            else
                # repeat the health query with the megaraid device type
                temp=`smartctl -H -d megaraid,0 $hdd | egrep "OK|PASSED"`
            fi
            
            fd=`fdisk -l $hdd`
            # empty fdisk listing with a passing health report -> failing;
            # no passing health report at all -> removed or inactive
            if [ "$fd" == "" -a "$temp" != "" ];then
                additional_hdds_status=1 #failing
            elif [ "$temp" == "" ]; then
                additional_hdds_status=2
            fi
        fi
    done
}
###########################################################################
# Writes the current HDD status to its alarm file when it differs from the
# previously recorded status; does nothing when both are equal.
# $1 last recorded status
# $2 current status
# $3 alarm file
###########################################################################
manage_hdd_alarm()
{
    # function-local, so a call neither leaks these names into the global
    # scope nor clobbers same-named variables of the caller
    local last_level=$1
    local current_level=$2
    local alarm_file=$3

    if [ "$current_level" != "$last_level" ]; then
        # NOTE(review): left unquoted on purpose. Empty values are dropped
        # from the argument list exactly as before; quoting here would change
        # which file gets written when a caller passes an empty status.
        write_alarm_to_file $alarm_file $current_level
    fi
}
###########################################################################
# Moves a resource's recorded alarm level by at most one step per call and
# writes the new level to the alarm file whenever it changed.
#   - one step up   when the raise level is above the recorded level
#   - one step down when the discard level is below the recorded level
# The result is kept inside 0..2 (0-Cleared 1-Warning 2-Critical).
# $1 NAME of the caller's variable holding the last level (updated in place)
# $2 level computed against the raise thresholds
# $3 level computed against the discard thresholds
# $4 alarm file
###########################################################################
function manage_alarm()
{
    local last_level=$1
    local raise_level=$2
    local discard_level=$3
    local alarm_file=$4

    # value currently stored in the caller's variable
    local -i __previous=${!last_level}
    local -i level=$__previous

    if (( raise_level > __previous )); then
        ((level++))
    elif (( discard_level < __previous )); then
        ((level--))
    fi

    # clamp to the valid range (plain assignments: no spaces around '=')
    if (( level < 0 )); then
        level=0
    elif (( level > 2 )); then
        level=2
    fi

    if (( level != __previous )); then
        # update the caller's variable without eval
        printf -v "$last_level" '%s' "$level"
        # level:0-Cleared 1-Warning 2-Critical
        write_alarm_to_file "$alarm_file" "$level"
    fi
}

###############################################################################
# Writes an alarm level into a given file by filling "<file>_temp" first and
# then renaming it over the target, so a reader of the alarm file does not see
# a partially written file.
# $1 alarm file
# $2 alarm value
# Returns non-zero when the temporary file cannot be written or renamed.
###############################################################################
function write_alarm_to_file
{
    local target=$1
    local file_temp="${target}_temp"
    local alarm=$2

    # '>' (not '>>'): a temp file left over from an interrupted earlier run
    # must not leak its old content into the new alarm file
    printf '%s\n' "$alarm" > "$file_temp" || return

    mv "$file_temp" "$target"
}

###########################################################################
# Reads the MemTotal value (kB) from /proc/meminfo.
# $1 NAME of the caller's variable that receives the total memory
###########################################################################
function get_RAM_total_kB()
{
    local __out_name=$1
    local __mem_total_kb

    # second column of the MemTotal line
    __mem_total_kb=$(awk '/MemTotal/ {print $2}' /proc/meminfo)

    printf -v "$__out_name" '%s' "$__mem_total_kb"
}

###############################################################################
# computes cpu load from one vmstat data line
# $1 vmstat data line
# $2 NAME of the caller's variable receiving the cpu usage (100 - idle),
#    which therefore includes the time spent waiting for I/O
# $3 NAME of the caller's variable receiving the cpu io wait
###############################################################################
function compute_cpu_load()
{
    local __vmstat_output=$1
    local __cpu_use=$2
    local __cpu_io=$3

    #vmstat output
    ##procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu-----
    ##r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
    ##1  2      3      4      5      6    7    8     9    10   11   12 13 14 15 16 17

    # Split the line once on whitespace, without spawning echo/cut and without
    # exposing the text to pathname expansion. read returns non-zero at end of
    # input when -d '' is used; the array is filled regardless.
    local -a __fields=()
    read -r -d '' -a __fields <<< "$__vmstat_output"

    # %CPU spent idle (column 15); a missing column counts as 0
    local -i idle=${__fields[14]:-0}

    # %CPU spent waiting for I/O (column 16); a missing column counts as 0
    local -i iowait=${__fields[15]:-0}

    #cpu_load (includes CPU I/O time)
    # user (column 13) + system (column 14) would be the variant that
    # excludes the time spent in CPU I/O
    local -i cpu_load=$((100-idle))

    printf -v "$__cpu_use" '%s' "$cpu_load"
    printf -v "$__cpu_io" '%s' "$iowait"
}

###############################################################################
# returns free memory in percent of the total RAM.
# free + buff + cache from the vmstat line plus the reclaimable amount are
# counted as free.
# $1 vmstat data line
# $2 total RAM in kB (0 or less yields 0 percent)
# $3 NAME of the caller's variable receiving the free memory percentage
# $4 reclaimable memory in kB (optional, empty counts as 0)
###############################################################################
function compute_mem_free()
{
    local __vmstat_output=$1
    local __total_RAM_kB=$2
    local __mem_free=$3
    local __mem_reclaimable=$4

    #vmstat output
    ##procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu-----
    ##r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
    ##1  2      3      4      5      6    7    8     9    10   11   12 13 14 15 16 17

    # Split the line once on whitespace (no echo/cut processes, no pathname
    # expansion). Columns that are not present count as 0, so a truncated
    # line is never mistaken for a value of every column.
    local -a __fields=()
    read -r -d '' -a __fields <<< "$__vmstat_output"

    local free_mem_kB=${__fields[3]:-0}
    local buff_mem_kB=${__fields[4]:-0}
    local cache_mem_kB=${__fields[5]:-0}

    local -i free_mem_percent=0
    if (( __total_RAM_kB > 0 )); then
        free_mem_percent=$(((free_mem_kB+buff_mem_kB+cache_mem_kB+__mem_reclaimable)*100/__total_RAM_kB))
    fi

    printf -v "$__mem_free" '%s' "$free_mem_percent"
}

###############################################################################
# maps a resource value onto an alarm level using two ordered thresholds:
#   value >= val_b -> 2
#   value >= val_a -> 1
#   otherwise      -> 0
# $1 resource val
# $2 val_a (lower threshold)
# $3 val_b (upper threshold)
# $4 NAME of the caller's variable receiving the alarm level
###############################################################################
function compute_resource_alarm_level()
{
    local __resource=$1
    local __lower=$2
    local __upper=$3
    local __out_name=$4

    # stays 0 unless one of the thresholds below is reached
    local __lvl=0

    # the upper threshold is tested first so it wins over the lower one
    if (($__resource >= $__upper)); then
        __lvl=2
    elif (($__resource >= $__lower)); then
        __lvl=1
    fi

    printf -v "$__out_name" '%s' "$__lvl"
}



################################ BEGIN ########################################

# Script entry point: start the monitoring loop, forwarding all command-line
# arguments to main (main currently does not read them).
main "$@"


