#!/bin/bash
# Installation layout: root directory, the monitored binary, the cache-drop
# helper script, and the two XML configuration files read by this script.
readonly DPC_ROOT_PATH="/opt/oceanstor/dataturbo"
readonly DPC_PROCESS="${DPC_ROOT_PATH}/bin/dpc"
readonly DPC_DROP_CACHE="${DPC_ROOT_PATH}/script/drop_caches.sh"
readonly DPC_USER_CONF="${DPC_ROOT_PATH}/conf/dpc_user_config.xml"
readonly DPC_SHIM_CONF="${DPC_ROOT_PATH}/conf/shim_config.xml"

# File that log_error appends to.
readonly SCRIPT_LOG='/var/log/dataturbo/dataturbo_script/dataturbo.action.log'
# cgroup directory inspected and updated by check_cpu_cgroup_proc_pid.
readonly CGROUP_CPU_DIR='/sys/fs/cgroup/cpu/dataturbo'

# Six-slot sliding windows: FAIL_COUNT_ARRAY is maintained by check_mem_usage,
# DROP_COUNT_ARRAY by check_drop_count. Each slot holds 0 or 1.
declare -a FAIL_COUNT_ARRAY=(0 0 0 0 0 0)
declare -a DROP_COUNT_ARRAY=(0 0 0 0 0 0)
# Remaining back-off, in seconds, during which check_drop_count skips reclaim.
DROP_WAIT_TIME=0

# Append one "[ERR]" line to ${SCRIPT_LOG}.
#   $1 - message text
#   $2 - optional function name to report; defaults to the caller's name
# Nothing is written when ${SCRIPT_LOG} is a symbolic link.
function log_error()
{
    [ -L "${SCRIPT_LOG}" ] && return

    local reporter="${2:-${FUNCNAME[1]}}"
    local stamp
    stamp=$(date "+%Y-%m-%d %T")

    printf '[%s:%s][ERR][monitor][%s][%s]: %s\n' \
        "${stamp}" "$$" "${0##*/}" "${reporter}" "$1" >> "${SCRIPT_LOG}"
}

# Main loop. Reads the DpcCgroupLimitMemory value from ${DPC_USER_CONF} once,
# keeps its integer part, multiplies it by 1024*1024, then every 5 seconds
# calls check_mem_usage with that threshold followed by
# check_cpu_cgroup_proc_pid. The loop has no exit condition of its own.
function monitor_process()
{
    local mem_threshold
    mem_threshold=$(grep "DpcCgroupLimitMemory" "${DPC_USER_CONF}")
    # Keep only the text between the first '>' and the following '<'.
    mem_threshold=${mem_threshold#*>}
    mem_threshold=${mem_threshold%%<*}
    # Unquoted on purpose: trims surrounding whitespace before awk drops
    # everything from the first '.' onwards.
    mem_threshold=$(echo ${mem_threshold} | awk -F  '.' '{print $1}')
    mem_threshold=$(expr "${mem_threshold}" \* 1024 \* 1024)

    while :; do
        sleep 5
        check_mem_usage ${mem_threshold}
        check_cpu_cgroup_proc_pid
    done
}

# Set the global LIMIT_MEM from the DpcLevelVal entry in ${DPC_SHIM_CONF}.
# The integer part of the level selects an entry of the table below by
# position (level 1 -> first entry); the number after the comma is multiplied
# by 1024*1024 and stored in LIMIT_MEM.
# NOTE(review): callers compare LIMIT_MEM with VmRSS, which /proc reports in
# kB, so the table values are presumably GB - confirm.
function get_limit_mem() {
    local -a level_table=("1,4" "2,6" "3,12")

    local lvl
    lvl=$(grep "DpcLevelVal" "${DPC_SHIM_CONF}")
    # Keep only the text between the first '>' and the following '<'.
    lvl=${lvl#*>}
    lvl=${lvl%%<*}
    # Unquoted on purpose: trims whitespace before awk drops the fraction.
    lvl=$(echo ${lvl} | awk -F  '.' '{print $1}')

    local entry=${level_table[$lvl-1]}
    LIMIT_MEM=$(expr "${entry#*,}" \* 1024 \* 1024)
}

# Set the global CPU_USE to (load average / CPU count) * 100, truncated to an
# integer. CPU_USE is 0 when either input cannot be determined.
#
# The load average is the LAST field printed by uptime. A fixed field number
# only works for one uptime layout: with "up 3:02," or "up 5 days, 10 min,"
# the field count changes and a fixed position yields an empty value or a
# number with a trailing comma.
# CPUs are counted by the "processor" lines of /proc/cpuinfo.
function get_cpu_info() {
    CPU_USE=0

    local total_core load_average
    total_core=$(grep -c '^processor' /proc/cpuinfo 2>/dev/null)
    # LC_ALL=C keeps the decimal separator a '.' regardless of locale.
    load_average=$(LC_ALL=C uptime | awk '{print $NF}')

    # Only compute with a positive core count and a plain decimal number, and
    # pass both through -v so they are never parsed as awk source.
    if [[ "${total_core}" =~ ^[1-9][0-9]*$ && "${load_average}" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
        CPU_USE=$(awk -v load="${load_average}" -v cores="${total_core}" \
            'BEGIN { printf "%d", (load / cores) * 100 }')
    fi

    if [ -z "${CPU_USE}" ]; then
        CPU_USE=0
    fi
}

# Decide whether to run ${DPC_DROP_CACHE} for the given memory usage.
#   $1 - current memory usage, compared against LIMIT_MEM from get_limit_mem
# Globals: DROP_COUNT_ARRAY and DROP_WAIT_TIME are read and written; LIMIT_MEM
# and CPU_USE are read after get_limit_mem / get_cpu_info set them.
#
# DROP_COUNT_ARRAY is a sliding window with one slot per call; the last slot
# records whether THIS call reclaimed. When three reclaims are found in the
# window, reclaiming is suspended for 86400 seconds; DROP_WAIT_TIME is reduced
# by 5 on every call while it is positive.
function check_drop_count() {
    local mem_usage=$1
    local drop_count=0
    local newest=$(( ${#DROP_COUNT_ARRAY[@]} - 1 ))
    local i
    for (( i = 0; i < newest; i++ )); do
        DROP_COUNT_ARRAY[i]=${DROP_COUNT_ARRAY[i + 1]}
        drop_count=$(( drop_count + DROP_COUNT_ARRAY[i] ))
    done
    # The newest slot belongs to this round and starts empty. Without this
    # reset a single reclaim would be shifted into the window again on every
    # call and trip the back-off on its own after three rounds.
    DROP_COUNT_ARRAY[newest]=0

    # if drop count >= 3, wait 1 day
    if [ "${drop_count}" -ge 3 ]; then
        log_error "reclaim times exceeds 3 within 30 seconds."
        DROP_WAIT_TIME=86400 # 86400s = 1day
        DROP_COUNT_ARRAY=(0 0 0 0 0 0)
    fi

    # DROP_WAIT_TIME > 0, do not reclaim
    if [ "${DROP_WAIT_TIME}" -gt 0 ]; then
        DROP_WAIT_TIME=$(( DROP_WAIT_TIME - 5 ))
        return
    fi

    get_limit_mem
    # Without a numeric limit neither threshold can be evaluated; skip reclaim.
    if ! [[ "${LIMIT_MEM}" =~ ^[0-9]+$ ]]; then
        return
    fi

    # Soft limit: mem_usage >= (limit * 0.9) && cpu_use < 30%
    local soft_limit=$(( LIMIT_MEM * 9 / 10 ))
    if [ "${mem_usage}" -ge "${soft_limit}" ]; then
        get_cpu_info
        if [ "${CPU_USE}" -lt 30 ]; then
            log_error "Memory usage{${mem_usage} KB} is over expectations{${soft_limit} KB}, and cpu load average {${CPU_USE}} less than 30%."
            DROP_COUNT_ARRAY[newest]=1
            # reclaim if over limit
            sudo "${DPC_DROP_CACHE}"
            return
        fi
    fi

    # Hard limit: mem_usage >= limit
    if [ "${mem_usage}" -ge "${LIMIT_MEM}" ]; then
        log_error "Memory usage{${mem_usage} KB} is over expectations{${LIMIT_MEM} KB}."
        DROP_COUNT_ARRAY[newest]=1
        sudo "${DPC_DROP_CACHE}"
    fi
}

# Sample the dpc process's VmRSS once and act on it.
#   $1 - memory threshold, compared against the VmRSS value from
#        /proc/<pid>/status
# Globals: DPC_PROCESS is read; FAIL_COUNT_ARRAY is read and written.
# Returns 1 when no dpc pid is found. When the third over-threshold sample is
# counted in the FAIL_COUNT_ARRAY window, the dpc process is killed with
# SIGKILL and this script exits with status 1.
function check_mem_usage()
{
    local dpc_pid=""
    dpc_pid=$(ps -ef | grep "$DPC_PROCESS" | grep -v grep | awk '{print $2}')
    if [ "${dpc_pid}" == "" ]; then
        log_error "get dpc pid failed."
        return 1
    fi

    local mem_usage
    mem_usage=$(awk '/VmRSS/{print $2}' "/proc/${dpc_pid}/status")

    # Slide the window one slot to the left and count the flags still in it.
    local fail_count=0
    local i
    for (( i = 0; i < ${#FAIL_COUNT_ARRAY[@]} - 1; i++ )); do
        FAIL_COUNT_ARRAY[i]=${FAIL_COUNT_ARRAY[i + 1]}
        fail_count=$(( fail_count + FAIL_COUNT_ARRAY[i] ))
    done

    if [ -z "${mem_usage}" ]; then
        log_error "Memory usage empty."
        mem_usage=0
    fi

    check_drop_count "${mem_usage}"

    if [ "${mem_usage}" -ge "$1" ]; then
        FAIL_COUNT_ARRAY[5]=1
        fail_count=$(( fail_count + 1 ))
        log_error "Memory usage{${mem_usage} KB} is over expectations{$1 KB}, failed count{${fail_count}}."
        if [ "${fail_count}" -eq 3 ]; then
            log_error "The number of memory alloc failures reaches the upper limit{3}, the dpc process will be resarted."
            # "2>/dev/null" must be written without spaces: "2 > /dev/null"
            # hands kill an extra PID argument (2) and redirects stdout.
            kill -9 "${dpc_pid}" 2>/dev/null
            exit 1
        fi
    else
        FAIL_COUNT_ARRAY[5]=0
    fi
}

# Make sure the dpc pid is listed in ${CGROUP_CPU_DIR}/cgroup.procs; when it
# is not, append the pid to ${CGROUP_CPU_DIR}/tasks.
# Globals: DPC_PROCESS and CGROUP_CPU_DIR are read.
# Returns 1 when no dpc pid is found.
function check_cpu_cgroup_proc_pid()
{
    local dpc_pid=""
    dpc_pid=$(ps -ef | grep "$DPC_PROCESS" | grep -v grep | awk '{print $2}')
    if [ "${dpc_pid}" == "" ]; then
        log_error "get dpc pid failed."
        return 1
    fi

    # -x -F: the pid must equal a whole line. A plain substring match would
    # treat pid 123 as present when the file only lists 12345.
    if ! grep -qxF -- "${dpc_pid}" "${CGROUP_CPU_DIR}/cgroup.procs"; then
        echo "${dpc_pid}" >> "${CGROUP_CPU_DIR}/tasks"
    fi
}

# Entry point: start the monitoring loop. monitor_process loops forever, so
# the script only ends when check_mem_usage calls exit after killing dpc.
monitor_process
