#!/bin/bash

#############################################################
#
# Name:        DMS memory leak issue
# Error codes:
#         0000 = normal
#         0001 = a process memory leak exists
#
#############################################################

# Directory that holds the shared helper libraries sourced below.
UPG_LIB_PATH="/opt/omm/oma/atoms/Inspect/lib"
# Fixed search path so tool lookup does not depend on the caller's environment.
PATH="/sbin:/usr/sbin:/usr/local/sbin:/root/bin:/usr/local/bin:/usr/bin:/bin:/opt/omm/oma/workspace/tools"

# log.sh is sourced first: the following lines already use `log` and
# ${LOG_FILE}, which are presumably provided by it (TODO confirm).
# Every failed source aborts the script with exit code 130.
source "${UPG_LIB_PATH}/log.sh" || { echo "source ${UPG_LIB_PATH}/log.sh failed."; exit 130; }
# The redirect target is quoted so a log path containing whitespace does not
# produce an "ambiguous redirect" error.
source "${UPG_LIB_PATH}/out_put.sh" >> "${LOG_FILE}" 2>&1 || { log ERROR "source ${UPG_LIB_PATH}/out_put.sh failed."; exit 130; }
source "${UPG_LIB_PATH}/version.sh" >> "${LOG_FILE}" 2>&1 || { log ERROR "source ${UPG_LIB_PATH}/version.sh failed."; exit 130; }
source "${UPG_LIB_PATH}/dswareTool_lib.sh" >> "${LOG_FILE}" 2>&1 || { log ERROR "source ${UPG_LIB_PATH}/dswareTool_lib.sh failed."; exit 130; }

# Globals shared by all functions in this script.
checkItemId="4416"   # identifier of this check item; used in log prefixes and the error key
resultCode=0         # set to 1 as soon as any process exceeds its threshold
errorKey=""          # set by main to "${checkItemId}0001" when resultCode is non-zero
params=""            # passed unchanged to FSA_json_output
originalInfo=""      # accumulated command/review/result text passed to FSA_json_output

#######################################
# Compare the memory use of one service's processes with a threshold.
#
# Memory is read from column 6 and the PID from column 2 of `ps auxww`
# (RSS and PID in the standard aux layout); the messages report both the
# usage and the threshold as kb.
#
# Globals:
#   checkItemId  (read)     prefix of every log message
#   originalInfo (appended) command / review / result lines
#   resultCode   (written)  set to 1 when a process exceeds the threshold
#   error_result (appended) names of the offending processes
# Arguments:
#   $1 - process name: "zk", "dms", or the suffix X of a ./dsware_X binary
#   $2 - memory threshold, same unit as the ps column (kb)
# Returns:
#   1 when zk exceeds the threshold, 0 in every other case. For the other
#   process types an over-threshold process is reported only through
#   resultCode / error_result.
#######################################
function memory_check()
{
    local process_name=$1
    local mem_threshold=$2
    local -a process_memory_use=()
    local -a process_id=()
    local grep_pattern=""
    local shown_pattern=""
    local ps_snapshot=""
    local zk_mem_max=""
    local i

    if [[ "${process_name}" == "zk" ]]
    then
        # Collected as an array: more than one matching process used to break
        # the numeric test and silently fall through to "normal".
        process_memory_use=($(ps auxww | grep QuorumPeerMain | grep -w zookeeper | grep Dzookeeper.log.dir | grep /opt/dsware/agent/zk | grep -v grep | awk '{print $6}'))
        originalInfo="${originalInfo}""command:ps auxww | grep QuorumPeerMain | grep -w zookeeper | grep Dzookeeper.log.dir | grep /opt/dsware/agent/zk | grep -v grep | awk '{print \$6}' \n"
        originalInfo="${originalInfo}""review:${process_name}_process_memory_use=${process_memory_use[@]}\n"

        if [[ ${#process_memory_use[@]} -eq 0 ]]
        then
            originalInfo="${originalInfo}""result:There is no ${process_name}_process on the node.\n\n"
            log INFO "FSA_${checkItemId}:result:There is no ${process_name}_process on the node."
            return 0
        fi

        # Judge zk by its largest matching process.
        zk_mem_max=${process_memory_use[0]}
        for ((i = 1; i < ${#process_memory_use[@]}; i++))
        do
            if [[ "${process_memory_use[i]}" -gt "${zk_mem_max}" ]]
            then
                zk_mem_max=${process_memory_use[i]}
            fi
        done

        if [[ "${zk_mem_max}" -gt "${mem_threshold}" ]]
        then
            resultCode=1
            originalInfo="${originalInfo}""result:${process_name} memory usage is abnormal, usage:${zk_mem_max} (kb), threshold:${mem_threshold} (kb).\n\n"
            log ERROR "FSA_${checkItemId}:${process_name} memory usage is abnormal, usage:${zk_mem_max} (kb), threshold:${mem_threshold} (kb)."
            error_result="${error_result} ${process_name}"
            return 1
        fi

        originalInfo="${originalInfo}""result:${process_name} memory usage is normal.\n\n"
        log INFO "FSA_${checkItemId}:result:${process_name} memory usage is normal."
        return 0
    fi

    # dms is matched by its install path, everything else by ./dsware_<name>.
    # shown_pattern reproduces how the pattern is written in the recorded command.
    if [[ "${process_name}" == "dms" ]]
    then
        grep_pattern="/opt/dsware/service/dr/dms"
        shown_pattern="${grep_pattern}"
    else
        grep_pattern="./dsware_${process_name}"
        shown_pattern="'${grep_pattern}'"
    fi

    # One ps snapshot feeds both arrays, so PID i and memory i always belong
    # to the same process even if processes start or exit meanwhile.
    ps_snapshot=$(ps auxww | grep -iw "${grep_pattern}" | grep -v grep)

    process_memory_use=($(awk '{print $6}' <<< "${ps_snapshot}"))
    originalInfo="${originalInfo}""command:ps auxww | grep -iw ${shown_pattern} | grep -v grep | awk '{print \$6}'\n"
    originalInfo="${originalInfo}""review:${process_name}_process_memory_use=${process_memory_use[@]}\n"
    process_id=($(awk '{print $2}' <<< "${ps_snapshot}"))
    originalInfo="${originalInfo}""command:ps auxww | grep -iw ${shown_pattern} | grep -v grep | awk '{print \$2}'\n"
    originalInfo="${originalInfo}""review:${process_name}_process_id=${process_id[@]}\n"

    if [[ ${#process_memory_use[@]} -eq 0 ]]
    then
        originalInfo="${originalInfo}""result:There is no ${process_name}_process on the node.\n\n"
        log INFO "FSA_${checkItemId}:result:There is no ${process_name}_process on the node."
        return 0
    fi

    for ((i = 0; i < ${#process_id[@]}; i++))
    do
        if [[ "${process_memory_use[i]}" -gt "${mem_threshold}" ]]
        then
            resultCode=1
            originalInfo="${originalInfo}""result:${process_name}_id=${process_id[i]} memory usage is abnormal, usage:${process_memory_use[i]} (kb), threshold:${mem_threshold} (kb).\n"
            # The ':' after the check item id was missing here, unlike every other log line.
            log ERROR "FSA_${checkItemId}:${process_name}_id=${process_id[i]} memory usage is abnormal, usage:${process_memory_use[i]} (kb), threshold:${mem_threshold}(kb)."
            error_result="${error_result} ${process_name}_id=${process_id[i]}"
        else
            originalInfo="${originalInfo}""result:${process_name}_id=${process_id[i]} memory usage is normal.\n"
            log INFO "FSA_${checkItemId}:${process_name}_id=${process_id[i]} memory usage is normal."
        fi
    done
    originalInfo="${originalInfo}\n"
    return 0
}

#######################################
# Print the memory figure derived from one OSD's main-media settings.
#
# The figure depends on the media type (p_media_type containing "ssd" or
# not), the media size (p_osd_media_size) and whether the network type is
# InfiniBand. The caller treats the result as GB.
#
# Arguments:
#   $1 - path of the OSD configuration file
#   $2 - optional path of the network configuration file
#        (default: /opt/dsware/agent/conf/network.cfg)
# Outputs:
#   the computed value on stdout, as printed by bc
#######################################
function get_osd_main_media_memory()
{
    local osd_config=$1
    local network_cfg=${2:-/opt/dsware/agent/conf/network.cfg}
    local disk_type=""
    local media_size_kb=""
    local media_size_TB=""
    local network_type=""
    local small_media_memory=""
    local medium_media_memory=""
    local total_osd_memory=0

    disk_type=$(grep -w "p_media_type" "${osd_config}" | awk -F '=' '{print $2}')
    media_size_kb=$(grep -w "p_osd_media_size" "${osd_config}" | awk -F '=' '{print $2}')
    # NOTE(review): four divisions by 1024 turn bytes into TB, so the value is
    # presumably in bytes despite the "_kb" name - confirm against the config format.
    media_size_TB=$(echo "scale=2;${media_size_kb}/1024/1024/1024/1024" | bc)
    network_type=$(grep -i network_type "${network_cfg}" | awk -F '"' '{print $2}')

    # Base figures for media of at most 2 TB and at most 4 TB.
    if [[ ${disk_type} =~ ssd ]]
    then
        small_media_memory=2.33
        medium_media_memory=3.08
    else
        small_media_memory=1.5
        medium_media_memory=2.25
    fi

    if [[ "$(echo "${media_size_TB} <= 2.0" | bc)" == "1" ]]
    then
        total_osd_memory=${small_media_memory}
    elif [[ "$(echo "${media_size_TB} <= 4.0" | bc)" == "1" ]]
    then
        total_osd_memory=${medium_media_memory}
    else
        # bc runs with its default scale of 0 here, so the division by 2 is
        # truncated: every further started 2 TB step adds 0.38.
        total_osd_memory=$(echo "${medium_media_memory}+((${media_size_TB}-4.0)/2+1)*0.38" | bc)
    fi

    if [[ "${network_type}" == "InfiniBand" ]]
    then
        # The sum was previously echoed without bc, so the function printed
        # the literal expression (e.g. "2.33+0.3") instead of a number.
        total_osd_memory=$(echo "${total_osd_memory}+0.3" | bc)
    fi

    echo "${total_osd_memory}"
}

#######################################
# Check every dsware_osd process of one pool against a memory threshold.
#
# Processes are selected from `ps auxww` by "./dsware_osd" plus the pool's
# configuration file path. Column 6 of each line is divided by 1024 twice
# and compared with the threshold as GB.
#
# Globals:
#   checkItemId  (read)     prefix of every log message
#   originalInfo (appended) command / review / result lines
#   resultCode   (written)  set to 1 when a process exceeds the threshold
#   error_result (appended) "osd_id=<pid>" of each offending process
# Arguments:
#   $1 - pool id (selects /opt/dsware/osd/conf/osd_<pool id>_conf.cfg)
#   $2 - memory threshold in GB
# Returns:
#   0
#######################################
function memory_check_per_osd()
{
    local pool_id=$1
    local total_osd_memory=$2
    local osd_config="/opt/dsware/osd/conf/osd_${pool_id}_conf.cfg"
    local osd_snapshot=""
    local osd_process=""
    local process_id=""
    local process_mem=""
    local process_mem_GB=""

    # Take a single snapshot of all matching processes. The previous
    # per-PID lookup (grep -w <pid>) also matched lines where that number
    # appeared in another column, so the memory of the wrong process could
    # be evaluated; it also raced with processes exiting.
    osd_snapshot=$(ps auxww | grep -iw "./dsware_osd" | grep -F -- "${osd_config}" | grep -v grep)
    originalInfo="${originalInfo}""command:ps auxww | grep -iw './dsware_osd' | grep ${osd_config} | grep -v grep\n"

    # Check whether each process stays within the threshold.
    while IFS= read -r osd_process
    do
        [[ -n "${osd_process}" ]] || continue
        originalInfo="${originalInfo}""review:pool_id=${pool_id} osd_process='${osd_process}'\n"

        # Column 2 is the PID and column 6 the memory value of this very line.
        read -r _ process_id _ _ _ process_mem _ <<< "${osd_process}"
        process_mem_GB=$(printf "%.2f" "$(echo "scale=2;${process_mem}/1024/1024" | bc)")
        if [[ "$(echo "${process_mem_GB} > ${total_osd_memory}" | bc)" == "1" ]]
        then
            resultCode=1
            originalInfo="${originalInfo}""result:osd pid=${process_id} memory usage is abnormal, usage:${process_mem_GB} (GB), threshold:${total_osd_memory} (GB).\n"
            # The ':' after the check item id was missing here, unlike every other log line.
            log ERROR "FSA_${checkItemId}:osd pid=${process_id} memory usage is abnormal, usage:${process_mem_GB} (GB), threshold:${total_osd_memory} (GB)."
            error_result="${error_result} osd_id=${process_id}"
        else
            originalInfo="${originalInfo}""result:osd_id=${process_id} memory usage is normal.\n"
            log INFO "FSA_${checkItemId}:osd_id=${process_id} memory usage is normal."
        fi
    done <<< "${osd_snapshot}"
    originalInfo="${originalInfo}\n"
    return 0
}

#######################################
# Derive a per-pool OSD memory threshold and check the pool's OSD processes.
#
# For every /opt/dsware/osd/conf/osd_<pool id>_conf.cfg that has running
# processes, the threshold is built from get_osd_main_media_memory, fixed
# shares divided by the number of OSD processes, a reserve, and a share
# based on the cache size reported by dsware_insight, then raised by 20%
# and passed to memory_check_per_osd.
#
# Globals:
#   checkItemId  (read)     prefix of the log message
#   originalInfo (appended) result line when no OSD process exists
# Arguments:
#   none
# Returns:
#   0 when no OSD process is running; otherwise the status of the last
#   command executed
#######################################
function check_osd_need_memory()
{
    local osd_process=""
    local osd_config=""
    local config_name=""
    local pool_id=""
    local osd_num=0
    local total_osd_memory=""
    local mdc_url=""
    local mdc_id=""
    local mdc_port=""
    local mdc_ip=""
    local cache_size=""
    local cache_size_TB=""

    # grep -v grep keeps the grep command itself out of the result; without
    # it the list was practically never empty and this branch never ran.
    osd_process=$(ps -ef | grep -iw "./dsware_osd" | grep -v grep)
    if [[ -z "${osd_process}" ]]
    then
        # ${process_name} is not defined in this function, so the message
        # used to read "There is no _process"; the name is spelled out.
        originalInfo="${originalInfo}""result:There is no osd_process on the node.\n\n"
        log INFO "FSA_${checkItemId}:result:There is no osd_process on the node."
        return 0
    fi

    for osd_config in /opt/dsware/osd/conf/osd_*_conf.cfg
    do
        # An unmatched glob stays literal; skip it.
        [[ -e "${osd_config}" ]] || continue

        # osd_<pool id>_conf.cfg -> <pool id>
        config_name=${osd_config##*/}
        pool_id=${config_name#osd_}
        pool_id=${pool_id%_conf.cfg}

        osd_num=$(ps -ef | grep -F -- "${osd_config}" | grep -vc grep)
        if [[ "${osd_num}" -eq 0 ]]
        then
            continue
        fi

        # NOTE(review): bc runs with its default scale of 0 in the divisions
        # below, so 8/${osd_num} and 4/${osd_num} are truncated to integers
        # (e.g. 8/3 -> 2) - confirm this is intended.

        # Memory used by the OSD main-media process
        total_osd_memory=$(get_osd_main_media_memory "${osd_config}")

        # Operating-system share per OSD
        total_osd_memory=$(echo "8/${osd_num}+${total_osd_memory}" | bc)

        # Basic service framework share per OSD
        total_osd_memory=$(echo "4/${osd_num}+${total_osd_memory}" | bc)

        # Reserve 0.5G
        total_osd_memory=$(echo "0.5+${total_osd_memory}" | bc)

        # OSD cache share
        total_osd_memory=$(echo "4/${osd_num}+${total_osd_memory}" | bc)
        # First mdc_url entry; split on ':' and '-' below into id, ip and port.
        mdc_url=$(grep mdc_url "${osd_config}" | awk -F '=' '{print $2}' | awk -F ',' '{print $1}')
        mdc_id=$(echo "${mdc_url}" | awk -F '[:-]' '{print $1}')
        mdc_port=$(echo "${mdc_url}" | awk -F '[:-]' '{print $3}')
        mdc_ip=$(echo "${mdc_url}" | awk -F '[:-]' '{print $2}' | awk -F '@' '{print $1}')
        # Third output line of the tool, first token of the 7th '|'-separated field.
        # NOTE(review): when the tool prints nothing, the bc steps below fail
        # and the threshold becomes empty - consider validating cache_size.
        cache_size=$(/opt/dsware/agent/tool/dsware_insight 0 "${mdc_id}" "${mdc_ip}" "${mdc_port}" 8 101 "${pool_id}" | tail -n +3 | head -1 | awk -F '|' '{print $7}' | awk '{print $1}')
        cache_size_TB=$(echo "scale=2;${cache_size}/1024/1024" | bc)
        total_osd_memory=$(echo "${cache_size_TB}*6+${total_osd_memory}" | bc)

        # Raise the detection threshold by 20%
        total_osd_memory=$(echo "${total_osd_memory}*1.2" | bc)

        # Check whether the OSDs of this pool id meet the requirement
        memory_check_per_osd "${pool_id}" "${total_osd_memory}"
    done
}

#######################################
# Run all memory checks of this check item and emit the result.
#
# Checks mdc, agent, zk, the OSD processes, dms and vbs in that order, then
# hands the collected globals to FSA_json_output.
#
# Globals:
#   checkItemId, params (read); resultCode (read, set by the checks);
#   error_result, src_version, ib_env_judg, errorKey, originalInfo (written)
# Returns:
#   0 when the item is not selected or no process is abnormal, 1 otherwise
#######################################
function main()
{
    local exit_status=0

    # Nothing to do when agent_items_check returns non-zero for this item.
    if ! agent_items_check "${checkItemId}"
    then
        log INFO "${checkItemId} do not select, pass"
        return 0
    fi

    error_result=""

    memory_check mdc 2831155
    memory_check agent 1048576
    memory_check zk 3145728
    check_osd_need_memory

    # The dms threshold is higher when the source version contains one of
    # R006C00 / R006C10 / R006C20.
    src_version="$(get_src_version)"
    case "${src_version}" in
        *R006C00* | *R006C10* | *R006C20*)
            memory_check dms 15728640
            ;;
        *)
            memory_check dms 8388608
            ;;
    esac

    # The vbs threshold is higher when the network type is InfiniBand.
    ib_env_judg=$(cat /opt/dsware/agent/conf/network.cfg | grep -i network_type | grep -i InfiniBand)
    if [[ -z "${ib_env_judg}" ]]
    then
        memory_check vbs 9437184
    else
        memory_check vbs 11534336
    fi

    if [ "${resultCode}" -eq 0 ]
    then
        originalInfo+="result:there is no process is abnormal."
        log INFO "FSA_${checkItemId}:there is no process is abnormal."
        log INFO "FSA_${checkItemId}:ok"
    else
        # 0001: a process memory leak exists
        errorKey="${checkItemId}0001"
        originalInfo+="result:there are some abnormal processes, include:${error_result}."
        log ERROR "FSA_${checkItemId}:there are some abnormal processes, include:${error_result}."
        log ERROR "FSA_${checkItemId}:unok"
        exit_status=1
    fi

    FSA_json_output "${checkItemId}" "${resultCode}" "${errorKey}" "${params}" "${originalInfo}"
    return ${exit_status}
}

# Log entry and exit around main and exit with main's return value.
# "$*" keeps the whole message in one argument to log; "$@" inside the string
# split it into several arguments whenever more than one parameter was given.
log MUST "enter [$0],para=[$*]"
# Quoted so every script argument reaches main unchanged.
main "$@"
retValue=$?
log MUST "leave [$0],retValue=${retValue}"
exit "${retValue}"
