#!/bin/bash
set +x   # make sure command tracing is off
# Check all nodes.
# Load the shared inspection helpers. GetInspectType, GetLocalIp, check_pass, LOG and
# G_TMP_INSPECT_PATH are not defined in this script; they are presumably provided by
# the sourced CheckItems file -- TODO confirm.
G_INSPECT_MMLPATH="/opt/huawei/snas/script/inspect_mml"
. "${G_INSPECT_MMLPATH}/CheckItems"
CurInspectNum="346"
CurInspectFun=$(GetInspectType "${CurInspectNum}")
# Every check appends its findings to this file; the final verdict line goes there too.
RESULTFILE="${G_TMP_INSPECT_PATH}tmpResult${CurInspectFun}"
LOCAL_BACK_IP=$(GetLocalIp)
# Number of attempts used by the retry loops in the functions below.
maxretrytimes=3
# Never write through a symbolic link sitting at the result path: remove it and report.
# All expansions are quoted so that a path containing blanks or glob characters is
# handled as a single word.
if [ -L "${RESULTFILE}" ]; then
    rm -f -- "${RESULTFILE}"
    echo "There are security risks."
fi
# Start from an empty result file.
>"${RESULTFILE}"

# Aggregated verdict; the checks update it through check_pass and it is written out
# at the end of the script as "<CurInspectFun>_Pass <isPass>".
isPass=0

# Work out the upd role of this node from the output of `MmlBatch 4040 "upd master"`.
# The IP on the "master:" line and the IP on the "Slave:" line are compared with
# LOCAL_BACK_IP.
# Globals:
#   LOCAL_BACK_IP (read), RESULTFILE (an [ERR] line is appended on failure)
# return value
#   0 - not upd master or upd slave
#   1 - upd master
#   2 - upd slave
#   3 - failed (the mml command failed, or a master/slave IP could not be parsed)
function get_upd_identity()
{
    local mml_output
    local iRet
    local upd_master_ip
    local upd_slave_ip
    local ip_regex='^[1-9][0-9]*\.[0-9]+\.[0-9]+\.[1-9][0-9]*$'

    # Declare first, assign afterwards, and do not pipe inside the substitution:
    # `local v=$(cmd | tr ...)` reports the status of `local`/`tr`, never that of MmlBatch,
    # which would turn the failure check below into dead code.
    mml_output=$(/usr/local/bin/MmlBatch 4040 "upd master")
    iRet=$?
    if [[ ${iRet} -ne 0 ]]; then
        LOG "[$LINENO]failed when executing mml command (MmlBatch 4040 'upd master')"
        echo "[ERR]Node:${LOCAL_BACK_IP}||INFO:failed when executing mml command (MmlBatch 4040 'upd master')" >>"${RESULTFILE}" 2>&1
        return 3
    fi
    # Treat carriage returns as line breaks so that every record sits on its own line.
    mml_output=${mml_output//$'\r'/$'\n'}

    upd_master_ip=$(printf '%s\n' "${mml_output}" | awk '/master:/ {print $2}')
    if [[ ! ${upd_master_ip} =~ ${ip_regex} ]]; then
        LOG "[$LINENO]got wrong upd master ip(${upd_master_ip})"
        echo "[ERR]Node:${LOCAL_BACK_IP}||INFO:got wrong upd master ip(${upd_master_ip})" >>"${RESULTFILE}" 2>&1
        return 3
    fi

    upd_slave_ip=$(printf '%s\n' "${mml_output}" | awk '/Slave:/ {print $2}')
    if [[ ! ${upd_slave_ip} =~ ${ip_regex} ]]; then
        # Report the value that actually failed validation (the slave IP, not the master IP).
        LOG "[$LINENO]got wrong upd slave ip(${upd_slave_ip})"
        echo "[ERR]Node:${LOCAL_BACK_IP}||INFO:got wrong upd slave ip(${upd_slave_ip})" >>"${RESULTFILE}" 2>&1
        return 3
    fi

    # The right-hand side is quoted so it is compared literally instead of as a glob pattern.
    if [[ ${upd_master_ip} == "${LOCAL_BACK_IP}" ]]; then
        return 1
    elif [[ ${upd_slave_ip} == "${LOCAL_BACK_IP}" ]]; then
        return 2
    fi
    return 0
}

# Output: the SN (column strSerial of table UPD_STATUS_T) of every node recorded in
# /opt/huawei/snas/upd/cm_upd.db, printed on one line, each value followed by a blank.
# The query is attempted up to ${maxretrytimes} times with a 2 second pause in between.
# Globals:
#   maxretrytimes, LOCAL_BACK_IP (read); RESULTFILE (an [ERR] line is appended on failure);
#   isPass (updated on failure)
# Returns:
#   0 when a query succeeded, 1 when every attempt failed.
# NOTE: when this function is called through $(...), the isPass update below happens in
# a subshell and is lost, so such callers have to rely on the return status instead.
function get_upd_db_SN()
{
    local sql_output
    local attempt
    local max_attempts=${maxretrytimes:-1}

    for ((attempt = 1; attempt <= max_attempts; attempt++)); do
        # sqlite3 diagnostics are merged into the captured text (2>&1).
        if sql_output=$(/bin/sqlite3 /opt/huawei/snas/upd/cm_upd.db 'SELECT strSerial FROM UPD_STATUS_T' 2>&1); then
            printf '%s\n' "${sql_output}" | tr '\n' ' '
            return 0
        fi
        # No pause after the last attempt.
        if ((attempt < max_attempts)); then
            sleep 2s
        fi
    done

    isPass=$(check_pass $isPass 1)
    echo "[ERR]Node:${LOCAL_BACK_IP}||INFO:Failed when selecting from table(s) in cm_upd.db" >>"${RESULTFILE}" 2>&1
    return 1
}

# Restart the upd service through the deploy daemon helper and verify that it is running.
# `daemon -s <path>` is issued once, then `daemon <path>` is issued before every check
# (presumably "stop" and "start/ensure running" respectively -- TODO confirm against the
# daemon tool). The process counts as running when `ps -eo cmd` shows a line that is
# exactly /opt/huawei/snas/upd/updservice. Up to ${maxretrytimes} checks, 2 seconds apart.
# Globals:
#   maxretrytimes, LOCAL_BACK_IP (read); isPass (updated) and RESULTFILE (an [ERR] line
#   is appended) when the process is still not running after the last check.
function restart_upd_process()
{
    local attempt
    local max_attempts=${maxretrytimes:-1}

    /opt/huawei/deploy/bin/daemon -s /opt/huawei/snas/upd/updservice >/dev/null 2>&1
    for ((attempt = 1; attempt <= max_attempts; attempt++)); do
        /opt/huawei/deploy/bin/daemon /opt/huawei/snas/upd/updservice >/dev/null 2>&1
        sleep 2s
        # Test the pipeline status directly; wrapping it in $(...) would try to execute
        # whatever the pipeline printed. grep -q only reports match/no match, and -E with
        # ^...$ requires the whole command line to match.
        if ps -eo cmd | grep -v grep | grep -qE '^/opt/huawei/snas/upd/updservice$'; then
            return    # the process is running again
        fi
    done

    isPass=$(check_pass $isPass 1)
    echo "[ERR]Node:${LOCAL_BACK_IP}||INFO:The upd process is not started." >>"${RESULTFILE}" 2>&1
}

# Classify this node's upd role and record it in UPD_DB_SN:
#   common - neither upd master nor upd slave
#   master / slave - the SNs stored in cm_upd.db are dumped to RESULTFILE and the upd
#                    process is restarted
#   error  - the role could not be determined (the inspection is marked as failed)
# Globals:
#   UPD_DB_SN (written), isPass (updated), RESULTFILE (appended)
function upd_database_check()
{
    local iRet
    local SNs
    local sn_rc
    local role

    get_upd_identity
    iRet=$?
    case ${iRet} in
        0)
            LOG "[$LINENO]not upd master or upd slave"
            UPD_DB_SN="common"
            ;;
        1 | 2)
            SNs=$(get_upd_db_SN)
            sn_rc=$?
            # get_upd_db_SN runs in a subshell here, so any isPass change it makes is
            # lost; repeat the failure bookkeeping in this shell. A successful query
            # always prints at least one separator character, therefore empty output
            # indicates failure as well, independent of the return status.
            if [[ ${sn_rc} -ne 0 || -z ${SNs} ]]; then
                isPass=$(check_pass $isPass 1)
            fi
            if [[ ${iRet} -eq 1 ]]; then
                role='master'
            else
                role='slave'
            fi
            echo "[ZJZ]${role}SNs in cm_upd.db:${SNs}" >>"${RESULTFILE}" 2>&1
            UPD_DB_SN=${role}
            restart_upd_process
            ;;
        *)
            UPD_DB_SN='error'
            isPass=$(check_pass $isPass 1)
            ;;
    esac
}

# Output: the SN of this node, stored in the global LOCAL_SN.
# The value is the 4th whitespace-separated field of the "Product Serial" line(s)
# printed by `ipmitool fru`; LOCAL_SN is left empty when no such line is produced.
function get_local_SN()
{
    # Silence ipmitool itself: a redirect placed on the last pipeline stage only
    # covers awk and lets ipmitool diagnostics leak onto the script's stderr.
    LOCAL_SN=$(/bin/ipmitool fru 2>/dev/null | grep "Product Serial" | awk '{print $4}')
}

# Print the text after the first "=" of every line in /opt/huawei/snas/etc/cm.ini that
# contains "NID". Nothing is printed when the file is unreadable or has no such line.
function get_local_nid()
{
    # The redirect belongs on grep (the stage that opens the file); placing it on awk
    # would let a "No such file" message reach the script's stderr.
    grep NID /opt/huawei/snas/etc/cm.ini 2>/dev/null | awk -F = '{print $2}'
}

# Print the SN stored in table CM_NODE_T of /opt/huawei/snas/etc/cm_conf.db for a node.
# The query is attempted up to ${maxretrytimes} times with a 2 second pause in between.
# Arguments:
#   $1 - NID of the node to look up (inserted into the SQL text as-is)
# Globals:
#   maxretrytimes, LOCAL_BACK_IP (read); RESULTFILE (an [ERR] line is appended on failure);
#   isPass (updated on failure)
# Returns:
#   0 when a query succeeded, 1 when every attempt failed.
# NOTE: when this function is called through $(...), the isPass update below happens in
# a subshell and is lost, so such callers have to rely on the return status instead.
function get_cm_db_sn()
{
    local local_nid=$1
    local sql_output
    local attempt
    local max_attempts=${maxretrytimes:-1}

    for ((attempt = 1; attempt <= max_attempts; attempt++)); do
        if sql_output=$(/bin/sqlite3 /opt/huawei/snas/etc/cm_conf.db "SELECT SN FROM CM_NODE_T WHERE NID=${local_nid}" 2>/dev/null); then
            echo "${sql_output}"
            return 0
        fi
        # No pause after the last attempt.
        if ((attempt < max_attempts)); then
            sleep 2s
        fi
    done

    isPass=$(check_pass $isPass 1)
    echo "[ERR]Node:${LOCAL_BACK_IP}||INFO:Failed when selecting from table(s) in cm_conf.db" >>"${RESULTFILE}" 2>&1
    return 1
}

# Compare the SN read from this node (global LOCAL_SN) with the SN stored for this
# node's NID in cm_conf.db and record the outcome in CM_DB_SN ('match' / 'not match').
# Globals:
#   LOCAL_SN, LOCAL_BACK_IP (read); CM_DB_SN (written); isPass (updated);
#   RESULTFILE (an [ERR] line is appended on mismatch)
function cm_database_check()
{
    local local_NID
    local SN_form_cm_db

    local_NID=$(get_local_nid)
    # ${local_NID} is deliberately left unquoted: word splitting strips blanks around
    # the value taken from the ini file before it is passed on.
    SN_form_cm_db=$(get_cm_db_sn ${local_NID})

    # An empty local SN proves nothing: if both the hardware read and the database
    # lookup failed, two empty strings would otherwise be reported as a match.
    # The right-hand side is quoted so it is compared literally, not as a glob pattern.
    if [[ -n ${LOCAL_SN} && ${LOCAL_SN} == "${SN_form_cm_db}" ]]; then
        CM_DB_SN='match'
        isPass=$(check_pass $isPass 0)
    else
        CM_DB_SN='not match'
        isPass=$(check_pass $isPass 1)
        LOG "[$LINENO]SN of local node(${LOCAL_SN}) SN form cm_conf.db(${SN_form_cm_db}) NID(${local_NID})"
        echo "[ERR]Node:${LOCAL_BACK_IP}||INFO:SN of local node(${LOCAL_SN}) SN form cm_conf.db(${SN_form_cm_db}) NID(${local_NID})" >>"${RESULTFILE}" 2>&1
    fi
}

# Flag nodes whose `lspci` listing mentions SAS3416: the inspection is marked as failed
# and an [ERR] line reminds the operator to embed the RAID controller card upgrade
# script after the upgrade precheck. Nodes without such a line are left untouched.
# Globals:
#   LOCAL_BACK_IP (read); isPass (updated) and RESULTFILE (appended) when the card is found
function check_3416_raid()
{
    # Nothing to report when no SAS3416 entry is listed.
    if ! lspci |grep SAS3416 >/dev/null 2>&1; then
        return 0
    fi

    isPass=$(check_pass $isPass 1)
    LOG "[$LINENO]3416 RAID controller card is installed on this node, after the upgrade precheck, you need to embed the RAID controller card upgrade script."
    echo "[ERR]Node:${LOCAL_BACK_IP}||INFO:3416 RAID controller card is installed on this node, after the upgrade precheck, you need to embed the 3416RAID controller card upgrade script." >>${RESULTFILE} 2>&1
    return
}

# "Large memory" thresholds for x86 nodes (physical memory in GB):
#   video surveillance scenario (serviceType surveillance): SMR disks >= 240, non-SMR disks >= 208
#   default scenario            (serviceType other)       : SMR disks >= 352, non-SMR disks >= 336
#   massive small files scenario (serviceType archive)    : SMR disks >= 192, non-SMR disks >= 176
# Restarting a large-memory x86 node during an upgrade may end with the node being
# killed by the watchdog; when the node is at or above its threshold the operator must
# be told to apply the workaround.
# Globals:
#   CurInspectFun, LOCAL_BACK_IP (read); isPass (updated) and RESULTFILE (an [ERR] line
#   is appended) when a risk is found
function check_x86_large_memory_restart_risk()
{
    local snas_ini="/opt/huawei/snas/etc/snas.ini"
    local total_phy_kibs=""
    local phymem_raw=""
    local total_phy_gb=0
    local serviceType=""
    local isSmrNode="false"
    local haveRisk="false"
    local smr_limit_gb=0
    local normal_limit_gb=0
    local limit_gb=0

    # 1.0 Large-memory ARM nodes are not affected by the watchdog kill caused by slow
    #     mds memory initialisation on restart, so only x86 is checked.
    if ! arch | grep x86 >/dev/null 2>&1; then
        LOG "[$FUNCNAME][${CurInspectFun}] restart large-memory ARM nodes do not have crash risks."
        return
    fi

    # 2.0 Physical memory of the node (per the original notes this figure excludes the
    #     x86 NVDIMM / ARM BBU). total_phy is divided by 1024 twice, i.e. it is treated
    #     as KiB. Only a plain decimal number is fed into the arithmetic: an empty or
    #     multi-line value would raise an arithmetic syntax error, and bash then
    #     abandons the whole command being executed (the remaining steps of main).
    total_phy_kibs=$(grep total_phy "${snas_ini}" | awk -F '=' '{print $2}' | sed "s/ *//g")
    if [[ ${total_phy_kibs} =~ ^[0-9]+$ ]]; then
        total_phy_gb=$((10#${total_phy_kibs} / 1024 / 1024))
    fi
    if [[ ${total_phy_gb} -le 0 ]]; then
        # Fall back to /proc/snas_mem/phymem; its content is used as-is, so it is
        # assumed to be in GB already -- TODO confirm.
        phymem_raw=$(sed 's/ *//g' /proc/snas_mem/phymem 2>/dev/null)
        if [[ ${phymem_raw} =~ ^[0-9]+$ ]]; then
            total_phy_gb=$((10#${phymem_raw}))
        fi
    fi
    if [[ ${total_phy_gb} -le 0 ]]; then
        LOG "[$FUNCNAME][${CurInspectFun}] Failed to get node total physical memory size."
        return
    fi

    # 3.0 Service type of the node.
    serviceType=$(grep serviceType "${snas_ini}" | awk -F '=' '{print $2}' | sed 's/ *//g')

    # 4.0 The node counts as an SMR node when `nofs-snas dumpzoneleft` prints a
    #     non-header line starting with "P".
    if /opt/huawei/snas/sbin/nofs-snas dumpzoneleft | grep -v "NAME" | grep -w "^P.*" >/dev/null 2>&1; then
        isSmrNode="true"
    fi

    # 5.0 Pick the thresholds for the service type; other service types carry no risk.
    case "${serviceType}" in
        surveillance)
            smr_limit_gb=240
            normal_limit_gb=208
            ;;
        other)
            smr_limit_gb=352
            normal_limit_gb=336
            ;;
        archive)
            smr_limit_gb=192
            normal_limit_gb=176
            ;;
        *)
            LOG "[$FUNCNAME][${CurInspectFun}] serviceType:${serviceType} is not in [surveillance other archive],have no large-memory restart mds mem init crash risk."
            return
            ;;
    esac

    if [[ ${isSmrNode} == "true" ]]; then
        limit_gb=${smr_limit_gb}
    else
        limit_gb=${normal_limit_gb}
    fi
    if [[ ${total_phy_gb} -ge ${limit_gb} ]]; then
        haveRisk="true"
    fi

    if [[ ${haveRisk} == "true" ]]; then
        echo "[ERR]Node:${LOCAL_BACK_IP}||INFO:serviceType(${serviceType}),isSmrNode(${isSmrNode}),after upgrade precheck is complete, take workarounds to prevent the node from crashing when the node with large memory is restarted." >>"${RESULTFILE}" 2>&1
        LOG "[$FUNCNAME][${CurInspectFun}] serviceType(${serviceType}),isSmrNode(${isSmrNode}),after upgrade precheck is complete, take workarounds to prevent the node from crashing when the node with large memory is restarted."
        isPass=$(check_pass $isPass 1)
    else
        LOG "[$FUNCNAME][${CurInspectFun}] node serviceType(${serviceType}),isSmrNode(${isSmrNode}),,have no restart crash risk durring cluster upgrading."
    fi

    return
}

# Whenever a new feature is needed, add one more function here and one more
# variable/column to the summary line.
function main()
{
    local inspect_step

    # Steps, in execution order:
    #   get_local_SN        sets LOCAL_SN
    #   cm_database_check   sets CM_DB_SN
    #   upd_database_check  sets UPD_DB_SN
    #   check_3416_raid
    #       7.1.1.SPC500: if the node has a 3416 RAID card, remind the operator to embed
    #       the RAID card upgrade script after the upgrade precheck; it stops the mon
    #       process before the firmware upgrade so the RAID firmware upgrade does not
    #       cause a mon heartbeat timeout. The 7.1.1.SPC700 upgrade flow solves this
    #       natively, so the 7.1.1.SPC600 inspection item may leave this check out.
    #   check_x86_large_memory_restart_risk
    #       Restarting a large-memory x86 node may end in a watchdog kill. The
    #       pre-upgrade inspection has to warn even when a workaround is already in
    #       place, because files under /opt/huawei/snas/static/ are not carried over
    #       by the upgrade, so a workaround applied beforehand is wiped.
    for inspect_step in \
        get_local_SN \
        cm_database_check \
        upd_database_check \
        check_3416_raid \
        check_x86_large_memory_restart_risk
    do
        "${inspect_step}"
    done

    # One summary line per node.
    echo "Node:${LOCAL_BACK_IP}||LOCAL_SN:${LOCAL_SN}||CM_DB_SN:${CM_DB_SN}||UPD_Identity:${UPD_DB_SN}" >>${RESULTFILE} 2>&1
}

# Run all checks, then append the overall verdict line "<CurInspectFun>_Pass <isPass>"
# to the result file. The script itself always exits with status 0; the outcome is
# conveyed only through the result file.
main

printf '%s\n' "${CurInspectFun}_Pass ${isPass}" >>${RESULTFILE} 2>&1
exit 0
