#!/bin/bash

#
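#Inspect the key S3 services.
#The check runs on every node; which key services are checked depends on the node type.
#
#1.queue              checked on all nodes
#2.memcache           checked on all nodes
#3.osc                checked on all nodes
#4.s3fs               checked on all nodes
#5.s3fs_bill          checked on all nodes
#6.dns                checked on OMS nodes
#7.dcm                checked on the primary OMS node
#8.poe                checked on the primary OMS node
#9.poe management ip  checked on the primary OMS node
#10.visitor           checked on the primary OMS node
#11.gaussdb           checked on the primary OMS node
#12.gaussdb float ip  checked on the primary OMS node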

# Daemon-related scripts. The first five are listed in the same order as
# DAEMON_PROCESSES_SCRIPTS / DAEMON_PROCESSES below; the two dns scripts are
# kept apart from those arrays.
readonly QUEUE_MONITOR_SCRIPT="/opt/obs/service/queue/bin/queue_monitor.sh"
readonly MEMCACHE_SCRIPT="/opt/obs/service/memcache/bin/memcache.sh"
readonly OSC_MONITOR_SCRIPT="/opt/obs/service/osc/bin/osc_monitor.sh"
readonly UDS_S3FS_SCRIPT="/opt/obs/scripts/s3fs/uds_s3fs.sh"
readonly UDS_S3FS_BILL_SCRIPT="/opt/obs/scripts/s3fs/uds_s3fs_bill.sh"
readonly UDS_DNS="/opt/obs/scripts/dns/nsd_service.sh"
readonly UDS_DNS_STATMON="/opt/obs/scripts/dns/statmon_service.sh"

# HA-related scripts and tools.
readonly MDC_DCM_HA_SCRIPT="/opt/huawei/ha/plugin/mdcdcm.sh"
readonly MDC_DB_HA_SCRIPT="/opt/huawei/ha/plugin/mdcdb.sh"
readonly POE_HA_SCRIPT="/opt/huawei/ha/plugin/poe"
readonly VISITOR_HA_SCRIPT="/opt/huawei/ha/plugin/visitor"
readonly GAUSSDB_HA_SCRIPT="/opt/huawei/ha/plugin/rcommgsdb"
readonly POE_MANAGEMENT_IP_SCRIPT="/opt/huawei/ha/plugin/bussinessfloatip.sh"
readonly GAUSSDB_MANAGEMENT_IP_SCRIPT="/opt/huawei/ha/plugin/upffloatip.sh"
readonly LOCAL_EXEC_PATH="/opt/huawei/deploy/script/localexec.py"
CM_DB="/opt/huawei/snas/etc/cm_conf.db"

# Status scripts of the daemon-monitored processes; entry i belongs to
# DAEMON_PROCESSES[i].
DAEMON_PROCESSES_SCRIPTS=(
    "${QUEUE_MONITOR_SCRIPT}"
    "${MEMCACHE_SCRIPT}"
    "${OSC_MONITOR_SCRIPT}"
    "${UDS_S3FS_SCRIPT}"
    "${UDS_S3FS_BILL_SCRIPT}"
)

# HA-related processes.
HA_PROCESSES=(
    "POE"
    "DCM"
    "visitor"
    "POE Management IP"
    "GaussDB Management IP"
)

# Processes monitored by the daemon. dns is started on management nodes only
# and is therefore handled separately.
DAEMON_PROCESSES=(
    "queue"
    "memcache"
    "osc"
    "s3fs"
    "s3fs_bill"
)

# Node role labels stored in NODE_TYPE.
#   Primary  - primary node
#   Standby  - standby node
#   Business - business node
readonly \
    PRIMARY_TYPE="Primary" \
    STANDBY_TYPE="Standby" \
    BUSINESS_TYPE="Business"

# CM_INI_PATH
readonly CM_INI_PATH="/opt/huawei/snas/etc/cm.ini"

# Mutable state shared by the check functions.
errPro=""       # comma-separated list of failing services
isPass=0        # value printed in the "<inspect>_Pass" result line
NODE_TYPE=""    # role label of this node, empty until it is classified

# Return codes of a database status query:
#    0 normal          running normally
#    1 abnormal        running abnormally
#    2 stopped         stopped
#    3 unknown         status unknown
#    4 starting        starting up
#    5 stopping        shutting down
#    6 primary         running normally as primary
#    7 standby         running normally as standby
#    8 activating      being promoted to primary
#    9 deactivating    being demoted to standby
#   10 notsupported    action does not exist
#   11 repairing       being repaired
declare -i \
    db_normal=0 \
    db_abnormal=1 \
    db_stopped=2 \
    db_unknown=3 \
    db_starting=4 \
    db_stopping=5 \
    db_primary=6 \
    db_standby=7 \
    db_activating=8 \
    db_deactivating=9 \
    db_notsupported=10 \
    db_repairing=11

# Inspection framework bootstrap.
# GetInspectType is not defined in this file; it is expected to be provided by
# the sourced CheckItems file (NOTE(review): confirm).
G_INSPECT_MMLPATH="/opt/huawei/snas/script/inspect_mml"
source "${G_INSPECT_MMLPATH}/CheckItems"

# Number of this inspection item and the name derived from it.
CurInspectNum="266"
CurInspectFun="$(GetInspectType "${CurInspectNum}")"

# Result file of this inspection item; it is emptied (or created) up front and
# only appended to afterwards.
RESULTFILE="/tmp/tmpResult${CurInspectFun}"
>"${RESULTFILE}"

# Log file written by LOG.
LOG_FILE="/var/log/inspect.log"

# Values for the "<inspect>_Pass" result line.
CHECK_PASSED=0
CHECK_FAILED=1

# Append one line to ${LOG_FILE}.
# Globals:   LOG_FILE, CurInspectFun (read)
# Arguments: $@ - message parts; they are joined with single spaces. Callers
#                 in this file pass the text followed by a level such as
#                 "INFO" or "ERROR".
# Line format: [<date>][<pid>][<CurInspectFun>]<message parts>
function LOG
{
    local stamp
    stamp=$(date)
    # The whole line is one quoted word: without the quotes the shell would
    # word-split it and treat "[...]" and "*" in the message as glob patterns,
    # which could replace parts of the log line with file names.
    printf '%s\n' "[${stamp}][$$][${CurInspectFun}]$*" >> "${LOG_FILE}"
}

# Decide whether this node takes part in the S3 key-service inspection and,
# when it does, record its role in NODE_TYPE.
# Globals:
#   NODE_TYPE  (written) role label of this node
#   BUSINESS_TYPE, PRIMARY_TYPE, STANDBY_TYPE, CM_INI_PATH (read)
# Returns (shell truth, so that `if isNotNeedCheck` means "skip"):
#   0 - no check needed: node_service_type in snas.ini is neither 2 nor 3
#   1 - the node must be checked
function isNotNeedCheck()
{
    local node_service_type
    local local_nid
    local hainfo
    local role

    # Keep the section headers and the node_service_type line, join them into
    # one line, then take the number that directly follows the [NODE] header.
    node_service_type=$(grep -E '[[]|^node_service_type=' /opt/huawei/snas/etc/snas.ini \
        | tr -d '\n' \
        | grep -Po '(?<=[[]NODE[]]node_service_type=)[0-9]+')
    if [[ "${node_service_type}" != "2" && "${node_service_type}" != "3" ]]; then
        return 0
    fi

    # From here on the node is checked; Business is the role unless the HA map
    # says otherwise.
    NODE_TYPE="${BUSINESS_TYPE}"
    source /opt/obs/scripts/common/s3_config_utility.sh >> /dev/null 2>&1

    # Only exit status 1 of is_s3_management_node leads to the HA role lookup.
    # NOTE(review): presumably 1 means "this is an S3 management node" -
    # confirm against s3_config_utility.sh.
    is_s3_management_node > /dev/null
    if [ $? -ne 1 ]; then
        return 1
    fi

    # Look up this node's entry in the HA map by its node id (NID).
    local_nid=$(grep -w NID "${CM_INI_PATH}" | awk -F"=" '{print $2}')
    hainfo=$(grep -w "node id(${local_nid})" /proc/monc_hamap)
    if [[ -z "${hainfo}" ]]; then
        # No HA entry for this node: it stays a business node.
        return 1
    fi

    # Extract the text between "role(" and ")status".
    role=${hainfo##*role(}
    role=${role%%)status*}
    case "${role}" in
        1) NODE_TYPE="${PRIMARY_TYPE}" ;;
        2) NODE_TYPE="${STANDBY_TYPE}" ;;
    esac
    return 1
}

#===================================================
# Function: s3_hastandby_resource_check
# Purpose:  check the state of one S3 HA resource on the standby node, using
#           the dump in /tmp/Standby.status.info.
# Arguments:
#   $1 - resource name used as grep pattern against the dump (padding spaces
#        are significant)
#   $2 - ordinal supplied by the callers; not used here
# Returns:
#   0 - state is acceptable: "Standby", or "Stop" for every resource other
#       than "POE GaussDB" / "DCM GaussDB"
#   1 - dump file missing, state not found, or state not acceptable
#===================================================
function s3_hastandby_resource_check()
{
    local resource_type=$1
    local status_file="/tmp/Standby.status.info"
    local standby_status

    LOG "[$LINENO]CHECK process TYPE: ${resource_type}" "INFO"
    # Standby-side check: the dump must exist.
    if [ ! -f "${status_file}" ]; then
        LOG "[$LINENO]without the hatmp files" "ERROR"
        return 1
    fi

    # The state is the last column of the line(s) matching the resource name.
    standby_status=$(grep -e "${resource_type}" "${status_file}" \
        | awk '{print $NF}' \
        | sed -e 's/^[ \t]*//g' -e 's/[ \t\r\n]*$//g')
    if [ -z "${standby_status}" ]; then
        LOG "[$LINENO]Status_standby is NULL(without white list or Resource_type)" "ERROR"
    fi

    # "POE GaussDB" and "DCM GaussDB" must be "Standby"; "Stop" is an error for
    # them. The patterns end in "*" because callers pad the resource name with
    # trailing spaces (e.g. "DCM GaussDB "), which an exact pattern would miss.
    case "${resource_type}" in
        "POE GaussDB"* | "DCM GaussDB"*)
            if [ "${standby_status}" != "Standby" ]; then
                LOG "[$LINENO]s3_hastandby_resource_check failed because S3 HA process ${resource_type} is abnormal!" "ERROR"
                return 1
            fi
            ;;
        *)
            if [[ "${standby_status}" != "Stop" && "${standby_status}" != "Standby" ]]; then
                LOG "[$LINENO]s3_hastandby_resource_check failed because S3 HA process ${resource_type} is abnormal!" "ERROR"
                return 1
            fi
            ;;
    esac

    LOG "[$LINENO]S3 HA process ${resource_type} is OK" "INFO"
    return 0
}
#===================================================
# Function: s3_haprimary_resource_check
# Purpose:  check the state of one S3 HA resource on the primary node, using
#           the dump in /tmp/Primary.status.info.
# Arguments:
#   $1 - resource name used as grep pattern against the dump (padding spaces
#        are significant)
#   $2 - ordinal supplied by the callers; not used here
# Returns:
#   0 - state is "Normal" or "Primary"
#   1 - dump file missing, state not found, or any other state
#===================================================
function s3_haprimary_resource_check()
{
    # Locals, so that nothing leaks into the script's global scope.
    local resource_type=$1
    local status_file="/tmp/Primary.status.info"
    local primary_status

    LOG "[$LINENO]CHECK process TYPE: ${resource_type}" "INFO"
    # Primary-side check: the dump must exist.
    if [ ! -f "${status_file}" ]; then
        LOG "[$LINENO]without the hatmp files" "ERROR"
        return 1
    fi

    # The state is the last column of the line(s) matching the resource name.
    primary_status=$(grep -e "${resource_type}" "${status_file}" \
        | awk '{print $NF}' \
        | sed -e 's/^[ \t]*//g' -e 's/[ \t\r\n]*$//g')
    if [ -z "${primary_status}" ]; then
        LOG "[$LINENO]Status_primary is NULL(without white list or Resource_type)" "ERROR"
    fi

    if [[ "${primary_status}" != "Normal" && "${primary_status}" != "Primary" ]]; then
        LOG "[$LINENO]s3_primary_node_ha_check failed because S3 HA process ${resource_type} is abnormal!" "ERROR"
        return 1
    fi

    LOG "[$LINENO]S3 HA process ${resource_type} is OK" "INFO"
    return 0
}
#===================================================
# Function: s3_primary_node_ha_check
# Purpose:  check the HA-related resources of the S3 primary node.
# Details:  dumps "cm queryhainfo resource" into /tmp/Primary.status.info,
#           then runs s3_haprimary_resource_check for every resource and
#           records each failing one through appendErrorPro.
# Returns:  1 when the dump contains "Error", otherwise 0 (also when single
#           resources failed).
#===================================================
function s3_primary_node_ha_check()
{
    local -r ha_dump="/tmp/Primary.status.info"
    # Patterns looked up in the dump. The padding spaces must stay: they are
    # part of the match (e.g. "POE    " versus "POE Management IP").
    local -a resource_keys=(
        "POE    "
        "DCM    "
        "visitor "
        "POE Management IP"
        "POE Bussiness IP"
        "GaussDB Management IP"
        "POE GaussDB"
        "DCM GaussDB "
    )
    # Label reported for resource_keys[i] when its check fails.
    local -a error_labels=(
        "poe"
        "dcm"
        "visitor"
        "poe management ip"
        "poe bussiness ip"
        "gaussdb management ip"
        "poe gaussdb"
        "dcm gaussdb"
    )
    local slot

    LOG "[$LINENO]This node is S3 primary node , start to check ha ..." "INFO"

    # Write the HA resource information to a fresh temporary file, then strip
    # colour escape sequences, carriage returns and NUL bytes from it.
    rm -rf "${ha_dump}"
    /usr/local/bin/MmlBatch 4016 "cm queryhainfo resource" > "${ha_dump}"
    sed -i -e 's/\x1B\[0;[3-4][0-9]m//g' -e 's/\x0D//g' -e 's/\x00//g' "${ha_dump}" >/dev/null 2>&1

    # First sanity check: a dump that mentions "Error" is not evaluated.
    if grep Error "${ha_dump}" > /dev/null 2>&1; then
        appendErrorPro "MmlBatch queryhainfo Primary resource ERROR"
        LOG "[$LINENO]MmlBatch queryhainfo Primary resource ERROR" "ERROR"
        return 1
    fi

    # The second argument is the 1-based position of the resource.
    # err_ret is a script-level variable, not a local.
    for slot in "${!resource_keys[@]}"; do
        s3_haprimary_resource_check "${resource_keys[slot]}" "$((slot + 1))"
        err_ret=$?
        if [[ ${err_ret} -ne 0 ]]; then
            appendErrorPro "${error_labels[slot]}"
        fi
    done

    LOG "[$LINENO]This node is s3 management node, end to check relative ha processes..." "INFO"
    return 0
}
#===================================================
# Function: s3_standby_node_ha_check
# Purpose:  check the HA-related resources of the S3 standby node.
# Details:  dumps "cm queryhainfo resource" into /tmp/Standby.status.info,
#           then runs s3_hastandby_resource_check for every resource and
#           records each failing one through appendErrorPro.
# Returns:  1 when the dump contains "Error", otherwise 0 (also when single
#           resources failed).
#===================================================
function s3_standby_node_ha_check()
{
    local -r ha_dump="/tmp/Standby.status.info"
    # Patterns looked up in the dump. The padding spaces must stay: they are
    # part of the match (e.g. "POE    " versus "POE Management IP").
    local -a resource_keys=(
        "POE    "
        "DCM    "
        "visitor "
        "POE Management IP"
        "POE Bussiness IP"
        "GaussDB Management IP"
        "POE GaussDB"
        "DCM GaussDB "
    )
    # Label reported for resource_keys[i] when its check fails.
    local -a error_labels=(
        "poe"
        "dcm"
        "visitor"
        "poe management ip"
        "poe bussiness ip"
        "gaussdb management ip"
        "poe gaussdb"
        "dcm gaussdb"
    )
    local slot

    LOG "[$LINENO]This node is S3 Standby node , start to check ha ..." "INFO"

    # Write the HA resource information to a fresh temporary file, then strip
    # colour escape sequences, carriage returns and NUL bytes from it.
    rm -rf "${ha_dump}"
    /usr/local/bin/MmlBatch 4016 "cm queryhainfo resource" > "${ha_dump}"
    sed -i -e 's/\x1B\[0;[3-4][0-9]m//g' -e 's/\x0D//g' -e 's/\x00//g' "${ha_dump}" >/dev/null 2>&1

    # First sanity check: a dump that mentions "Error" is not evaluated.
    if grep Error "${ha_dump}" > /dev/null 2>&1; then
        appendErrorPro "MmlBatch queryhainfo Standby resource ERROR"
        LOG "[$LINENO]MmlBatch queryhainfo Standby resource ERROR" "ERROR"
        return 1
    fi

    # The second argument is the 1-based position of the resource.
    # err_ret is a script-level variable, not a local.
    for slot in "${!resource_keys[@]}"; do
        s3_hastandby_resource_check "${resource_keys[slot]}" "$((slot + 1))"
        err_ret=$?
        if [[ ${err_ret} -ne 0 ]]; then
            appendErrorPro "${error_labels[slot]}"
        fi
    done

    LOG "[$LINENO]This node is s3 management node, end to check relative ha processes..." "INFO"
    return 0
}

#===================================================
# Function: s3_daemon_process_check
# Purpose:  check the S3 processes that are monitored by the daemon.
# Details:  runs "<script> status" for every entry of DAEMON_PROCESSES, using
#           the script at the same index of DAEMON_PROCESSES_SCRIPTS. A
#           non-zero exit status records the process through appendErrorPro.
# Globals:  DAEMON_PROCESSES, DAEMON_PROCESSES_SCRIPTS (read)
# Returns:  always 0; failures are reported through appendErrorPro only.
#===================================================
function s3_daemon_process_check()
{
    local idx
    for idx in "${!DAEMON_PROCESSES[@]}"; do
        if ! sh "${DAEMON_PROCESSES_SCRIPTS[idx]}" status > /dev/null 2>&1; then
            appendErrorPro "${DAEMON_PROCESSES[idx]}"
            # Logged as ERROR, like the other failed checks in this script.
            LOG "[$LINENO]${DAEMON_PROCESSES[idx]} is abnormal" "ERROR"
        fi
    done
    return 0
}

#===================================================
# Function: s3_dns_process_check
# Purpose:  check the daemon-monitored dns processes (nsd and obsnsmon) on a
#           management node.
# Details:  called by main on primary and on standby nodes. Each process is
#           probed with "<script> status"; a non-zero exit status records the
#           process through appendErrorPro.
# Globals:  UDS_DNS, UDS_DNS_STATMON (read)
# Returns:  always 0; failures are reported through appendErrorPro only.
#===================================================
function s3_dns_process_check()
{
    LOG "[$LINENO]Check dns process status on S3 management node..." "INFO"

    # "is OK" is logged only for a process that really passed, so the log
    # never shows "abnormal" and "OK" for the same process.
    if sh "${UDS_DNS}" status > /dev/null 2>&1; then
        LOG "[$LINENO]Process nsd is OK" "INFO"
    else
        appendErrorPro "nsd"
        LOG "[$LINENO]nsd process was abnormal!" "ERROR"
    fi

    if sh "${UDS_DNS_STATMON}" status > /dev/null 2>&1; then
        LOG "[$LINENO]Process obsnsmon is OK" "INFO"
    else
        appendErrorPro "obsnsmon"
        LOG "[$LINENO]obsnsmon process was abnormal!" "ERROR"
    fi

    LOG "[$LINENO]Check dns process status on S3 management node finished" "INFO"
    return 0
}

#===================================================
# Function: main
# Purpose:  entry point of the S3 key-service check.
# Details:  validates the HA map, runs the HA and dns checks that match
#           NODE_TYPE, always runs the daemon process check, and appends the
#           two result lines to RESULTFILE.
# Globals:  NODE_TYPE, PRIMARY_TYPE, STANDBY_TYPE, RESULTFILE, CurInspectFun,
#           errPro (read); isPass (written)
#===================================================
function main()
{
    local hamap
    local primary_count
    local standby_count

    # Read the HA map once, so that both counts and the logged snapshot
    # describe the same state.
    hamap=$(cat /proc/monc_hamap)
    primary_count=$(grep -c -E "role\(1\)" <<< "${hamap}")
    standby_count=$(grep -c -E "role\(2\)" <<< "${hamap}")
    LOG "[$LINENO]${hamap}" "INFO"

    if [[ ${primary_count} -eq 0 || ${standby_count} -eq 0 ]]; then
        # One of the two roles is missing from the map.
        appendErrorPro "the monc_hamap is not ready"
        isPass=1
    elif [[ ${primary_count} -ge 2 || ${standby_count} -ge 2 ]]; then
        # More than one node claims the same role.
        appendErrorPro "The HA nodes in the cluster do not work in active/standby mode"
        isPass=1
    elif [[ "${NODE_TYPE}" == "${PRIMARY_TYPE}" ]]; then
        s3_primary_node_ha_check
        s3_dns_process_check
    elif [[ "${NODE_TYPE}" == "${STANDBY_TYPE}" ]]; then
        s3_standby_node_ha_check
        s3_dns_process_check
    fi

    # queue, memcached, osc, s3fs, s3fs_bill
    s3_daemon_process_check

    # NOTE(review): isPass is changed only by the two HA-map failures above;
    # services recorded in errPro by the checks do not alter it. Confirm that
    # the consumer of RESULTFILE evaluates the "Error Service" field.
    # Print the result.
    echo "NodeType:${NODE_TYPE}||Error Service:${errPro}" >> "${RESULTFILE}" 2>&1
    echo "${CurInspectFun}_Pass ${isPass}" >> "${RESULTFILE}" 2>&1

}

#===================================================
# Function: appendErrorPro
# Purpose:  add one entry to errPro, the comma-separated list of failing
#           services.
# Arguments:
#   $1 - text to add
# Globals:  errPro (read and written)
#===================================================
function appendErrorPro()
{
     # The comma is emitted only when errPro already holds an entry.
     errPro="${errPro:+${errPro},}$1"
}

# Script entry.
# A node that needs the check runs main. For any other node the result file
# gets a "Does not involve" line together with the passed value, and the
# script exits with status 0.
if ! isNotNeedCheck; then
    # Start checking the S3 key services.
    main
else
    {
        echo "NodeType:--||Error Service:Does not involve"
        echo "${CurInspectFun}_Pass ${CHECK_PASSED}"
    } >> "${RESULTFILE}" 2>&1
    LOG "[$LINENO]not need check, pass."
    exit 0
fi
