#!/bin/bash
# Resolve the directory that contains this script. Both expansions are quoted
# so an installation path containing whitespace is handled correctly.
SCRIPT_PATH=$(cd "$(dirname "$0")" && pwd)
# Per-run scratch directory (time-stamped to the second) that holds the node
# status snapshots written while the checks are running.
INFO_DIR="/opt/oss/log/manager/easysuite_upgrade/product_info_$(date +%Y%m%d%H%M%S)"

# The first command-line argument is the task id; it selects the task-manager
# directory that holds this task's log, status and progress files.
TASK_ID="${1}"
TASKMGR="/opt/upgrade/easysuite_upgrade/taskmgr/${TASK_ID}"
TASK_LOG="${TASKMGR}/task.log"
TASK_STATUS="${TASKMGR}/task.status"
TASK_PROGRESS="${TASKMGR}/task.progress"

# Script log: <LOG_PATH>/<script name with ".sh" removed>.log.
# Falls back to /opt/upgrade/os_log when the preferred directory is missing.
SELF_FILE=$(basename "${0}")
LOG_PATH="/opt/oss/log/manager/easysuite_upgrade"
[ ! -d "${LOG_PATH}" ] && LOG_PATH="/opt/upgrade/os_log"
LOG_FILE="${LOG_PATH}/${SELF_FILE//.sh/}.log"

# Name of the account with uid 3001; used as the ssh login for node commands.
OSS_USER=$(id -nu 3001)
# Filled in by get_node_list with the nodes' maintenance IPs.
NODE_LIST=""


##########################################
# Global logging helper.
# Prints one time-stamped line to stdout and appends the same line to both
# LOG_FILE and TASK_LOG. LOG_PATH is created first when it does not exist.
# Globals:   LOG_PATH, LOG_FILE, TASK_LOG (read)
# Arguments: $1 - level; "ERROR" and "WARN" are kept, anything else is
#                 logged as "INFO"
#            $2 - message text
# Returns:   exit status of the tee that writes the line
##########################################
function DOUBLE_LOG()
{
    local level="${1}"
    local message="${2}"

    # Normalise the level: only ERROR and WARN are passed through.
    case "${level}" in
        ERROR|WARN) ;;
        *) level="INFO" ;;
    esac

    # Make sure the log directory exists.
    if [ ! -d "${LOG_PATH}" ]
    then
        mkdir -p "${LOG_PATH}"
    fi

    # Write the record. The file names are quoted so a path that contains
    # whitespace is not split into several tee targets.
    local date_time
    date_time=$(date '+%Y-%m-%d %H:%M:%S')
    echo "[${date_time}] ${level} | ${message}" | tee -a "${LOG_FILE}" -a "${TASK_LOG}"
}

##########################################
# Verify the executing user.
# The script must be run by the oss machine-to-machine account, i.e. the
# account whose uid is 3001.
# Outputs:   an ERROR line on stdout when the user does not match
# Returns:   0 when the current user is the uid-3001 account, 1 otherwise
##########################################
function check_user()
{
    # Keep both names local so the check does not overwrite a caller's
    # "user" / "oss_user" variables.
    local user
    local oss_user
    user=$(whoami)
    oss_user=$(id -nu 3001)
    if [ "${user}" != "${oss_user}" ]
    then
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR | User have no permission to run this script..."
        return 1
    fi
    return 0
}

##########################################
# Parse an APP/DB node status snapshot.
# The first line of the file is skipped as a header. For the remaining lines:
#   column 2 is matched against "APP" / "DB" to count the nodes of each kind,
#   column 4 is matched against "RUNNING" to count healthy APP nodes,
#   column 5 is matched against "RUNNING" to count healthy DB nodes.
# All matches are substring matches (grep -c).
# Arguments: $1 - product label used in the log messages
#            $2 - path of the status snapshot file
# Returns:   0 when there is at least one APP node and one DB node and every
#              counted node of each kind is running;
#            1 otherwise, including when the file contains no "RUNNING" at all
##########################################
function check_status()
{
    local product="${1}"
    local result_file="${2}"

    # A snapshot without a single RUNNING entry is treated as invalid.
    grep "RUNNING" "${result_file}" &>/dev/null || return 1

    # APP side
    local app_flag="False"
    local app_node_num
    local app_running_node_num
    app_node_num=$(sed "1d" "${result_file}" | awk '{print $2}' | grep -c "APP")
    app_running_node_num=$(sed "1d" "${result_file}" | awk '{print $4}' | grep -c RUNNING)
    DOUBLE_LOG "INFO" "[${product}] APP Node sum: ${app_node_num}"
    DOUBLE_LOG "INFO" "[${product}] APP Normal node: ${app_running_node_num}"
    if [[ "${app_node_num}" -ne 0 && "${app_node_num}" -eq "${app_running_node_num}" ]]
    then
        app_flag="True"
        DOUBLE_LOG "INFO" "[${product}] All ${product} service nodes are running properly."
    else
        DOUBLE_LOG "WARN" "[${product}] Some ${product} service nodes are abnormal."
    fi

    # DB side
    local db_flag="False"
    local db_node_num
    local db_running_node_num
    db_node_num=$(sed "1d" "${result_file}" | awk '{print $2}' | grep -c "DB")
    db_running_node_num=$(sed "1d" "${result_file}" | awk '{print $5}' | grep -c RUNNING)
    DOUBLE_LOG "INFO" "[${product}] DB Node sum: ${db_node_num}"
    DOUBLE_LOG "INFO" "[${product}] DB Normal node: ${db_running_node_num}"
    if [[ "${db_node_num}" -ne 0 && "${db_node_num}" -eq "${db_running_node_num}" ]]
    then
        db_flag="True"
        DOUBLE_LOG "INFO" "[${product}] All ${product} database nodes are running properly."
    else
        DOUBLE_LOG "WARN" "[${product}] Some ${product} database nodes are abnormal."
    fi

    # Both sides must be healthy for the product to count as started.
    if [[ "${app_flag}" == "False" || "${db_flag}" == "False" ]]
    then
        return 1
    fi
    return 0
}



##########################################
# Report the nodes that are not running and collect diagnostics from them.
# Reads a status snapshot, skips its first (header) line and, for every
# remaining line whose role (column 2) is "APP" with a non-RUNNING column 4,
# or starts with "APP,DB" with a non-RUNNING column 4 or 5:
#   - appends the line to TASK_LOG,
#   - runs `ipmc_adm -cmd statusapp ... | grep ABNORMAL` on the node
#     (column 3) over ssh and appends the output to TASK_LOG,
#   - logs the line as ERROR through DOUBLE_LOG.
# Globals:   TASK_LOG (appended)
# Arguments: $1 - product label
#            $2 - path of the status snapshot file
##########################################
function check_error_status()
{
  # Arguments are read before the remote command is assembled, so the
  # command always carries the tenant of the product being inspected.
  local product="${1}"
  local result_file="${2}"

  # NOTE(review): check_omp_status queries the management plane with
  # "-tenant manager" while labelling it "OMP"; the same mapping is assumed
  # to apply to statusapp - confirm.
  local tenant="${product}"
  if [[ "${product}" == "OMP" ]]
  then
    tenant="manager"
  fi
  local check_cmd="source /opt/oss/manager/bin/engr_profile.sh; ipmc_adm -cmd statusapp -tenant ${tenant} | grep ABNORMAL"

  # Load the whole file first: ssh must not run inside a loop whose stdin
  # is the snapshot file, or it would swallow the remaining lines.
  local -a rows=()
  local line
  while read -r line
  do
    rows+=("${line}")
  done < "${result_file}"

  local host role node_ip app_state db_state rest
  # Element 0 is the header line; start from element 1.
  for line in "${rows[@]:1}"
  do
    read -r host role node_ip app_state db_state rest <<< "${line}"

    if [[ "${role}" == "APP" && "${app_state}" != "RUNNING" ]] \
      || { [[ "${role}" == "APP,DB"* ]] \
        && [[ "${app_state}" != "RUNNING" || "${db_state}" != "RUNNING" ]]; }
    then
      echo "${line}" >>"${TASK_LOG}"
      ssh "ossadm@${node_ip}" -o stricthostkeychecking=no "${check_cmd}" >>"${TASK_LOG}"
      DOUBLE_LOG "ERROR" "$line"
    fi
  done
}

##########################################
# Check the state of the management plane (OMP).
# Every 10 seconds the output of `ipmc_adm -cmd statusnodes -tenant manager`
# (lines containing "NA" removed) is written to ${INFO_DIR}/OMP.txt and passed
# to check_status. When the nodes are still not all running 600 seconds after
# start_time, `ipmc_adm -cmd startnode` is sent once, in the background, to
# every node returned by get_node_list.
# Globals:   INFO_DIR, LOG_FILE, OSS_USER, NODE_LIST (read);
#            start_time and max_time are read from the caller's scope
# Arguments: $1 - deadline in epoch seconds; when omitted the caller's
#                 ${timeout} variable is used instead
# Returns:   0 once check_status succeeds, 1 when the deadline has passed
##########################################
function check_omp_status()
{
    # Take the deadline from the argument; fall back to the variable
    # inherited from the caller for backward compatibility.
    local timeout="${1:-${timeout}}"
    DOUBLE_LOG "INFO" "[OMP] Start to check the service status of the management plane."
    local start_mgr_flag="true"
    local current_time exec_code diff_time start_cmd node_ip
    while true
    do
        sleep 10
        # 0. Give up once the deadline has passed.
        current_time=$(date "+%s")
        if [ "${current_time}" -ge "${timeout}" ]
        then
            check_error_status "OMP" "${INFO_DIR}/OMP.txt"
            DOUBLE_LOG "ERROR" "[OMP] Checking the Service Start Timeout."
            return 1
        fi

        # 1. Fetch the OMP status.
        DOUBLE_LOG "INFO" "[OMP] Service Status:"
        ipmc_adm -cmd statusnodes -tenant manager | grep -v "NA" | tee -a "${LOG_FILE}" >"${INFO_DIR}/OMP.txt"
        # NOTE(review): unless pipefail was enabled by a sourced file, this is
        # the exit status of tee (last pipeline stage), not of ipmc_adm. Kept
        # as is because retrying on ipmc_adm's own status could prevent the
        # restart below from ever being reached.
        exec_code=$?
        if [ "${exec_code}" -ne 0 ]
        then
            DOUBLE_LOG "WARN" "[OMP] Exec code: ${exec_code}."
            continue
        fi
        DOUBLE_LOG "INFO" "[OMP] Exec code: ${exec_code}."

        # 2. Evaluate the snapshot.
        check_status "OMP" "${INFO_DIR}/OMP.txt"
        if [ $? -ne 0 ]
        then
            DOUBLE_LOG "INFO" "[OMP] The service is not complete. Please wait..."
            # With no start_time in scope the elapsed time counts as 0, so
            # the restart below is not triggered.
            diff_time=$(( current_time - ${start_time:-${current_time}} ))
            if [[ "${diff_time}" -ge 600 && "${start_mgr_flag}" == "true" ]]
            then
                # Only try to start the nodes once per run.
                start_mgr_flag="false"
                start_cmd="source /opt/oss/manager/bin/engr_profile.sh; ipmc_adm -cmd startnode"
                get_node_list
                # Long-lived ssh sessions to internal nodes: extra connection
                # options are added. NODE_LIST is a whitespace-separated
                # string, so it is expanded unquoted on purpose.
                for node_ip in ${NODE_LIST[@]};do
                    (
                        ssh "${OSS_USER}@${node_ip}" -o ConnectTimeout="${max_time}" -o stricthostkeychecking=no -o ConnectionAttempts=3 -o ServerAliveInterval=10 "${start_cmd}" &>>"${LOG_FILE}"
                        # Report the command that was actually sent.
                        DOUBLE_LOG "INFO" "[OMP] Exec cmd(${node_ip}): ipmc_adm -cmd startnode"
                    )&
                done
                sleep 60
                continue
            fi
            DOUBLE_LOG "INFO" "[OMP] The management plane node has been started.Please wait..."
            continue
        fi
        DOUBLE_LOG "INFO" "[OMP] The management plane node has been started.Please wait..."
        break
    done
}
##########################################
# Check the APP/DB node state of the management plane and of every product.
# Steps:
#   1. source common.sh (from the parent of SCRIPT_PATH) and engr_profile.sh,
#      create INFO_DIR and call init_taskmgr for TASK_ID;
#   2. wait for the management plane through check_omp_status;
#   3. obtain "<product,product,...>|<count>" from query_product.py;
#   4. for each product, poll its node status over ssh every 10 seconds and,
#      if it is still not healthy 600 seconds after the start, send
#      `ipmc_adm -cmd startnodes` once.
# The whole run shares one deadline of max_time (1800) seconds.
# Globals:   SCRIPT_PATH, INFO_DIR, TASK_ID, LOG_FILE, OSS_USER (read)
# Returns:   0 when the number of started products equals the reported count,
#            1 on a management-plane failure, a missing query script or a
#              timeout
##########################################
function check_service_status()
{
    source "$(dirname "${SCRIPT_PATH}")/common.sh"
    # Initialisation
    [ ! -d "${INFO_DIR}" ] && mkdir -p "${INFO_DIR}"
    DOUBLE_LOG "INFO" "Init task..."
    init_taskmgr "${TASK_ID}"
    # Start of the check task
    source /opt/oss/manager/bin/engr_profile.sh

    # start_time, max_time and timeout are also read by check_omp_status
    # through the caller's scope, so their names must not change.
    local start_time
    start_time=$(date "+%s")
    # Time allowed for the product services to start: fixed at 30 minutes.
    local max_time=1800
    local timeout=$(( start_time + max_time ))
    # Check the OMP nodes. Without a healthy management plane there is no
    # point in continuing, and an empty product list must not turn the
    # failure into a success.
    check_omp_status "${timeout}" || return 1

    # Locate the product query script (query_product.py, .pyc, ...); the
    # first match is used so python always receives exactly one script path.
    local query_product=""
    local candidate
    for candidate in "${SCRIPT_PATH}"/../pyscripts/query_product.py*
    do
        if [ -e "${candidate}" ]
        then
            query_product="${candidate}"
            break
        fi
    done
    if [ -z "${query_product}" ]
    then
        DOUBLE_LOG "INFO" "The query_product.py script does not exist."
        return 1
    fi
    local product_info product_len product_list
    product_info=$(python "${query_product}" "get_product_list")
    # ${product_info} is left unquoted on purpose: multi-line output is
    # folded into a single line before awk splits it on "|".
    product_len=$(echo ${product_info} | awk -F'|' '{print $2}')
    product_list=$(echo ${product_info} | awk -F'|' '{print $1}' | sed 's/,/ /g')

    # Extract the value of key $2 from the JSON text $1: all double quotes
    # are removed, then everything after "<key>:" up to the next "," or "}"
    # is printed.
    parse_json()
    {
        echo "${1//\"/}" | sed "s/.*$2:\([^,}]*\).*/\1/"
    }

    # Check the node state of each product
    local count=0
    local product start_cmd_result start_flag current_time query_code
    local nodes node_ip check_cmd exec_code diff_time start_cmd
    for product in ${product_list}
    do
        # Default value -1: the start command has not been executed yet.
        start_cmd_result=-1
        start_flag="true"
        DOUBLE_LOG "INFO" "[${product}] Start to check the product service startup status."
        while true
        do
            sleep 10
            # The start command succeeded: count the product and move on.
            if [ "${start_cmd_result}" -eq 0 ]
            then
                count=$(( count + 1 ))
                break
            fi
            # 0. Check the deadline (start_cmd_result is still non-zero here).
            current_time=$(date "+%s")
            if [ "${current_time}" -ge "${timeout}" ]
            then
                check_error_status "${product}" "${INFO_DIR}/${product}.txt"
                DOUBLE_LOG "ERROR" "[${product}] Checking the Service Start Timeout."
                return 1
            fi

            # 1. Query the product data
            DOUBLE_LOG "INFO" "[${product}] Querying Product Data:"
            cd /opt/oss/manager/tools/resmgr && bash queryproduct.sh -pn "${product}" -output "${INFO_DIR}" &>>"${LOG_FILE}"
            query_code=$?
            if [ "${query_code}" -ne 0 ]
            then
                DOUBLE_LOG "WARN" "[${product}] Query product code: ${query_code}"
                continue
            fi
            DOUBLE_LOG "INFO" "[${product}] Query product code: ${query_code}"
            nodes=$(cat "${INFO_DIR}/nodes_${product}.json")
            node_ip=$(parse_json "${nodes}" "nodemgrip")
            DOUBLE_LOG "INFO" "[${product}] Node ip:${node_ip}"

            # 2. Send the status command to the node
            check_cmd="source /opt/oss/manager/bin/engr_profile.sh; ipmc_adm -cmd statusnodes -tenant ${product} | grep -v NA\ *NA"
            timeout 60 ssh "${OSS_USER}@${node_ip}" -o ConnectTimeout=60 -o stricthostkeychecking=no "${check_cmd}" | tee -a "${LOG_FILE}" >"${INFO_DIR}/${product}.txt"
            # NOTE(review): unless pipefail was enabled by a sourced file,
            # this is the exit status of tee, not of the ssh command.
            exec_code=$?
            if [ "${exec_code}" -ne 0 ]
            then
                DOUBLE_LOG "WARN" "[${product}] Exec code: ${exec_code}."
                continue
            fi
            DOUBLE_LOG "INFO" "[${product}] Exec code: ${exec_code}."

            # 3. Evaluate the result
            check_status "${product}" "${INFO_DIR}/${product}.txt"
            if [ $? -ne 0 ]
            then
                diff_time=$(( current_time - start_time ))
                # tool_reboot_ospatch.sh no longer calls the service status
                # check; when the services are still not healthy after
                # 10 minutes the start command is issued from here.
                if [[ "${diff_time}" -ge 600 && "${start_flag}" == "true" ]]
                then
                    start_flag="false"
                    # Long-lived ssh session to an internal node: extra
                    # connection options are added.
                    start_cmd="source /opt/oss/manager/bin/engr_profile.sh; ipmc_adm -cmd startnodes -tenant ${product}"
                    ssh "${OSS_USER}@${node_ip}" -o ConnectTimeout="${max_time}" -o stricthostkeychecking=no -o ConnectionAttempts=3 -o ServerAliveInterval=10 "${start_cmd}" &>>"${LOG_FILE}"
                    # A successful start command means this product's
                    # services are considered started.
                    start_cmd_result=$?
                    DOUBLE_LOG "INFO" "[${product}] Exec cmd(${node_ip}): ipmc_adm -cmd startnodes -tenant ${product}.ret_code:${start_cmd_result}"
                    continue
                fi
                DOUBLE_LOG "INFO" "[${product}] The service is not complete. Please wait..."
                continue
            fi
            count=$(( count + 1 ))
            break
        done
        DOUBLE_LOG "INFO" "[${product}] Count: ${count}"
        DOUBLE_LOG "INFO" "[${product}] Product len: ${product_len}"
        # 6. The management plane and all product services are healthy.
        if [ "${count}" -eq "${product_len}" ]
        then
            DOUBLE_LOG "INFO" "All Services have been started, install was successful..."
            return 0
        fi
    done
}

##########################################
# Final processing before the script exits.
# Removes the scratch directory INFO_DIR, writes "Progress=100" to
# TASK_PROGRESS and writes "Status=SUCCESS" (result code 0) or "Status=Fail"
# (any other result code) to TASK_STATUS.
# Globals:   INFO_DIR (removed); TASK_PROGRESS, TASK_STATUS (overwritten)
# Arguments: $1 - result code of the check
# Returns:   the result code it was given
##########################################
function do_return()
{
    local result_code="${1}"

    # Quoted and guarded so that an empty value or a path with whitespace can
    # never turn into a different rm target.
    if [[ -n "${INFO_DIR}" && -d "${INFO_DIR}" ]]
    then
        rm -rf -- "${INFO_DIR}"
    fi

    # The task is finished either way; only the status differs.
    echo "Progress=100" >"${TASK_PROGRESS}"
    if [ "${result_code}" -eq 0 ]
    then
        echo "Status=SUCCESS" >"${TASK_STATUS}"
    else
        echo "Status=Fail" >"${TASK_STATUS}"
    fi
    return "${result_code}"
}

##########################################
# Collect the node IPs.
# An embedded Python snippet reads /opt/oss/manager/etc/sysconf/nodelists.json
# and prints, space separated, the "IP" of every entry in each node's
# "IPAddresses" whose "usage" contains "maintenance". On any error it prints
# a line containing the word "exception" instead.
# Globals:   NODE_LIST (written: unique IPs, sorted, one per line)
# Returns:   1 when the snippet reported an exception (NODE_LIST untouched)
##########################################
function get_node_list() {
    local all_node
    all_node=$(
. /opt/oss/manager/bin/engr_profile.sh
python <<'PEOF'
import json
import sys

NODE_FILE = r'/opt/oss/manager/etc/sysconf/nodelists.json'
maintenance_ips = []
try:
    with open(NODE_FILE, mode="r") as handle:
        topology = json.load(handle)
    for node in topology.get("nodeList", {}).values():
        for address in node.get("IPAddresses", {}):
            if "maintenance" in address.get("usage"):
                maintenance_ips.append(address.get("IP"))
    print(" ".join(maintenance_ips))
    sys.exit(0)
except Exception as _:
    print("exception occur when read {}".format(NODE_FILE))
    sys.exit(1)
PEOF
)
    # The snippet signals failure through its output text.
    case "${all_node}" in
        *exception*)
            DOUBLE_LOG "INFO" "query node info failed:${all_node}."
            return 1
            ;;
    esac

    # One address per line, duplicates removed.
    NODE_LIST=$(echo "${all_node}" | xargs -n1 | sort -u)
}

# Entry point: refuse to run as the wrong user, run the service checks with
# the original command-line arguments (quoted so each argument stays one
# word), record the final task state and exit with the check's result code.
check_user || exit $?
check_service_status "$@"
do_return $?
exit $?

