#!/bin/bash
##################################################
# Resolve the directory that contains this script and the script's file name
##################################################
# Every expansion is quoted so that a path containing spaces still resolves.
# pwd only runs when cd succeeded, so a failed cd yields an empty value
# instead of silently reporting the caller's working directory.
SCRIPT_PATH=$(cd "$(dirname "$0")" >/dev/null && pwd)
SELF_FILE=$(basename "$0")

##################################################
# Initialise the log directory, the script log file and the per-run info directory
##################################################
LOG_PATH="/opt/oss/log/manager/easysuite_upgrade/scriptlog"
# mkdir -p is a no-op when the directory already exists, so no -d guard is needed.
mkdir -p "${LOG_PATH}"
# Strip only a trailing ".sh"; a global substitution would also eat ".sh"
# appearing in the middle of the script name.
LOG_FILE="${LOG_PATH}/${SELF_FILE%.sh}.log"
# One info directory per run, named after the start time (second resolution).
INFO_DIR="/opt/oss/log/manager/easysuite_upgrade/product_info_$(date +%Y%m%d%H%M%S)"
mkdir -p "${INFO_DIR}"
# Restrict the info directory to its owner.
chmod 700 "${INFO_DIR}"

##################################################
# Timeout configuration
##################################################
TIMEOUT_CONFIG="/opt/upgrade/easysuite_upgrade/scripts/common/NCE-Common/envpatch/timeout.ini"

# Print the value of a "key=value" entry.
# Arguments: $1 - key to look up (matched as a whole word)
#            $2 - file to read (defaults to TIMEOUT_CONFIG)
# Outputs:   the text after the first "=", with surrounding blanks and a
#            trailing carriage return removed, so "key = 1800" and CRLF
#            files yield a clean value that is safe to embed in options.
function _get_timeout_value()
{
    local key="${1}"
    local config_file="${2:-${TIMEOUT_CONFIG}}"
    grep -w -- "${key}" "${config_file}" \
        | awk -F "=" '{ sub(/^[ \t\r]+/, "", $2); sub(/[ \t\r]+$/, "", $2); print $2 }'
}

MANAGER_TIMEOUT=$(_get_timeout_value start_manager_timeout)
PRODUCT_TIMEOUT=$(_get_timeout_value start_product_timeout)
# Exit status that is compared against the result of timeout(1) to detect a timeout.
TIMEOUT_ERROR=124

##################################################
# Verify the executing user: the script must be run by the account that
# owns uid 3001 (ossadm according to the original note)
##################################################
function check_user()
{
    local current_user
    current_user=$(whoami)
    # oss_user is deliberately left as a global, exactly as before.
    oss_user=$(id -nu 3001)

    # Guard clause: the expected account may proceed.
    [ "${current_user}" = "${oss_user}" ] && return 0

    echo "[$(date +'%Y-%m-%d %H:%M:%S')]| User have no permission to run this script"
    return 1
}

##########################################
# Logging helper
#
# Arguments: $1 - log level (e.g. INFO, WARN, ERROR)
#            $2 - message text
# Globals:   LOG_FILE, TASK_LOG (read) - files the line is appended to
# Outputs:   "[timestamp] level | message" on stdout and in every configured
#            log file
# Returns:   exit status of tee
##########################################
function DOUBLE_LOG()
{
    local level=${1}
    local message=${2}
    local date_time
    date_time=$(date '+%Y-%m-%d %H:%M:%S')

    # Collect only the targets that are set: quoting keeps paths with spaces
    # intact, and skipping empty values avoids handing tee an empty file name
    # when TASK_LOG has not been initialised yet.
    local -a targets=()
    [ -n "${LOG_FILE}" ] && targets+=("${LOG_FILE}")
    [ -n "${TASK_LOG}" ] && targets+=("${TASK_LOG}")

    printf '%s\n' "[${date_time}] ${level} | ${message}" | tee -a "${targets[@]}"
}

##########################################
# Refresh the EasySuite task state
#
# Arguments: $1 - progress value written as "Progress=<value>"
#            $2 - status value written as "Status=<value>"
# Globals:   TASK_PROGRESS, TASK_STATUS (read) - files that are overwritten
##########################################
function fresh_es_task()
{
    local progress="${1}"
    local status="${2}"
    # Redirect targets are quoted: an unquoted target that contains a space
    # makes bash abort the write with "ambiguous redirect".
    printf '%s\n' "Progress=${progress}" >"${TASK_PROGRESS}"
    printf '%s\n' "Status=${status}" >"${TASK_STATUS}"
}

##################################################
# Initialise the task parameters
#
# Arguments: $1 - task id
# Globals:   TASK_MGR_PATH (read); TASK_ID, TASK_LOG, TASK_PROGRESS,
#            TASK_STATUS (written)
##################################################
function init_params()
{
    # The only script argument that is consumed is the task id.
    TASK_ID=${1}

    # Per-task log, progress and status files all live in the same task
    # directory; the latter two are derived from the directory of the first.
    TASK_LOG="${TASK_MGR_PATH}/${TASK_ID}/task.log"
    TASK_PROGRESS="${TASK_LOG%/*}/task.progress"
    TASK_STATUS="${TASK_LOG%/*}/task.status"

    # init_taskmgr is not defined in this file; it is expected to be in scope
    # by the time this function runs.
    init_taskmgr "${TASK_ID}"
}

##########################################
# Collect the maintenance IP of every management node
#
# Globals: NODE_LIST (written) - unique IPs, one per line
# Returns: 0 on success; 1 when the node file cannot be read, the Python
#          helper fails for any reason, or no management node is found
##########################################
function get_manager_node_list()
{
    # Declaration and assignment are split so the exit status of the command
    # substitution is not masked by "local".
    local all_node
    all_node=$(
. /opt/oss/manager/bin/engr_profile.sh
python <<'PEOF'
import json
import sys
node_file = r'/opt/oss/manager/etc/sysconf/nodelists.json'
manager_ips = []
try:
    with open(node_file, mode="r") as f:
        node_info = json.load(f)
    for _, one_node in node_info.get("nodeList", {}).items():
        # Only nodes whose role list contains MGR are of interest.
        if "MGR" not in one_node.get("role", []):
            continue
        # Take the first address flagged for maintenance use.
        for ip_info in one_node.get("IPAddresses", {}):
            if "maintenance" in ip_info.get("usage", []):
                manager_ips.append(ip_info.get("IP", ""))
                break
    print(" ".join(manager_ips))
    sys.exit(0)
except Exception as _:
    print("exception occur when read {}".format(node_file))
    sys.exit(1)
PEOF
)
    local query_result=$?

    # A non-zero status also covers a missing interpreter or a crash that
    # never reaches the "exception" message.
    if [ "${query_result}" -ne 0 ] || [[ "${all_node}" =~ "exception" ]]
    then
        DOUBLE_LOG "ERROR" "query node info failed:${all_node}."
        return 1
    fi

    NODE_LIST=$(echo "${all_node}" | xargs -n1 | sort -u)
    # Without any management node there is nothing to restart; report it
    # instead of letting the caller continue with an empty list.
    if [ -z "${NODE_LIST}" ]
    then
        DOUBLE_LOG "ERROR" "no manager node found in node info."
        return 1
    fi
    DOUBLE_LOG "INFO" "manager nodes:${NODE_LIST}."
}

##########################################
# Parse the APP/DB node status report
##########################################
# Count the data rows of a report (first line skipped) whose
# whitespace-separated column contains a pattern.
# Arguments: $1 - report file, $2 - column number, $3 - pattern
function _count_status_rows()
{
    awk -v col="${2}" -v pat="${3}" \
        'NR > 1 && $col ~ pat { n++ } END { print n + 0 }' "${1}"
}

# Arguments: $1 - product name (used in log messages only)
#            $2 - report file
# Returns:   0 when the report lists at least one APP and one DB node and
#            every such node is RUNNING; 1 otherwise (including a report
#            that contains no "RUNNING" at all or cannot be read)
function check_status()
{
    local product="${1}"
    local result_file="${2}"
    local app_flag="False"
    local db_flag="False"
    local app_node_num app_running_node_num
    local db_node_num db_running_node_num

    # A report without any RUNNING entry is treated as invalid.
    grep -q "RUNNING" "${result_file}" 2>/dev/null || return 1

    # APP state: column 2 names the role, column 4 carries the APP status.
    app_node_num=$(_count_status_rows "${result_file}" 2 "APP")
    app_running_node_num=$(_count_status_rows "${result_file}" 4 "RUNNING")
    DOUBLE_LOG "INFO" "[${product}] APP Node sum: ${app_node_num}"
    DOUBLE_LOG "INFO" "[${product}] APP Normal node: ${app_running_node_num}"
    if [[ "${app_node_num}" -ne 0 && "${app_node_num}" -eq "${app_running_node_num}" ]]
    then
        app_flag="True"
        DOUBLE_LOG "INFO" "[${product}] All ${product} service nodes are running properly."
    else
        DOUBLE_LOG "WARN" "[${product}] Some ${product} service nodes are abnormal."
    fi

    # DB state: column 2 names the role, column 5 carries the DB status.
    db_node_num=$(_count_status_rows "${result_file}" 2 "DB")
    db_running_node_num=$(_count_status_rows "${result_file}" 5 "RUNNING")
    DOUBLE_LOG "INFO" "[${product}] DB Node sum: ${db_node_num}"
    DOUBLE_LOG "INFO" "[${product}] DB Normal node: ${db_running_node_num}"
    if [[ "${db_node_num}" -ne 0 && "${db_node_num}" -eq "${db_running_node_num}" ]]
    then
        db_flag="True"
        DOUBLE_LOG "INFO" "[${product}] All ${product} database nodes are running properly."
    else
        DOUBLE_LOG "WARN" "[${product}] Some ${product} database nodes are abnormal."
    fi

    if [[ "${app_flag}" == "False" || "${db_flag}" == "False" ]]
    then
        return 1
    fi
    return 0
}

##################################################
# Restart the management plane (the original note gives a 30 min timeout
# per node; the actual value is MANAGER_TIMEOUT, in seconds)
#
# Globals: MANAGER_TIMEOUT, OSS_USER, INFO_DIR, LOG_FILE, NODE_LIST (read)
# Returns: 0 once every management node reports a healthy state; 1 on an
#          invalid timeout, a failed node query, a failed stop/start
#          command or when the timeout expires
##################################################
function restart_omp()
{
    local stop_mgr_cmd="source /opt/oss/manager/bin/engr_profile.sh; ipmc_adm -cmd stopmgr"
    local start_mgr_cmd="source /opt/oss/manager/bin/engr_profile.sh; ipmc_adm -cmd startmgr"
    local fail_flag="${INFO_DIR}/fail.flag"
    local status_file="${INFO_DIR}/OMP.txt"
    local manager_timeout="${MANAGER_TIMEOUT//[[:space:]]/}"
    local node_ip deadline current_time
    local -a ssh_opts

    DOUBLE_LOG "INFO" "begin to restart manager."

    # The timeout feeds both the ssh option and the deadline arithmetic; a
    # missing or non-numeric value would otherwise disable the deadline check
    # and leave the polling loop running forever.
    if [[ ! "${manager_timeout}" =~ ^[0-9]+$ ]]
    then
        DOUBLE_LOG "ERROR" "invalid start_manager_timeout value: '${MANAGER_TIMEOUT}'."
        fresh_es_task "100" "fail"
        return 1
    fi
    # Force base 10 so a value with leading zeros is not read as octal.
    manager_timeout=$((10#${manager_timeout}))
    ssh_opts=(-o "ConnectTimeout=${manager_timeout}" -o stricthostkeychecking=no -o ConnectionAttempts=3 -o ServerAliveInterval=10)

    if ! get_manager_node_list
    then
        # Keep the task state consistent with every other failure path.
        fresh_es_task "100" "fail"
        return 1
    fi

    # Restart the management nodes concurrently, one background subshell per
    # node. NODE_LIST is a whitespace-separated string, so it is expanded
    # unquoted on purpose.
    for node_ip in ${NODE_LIST}
    do
        (
            DOUBLE_LOG "INFO" "stop manager on node: ${node_ip}."
            if ! ssh "${OSS_USER}@${node_ip}" "${ssh_opts[@]}" "${stop_mgr_cmd}" &>>"${LOG_FILE}"
            then
                DOUBLE_LOG "ERROR" "failed to stop manager on node: ${node_ip}."
                touch "${fail_flag}"
                exit 1
            fi

            DOUBLE_LOG "INFO" "start manager on node: ${node_ip}."
            if ! ssh "${OSS_USER}@${node_ip}" "${ssh_opts[@]}" "${start_mgr_cmd}" &>>"${LOG_FILE}"
            then
                DOUBLE_LOG "ERROR" "failed to start manager on node: ${node_ip}."
                touch "${fail_flag}"
                exit 1
            fi
        ) &
    done

    # The background jobs signal failure through the flag file; poll it
    # together with the node status until the deadline.
    # NOTE(review): polling does not wait for the background jobs to finish,
    # so a status read taken before a node has actually been stopped could
    # look healthy - confirm whether the jobs should be awaited first.
    deadline=$(( $(date "+%s") + manager_timeout ))
    while true
    do
        sleep 60
        current_time=$(date "+%s")
        # Timed out
        if (( current_time >= deadline ))
        then
            DOUBLE_LOG "ERROR" "restart manager timeout: ${manager_timeout} seconds."
            fresh_es_task "100" "fail"
            return 1
        fi
        # A stop/start command failed
        if [ -f "${fail_flag}" ]
        then
            DOUBLE_LOG "ERROR" "failed to restart manager."
            fresh_es_task "100" "fail"
            return 1
        fi

        # Query the management service status; retry when nothing usable
        # comes back (grep selected no line).
        ipmc_adm -cmd statusnodes -tenant manager | grep -v "NA" >"${status_file}" || continue

        if ! check_status "OMP" "${status_file}"
        then
            DOUBLE_LOG "INFO" "the manager status is not ready, please wait."
            continue
        fi

        DOUBLE_LOG "INFO" "finish to restart manager."
        fresh_es_task "100" "success"
        # Leave the loop on success; without this the loop keeps polling
        # until the deadline and then overwrites the result with "fail".
        return 0
    done
}

##################################################
# Restart the product plane (the original note gives a 60 min timeout per
# product; the actual value is PRODUCT_TIMEOUT)
#
# Globals: SCRIPT_PATH, PRODUCT_TIMEOUT, TIMEOUT_ERROR, OSS_USER, INFO_DIR,
#          LOG_FILE (read)
# Returns: 0 when every product was stopped and started again; 1 on the
#          first failure (the task state is set to fail before returning)
##################################################
function restart_nce()
{
    local mgr_task_error="Please log in to the management plane to view the task failure details."
    local load_profile="source /opt/oss/manager/bin/engr_profile.sh"
    local product_timeout="${PRODUCT_TIMEOUT//[[:space:]]/}"
    local query_product="" candidate product_info product_list
    local product nodes_file node_ip action result
    local -a ssh_opts

    # Locate the product query script with a glob instead of parsing ls;
    # the trailing * accepts any suffix after ".py".
    for candidate in "${SCRIPT_PATH}"/../pyscripts/query_product.py*
    do
        if [ -e "${candidate}" ]
        then
            query_product="${candidate}"
            break
        fi
    done
    if [ -z "${query_product}" ]
    then
        DOUBLE_LOG "ERROR" "query_product script not found under ${SCRIPT_PATH}/../pyscripts."
        fresh_es_task "100" "fail"
        return 1
    fi

    # A failed query must not be mistaken for "no product to restart".
    if ! product_info=$(python "${query_product}" "get_product_list")
    then
        DOUBLE_LOG "ERROR" "failed to query product list."
        fresh_es_task "100" "fail"
        return 1
    fi

    # Without a timeout value, timeout(1) would take "ssh" as its duration.
    if [ -z "${product_timeout}" ]
    then
        DOUBLE_LOG "ERROR" "start_product_timeout is not configured."
        fresh_es_task "100" "fail"
        return 1
    fi
    ssh_opts=(-o "ConnectTimeout=${product_timeout}" -o stricthostkeychecking=no -o ConnectionAttempts=3 -o ServerAliveInterval=10)

    # The product names are the comma-separated first "|" field of the
    # (flattened) query output.
    product_list=${product_info//$'\n'/ }
    product_list=${product_list%%|*}
    product_list=${product_list//,/ }

    # Restart the product plane
    DOUBLE_LOG "INFO" "begin to restart product."
    fresh_es_task "10" "running"
    DOUBLE_LOG "INFO" "product_list: ${product_list}."
    # product_list is a whitespace-separated string; split on purpose.
    for product in ${product_list}
    do
        DOUBLE_LOG "INFO" "start query product info for ${product}."
        if ! bash /opt/oss/manager/tools/resmgr/queryproduct.sh -pn "${product}" -output "${INFO_DIR}" &>>"${LOG_FILE}"
        then
            DOUBLE_LOG "ERROR" "failed to query product info for ${product}."
            fresh_es_task "100" "fail"
            return 1
        fi

        # The commands are sent to the management IP of the first listed host.
        nodes_file="${INFO_DIR}/nodes_${product}.json"
        node_ip=$(python -c "import json; import sys; obj=json.load(sys.stdin); print(obj['hostlist'][0]['nodemgrip'])" 2>/dev/null <"${nodes_file}")
        if [ -z "${node_ip}" ]
        then
            DOUBLE_LOG "ERROR" "failed to get node ip of ${product} from ${nodes_file}."
            fresh_es_task "100" "fail"
            return 1
        fi

        # Stop, then start; both steps share the same command shape and
        # error handling.
        for action in stop start
        do
            DOUBLE_LOG "INFO" "begin to ${action} ${product} on ${node_ip}."
            timeout "${product_timeout}" ssh "${OSS_USER}@${node_ip}" "${ssh_opts[@]}" "${load_profile}; ipmc_adm -cmd ${action}nodes -tenant ${product}" &>>"${LOG_FILE}"
            result=$?
            if [ "${result}" -eq "${TIMEOUT_ERROR}" ]
            then
                DOUBLE_LOG "ERROR" "${action} ${product} timeout: ${product_timeout} seconds."
                fresh_es_task "100" "fail"
                return 1
            elif [ "${result}" -ne 0 ]
            then
                DOUBLE_LOG "ERROR" "failed to ${action} ${product} on ${node_ip}. ${mgr_task_error}"
                fresh_es_task "100" "fail"
                return 1
            fi
            DOUBLE_LOG "INFO" "finish to ${action} ${product} on ${node_ip}."
        done
    done
    DOUBLE_LOG "INFO" "finish to restart product."
    fresh_es_task "50" "running"
}

##################################################
# Main entry point
#
# Arguments: forwarded unchanged to init_params
# Returns:   1 when the user check or the product restart fails, otherwise
#            the status of restart_omp
##################################################
function main()
{
    # Verify the executing user
    check_user || return 1

    # Load the EasySuite common helpers and the platform environment
    # variables. The path is quoted so an installation directory that
    # contains spaces is still sourced as a single file.
    # NOTE(review): common.sh presumably provides init_taskmgr, OSS_USER and
    # TASK_MGR_PATH, none of which is defined in this file - confirm.
    source "${SCRIPT_PATH}/../common.sh"
    source /opt/oss/manager/bin/engr_profile.sh

    # Initialise the task parameters
    init_params "$@"

    # Restart the product plane first, then the management plane
    restart_nce || return 1
    restart_omp
}


# Run the entry point with the script's arguments and hand its exit status
# back to the caller.
main "$@"
exit "$?"