#!/usr/bin/env bash

# Audit log: record who started the repair. The cut pair keeps the text found
# between the first "(" and ")" of the `who -m` output (presumably the login
# origin -- confirm against the target OS).
logger -p local0.info "$(who -m | cut -d\( -f2 |cut -d\) -f1); start to one_button_repair"
# Absolute directory that contains this script. The expansions are quoted so a
# path containing whitespace survives, and `&&` prevents a failed cd from
# silently yielding the caller's working directory.
CURRENT_PATH=$(cd "$(dirname "$0")" && pwd)
# Installation directory of the entTools package on every node.
ENT_TOOLS_DIR="/opt/entTools"
# Progress file: each finished step appends "<step>=success" so that a re-run
# can skip the steps that already completed.
FAULTY_EXEC_FLAG="/etc/faulty_exec_flag"

# Terminate the script with status 1 unless `whoami` reports "root".
# Globals written: current_user
function check_user()
{
    current_user=$(whoami)
    [[ "$current_user" == "root" ]] && return 0

    echo "Please switch root user."
    exit 1
}
# Run the root check now; the script exits here when invoked by a non-root user.
check_user

# Terminate the script with status 1 unless the agent id stored in
# /opt/oss/manager/var/agent/mcagentid.conf (the value after "=") is 0 or 1.
# Globals written: current_node_id
function check_execute_node_id()
{
    current_node_id=$(cat /opt/oss/manager/var/agent/mcagentid.conf | awk -F'=' '{print $2}')
    case "$current_node_id" in
        "0"|"1")
            # Allowed node, nothing to do.
            ;;
        *)
            echo "Please switch master or slave node."
            exit 1
            ;;
    esac
}
# Run the node-id check now; the script exits here unless the agent id is 0 or 1.
check_execute_node_id

# Probe an address with five ICMP echo requests.
# Arguments: $1 - address to ping
# Outputs:   a warning on stdout when the ping command fails
# Returns:   0 when ping succeeds, 1 otherwise
function check_ip_ping()
{
    if ping $1 -c 5 >/dev/null 2>&1
    then
        return 0
    fi

    echo "Can't ping $1,please confirm."
    return 1
}

# Identify the single faulty node from `ipmc_adm -cmd statusnodes` output.
#
# The command output is saved to /etc/faulty_node_status. The first four
# columns of each row are scanned for problem states; column 1 is compared
# against the node names "Analyzer-01"/"Analyzer-02", column 3 is reported as
# the node IP, and column 5 must read RUNNING on the healthy peers
# (presumably the DB state, judging by the error message -- confirm).
#
# Globals read:    FAULTY_EXEC_FLAG
# Globals written: product_name, problem_node_num, faulty_node_ip (array),
#                  faulty_node_ip_num, faulty_node_type, db_run_node_num
# Returns: 0 when check_and_init_param is already recorded as done, or when
#          exactly one faulty node was found, its peers look healthy and it
#          answers ping; 1 otherwise.
function get_faulty_node_ip()
{
    local status_file="/etc/faulty_node_status"
    local node_rows faulty_rows
    local peer_analyzer=""

    # product_name is resolved even on a resumed run because later steps use it.
    product_name=$(ls /opt/oss/manager/var/tenants/ | grep -v manager | grep Insight)

    if grep check_and_init_param "${FAULTY_EXEC_FLAG}" | grep -q success
    then
        return 0
    fi

    if ! su - ossadm -c "source /opt/oss/manager/bin/engr_profile.sh;ipmc_adm -cmd statusnodes -tenant ${product_name}" > /etc/faulty_node_status
    then
        echo "Get the node status failed,please confirm."
        return 1
    fi

    # Only the first four columns take part in the state matching.
    node_rows=$(awk '{print $1,$2,$3,$4}' "${status_file}")
    problem_node_num=$(grep -E -c 'UNKNOWN|FAULT|NOT EXIST|STOPPING|STARTING|STOPPED|FAILURE|NA' <<< "${node_rows}")
    faulty_rows=$(grep -E 'UNKNOWN|NA|STOPPED' <<< "${node_rows}")
    faulty_node_ip=($(awk '{print $3}' <<< "${faulty_rows}"))
    faulty_node_ip_num=${#faulty_node_ip[*]}

    # Anything other than exactly one problem node that is also repairable
    # is left to the operator.
    if [[ ${problem_node_num} -ne 1 || ${faulty_node_ip_num} -ne 1 ]]
    then
        echo "Check the node status by referring to the product documentation."
        return 1
    fi

    echo "Faulty node ip is ${faulty_node_ip}"
    faulty_node_type=$(awk '{print $1}' <<< "${faulty_rows}")

    case "${faulty_node_type}" in
        "Analyzer-01") peer_analyzer="Analyzer-02" ;;
        "Analyzer-02") peer_analyzer="Analyzer-01" ;;
    esac

    if [[ -n "${peer_analyzer}" ]]
    then
        # One Analyzer is down: the other Analyzer must report RUNNING in column 5.
        if ! grep "${peer_analyzer}" "${status_file}" | awk '{print $5}' | grep "RUNNING" >/dev/null 2>&1
        then
            echo "Check the node DB status failed, please confirm."
            return 1
        fi
    else
        # Any other node is down: exactly two rows must report RUNNING in column 5.
        db_run_node_num=$(awk '{print $5}' "${status_file}" | grep -c 'RUNNING')
        if [[ ${db_run_node_num} -ne 2 ]]
        then
            echo "Check the node DB status failed, please confirm."
            return 1
        fi
    fi

    check_ip_ping ${faulty_node_ip}
    return $?
}

# Run /opt/SEK/cmd/SetEnv.sh quietly unless check_and_init_param is already
# recorded as done. (Judging by the function name the script enables root
# login -- its content is not visible here, confirm.)
# Globals read: FAULTY_EXEC_FLAG
# Returns: 0 when skipped, otherwise the exit status of SetEnv.sh.
function allowed_root_login()
{
    if grep check_and_init_param "${FAULTY_EXEC_FLAG}" | grep -q success
    then
        return 0
    fi

    bash /opt/SEK/cmd/SetEnv.sh >/dev/null 2>&1
}

# Remove stale files on the executing node before a fresh run. Skipped when
# check_and_init_param is already recorded as done.
# Globals read: FAULTY_EXEC_FLAG, CURRENT_PATH
# Returns: 0 when skipped, otherwise the exit status of the last rm.
function prepare_execute_node_env()
{
    local stale_path

    if grep check_and_init_param "${FAULTY_EXEC_FLAG}" | grep -q success
    then
        return 0
    fi

    # known_hosts of this node is deleted to avoid host-fingerprint problems.
    for stale_path in \
        /root/.ssh/known_hosts \
        /tmp/authorized_keys_flag \
        "${CURRENT_PATH}/../conf/config.properties"
    do
        rm -rf "${stale_path}"
    done
}

# Validate the repair parameters through the entTools launcher and record the
# step as done.
#
# On a resumed run (step already recorded) the node parameters are instead
# reloaded from ${ENT_TOOLS_DIR}/conf/config.properties.
#
# Globals read:    FAULTY_EXEC_FLAG, ENT_TOOLS_DIR, CURRENT_PATH,
#                  faulty_node_root_password
# Globals written: faulty_node_ip, faulty_node_type, faulty_node_id (resume only)
# Returns: 0 on success, 1 when the launcher reports a failure.
function check_and_init_param()
{
    local props="${ENT_TOOLS_DIR}/conf/config.properties"

    if grep check_and_init_param "${FAULTY_EXEC_FLAG}" | grep -q success
    then
        faulty_node_ip=$(grep "faulty_node_ip=" "${props}" | awk -F'=' '{print $2}')
        faulty_node_type=$(grep "faulty_node_type=" "${props}" | awk -F'=' '{print $2}')
        faulty_node_id=$(grep "faulty_node_id=" "${props}" | awk -F'=' '{print $2}')
        return 0
    fi

    # The root password is handed to the launcher on stdin, never on argv.
    if ! "${CURRENT_PATH}/../script/python/bin/python" "${CURRENT_PATH}/../script/launch.pyc" processor -i -exec pre.one_button_repair.check_and_init_param -faulty_node_ip ${faulty_node_ip} -faulty_node_type ${faulty_node_type} << eof
${faulty_node_root_password}
eof
    then
        echo -e "\n"
        echo "Failed to get and check param,exit."
        return 1
    fi

    echo -e "\n"
    echo "Success to get and check param,continue."
    sed -i "/check_and_init_param/d" "${FAULTY_EXEC_FLAG}"
    echo "check_and_init_param=success" >> "${FAULTY_EXEC_FLAG}"
    return 0
}

# Copy the local entTools directory into /root on the faulty node.
#
# peer_ip is computed before the resume check so that it is available to the
# later steps even when this step is skipped: the plain IP when
# config.properties says use_ipv6=IPv4, otherwise the IP wrapped in brackets
# for scp.
#
# Globals read:    ENT_TOOLS_DIR, FAULTY_EXEC_FLAG, faulty_node_ip
# Globals written: use_ipv6, peer_ip, faulty_node_id
# Returns: 0 on success or when already recorded as done; 1 when scp fails.
function upload_entTools_to_faulty_node()
{
    local props="${ENT_TOOLS_DIR}/conf/config.properties"

    use_ipv6=$(grep "use_ipv6=" "${props}" | awk -F'=' '{print $2}')
    peer_ip="[${faulty_node_ip}]"
    if [[ ${use_ipv6} == "IPv4" ]]
    then
        peer_ip=${faulty_node_ip}
    fi

    if grep upload_entTools_to_faulty_node "${FAULTY_EXEC_FLAG}" | grep -q success
    then
        return 0
    fi

    echo "Put entTools to faulty node root dir."
    ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "rm -rf /root/entTools"

    if ! scp -r -o StrictHostKeyChecking=no ${ENT_TOOLS_DIR} "root@${peer_ip}:/root/" >/dev/null 2>&1
    then
        echo "Put entTools to faulty node root dir failed."
        return 1
    fi

    faulty_node_id=$(grep "faulty_node_id=" "${props}" | awk -F'=' '{print $2}')
    case "${faulty_node_id}" in
        "0"|"1")
            ;;
        *)
            # Nodes other than id 0/1 do not keep a copy of the repair script.
            ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "rm -rf /root/entTools/one_button_repair/one_button_repair.sh"
            ;;
    esac

    echo "Put entTools to faulty node root dir successfully."
    sed -i "/upload_entTools_to_faulty_node/d" "${FAULTY_EXEC_FLAG}"
    echo "upload_entTools_to_faulty_node=success" >> "${FAULTY_EXEC_FLAG}"
    return 0
}

# Prepare the faulty node over ssh: run the install check, partition the
# disks, copy entTools from /root to /opt, install the OS patch when this
# node's /proc/version mentions "euleros", then remove /root/entTools.
#
# Globals read:    FAULTY_EXEC_FLAG, ENT_TOOLS_DIR, faulty_node_ip
# Globals written: os_path (EulerOS branch only)
# Returns: 0 on success or when already recorded as done; 1 on the first
#          failing remote step.
function prepare_faulty_node_env()
{
    if grep prepare_faulty_node_env "${FAULTY_EXEC_FLAG}" | grep -q success
    then
        return 0
    fi

    echo "Start to check faulty node install."
    if ! ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "/root/entTools/script/python/bin/python /root/entTools/script/launch.pyc processor -i -exec pre.install.check_items -check_item 'install.check_install'"
    then
        echo "Failed to check faulty node install.exit."
        return 1
    fi

    echo "Start to disk partition for faulty node"
    if ! ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "rm -rf /root/install_config/; mkdir /root/install_config/; cp /root/entTools/disk_partition.sh /root/install_config/; bash /root/install_config/disk_partition.sh"
    then
        echo "Failed to disk partition for faulty node.exit."
        return 1
    fi

    if ! ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "cp -rf /root/entTools /opt/"
    then
        echo "Failed to copy entTools to /opt.exit."
        return 1
    fi

    # The OS is detected on THIS node; the patch is then applied remotely.
    if grep "euleros" /proc/version >/dev/null 2>&1
    then
        echo "Start to install os patch for faulty node"
        os_path="/opt/os_patch"
        # The status of this remote chain is that of its last tar command.
        if ! ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "rm -rf ${os_path}; mkdir ${os_path}; tar -xvf ${ENT_TOOLS_DIR}/*EulerOS*Patch*.tar -C ${os_path} >/dev/null 2>&1; cd ${os_path}; tar -xvf *.tar.gz >/dev/null 2>&1; tar -xvf *.tar.gz >/dev/null 2>&1; cd *Patch; tar -xvf os*_pkg.tar >/dev/null 2>&1; tar -xvf os_sudobin.tar >/dev/null 2>&1"
        then
            echo "Failed to unzip os patch.exit."
            return 1
        fi
        if ! ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "cd ${os_path}/*Patch/script; bash update_*_do.sh install"
        then
            echo "Failed to install os patch.exit."
            return 1
        fi
        # Best effort: the result of the rpm install is not checked.
        ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "cd /usr/local/rpmpkg; rpm -ivh numactl-libs-2.*.rpm >/dev/null 2>&1"
    fi

    if ! ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "rm -rf /root/entTools"
    then
        echo "Failed to rm entTools in root dir.exit."
        return 1
    fi

    echo "Success to prepare faulty node env,continue."
    sed -i "/prepare_faulty_node_env/d" "${FAULTY_EXEC_FLAG}"
    echo "prepare_faulty_node_env=success" >> "${FAULTY_EXEC_FLAG}"
    return 0
}

# Install the OS mediation package on the faulty node, then run the local
# launcher's execute_faulty_node_check processor. Both commands receive
# faulty_node_os_other_password on stdin.
#
# Globals read: FAULTY_EXEC_FLAG, ENT_TOOLS_DIR, CURRENT_PATH, faulty_node_ip,
#               faulty_node_os_other_password
# Returns: 0 on success or when already recorded as done; 1 on failure.
function check_faulty_node_env()
{
    if grep check_faulty_node_env "${FAULTY_EXEC_FLAG}" | grep -q success
    then
        return 0
    fi

    if ! ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "rm -rf ${ENT_TOOLS_DIR}/os_mediation; unzip -o ${ENT_TOOLS_DIR}/*_OSMediation*.zip -d ${ENT_TOOLS_DIR}/os_mediation >/dev/null 2>&1; bash ${ENT_TOOLS_DIR}/os_mediation/install.sh -sudobinpath /opt/sudobin" << eof
${faulty_node_os_other_password}
eof
    then
        echo "Failed to exec os mediation.exit."
        return 1
    fi
    echo "Success to exec os mediation,continue."

    if ! "${CURRENT_PATH}/../script/python/bin/python" "${CURRENT_PATH}/../script/launch.pyc" processor -i -exec pre.one_button_repair.execute_faulty_node_check << eof
${faulty_node_os_other_password}
eof
    then
        echo "Failed to check faulty node env,exit."
        return 1
    fi

    # Best effort: the backup directory creation is not checked.
    ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "mkdir /opt/backup/ncendp; chown 3004:ossgroup /opt/backup/ncendp"
    echo "Success to check faulty node env,continue."
    sed -i "/check_faulty_node_env/d" "${FAULTY_EXEC_FLAG}"
    echo "check_faulty_node_env=success" >> "${FAULTY_EXEC_FLAG}"
    return 0
}

# Create the storage volumes on the faulty node by running the remote
# entTools `pre.install.create_vol` processor over ssh.
#
# Globals read:    FAULTY_EXEC_FLAG, ENT_TOOLS_DIR, faulty_node_ip,
#                  faulty_node_type
# Globals written: is_mda_scenario - exit status of listing
#                  /opt/oss/*/apps/NdpToolService/ on this node (0 when it
#                  exists)
# Outputs: a success or failure message on stdout.
# Returns: 0 on success or when the step is already recorded as done;
#          1 when the remote create_vol call fails.
function create_vol_for_faulty_node()
{
    local net_type_list sub_node_type fix_bigdata_owner

    # Probe BEFORE the resume short-circuit. repair_faulty_node and
    # migrate_scripts read is_mda_scenario afterwards; if it were left unset
    # on a resumed run, their "[[ ... -eq 0 ]]" tests would treat the empty
    # value as 0 and take the NdpToolService path on nodes that lack it.
    ls /opt/oss/*/apps/NdpToolService/  >/dev/null 2>&1
    is_mda_scenario=$?

    if grep create_vol_for_faulty_node "${FAULTY_EXEC_FLAG}" 2>/dev/null | grep -q success
    then
        return 0
    fi

    # Collector nodes, and any node when NdpToolService is absent locally, get
    # the collector layout; everything else gets the FI layout and
    # additionally needs /srv/BigData/ re-owned afterwards.
    if [[ ${faulty_node_type} =~ "Collector" || ${is_mda_scenario} -ne 0 ]]
    then
        net_type_list="collect_server"
        sub_node_type="fabric_collect_service"
        fix_bigdata_owner=0
    else
        net_type_list="min_cluster_first"
        sub_node_type="main_fi_fabric"
        fix_bigdata_owner=1
    fi

    if ! ssh -o StrictHostKeyChecking=no "${faulty_node_ip}" "${ENT_TOOLS_DIR}/script/python/bin/python ${ENT_TOOLS_DIR}/script/launch.pyc processor -i -exec pre.install.create_vol -localNETypeList ${net_type_list} -selectedNetType classic -sub_node_type ${sub_node_type}"
    then
        echo "Failed to create vol for faulty node,exit."
        return 1
    fi

    if [[ ${fix_bigdata_owner} -eq 1 ]]
    then
        # Best effort, as before: the chown result is not checked.
        ssh -o StrictHostKeyChecking=no "${faulty_node_ip}" "chown -R 3004:ossgroup /srv/BigData/"
    fi

    echo "Success to create vol for faulty node,continue."
    sed -i "/create_vol_for_faulty_node/d" "${FAULTY_EXEC_FLAG}"
    echo "create_vol_for_faulty_node=success" >> "${FAULTY_EXEC_FLAG}"
    return 0
}

# Run the actual node repair: optional FI pre-actions, the launcher's
# repair_faulty_node processor, `sshd_ipsadm.sh -cmd restore -force` as
# ossadm, and (for Analyzer nodes when is_mda_scenario is 0) the FI
# post-actions.
#
# Globals read:    FAULTY_EXEC_FLAG, CURRENT_PATH, faulty_node_ip, peer_ip,
#                  faulty_node_type, is_mda_scenario, product_name,
#                  faulty_node_os_other_password, faulty_node_root_password
# Globals written: namenode_conf (FI branch only)
# Returns: 0 on success or when already recorded as done; 1 on failure.
function repair_faulty_node()
{
    local launcher_python="${CURRENT_PATH}/../script/python/bin/python"
    local launcher_entry="${CURRENT_PATH}/../script/launch.pyc"
    local fi_repair=1

    if grep repair_faulty_node "${FAULTY_EXEC_FLAG}" | grep -q success
    then
        return 0
    fi

    # 0 means: Analyzer node and is_mda_scenario equals 0.
    if [[ ${faulty_node_type} == *"Analyzer"* && ${is_mda_scenario} -eq 0 ]]
    then
        fi_repair=0
    fi

    if [[ ${faulty_node_type} != *"Collector"* ]]
    then
        scp -o StrictHostKeyChecking=no /etc/hosts "root@${peer_ip}:/etc/hosts" >/dev/null 2>&1
    fi

    # Deploying the NAIE model package depends on FI's ficommon group, but the
    # product restores FI last; work around it by creating /opt/infers early.
    if [[ ${faulty_node_type} == *"Analyzer"* && -d "/opt/infers" ]]
    then
        ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "mkdir /opt/infers; chmod 750 /opt/infers; chown 3004:ossgroup /opt/infers"
    fi

    if [[ ${fi_repair} -eq 0 ]]
    then
        ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "rm -rf /tmp/ficluster_conf.json; rm -rf /tmp/dayu_repair; mkdir /tmp/dayu_repair"
        scp /opt/oss/share/*/NdpToolService/ficluster_conf.json "root@${peer_ip}:/tmp/"
        ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "chown 3004:ossgroup /tmp/ficluster_conf.json"
        scp /opt/oss/share/*/NdpToolService/cluster_instance.ini "root@${peer_ip}:/tmp/dayu_repair/"
        ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "chown -R 3004:ossgroup /tmp/dayu_repair/cluster_instance.ini"
        # Run FI's pre-repair action.
        if ! bash /opt/NdpTools/repair_before_restore.sh ${faulty_node_ip}
        then
            echo "Failed to exec repair_before_restore.sh,exit."
            return 1
        fi
    fi

    if ! "${launcher_python}" "${launcher_entry}" processor -i -exec cloudsop.one_button_repair.repair_faulty_node << eof
${faulty_node_os_other_password}
eof
    then
        echo "Failed to repair faulty node,exit."
        return 1
    fi

    if ! su - ossadm -c ". /opt/oss/manager/bin/engr_profile.sh; sshd_ipsadm.sh -cmd restore -force"
    then
        echo "Failed to restore node,exit."
        return 1
    fi

    # Some microservices create directories under the coredump path when they
    # start, which requires the permissions to be correct.
    ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "chmod 770 /opt/oss/log/NCE/coredump"

    if [[ ${fi_repair} -eq 0 ]]
    then
        # After an upgrade the services on the faulty node do not start by
        # default. The NDP repair needs a few of them started once, so the
        # namenode dependency check is switched off temporarily.
        namenode_conf="/opt/oss/${product_name}/apps/NdpNameNodeService/script/sh/service/conf/"
        ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "sed -i 's/dependency_check.required.start=1/dependency_check.required.start=0/g' ${namenode_conf}/service_ctl_ndpnamenodeservice.conf"
        ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "su - ossadm -c 'source /opt/oss/manager/bin/engr_profile.sh; ipmc_adm -cmd startapp -tenant ${product_name} -app NdpToolService,NdpStatusService,NdpNameNodeService,NdpRedisService' 2>/dev/null"
        ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "su - ossadm -c 'source /opt/oss/manager/bin/engr_profile.sh; ipmc_adm -cmd stopapp -tenant ${product_name} -app NdpNameNodeService' 2>/dev/null"
        ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "sed -i 's/dependency_check.required.start=0/dependency_check.required.start=1/g' ${namenode_conf}/service_ctl_ndpnamenodeservice.conf"
        # Run FI's post-repair action.
        if ! bash /opt/NdpTools/repair_after_restore.sh ${faulty_node_ip} ${product_name} << eof
${faulty_node_root_password}
eof
        then
            echo "Failed to exec repair_after_restore.sh,exit."
            return 1
        fi
        if ! "${launcher_python}" "${launcher_entry}" processor -i -exec post.one_button_repair.exec_post_repair
        then
            echo "Failed to do post repair,exit."
            return 1
        fi
    fi

    echo "Success to repair faulty node,continue."
    sed -i "/repair_faulty_node/d" "${FAULTY_EXEC_FLAG}"
    echo "repair_faulty_node=success" >> "${FAULTY_EXEC_FLAG}"
    return 0
}

# Copy auxiliary files from this node to the faulty node: the exported
# cracklib dictionary, /root/EasySuite, /usr/local/insight/bin, the
# log_performance logrotate rule and the log-collect package. Nodes that are
# not Collectors (when is_mda_scenario is 0) additionally receive
# /opt/oss/scale_tool and, for node ids 0 and 1, /opt/NdpTools.
#
# Only the dictionary export is checked; every copy is best effort.
#
# Globals read: FAULTY_EXEC_FLAG, ENT_TOOLS_DIR, faulty_node_ip, peer_ip,
#               faulty_node_type, faulty_node_id, is_mda_scenario
# Returns: 0 on success or when already recorded as done; 1 when
#          cracklib-unpacker fails.
function migrate_scripts()
{
    if grep migrate_scripts "${FAULTY_EXEC_FLAG}" | grep -q success
    then
        return 0
    fi

    echo "Start to migrate file to faulty node"
    if ! cracklib-unpacker /usr/share/cracklib/pw_dict > /tmp/faulty_dict.tmp
    then
        echo "Failed to export pwd dict,please confirm."
        return 1
    fi

    scp -r -o StrictHostKeyChecking=no /tmp/faulty_dict.tmp "root@${peer_ip}:/tmp/" >/dev/null 2>&1
    scp -r -o StrictHostKeyChecking=no /root/EasySuite "root@${peer_ip}:/root/" >/dev/null 2>&1
    ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "rm -rf /usr/local/insight/bin/*; mkdir -p /usr/local/insight/bin" >/dev/null 2>&1
    scp -r -o StrictHostKeyChecking=no /usr/local/insight/bin/* "root@${peer_ip}:/usr/local/insight/bin/" >/dev/null 2>&1
    scp -r -o StrictHostKeyChecking=no /etc/logrotate.d/log_performance "root@${peer_ip}:/etc/logrotate.d/" >/dev/null 2>&1
    ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "chmod 600 /etc/logrotate.d/log_performance" >/dev/null 2>&1
    ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "mkdir -p ${ENT_TOOLS_DIR}/log_collect" >/dev/null 2>&1
    scp -r -o StrictHostKeyChecking=no /opt/oss/log_collect_tool/log_collect_pkg.tar "root@${peer_ip}:${ENT_TOOLS_DIR}/log_collect/" >/dev/null 2>&1

    # Collector nodes, and every node when is_mda_scenario is non-zero, stop here.
    if [[ ${faulty_node_type} != *"Collector"* && ${is_mda_scenario} -eq 0 ]]
    then
        if [[ "${faulty_node_id}" == "0" || "${faulty_node_id}" == "1" ]]
        then
            scp -r -o StrictHostKeyChecking=no /opt/NdpTools "root@${peer_ip}:/opt/" >/dev/null 2>&1
        fi
        scp -r -o StrictHostKeyChecking=no /opt/oss/scale_tool "root@${peer_ip}:/opt/oss/" >/dev/null 2>&1
        ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "chown -R ossadm:ossgroup /opt/oss/scale_tool" >/dev/null 2>&1
    fi

    echo "Success to migrate scripts,continue."
    sed -i "/migrate_scripts/d" "${FAULTY_EXEC_FLAG}"
    echo "migrate_scripts=success" >> "${FAULTY_EXEC_FLAG}"
    return 0
}

# Final cleanup on the faulty node: remove the temporary repair directories
# (result not checked) and run entTools' after_work.sh.
# Globals read: ENT_TOOLS_DIR, faulty_node_ip
# Returns: 0 when after_work.sh succeeds, 1 otherwise.
function do_after()
{
    echo "Start to do after action"
    ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "rm -rf /opt/os_patch; rm -rf /root/install_config; rm -rf /tmp/dayu_repair"

    if ssh -o StrictHostKeyChecking=no ${faulty_node_ip} "bash ${ENT_TOOLS_DIR}/one_button_repair/after_work.sh"
    then
        echo "Success to do after action,end."
        return 0
    fi

    echo "Failed to do after action,exit."
    return 1
}

# Drive the whole repair: detect the faulty node, prepare this node, prompt
# for the two passwords, then run every repair step in order and stop at the
# first one that fails. The progress and status files are removed only after
# all steps succeeded, so a failed run can be resumed.
#
# Globals read:    FAULTY_EXEC_FLAG
# Globals written: faulty_node_root_password, faulty_node_os_other_password,
#                  IFS (left set to a newline for the rest of the script)
# Returns: 1 when a checked step fails, otherwise the status of the last rm.
main()
{
    local step

    [[ -f ${FAULTY_EXEC_FLAG} ]] || touch ${FAULTY_EXEC_FLAG}

    get_faulty_node_ip || return 1
    # The results of the next two helpers are deliberately not checked.
    allowed_root_login
    prepare_execute_node_env

    # The root password must be identical on all nodes because the FI restore
    # has to log in to every one of them.
    echo -n "Please enter root password of all node:"
    IFS=$'\n'
    read -sr faulty_node_root_password
    echo -e "\n"

    echo -n "Please enter other os password of faulty node:"
    IFS=$'\n'
    read -sr faulty_node_os_other_password
    echo -e "\n"

    for step in \
        check_and_init_param \
        upload_entTools_to_faulty_node \
        prepare_faulty_node_env \
        check_faulty_node_env \
        create_vol_for_faulty_node \
        repair_faulty_node \
        migrate_scripts \
        do_after
    do
        ${step} || return 1
    done

    rm -rf /etc/faulty_node_status
    rm -rf ${FAULTY_EXEC_FLAG}
}

# Run the repair, report the outcome, write the closing audit-log entry and
# exit with main's status.
if main
then
    ret=0
    echo "Repair successfully."
else
    ret=$?
    echo "Repair failed."
fi
logger -p local0.info "$(who -m | cut -d\( -f2 |cut -d\) -f1);execute one_button_repair ended"
exit ${ret}
