#!/bin/bash

# Paths of the tools and files this script works with.
OMA_RUN_PATH="/opt/omm/oma/"
SENDALARM_PATH="/opt/omm/oma/tools/sendAlarm"
LOG_TOOL="/opt/omm/oma/tools/omm_log"
LOG_FILE="/var/log/fusionsphere/component/omm/oma/scriptlog/sendAlarm.log" # checked for existence by Log()
FLAG_FILE="/opt/huawei/dj/etc/cps-monitor/flag.file" # persisted alarm-state flags

# Usage percentage above which an alarm is raised / below which it is cleared.
ALARM_THRESHOLD="95"
CLEARANCE_THRESHOLD="80"

# Alarm IDs passed to the sendAlarm tool
ALARM_ID_CPU="13099"
ALARM_ID_MEMORY="13098"
ALARM_ID_DISK="13097"
ALARM_ID_NETWORK="13093" # unified network sub-health alarm (three-networks-in-one consolidation)

# Internal alarm IDs, used only inside this script (as keys in FLAG_FILE)
ALARM_ID_CPU_in="130990"
ALARM_ID_MEMORY_in="130980"
ALARM_ID_DISK_opt="130970"
ALARM_ID_DISK_var="130971"
ALARM_ID_DISK_usr="130972"
# Alarm types (also the state values stored in FLAG_FILE)
ALARM_TYPE_FAULT="0"
ALARM_TYPE_RECOVER="1"
# Alarm severity levels
ALARM_LEVEL_CRIT="1"
ALARM_LEVEL_MAJOR="2"
ALARM_LEVEL_MINOR="3"
ALARM_LEVEL_WARN="4"
# Alarm cause codes (intentionally left empty here)
ALARM_CAUSE_CPU=
ALARM_CAUSE_MEMORY=
ALARM_CAUSE_DISK=
ALARM_CAUSE_NETWORK=
# Alarm time
ALARM_TIME="13001"
# Alarm MOC
ALARM_MOC_HOST="CPS"
# NOTE: "ALART" (sic) - the misspelled name is what the network alarm code in this file references.
ALART_MOC_NETWORK="CPS"
# Alarm resource
ALARM_RES_HOST_CPU="os"
ALARM_RES_HOST_MEMORY="os"
ALARM_RES_HOST_DISK="os"
ALARM_RES_HOST_NETWORK="os"
# Location reported with each alarm (output of get_info.py --hostname)
ALARM_LOCATION=`csbs_python /usr/bin/get_info.py --hostname`
# Additional alarm information
NODE_IP=`csbs_python /usr/bin/get_info.py manage_ip`
# Comma-separated output of "get_info.py manage_ip list", split into an array.
NODE_LIST=($(get_info.py manage_ip list | sed 's/,/ /g'))
# Latest usage percentages; overwritten by get_disk_rate.
ALARM_usr=0
ALARM_opt=0



# Send "$1" to syslog (facility local1, priority error) tagged resource_monitor,
# prefixed with the word ERROR.
log_error()
{
    local -a logger_args=(-id -p local1.error -t "resource_monitor")
    logger "${logger_args[@]}" "ERROR" "$1"
}

# Send "$1" to syslog (facility local1, priority debug) tagged resource_monitor,
# prefixed with the word DEBUG.
log_debug()
{
    local -a logger_args=(-id -p local1.debug -t "resource_monitor")
    logger "${logger_args[@]}" "DEBUG" "$1"
}

# Send "$1" to syslog (facility local1, priority info) tagged resource_monitor,
# prefixed with the word INFO.
log_info()
{
    local -a logger_args=(-id -p local1.info -t "resource_monitor")
    logger "${logger_args[@]}" "INFO" "$1"
}

# Send "$1" to syslog (facility local1, priority info) tagged resource_monitor,
# prefixed with the word SUCCESS.
log_success()
{
    local -a logger_args=(-id -p local1.info -t "resource_monitor")
    logger "${logger_args[@]}" "SUCCESS" "$1"
}

# Send "$1" to syslog (facility local1, priority info) tagged resource_monitor,
# prefixed with the word FAIL.
log_fail()
{
    local -a logger_args=(-id -p local1.info -t "resource_monitor")
    logger "${logger_args[@]}" "FAIL" "$1"
}


#
# Compute CPU utilisation
#

# Private helper for get_cpu_rate.
# Prints one "<index> <idle> <total>" line for every CPU row of /proc/stat.
#   $1 - index to report for the aggregate "cpu" row
# Per-CPU rows ("cpuN") are reported under index N. <idle> is the 5th column
# of the row and <total> is the sum of columns 2..8, the same columns the
# monitor has always used.
_cpu_stat_snapshot()
{
    awk -v total_idx="$1" '
        /^cpu[0-9]* / {
            idx = ($1 == "cpu") ? total_idx : substr($1, 4)
            printf "%s %s %.0f\n", idx, $5, $2 + $3 + $4 + $5 + $6 + $7 + $8
        }' /proc/stat
}

# Sample /proc/stat twice and print CPU utilisation as integer percentages.
# Arguments:
#   $1 - '-a'      print one line per CPU (cpu0 .. cpuN-1)
#        <number>  print the rate of that CPU index
#        '-t'      print the aggregate rate of all CPUs
#        other     print nothing
#   $2 - optional sampling interval handed to sleep (default 5)
# Outputs: the requested rate(s) on stdout.
# Returns: 0 (the function returns instead of calling exit, so it is also
#          safe to call outside a command substitution).
function get_cpu_rate()
{
    local interval="${2:-5}"
    local cpu_num idx idle total idle_delta total_delta
    local -a start_idle=() start_total=() cpu_rate=()

    # Number of per-CPU rows; the aggregate row is stored at index cpu_num.
    cpu_num=$(grep -c '^cpu[0-9]' /proc/stat)

    # First sample: each snapshot is one read of /proc/stat, and rows are
    # matched on the exact row name so "cpu1" can never pick up "cpu10".
    while read -r idx idle total; do
        start_idle[idx]=${idle}
        start_total[idx]=${total}
    done < <(_cpu_stat_snapshot "${cpu_num}")

    sleep "${interval}"

    # Second sample: usage = 100 - idle share of the elapsed ticks.
    while read -r idx idle total; do
        # Row that was not present in the first sample: nothing to diff.
        [[ -n "${start_total[idx]:-}" ]] || continue
        idle_delta=$(( idle - start_idle[idx] ))
        total_delta=$(( total - start_total[idx] ))
        if (( total_delta > 0 )); then
            cpu_rate[idx]=$(( 100 - idle_delta * 100 / total_delta ))
        else
            # No ticks elapsed between the samples; avoid dividing by zero.
            cpu_rate[idx]=0
        fi
    done < <(_cpu_stat_snapshot "${cpu_num}")

    case $1 in
    ( '-a' )
        for (( idx = 0; idx < cpu_num; idx++ )); do
            echo "${cpu_rate[idx]}"
        done
        ;;
    ( *[0-9]* )
        echo "${cpu_rate[$1]}"
        ;;
    ( '-t' )
        echo "${cpu_rate[cpu_num]}"
        ;;
    ( * )
        ;;
    esac
    return 0
}


#
# Check the alarm counters
#

# Tell the caller whether the sampling loop saw mixed results.
#   $1 - number of samples above the alarm threshold
#   $2 - number of samples below the clearance threshold
# Returns 1 when both counters are non-zero, 0 otherwise.
check_alarm()
{
    [[ "$1" -ne 0 && "$2" -ne 0 ]] && return 1
    return 0
}


#
# Compute memory utilisation
#

# Read `free -b` and publish the result through globals:
#   mem            - "<total bytes> <used bytes>"
#   mem1 / mem2    - the two words of mem
#   mem_total_disp - total memory, divided by 1048576 with bc
#   mem_used_disp  - used memory, divided by 1048576 with bc
#   var            - used / total * 100
# The "used" column is derived differently depending on which release file
# exists (/etc/SuSE-release or /etc/euleros-release).
# NOTE(review): when neither release file exists this prints 1 and calls
# exit 0, which ends the calling shell when the function is not run inside a
# subshell - confirm that is intended.
function get_mem_rate()
{
    suse_tag="/etc/SuSE-release"
    euleros_tag="/etc/euleros-release"

    # Unsupported platform: bail out first.
    if [[ ! -e "${suse_tag}" && ! -e "${euleros_tag}" ]]; then
        echo 1
        exit 0
    fi

    if [[ -e "${suse_tag}" ]]; then
        mem=$(free -b | awk '/Mem/ {print $2,$2-($4+$6+$7)}')
    else
        mem=$(free -b | awk '/Mem/ {print $2,$3}')
    fi

    # Split into words using the function's own positional parameters.
    set -- ${mem}
    mem1=$1
    mem2=$2

    mem_total_disp=$(bc <<< "${mem1}/1048576")
    mem_used_disp=$(bc <<< "${mem2}/1048576")
    var=$(echo ${mem} | awk '{print $2/$1*100}')
}

#
# Compute disk partition utilisation
#

# Private helper for get_disk_rate.
# From the df output in $1, print "<size> <used> <use%>" (with any '%'
# removed) for the first row whose mount point is exactly $2.
# Prints nothing when no such row exists.
_disk_mount_fields()
{
    awk -v mnt="$2" '$NF == mnt { gsub(/%/, ""); print $2, $3, $5; exit }' <<< "$1"
}

# Publish disk usage through globals:
#   ALARM_usr, usr_size, usr_used - usage %, size and used space of /usr,
#                                   or of / when /usr is not a mount point
#   ALARM_opt, opt_size, opt_used - the same for /opt
# A single `df -hP` snapshot is used so the three values of a partition always
# belong together, and mount points are compared exactly, so a mount such as
# /data/usr is no longer mistaken for /usr.
# ALARM_opt is forced to 100 when /opt is not a mount point (existing
# behaviour, kept as is).
function get_disk_rate()
{
    local df_snapshot fields

    # -P keeps every filesystem on one line even with long device names.
    df_snapshot=$(df -hP)

    fields=$(_disk_mount_fields "${df_snapshot}" "/usr")
    if [[ -z "${fields}" ]]; then
        fields=$(_disk_mount_fields "${df_snapshot}" "/")
    fi
    read -r usr_size usr_used ALARM_usr <<< "${fields}"

    fields=$(_disk_mount_fields "${df_snapshot}" "/opt")
    read -r opt_size opt_used ALARM_opt <<< "${fields}"
    if [[ -z "${ALARM_opt}" ]]; then
        ALARM_opt=100
    fi
}

# Make sure LOG_FILE exists, creating it with touch when it is missing.
# Returns 1 when the file had to be created and touch failed, 0 otherwise.
function Log()
{
    [[ -f "$LOG_FILE" ]] && return 0
    touch "$LOG_FILE" || return 1
}


# Returns 0 when FLAG_FILE exists as a regular file, 1 otherwise.
function check_flag_file()
{
    [[ -f "${FLAG_FILE}" ]] && return 0
    return 1
}

# Create FLAG_FILE on first run and seed it with every alarm in the
# "recovered" state, one "<key> <state>" pair per line: the CPU, memory and
# three disk internal ids, then one NETWORK_HEALTH_<node> entry per NODE_LIST
# element.
# Does nothing and returns 0 when the file already exists.
# Returns non-zero when the flag file cannot be created or written.
function creat_flag_file()
{
    if [[ -f "${FLAG_FILE}" ]]; then
        return 0
    fi

    local flag_dir node

    # The flag file's own directory was never created (only /etc/alarm was),
    # so the very first run failed on hosts where it did not exist yet.
    flag_dir=$(dirname "${FLAG_FILE}")
    mkdir -p "${flag_dir}" || return 1

    # NOTE(review): /etc/alarm is not where FLAG_FILE lives; its handling is
    # kept unchanged in case something else relies on it - confirm.
    mkdir -p /etc/alarm
    touch "${FLAG_FILE}" || return 1
    chmod 640 "${FLAG_FILE}"
    chown -h root:openstack /etc/alarm
    chmod 750 /etc/alarm
    chown -h root:openstack /etc/alarm/*
    chmod 640 /etc/alarm/*

    # Seed all entries with a single append.
    {
        echo "$ALARM_ID_CPU_in $ALARM_TYPE_RECOVER"
        echo "$ALARM_ID_MEMORY_in $ALARM_TYPE_RECOVER"
        echo "$ALARM_ID_DISK_opt $ALARM_TYPE_RECOVER"
        echo "$ALARM_ID_DISK_usr $ALARM_TYPE_RECOVER"
        echo "$ALARM_ID_DISK_var $ALARM_TYPE_RECOVER"
        for node in "${NODE_LIST[@]}"; do
            echo "NETWORK_HEALTH_${node} $ALARM_TYPE_RECOVER"
        done
    } >> "${FLAG_FILE}"
}

#
# Monitor CPU utilisation
#

# Private helper for check_cpu_alarm.
# Move the persisted CPU alarm state to $1 (ALARM_TYPE_FAULT or
# ALARM_TYPE_RECOVER), sending the alarm when the state actually changes.
# Reads the globals var and local_addr set by check_cpu_alarm.
# Returns 1 when the alarm could not be sent, 0 otherwise.
_cpu_alarm_report()
{
    local alarm_type="$1"
    local recorded now summary

    # Exact match on the key column so no other line can be picked up.
    recorded=$(awk -v id="${ALARM_ID_CPU_in}" '$1 == id { print $2; exit }' "${FLAG_FILE}")
    if [[ -z "${recorded}" ]]; then
        # A missing entry used to compare equal to FAULT (empty evaluates to
        # 0), which suppressed fault alarms for good. Treat it as recovered
        # and persist it, as the network check does for unknown nodes.
        recorded=${ALARM_TYPE_RECOVER}
        echo "${ALARM_ID_CPU_in} ${ALARM_TYPE_RECOVER}" >> "${FLAG_FILE}"
    fi
    if [[ "${recorded}" -eq "${alarm_type}" ]]; then
        return 0
    fi

    now=$(date +%s)
    summary="$ALARM_ID_CPU $alarm_type $ALARM_LEVEL_MAJOR $ALARM_CAUSE_CPU $now $ALARM_MOC_HOST $ALARM_RES_HOST_CPU $local_addr;'cpu' 'cpu use rate' $var"
    if ! "${SENDALARM_PATH}" "${ALARM_ID_CPU}" \
        "${alarm_type}" \
        "${ALARM_LEVEL_MAJOR}" \
        "${ALARM_CAUSE_CPU}" \
        "${now}" \
        "${ALARM_MOC_HOST}" \
        "${ALARM_RES_HOST_CPU}" \
        "${ALARM_LOCATION}" \
        "${NODE_IP};${var}%;${ALARM_THRESHOLD}%;${CLEARANCE_THRESHOLD}%"; then
        log_fail "${summary}"
        return 1
    fi
    log_success "${summary}"
    sed -i "/^${ALARM_ID_CPU_in} / c ${ALARM_ID_CPU_in} ${alarm_type}" "${FLAG_FILE}"
    return 0
}

# Sample the aggregate CPU rate up to three times and raise / clear the CPU
# alarm when every sample is above ALARM_THRESHOLD / below
# CLEARANCE_THRESHOLD. Sampling stops early as soon as the samples disagree.
# Globals written: local_addr, var.
# Returns 1 when the flag file is missing, the internal IP or the CPU rate
# cannot be obtained, or sending an alarm fails; 0 otherwise.
function check_cpu_alarm()
{
    # The flag file must exist.
    check_flag_file || return 1
    local_addr=$(csbs_python /usr/bin/get_info.py --internal_ip)
    if [[ -z "${local_addr}" ]]; then
        return 1
    fi

    # Measure the CPU rate.
    local retry_times=3
    local i=0
    local alarm=0
    local calarm=0
    while (( i < retry_times )); do
        # NOTE(review): this leaves xtrace switched on after the first sample
        # even if it was off before - kept as in the original, confirm intent.
        set +x
        read -r var <<< "$(get_cpu_rate -t)"
        set -x
        if [[ -z "${var}" ]]; then
            return 1
        fi
        if [[ $(echo "$var > $ALARM_THRESHOLD" | bc) -eq 1 ]]; then
            (( alarm += 1 ))
            check_alarm "${alarm}" "${calarm}" || break
            log_info "cpu use rate is:$var;retry_times:$i"
        elif [[ $(echo "$var < ${CLEARANCE_THRESHOLD}" | bc) -eq 1 ]]; then
            (( calarm += 1 ))
            check_alarm "${alarm}" "${calarm}" || break
            log_info "cpu use rate is:$var;retry_times:$i"
        fi
        sleep 5s
        (( i += 1 ))
    done

    # Only a unanimous result changes the alarm state.
    if (( alarm == retry_times )); then
        _cpu_alarm_report "${ALARM_TYPE_FAULT}" || return 1
    fi
    if (( calarm == retry_times )); then
        _cpu_alarm_report "${ALARM_TYPE_RECOVER}" || return 1
    fi
    return 0
}

#
# Monitor memory utilisation
#

# Private helper for check_mem_alarm.
# Move the persisted memory alarm state to $1 (ALARM_TYPE_FAULT or
# ALARM_TYPE_RECOVER), sending the alarm when the state actually changes.
# Reads the globals var, mem_total_disp, mem_used_disp and local_addr.
# Returns 1 when the alarm could not be sent, 0 otherwise.
_mem_alarm_report()
{
    local alarm_type="$1"
    local recorded now summary

    # Exact match on the key column so no other line can be picked up.
    recorded=$(awk -v id="${ALARM_ID_MEMORY_in}" '$1 == id { print $2; exit }' "${FLAG_FILE}")
    if [[ -z "${recorded}" ]]; then
        # A missing entry used to compare equal to FAULT (empty evaluates to
        # 0), which suppressed fault alarms for good. Treat it as recovered
        # and persist it, as the network check does for unknown nodes.
        recorded=${ALARM_TYPE_RECOVER}
        echo "${ALARM_ID_MEMORY_in} ${ALARM_TYPE_RECOVER}" >> "${FLAG_FILE}"
    fi
    if [[ "${recorded}" -eq "${alarm_type}" ]]; then
        return 0
    fi

    now=$(date +%s)
    summary="$ALARM_ID_MEMORY $alarm_type $ALARM_LEVEL_MAJOR $ALARM_CAUSE_MEMORY $now $ALARM_MOC_HOST $ALARM_RES_HOST_MEMORY $local_addr;'memory' 'memory use rate' $var"
    if ! "${SENDALARM_PATH}" "${ALARM_ID_MEMORY}" \
        "${alarm_type}" \
        "${ALARM_LEVEL_MAJOR}" \
        "${ALARM_CAUSE_MEMORY}" \
        "${now}" \
        "${ALARM_MOC_HOST}" \
        "${ALARM_RES_HOST_MEMORY}" \
        "${ALARM_LOCATION}" \
        "${NODE_IP};${mem_total_disp}MB;${mem_used_disp}MB;${ALARM_THRESHOLD}%;${CLEARANCE_THRESHOLD}%"; then
        log_fail "${summary}"
        return 1
    fi
    log_success "${summary}"
    sed -i "/^${ALARM_ID_MEMORY_in} / c ${ALARM_ID_MEMORY_in} ${alarm_type}" "${FLAG_FILE}"
    return 0
}

# Sample memory usage up to three times and raise / clear the memory alarm
# when every sample is above ALARM_THRESHOLD / below CLEARANCE_THRESHOLD.
# Sampling stops early as soon as the samples disagree.
# Globals written: local_addr, and whatever get_mem_rate publishes (var,
# mem_total_disp, mem_used_disp).
# Returns 1 when the flag file is missing, the internal IP or the memory rate
# cannot be obtained, or sending an alarm fails; 0 otherwise.
function check_mem_alarm()
{
    check_flag_file || return 1
    local_addr=$(csbs_python /usr/bin/get_info.py --internal_ip)
    if [[ -z "${local_addr}" ]]; then
        return 1
    fi

    # Measure the memory rate.
    local retry_times=3
    local i=0
    local alarm=0
    local calarm=0
    while (( i < retry_times )); do
        # var is a global shared with the CPU check; clear it so a stale CPU
        # value is never mistaken for a memory reading.
        var=
        # NOTE(review): this leaves xtrace switched on after the first sample
        # even if it was off before - kept as in the original, confirm intent.
        set +x
        get_mem_rate
        set -x
        if [[ -z "${var}" ]]; then
            return 1
        fi
        if [[ $(echo "$var > $ALARM_THRESHOLD" | bc) -eq 1 ]]; then
            (( alarm += 1 ))
            check_alarm "${alarm}" "${calarm}" || break
            log_info "memory use rate is:$var;retry_times:$i"
        elif [[ $(echo "$var < ${CLEARANCE_THRESHOLD}" | bc) -eq 1 ]]; then
            (( calarm += 1 ))
            check_alarm "${alarm}" "${calarm}" || break
            log_info "memory use rate is:$var;retry_times:$i"
        fi
        sleep 5s
        (( i += 1 ))
    done

    # Only a unanimous result changes the alarm state.
    if (( alarm == retry_times )); then
        _mem_alarm_report "${ALARM_TYPE_FAULT}" || return 1
    fi
    if (( calarm == retry_times )); then
        _mem_alarm_report "${ALARM_TYPE_RECOVER}" || return 1
    fi
    return 0
}

#
# Monitor disk utilisation
# (The original note says this takes two parameters, "opt $ALARM_ID_DISK_opt";
# the function does not read any arguments.)
#

# Private helper for check_disk_alarm.
# Move one persisted disk alarm state, sending the alarm when it changes.
#   $1 - target state (ALARM_TYPE_FAULT or ALARM_TYPE_RECOVER)
#   $2 - flag-file key of the partition
#   $3 - partition text appended to ALARM_LOCATION in the alarm
#   $4 - partition text used in the log line
#   $5 - partition size, $6 - used space, $7 - usage percentage
# Reads the global local_addr set by check_disk_alarm.
# Returns 1 when the alarm could not be sent, 0 otherwise.
_disk_alarm_report()
{
    local alarm_type="$1" flag_id="$2" location_suffix="$3" log_label="$4"
    local size="$5" used="$6" rate="$7"
    local recorded now summary

    # Exact match on the key column so no other line can be picked up.
    recorded=$(awk -v id="${flag_id}" '$1 == id { print $2; exit }' "${FLAG_FILE}")
    if [[ -z "${recorded}" ]]; then
        # A missing entry used to compare equal to FAULT (empty evaluates to
        # 0), which suppressed fault alarms for good. Treat it as recovered
        # and persist it, as the network check does for unknown nodes.
        recorded=${ALARM_TYPE_RECOVER}
        echo "${flag_id} ${ALARM_TYPE_RECOVER}" >> "${FLAG_FILE}"
    fi
    if [[ "${recorded}" -eq "${alarm_type}" ]]; then
        return 0
    fi

    now=$(date +%s)
    summary="$ALARM_ID_DISK $alarm_type $ALARM_LEVEL_MAJOR $ALARM_CAUSE_DISK $now $ALARM_MOC_HOST $ALARM_RES_HOST_DISK '$local_addr;$log_label' '$log_label use rate $rate'"
    if ! "${SENDALARM_PATH}" "${ALARM_ID_DISK}" \
        "${alarm_type}" \
        "${ALARM_LEVEL_MAJOR}" \
        "${ALARM_CAUSE_DISK}" \
        "${now}" \
        "${ALARM_MOC_HOST}" \
        "${ALARM_RES_HOST_DISK}" \
        "${ALARM_LOCATION};${location_suffix}" \
        "${NODE_IP};${size};${used};${ALARM_THRESHOLD}%;${CLEARANCE_THRESHOLD}%"; then
        log_fail "${summary}"
        return 1
    fi
    log_success "${summary}"
    sed -i "/^${flag_id} / c ${flag_id} ${alarm_type}" "${FLAG_FILE}"
    return 0
}

# Sample the usage of the /usr (or /) and /opt partitions up to three times
# and raise / clear the matching disk alarm when every sample is above
# ALARM_THRESHOLD / below CLEARANCE_THRESHOLD. Sampling stops early as soon
# as the samples of one partition disagree.
# Globals written: local_addr, and whatever get_disk_rate publishes.
# Returns 1 when the flag file is missing, the internal IP cannot be obtained
# or sending an alarm fails; 0 otherwise.
function check_disk_alarm()
{
    check_flag_file || return 1
    local_addr=$(csbs_python /usr/bin/get_info.py --internal_ip)
    if [[ -z "${local_addr}" ]]; then
        return 1
    fi

    # Measure the partition usage.
    local retry_times=3
    local i=0
    local alarm_usr=0
    local calarm_usr=0
    local alarm_opt=0
    local calarm_opt=0
    while (( i < retry_times )); do
        get_disk_rate
        if [[ $(echo "$ALARM_usr > $ALARM_THRESHOLD" | bc) -eq 1 ]]; then
            (( alarm_usr += 1 ))
            check_alarm "${alarm_usr}" "${calarm_usr}" || break
            log_info "usr use rate is:$ALARM_usr;retry_times:$i"
        elif [[ $(echo "$ALARM_usr < ${CLEARANCE_THRESHOLD}" | bc) -eq 1 ]]; then
            (( calarm_usr += 1 ))
            check_alarm "${alarm_usr}" "${calarm_usr}" || break
            log_info "usr use rate is:$ALARM_usr;retry_times:$i"
        fi

        if [[ $(echo "$ALARM_opt > $ALARM_THRESHOLD" | bc) -eq 1 ]]; then
            (( alarm_opt += 1 ))
            check_alarm "${alarm_opt}" "${calarm_opt}" || break
            log_info "opt use rate is:$ALARM_opt;retry_times:$i"
        elif [[ $(echo "$ALARM_opt < ${CLEARANCE_THRESHOLD}" | bc) -eq 1 ]]; then
            (( calarm_opt += 1 ))
            check_alarm "${alarm_opt}" "${calarm_opt}" || break
            log_info "opt use rate is:$ALARM_opt;retry_times:$i"
        fi
        sleep 5s
        (( i += 1 ))
    done

    # /opt partition. The recover log line used "$1" (empty) as the
    # partition name; it now says "opt" like the fault log line.
    if (( alarm_opt == retry_times )); then
        _disk_alarm_report "${ALARM_TYPE_FAULT}" "${ALARM_ID_DISK_opt}" "/opt" "opt" \
            "${opt_size}" "${opt_used}" "${ALARM_opt}" || return 1
    fi
    if (( calarm_opt == retry_times )); then
        _disk_alarm_report "${ALARM_TYPE_RECOVER}" "${ALARM_ID_DISK_opt}" "/opt" "opt" \
            "${opt_size}" "${opt_used}" "${ALARM_opt}" || return 1
    fi

    # /usr partition, reported as "usr" when df lists a mount ending in /usr
    # and as "/" otherwise (existing naming, kept as is).
    local usr_label="usr"
    if (( alarm_usr == retry_times || calarm_usr == retry_times )); then
        if [[ -z "$(df -h | grep "/usr$")" ]]; then
            usr_label="/"
        fi
    fi
    if (( alarm_usr == retry_times )); then
        _disk_alarm_report "${ALARM_TYPE_FAULT}" "${ALARM_ID_DISK_usr}" "${usr_label}" "${usr_label}" \
            "${usr_size}" "${usr_used}" "${ALARM_usr}" || return 1
    fi
    if (( calarm_usr == retry_times )); then
        _disk_alarm_report "${ALARM_TYPE_RECOVER}" "${ALARM_ID_DISK_usr}" "${usr_label}" "${usr_label}" \
            "${usr_size}" "${usr_used}" "${ALARM_usr}" || return 1
    fi
    return 0
}

# Private helper: print the state stored in FLAG_FILE for key $1, matching
# the key column exactly (so 10.0.0.1 never matches the 10.0.0.11 entry).
_net_flag_read()
{
    awk -v key="$1" '$1 == key { print $2; exit }' "${FLAG_FILE}"
}

# Private helper: replace the FLAG_FILE line of key $1 with "<key> <$2>".
# The key is anchored and its dots are escaped so only that line is changed.
_net_flag_write()
{
    local key="$1"
    sed -i "/^${key//./\\.} / c ${key} $2" "${FLAG_FILE}"
}

# Ping every node of NODE_LIST and record the outcome per node index:
#   NET_HEALTH[i]       - 1 when the link is considered faulty, 0 otherwise
#   NET_FAULT_REASON[i] - why it is faulty (empty when healthy)
#   MESSAGE[i]          - "<my ip>_ping_<node>_loss_<loss>_rtt_<rtt>"
# FAULT_REASON is still set to the most recent reason, as before.
# A link is faulty when any of the following holds:
#   1. a single ping to the node fails (unreachable)
#   2. rtt is at or above the rtt threshold (1000 per the code below)
#   3. packet loss is at or above the loss threshold (50)
# While a node is already in the fault state, lower thresholds (600 / 30) and
# the maximum instead of the minimum rtt are used, so the alarm does not flap.
# Returns 0, also when the flag file is missing, so that no alarm is sent.
monitor_network_health(){
    if ! check_flag_file; then
        # A missing flag file is treated as "nothing to report".
        return 0
    fi
    MY_IP=$(get_info.py --internal_ip)
    INTERFACE=$(ip addr | grep -w "${MY_IP}" | awk '{print $NF}')
    ALARM_LOCATION_NETWORK="${ALARM_LOCATION};${INTERFACE}"
    log_info "${MY_IP}(${ALARM_LOCATION_NETWORK}) will ping ${NODE_LIST[*]}"
    MESSAGE=()
    NET_HEALTH=()
    NET_FAULT_REASON=()

    local i=0
    local item flag ret ping_result loss_percentage rtt rtt_int loss_percentage_int
    local loss_percentage_threshold rtt_threshold rtt_arg
    for item in "${NODE_LIST[@]}"; do
        flag=$(_net_flag_read "NETWORK_HEALTH_${item}")
        if [[ -z "${flag}" ]]; then
            # Unknown node: start it in the recovered state.
            flag=${ALARM_TYPE_RECOVER}
            echo "NETWORK_HEALTH_${item} $ALARM_TYPE_RECOVER" >> "${FLAG_FILE}"
        fi
        loss_percentage_threshold=50
        rtt_threshold=1000
        rtt_arg=1 # field 1 of min/avg/max/mdev: the minimum rtt
        # Wider margin while the alarm is active, to avoid flapping.
        if [[ "${flag}" -eq "${ALARM_TYPE_FAULT}" ]]; then
            loss_percentage_threshold=30
            rtt_threshold=600
            rtt_arg=3 # field 3 of min/avg/max/mdev: the maximum rtt
        fi

        karbor_ping -c 1 "${item}" >/dev/null 2>&1
        ret=$?
        ping_result=$(karbor_ping -c 10 "${item}")
        # Accept fractional loss figures such as "33.3333%"; the old pattern
        # captured only the digits next to the percent sign ("3333").
        loss_percentage=$(echo "${ping_result}" | grep -E -o '[0-9]+(\.[0-9]+)?%' | head -1 | tr -d %)
        rtt=$(echo "${ping_result}" | tail -1 | awk '{print $4}' | cut -d '/' -f "${rtt_arg}")
        MESSAGE[i]="${MY_IP}_ping_${item}_loss_${loss_percentage}_rtt_${rtt}"
        log_info "  ${MESSAGE[i]}"
        # Integer parts only, for the comparisons below.
        rtt_int=${rtt%%.*}
        loss_percentage_int=${loss_percentage%%.*}

        NET_HEALTH[i]=1
        NET_FAULT_REASON[i]=
        if [[ ${ret} -ne 0 ]]; then
            NET_FAULT_REASON[i]=unreachable
        elif [[ $(expr "${rtt_int}" \< "${rtt_threshold}") -eq 0 ]]; then
            NET_FAULT_REASON[i]="rtt($rtt_int) >= $rtt_threshold"
        elif [[ $(expr "${loss_percentage_int}" \< "${loss_percentage_threshold}") -eq 0 ]]; then
            NET_FAULT_REASON[i]="loss rate($loss_percentage_int) >= $loss_percentage_threshold"
        else
            NET_HEALTH[i]=0
        fi
        if [[ ${NET_HEALTH[i]} -eq 1 ]]; then
            FAULT_REASON=${NET_FAULT_REASON[i]}
        fi
        (( i += 1 ))
    done
    return 0
}

# Run the network health probe and send a fault / recover alarm for every
# node whose state differs from the one stored in FLAG_FILE.
# Each node is reported with its own fault reason. When sending fails the
# failure is logged and the stored state is left untouched, so the alarm is
# attempted again on the next run (previously it was logged as a success and
# the state was flipped anyway).
# Returns 0 after the node loop; when get_info.py is not on the PATH it logs
# the problem and returns the status of that log call.
check_network_alarm(){
    if ! command -v get_info.py > /dev/null 2>&1; then
        log_fail "when check the network, get_info.py not exist on the path,network check quit!"
        return
    fi
    monitor_network_health

    local length=${#NET_HEALTH[*]}
    local i key wanted reason recorded now summary
    for (( i = 0; i < length; i++ )); do
        key="NETWORK_HEALTH_${NODE_LIST[$i]}"
        if [[ "${NET_HEALTH[$i]}" -eq 1 ]]; then
            # Network health alarm: a threshold was exceeded.
            wanted=${ALARM_TYPE_FAULT}
            reason=${NET_FAULT_REASON[$i]:-${FAULT_REASON}}
        else
            wanted=${ALARM_TYPE_RECOVER}
            reason="None"
        fi

        recorded=$(_net_flag_read "${key}")
        if [[ "${recorded}" -eq "${wanted}" ]]; then
            continue
        fi

        now=$(date +%s)
        summary="$ALARM_ID_NETWORK $wanted $ALARM_LEVEL_MAJOR '' $now $ALART_MOC_NETWORK $ALARM_RES_HOST_NETWORK $ALARM_LOCATION_NETWORK ${MESSAGE[$i]}"
        if ! "${SENDALARM_PATH}" "${ALARM_ID_NETWORK}" \
            "${wanted}" \
            "${ALARM_LEVEL_MAJOR}" \
            "" \
            "${now}" \
            "${ALART_MOC_NETWORK}" \
            "${ALARM_RES_HOST_NETWORK}" \
            "${ALARM_LOCATION_NETWORK};${NODE_LIST[$i]}" \
            "${NODE_IP};${reason};${MESSAGE[$i]}"; then
            log_fail "${summary}"
            continue
        fi
        log_success "${summary}"
        _net_flag_write "${key}" "${wanted}"
    done
    return 0
}
############### Entry point ################
# Run every monitoring step in order: create the flag file, then check CPU,
# memory, disk and network. The shell exits with status 1 as soon as one step
# returns non-zero, and with status 0 when all of them succeed.
function main()
{
    local step
    for step in \
        creat_flag_file \
        check_cpu_alarm \
        check_mem_alarm \
        check_disk_alarm \
        check_network_alarm
    do
        "${step}" || exit 1
    done
    exit 0
}

# Start the monitor. main always ends by calling exit, so nothing placed
# after this line would run.
main
