#! /bin/bash
# Copyright (c) 2022-2022 Huawei Technologies Co., Ltd.
# All rights reserved.
#

# DPU-P and DPU-S are both delivered with two PCI functions from now on: fun0 is virtio, fun1 is nvme
# a. DPU-P, virtio without VF support: vid:1af4 did:1048 subvid:19e5 subdid:0001
# b. DPU-P, virtio with VF support:    vid:19e5 did:1048 subvid:19e5 subdid:0001
# c. DPU-S, virtio without VF support: vid:1af4 did:1048 subvid:19e5 subdid:0002
# d. DPU-S, virtio with VF support:    vid:19e5 did:1048 subvid:19e5 subdid:0002

# "vendor:device" ID pairs that identify a DPU in `lspci -n` output.
# Written as an extended-regex alternation because it is used as a grep pattern.
DPU_VID_DID='1af4:1048|19e5:1048'

# Accepted subsystem IDs of a DPU, space separated. Each entry is compared
# against the 32-bit value setpci reads at config offset 0x2c.
DPU_SUB_VID_DID='000119e5 000219e5'

# File that caches the BDF of the detected DPU, and the variable it is loaded into.
DPU_BDF_FILE_PATH='/var/run/dpu_bdf.txt'
DPU_BDF=''

# Destination of the records written by log().
LOG_FILE='/var/log/dpu_link_check.log'

# Append one timestamped record to LOG_FILE and restrict the file to mode 640.
# Globals:   LOG_FILE (read)
# Arguments: $1 - severity tag, e.g. WARN
#            $2 - message text
function log()
{
    local level=$1
    local message=$2
    local timestamp

    timestamp=$(date +"[%Y-%m-%d %H:%M:%S,%N]")
    # BASH_LINENO[0] is the line from which log() was called, so each record
    # points at the reporting site rather than at this function.
    # The append redirection creates the file when it does not exist yet.
    printf '%s [%s:%s] [%s]: [PCIEEP] %s\n' \
        "$timestamp" "$0" "${BASH_LINENO[0]}" "$level" "$message" >> "$LOG_FILE"
    chmod 640 "$LOG_FILE"
}

# Terminate the script (exit status 1) unless the BDF cache file holds exactly
# one line. More than one line means several DPU cards were found, in which
# case no link check is performed; a missing or empty file means none was found.
# Globals: DPU_BDF_FILE_PATH (read)
function is_dpu_bdf_vaild()
{
    local bdf_count=0

    # Count lines only when the file can be read; otherwise keep 0 so the
    # check below fails cleanly instead of comparing an empty string.
    if [ -r "$DPU_BDF_FILE_PATH" ]; then
        bdf_count=$(wc -l < "$DPU_BDF_FILE_PATH")
    fi

    if [ "$bdf_count" -ne 1 ]; then
        exit 1
    fi
}

# Load the cached BDF into DPU_BDF.
# Globals: DPU_BDF_FILE_PATH (read), DPU_BDF (written)
function get_dpu_bdf()
{
    # Quoted so that a path containing whitespace is read as a single file.
    DPU_BDF=$(< "$DPU_BDF_FILE_PATH")
}

# Resolve the BDF of the DPU into DPU_BDF, using a cache file.
# Per the original note: DPU cards do not support hot-plug, so the cache file
# is created the first time a card is detected and is removed on reset.
# Globals: DPU_VID_DID, DPU_SUB_VID_DID, DPU_BDF_FILE_PATH (read)
#          DPU_BDF (written through get_dpu_bdf)
# Exits the script through is_dpu_bdf_vaild unless exactly one DPU is recorded.
function check_dpu_exist()
{
    local candidate_bdfs=()
    local candidate_bdf
    local subsystem_ids
    local expected_ids

    # The cache file already exists: the BDF of the DPU is known.
    if [ -f "$DPU_BDF_FILE_PATH" ]; then
        is_dpu_bdf_vaild
        get_dpu_bdf
        return
    fi

    # No cache yet: list every PCI function whose vendor:device pair matches,
    # keeping only the first column (the domain-qualified BDF).
    mapfile -t candidate_bdfs < <(sudo lspci -n -D | grep -E "$DPU_VID_DID" | awk '{print $1}')

    for candidate_bdf in "${candidate_bdfs[@]}"; do
        # 32-bit read at config offset 0x2c (subsystem vendor/device IDs).
        subsystem_ids=$(sudo setpci -s "$candidate_bdf" 0x2c.l)
        # DPU_SUB_VID_DID is a space-separated list, so it is split on purpose.
        for expected_ids in $DPU_SUB_VID_DID; do
            # Both operands are quoted: an empty setpci result simply does not match.
            if [ "$expected_ids" = "$subsystem_ids" ]; then
                echo "$candidate_bdf" >> "$DPU_BDF_FILE_PATH"
                break
            fi
        done
    done

    is_dpu_bdf_vaild
    get_dpu_bdf
}

# Record why the host is being restarted, then reboot it.
# Globals: DPU_BDF (read)
function reboot_host()
{
    log "WARN" "dpu $DPU_BDF pcie link down, host need to reboot."
    sudo reboot
}

# Check the PCIe link of the DPU's virtio-scsi device and reboot the host when
# it is down. Per the original note, a class code of 0x010000 means the DPU
# was reset and all-ff means the PCIe link is broken.
# The register is read again on every attempt (up to 3, 0.1 s apart), so the
# host is rebooted only when all three reads are abnormal.
# Globals: DPU_BDF (read)
function check_dpu_link()
{
    local config_dword
    local class_code
    local attempt

    for attempt in 1 2 3; do
        # 32-bit read at config offset 0x08; the first six hex digits of the
        # printed value are the 24-bit class code (the last two are the revision).
        config_dword=$(sudo setpci -s "$DPU_BDF" 0x8.l)
        class_code=${config_dword:0:6}

        # Any other value, including an empty read, is treated as healthy.
        if [[ $class_code != "010000" && $class_code != "ffffff" ]]; then
            return
        fi

        log "WARN" "check dpu $DPU_BDF pcie link down, class code is $class_code."
        sleep 0.1
    done

    reboot_host
}

# The operating system launches this script once per minute; within that
# minute the link is checked 60 times, with a 1 s pause after each check.
function polling_dpu_link()
{
    local round

    for ((round = 0; round < 60; round++)); do
        check_dpu_link
        sleep 1
    done
}

# Entry point: resolve the BDF of the DPU first, then poll its PCIe link.
function main()
{
    check_dpu_exist
    polling_dpu_link
}

main "$@"
