#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved.
import logging
import re
import subprocess
import sys

# File scanned by _container_enabled() for the "conta:" field.
BSP_FILE = "/proc/osp/bsp"

# Shell command template for running a diagnose command directly:
# {0} is the command timeout handed to `timeout`; once it elapses the child
#     process is killed automatically (signal 9)
# {1} is the id of the process to attach to, e.g. 12 for app_data
# {2} is the diagnose command to execute
DIAGNOSE_CHECK_CMD = "timeout -s 9 {0} diagsh --attach=*_{1} --cmd='{2}'"

# Route this script's log records (INFO and above) to the messages file,
# tagging every record with CHECK_LUN_PRESSURE so it can be told apart from
# other writers of the same file.
logging.basicConfig(level=logging.INFO,
                    filename="/OSM/log/cur_debug/messages",
                    format='[%(asctime)s][CHECK_LUN_PRESSURE][%(levelname)s][%(message)s][%(filename)s, %(lineno)d]',
                    datefmt='%Y-%m-%d %H:%M:%S')


def _container_enabled():
    """Report whether the "conta:" field in BSP_FILE has a non-"0" value.

    The file is read as bytes and decoded line by line as UTF-8; a line that
    cannot be decoded is logged and skipped. In a line containing "conta:",
    the text between that marker and the next ':' or ';' is the level.

    Returns:
        True as soon as a line carries a level other than "0", False when no
        such line exists.
    """
    with open(BSP_FILE, "rb") as bsp:
        raw_lines = bsp.readlines()

    for raw in raw_lines:
        try:
            text = raw.decode("utf-8")
        except Exception:
            logging.error("/proc/osp/bsp bad line")
            continue

        start = text.find("conta:")
        if start < 0:
            continue

        # fields[0] is "conta"; fields[1] is the value that follows it.
        fields = re.split(":|;", text[start:])
        if fields[1].strip() != "0":
            return True

    return False


# Check whether QoS is overloaded
def _is_qos_overloaded():
    """Report whether QoS is overloaded, judged by the first TokenBucket row.

    Runs the ``qos show overload_info`` diagnose command and looks for the
    first output row that has at least four space-separated columns and whose
    first column is ``TokenBucket``; the fourth column is read as the level.
    Only that first row is examined.

    Returns:
        True if the level is below 6 (overloaded, the upgrade check fails).
        False if the command fails (it may not exist), if there is no
        TokenBucket row, or if the level is 6 or more.
    """
    cmd = DIAGNOSE_CHECK_CMD.format(10, 12, "qos show overload_info")

    status, text = subprocess.getstatusoutput(cmd)
    if status:
        logging.info("cmd(%s) exec failed.", cmd)
        # The command may not exist; a failed run counts as "not overloaded".
        return False

    for row in text.split("\n"):
        fields = re.split(' +', row.strip())
        if len(fields) < 4 or fields[0].strip() != 'TokenBucket':
            continue

        level = int(fields[3].strip())
        logging.info("TokenBucket[%d]", level)
        if level < 6:
            logging.error("TokenBucket[%d] < 6, qos overloaded", level)
            return True
        # Only the first TokenBucket row decides the result.
        break

    logging.info("qos not overloading")
    return False


# Check whether the cache watermark is too high
def _is_cache_high_waterm():
    """Report whether any checked cache quota object has a high watermark.

    Runs the ``cache show quota global`` diagnose command and scans its
    output. A row is considered when it has at least five space-separated
    columns and its first column is a number (the objID); the fifth column is
    read as WaterM(%). Only objIDs 2 through 7 are checked.

    Returns:
        True if a row with 2 <= objID <= 7 has WaterM(%) >= 80 (the upgrade
        check fails). False otherwise, including when the command fails (it
        may not exist) or prints no such rows.

    Raises:
        ValueError: if WaterM(%) of a checked row (objID 2..7) is not a
            number. Rows with other objIDs are never parsed.
    """
    cache_quota_cmd = DIAGNOSE_CHECK_CMD.format(10, 12, "cache show quota global")

    ret, output = subprocess.getstatusoutput(cache_quota_cmd)
    if ret:
        logging.info("cmd(%s) exec failed.", cache_quota_cmd)
        # The command may not exist; a failed run counts as "not high".
        return False

    for line in output.split("\n"):
        columns = re.split(' +', line.strip())
        if len(columns) < 5 or not columns[0].strip().isdigit():
            continue

        obj_id = int(columns[0].strip())
        # Filter on objID before parsing WaterM, so a non-numeric value on a
        # row that is not checked anyway cannot abort the whole check.
        if not 2 <= obj_id <= 7:
            continue

        # WaterM may be printed as a decimal; the fractional part is dropped.
        waterm = int(float(columns[4].strip()))
        logging.info("obj_id[%d] waterM[%d]", obj_id, waterm)
        if waterm >= 80:
            logging.error("obj_id[%d] waterM[%d] >= 80%%, cache high waterM.", obj_id, waterm)
            return True

    logging.info("cache low waterM")

    return False


# Check disk pressure
def _is_disk_pressure_overload():
    """Report whether disk usage is too high.

    Runs the ``ld getalldiskuserate`` diagnose command. Every output row made
    of exactly four tab-separated columns with a numeric third column
    contributes that column as a disk use rate.

    Returns:
        True if the highest rate is at least 80, or if the average over the
        disks whose rate is at least 10 reaches 60. False otherwise,
        including when the command fails.
    """
    cmd = DIAGNOSE_CHECK_CMD.format(10, 12, "ld getalldiskuserate")

    status, text = subprocess.getstatusoutput(cmd)
    if status:
        logging.error("cmd[%s] exec failed", cmd)
        return False

    rates = []
    for row in text.split("\n"):
        fields = re.split('\t', row.strip())
        if len(fields) != 4:
            continue
        rate_text = fields[2].strip()
        if rate_text.isdigit():
            rates.append(int(rate_text))

    peak = max(rates, default=0)
    # Disks with a use rate below 10 are left out of the average.
    busy = [rate for rate in rates if rate >= 10]
    mean = sum(busy) / len(busy) if busy else 0.0

    if peak >= 80 or mean >= 60:
        logging.error("disk use overload, max: %d, average: %f", peak, mean)
        return True

    logging.info("disk use rate is low")
    return False


# Check container LUN access pressure
def _check_container_lun_pressure():
    """Run the container LUN pressure check and print its verdict.

    Prints "True" and returns 0 when the check passes, prints "False" and
    returns 1 when it fails. The check passes immediately when the container
    is not enabled, because currently only the container uses the LUN during
    an upgrade. Otherwise it fails as soon as one of the following reports
    pressure, evaluated in this order:

    1. QoS is overloaded
    2. the cache watermark is too high
    3. the disk use rate is too high
    """
    logging.info("start container lun pressure check")

    if not _container_enabled():
        logging.info("container is not enabled, skip container lun pressure check")
        print("True")
        return 0

    # any() stops at the first check that reports pressure.
    checks = (_is_qos_overloaded, _is_cache_high_waterm, _is_disk_pressure_overload)
    under_pressure = any(check() for check in checks)

    print("False" if under_pressure else "True")
    return 1 if under_pressure else 0


def main():
    """Run the container LUN pressure check and return the exit status.

    Returns:
        The value returned by the check. If the check raises any exception,
        the error is logged, "True" is printed and 0 is returned, so that a
        defect in this script does not make the upgrade check fail.
    """
    try:
        exit_code = _check_container_lun_pressure()
    except Exception as err:
        logging.error("container lun pressure check faild: %s", err)
        # Fail open: report success rather than block the upgrade.
        print("True")
        exit_code = 0
    return exit_code


# When executed as a script, run the check and use main()'s return value as
# the process exit status.
if __name__ == '__main__':
    sys.exit(main())
