#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @brief   : 2020/12/16 21:56
# @File    : litek8s_check_kubernetes_status.py
# @Software: Software management infrastructure

import json
import logging
import os
import platform
import re
import shutil
import stat
import subprocess
import sys
import time
from os import environ

import yaml

try:
    from configparser import ConfigParser
except ImportError:
    from ConfigParser import SafeConfigParser as ConfigParser


# Route every log record of this script to the shared debug log file, tagged CHECK_KUBERNETES.
logging.basicConfig(level=logging.INFO,
                    filename="/OSM/log/cur_debug/messages",
                    format='[%(asctime)s][CHECK_KUBERNETES][%(levelname)s][%(message)s][%(filename)s, %(lineno)d]',
                    datefmt='%Y-%m-%d %H:%M:%S')
# Directory containing this script (symlinks resolved).
CURRENT_DIR = os.path.dirname(os.path.realpath(__file__))

# Flag file whose existence is tested by check_is_fix() (container LUN repair marker).
CONTAINER_LUN_FIX_FLAG = "/OSM/component/container/bin/fix_flag"
# Flag file whose existence is tested by check_is_load_global_image() (background image import marker).
CONTAINER_LOAD_GLOBAL_IMAGE_FLAG = "/OSM/component/container/bin/load_global_image_flag"
CONTAINER_ENABLED_LEVEL_SET = {"1", "2", "3", "4", "5", 1, 2, 3, 4, 5}    # container levels that mean "enabled"
CONTAINER_DISABLED_LEVEL_SET = {"0", 0} # level 0 means the container feature is not enabled
CONTAINER_ENGINE_SET = {"0", "1"}       # containers can only run on the first two engines
# Product models for which enabled_container_feature() reports the feature as enabled.
PROTECT_A8000_SET = {"D5600V6_S", "D5600V6_N_S", "D5600V6_S_H"}

BSP_CONF = "/proc/osp/bsp" # backplane (BSP) information file
KERNEL_PRODUCT_CONF = "/OSM/conf/product.ini"  # kernel product configuration
DEFAULT_RETRY_TIMES = 3   # default number of retries
CLUSTER_CONF = "/OSM/conf/innerIpInformation.conf" # system cluster information configuration file
DISK_MANAGE_FILE = "/OSM/script/container_platform_disk_manage.py"  # script run by check_device()
# System version threshold: check_kubernetes_version_need_psp_open() skips the PSP check below it.
DORADO_VER_615 = 7600511219
CUR_MANIFEST_PATH = "/startup_disk/image/pkg_cur/manifest.yml"  # manifest read by _get_sys_version()


class ParserKernelProductInfo(object):
    """
    Parser for the kernel product configuration file "/OSM/conf/product.ini".

    A new instance only holds defaults (0 / empty lists). Call parse() to
    load the values from the file, then read them through the properties.
    """

    def __init__(self):
        self._container_mem_flow = 0
        self._max_pods = 0
        self._container_service_engines = []
        self._container_service_nodes = []
        self._data_protect_deploy_types = []

    @property
    def container_mem_flow(self):
        """Memory-flow flag ([GLOBAL] ContainerMemFlow); 0 when the option is absent."""
        return self._container_mem_flow

    @property
    def max_pods(self):
        """Maximum pod count ([CONTAINER_PLATFORM] maxPods); 0 when the option is absent."""
        return self._max_pods

    @property
    def container_service_engines(self):
        """
        Engines containers may be deployed on, as a list of strings.

        Empty until parse() runs; parse() sets it to ["0", "1"] when the
        option is absent from the file.
        """
        return self._container_service_engines

    @property
    def container_service_nodes(self):
        """Nodes containers may be deployed on, as a list of strings; [] when the option is absent."""
        return self._container_service_nodes

    @property
    def data_protect_deploy_types(self):
        """Supported data-protection deploy types (comma-split, not stripped); [] when the option is absent."""
        return self._data_protect_deploy_types

    def parse(self):
        """
        Parse the "/OSM/conf/product.ini" configuration file.

        Values are stored as soon as they are read, so after a failure the
        options read before the failing one keep their new values.

        Returns:
            0 on success, 1 on failure (unreadable value, invalid interval
            list, or an engine list that lacks engine 0 or 1).
        """

        section_container_platform = "CONTAINER_PLATFORM"
        option_max_pods = "maxPods"
        option_container_engines = "containerServiceEngines"
        option_container_nodes = "containerServiceNodes"
        option_op_deploy_types = "dataProtectDeployType"

        try:
            config = ConfigParser()
            config.read(KERNEL_PRODUCT_CONF)

            # Memory-flow flag; getint raises on a non-integer value.
            if config.has_option("GLOBAL", "ContainerMemFlow"):
                self._container_mem_flow = config.getint("GLOBAL", "ContainerMemFlow")

            # Maximum pod count; getint raises on a non-integer value.
            if config.has_option(section_container_platform, option_max_pods):
                self._max_pods = config.getint(section_container_platform, option_max_pods)

            # Engines on which containers may be deployed.
            if config.has_option(section_container_platform, option_container_engines):
                engines = config.get(section_container_platform, option_container_engines)
                engine_list, ret_code = parse_intervals(engines)
                if ret_code:
                    logging.error("parse engines %s failed", engines)
                    return 1

                # The first two engines are mandatory.
                if not {0, 1}.issubset(engine_list):
                    logging.error("engines %s must include first two engines", engines)
                    return 1

                # Assign a fresh list (instead of appending) so that calling
                # parse() again does not accumulate duplicate entries.
                self._container_service_engines = [str(engine) for engine in engine_list]
            else:
                self._container_service_engines = ["0", "1"]

            # Nodes on which containers may be deployed.
            if config.has_option(section_container_platform, option_container_nodes):
                nodes = config.get(section_container_platform, option_container_nodes)
                node_list, ret_code = parse_intervals(nodes)
                if ret_code:
                    logging.error("parse nodes %s failed", nodes)
                    return 1

                # Fresh list for the same reason as the engine list above.
                self._container_service_nodes = [str(node) for node in node_list]

            # Supported data-protection deploy types.
            if config.has_option(section_container_platform, option_op_deploy_types):
                deploy_types = config.get(section_container_platform, option_op_deploy_types)
                self._data_protect_deploy_types = deploy_types.split(",")

            return 0
        except Exception as exception:
            logging.error("read kernel product file %s failed: %s", KERNEL_PRODUCT_CONF, exception)
            return 1


class ParserInnerIpInfo(object):
    """
    Parser for the cluster configuration file "/OSM/conf/innerIpInformation.conf".

    The constructor loads the "networkConfigs" section of the JSON file and
    exposes its "leaderConfig", "localConfig" and "nodeCfg" parts. Reachable
    IPs are only probed when get_leader_reachable_ip() or
    get_cluster_reachable_ip() is called.
    """
    # True when running on an aarch64 machine; ping is then run through "ip vrf exec vrf-inner".
    IS_ARM = platform.machine() == 'aarch64'

    def __init__(self):
        self._node_map = {}
        self._unreachable_nodes = []
        self._leader_ip = ""
        self._load_inner_info()

    @staticmethod
    def _check_conn(ip_addr):
        """
        Check whether an inner IP answers a ping.

        Sends one ping (1 second deadline) up to DEFAULT_RETRY_TIMES times.
        The command is run as an argument list without a shell: the previous
        shell string relied on the bash-only "&>" redirect, which a POSIX
        /bin/sh parses as "run in background", making every probe succeed.

        Returns:
            True as soon as one ping exits with status 0, otherwise False
            (also when the ping/ip executable cannot be started).
        """
        cmd = ["ping", "-c", "1", str(ip_addr), "-w", "1"]
        if ParserInnerIpInfo.IS_ARM:
            cmd = ["ip", "vrf", "exec", "vrf-inner"] + cmd
        for _ in range(DEFAULT_RETRY_TIMES):
            try:
                status = subprocess.call(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            except OSError as err:
                # Executable missing or not runnable: retrying cannot help.
                logging.error("run %s failed: %s", cmd[0], err)
                return False
            if status == 0:
                return True
        return False

    def _load_ip_list(self, node_map, node_type):
        """
        Record the first reachable IP of every node in `node_map`.

        Args:
            node_map: iterable of node dicts (the "nodeCfg" list).
            node_type: key holding the node id inside each node dict.

        Nodes without an id or without "ipCfg" are skipped. A node whose
        first ipCfg entry has no reachable IP is added to the unreachable
        list; a missing or null "ips" entry counts as no reachable IP.
        """
        for per_node in node_map:
            nid = per_node.get(node_type, None)
            if not nid:
                continue
            ip_cfg = per_node.get("ipCfg", None)
            if not ip_cfg:
                continue
            candidate_ips = ip_cfg[0].get("ips") or []
            ip_list = [ip for ip in candidate_ips if self._check_conn(ip)]
            if not ip_list:
                self._unreachable_nodes.append(nid)
                continue
            self._node_map[nid] = ip_list[0]  # the first reachable address is enough

    def _load_inner_info(self):
        """
        Load the cluster configuration file into network/leader/local/node attributes.

        Any read or parse failure is logged and leaves all sections empty;
        sections that are missing or null also become empty containers.
        """
        try:
            with open(CLUSTER_CONF, "rb") as f:
                cluster_info = json.loads(f.read())
            network_config = cluster_info.get("networkConfigs") or {}
        except Exception as err:
            logging.warning("load inner ip information failed: %s", err)
            network_config = {}
        if not isinstance(network_config, dict):
            network_config = {}
        self.network_config = network_config
        self.leader_config = self.network_config.get("leaderConfig") or {}
        self.local_config = self.network_config.get("localConfig") or {}
        self.node_cfg = self.network_config.get("nodeCfg") or []

    def get_leader_reachable_ip(self):
        """Probe the leader's IPs and remember the first reachable one (see master_ip)."""
        # "ips" may be absent or null; treat that as "no candidate" instead of failing.
        for c_ip in self.leader_config.get("ips") or []:
            if self._check_conn(c_ip):
                self._leader_ip = c_ip
                return
        return

    def get_cluster_reachable_ip(self):
        """Probe every node in "nodeCfg" and fill node_map / unreachable_nodes."""
        self._load_ip_list(self.node_cfg, "nodeId")

    @property
    def master_ip(self):
        """Reachable IP of the leader node; "" until get_leader_reachable_ip() finds one."""
        return self._leader_ip

    @property
    def master_nid(self):
        """Node id of the leader ("nodeId" of leaderConfig), "" when absent."""
        return self.leader_config.get("nodeId", "")

    @property
    def master_name(self):
        """Name of the leader ("ctl_name" of leaderConfig), "" when absent."""
        return self.leader_config.get("ctl_name", "")

    @property
    def local_nid(self):
        """Node id of the local node ("nodeId" of localConfig), "" when absent."""
        return self.local_config.get("nodeId", "")

    @property
    def local_name(self):
        """Name of the local node ("ctl_name" of localConfig), "" when absent."""
        return self.local_config.get("ctl_name", "")

    @property
    def node_map(self):
        """Mapping of node id to its first reachable IP."""
        return self._node_map

    @property
    def node_ips(self):
        """Reachable IPs of all nodes (the values of node_map)."""
        return self._node_map.values()

    @property
    def unreachable_nodes(self):
        """Ids of the nodes for which no IP was reachable."""
        return self._unreachable_nodes


def parse_intervals(intervals):
    '''
    Expand a comma-separated list of numbers and "a-b" ranges into a number list.

    Example: "3-4,2,2,,9-10" -> [2, 3, 4, 9, 10]
    1. A part that is not a number is an error.
    2. A range whose left bound exceeds its right bound is an error.
    3. A part with more than one "-" is an error.
    4. The result is de-duplicated and sorted ascending.
    5. Empty parts are ignored.

    Returns:
        (list, 0) on success, ([], 1) on any error.
    '''
    numbers = set()
    try:
        for chunk in intervals.split(','):
            bounds = [piece.strip() for piece in chunk.split('-')]
            if bounds == ['']:
                # Rule 5: nothing between two commas.
                continue
            if len(bounds) > 2:
                # Rule 3: malformed range.
                return [], 1

            # A single number is treated as a range of length one.
            low = int(bounds[0])
            high = int(bounds[-1])
            if low > high:
                # Rule 2: reversed range.
                return [], 1

            numbers.update(range(low, high + 1))
    except Exception:
        # Rule 1: a bound is not a number (or the input is not a string).
        return [], 1

    # Rule 4: unique values in ascending order.
    return sorted(numbers), 0


# Public helper functions
def get_container_service_nodes_inner(ini_path):
    """
    Read [CONTAINER_PLATFORM] containerServiceNodes from `ini_path` as a list of ints.

    The value is either a single number ("3" -> [3]) or one inclusive range
    ("1-3" -> [1, 2, 3]). A value with more than one "-" yields [], as does a
    missing option or file. A non-numeric bound raises ValueError.
    """
    section, option = "CONTAINER_PLATFORM", "containerServiceNodes"
    parser = ConfigParser()
    parser.read(ini_path)
    if not parser.has_option(section, option):
        return []

    bounds = parser.get(section, option).split("-")
    if len(bounds) > 2:
        return []

    first = int(bounds[0])
    # A single number is its own upper bound.
    last = int(bounds[1]) if len(bounds) == 2 else first
    return list(range(first, last + 1))


def can_container_run_in_this_node():
    """
    Tell whether containers may run on the local node (multi-engine aware).

    The first run of digits in the local node name is taken as the engine id
    and looked up in the engine list read from the kernel product file.

    Returns:
        True when the local engine is in the list; False otherwise, including
        when the name holds no digits or the product file cannot be parsed.
    """
    # Engine the local node belongs to.
    engine_match = re.search(r"\d+", str(ParserInnerIpInfo().local_name))
    if engine_match is None:
        logging.error("local_name not include engine id")
        return False

    # Engines on which containers may be deployed.
    product_info = ParserKernelProductInfo()
    if product_info.parse():
        logging.error("parse kernel product failed")
        return False

    return engine_match.group() in product_info.container_service_engines


def enabled_container_feature():
    """
    Decide from "/proc/osp/bsp" whether the container feature is enabled.

    Lines are examined in file order:
    1. "Model of products is:<model>" - a model listed in PROTECT_A8000_SET
       enables the feature and stops the scan.
    2. "Local container:<hex>" - the hex value is converted to a character;
       a character in CONTAINER_ENABLED_LEVEL_SET enables the feature, one in
       CONTAINER_DISABLED_LEVEL_SET disables it, and either stops the scan.
    3. Any line containing "conta" - the text after the first ":" or ";"
       following it enables the feature when it is an enabled level; the scan
       continues.

    Returns:
        1 when the container feature is enabled, 0 otherwise.

    Raises:
        OSError: the BSP file cannot be opened.
        ValueError: a "Local container:" value is not a valid hex code point.
    """
    enabled = False
    with open(BSP_CONF, "rb") as f:
        lines = f.readlines()
    for raw_line in lines:
        try:
            line = raw_line.decode("utf-8")
        except UnicodeDecodeError as err:
            # Previously logged with the unrelated "using Local container flag" text.
            logging.info("[check_pod_status] skip undecodable line: %s", err)
            continue
        line = line.strip()
        if not line:
            continue
        if line.startswith("Model of products is:"):
            # Strip so that "is: MODEL" (space after the colon) still matches.
            model = line.split(":")[1].strip()
            if model in PROTECT_A8000_SET:
                enabled = True
                break
        if line.startswith("Local container:"):
            logging.info("[check_pod_status] using Local container flag %s", line)
            # The prefix guarantees a ":"; the value is the code point of the level character.
            level = chr(int(line.split(":")[1], 16))
            if level in CONTAINER_ENABLED_LEVEL_SET:
                enabled = True
                break
            if level in CONTAINER_DISABLED_LEVEL_SET:
                enabled = False
                break
        pos = line.find("conta")
        if pos < 0:
            continue
        fields = re.split(":|;", line[pos:])
        if len(fields) > 1 and fields[1] in CONTAINER_ENABLED_LEVEL_SET:
            enabled = True
    return 1 if enabled else 0


def handle_pod(line):
    """
    Judge one data line of "kubectl get pod" output.

    Returns:
        0 when the line has at least five columns, the READY column (2nd) has
        equal counts on both sides of "/" and the STATUS column (3rd) is
        "Running"; 1 otherwise.
    """
    fields = line.split()
    if len(fields) < 5:
        return 1

    _, ready, status = fields[:3]
    counts = ready.split("/")
    # A READY value without "/" cannot be compared and is not treated as a mismatch.
    ready_mismatch = len(counts) >= 2 and counts[0] != counts[1]
    return 1 if ready_mismatch or status != "Running" else 0


def check_pod_status():
    """
    Check every pod of the kube-system namespace with handle_pod().

    Returns:
        0 when all listed lines pass, 1 at the first failing line (empty
        kubectl output also fails, because an empty line is not a valid pod line).
    """
    # awk drops the header line of the kubectl table.
    with os.popen(r"kubectl get pod -n kube-system | awk 'NR>1'", "r") as pipe:
        output = pipe.read().strip()
    logging.info("[check_pod_status] %s", output)
    for pod_line in output.split("\n"):
        if handle_pod(pod_line):
            logging.error("[check_pod_status] %s", pod_line)
            return 1
    return 0


def check_node_status():
    """
    Check that no node listed by "kubectl get node" is NotReady.

    The STATUS column can carry several comma-separated conditions (for
    example "NotReady,SchedulingDisabled" on a cordoned node), so the column
    is split on "," instead of being compared as a whole.

    Returns:
        1 when a node line reports NotReady, 0 otherwise.
    """
    # awk drops the header line of the kubectl table.
    cmd = r"kubectl get node | awk 'NR>1'"
    with os.popen(cmd, "r") as p:
        data = p.read().strip()
    logging.info("check node status %s", data)
    for line in data.split("\n"):
        columns = line.split()
        if len(columns) > 2 and "NotReady" in columns[1].split(","):
            logging.info("node not ready  %s", line)
            return 1
    return 0


def check_is_fix():
    """
    Check whether the container LUN repair flag file exists.

    Returns:
        1 when the flag exists, 0 when it does not.
    """
    if not os.path.exists(CONTAINER_LUN_FIX_FLAG):
        return 0
    logging.warning("container service is in fix status.")
    return 1


def check_is_load_global_image():
    """
    Check whether the background image-import flag file exists.

    Returns:
        1 when the flag exists, 0 when it does not.
    """
    if not os.path.exists(CONTAINER_LOAD_GLOBAL_IMAGE_FLAG):
        return 0
    logging.warning("container service is in load global image status.")
    return 1


def check_is_background_task_run():
    """
    Check for container background tasks that block an upgrade.

    The LUN repair flag is tested first; the image-import flag is only
    tested when no repair is in progress.

    Returns:
        1 when such a task is running, 0 otherwise.
    """
    return 1 if check_is_fix() or check_is_load_global_image() else 0


def backup_litek8s_package(pkg_dir="/startup_disk/image/pkg_cur/posseidon", upd_dir="/container/pkg_upd"):
    """
    Back up the litek8s component package.

    Copies the "litek8s-*.tar.gz" file found in `pkg_dir` into a freshly
    created `upd_dir` (mode 750); the copy gets mode 640. When several files
    match, the last one in sorted name order is used.

    The existing backup directory is only removed once a package to copy has
    been found, so a missing package no longer destroys the previous backup.

    Args:
        pkg_dir: directory holding the current packages.
        upd_dir: backup directory; it is deleted and re-created.

    Returns:
        0 on success; 1 when `pkg_dir` is not a directory, no package is
        found, or any filesystem operation fails.
    """
    if not os.path.isdir(pkg_dir):
        return 1

    pkg_path = ""
    for file_name in sorted(os.listdir(pkg_dir)):
        if 'litek8s-' in file_name and file_name.endswith(".tar.gz"):
            pkg_path = os.path.join(pkg_dir, file_name)
    if not pkg_path:
        logging.error("no litek8s package found in %s", pkg_dir)
        return 1

    try:
        if os.path.exists(upd_dir):
            shutil.rmtree(upd_dir)
        os.makedirs(upd_dir)
        os.chmod(upd_dir, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP)  # 750
        upd_path = shutil.copy(pkg_path, upd_dir)
        os.chmod(upd_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP)  # 640
    except Exception as err:
        logging.error("copy litek8s package from %s to %s error: %s", pkg_path, upd_dir, err)
        return 1
    logging.info("backup litek8s package success")
    return 0


def _retry_execute_command(cmd, max_retry_times, interval):
    for _ in range(max_retry_times):
        res, output = subprocess.getstatusoutput(cmd)
        if res == 0:
            return res, output
        logging.error("%s failed, ret %d", cmd, res)
        time.sleep(interval)
    return res, output


def _get_sys_version():
    """
    Read the system version from the current manifest file.

    Returns:
        SYS.Version of the manifest as an int, or 0 when the manifest is
        missing, unreadable, not valid YAML, or holds no integer version.
    """
    if not os.path.exists(CUR_MANIFEST_PATH):
        logging.warning("Manifest file is not exist.")
        return 0
    try:
        # Opening and YAML parsing sit inside the try so that a broken manifest
        # yields 0 like every other failure instead of raising.
        with open(CUR_MANIFEST_PATH, "rb") as file_handle:
            cfg_yml = yaml.safe_load(file_handle)
        return int(str(cfg_yml.get("SYS")["Version"]))
    except Exception as ex:
        logging.warning("Failed to get version, err %s.", ex)
        return 0


def check_kubernetes_version_need_psp_open():
    """
    Decide whether the PSP state has to be checked before the upgrade.

    No check is needed when the system version is unknown or below
    DORADO_VER_615. Otherwise the cluster's server version is queried and the
    check is needed only for Kubernetes v1.19 or v1.22.

    Returns:
        True when the PSP state must be checked, False otherwise (also when
        the server version cannot be obtained).
    """
    current_version = _get_sys_version()
    if current_version == 0:
        logging.error("Get sys version failed, skip check psp.")
        return False
    if current_version < DORADO_VER_615:
        logging.info("Current version need not check psp status.")
        return False

    version_cmd = 'timeout 5 kubectl version --short --kubeconfig /root/.kube/config | grep -i "server version"'
    status, version_text = _retry_execute_command(version_cmd, 2, 3)
    if status != 0:
        logging.error("Failed to get server version info, error: %s", version_text)
        return False

    if any(tag in version_text for tag in ("v1.19", "v1.22")):
        logging.info("Current k8s version need check psp status.")
        return True
    logging.info("Current k8s version need not check psp status.")
    return False


def check_psp_close():
    """
    Check whether the "unprivileged" PodSecurityPolicy is closed (absent).

    The check is skipped when check_kubernetes_version_need_psp_open() says
    it is not needed.

    Returns:
        1 when kubectl reports the policy as not found or does not answer
        within 3 seconds; 0 when the check is skipped or the policy is
        considered open (any other kubectl outcome).
    """
    if not check_kubernetes_version_need_psp_open():
        return 0
    process = subprocess.Popen(['/usr/local/bin/kubectl', 'get', 'psp', 'unprivileged'], shell=False,
                               stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
    try:
        (_, stderr) = process.communicate(timeout=3)
    except subprocess.TimeoutExpired:
        logging.info("get psp timeout")
        process.kill()
        # Reap the killed child and drain its pipe so no zombie or open
        # descriptor is left behind.
        process.communicate()
        return 1
    # Tolerate undecodable bytes: only a fixed ASCII phrase is searched for.
    err = stderr.decode("utf-8", "replace")
    if "podsecuritypolicies.policy \"unprivileged\" not found" in err:
        logging.info("psp is closed")
        return 1
    logging.info("psp is open")
    return 0


def check_kubernetes():
    """
    Check that the Kubernetes cluster is healthy enough for an upgrade.

    Steps, stopping at the first failure:
    1. No background task that blocks the upgrade is running.
    2. "kubectl get node" answers within 3 seconds and exits with 0.
    3. No node is NotReady.
    4. Every kube-system pod has matching READY counts and is Running.
    5. If the cluster runs Kubernetes 1.19 or 1.22, PSP must be open.
    6. The litek8s component package is backed up (used to restore images
       after a LUN failure).

    Returns:
        0 when every step passes, 1 otherwise.
    """
    ret = check_is_background_task_run()
    if ret:
        return 1
    process = subprocess.Popen(['/usr/local/bin/kubectl', 'get', 'node'], shell=False,
                               stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
    try:
        (_, stderr) = process.communicate(timeout=3)
    except subprocess.TimeoutExpired:
        logging.info("get node status timeout")
        process.kill()
        # Reap the killed child and drain its pipe so no zombie or open
        # descriptor is left behind.
        process.communicate()
        return 1
    ret = process.returncode
    if ret:
        # Undecodable bytes must not turn a reported failure into an exception.
        logging.info("get node faild: [%s]", stderr.decode("utf-8", "replace"))
        return 1
    ret = check_node_status()
    if ret:
        logging.info("get node status faild ret %d", ret)
        return 1
    ret = check_pod_status()
    if ret:
        logging.info("check pod status %d", ret)
        return 1
    ret = check_psp_close()
    if ret:
        logging.info("psp is close")
        return 1
    ret = backup_litek8s_package()
    if ret:
        logging.error("backup litek8s package error")
        return 1
    return 0


def check_device():
    """
    Check the container LUN state by running DISK_MANAGE_FILE with "check".

    The script's stdout is discarded and its stderr is sent to /dev/kmsg; the
    command is retried up to DEFAULT_RETRY_TIMES times, 3 seconds apart.

    Returns:
        0 when the script exists and exits with 0, 1 otherwise.
    """
    if os.path.exists(DISK_MANAGE_FILE):
        check_cmd = "%s check 1>/dev/null 2>/dev/kmsg" % DISK_MANAGE_FILE
        status, _ = _retry_execute_command(check_cmd, DEFAULT_RETRY_TIMES, 3)
        if status == 0:
            return 0
        logging.error("check container device failed.")
        return 1
    logging.error("container disk manager script not exist")
    return 1


if __name__ == "__main__":
    # Entry point: prints "True" when the check passes (or does not apply to
    # this node) and "False" otherwise; details go to the log file.
    environ.update({"CRYPTO_ENGINE": "om_kmc", "HOME": "/root"})
    RET_CODE = 0
    try:
        RET_CODE = enabled_container_feature()
        if not RET_CODE:
            logging.info("container feature not enabled")
            RET_CODE = 0
        elif not can_container_run_in_this_node():
            logging.info("container feature enabled, but can not run in this node")
            RET_CODE = 0
        else:
            # The device check only runs when the cluster check passed.
            RET_CODE = check_kubernetes() or check_device()
        print("False" if RET_CODE else "True")
        logging.info("check kubernetes status %d", RET_CODE)
        sys.exit(0)
    except Exception as e:
        logging.error("kubernets check faild: %s", e)
        print("False")
        # Exits with whatever RET_CODE held when the exception was raised.
        sys.exit(RET_CODE)
