sysom1/sysom_server/sysom_cluster_health/conf/metric_settings.py

from .common import *

#################################################################################
# Base Settings
#################################################################################

# Prometheus settings used to collect metrics, taken from the `db.prometheus`
# section of the server config returned by YAML_CONFIG.get_server_config()
PROMETHEUS_CONFIG = YAML_CONFIG.get_server_config().db.prometheus
# Interval at which metrics are collected from Prometheus and the health score
# is calculated (NOTE(review): unit is presumably seconds -- confirm)
CALCULATE_INTERVAL = 30
# If True, use multiple threads to collect and calculate
ENABLE_MULTI_THREAD = False
# Number of threads to use when ENABLE_MULTI_THREAD is enabled
ANALYZER_PROCESS_NUM = 1
# If True, metrics carry no cluster label and all metrics are assumed to
# belong to a single cluster
NO_CLUSTER_LABEL = True

#################################################################################
# Global Metric Collecting Settings
#################################################################################

# The following settings specify the label names to use, in case the metric
# labels change
CLUSTER_LABEL = "cluster"
POD_LABEL = "pod"
NODE_LABEL = "instance"
NAMESPACE_LABEL = "namespace"
POD_METRIC_TAG = "value"

#################################################################################
# Global Metric Weights Settings
#################################################################################

# The following settings specify the weight calculation method of each level:
# Worst: the type score is the lowest score among the metrics of that metric type
# Equal: all metrics of one metric type have equal weight
# WeightedSum: use the custom weights from the metric settings; make sure the
# weights of all metrics sum to 1
# Auto: use the built-in weight algorithm
POD_WEIGHT_METHOD = "Worst" # one of: Worst, Equal, WeightedSum, Auto
NODE_WEIGHT_METHOD = "Worst" # one of: Worst, Equal, WeightedSum, Auto
CLUSTER_WEIGHT_METHOD = "Worst" # one of: Worst, Equal, WeightedSum, Auto

#################################################################################
# Global Alarm and Diagnose Settings
#################################################################################

# Size of the queue used to send diagnose requests between the analyzer and
# the diagnose worker
MAX_QUEUE_SIZE = 500
# Used to merge alarms
# (NOTE(review): presumably the number of alarms merged together -- confirm)
ALARM_MERGE_NUM = 10


#################################################################################
# Cluster Metrics Settings
#################################################################################

# Cluster-level metrics, grouped by metric type. Every list is empty here,
# i.e. this file configures no cluster-level metric of any type.
CLUSTER_METRICS = {
    "CapacityMetric": [],
    "LoadMetric": [],
    "LatencyMetric": [],
    "ErrorMetric": []
}

#################################################################################
# Pod Metrics Settings
#################################################################################

# Pod-level metrics, grouped by metric type.
#
# Every entry uses the same layout:
#   "Description": description of the metric
#   "Collect":     settings for collecting and preprocessing the metric
#       "metric_name":   table name
#       "related_value": the specific metric value(s) to use
#       "standard_type": 0 = non-standard (custom), 1 = already a usage value,
#                        2 = usage / total * 100
#   "Score":       settings for calculating the metric score
#       "weight": weight of the metric
#       "score":  table of score -> metric value; per the notes that
#                 accompanied these tables, score 100 is "good", 70 is
#                 "warning", 60 is "error" and 0 is "fatal"
#                 (NOTE(review): how values between two thresholds are scored
#                 is decided by the analyzer, not by this file -- confirm)
POD_METRICS = {
    # The weights of all capacity metrics must sum to 1 (0.3 + 0.2 + 0.5).
    "CapacityMetric": [
        {
            "Description": "Pod memory util",
            "Collect": {
                "metric_name": "sysom_container_memUtil",
                "related_value": ["usage", "limit"],
                "standard_type": 2,
            },
            "Score": {
                "weight": 0.3,
                # thresholds are memory usage in percent
                "score": {"100": 70, "70": 80, "60": 90, "0": 100},
            },
        },
        {
            "Description": "Pod cpu util",
            "Collect": {
                "metric_name": "sysom_container_cpuacct_stat",
                "related_value": ["total"],
                "standard_type": 1,
            },
            "Score": {
                "weight": 0.2,
                # thresholds are total cpu utilisation in percent
                "score": {"100": 70, "70": 80, "60": 90, "0": 100},
            },
        },
        {
            "Description": "Pod sys util",
            "Collect": {
                "metric_name": "sysom_container_cpuacct_stat",
                "related_value": ["system"],
                "standard_type": 1,
            },
            "Score": {
                "weight": 0.5,
                "score": {"100": 5, "70": 10, "60": 20, "0": 30},
            },
        },
    ],
    "LoadMetric": [
        {
            "Description": "Pod load average",
            "Collect": {
                "metric_name": "sysom_container_proc_stat",
                "related_value": ["r_load1min"],
                "standard_type": 1,
            },
            "Score": {
                "weight": 1.0,
                "score": {"100": 0.7, "70": 1, "60": 5, "0": 10},
            },
        },
    ],
    # NOTE(review): both latency metrics carry weight 1.0, so their weights do
    # not sum to 1 -- confirm before using the "WeightedSum" method here.
    "LatencyMetric": [
        {
            "Description": "Pod memory reclaim latency",
            "Collect": {
                "metric_name": "sysom_container_memdrcm_latency",
                # The latency ranges must be listed in ascending order.
                "related_value": [
                    "memDrcm_lat_10to100ms",
                    "memDrcm_lat_100to500ms",
                    "memDrcm_lat_500to1000ms",
                    "memDrcm_lat_1000ms",
                ],
                "standard_type": 2,
            },
            "Score": {
                "weight": 1.0,
                "score": {"100": 0, "70": 100, "60": 10000, "0": 100000},
            },
        },
        {
            "Description": "Pod memory compact latency",
            "Collect": {
                "metric_name": "sysom_container_memmcmp_latency",
                # The latency ranges must be listed in ascending order.
                "related_value": [
                    "memDcmp_lat_10to100ms",
                    "memDcmp_lat_100to500ms",
                    "memDcmp_lat_500to1000ms",
                    "memDcmp_lat_1000ms",
                ],
                "standard_type": 2,
            },
            "Score": {
                "weight": 1.0,
                "score": {"100": 0, "70": 100, "60": 10000, "0": 100000},
            },
        },
    ],
    # NOTE(review): all three error metrics carry weight 1.0, so their weights
    # do not sum to 1 -- confirm before using the "WeightedSum" method here.
    "ErrorMetric": [
        {
            "Description": "Pod OOM count",
            "Collect": {
                "metric_name": "sysom_container_memory_oomcnt",
                "related_value": ["oom_kill"],
                "standard_type": 1,
            },
            "Score": {
                "weight": 1.0,
                "score": {"100": 0, "60": 1, "0": 5},
            },
        },
        {
            "Description": "Pod memory fail count",
            "Collect": {
                "metric_name": "sysom_container_memfail_cnt",
                "related_value": ["fail_cnt"],
                "standard_type": 1,
            },
            "Score": {
                "weight": 1.0,
                "score": {"100": 0, "80": 10, "60": 50, "0": 100},
            },
        },
        {
            "Description": "Pod cpu throttled count",
            "Collect": {
                "metric_name": "sysom_container_cpu_stat",
                "related_value": ["nr_throttled"],
                "standard_type": 1,
            },
            "Score": {
                "weight": 1.0,
                "score": {"100": 0, "60": 1, "0": 5},
            },
        },
    ],
}

#################################################################################
# Nodes Metrics Settings
#################################################################################

# Node-level metrics, grouped by metric type.
#
# Every entry uses the same layout:
#   "Description": description of the metric
#   "Collect":     settings for collecting and preprocessing the metric
#       "metric_name":   table name
#       "related_value": the specific metric value(s) to use
#       "node_tag_name": NOTE(review): presumably the label whose value
#                        selects the entries of "related_value" -- confirm
#       "standard_type": 0 = non-standard (custom), 1 = already a usage value,
#                        2 = usage / total * 100
#       "filename":      only present on standard_type 0 entries
#                        (NOTE(review): presumably names the custom
#                        preprocessing module -- confirm)
#   "Score":       settings for calculating the metric score
#       "weight": weight of the metric
#       "score":  table of score -> metric value; a higher score is better
#                 and 0 is the worst
#                 (NOTE(review): how values between two thresholds are scored
#                 is decided by the analyzer, not by this file -- confirm)
#   "Alarm":       optional settings for alerting and diagnosing
NODE_METRICS = {
    # The weights of all capacity metrics must sum to 1.
    # NOTE(review): the weights below add up to 1.4
    # (0.2 + 0.1 + 0.2 + 0.5 + 0.2 + 0.2) -- confirm before using the
    # "WeightedSum" method for nodes.
    "CapacityMetric": [
        {
            "Description": "Node file descriptor util",
            "Collect": {
                "metric_name": "sysom_file_descriptor",
                "related_value": ["file-nr", "file-max"],
                "node_tag_name": "type",
                "standard_type": 0,
                "filename": "node_fd_util",
            },
            "Score": {
                "weight": 0.2,
                # thresholds are file descriptor utilisation in percent
                "score": {"100": 40, "60": 60, "30": 80, "0": 100},
            },
            "Alarm": {
                # NOTE(review): presumably compared against the metric score
                # (30 is also a key of the score table above) -- confirm
                "threshold": 30,
                "diagnose_type": "custom",
                "service_name": "command",
            },
        },
        {
            "Description": "Node memory util",
            "Collect": {
                "metric_name": "sysom_proc_meminfo",
                "related_value": ["MemAvailable", "MemTotal"],
                "node_tag_name": "value",
                # NOTE(review): 3 is not covered by the 0/1/2 legend;
                # presumably (1 - available / total) * 100 -- confirm
                "standard_type": 3,
            },
            "Score": {
                "weight": 0.1,
                # thresholds are memory usage in percent
                "score": {"100": 70, "70": 80, "60": 90, "0": 100},
            },
        },
        {
            "Description": "Node cpu util",
            "Collect": {
                "metric_name": "sysom_proc_cpu_total",
                "related_value": ["idle"],
                "node_tag_name": "mode",
                "standard_type": 0,
                "filename": "node_cpu_util",
            },
            "Score": {
                "weight": 0.2,
                # thresholds are total cpu utilisation in percent
                "score": {"100": 70, "70": 80, "60": 90, "0": 100},
            },
        },
        {
            "Description": "Node sys util",
            "Collect": {
                "metric_name": "sysom_proc_cpu_total",
                "related_value": ["sys"],
                "node_tag_name": "mode",
                "standard_type": 1,
            },
            "Score": {
                "weight": 0.5,
                "score": {"100": 5, "70": 10, "60": 20, "0": 30},
            },
        },
        {
            "Description": "Node rootfs util",
            "Collect": {
                "metric_name": "sysom_fs_stat",
                "related_value": ["f_bavail", "f_blocks", "f_bfree"],
                "node_tag_name": "counter",
                "standard_type": 0,
                "filename": "node_rootfs_util",
            },
            "Score": {
                "weight": 0.2,
                "score": {"100": 50, "70": 70, "60": 90, "0": 95},
            },
        },
        {
            "Description": "Node rootfs inode util",
            "Collect": {
                "metric_name": "sysom_fs_stat",
                "related_value": ["f_favail", "f_files"],
                "node_tag_name": "counter",
                "standard_type": 0,
                "filename": "node_rootfs_inode_util",
            },
            "Score": {
                "weight": 0.2,
                "score": {"100": 50, "70": 70, "60": 90, "0": 95},
            },
        },
    ],
    "LoadMetric": [
        {
            "Description": "Node load average",
            "Collect": {
                "metric_name": "sysom_proc_loadavg",
                "related_value": ["load1"],
                "node_tag_name": "value",
                "standard_type": 0,
                "filename": "node_load_avg",
            },
            "Score": {
                "weight": 1.0,
                "score": {"100": 1, "70": 5, "60": 10, "0": 20},
            },
        },
    ],
    "LatencyMetric": [
        {
            "Description": "Node sched latency",
            "Collect": {
                "metric_name": "sysom_cpu_dist",
                "related_value": ["ms10", "ms100", "s1"],
                "node_tag_name": "value",
                "standard_type": 2,
            },
            "Score": {
                "weight": 1.0,
                "score": {"100": 40, "70": 100, "30": 150, "0": 200},
            },
        },
    ],
    "ErrorMetric": [
        {
            "Description": "Node OOM count",
            "Collect": {
                "metric_name": "sysom_proc_vmstat",
                "related_value": ["oom_kill"],
                "node_tag_name": "value",
                "standard_type": 1,
            },
            "Score": {
                "weight": 1.0,
                "score": {"100": 0, "60": 1, "0": 5},
            },
        },
    ],
}