sysom1/sysom_server/sysom_cluster_health/conf/metric_settings.py

418 lines
16 KiB
Python

from .common import *
#################################################################################
# Base Settings
#################################################################################
# Prometheus settings used to collect metrics, read from the server configuration
# (YAML_CONFIG is presumably provided by the star import from .common — confirm)
PROMETHEUS_CONFIG = YAML_CONFIG.get_server_config().db.prometheus
# Interval at which metrics are collected from Prometheus and health scores are
# calculated
# NOTE(review): the unit is presumably seconds — confirm against the consumer
CALCULATE_INTERVAL = 30
# If True, use multi-threading to collect and calculate
ENABLE_MULTI_THREAD = False
# Number of workers to use when ENABLE_MULTI_THREAD is enabled
# NOTE(review): the constant name says "process" while the setting above says
# "thread" — confirm which kind of worker is actually spawned
ANALYZER_PROCESS_NUM = 1
# Metrics carry no cluster label; assume all metrics belong to one cluster
NO_CLUSTER_LABEL = True
#################################################################################
# Global Metric Collecting Settings
#################################################################################
# The following settings specify the label names to use, so that a change in the
# metric labels only needs to be reflected here
CLUSTER_LABEL = "cluster"
POD_LABEL = "pod"
NODE_LABEL = "instance"
NAMESPACE_LABEL = "namespace"
# NOTE(review): presumably the label that carries the sub-metric name for pod
# metrics (node metrics set this per entry via "node_tag_name") — confirm
POD_METRIC_TAG = "value"
#################################################################################
# Global Metric Weights Settings
#################################################################################
# The following settings specify the weight calculation method used at each level:
#   Worst:       the type score is the lowest score among the metrics of that
#                metric type
#   Equal:       all metrics of one metric type have equal weight
#   WeightedSum: custom weights taken from the metric settings; the weights of all
#                metrics must sum to 1
#   Auto:        use the built-in weight algorithm
POD_WEIGHT_METHOD = "Worst" # one of: Worst, Equal, WeightedSum, Auto
NODE_WEIGHT_METHOD = "Worst" # one of: Worst, Equal, WeightedSum, Auto
CLUSTER_WEIGHT_METHOD = "Worst" # one of: Worst, Equal, WeightedSum, Auto
#################################################################################
# Global Alarm and Diagnose Settings
#################################################################################
# Size of the queue used to send diagnose requests from the analyzer to the
# diagnose worker
MAX_QUEUE_SIZE = 500
# Used to merge alarms
# NOTE(review): exact merge semantics (count per batch vs. threshold) are not
# visible in this file — confirm against the alarm code
ALARM_MERGE_NUM = 10
#################################################################################
# Cluster Metrics Settings
#################################################################################
# No cluster-level metrics are configured: every metric type maps to an empty list.
CLUSTER_METRICS = {
    "CapacityMetric": [],
    "LoadMetric": [],
    "LatencyMetric": [],
    "ErrorMetric": [],
}
#################################################################################
# Pod Metrics Settings
#################################################################################
# Each metric type maps to a list of metric entries. Every entry has:
#   "Description": human-readable name of the metric
#   "Collect":     settings for collecting and preprocessing the metric
#       "metric_name":   table name
#       "related_value": the specific metric(s) to read
#       "standard_type": 0 = non-standard (custom), 1 = already a usage value,
#                        2 = (usage / total * 100)
#   "Score":       settings for calculating the metric score
#       "weight": weight of the metric
#       "score":  mapping of score -> metric value (threshold)
POD_METRICS = {
    # The weights of all capacity metrics must sum to 1
    "CapacityMetric": [
        {
            "Description": "Pod memory util",
            "Collect": {
                "metric_name": "sysom_container_memUtil",
                "related_value": ["usage", "limit"],
                "standard_type": 2,
            },
            "Score": {
                "weight": 0.3,
                "score": {
                    "100": 70,  # 100 points (good): memory usage threshold 70%
                    "70": 80,  # 70 points (warning): memory usage threshold 80%
                    "60": 90,  # 60 points (error): memory usage threshold 90%
                    "0": 100,  # 0 points (fatal): memory usage threshold 100%
                },
            },
        },
        {
            "Description": "Pod cpu util",
            "Collect": {
                "metric_name": "sysom_container_cpuacct_stat",
                "related_value": ["total"],
                "standard_type": 1,
            },
            "Score": {
                "weight": 0.2,
                "score": {
                    "100": 70,  # 100 points (good): total cpu util threshold 70%
                    "70": 80,  # 70 points (warning): total cpu util threshold 80%
                    "60": 90,  # 60 points (error): total cpu util threshold 90%
                    "0": 100,  # 0 points (fatal): total cpu util threshold 100%
                },
            },
        },
        {
            "Description": "Pod sys util",
            "Collect": {
                "metric_name": "sysom_container_cpuacct_stat",
                "related_value": ["system"],
                "standard_type": 1,
            },
            "Score": {
                "weight": 0.5,
                "score": {
                    "100": 5,
                    "70": 10,
                    "60": 20,
                    "0": 30,
                },
            },
        },
    ],
    "LoadMetric": [
        {
            "Description": "Pod load average",
            "Collect": {
                "metric_name": "sysom_container_proc_stat",
                "related_value": ["r_load1min"],
                "standard_type": 1,
            },
            "Score": {
                "weight": 1.0,
                "score": {
                    "100": 0.7,
                    "70": 1,
                    "60": 5,
                    "0": 10,
                },
            },
        },
    ],
    "LatencyMetric": [
        {
            "Description": "Pod memory reclaim latency",
            "Collect": {
                "metric_name": "sysom_container_memdrcm_latency",
                # Latency buckets must be listed from smallest to largest
                "related_value": [
                    "memDrcm_lat_10to100ms",
                    "memDrcm_lat_100to500ms",
                    "memDrcm_lat_500to1000ms",
                    "memDrcm_lat_1000ms",
                ],
                "standard_type": 2,
            },
            "Score": {
                "weight": 1.0,
                "score": {
                    "100": 0,
                    "70": 100,
                    "60": 10000,
                    "0": 100000,
                },
            },
        },
        {
            "Description": "Pod memory compact latency",
            "Collect": {
                "metric_name": "sysom_container_memmcmp_latency",
                # Latency buckets must be listed from smallest to largest
                "related_value": [
                    "memDcmp_lat_10to100ms",
                    "memDcmp_lat_100to500ms",
                    "memDcmp_lat_500to1000ms",
                    "memDcmp_lat_1000ms",
                ],
                "standard_type": 2,
            },
            "Score": {
                "weight": 1.0,
                "score": {
                    "100": 0,
                    "70": 100,
                    "60": 10000,
                    "0": 100000,
                },
            },
        },
    ],
    "ErrorMetric": [
        {
            "Description": "Pod OOM count",
            "Collect": {
                "metric_name": "sysom_container_memory_oomcnt",
                "related_value": ["oom_kill"],
                "standard_type": 1,
            },
            "Score": {
                "weight": 1.0,
                "score": {
                    "100": 0,
                    "60": 1,
                    "0": 5,
                },
            },
        },
        {
            "Description": "Pod memory fail count",
            "Collect": {
                "metric_name": "sysom_container_memfail_cnt",
                "related_value": ["fail_cnt"],
                "standard_type": 1,
            },
            "Score": {
                "weight": 1.0,
                "score": {
                    "100": 0,
                    "80": 10,
                    "60": 50,
                    "0": 100,
                },
            },
        },
        {
            "Description": "Pod cpu throttled count",
            "Collect": {
                "metric_name": "sysom_container_cpu_stat",
                "related_value": ["nr_throttled"],
                "standard_type": 1,
            },
            "Score": {
                "weight": 1.0,
                "score": {
                    "100": 0,
                    "60": 1,
                    "0": 5,
                },
            },
        },
    ],
}
#################################################################################
# Nodes Metrics Settings
#################################################################################
# Each metric type maps to a list of metric entries. Every entry has:
#   "Description": human-readable name of the metric
#   "Collect":     settings for collecting and preprocessing the metric
#       "metric_name":   table name
#       "related_value": the specific metric(s) to read
#       "node_tag_name": NOTE(review): presumably the label whose value selects
#                        the entries in "related_value" — confirm
#       "standard_type": 0 = non-standard, 1 = already a usage value,
#                        2 = (usage / total * 100)
#       "filename":      present only on the standard_type 0 entries below;
#                        NOTE(review): presumably names the custom handler for
#                        the non-standard metric — confirm
#   "Score":       settings for calculating the metric score
#       "weight": weight of the metric
#       "score":  mapping of score -> metric value (threshold)
#   "Alarm":       (optional) settings for alerting and diagnosing
NODE_METRICS = {
    # The weights of all capacity metrics must sum to 1
    # NOTE(review): the six weights below add up to 1.4 (0.2 + 0.1 + 0.2 + 0.5 +
    # 0.2 + 0.2), which matters if NODE_WEIGHT_METHOD is switched to WeightedSum
    "CapacityMetric": [
        {
            "Description": "Node file descriptor util",
            "Collect": {
                "metric_name": "sysom_file_descriptor",
                "related_value": ["file-nr", "file-max"],
                "node_tag_name": "type",
                "standard_type": 0,
                "filename": "node_fd_util",
            },
            "Score": {
                "weight": 0.2,
                "score": {
                    "100": 40,  # 100 points (good): fd util threshold 40%
                    "60": 60,  # 60 points (warning): fd util threshold 60%
                    "30": 80,  # 30 points (error): fd util threshold 80%
                    "0": 100,  # 0 points (fatal): fd util threshold 100%
                },
            },
            "Alarm": {
                "threshold": 30,
                "diagnose_type": "custom",
                "service_name": "command",
            },
        },
        {
            "Description": "Node memory util",
            "Collect": {
                "metric_name": "sysom_proc_meminfo",
                "related_value": ["MemAvailable", "MemTotal"],
                "node_tag_name": "value",
                # NOTE(review): 3 is not among the documented standard_type
                # values (0, 1, 2) — confirm its meaning with the collector
                "standard_type": 3,
            },
            "Score": {
                "weight": 0.1,
                "score": {
                    "100": 70,  # 100 points (good): memory usage threshold 70%
                    "70": 80,  # 70 points (warning): memory usage threshold 80%
                    "60": 90,  # 60 points (error): memory usage threshold 90%
                    "0": 100,  # 0 points (fatal): memory usage threshold 100%
                },
            },
        },
        {
            "Description": "Node cpu util",
            "Collect": {
                "metric_name": "sysom_proc_cpu_total",
                "related_value": ["idle"],
                "node_tag_name": "mode",
                "standard_type": 0,
                "filename": "node_cpu_util",
            },
            "Score": {
                "weight": 0.2,
                "score": {
                    "100": 70,  # 100 points (good): total cpu util threshold 70%
                    "70": 80,  # 70 points (warning): total cpu util threshold 80%
                    "60": 90,  # 60 points (error): total cpu util threshold 90%
                    "0": 100,  # 0 points (fatal): total cpu util threshold 100%
                },
            },
        },
        {
            "Description": "Node sys util",
            "Collect": {
                "metric_name": "sysom_proc_cpu_total",
                "related_value": ["sys"],
                "node_tag_name": "mode",
                "standard_type": 1,
            },
            "Score": {
                "weight": 0.5,
                "score": {
                    "100": 5,
                    "70": 10,
                    "60": 20,
                    "0": 30,
                },
            },
        },
        {
            "Description": "Node rootfs util",
            "Collect": {
                "metric_name": "sysom_fs_stat",
                "related_value": ["f_bavail", "f_blocks", "f_bfree"],
                "node_tag_name": "counter",
                "standard_type": 0,
                "filename": "node_rootfs_util",
            },
            "Score": {
                "weight": 0.2,
                "score": {
                    "100": 50,
                    "70": 70,
                    "60": 90,
                    "0": 95,
                },
            },
        },
        {
            "Description": "Node rootfs inode util",
            "Collect": {
                "metric_name": "sysom_fs_stat",
                "related_value": ["f_favail", "f_files"],
                "node_tag_name": "counter",
                "standard_type": 0,
                "filename": "node_rootfs_inode_util",
            },
            "Score": {
                "weight": 0.2,
                "score": {
                    "100": 50,
                    "70": 70,
                    "60": 90,
                    "0": 95,
                },
            },
        },
    ],
    "LoadMetric": [
        {
            "Description": "Node load average",
            "Collect": {
                "metric_name": "sysom_proc_loadavg",
                "related_value": ["load1"],
                "node_tag_name": "value",
                "standard_type": 0,
                "filename": "node_load_avg",
            },
            "Score": {
                "weight": 1.0,
                "score": {
                    "100": 1,  # 100 points (good): load threshold 1
                    "70": 5,  # 70 points (warning): load threshold 5
                    "60": 10,  # 60 points (error): load threshold 10
                    "0": 20,  # 0 points (fatal): load threshold 20
                },
            },
        },
    ],
    "LatencyMetric": [
        {
            "Description": "Node sched latency",
            "Collect": {
                "metric_name": "sysom_cpu_dist",
                "related_value": ["ms10", "ms100", "s1"],
                "node_tag_name": "value",
                "standard_type": 2,
            },
            "Score": {
                "weight": 1.0,
                "score": {
                    "100": 40,  # 100 points (good): sched latency threshold 40
                    "70": 100,  # 70 points (warning): sched latency threshold 100
                    "30": 150,  # 30 points (error): sched latency threshold 150
                    "0": 200,  # 0 points (fatal): sched latency threshold 200
                },
            },
        },
    ],
    "ErrorMetric": [
        {
            "Description": "Node OOM count",
            "Collect": {
                "metric_name": "sysom_proc_vmstat",
                "related_value": ["oom_kill"],
                "node_tag_name": "value",
                "standard_type": 1,
            },
            "Score": {
                "weight": 1.0,
                "score": {
                    "100": 0,
                    "60": 1,
                    "0": 5,
                },
            },
        },
    ],
}