# Mirror of https://gitee.com/anolis/sysom.git (418 lines, 16 KiB, Python)
from .common import *

#################################################################################
# Base Settings
#################################################################################

# Prometheus settings used to collect metrics, read from the `db.prometheus`
# entry of the server configuration.
# NOTE(review): YAML_CONFIG is not defined in this file; presumably it is
# provided by `from .common import *` -- confirm.
PROMETHEUS_CONFIG = YAML_CONFIG.get_server_config().db.prometheus

# Interval between two rounds of collecting metrics from Prometheus and
# calculating the health score.
# NOTE(review): the unit is presumably seconds -- confirm against the consumer.
CALCULATE_INTERVAL = 30

# If True, use multiple threads to collect and calculate.
ENABLE_MULTI_THREAD = False

# Number of threads to use when ENABLE_MULTI_THREAD is enabled.
ANALYZER_PROCESS_NUM = 1

# If True, metrics carry no cluster label and all metrics are assumed to
# belong to a single cluster.
NO_CLUSTER_LABEL = True

#################################################################################
# Global Metric Collecting Settings
#################################################################################

# The following settings name the labels used on collected metrics, so that
# they can be adjusted in one place if the metric labels change.
CLUSTER_LABEL = "cluster"
POD_LABEL = "pod"
NODE_LABEL = "instance"
NAMESPACE_LABEL = "namespace"
POD_METRIC_TAG = "value"

#################################################################################
# Global Metric Weights Settings
#################################################################################

# The following settings select the weight-calculation method of each level:
#   Worst:       the type score is the lowest score among the metrics of
#                that metric type
#   Equal:       all metrics of one metric type have equal weight
#   WeightedSum: use the custom weights given in the metric settings; the
#                weights of all metrics must sum to 1
#   Auto:        use the built-in weight algorithm
POD_WEIGHT_METHOD = "Worst"  # Worst, Equal, WeightedSum, Auto
NODE_WEIGHT_METHOD = "Worst"  # Worst, Equal, WeightedSum, Auto
CLUSTER_WEIGHT_METHOD = "Worst"  # Worst, Equal, WeightedSum, Auto

#################################################################################
# Global Alarm and Diagnose Settings
#################################################################################

# Size of the queue used to send diagnose requests from the analyzer to the
# diagnose worker.
MAX_QUEUE_SIZE = 500

# Used to merge alarms.
# NOTE(review): the exact merge semantics are not visible in this file --
# confirm against the alarm code before changing.
ALARM_MERGE_NUM = 10

#################################################################################
# Cluster Metrics Settings
#################################################################################

# Cluster-level metrics, grouped by metric type.  No cluster-level metric is
# configured: every metric type maps to an empty list.
CLUSTER_METRICS = {
    "CapacityMetric": [],
    "LoadMetric": [],
    "LatencyMetric": [],
    "ErrorMetric": [],
}

#################################################################################
# Pod Metrics Settings
#################################################################################

# Pod-level metrics, grouped by metric type.  Every entry is a dict with:
#   "Description": human-readable description of the metric
#   "Collect":     settings for collecting and preprocessing the metric
#       "metric_name":   metric (table) name
#       "related_value": the specific values read from that metric
#       "standard_type": 0 = non-standard (custom), 1 = value is already a
#                        usage, 2 = usage / total * 100
#   "Score":       settings for calculating the metric score
#       "weight": weight of the metric
#       "score":  mapping of score (string key) -> metric value
# NOTE(review): how a metric value between two "score" entries is graded is
# decided by the scoring code, which is not part of this file.
POD_METRICS = {
    # The weights of all capacity metrics must sum to 1 (0.3 + 0.2 + 0.5).
    "CapacityMetric": [
        {
            "Description": "Pod memory util",
            "Collect": {
                "metric_name": "sysom_container_memUtil",  # table name
                "related_value": ["usage", "limit"],  # specific metric
                "standard_type": 2,
            },
            "Score": {
                "weight": 0.3,
                "score": {  # score: metric value
                    "100": 70,  # memory usage 70%  -> score 100 (good)
                    "70": 80,  # memory usage 80%  -> score 70 (warning)
                    "60": 90,  # memory usage 90%  -> score 60 (error)
                    "0": 100,  # memory usage 100% -> score 0 (fatal)
                },
            },
        },
        {
            "Description": "Pod cpu util",
            "Collect": {
                "metric_name": "sysom_container_cpuacct_stat",
                "related_value": ["total"],
                "standard_type": 1,
            },
            "Score": {
                "weight": 0.2,
                "score": {
                    "100": 70,  # total cpu util 70%  -> score 100 (good)
                    "70": 80,  # total cpu util 80%  -> score 70 (warning)
                    "60": 90,  # total cpu util 90%  -> score 60 (error)
                    "0": 100,  # total cpu util 100% -> score 0 (fatal)
                },
            },
        },
        {
            "Description": "Pod sys util",
            "Collect": {
                "metric_name": "sysom_container_cpuacct_stat",
                "related_value": ["system"],
                "standard_type": 1,
            },
            "Score": {
                "weight": 0.5,
                "score": {
                    "100": 5,
                    "70": 10,
                    "60": 20,
                    "0": 30,
                },
            },
        },
    ],
    "LoadMetric": [
        {
            "Description": "Pod load average",
            "Collect": {
                "metric_name": "sysom_container_proc_stat",
                "related_value": ["r_load1min"],
                "standard_type": 1,
            },
            "Score": {
                "weight": 1.0,
                "score": {
                    "100": 0.7,
                    "70": 1,
                    "60": 5,
                    "0": 10,
                },
            },
        },
    ],
    # NOTE(review): the two latency weights sum to 2.0.  The sum-to-1 rule is
    # stated for the "WeightedSum" method; revisit these weights before
    # switching POD_WEIGHT_METHOD away from "Worst".
    "LatencyMetric": [
        {
            "Description": "Pod memory reclaim latency",
            "Collect": {
                "metric_name": "sysom_container_memdrcm_latency",
                # Latency buckets must be listed in ascending order.
                "related_value": [
                    "memDrcm_lat_10to100ms",
                    "memDrcm_lat_100to500ms",
                    "memDrcm_lat_500to1000ms",
                    "memDrcm_lat_1000ms",
                ],
                "standard_type": 2,
            },
            "Score": {
                "weight": 1.0,
                "score": {
                    "100": 0,
                    "70": 100,
                    "60": 10000,
                    "0": 100000,
                },
            },
        },
        {
            "Description": "Pod memory compact latency",
            "Collect": {
                # NOTE(review): the metric name spells "memmcmp" while the
                # values below use "memDcmp" -- verify against the exporter.
                "metric_name": "sysom_container_memmcmp_latency",
                # Latency buckets must be listed in ascending order.
                "related_value": [
                    "memDcmp_lat_10to100ms",
                    "memDcmp_lat_100to500ms",
                    "memDcmp_lat_500to1000ms",
                    "memDcmp_lat_1000ms",
                ],
                "standard_type": 2,
            },
            "Score": {
                "weight": 1.0,
                "score": {
                    "100": 0,
                    "70": 100,
                    "60": 10000,
                    "0": 100000,
                },
            },
        },
    ],
    # NOTE(review): the three error weights sum to 3.0; see the note on
    # "LatencyMetric" above.
    "ErrorMetric": [
        {
            "Description": "Pod OOM count",
            "Collect": {
                "metric_name": "sysom_container_memory_oomcnt",
                "related_value": ["oom_kill"],
                "standard_type": 1,
            },
            "Score": {
                "weight": 1.0,
                "score": {
                    "100": 0,
                    "60": 1,
                    "0": 5,
                },
            },
        },
        {
            "Description": "Pod memory fail count",
            "Collect": {
                "metric_name": "sysom_container_memfail_cnt",
                "related_value": ["fail_cnt"],
                "standard_type": 1,
            },
            "Score": {
                "weight": 1.0,
                "score": {
                    "100": 0,
                    "80": 10,
                    "60": 50,
                    "0": 100,
                },
            },
        },
        {
            "Description": "Pod cpu throttled count",
            "Collect": {
                "metric_name": "sysom_container_cpu_stat",
                "related_value": ["nr_throttled"],
                "standard_type": 1,
            },
            "Score": {
                "weight": 1.0,
                "score": {
                    "100": 0,
                    "60": 1,
                    "0": 5,
                },
            },
        },
    ],
}

#################################################################################
# Nodes Metrics Settings
#################################################################################

# Node-level metrics, grouped by metric type.  Every entry is a dict with:
#   "Description": human-readable description of the metric
#   "Collect":     settings for collecting and preprocessing the metric
#       "metric_name":   metric (table) name
#       "related_value": the specific values read from that metric
#       "node_tag_name": NOTE(review): presumably the label whose value is
#                        matched against "related_value" -- confirm
#       "standard_type": 0 = non-standard (custom), 1 = value is already a
#                        usage, 2 = usage / total * 100
#       "filename":      only present on standard_type 0 entries;
#                        NOTE(review): presumably names the custom
#                        processing module -- confirm
#   "Score":       settings for calculating the metric score
#       "weight": weight of the metric
#       "score":  mapping of score (string key) -> metric value
#   "Alarm":       (optional) settings for alerting and diagnosing
# NOTE(review): how a metric value between two "score" entries is graded is
# decided by the scoring code, which is not part of this file.
NODE_METRICS = {
    # The weights of all capacity metrics must sum to 1.
    # NOTE(review): the six weights below sum to 1.4
    # (0.2 + 0.1 + 0.2 + 0.5 + 0.2 + 0.2).  The sum-to-1 rule is stated for
    # the "WeightedSum" method; fix these weights before switching
    # NODE_WEIGHT_METHOD away from "Worst".
    "CapacityMetric": [
        {
            "Description": "Node file descriptor util",
            "Collect": {
                "metric_name": "sysom_file_descriptor",
                "related_value": ["file-nr", "file-max"],
                "node_tag_name": "type",
                "standard_type": 0,
                "filename": "node_fd_util",
            },
            "Score": {
                "weight": 0.2,
                "score": {
                    "100": 40,  # fd util 40%  -> score 100 (good)
                    "60": 60,  # fd util 60%  -> score 60
                    "30": 80,  # fd util 80%  -> score 30
                    "0": 100,  # fd util 100% -> score 0 (fatal)
                },
            },
            "Alarm": {  # settings for alerting and diagnosing
                "threshold": 30,
                "diagnose_type": "custom",
                "service_name": "command",
            },
        },
        {
            "Description": "Node memory util",
            "Collect": {
                "metric_name": "sysom_proc_meminfo",  # table name
                "related_value": ["MemAvailable", "MemTotal"],  # specific metric
                "node_tag_name": "value",
                # NOTE(review): type 3 is not described by the 0/1/2 legend;
                # presumably (1 - available / total) * 100 -- confirm.
                "standard_type": 3,
            },
            "Score": {
                "weight": 0.1,
                "score": {  # score: metric value
                    "100": 70,  # memory usage 70%  -> score 100 (good)
                    "70": 80,  # memory usage 80%  -> score 70 (warning)
                    "60": 90,  # memory usage 90%  -> score 60 (error)
                    "0": 100,  # memory usage 100% -> score 0 (fatal)
                },
            },
        },
        {
            "Description": "Node cpu util",
            "Collect": {
                "metric_name": "sysom_proc_cpu_total",
                "related_value": ["idle"],
                "node_tag_name": "mode",
                "standard_type": 0,
                "filename": "node_cpu_util",
            },
            "Score": {
                "weight": 0.2,
                "score": {
                    "100": 70,  # total cpu util 70%  -> score 100 (good)
                    "70": 80,  # total cpu util 80%  -> score 70 (warning)
                    "60": 90,  # total cpu util 90%  -> score 60 (error)
                    "0": 100,  # total cpu util 100% -> score 0 (fatal)
                },
            },
        },
        {
            "Description": "Node sys util",
            "Collect": {
                "metric_name": "sysom_proc_cpu_total",
                "related_value": ["sys"],
                "node_tag_name": "mode",
                "standard_type": 1,
            },
            "Score": {
                "weight": 0.5,
                "score": {
                    "100": 5,
                    "70": 10,
                    "60": 20,
                    "0": 30,
                },
            },
        },
        {
            "Description": "Node rootfs util",
            "Collect": {
                "metric_name": "sysom_fs_stat",
                "related_value": ["f_bavail", "f_blocks", "f_bfree"],
                "node_tag_name": "counter",
                "standard_type": 0,
                "filename": "node_rootfs_util",
            },
            "Score": {
                "weight": 0.2,
                "score": {
                    "100": 50,
                    "70": 70,
                    "60": 90,
                    "0": 95,
                },
            },
        },
        {
            "Description": "Node rootfs inode util",
            "Collect": {
                "metric_name": "sysom_fs_stat",
                "related_value": ["f_favail", "f_files"],
                "node_tag_name": "counter",
                "standard_type": 0,
                "filename": "node_rootfs_inode_util",
            },
            "Score": {
                "weight": 0.2,
                "score": {
                    "100": 50,
                    "70": 70,
                    "60": 90,
                    "0": 95,
                },
            },
        },
    ],
    "LoadMetric": [
        {
            "Description": "Node load average",
            "Collect": {
                "metric_name": "sysom_proc_loadavg",
                "related_value": ["load1"],
                "node_tag_name": "value",
                "standard_type": 0,
                "filename": "node_load_avg",
            },
            "Score": {
                "weight": 1.0,
                "score": {
                    "100": 1,  # processed load value 1  -> score 100 (good)
                    "70": 5,  # processed load value 5  -> score 70 (warning)
                    "60": 10,  # processed load value 10 -> score 60 (error)
                    "0": 20,  # processed load value 20 -> score 0 (fatal)
                },
            },
        },
    ],
    "LatencyMetric": [
        {
            "Description": "Node sched latency",
            "Collect": {
                "metric_name": "sysom_cpu_dist",
                "related_value": ["ms10", "ms100", "s1"],
                "node_tag_name": "value",
                "standard_type": 2,
            },
            "Score": {
                "weight": 1.0,
                "score": {
                    "100": 40,  # processed latency value 40  -> score 100 (good)
                    "70": 100,  # processed latency value 100 -> score 70 (warning)
                    "30": 150,  # processed latency value 150 -> score 30
                    "0": 200,  # processed latency value 200 -> score 0 (fatal)
                },
            },
        },
    ],
    "ErrorMetric": [
        {
            "Description": "Node OOM count",
            "Collect": {
                "metric_name": "sysom_proc_vmstat",
                "related_value": ["oom_kill"],
                "node_tag_name": "value",
                "standard_type": 1,
            },
            "Score": {
                "weight": 1.0,
                "score": {
                    "100": 0,
                    "60": 1,
                    "0": 5,
                },
            },
        },
    ],
}