# File: sysom1/sysom_server/sysom_monitor_server/scripts/base.yaml
# (viewer listing: 461 lines, 12 KiB, YAML)

# Core settings.
# NOTE(review): the indentation of this section was lost; the nesting below is
# reconstructed from key names and the existing comments — confirm against the
# deployed base.yaml (in particular that `cellLimit` belongs under `limit`).
config:
  freq: 30 # unit: second
  port: 8400 # bind port
  bind_addr: 0.0.0.0 # bind ip
  backlog: 32 # listen backlog
  # daemon: true
  url_safe: close
  # Supported identity modes: hostip, curl (needs `url`), hostname,
  # file (needs `path`), specify (needs `name`).
  identity:
    # mode: curl
    # url: "http://100.100.100.200/latest/meta-data/instance-id"
    # name: test_specify
    mode: hostip
  # real_timestamps: true
  # unix_socket: "/tmp/sysom_unity.sock"
  # In container mode (e.g. -v /:/mnt/host) this should be /mnt/host/.
  proc_path: /
  db:
    rotate: 7 # tsdb file retention time, unit: day
    budget: 200 # max query buffer from tsdb
  limit:
    cpu: 30 # unit: %
    mem: 200 # unit: MB
    tasks: 10 # monitor at most 10 pids
    # Guard time limit; -1 means unlimited. Default is 50 (ms).
    cellLimit: -1
# Commands with their argument lists, one list item per command.
# Each `args` must be indented under its `- cmd` entry; at column 0 it detaches
# from the item and the following `- cmd` becomes an invalid sequence entry.
forkRun:
  - cmd: "../../../ntopo"
    args: []
  - cmd: "../../../sql-obs"
    args: ["-y", "/etc/sysak/base.yaml"]
  - cmd: "../../../ioMonitor"
    args: ["-y", "/etc/sysak/base.yaml"]
# Push target settings. The keys are nested under `pushTo` so that `port` here
# does not collide with `config.port`.
pushTo:
  to: "Influx"
  host: "localhost"
  port: 80
  url: "/api/v1/cec_proxy/proxy/line_protocol"
# List of outline paths.
outline:
  - /var/sysom/outline
# Container monitoring settings. `mode`, `runtime` and `luaPlugin` are nested
# under `container` so that `mode` does not collide with `config.identity.mode`.
container:
  mode: "pods"
  runtime: ["k8sApi", "docker"]
  # Lua plugins listed for the container section.
  luaPlugin:
    - "cg_cpu_cfs_quota"
    - "cg_mem_drcm_glob_latency"
    - "cg_memory_util"
    - "cg_cpu_stat_sample"
    - "cg_cpuacct_stat"
    - "cg_memory_drcm_latency"
    - "cg_memory_fail_cnt"
    - "cg_memory_dcmp_latency"
    - "cg_cpuacct_wait_latency"
    - "con_net_stat"
    - "cg_blkio_stat"
    - "podmem"
# Top-level Lua plugin list, one plugin name per item.
luaPlugins:
  - "proc_buddyinfo"
  - "proc_diskstats"
  - "proc_meminfo"
  - "proc_mounts"
  - "proc_netdev"
  - "proc_snmp_stat"
  - "proc_sockstat"
  - "proc_stat"
  - "proc_statm"
  - "proc_vmstat"
  - "proc_uptime"
  - "proc_arp"
  - "proc_cgroups"
  - "proc_softirqs"
  - "proc_softnet_stat"
# resctrl settings.
resctrl:
  # NOTE(review): no leading slash — presumably resolved relative to
  # config.proc_path; confirm against the consumer.
  path: "sys/fs/resctrl"
  monLuaPlugin: ["rdt_llc_occupancy", "rdt_local_mem_bw", "rdt_total_mem_bw"]
  resLuaPlugin: ["rdt_size"]
  # When true, all mon-groups and res-groups are searched and the `group`
  # setting is ignored.
  auto: true
  # group:
  #   - name: ""
  #   - name: "LS"
  #     # monitor: ["mon1", "mon2"]
  #   - name: "BE"
  #     monitor: ["pod3#con3"]
# Shared-object plugins: each item names the plugin (`so`) and carries a
# human-readable `description`.
plugins:
  - so: kmsg
    description: "collect dmesg info."
  - so: proc_schedstat
    description: "collect schedule stat info of percpu"
  - so: proc_loadavg
    description: "collect load avg"
  - so: net_health
    description: "tcp net health."
  - so: net_retrans
    description: "tcp retrans monitor."
  - so: unity_nosched
    description: "nosched:sys hold cpu and didn't scheduling"
  # irqoff may eat up the perf_event_fd, which makes pmu_events fail,
  # so it is disabled temporarily.
  # - so: unity_irqoff
  #   description: "irqoff:detect irq turned off and can't response"
  - so: gpuinfo
    description: "collect gpuinfo"
  - so: uncore_imc
    description: "IMC latency and bandwidth"
  - so: pmu_events
    description: "collect pmu events"
  - so: cpufreq
    description: "collect cpu frequency per cpu"
# Metric definitions. Every item carries `title`, `from`, `head`, `help` and
# `type`; `discrete` is optional (true means the data are discrete, default is
# false).
metrics:
  - title: sysom_java_app
    from: sysom_java_app
    head: value
    help: "java application observe."
    type: "gauge"
  - title: sysom_observe
    from: observe
    head: value
    help: "application observe."
    type: "gauge"
  - title: sysom_proc_cpu_total
    from: cpu_total
    head: mode
    help: "cpu usage info for total."
    type: "gauge"
    discrete: true
  - title: sysom_proc_cpus
    from: cpus
    head: mode
    help: "cpu usage info for per-cpu."
    type: "gauge"
  - title: sysom_proc_sirq
    from: sirq
    head: type
    help: "system soft irq times."
    type: "gauge"
  - title: sysom_proc_stat_counters
    from: stat_counters
    head: counter
    help: "system state counter."
    type: "gauge"
  - title: sysom_proc_meminfo
    from: meminfo
    head: value
    help: "meminfo from /proc/meminfo."
    type: "gauge"
  - title: sysom_proc_vmstat
    from: vmstat
    head: value
    help: "vmstat info from /proc/vmstat."
    type: "gauge"
  - title: sysom_proc_self_statm
    from: self_statm
    head: value
    help: "statm info from /proc/self/statm."
    type: "gauge"
  - title: sysom_proc_networks
    from: networks
    head: counter
    help: "networks info from /proc/net/dev."
    type: "gauge"
  - title: sysom_proc_disks
    from: disks
    head: counter
    help: "disk info from /proc/diskstats."
    type: "gauge"
  # NOTE(review): "/proc/net/status" below may be meant as /proc/net/netstat —
  # confirm before changing the help text.
  - title: sysom_proc_pkt_status
    from: pkt_status
    head: counter
    help: "net status info from /proc/net/snmp and /proc/net/status."
    type: "gauge"
  - title: sysom_fs_stat
    from: fs_stat
    head: counter
    help: "file system information."
    type: "gauge"
  - title: sysom_sock_stat
    from: sock_stat
    head: value
    help: "sock stat counters from /proc/net/sockstat"
    type: "gauge"
  - title: sysom_proc_schedstat
    from: proc_schedstat
    head: value
    help: "schedule state of percpu."
    type: "gauge"
  - title: sysom_proc_loadavg
    from: proc_loadavg
    head: value
    help: "loadavg of system from /proc/loadavg"
    type: "gauge"
  - title: sysom_proc_buddyinfo
    from: buddyinfo
    head: value
    help: "buddyinfo of system from /proc/buddyinfo"
    type: "gauge"
  - title: sysom_IOMonIndForDisksIO
    from: IOMonIndForDisksIO
    head: value
    help: "Disk IO indicators and abnormal events"
    type: "gauge"
  - title: sysom_IOMonIndForSystemIO
    from: IOMonIndForSystemIO
    head: value
    help: "System indicators and abnormal events about IO"
    type: "gauge"
  - title: sysom_IOMonDiagLog
    from: IOMonDiagLog
    head: value
    help: "Diagnose log for IO exception"
    type: "gauge"
  - title: sched_moni_jitter
    from: sched_moni_jitter
    head: value
    help: "nosched/irqoff:sys and irqoff hold cpu and didn't scheduling"
    type: "gauge"
  - title: sysom_cpu_dist
    from: cpu_dist
    head: value
    help: "task cpu sched dist."
    type: "gauge"
  - title: sysom_net_health_hist
    from: net_health_hist
    head: value
    help: "net_health_hist"
    type: "gauge"
  - title: sysom_net_health_count
    from: net_health_count
    head: value
    help: "net_health_count"
    type: "gauge"
  - title: sysom_net_retrans_count
    from: net_retrans_count
    head: value
    help: "net_retrans_count"
    type: "gauge"
  - title: sysom_gpuinfo
    from: gpuinfo
    head: value
    help: "gpuinfo of system from nvidia-smi"
    type: "gauge"
  - title: sysom_uname
    from: uname
    head: value
    help: "uname info"
    type: "gauge"
  - title: sysom_uptime
    from: uptime
    head: value
    help: "uptime from /proc/uptime"
    type: "gauge"
  - title: sysom_system_release
    from: system_release
    head: value
    help: "system_release from /etc/os-release"
    type: "gauge"
  - title: sysom_cgroups
    from: cgroups
    head: value
    help: "cgroup number."
    type: "gauge"
  - title: sysom_per_sirqs
    from: per_sirqs
    head: value
    help: "per_sirqs."
    type: "gauge"
  # The help text here used to be "cgroup number.", copied from sysom_cgroups.
  - title: sysom_softnets
    from: softnets
    head: value
    help: "softnet stat."
    type: "gauge"
  - title: sysom_interrupts
    from: interrupts
    head: value
    help: "interrupts."
    type: "gauge"
  - title: sysom_net_ip_count
    from: net_ip_count
    head: value
    help: "net snmp net_ip_count"
    type: "gauge"
  - title: sysom_net_icmp_count
    from: net_icmp_count
    head: value
    help: "net snmp net_icmp_count"
    type: "gauge"
  - title: sysom_net_udp_count
    from: net_udp_count
    head: value
    help: "net snmp net_udp_count"
    type: "gauge"
  - title: sysom_net_tcp_count
    from: net_tcp_count
    head: value
    help: "net snmp net_tcp_count"
    type: "gauge"
  - title: sysom_net_tcp_ext_count
    from: net_tcp_ext_count
    head: value
    help: "net stat net_tcp_ext_count"
    type: "gauge"
  - title: sysom_ntopo_node
    from: sysom_metrics_ntopo_node
    head: mode
    help: "net topology node"
    type: "gauge"
  - title: sysom_ntopo_edge
    from: sysom_metrics_ntopo_edge
    head: mode
    help: "net topology edge"
    type: "gauge"
  - title: sysom_obser_app_rt_ntopo
    from: sysom_metrics_ntopo_request
    head: value
    help: "rt and req/resp bytes of apps"
    type: "gauge"
    discrete: true
  - title: sysom_obser_mysqld_os
    from: sysom_obser_metrics_mysqld_os
    head: value
    help: "os level metrics of mysqld"
    type: "gauge"
  - title: sysom_obser_mysqld_process
    from: sysom_obser_metrics_mysqld_process
    head: value
    help: "process level metrics of mysqld"
    type: "gauge"
  - title: sysom_obser_mysqld_innodb
    from: sysom_obser_metrics_mysqld_innodb
    head: value
    help: "innodb metrics of mysqld"
    type: "gauge"
  - title: sysom_obser_mysqld_alarm
    from: sysom_obser_mysqld_alarm
    head: value
    help: "alarm on process exception of mysqld"
    type: "gauge"
  - title: sysom_obser_os_alarm
    from: sysom_obser_os_alarm
    head: value
    help: "alarm on os exception of mysqld"
    type: "gauge"
  - title: sysom_podmem
    from: podmem
    head: value
    help: "file cache for pod"
    type: "gauge"
  - title: sysom_container_memfail_cnt
    from: cg_memfail_cnt
    head: value
    help: "sysom_container_memFail_cnt"
    type: "gauge"
  - title: sysom_container_memUtil
    from: cg_memory_util
    head: value
    help: "sysom_container_memory_util"
    type: "gauge"
  - title: sysom_container_memgdrcm_latency
    from: cgGlbDrcmLatency
    head: value
    help: "sysom global memory latency"
    type: "gauge"
  - title: sysom_container_memdrcm_latency
    from: cg_memdrcm_latency
    head: value
    help: "sysom_container_memdrcm_latency"
    type: "gauge"
  - title: sysom_container_memmcmp_latency
    from: cg_memmcmp_latency
    head: value
    help: "sysom_container_memmcmp_latency"
    type: "gauge"
  - title: sysom_container_cpu_stat
    from: cg_cpu_stat
    head: value
    help: "sysom_container_cpu_stat"
    type: "gauge"
  - title: sysom_container_cpuacct_stat
    from: cg_cpuacct_stat
    head: value
    help: "cpuacct/cpuacct.stat"
    type: "gauge"
  - title: sysom_container_cfs_quota
    from: cgCpuQuota
    head: value
    help: "cfs quota"
    type: "gauge"
  - title: sysom_container_network_stat
    from: con_net_stat
    head: value
    help: "network stat of containers"
    type: "gauge"
  - title: sysom_container_blkio_stat
    from: cg_blkio_stat
    head: value
    help: "io stat from blkio cgroup"
    type: "gauge"
  - title: sysom_container_cpuacct_wait_latency
    from: cg_wait_latency
    head: value
    help: "wait_latency of cgroups"
    type: "gauge"
  - title: sysom_imc_channel_event
    from: imc_channel_event
    head: value
    help: "imc latency and bw for channels"
    type: "gauge"
  - title: sysom_imc_event_socket
    from: imc_socket_event
    head: value
    help: "imc latency and bw for socket"
    type: "gauge"
  - title: sysom_imc_event_node
    from: imc_node_event
    head: value
    help: "imc latency and bw for node"
    type: "gauge"
  - title: sysom_rdt_usage
    from: rdt_usage
    head: value
    help: "RDT LLC and memory bandwidth usage"
    type: "gauge"
  - title: sysom_rdt_alloc_policy
    from: rdt_alloc_policy
    head: value
    help: "RDT LLC and memory bandwidth allocation policy"
    type: "gauge"
  - title: sysom_pmu_events
    from: pmu_events
    head: value
    help: "pmu events, such as cycles/instructions, llc events"
    type: "gauge"
  - title: sysom_pmu_events_percpu
    from: pmu_events_percpu
    head: value
    help: "pmu events of percpu"
    type: "gauge"
  - title: sysom_cpu_freq
    from: cpufreq
    head: value
    help: "the frequency of percpu"
    type: "gauge"
# Observe settings: `comms` maps a process comm name to a mode string.
# NOTE(review): indentation was lost; `period` is placed as a sibling of
# `comms` (not as a comm entry) — confirm against the consumer. Its unit is not
# stated in this file.
observe:
  comms:
    java: "cgroup"
    mysqld: "cgroup"
  period: 20000