!26 add collect module and avg_block_io plugin to sysSentry
From: @zhuofeng6 Reviewed-by: @gaoruoshu Signed-off-by: @gaoruoshu
This commit is contained in:
commit
9a453a260d
1165
add-collect-module-to-sysSentry.patch
Normal file
1165
add-collect-module-to-sysSentry.patch
Normal file
File diff suppressed because it is too large
Load Diff
572
feature-add-avg_block_io-plugin.patch
Normal file
572
feature-add-avg_block_io-plugin.patch
Normal file
@ -0,0 +1,572 @@
|
|||||||
|
From acb77d6a69aa9269b0f691613bef53efd0c01e53 Mon Sep 17 00:00:00 2001
|
||||||
|
From: gaoruoshu <gaoruoshu@huawei.com>
|
||||||
|
Date: Thu, 12 Sep 2024 11:31:34 +0800
|
||||||
|
Subject: [PATCH 2/2] add avg_block_io plugin
|
||||||
|
|
||||||
|
---
|
||||||
|
config/plugins/avg_block_io.ini | 21 ++
|
||||||
|
config/tasks/avg_block_io.mod | 5 +
|
||||||
|
src/python/sentryPlugins/__init__.py | 0
|
||||||
|
.../sentryPlugins/avg_block_io/__init__.py | 0
|
||||||
|
.../avg_block_io/avg_block_io.py | 257 ++++++++++++++++++
|
||||||
|
.../sentryPlugins/avg_block_io/module_conn.py | 86 ++++++
|
||||||
|
.../avg_block_io/stage_window.py | 47 ++++
|
||||||
|
.../sentryPlugins/avg_block_io/utils.py | 86 ++++++
|
||||||
|
8 files changed, 502 insertions(+)
|
||||||
|
create mode 100644 config/plugins/avg_block_io.ini
|
||||||
|
create mode 100644 config/tasks/avg_block_io.mod
|
||||||
|
create mode 100644 src/python/sentryPlugins/__init__.py
|
||||||
|
create mode 100644 src/python/sentryPlugins/avg_block_io/__init__.py
|
||||||
|
create mode 100644 src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||||
|
create mode 100644 src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||||
|
create mode 100644 src/python/sentryPlugins/avg_block_io/stage_window.py
|
||||||
|
create mode 100644 src/python/sentryPlugins/avg_block_io/utils.py
|
||||||
|
|
||||||
|
diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000..bc33dde
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/config/plugins/avg_block_io.ini
|
||||||
|
@@ -0,0 +1,21 @@
|
||||||
|
+[common]
|
||||||
|
+disk=default
|
||||||
|
+stage=default
|
||||||
|
+iotype=read,write
|
||||||
|
+period_time=1
|
||||||
|
+
|
||||||
|
+[algorithm]
|
||||||
|
+win_size=30
|
||||||
|
+win_threshold=6
|
||||||
|
+
|
||||||
|
+[latency]
|
||||||
|
+read_avg_lim=10
|
||||||
|
+write_avg_lim=10
|
||||||
|
+read_avg_time=3
|
||||||
|
+write_avg_time=3
|
||||||
|
+read_tot_lim=50
|
||||||
|
+write_tot_lim=50
|
||||||
|
+
|
||||||
|
+[iodump]
|
||||||
|
+read_iodump_lim=0
|
||||||
|
+write_iodump_lim=0
|
||||||
|
diff --git a/config/tasks/avg_block_io.mod b/config/tasks/avg_block_io.mod
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000..814c483
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/config/tasks/avg_block_io.mod
|
||||||
|
@@ -0,0 +1,5 @@
|
||||||
|
+[common]
|
||||||
|
+enabled=yes
|
||||||
|
+task_start=/usr/bin/python3 /usr/bin/avg_block_io
|
||||||
|
+task_stop=pkill avg_block_io
|
||||||
|
+type=oneshot
|
||||||
|
\ No newline at end of file
|
||||||
|
diff --git a/src/python/sentryPlugins/__init__.py b/src/python/sentryPlugins/__init__.py
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000..e69de29
|
||||||
|
diff --git a/src/python/sentryPlugins/avg_block_io/__init__.py b/src/python/sentryPlugins/avg_block_io/__init__.py
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000..e69de29
|
||||||
|
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000..ff2071d
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||||
|
@@ -0,0 +1,257 @@
|
||||||
|
+# coding: utf-8
|
||||||
|
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
|
||||||
|
+# sysSentry is licensed under the Mulan PSL v2.
|
||||||
|
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
||||||
|
+# You may obtain a copy of Mulan PSL v2 at:
|
||||||
|
+# http://license.coscl.org.cn/MulanPSL2
|
||||||
|
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
|
||||||
|
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
|
||||||
|
+# PURPOSE.
|
||||||
|
+# See the Mulan PSL v2 for more details.
|
||||||
|
+import logging
|
||||||
|
+import signal
|
||||||
|
+import configparser
|
||||||
|
+import time
|
||||||
|
+
|
||||||
|
+from .stage_window import IoWindow, IoDumpWindow
|
||||||
|
+from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler
|
||||||
|
+from .utils import update_avg_and_check_abnormal
|
||||||
|
+
|
||||||
|
+CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini"
|
||||||
|
+
|
||||||
|
def log_invalid_keys(not_in_list, keys_name, config_list, default_list):
    """Log how a list option from the [common] section was resolved.

    Args:
        not_in_list: configured entries that were rejected as invalid.
        keys_name: option name used in the message (e.g. 'disk', 'stage').
        config_list: entries taken from the configuration file.
        default_list: entries that will actually be used.
    """
    if config_list and not_in_list and default_list:
        # Only warn when something was really rejected; a fully valid
        # configuration must not produce an "[] ... are not valid" message.
        logging.warning("{} in common.{} are not valid, set {}={}".format(not_in_list, keys_name, keys_name, default_list))
    elif config_list == ["default"]:
        logging.warning("Default {} use {}".format(keys_name, default_list))
|
||||||
|
+
|
||||||
|
+
|
||||||
|
def _split_config_list(raw_value):
    """Split a comma-separated option into stripped, non-empty items."""
    if raw_value is None:
        return []
    return [item.strip() for item in raw_value.split(",") if item.strip()]


def read_config_common(config):
    """Read the [common] section of the plugin configuration.

    Args:
        config: a loaded configparser.ConfigParser.

    Returns:
        Tuple (period_time, disk, stage, iotype_list). ``disk`` and ``stage``
        are empty lists when the option is missing or set to 'default'.
        ``period_time`` falls back to 1 when it is not an integer in [1, 300].
    """
    # Mapping-style access (config['common']) raises KeyError rather than
    # NoSectionError, so test for the section explicitly.
    if not config.has_section("common"):
        report_alarm_fail("Cannot find common section in config file")
    common_sec = config["common"]

    try:
        period_time = int(common_sec.get("period_time", 1))
        if not (1 <= period_time <= 300):
            raise ValueError("Invalid period_time")
    except ValueError:
        period_time = 1
        logging.warning("Invalid period_time, set to 1s")

    raw_disk = common_sec.get('disk')
    raw_stage = common_sec.get('stage')
    # Items are stripped so that "sda, sdb" yields 'sdb' and not ' sdb'.
    disk = [] if raw_disk in (None, 'default') else _split_config_list(raw_disk)
    stage = [] if raw_stage in (None, 'default') else _split_config_list(raw_stage)

    if len(disk) > 10:
        logging.warning("Too many disks, record only max 10 disks")
        disk = disk[:10]

    valid_iotypes = ('read', 'write', 'flush', 'discard')
    iotype = _split_config_list(common_sec.get('iotype', 'read,write'))
    iotype_list = [rw.lower() for rw in iotype if rw.lower() in valid_iotypes]
    err_iotype = [rw for rw in iotype if rw.lower() not in valid_iotypes]

    if err_iotype:
        logging.warning("{} in common.iotype are not valid, set iotype={}".format(err_iotype, iotype_list))

    return period_time, disk, stage, iotype_list
|
||||||
|
+
|
||||||
|
+
|
||||||
|
def read_config_algorithm(config):
    """Read win_size and win_threshold from the [algorithm] section.

    Args:
        config: a loaded configparser.ConfigParser.

    Returns:
        Tuple (win_size, win_threshold), where 1 <= win_size <= 300 and
        1 <= win_threshold <= min(300, win_size).

    Any missing section, missing option or out-of-range value is reported
    through report_alarm_fail.
    """
    if not config.has_section("algorithm"):
        report_alarm_fail("Cannot find algorithm section in config file")

    try:
        win_size = int(config.get("algorithm", "win_size"))
        if not (1 <= win_size <= 300):
            raise ValueError("Invalid win_size")
        win_threshold = int(config.get("algorithm", "win_threshold"))
        if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size:
            raise ValueError("Invalid win_threshold")
    except (ValueError, configparser.NoOptionError):
        # NoOptionError is raised when either key is absent; treat it like
        # an invalid value instead of crashing with a traceback.
        report_alarm_fail("Invalid win_threshold or win_size")

    return win_size, win_threshold
|
||||||
|
+
|
||||||
|
+
|
||||||
|
def read_config_lat_iodump(io_dic, config):
    """Read the [latency] and [iodump] limits for every configured io type.

    Args:
        io_dic: dict providing "iotype_list".
        config: a loaded configparser.ConfigParser (or a mapping of sections).

    Returns:
        Dict mapping each io type to a dict of the integer limits found for
        it, keyed by option name (e.g. "read_avg_lim"). Options that are
        missing or not decimal strings are left out; a missing section simply
        contributes no options.
    """
    # A section may be absent altogether; treat that as "no options set"
    # instead of raising KeyError.
    latency_sec = config["latency"] if "latency" in config else {}
    iodump_sec = config["iodump"] if "iodump" in config else {}

    common_param = {}
    for io_type in io_dic["iotype_list"]:
        param = {}

        for suffix in ("avg_lim", "avg_time", "tot_lim"):
            key = "{}_{}".format(io_type, suffix)
            value = latency_sec.get(key)
            if value is not None and value.isdecimal():
                param[key] = int(value)

        iodump_key = "{}_iodump_lim".format(io_type)
        value = iodump_sec.get(iodump_key)
        if value is not None and value.isdecimal():
            param[iodump_key] = int(value)

        common_param[io_type] = param

    return common_param
|
||||||
|
+
|
||||||
|
+
|
||||||
|
def read_config_stage(config, stage, iotype_list):
    """Collect the integer options of the section named after *stage*.

    Returns an empty dict when the section does not exist. Options whose
    value is not a decimal string are skipped. *iotype_list* is accepted for
    interface compatibility and is not consulted.
    """
    if stage not in config:
        return {}

    section = config[stage]
    return {
        option: int(section[option])
        for option in section
        if section[option].isdecimal()
    }
|
||||||
|
+
|
||||||
|
+
|
||||||
|
def init_io_win(io_dic, config, common_param):
    """Build the latency/iodump windows and the running-average table.

    Both returned dicts are nested as [disk][stage][io type]:

    * io_data leaves are dicts that may hold a "latency" IoWindow and/or an
      "iodump" IoDumpWindow;
    * io_avg_value leaves start as [0, 0].

    A limit is looked up in the per-stage section first and falls back to the
    values in *common_param*.
    """
    iotype_list = io_dic["iotype_list"]
    io_data = {}
    io_avg_value = {}

    for disk_name in io_dic["disk_list"]:
        disk_wins = io_data[disk_name] = {}
        disk_avgs = io_avg_value[disk_name] = {}

        for stage_name in io_dic["stage_list"]:
            stage_wins = disk_wins[stage_name] = {}
            stage_avgs = disk_avgs[stage_name] = {}
            # Per-stage overrides, read from the section named after the stage.
            stage_param = read_config_stage(config, stage_name, iotype_list)

            for rw in iotype_list:
                rw_wins = stage_wins[rw] = {}
                stage_avgs[rw] = [0, 0]
                rw_common = common_param.get(rw, {})

                # Stage-specific value wins; otherwise use the common one.
                limits = {}
                for suffix in ("avg_lim", "avg_time", "tot_lim", "iodump_lim"):
                    option = "{}_{}".format(rw, suffix)
                    limits[suffix] = stage_param.get(option, rw_common.get(option))

                # The latency window needs all three limits to be set (and non-zero).
                if limits["avg_lim"] and limits["avg_time"] and limits["tot_lim"]:
                    rw_wins["latency"] = IoWindow(
                        window_size=io_dic["win_size"],
                        window_threshold=io_dic["win_threshold"],
                        abnormal_multiple=limits["avg_time"],
                        abnormal_multiple_lim=limits["avg_lim"],
                        abnormal_time=limits["tot_lim"],
                    )

                # Zero is a legitimate iodump limit, so only None means "unset".
                if limits["iodump_lim"] is not None:
                    rw_wins["iodump"] = IoDumpWindow(
                        window_size=io_dic["win_size"],
                        window_threshold=io_dic["win_threshold"],
                        abnormal_time=limits["iodump_lim"],
                    )

    return io_data, io_avg_value
|
||||||
|
+
|
||||||
|
+
|
||||||
|
def get_valid_disk_stage_list(io_dic, config_disk, config_stage):
    """Resolve the configured disks/stages against what the collector offers.

    The collector answer is used as a mapping whose keys are disk names and
    whose values are iterables of stage names (presumably one list per disk;
    verify against sentryCollector). An empty *config_disk* / *config_stage*
    selects everything the collector reported. When none of the configured
    names is valid, the lookup is retried with the default (empty) selection.

    Returns:
        Tuple (disk_list, stage_list); failure is reported through
        report_alarm_fail when either would be empty.
    """
    json_data = avg_is_iocollect_valid(io_dic, config_disk, config_stage)

    known_disks = json_data.keys()
    known_stages = set()
    for stages_of_disk in json_data.values():
        known_stages.update(stages_of_disk)

    # Partition the configured names into accepted and rejected ones.
    disk_list, not_in_disk_list = [], []
    for name in config_disk:
        (disk_list if name in known_disks else not_in_disk_list).append(name)

    stage_list, not_in_stage_list = [], []
    for name in config_stage:
        (stage_list if name in known_stages else not_in_stage_list).append(name)

    if not config_disk:
        disk_list = list(known_disks)

    if not config_stage:
        stage_list = list(known_stages)

    if config_disk and not disk_list:
        logging.warning("Cannot get valid disk by disk={}, set to default".format(config_disk))
        disk_list, stage_list = get_valid_disk_stage_list(io_dic, [], config_stage)

    if config_stage and not stage_list:
        logging.warning("Cannot get valid stage by stage={}, set to default".format(config_stage))
        disk_list, stage_list = get_valid_disk_stage_list(io_dic, config_disk, [])

    if not (stage_list and disk_list):
        report_alarm_fail("Cannot get valid disk name or stage name.")

    log_invalid_keys(not_in_disk_list, 'disk', config_disk, disk_list)
    log_invalid_keys(not_in_stage_list, 'stage', config_stage, stage_list)

    return disk_list, stage_list
|
||||||
|
+
|
||||||
|
+
|
||||||
|
def main_loop(io_dic, io_data, io_avg_value):
    """Run the detection loop forever.

    Every ``period_time`` seconds the latest period data is fetched from the
    collector, each (disk, stage, io type) average/window is updated, and,
    once at least one series has filled its averaging window, every
    (disk, io type) pair is checked and reported.

    Args:
        io_dic: runtime settings ("period_time", "disk_list", "stage_list",
            "iotype_list", "win_size").
        io_data: nested window dict built by init_io_win.
        io_avg_value: nested running-average dict built by init_io_win.
    """
    period_time = io_dic["period_time"]
    disk_list = io_dic["disk_list"]
    stage_list = io_dic["stage_list"]
    iotype_list = io_dic["iotype_list"]
    win_size = io_dic["win_size"]

    while True:
        # Wait for one sampling period.
        time.sleep(period_time)

        # Fetch this period's data from the collector.
        curr_period_data = avg_get_io_data(io_dic)

        # Feed the data into the averages / windows. The flag is aggregated
        # over all series so that the result does not depend on which key
        # happens to be iterated last.
        reach_size = False
        for disk_name in disk_list:
            for stage_name in stage_list:
                for rw in iotype_list:
                    if disk_name in curr_period_data and stage_name in curr_period_data[disk_name] and rw in curr_period_data[disk_name][stage_name]:
                        io_key = (disk_name, stage_name, rw)
                        if update_avg_and_check_abnormal(curr_period_data, io_key, win_size, io_avg_value, io_data):
                            reach_size = True

        # No alarm evaluation until an averaging window has been filled.
        if not reach_size:
            continue

        # Evaluate the abnormal windows and report by scenario.
        for disk_name in disk_list:
            for rw in iotype_list:
                process_report_data(disk_name, rw, io_data)
|
||||||
|
+
|
||||||
|
+
|
||||||
|
def main():
    """Entry point of the avg_block_io plugin.

    Installs the stop-signal handlers, loads CONFIG_FILE, resolves the disks
    and stages to monitor with the collector, builds the detection windows
    and then enters main_loop (which does not return).
    """
    # Stop signals (SIGINT / SIGTERM) are routed to sig_handler.
    signal.signal(signal.SIGINT, sig_handler)
    signal.signal(signal.SIGTERM, sig_handler)

    # Load the configuration file.
    config = configparser.ConfigParser(comment_prefixes=('#', ';'))
    try:
        loaded_files = config.read(CONFIG_FILE)
    except configparser.Error:
        loaded_files = []
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files actually parsed, so an empty list is a failure too.
    if not loaded_files:
        report_alarm_fail("Failed to read config file")

    io_dic = {}

    # [common] section.
    io_dic["period_time"], disk, stage, io_dic["iotype_list"] = read_config_common(config)

    # Ask the collector which of the configured disks/stages are available.
    io_dic["disk_list"], io_dic["stage_list"] = get_valid_disk_stage_list(io_dic, disk, stage)

    if "bio" not in io_dic["stage_list"]:
        report_alarm_fail("Cannot run avg_block_io without bio stage")

    # Window initialisation, matching the disks/stages resolved above.
    # step1. common algorithm settings
    io_dic["win_size"], io_dic["win_threshold"] = read_config_algorithm(config)

    # step2. create the windows
    common_param = read_config_lat_iodump(io_dic, config)
    io_data, io_avg_value = init_io_win(io_dic, config, common_param)

    main_loop(io_dic, io_data, io_avg_value)
|
||||||
|
diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000..caa0191
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||||
|
@@ -0,0 +1,86 @@
|
||||||
|
+# coding: utf-8
|
||||||
|
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
|
||||||
|
+# sysSentry is licensed under the Mulan PSL v2.
|
||||||
|
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
||||||
|
+# You may obtain a copy of Mulan PSL v2 at:
|
||||||
|
+# http://license.coscl.org.cn/MulanPSL2
|
||||||
|
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
|
||||||
|
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
|
||||||
|
+# PURPOSE.
|
||||||
|
+# See the Mulan PSL v2 for more details.
|
||||||
|
+import json
|
||||||
|
+import logging
|
||||||
|
+import sys
|
||||||
|
+import time
|
||||||
|
+
|
||||||
|
+from .utils import is_abnormal
|
||||||
|
+from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages
|
||||||
|
+from syssentry.result import ResultLevel, report_result
|
||||||
|
+
|
||||||
|
+
|
||||||
|
+TASK_NAME = "avg_block_io"
|
||||||
|
+
|
||||||
|
def sig_handler(signum, _f):
    """Signal handler: report a PASS result with an empty payload, then exit 0."""
    empty_payload = json.dumps({})
    report_result(TASK_NAME, ResultLevel.PASS, empty_payload)
    logging.info("Finished avg_block_io plugin running.")
    sys.exit(0)
|
||||||
|
+
|
||||||
|
def avg_get_io_data(io_dic):
    """Fetch one period of io data from sentryCollector and decode the reply."""
    reply = get_io_data(
        io_dic["period_time"],
        io_dic["disk_list"],
        io_dic["stage_list"],
        io_dic["iotype_list"],
    )
    return check_result_validation(reply, 'get io data')
|
||||||
|
+
|
||||||
|
+
|
||||||
|
def avg_is_iocollect_valid(io_dic, config_disk, config_stage):
    """Ask sentryCollector to validate the disk/stage selection and decode the reply."""
    period = io_dic["period_time"]
    reply = is_iocollect_valid(period, config_disk, config_stage)
    return check_result_validation(reply, 'check config validation')
|
||||||
|
+
|
||||||
|
+
|
||||||
|
def check_result_validation(res, reason):
    """Validate a sentryCollector reply and return its decoded JSON message.

    Args:
        res: reply mapping; must contain 'ret' and 'message'.
        reason: short description of the request, used in error messages.

    Returns:
        The object decoded from ``res['message']``.

    Every failure (missing keys, non-zero 'ret', undecodable message) is
    reported through report_alarm_fail, which terminates the process.
    """
    if 'ret' not in res or 'message' not in res:
        err_msg = "Failed to {}: Cannot connect to sentryCollector.".format(reason)
        report_alarm_fail(err_msg)

    if res['ret'] != 0:
        try:
            detail = Result_Messages[res['ret']]
        except (KeyError, IndexError, TypeError):
            # An unknown return code must still produce a clean failure
            # report rather than an unhandled lookup error.
            detail = "unknown error code {}".format(res['ret'])
        err_msg = "Failed to {}: {}".format(reason, detail)
        report_alarm_fail(err_msg)

    try:
        json_data = json.loads(res['message'])
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a message that is not str/bytes (e.g. None).
        err_msg = "Failed to {}: invalid return message".format(reason)
        report_alarm_fail(err_msg)

    return json_data
|
||||||
|
+
|
||||||
|
+
|
||||||
|
def report_alarm_fail(alarm_info):
    """Report a FAIL result carrying *alarm_info*, log it as an error and exit 1."""
    payload = json.dumps({"msg": alarm_info})
    report_result(TASK_NAME, ResultLevel.FAIL, payload)
    logging.error(alarm_info)
    sys.exit(1)
|
||||||
|
+
|
||||||
|
+
|
||||||
|
def process_report_data(disk_name, rw, io_data):
    """Classify an abnormal 'bio' window for (disk_name, rw) and log the verdict.

    Nothing is logged unless the 'bio' stage is abnormal. Otherwise the first
    matching group decides the message: control stages, then 'rq_driver',
    then kernel stages; with no match the IO-press message is used.
    """
    def stage_abnormal(stage_name):
        return is_abnormal((disk_name, stage_name, rw), io_data)

    if not stage_abnormal('bio'):
        return

    ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq']
    kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue']

    # any() stops at the first abnormal stage, like the original early return.
    if any(stage_abnormal(stage_name) for stage_name in ctrl_stage):
        message = "{} - {} - {} report IO press"
    elif stage_abnormal('rq_driver'):
        message = "{} - {} - {} report driver"
    elif any(stage_abnormal(stage_name) for stage_name in kernel_stage):
        message = "{} - {} - {} report kernel"
    else:
        message = "{} - {} - {} report IO press"

    logging.warning(message.format(time.ctime(), disk_name, rw))
|
||||||
|
diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000..9b0ce79
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/src/python/sentryPlugins/avg_block_io/stage_window.py
|
||||||
|
@@ -0,0 +1,47 @@
|
||||||
|
+# coding: utf-8
|
||||||
|
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
|
||||||
|
+# sysSentry is licensed under the Mulan PSL v2.
|
||||||
|
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
||||||
|
+# You may obtain a copy of Mulan PSL v2 at:
|
||||||
|
+# http://license.coscl.org.cn/MulanPSL2
|
||||||
|
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
|
||||||
|
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
|
||||||
|
+# PURPOSE.
|
||||||
|
+# See the Mulan PSL v2 for more details.
|
||||||
|
+
|
||||||
|
class AbnormalWindowBase:
    """Fixed-size sliding window of per-period abnormal flags.

    Subclasses decide what makes a single period abnormal by implementing
    is_abnormal_period(); the window as a whole is abnormal once at least
    ``window_threshold`` of its ``window_size`` slots are flagged.
    """

    def __init__(self, window_size=10, window_threshold=7):
        self.window_size = window_size
        self.window_threshold = window_threshold
        # Oldest period first; starts with every slot marked normal.
        self.abnormal_window = [False] * window_size

    def is_abnormal_period(self, value, avg_val=0):
        """Return True when one period's value is abnormal (subclass hook)."""
        raise NotImplementedError("subclasses must implement is_abnormal_period")

    def append_new_period(self, ab_res, avg_val=0):
        """Drop the oldest slot and append the verdict for the new period."""
        self.abnormal_window.pop(0)
        self.abnormal_window.append(bool(self.is_abnormal_period(ab_res, avg_val)))

    def is_abnormal_window(self):
        """Return True when the number of abnormal periods reaches the threshold."""
        # ">=" rather than ">": a threshold equal to the window size is an
        # accepted configuration and must still be able to trigger.
        return sum(self.abnormal_window) >= self.window_threshold
|
||||||
|
+
|
||||||
|
+
|
||||||
|
class IoWindow(AbnormalWindowBase):
    """Latency window.

    A period is abnormal when its value exceeds ``abnormal_time``, or when it
    exceeds both ``abnormal_multiple`` times the running average and
    ``abnormal_multiple_lim``.
    """

    def __init__(self, window_size=10, window_threshold=7, abnormal_multiple=5, abnormal_multiple_lim=30, abnormal_time=40):
        super().__init__(window_size, window_threshold)
        self.abnormal_multiple = abnormal_multiple
        self.abnormal_multiple_lim = abnormal_multiple_lim
        self.abnormal_time = abnormal_time

    def is_abnormal_period(self, value, avg_val):
        """Return True when *value* is abnormal relative to *avg_val* and the limits."""
        relative_spike = value > avg_val * self.abnormal_multiple and value > self.abnormal_multiple_lim
        if relative_spike:
            return True
        return value > self.abnormal_time
|
||||||
|
+
|
||||||
|
+
|
||||||
|
class IoDumpWindow(AbnormalWindowBase):
    """Iodump window: a period is abnormal when its value exceeds ``abnormal_time``."""

    def __init__(self, window_size=10, window_threshold=7, abnormal_time=40):
        super().__init__(window_size, window_threshold)
        self.abnormal_time = abnormal_time

    def is_abnormal_period(self, value, avg_val=0):
        """Return True when *value* is above the limit; *avg_val* is ignored."""
        limit = self.abnormal_time
        return value > limit
|
||||||
|
diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000..54ed080
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/src/python/sentryPlugins/avg_block_io/utils.py
|
||||||
|
@@ -0,0 +1,86 @@
|
||||||
|
+# coding: utf-8
|
||||||
|
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
|
||||||
|
+# sysSentry is licensed under the Mulan PSL v2.
|
||||||
|
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
||||||
|
+# You may obtain a copy of Mulan PSL v2 at:
|
||||||
|
+# http://license.coscl.org.cn/MulanPSL2
|
||||||
|
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
|
||||||
|
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
|
||||||
|
+# PURPOSE.
|
||||||
|
+# See the Mulan PSL v2 for more details.
|
||||||
|
+AVG_VALUE = 0
|
||||||
|
+AVG_COUNT = 1
|
||||||
|
+
|
||||||
|
+
|
||||||
|
def get_nested_value(data, keys):
    """Follow *keys* through nested containers and return what is found.

    Returns None as soon as a key is missing; with no keys, *data* itself is
    returned.
    """
    node = data
    for key in keys:
        if key not in node:
            return None
        node = node[key]
    return node
|
||||||
|
+
|
||||||
|
+
|
||||||
|
def set_nested_value(data, keys, value):
    """Store *value* under the last of *keys* inside nested containers.

    All keys but the last must already exist; returns False (and changes
    nothing) when one of them is missing, True after a successful store.
    """
    node = data
    for key in keys[:-1]:
        if key not in node:
            return False
        node = node[key]
    node[keys[-1]] = value
    return True
|
||||||
|
+
|
||||||
|
+
|
||||||
|
def is_abnormal(io_key, io_data):
    """Return True when the latency or iodump window stored at *io_key* is abnormal.

    *io_key* is the tuple of keys leading to the window dict inside *io_data*;
    a missing or empty entry counts as not abnormal.
    """
    windows = get_nested_value(io_data, io_key)
    if not windows:
        return False

    for kind in ('latency', 'iodump'):
        if kind not in windows:
            continue
        win = get_nested_value(io_data, io_key + (kind,))
        if win and win.is_abnormal_window():
            return True
    return False
|
||||||
|
+
|
||||||
|
+
|
||||||
|
def update_io_avg(old_avg, period_value, win_size):
    """Fold the new period's first value into the running average.

    While fewer than *win_size* samples have been seen, the result is the
    plain mean over count + 1 samples. Afterwards the count stays fixed and
    the new sample replaces one average-sized share of the total.

    Returns:
        A new [average, count] list; *old_avg* is not modified.
    """
    count = old_avg[AVG_COUNT]
    mean = old_avg[AVG_VALUE]
    sample = period_value[0]

    if count < win_size:
        new_count = count + 1
        carried = mean * count
    else:
        new_count = count
        carried = mean * (count - 1)

    return [(carried + sample) / new_count, new_count]
|
||||||
|
+
|
||||||
|
+
|
||||||
|
def update_io_data(old_avg, period_value, win_size, io_data, io_key):
    """Push the new period into the latency and iodump windows at *io_key*.

    The latency window receives period_value[0] together with the current
    average; the iodump window receives period_value[1]. Windows that were
    not created for this key are skipped. *win_size* is not used here.
    """
    all_wins = get_nested_value(io_data, io_key)
    if not all_wins:
        return

    if "latency" in all_wins:
        latency_win = io_data[io_key[0]][io_key[1]][io_key[2]]["latency"]
        latency_win.append_new_period(period_value[0], old_avg[AVG_VALUE])

    if "iodump" in all_wins:
        iodump_win = io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"]
        iodump_win.append_new_period(period_value[1])
|
||||||
|
+
|
||||||
|
+
|
||||||
|
def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data):
    """Update the average and windows for *io_key*; return True once warmed up.

    While fewer than *win_size* samples have been averaged, only the running
    average is updated and False is returned. Afterwards the windows are fed
    with the new period and True is returned; the running average is then
    updated only when a latency window exists and the new period is not
    abnormal (or when no windows exist for the key at all).
    """
    period_value = get_nested_value(data, io_key)
    old_avg = get_nested_value(io_avg_value, io_key)
    warming_up = old_avg[AVG_COUNT] < win_size

    if not warming_up:
        # Feed the windows, which judge whether this period is abnormal.
        update_io_data(old_avg, period_value, win_size, io_data, io_key)

        all_wins = get_nested_value(io_data, io_key)
        if all_wins and 'latency' not in all_wins:
            return True

        latency_win = get_nested_value(io_data, io_key + ("latency",))
        # Abnormal periods are kept out of the running average.
        if latency_win and latency_win.is_abnormal_period(period_value[0], old_avg[AVG_VALUE]):
            return True

    set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size))
    return not warming_up
|
||||||
|
--
|
||||||
|
2.33.0
|
||||||
|
|
||||||
@ -4,7 +4,7 @@
|
|||||||
Summary: System Inspection Framework
|
Summary: System Inspection Framework
|
||||||
Name: sysSentry
|
Name: sysSentry
|
||||||
Version: 1.0.2
|
Version: 1.0.2
|
||||||
Release: 12
|
Release: 13
|
||||||
License: Mulan PSL v2
|
License: Mulan PSL v2
|
||||||
Group: System Environment/Daemons
|
Group: System Environment/Daemons
|
||||||
Source0: https://gitee.com/openeuler/sysSentry/releases/download/v%{version}/%{name}-%{version}.tar.gz
|
Source0: https://gitee.com/openeuler/sysSentry/releases/download/v%{version}/%{name}-%{version}.tar.gz
|
||||||
@ -22,6 +22,8 @@ Patch9: Remove-ANSI-escape-sequences.patch
|
|||||||
Patch10: split-cpu_sentry-and-syssentry.patch
|
Patch10: split-cpu_sentry-and-syssentry.patch
|
||||||
Patch11: fix-configparser.InterpolationSyntaxError.patch
|
Patch11: fix-configparser.InterpolationSyntaxError.patch
|
||||||
Patch12: fix-syssentry-fails-to-be-started-when-cpu_sentry-is.patch
|
Patch12: fix-syssentry-fails-to-be-started-when-cpu_sentry-is.patch
|
||||||
|
Patch13: add-collect-module-to-sysSentry.patch
|
||||||
|
Patch14: feature-add-avg_block_io-plugin.patch
|
||||||
|
|
||||||
BuildRequires: cmake gcc-c++
|
BuildRequires: cmake gcc-c++
|
||||||
BuildRequires: python3 python3-setuptools
|
BuildRequires: python3 python3-setuptools
|
||||||
@ -58,6 +60,13 @@ Recommends: ipmitool
|
|||||||
%description -n cpu_sentry
|
%description -n cpu_sentry
|
||||||
This package provides CPU fault detection
|
This package provides CPU fault detection
|
||||||
|
|
||||||
|
%package -n avg_block_io
|
||||||
|
Summary: Supports slow I/O detection
|
||||||
|
Requires: sysSentry = %{version}-%{release}
|
||||||
|
|
||||||
|
%description -n avg_block_io
|
||||||
|
This package provides slow I/O detection based on eBPF
|
||||||
|
|
||||||
%prep
|
%prep
|
||||||
%autosetup -n %{name}-%{version} -p1
|
%autosetup -n %{name}-%{version} -p1
|
||||||
|
|
||||||
@ -99,6 +108,10 @@ install -m 600 service/xalarmd.service %{buildroot}%{_unitdir}
|
|||||||
install -m 600 config/logrotate %{buildroot}%{_sysconfdir}/logrotate.d/sysSentry
|
install -m 600 config/logrotate %{buildroot}%{_sysconfdir}/logrotate.d/sysSentry
|
||||||
install -m 644 src/libso/xalarm/register_xalarm.h %{buildroot}%{_includedir}/xalarm/register_xalarm.h
|
install -m 644 src/libso/xalarm/register_xalarm.h %{buildroot}%{_includedir}/xalarm/register_xalarm.h
|
||||||
|
|
||||||
|
# sentryCollector
|
||||||
|
install -m 600 config/collector.conf %{buildroot}%{_sysconfdir}/sysSentry
|
||||||
|
install -m 600 service/sentryCollector.service %{buildroot}%{_unitdir}
|
||||||
|
|
||||||
# cpu sentry
|
# cpu sentry
|
||||||
install config/tasks/cpu_sentry.mod %{buildroot}/etc/sysSentry/tasks/
|
install config/tasks/cpu_sentry.mod %{buildroot}/etc/sysSentry/tasks/
|
||||||
install config/plugins/cpu_sentry.ini %{buildroot}/etc/sysSentry/plugins/cpu_sentry.ini
|
install config/plugins/cpu_sentry.ini %{buildroot}/etc/sysSentry/plugins/cpu_sentry.ini
|
||||||
@ -108,6 +121,10 @@ install src/c/catcli/catlib/build/plugin/cpu_patrol/libcpu_patrol.so %{buildroot
|
|||||||
chrpath -d %{buildroot}%{_bindir}/cat-cli
|
chrpath -d %{buildroot}%{_bindir}/cat-cli
|
||||||
chrpath -d %{buildroot}%{_libdir}/libcpu_patrol.so
|
chrpath -d %{buildroot}%{_libdir}/libcpu_patrol.so
|
||||||
|
|
||||||
|
# avg_block_io
|
||||||
|
install config/tasks/avg_block_io.mod %{buildroot}/etc/sysSentry/tasks/
|
||||||
|
install config/plugins/avg_block_io.ini %{buildroot}/etc/sysSentry/plugins/avg_block_io.ini
|
||||||
|
|
||||||
pushd src/python
|
pushd src/python
|
||||||
python3 setup.py install -O1 --root=$RPM_BUILD_ROOT --record=SENTRY_FILES
|
python3 setup.py install -O1 --root=$RPM_BUILD_ROOT --record=SENTRY_FILES
|
||||||
popd
|
popd
|
||||||
@ -123,6 +140,8 @@ if [ "$1" = "0" ]; then
|
|||||||
systemctl disable xalarmd.service
|
systemctl disable xalarmd.service
|
||||||
systemctl stop sysSentry.service
|
systemctl stop sysSentry.service
|
||||||
systemctl disable sysSentry.service
|
systemctl disable sysSentry.service
|
||||||
|
systemctl stop sentryCollector.service
|
||||||
|
systemctl disable sentryCollector.service
|
||||||
fi
|
fi
|
||||||
rm -rf /var/run/xalarm | :
|
rm -rf /var/run/xalarm | :
|
||||||
rm -rf /var/run/sysSentry | :
|
rm -rf /var/run/sysSentry | :
|
||||||
@ -137,6 +156,8 @@ rm -rf %{buildroot}
|
|||||||
%defattr(0550,root,root)
|
%defattr(0550,root,root)
|
||||||
%attr(0550,root,root) %{python3_sitelib}/xalarm
|
%attr(0550,root,root) %{python3_sitelib}/xalarm
|
||||||
%attr(0550,root,root) %{python3_sitelib}/syssentry
|
%attr(0550,root,root) %{python3_sitelib}/syssentry
|
||||||
|
%attr(0550,root,root) %{python3_sitelib}/sentryCollector
|
||||||
|
%attr(0550,root,root) %{python3_sitelib}/sentryPlugins/avg_block_io
|
||||||
|
|
||||||
# sysSentry
|
# sysSentry
|
||||||
%attr(0500,root,root) %{_bindir}/sentryctl
|
%attr(0500,root,root) %{_bindir}/sentryctl
|
||||||
@ -162,6 +183,17 @@ rm -rf %{buildroot}
|
|||||||
%exclude %{python3_sitelib}/syssentry/cpu_*
|
%exclude %{python3_sitelib}/syssentry/cpu_*
|
||||||
%exclude %{python3_sitelib}/syssentry/*/cpu_*
|
%exclude %{python3_sitelib}/syssentry/*/cpu_*
|
||||||
|
|
||||||
|
# avg block io
|
||||||
|
%exclude %{_sysconfdir}/sysSentry/tasks/avg_block_io.mod
|
||||||
|
%exclude %{_sysconfdir}/sysSentry/plugins/avg_block_io.ini
|
||||||
|
%exclude %{_bindir}/avg_block_io
|
||||||
|
%exclude %{python3_sitelib}/sentryPlugins/*
|
||||||
|
|
||||||
|
# sentryCollector
|
||||||
|
%attr(0550,root,root) %{_bindir}/sentryCollector
|
||||||
|
%attr(0600,root,root) %{_sysconfdir}/sysSentry/collector.conf
|
||||||
|
%attr(0600,root,root) %{_unitdir}/sentryCollector.service
|
||||||
|
|
||||||
%files -n libxalarm
|
%files -n libxalarm
|
||||||
%attr(0550,root,root) %{_libdir}/libxalarm.so
|
%attr(0550,root,root) %{_libdir}/libxalarm.so
|
||||||
|
|
||||||
@ -178,7 +210,19 @@ rm -rf %{buildroot}
|
|||||||
%attr(0600,root,root) %{_sysconfdir}/sysSentry/plugins/cpu_sentry.ini
|
%attr(0600,root,root) %{_sysconfdir}/sysSentry/plugins/cpu_sentry.ini
|
||||||
%attr(0550,root,root) %{python3_sitelib}/syssentry/cpu_*
|
%attr(0550,root,root) %{python3_sitelib}/syssentry/cpu_*
|
||||||
|
|
||||||
|
%files -n avg_block_io
|
||||||
|
%attr(0500,root,root) %{_bindir}/avg_block_io
|
||||||
|
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks/avg_block_io.mod
|
||||||
|
%attr(0600,root,root) %{_sysconfdir}/sysSentry/plugins/avg_block_io.ini
|
||||||
|
%attr(0550,root,root) %{python3_sitelib}/sentryPlugins/avg_block_io
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Sat Sep 14 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-13
|
||||||
|
- Type:requirement
|
||||||
|
- CVE:NA
|
||||||
|
- SUG:NA
|
||||||
|
- DESC:add collect module and avg_block_io plugin to sysSentry
|
||||||
|
|
||||||
* Sat Sep 14 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-12
|
* Sat Sep 14 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-12
|
||||||
- Type:bugfix
|
- Type:bugfix
|
||||||
- CVE:NA
|
- CVE:NA
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user