sysSentry/add-ai-threshold-slow-io-detection-plugin.patch
贺有志 ec6f42737a add ai threshold slow io detection plugin
Signed-off-by: 贺有志 <1037617413@qq.com>
2024-09-23 14:36:47 +08:00

1202 lines
51 KiB
Diff
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

From 3d72fa7f517e6e99af1205e965c3775dc23461f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
Date: Mon, 23 Sep 2024 11:03:26 +0800
Subject: [PATCH] add ai threshold slow io detection to sysSentry
---
.../ai_threshold_slow_io_detection.ini | 16 ++
.../tasks/ai_threshold_slow_io_detection.mod | 5 +
.../test_ai_threshold_slow_io_detection.py | 165 ++++++++++++++++++
.../ai_threshold_slow_io_detection/README.md | 2 +
.../__init__.py | 0
.../alarm_report.py | 49 ++++++
.../config_parser.py | 141 +++++++++++++++
.../data_access.py | 91 ++++++++++
.../detector.py | 48 +++++
.../ai_threshold_slow_io_detection/io_data.py | 74 ++++++++
.../sliding_window.py | 113 ++++++++++++
.../slow_io_detection.py | 133 ++++++++++++++
.../threshold.py | 160 +++++++++++++++++
.../ai_threshold_slow_io_detection/utils.py | 67 +++++++
src/python/setup.py | 3 +-
15 files changed, 1066 insertions(+), 1 deletion(-)
create mode 100644 config/plugins/ai_threshold_slow_io_detection.ini
create mode 100644 config/tasks/ai_threshold_slow_io_detection.mod
create mode 100644 selftest/test/test_ai_threshold_slow_io_detection.py
create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md
create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py
create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py
create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py
create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py
create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py
create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py
create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py
create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py
create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py
create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py
diff --git a/config/plugins/ai_threshold_slow_io_detection.ini b/config/plugins/ai_threshold_slow_io_detection.ini
new file mode 100644
index 0000000..44eb928
--- /dev/null
+++ b/config/plugins/ai_threshold_slow_io_detection.ini
@@ -0,0 +1,16 @@
+[common]
+absolute_threshold=40
+slow_io_detect_frequency=1
+log_level=info
+
+[algorithm]
+train_data_duration=0.1
+train_update_duration=0.02
+algorithm_type=n_sigma
+boxplot_parameter=1.5
+n_sigma_parameter=3
+
+[sliding_window]
+sliding_window_type=not_continuous
+window_size=30
+window_minimum_threshold=6
\ No newline at end of file
diff --git a/config/tasks/ai_threshold_slow_io_detection.mod b/config/tasks/ai_threshold_slow_io_detection.mod
new file mode 100644
index 0000000..2729f72
--- /dev/null
+++ b/config/tasks/ai_threshold_slow_io_detection.mod
@@ -0,0 +1,5 @@
+[common]
+enabled=yes
+task_start=/usr/bin/python3 /usr/bin/ai_threshold_slow_io_detection
+task_stop=pkill -f /usr/bin/ai_threshold_slow_io_detection
+type=oneshot
\ No newline at end of file
diff --git a/selftest/test/test_ai_threshold_slow_io_detection.py b/selftest/test/test_ai_threshold_slow_io_detection.py
new file mode 100644
index 0000000..c36fef5
--- /dev/null
+++ b/selftest/test/test_ai_threshold_slow_io_detection.py
@@ -0,0 +1,165 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+# PURPOSE.
+# See the Mulan PSL v2 for more details.
+
+import unittest
+import numpy as np
+
+from sentryPlugins.ai_threshold_slow_io_detection.threshold import AbsoluteThreshold, BoxplotThreshold, NSigmaThreshold
+from sentryPlugins.ai_threshold_slow_io_detection.sliding_window import (NotContinuousSlidingWindow,
+ ContinuousSlidingWindow, MedianSlidingWindow)
+
+
+def _get_boxplot_threshold(data_list: list, parameter):
+    """Reference boxplot upper fence: Q3 + parameter * (Q3 - Q1)."""
+    lower_quartile = np.percentile(data_list, 25)
+    upper_quartile = np.percentile(data_list, 75)
+    return upper_quartile + parameter * (upper_quartile - lower_quartile)
+
+
+def _get_n_sigma_threshold(data_list: list, parameter):
+    """Reference n-sigma limit: mean + parameter * standard deviation."""
+    return np.mean(data_list) + parameter * np.std(data_list)
+
+
+class Test(unittest.TestCase):
+    # Unit tests for the thresholds, the sliding windows and the collector-data parsing helper.
+
+    @classmethod
+    def setUpClass(cls):
+        print("UnitTest Begin...")
+
+    @classmethod
+    def tearDownClass(cls):
+        print("UnitTest End...")
+
+    def setUp(self):
+        print("Begin...")
+
+    def tearDown(self):
+        print("End...")
+
+    def test_absolute_threshold(self):
+        absolute = AbsoluteThreshold()
+        # Without a configured threshold nothing is reported as abnormal.
+        self.assertEqual(None, absolute.get_threshold())
+        self.assertFalse(absolute.is_abnormal(5000))
+        absolute.set_threshold(40)
+        self.assertEqual(40, absolute.get_threshold())
+        self.assertTrue(absolute.is_abnormal(50))
+
+    def test_boxplot_threshold(self):
+        boxplot = BoxplotThreshold(1.5, 5, 1)
+        # Phase 1: not initialized yet
+        self.assertEqual(None, boxplot.get_threshold())
+        self.assertFalse(boxplot.is_abnormal(5000))
+        # A threshold is generated once 5 elements have been pushed into boxplot
+        data_list = [20, 20, 20, 30, 10]
+        for data in data_list:
+            boxplot.push_latest_data_to_queue(data)
+        # Phase 2: initialized
+        boxplot_threshold = boxplot.get_threshold()
+        self.assertEqual(_get_boxplot_threshold(data_list, 1.5), boxplot_threshold)
+        self.assertTrue(boxplot.is_abnormal(5000))
+        data_list.pop(0)
+        data_list.append(100)
+        boxplot.push_latest_data_to_queue(100)
+        # Phase 3: the threshold is updated
+        boxplot_threshold = boxplot.get_threshold()
+        self.assertEqual(_get_boxplot_threshold(data_list, 1.5), boxplot_threshold)
+
+    def test_n_sigma_threshold(self):
+        n_sigma = NSigmaThreshold(3, 5, 1)
+        # Phase 1: not initialized yet
+        self.assertEqual(None, n_sigma.get_threshold())
+        self.assertFalse(n_sigma.is_abnormal(5000))
+        data_list = [20, 20, 20, 30, 10]
+        for data in data_list:
+            n_sigma.push_latest_data_to_queue(data)
+        # Phase 2: initialized
+        n_sigma_threshold = n_sigma.get_threshold()
+        self.assertEqual(_get_n_sigma_threshold(data_list, 3), n_sigma_threshold)
+        self.assertTrue(n_sigma.is_abnormal(5000))
+        data_list.pop(0)
+        data_list.append(100)
+        n_sigma.push_latest_data_to_queue(100)
+        # Phase 3: the threshold is updated
+        n_sigma_threshold = n_sigma.get_threshold()
+        self.assertEqual(_get_n_sigma_threshold(data_list, 3), n_sigma_threshold)
+
+    def test_not_continuous_sliding_window(self):
+        not_continuous = NotContinuousSlidingWindow(5, 3)
+        boxplot_threshold = BoxplotThreshold(1.5, 10, 8)
+        boxplot_threshold.attach_observer(not_continuous)
+        data_list1 = [19, 20, 20, 20, 20, 20, 22, 24, 23, 20]
+        for data in data_list1:
+            boxplot_threshold.push_latest_data_to_queue(data)
+            result = not_continuous.is_slow_io_event(data)
+            self.assertFalse(result[0])
+        self.assertEqual(23.75, boxplot_threshold.get_threshold())
+        boxplot_threshold.push_latest_data_to_queue(24)
+        result = not_continuous.is_slow_io_event(24)
+        self.assertFalse(result[0])
+        boxplot_threshold.push_latest_data_to_queue(25)
+        result = not_continuous.is_slow_io_event(25)
+        self.assertTrue(result[0])
+        data_list2 = [20, 20, 20, 20, 20, 20]
+        for data in data_list2:
+            boxplot_threshold.push_latest_data_to_queue(data)
+            result = not_continuous.is_slow_io_event(data)
+            self.assertFalse(result[0])
+        self.assertEqual(25.625, boxplot_threshold.get_threshold())
+
+    def test_continuous_sliding_window(self):
+        continuous = ContinuousSlidingWindow(5, 3)
+        boxplot_threshold = BoxplotThreshold(1.5, 10, 8)
+        boxplot_threshold.attach_observer(continuous)
+        data_list = [19, 20, 20, 20, 20, 20, 22, 24, 23, 20]
+        for data in data_list:
+            boxplot_threshold.push_latest_data_to_queue(data)
+            result = continuous.is_slow_io_event(data)
+            self.assertFalse(result[0])
+        self.assertEqual(23.75, boxplot_threshold.get_threshold())
+        # Fewer than three abnormal points in the window
+        self.assertFalse(continuous.is_slow_io_event(25)[0])
+        # Three abnormal points, but they are not consecutive
+        self.assertFalse(continuous.is_slow_io_event(25)[0])
+        # Three consecutive abnormal points
+        self.assertTrue(continuous.is_slow_io_event(25)[0])
+
+    def test_median_sliding_window(self):
+        median = MedianSlidingWindow(5, 3)
+        absolute_threshold = AbsoluteThreshold(10, 8)
+        absolute_threshold.attach_observer(median)
+        absolute_threshold.set_threshold(24.5)
+        data_list = [24, 24, 24, 25, 25]
+        for data in data_list:
+            self.assertFalse(median.is_slow_io_event(data)[0])
+        self.assertTrue(median.is_slow_io_event(25)[0])
+
+    def test_parse_collect_data(self):
+        collect = {
+            "read": [1.0, 2.0, 3.0, 4.0],
+            "write": [5.0, 6.0, 7.0, 8.0],
+            "flush": [9.0, 10.0, 11.0, 12.0],
+            "discard": [13.0, 14.0, 15.0, 16.0],
+        }
+        # Import through the package path, like the imports at the top of this file:
+        # data_access uses a relative import, so it cannot be loaded as a top-level module.
+        from sentryPlugins.ai_threshold_slow_io_detection.io_data import BaseData
+        from sentryPlugins.ai_threshold_slow_io_detection.data_access import _get_io_stage_data
+
+        io_data = _get_io_stage_data(collect)
+        self.assertEqual(
+            io_data.read, BaseData(latency=1.0, io_dump=2.0, io_length=3.0, iops=4.0)
+        )
+        self.assertEqual(
+            io_data.write, BaseData(latency=5.0, io_dump=6.0, io_length=7.0, iops=8.0)
+        )
+        self.assertEqual(
+            io_data.flush, BaseData(latency=9.0, io_dump=10.0, io_length=11.0, iops=12.0)
+        )
+        self.assertEqual(
+            io_data.discard, BaseData(latency=13.0, io_dump=14.0, io_length=15.0, iops=16.0)
+        )
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md b/src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md
new file mode 100644
index 0000000..f9b8388
--- /dev/null
+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md
@@ -0,0 +1,2 @@
+# slow_io_detection
+
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py
new file mode 100644
index 0000000..3f4f34e
--- /dev/null
+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py
@@ -0,0 +1,49 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+# PURPOSE.
+# See the Mulan PSL v2 for more details.
+
+from syssentry.result import ResultLevel, report_result
+import logging
+import json
+
+
+class AlarmReport:
+    """Report a result for this task through report_result and log what was reported."""
+
+    TASK_NAME = "SLOW_IO_DETECTION"
+
+    @staticmethod
+    def _send(level, label: str, info: str):
+        # Shared body of every report_* method: wrap the text as {"msg": ...} JSON,
+        # hand it to report_result, then leave a log record.
+        payload = json.dumps({"msg": info})
+        report_result(AlarmReport.TASK_NAME, level, payload)
+        logging.info(f'Report {AlarmReport.TASK_NAME} {label}: {info}')
+
+    @staticmethod
+    def report_pass(info: str):
+        AlarmReport._send(ResultLevel.PASS, 'PASS', info)
+
+    @staticmethod
+    def report_fail(info: str):
+        AlarmReport._send(ResultLevel.FAIL, 'FAIL', info)
+
+    @staticmethod
+    def report_skip(info: str):
+        AlarmReport._send(ResultLevel.SKIP, 'SKIP', info)
+
+    @staticmethod
+    def report_minor_alm(info: str):
+        AlarmReport._send(ResultLevel.MINOR_ALM, 'MINOR_ALM', info)
+
+    @staticmethod
+    def report_major_alm(info: str):
+        AlarmReport._send(ResultLevel.MAJOR_ALM, 'MAJOR_ALM', info)
+
+    @staticmethod
+    def report_critical_alm(info: str):
+        AlarmReport._send(ResultLevel.CRITICAL_ALM, 'CRITICAL_ALM', info)
+
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py
new file mode 100644
index 0000000..cd4e6f1
--- /dev/null
+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py
@@ -0,0 +1,141 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+# PURPOSE.
+# See the Mulan PSL v2 for more details.
+
+import configparser
+import logging
+
+
+class ConfigParser:
+    """Read the plugin's INI file and expose the parsed settings through getters.
+
+    Every option has a class-level default. The default is used when the option is
+    missing, when its whole section (or the file itself) is missing, or when the
+    configured value cannot be converted to the expected type.
+    """
+
+    DEFAULT_ABSOLUTE_THRESHOLD = 40
+    DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1
+    DEFAULT_LOG_LEVEL = 'info'
+    DEFAULT_TRAIN_DATA_DURATION = 24
+    DEFAULT_TRAIN_UPDATE_DURATION = 2
+    DEFAULT_ALGORITHM_TYPE = 'boxplot'
+    DEFAULT_N_SIGMA_PARAMETER = 3
+    DEFAULT_BOXPLOT_PARAMETER = 1.5
+    DEFAULT_SLIDING_WINDOW_TYPE = 'not_continuous'
+    DEFAULT_WINDOW_SIZE = 30
+    DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6
+
+    def __init__(self, config_file_name):
+        """Remember the config path; nothing is read until read_config_from_file()."""
+        self.__boxplot_parameter = None
+        self.__window_minimum_threshold = None
+        self.__window_size = None
+        self.__sliding_window_type = None
+        self.__n_sigma_parameter = None
+        self.__algorithm_type = None
+        self.__train_update_duration = None
+        self.__log_level = None
+        self.__slow_io_detect_frequency = None
+        self.__absolute_threshold = None
+        self.__train_data_duration = None
+        self.__config_file_name = config_file_name
+
+    @staticmethod
+    def _section_items(con, section):
+        """Return the options of *section* as a dict, or {} when the section is absent."""
+        if con.has_section(section):
+            return dict(con.items(section))
+        # con.read() silently skips unreadable files, so a missing file also ends up here.
+        logging.warning(f'section [{section}] not found in config file, use default values.')
+        return {}
+
+    @staticmethod
+    def _convert(items, key, cast, default, label=None):
+        """Return cast(items[key]); fall back to *default* (with a warning) on ValueError."""
+        try:
+            return cast(items.get(key, default))
+        except ValueError:
+            logging.warning(f'{label or key} type conversion has error, use default value.')
+            return default
+
+    def read_config_from_file(self):
+        """Parse the INI file given to the constructor and store every setting."""
+        con = configparser.ConfigParser()
+        con.read(self.__config_file_name, encoding='utf-8')
+
+        items_common = self._section_items(con, 'common')
+        items_algorithm = self._section_items(con, 'algorithm')
+        items_sliding_window = self._section_items(con, 'sliding_window')
+
+        self.__absolute_threshold = self._convert(
+            items_common, 'absolute_threshold', int,
+            ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD, label='absolute threshold')
+        self.__slow_io_detect_frequency = self._convert(
+            items_common, 'slow_io_detect_frequency', int,
+            ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY)
+        self.__log_level = items_common.get('log_level', ConfigParser.DEFAULT_LOG_LEVEL)
+
+        self.__train_data_duration = self._convert(
+            items_algorithm, 'train_data_duration', float,
+            ConfigParser.DEFAULT_TRAIN_DATA_DURATION)
+        self.__train_update_duration = self._convert(
+            items_algorithm, 'train_update_duration', float,
+            ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION)
+
+        # A plain string lookup cannot raise ValueError, so no conversion guard is needed.
+        self.__algorithm_type = items_algorithm.get('algorithm_type', ConfigParser.DEFAULT_ALGORITHM_TYPE)
+
+        # Only the parameter of the selected algorithm is parsed; the other one stays None.
+        if self.__algorithm_type == 'n_sigma':
+            self.__n_sigma_parameter = self._convert(
+                items_algorithm, 'n_sigma_parameter', float,
+                ConfigParser.DEFAULT_N_SIGMA_PARAMETER)
+        elif self.__algorithm_type == 'boxplot':
+            self.__boxplot_parameter = self._convert(
+                items_algorithm, 'boxplot_parameter', float,
+                ConfigParser.DEFAULT_BOXPLOT_PARAMETER)
+
+        self.__sliding_window_type = items_sliding_window.get('sliding_window_type',
+                                                              ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE)
+        self.__window_size = self._convert(
+            items_sliding_window, 'window_size', int,
+            ConfigParser.DEFAULT_WINDOW_SIZE)
+        self.__window_minimum_threshold = self._convert(
+            items_sliding_window, 'window_minimum_threshold', int,
+            ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD)
+
+    def get_slow_io_detect_frequency(self):
+        return self.__slow_io_detect_frequency
+
+    def get_algorithm_type(self):
+        return self.__algorithm_type
+
+    def get_sliding_window_type(self):
+        return self.__sliding_window_type
+
+    def get_train_data_duration_and_train_update_duration(self):
+        return self.__train_data_duration, self.__train_update_duration
+
+    def get_window_size_and_window_minimum_threshold(self):
+        return self.__window_size, self.__window_minimum_threshold
+
+    def get_absolute_threshold(self):
+        return self.__absolute_threshold
+
+    def get_log_level(self):
+        return self.__log_level
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py
new file mode 100644
index 0000000..d9f3460
--- /dev/null
+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py
@@ -0,0 +1,91 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+# PURPOSE.
+# See the Mulan PSL v2 for more details.
+
+import json
+import logging
+
+from sentryCollector.collect_plugin import (
+ Result_Messages,
+ get_io_data,
+ is_iocollect_valid,
+)
+from .io_data import IOStageData, IOData
+
+COLLECT_STAGES = [
+ "throtl",
+ "wbt",
+ "gettag",
+ "plug",
+ "bfq",
+ "hctx",
+ "requeue",
+ "rq_driver",
+ "bio",
+ "iocost",
+]
+
+def check_collect_valid(period):
+    """Ask is_iocollect_valid about *period* and return the keys of its JSON message.
+
+    Returns an empty list when the reply's "ret" code is non-zero or the message
+    cannot be decoded.
+    """
+    reply = is_iocollect_valid(period)
+    if reply["ret"] != 0:
+        return []
+    try:
+        parsed = json.loads(reply["message"])
+    except Exception as e:
+        logging.warning(f"get io data failed, {e}")
+        return []
+    return list(parsed.keys())
+
+
+def _get_raw_data(period, disk_list):
+    """Call get_io_data for every stage in COLLECT_STAGES and all four IO access types."""
+    io_access_types = ["read", "write", "flush", "discard"]
+    return get_io_data(period, disk_list, COLLECT_STAGES, io_access_types)
+
+
+def _get_io_stage_data(data):
+    """Build an IOStageData from a mapping of access type -> [latency, io_dump, io_length, iops].
+
+    Access types that are absent from *data* keep their default (empty) BaseData.
+    """
+    stage = IOStageData()
+    for access_type in ('read', 'write', 'flush', 'discard'):
+        if access_type not in data:
+            continue
+        values = data[access_type]
+        target = getattr(stage, access_type)
+        target.latency = values[0]
+        target.io_dump = values[1]
+        target.io_length = values[2]
+        target.iops = values[3]
+    return stage
+
+
+def get_io_data_from_collect_plug(period, disk_list):
+    """Fetch raw collector data and convert it into a dict of disk name -> IOData.
+
+    Returns None (after logging a warning) when the reply's "ret" code is non-zero
+    or its message is not valid JSON. Stage names that IOData does not define are skipped.
+    """
+    reply = _get_raw_data(period, disk_list)
+    if reply["ret"] != 0:
+        logging.warning(f'get io data failed with message: {reply["message"]}')
+        return None
+
+    try:
+        parsed = json.loads(reply["message"])
+    except json.decoder.JSONDecodeError as e:
+        logging.warning(f"get io data failed, {e}")
+        return None
+
+    result = {}
+    for disk in parsed:
+        disk_io = IOData()
+        for stage_name, stage_values in parsed[disk].items():
+            try:
+                # Probe first so that unknown stage names are rejected instead of being added.
+                getattr(disk_io, stage_name)
+                setattr(disk_io, stage_name, _get_io_stage_data(stage_values))
+            except AttributeError:
+                logging.debug(f'no attr {stage_name}')
+                continue
+        result[disk] = disk_io
+    return result
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py
new file mode 100644
index 0000000..eda9825
--- /dev/null
+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py
@@ -0,0 +1,48 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+# PURPOSE.
+# See the Mulan PSL v2 for more details.
+import logging
+
+from .io_data import MetricName
+from .threshold import Threshold
+from .sliding_window import SlidingWindow
+from .utils import get_metric_value_from_io_data_dict_by_metric_name
+
+
+class Detector:
+    """Couple one metric with a threshold and a sliding window to flag slow IO.
+
+    The sliding window is attached to the threshold as an observer at construction time.
+    """
+    _metric_name: MetricName = None
+    _threshold: Threshold = None
+    _slidingWindow: SlidingWindow = None
+
+    def __init__(self, metric_name: MetricName, threshold: Threshold, sliding_window: SlidingWindow):
+        self._metric_name = metric_name
+        self._threshold = threshold
+        self._slidingWindow = sliding_window
+        self._threshold.attach_observer(self._slidingWindow)
+
+    def get_metric_name(self):
+        return self._metric_name
+
+    def is_slow_io_event(self, io_data_dict_with_disk_name: dict):
+        """Feed the newest sample of this detector's metric to the threshold and the window.
+
+        Returns the sliding window's result tuple, whose first element is the slow-IO
+        flag. When no sample is available, returns (False, None, None) without
+        touching the threshold or the window.
+        """
+        logging.debug(f'Enter Detector: {self}')
+        metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name)
+        # BaseData fields default to None, so a metric the collector did not report can
+        # presumably arrive here as None; comparing None with a float raises TypeError.
+        # NOTE(review): the lookup helper lives in utils.py - confirm it can return None.
+        if metric_value is None:
+            logging.debug(f'No metric value found, skip detection. Exit Detector: {self}')
+            return False, None, None
+        # Values at or below 1e-6 are not pushed to the threshold,
+        # but they still enter the sliding window below.
+        if metric_value > 1e-6:
+            logging.debug(f'Input metric value: {str(metric_value)}')
+            self._threshold.push_latest_data_to_queue(metric_value)
+        detection_result = self._slidingWindow.is_slow_io_event(metric_value)
+        logging.debug(f'Detection result: {str(detection_result)}')
+        logging.debug(f'Exit Detector: {self}')
+        return detection_result
+
+    def __repr__(self):
+        return (f'disk_name: {self._metric_name.get_disk_name()}, stage_name: {self._metric_name.get_stage_name()},'
+                f' access_type_name: {self._metric_name.get_io_access_type_name()},'
+                f' metric_name: {self._metric_name.get_metric_name()}, threshold_type: {self._threshold},'
+                f' sliding_window_type: {self._slidingWindow}')
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py
new file mode 100644
index 0000000..0e17051
--- /dev/null
+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+# PURPOSE.
+# See the Mulan PSL v2 for more details.
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Optional
+
+
+@dataclass
+class BaseData:
+    """The four values collected for one IO access type; each is None until filled in."""
+    latency: Optional[float] = None
+    io_dump: Optional[int] = None
+    io_length: Optional[int] = None
+    iops: Optional[int] = None
+
+
+@dataclass
+class IOStageData:
+    """Per-access-type data of one IO stage; every field starts as a fresh, empty BaseData."""
+    read: BaseData = field(default_factory=BaseData)
+    write: BaseData = field(default_factory=BaseData)
+    flush: BaseData = field(default_factory=BaseData)
+    discard: BaseData = field(default_factory=BaseData)
+
+
+@dataclass
+class IOData:
+    """Data of every collected IO stage for one disk, stamped with its creation time."""
+    throtl: IOStageData = field(default_factory=IOStageData)
+    wbt: IOStageData = field(default_factory=IOStageData)
+    gettag: IOStageData = field(default_factory=IOStageData)
+    iocost: IOStageData = field(default_factory=IOStageData)
+    plug: IOStageData = field(default_factory=IOStageData)
+    bfq: IOStageData = field(default_factory=IOStageData)
+    hctx: IOStageData = field(default_factory=IOStageData)
+    requeue: IOStageData = field(default_factory=IOStageData)
+    rq_driver: IOStageData = field(default_factory=IOStageData)
+    bio: IOStageData = field(default_factory=IOStageData)
+    # Evaluated per instance, so each object records when it was created.
+    time_stamp: float = field(default_factory=lambda: datetime.now().timestamp())
+
+
+class MetricName:
+    """Identify one metric by disk, IO stage, IO access type and metric name."""
+    _disk_name: str = None
+    _stage_name: str = None
+    _io_access_type_name: str = None
+    _metric_name: str = None
+
+    def __init__(self, disk_name: str, stage_name: str, io_access_type_name: str, metric_name: str):
+        self._disk_name = disk_name
+        self._stage_name = stage_name
+        self._io_access_type_name = io_access_type_name
+        self._metric_name = metric_name
+
+    def get_disk_name(self):
+        return self._disk_name
+
+    def get_stage_name(self):
+        return self._stage_name
+
+    def get_io_access_type_name(self):
+        return self._io_access_type_name
+
+    def get_metric_name(self):
+        return self._metric_name
+
+    def __repr__(self):
+        # The second piece starts with a space so every field is separated by ", ".
+        return (f'disk: {self._disk_name}, stage: {self._stage_name}, io_access_type: {self._io_access_type_name},'
+                f' metric: {self._metric_name}')
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py
new file mode 100644
index 0000000..d395d48
--- /dev/null
+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py
@@ -0,0 +1,113 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+# PURPOSE.
+# See the Mulan PSL v2 for more details.
+
+from enum import Enum, unique
+import numpy as np
+
+
+# Selects which sliding-window class SlidingWindowFactory builds.
+@unique
+class SlidingWindowType(Enum):
+ NotContinuousSlidingWindow = 0  # slow IO when enough samples in the window are abnormal
+ ContinuousSlidingWindow = 1  # slow IO when enough abnormal samples occur in a row
+ MedianSlidingWindow = 2  # slow IO when the median of the window reaches the threshold
+
+
+class SlidingWindow:
+    """Fixed-length window of recent samples, each tagged abnormal or normal.
+
+    A sample is abnormal when it is >= the current threshold; while no threshold
+    is known (None) every sample is tagged normal. Subclasses implement the actual
+    slow-IO decision in is_slow_io_event().
+    """
+    _ai_threshold = None
+    _queue_length = None
+    _queue_threshold = None
+    _io_data_queue: list = None
+    _io_data_queue_abnormal_tag: list = None
+
+    def __init__(self, queue_length: int, threshold: int):
+        """queue_length is the window size; threshold is stored as the count subclasses compare against."""
+        self._queue_length = queue_length
+        self._queue_threshold = threshold
+        self._io_data_queue = []
+        self._io_data_queue_abnormal_tag = []
+
+    def _is_abnormal(self, data):
+        # Single tagging rule shared by push() and update(), so both treat a None threshold alike.
+        return data >= self._ai_threshold if self._ai_threshold is not None else False
+
+    def push(self, data: float):
+        """Append *data* with its abnormal tag, dropping the oldest sample once the window is full."""
+        if len(self._io_data_queue) == self._queue_length:
+            self._io_data_queue.pop(0)
+            self._io_data_queue_abnormal_tag.pop(0)
+        self._io_data_queue.append(data)
+        self._io_data_queue_abnormal_tag.append(self._is_abnormal(data))
+
+    def update(self, threshold):
+        """Store a new threshold and re-tag every sample currently in the window.
+
+        A None threshold clears all abnormal tags instead of raising TypeError.
+        """
+        if self._ai_threshold == threshold:
+            return
+        self._ai_threshold = threshold
+        self._io_data_queue_abnormal_tag[:] = [self._is_abnormal(data) for data in self._io_data_queue]
+
+    def is_slow_io_event(self, data):
+        """Base implementation: never reports slow IO and does not record *data*."""
+        return False, None, None
+
+    def __repr__(self):
+        return "SlidingWindow"
+
+
+class NotContinuousSlidingWindow(SlidingWindow):
+    """Reports slow IO when the number of abnormal samples in a full window reaches the limit."""
+
+    def is_slow_io_event(self, data):
+        super().push(data)
+        # No verdict until the window is full and a threshold is known.
+        window_ready = len(self._io_data_queue) >= self._queue_length and self._ai_threshold is not None
+        is_slow = window_ready and self._io_data_queue_abnormal_tag.count(True) >= self._queue_threshold
+        return is_slow, self._io_data_queue, self._ai_threshold
+
+    def __repr__(self):
+        return "NotContinuousSlidingWindow"
+
+
+class ContinuousSlidingWindow(SlidingWindow):
+    """Reports slow IO when a full window contains a long enough run of consecutive abnormal samples."""
+
+    def is_slow_io_event(self, data):
+        super().push(data)
+        verdict_context = (self._io_data_queue, self._ai_threshold)
+        # No verdict until the window is full and a threshold is known.
+        if self._ai_threshold is None or len(self._io_data_queue) < self._queue_length:
+            return (False, *verdict_context)
+        streak = 0
+        for is_abnormal in self._io_data_queue_abnormal_tag:
+            if not is_abnormal:
+                streak = 0
+                continue
+            streak += 1
+            if streak >= self._queue_threshold:
+                return (True, *verdict_context)
+        return (False, *verdict_context)
+
+    def __repr__(self):
+        return "ContinuousSlidingWindow"
+
+
+class MedianSlidingWindow(SlidingWindow):
+    """Reports slow IO when the median of a full window reaches the threshold."""
+
+    def is_slow_io_event(self, data):
+        super().push(data)
+        # No verdict until the window is full and a threshold is known.
+        window_ready = len(self._io_data_queue) >= self._queue_length and self._ai_threshold is not None
+        if window_ready and np.median(self._io_data_queue) >= self._ai_threshold:
+            return True, self._io_data_queue, self._ai_threshold
+        return False, self._io_data_queue, self._ai_threshold
+
+    def __repr__(self):
+        return "MedianSlidingWindow"
+
+
+class SlidingWindowFactory:
+    """Build the sliding window that matches a SlidingWindowType."""
+
+    def get_sliding_window(self, sliding_window_type: SlidingWindowType, *args, **kwargs):
+        """Return a new window built with *args/**kwargs; unknown types fall back to NotContinuousSlidingWindow."""
+        if sliding_window_type == SlidingWindowType.ContinuousSlidingWindow:
+            window_class = ContinuousSlidingWindow
+        elif sliding_window_type == SlidingWindowType.MedianSlidingWindow:
+            window_class = MedianSlidingWindow
+        else:
+            window_class = NotContinuousSlidingWindow
+        return window_class(*args, **kwargs)
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py
new file mode 100644
index 0000000..43cf770
--- /dev/null
+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py
@@ -0,0 +1,133 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+# PURPOSE.
+# See the Mulan PSL v2 for more details.
+
+import time
+import signal
+import logging
+
+from .detector import Detector
+from .threshold import ThresholdFactory, AbsoluteThreshold
+from .sliding_window import SlidingWindowFactory
+from .utils import (get_threshold_type_enum, get_sliding_window_type_enum, get_data_queue_size_and_update_size,
+ get_log_level)
+from .config_parser import ConfigParser
+from .data_access import get_io_data_from_collect_plug, check_collect_valid
+from .io_data import MetricName
+from .alarm_report import AlarmReport
+
+CONFIG_FILE = "/etc/sysSentry/plugins/ai_threshold_slow_io_detection.ini"
+
+
+def sig_handler(signum, frame):  # SIGINT/SIGTERM handler: log, report the failure, exit with the signal number
+    logging.info("receive signal: %d", signum)
+    AlarmReport().report_fail(f"receive signal: {signum}")
+    raise SystemExit(signum)  # exit() is an interactive helper injected by `site` and may be absent
+
+
+class SlowIODetection:
+    """Polls collected IO data and raises an alarm for every read/write latency metric that turns slow."""
+    _config_parser = None
+    _disk_list = None
+    _detector_name_list = None  # list of MetricName, created per instance in __init__
+    _detectors = None  # dict MetricName -> Detector, created per instance in __init__
+
+    def __init__(self, config_parser: ConfigParser):
+        self._config_parser = config_parser
+        self._detector_name_list = []  # per-instance: a class-level [] would be shared by all instances
+        self._detectors = {}
+        self.__set_log_format()
+        self.__init_detector_name_list()
+        self.__init_detector()
+
+    def __set_log_format(self):
+        log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
+        log_level = get_log_level(self._config_parser.get_log_level())
+        logging.basicConfig(level=log_level, format=log_format)
+
+    def __init_detector_name_list(self):
+        self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency())
+        for disk in self._disk_list:
+            self._detector_name_list.append(MetricName(disk, "bio", "read", "latency"))
+            self._detector_name_list.append(MetricName(disk, "bio", "write", "latency"))
+
+    def __init_detector(self):
+        train_data_duration, train_update_duration = (self._config_parser.
+                                                      get_train_data_duration_and_train_update_duration())
+        slow_io_detection_frequency = self._config_parser.get_slow_io_detect_frequency()
+        threshold_type = get_threshold_type_enum(self._config_parser.get_algorithm_type())
+        data_queue_size, update_size = get_data_queue_size_and_update_size(
+            train_data_duration, train_update_duration, slow_io_detection_frequency)
+        sliding_window_type = get_sliding_window_type_enum(self._config_parser.get_sliding_window_type())
+        window_size, window_threshold = self._config_parser.get_window_size_and_window_minimum_threshold()
+
+        for detector_name in self._detector_name_list:
+            threshold = ThresholdFactory().get_threshold(threshold_type, data_queue_size=data_queue_size,
+                                                         data_queue_update_size=update_size)
+            sliding_window = SlidingWindowFactory().get_sliding_window(sliding_window_type, queue_length=window_size,
+                                                                       threshold=window_threshold)
+            detector = Detector(detector_name, threshold, sliding_window)
+            # An absolute threshold is not learned from data: take its value from the configuration.
+            if isinstance(threshold, AbsoluteThreshold):
+                threshold.set_threshold(self._config_parser.get_absolute_threshold())
+            self._detectors[detector_name] = detector
+            logging.info(f"add detector: {detector}")
+
+    def launch(self):
+        while True:
+            logging.debug('step0. AI threshold slow io event detection is looping.')
+            detect_frequency = self._config_parser.get_slow_io_detect_frequency()
+
+            # Step 1: fetch the IO data
+            io_data_dict_with_disk_name = get_io_data_from_collect_plug(detect_frequency, self._disk_list)
+            logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}')
+            if io_data_dict_with_disk_name is None:
+                time.sleep(detect_frequency)  # wait before retrying instead of spinning with no delay
+                continue
+            # Step 2: run slow IO detection
+            logging.debug('step2. Start to detection slow io event.')
+            slow_io_event_list = []
+            for detector in self._detectors.values():
+                result = detector.is_slow_io_event(io_data_dict_with_disk_name)
+                if result[0]:
+                    slow_io_event_list.append((detector.get_metric_name(), result))
+            logging.debug('step2. End to detection slow io event.')
+
+            # Step 3: report the slow IO events
+            logging.debug('step3. Report slow io event to sysSentry.')
+            for metric_name, result in slow_io_event_list:
+                AlarmReport.report_major_alm(f"disk {metric_name.get_disk_name()} has slow io event."
+                                             f"stage: {metric_name.get_stage_name()},"
+                                             f"type: {metric_name.get_io_access_type_name()},"
+                                             f"metric: {metric_name.get_metric_name()},"
+                                             f"current window: {result[1]},"
+                                             f"threshold: {result[2]}")
+                logging.error(f"slow io event happen: {str((metric_name, result))}")
+
+            # Step 4: wait for the next detection period
+            logging.debug('step4. Wait to start next slow io event detection loop.')
+            time.sleep(detect_frequency)
+
+
+def main():
+    """Entry point: install signal handlers, load the plugin configuration and run the detection loop."""
+    # Step 1: register the handler for the termination signals
+    for handled_signal in (signal.SIGINT, signal.SIGTERM):
+        signal.signal(handled_signal, sig_handler)
+    # Step 2: resume from a previous run
+    # TODO: not implemented yet
+
+    # Step 3: read the configuration
+    config = ConfigParser(CONFIG_FILE)
+    config.read_config_from_file()
+
+    # Step 4: start slow IO detection
+    slow_io_detection = SlowIODetection(config)
+    slow_io_detection.launch()
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py
new file mode 100644
index 0000000..9e1ca7b
--- /dev/null
+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py
@@ -0,0 +1,160 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+# PURPOSE.
+# See the Mulan PSL v2 for more details.
+import logging
+from enum import Enum
+import queue
+import numpy as np
+import math
+
+from .sliding_window import SlidingWindow
+
+
+class ThresholdState(Enum):
+ INIT = 0  # no usable threshold yet: get_threshold() returns None and is_abnormal() returns False
+ START = 1  # a threshold has been set or computed and is in use
+
+
+class Threshold:
+    """Base threshold: stores the current value and state and pushes updates to one attached observer."""
+    threshold = None
+    data_queue: queue.Queue = None
+    data_queue_update_size: int = None
+    new_data_size: int = None
+    threshold_state: ThresholdState = None
+
+    def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000):
+        self.threshold = math.inf
+        self.threshold_state = ThresholdState.INIT
+        self.new_data_size = 0
+        self.data_queue_update_size = data_queue_update_size
+        self.data_queue = queue.Queue(data_queue_size)
+        self._observer = None
+
+    def set_threshold(self, threshold):
+        self.threshold = threshold
+        self.threshold_state = ThresholdState.START
+        self.notify_observer()
+
+    def get_threshold(self):
+        """Return the threshold, or None while the state is still INIT."""
+        return None if self.threshold_state == ThresholdState.INIT else self.threshold
+
+    def is_abnormal(self, data):
+        """Return False while in INIT; otherwise whether data reaches the threshold."""
+        return self.threshold_state != ThresholdState.INIT and data >= self.threshold
+
+    # Observer pattern: when the threshold changes, the attached sliding window is refreshed with it.
+    def attach_observer(self, observer: SlidingWindow):
+        self._observer = observer
+
+    def notify_observer(self):
+        if self._observer is None:
+            return
+        self._observer.update(self.threshold)
+
+    def push_latest_data_to_queue(self, data):
+        """No-op hook; subclasses that learn from samples override it."""
+
+    def __repr__(self):
+        return "Threshold"
+
+
+class AbsoluteThreshold(Threshold):
+    def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000):
+        super().__init__(data_queue_size=data_queue_size, data_queue_update_size=data_queue_update_size)
+
+    def push_latest_data_to_queue(self, data):
+        """Ignore the sample: this class never recomputes its threshold from pushed data."""
+
+    def __repr__(self):
+        return "AbsoluteThreshold"
+
+
+class BoxplotThreshold(Threshold):
+    def __init__(self, parameter: float = 1.5, data_queue_size: int = 10000, data_queue_update_size: int = 1000):
+        super().__init__(data_queue_size, data_queue_update_size)
+        self.parameter = parameter
+
+    def _update_threshold(self):
+        """Recompute the threshold as Q3 + parameter * (Q3 - Q1) over the queued samples, then notify."""
+        samples = list(self.data_queue.queue)
+        lower_quartile = np.percentile(samples, 25)
+        upper_quartile = np.percentile(samples, 75)
+        self.threshold = upper_quartile + self.parameter * (upper_quartile - lower_quartile)
+        if self.threshold_state == ThresholdState.INIT:
+            self.threshold_state = ThresholdState.START
+        self.notify_observer()
+
+    def push_latest_data_to_queue(self, data):
+        try:
+            self.data_queue.put(data, block=False)
+        except queue.Full:
+            self.data_queue.get()
+            self.data_queue.put(data)
+        self.new_data_size += 1
+        state = self.threshold_state
+        refresh_due = state == ThresholdState.START and self.new_data_size >= self.data_queue_update_size
+        if self.data_queue.full() and (state == ThresholdState.INIT or refresh_due):
+            self._update_threshold()
+            self.new_data_size = 0
+
+    def __repr__(self):
+        return "BoxplotThreshold"
+
+
+class NSigmaThreshold(Threshold):
+    def __init__(self, parameter: float = 2.0, data_queue_size: int = 10000, data_queue_update_size: int = 1000):
+        super().__init__(data_queue_size, data_queue_update_size)
+        self.parameter = parameter
+
+    def _update_threshold(self):
+        """Recompute the threshold as mean + parameter * standard deviation of the queued samples, then notify."""
+        samples = list(self.data_queue.queue)
+        self.threshold = np.mean(samples) + self.parameter * np.std(samples)
+        if self.threshold_state == ThresholdState.INIT:
+            self.threshold_state = ThresholdState.START
+        self.notify_observer()
+
+    def push_latest_data_to_queue(self, data):
+        # Keep only the newest samples: when the queue is full, drop the oldest one first.
+        try:
+            self.data_queue.put(data, block=False)
+        except queue.Full:
+            self.data_queue.get()
+            self.data_queue.put(data)
+        self.new_data_size += 1
+        state = self.threshold_state
+        refresh_due = state == ThresholdState.START and self.new_data_size >= self.data_queue_update_size
+        if self.data_queue.full() and (state == ThresholdState.INIT or refresh_due):
+            self._update_threshold()
+            self.new_data_size = 0
+
+    def __repr__(self):
+        return "NSigmaThreshold"
+
+
+class ThresholdType(Enum):
+ AbsoluteThreshold = 0  # ThresholdFactory builds an AbsoluteThreshold for this member
+ BoxplotThreshold = 1  # ThresholdFactory builds a BoxplotThreshold for this member
+ NSigmaThreshold = 2  # ThresholdFactory builds an NSigmaThreshold for this member
+
+
+class ThresholdFactory:
+    def get_threshold(self, threshold_type: ThresholdType, *args, **kwargs):
+        """Instantiate the Threshold subclass for threshold_type; raise ValueError for an unknown type."""
+        if threshold_type == ThresholdType.AbsoluteThreshold:
+            return AbsoluteThreshold(*args, **kwargs)
+        if threshold_type == ThresholdType.BoxplotThreshold:
+            return BoxplotThreshold(*args, **kwargs)
+        if threshold_type == ThresholdType.NSigmaThreshold:
+            return NSigmaThreshold(*args, **kwargs)
+        raise ValueError(f"Invalid threshold type: {threshold_type}")
+
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py
new file mode 100644
index 0000000..f66e5ed
--- /dev/null
+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py
@@ -0,0 +1,67 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+# PURPOSE.
+# See the Mulan PSL v2 for more details.
+import logging
+from dataclasses import asdict
+
+from .threshold import ThresholdType
+from .sliding_window import SlidingWindowType
+from .io_data import MetricName, IOData
+
+def get_threshold_type_enum(algorithm_type: str):
+    """Map an algorithm name (case-insensitive) to its ThresholdType; unknown names fall back to boxplot."""
+    threshold_type = {'absolute': ThresholdType.AbsoluteThreshold,
+                      'boxplot': ThresholdType.BoxplotThreshold,
+                      'n_sigma': ThresholdType.NSigmaThreshold}.get(algorithm_type.lower())
+    if threshold_type is None:
+        logging.info('not found correct algorithm type, use default: boxplot.')
+        return ThresholdType.BoxplotThreshold
+    return threshold_type
+
+
+def get_sliding_window_type_enum(sliding_window_type: str):
+    """Map a window type name (case-insensitive) to its enum; unknown names fall back to not_continuous."""
+    window_type = {'not_continuous': SlidingWindowType.NotContinuousSlidingWindow,
+                   'continuous': SlidingWindowType.ContinuousSlidingWindow,
+                   'median': SlidingWindowType.MedianSlidingWindow}.get(sliding_window_type.lower())
+    if window_type is None:
+        logging.info('not found correct sliding window type, use default: not_continuous.')
+        return SlidingWindowType.NotContinuousSlidingWindow
+    return window_type
+
+
+def get_metric_value_from_io_data_dict_by_metric_name(io_data_dict: dict, metric_name: MetricName):
+    """Look up disk -> stage -> access type -> metric; return None when any of the keys is missing."""
+    try:
+        disk_data = io_data_dict[metric_name.get_disk_name()]
+        # asdict() turns the per-disk dataclass into nested dicts so the stage can be looked up by name.
+        stage_data = asdict(disk_data)[metric_name.get_stage_name()]
+        return stage_data[metric_name.get_io_access_type_name()][metric_name.get_metric_name()]
+    except KeyError:
+        return None
+
+
+def get_data_queue_size_and_update_size(training_data_duration: float, train_update_duration: float,
+                                        slow_io_detect_frequency: int):
+    """Turn both durations into sample counts: duration * 60 * 60 / detect frequency, truncated to int."""
+    return tuple(int(duration * 60 * 60 / slow_io_detect_frequency)
+                 for duration in (training_data_duration, train_update_duration))
+
+
+def get_log_level(log_level: str):
+    """Translate a case-insensitive level name into a logging constant; return None for unknown names."""
+    levels = {
+        'debug': logging.DEBUG,
+        'info': logging.INFO,
+        'warning': logging.WARNING,
+        'error': logging.ERROR,  # previously unrecognised, so "error" silently fell through to None
+        'critical': logging.CRITICAL, 'fatal': logging.FATAL,  # two names for the same level
+    }
+    return levels.get(log_level.lower())
diff --git a/src/python/setup.py b/src/python/setup.py
index c28c691..dac6481 100644
--- a/src/python/setup.py
+++ b/src/python/setup.py
@@ -33,7 +33,8 @@ setup(
'syssentry=syssentry.syssentry:main',
'xalarmd=xalarm.xalarm_daemon:alarm_process_create',
'sentryCollector=sentryCollector.collectd:main',
- 'avg_block_io=sentryPlugins.avg_block_io.avg_block_io:main'
+ 'avg_block_io=sentryPlugins.avg_block_io.avg_block_io:main',
+ 'ai_threshold_slow_io_detection=sentryPlugins.ai_threshold_slow_io_detection.slow_io_detection:main'
]
},
)
--
2.23.0