!163 update nvme config

From: @znzjugod 
Reviewed-by: @gaoruoshu 
Signed-off-by: @gaoruoshu
This commit is contained in:
openeuler-ci-bot 2024-11-05 07:10:11 +00:00 committed by Gitee
commit c1416f4afb
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
4 changed files with 1415 additions and 1 deletions

View File

@ -0,0 +1,626 @@
From f3a0738061e852c8125513f6222b4a5d6ea73270 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
Date: Fri, 25 Oct 2024 15:34:25 +0800
Subject: [PATCH] ai_block_io fix some config parameters parse bug
---
.../sentryPlugins/ai_block_io/ai_block_io.py | 70 +++++----
.../ai_block_io/config_parser.py | 135 ++++++++++++++----
.../sentryPlugins/ai_block_io/data_access.py | 14 ++
.../sentryPlugins/ai_block_io/detector.py | 16 ++-
.../ai_block_io/sliding_window.py | 2 +-
.../sentryPlugins/ai_block_io/threshold.py | 14 +-
src/python/sentryPlugins/ai_block_io/utils.py | 2 -
7 files changed, 180 insertions(+), 73 deletions(-)
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
index 74f246a..14f740d 100644
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
@@ -23,6 +23,7 @@ from .data_access import (
get_io_data_from_collect_plug,
check_collect_valid,
get_disk_type,
+ check_disk_is_available
)
from .io_data import MetricName
from .alarm_report import Xalarm, Report
@@ -31,14 +32,14 @@ CONFIG_FILE = "/etc/sysSentry/plugins/ai_block_io.ini"
def sig_handler(signum, frame):
- logging.info("receive signal: %d", signum)
Report.report_pass(f"receive signal: {signum}, exiting...")
+ logging.info("Finished ai_block_io plugin running.")
exit(signum)
class SlowIODetection:
_config_parser = None
- _disk_list = None
+ _disk_list = []
_detector_name_list = defaultdict(list)
_disk_detectors = {}
@@ -48,32 +49,30 @@ class SlowIODetection:
self.__init_detector()
def __init_detector_name_list(self):
- self._disk_list = check_collect_valid(
- self._config_parser.period_time
- )
- if self._disk_list is None:
- Report.report_pass(
- "get available disk error, please check if the collector plug is enable. exiting..."
- )
- logging.critical("get available disk error, please check if the collector plug is enable. exiting...")
- exit(1)
-
- logging.info(f"ai_block_io plug has found disks: {self._disk_list}")
disks: list = self._config_parser.disks_to_detection
stages: list = self._config_parser.stage
iotypes: list = self._config_parser.iotype
- # 情况1None则启用所有磁盘检测
- # 情况2is not None and len = 0则不启动任何磁盘检测
- # 情况3len = 0则取交集
+
if disks is None:
- logging.warning(
- "you not specify any disk or use default, so ai_block_io will enable all available disk."
- )
- for disk in self._disk_list:
- if disks is not None:
- if disk not in disks:
- continue
- disks.remove(disk)
+ logging.warning("you not specify any disk or use default, so ai_block_io will enable all available disk.")
+ all_available_disk_list = check_collect_valid(self._config_parser.period_time)
+ if all_available_disk_list is None:
+ Report.report_pass("get available disk error, please check if the collector plug is enable. exiting...")
+ logging.critical("get available disk error, please check if the collector plug is enable. exiting...")
+ exit(1)
+ if len(all_available_disk_list) == 0:
+ Report.report_pass("not found available disk. exiting...")
+ logging.critical("not found available disk. exiting...")
+ exit(1)
+ disks = all_available_disk_list
+ logging.info(f"available disk list is follow: {disks}.")
+
+ for disk in disks:
+ tmp_disk = [disk]
+ ret = check_disk_is_available(self._config_parser.period_time, tmp_disk)
+ if not ret:
+ logging.warning(f"disk: {disk} is not available, it will be ignored.")
+ continue
disk_type_result = get_disk_type(disk)
if disk_type_result["ret"] == 0 and disk_type_result["message"] in (
@@ -89,20 +88,15 @@ class SlowIODetection:
disk_type_result,
)
continue
+ self._disk_list.append(disk)
for stage in stages:
for iotype in iotypes:
self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "latency"))
self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "io_dump"))
- if disks:
- logging.warning(
- "disks: %s not in available disk list, so they will be ignored.",
- disks,
- )
+
if not self._detector_name_list:
+ Report.report_pass("the disks to detection is empty, ai_block_io will exit.")
logging.critical("the disks to detection is empty, ai_block_io will exit.")
- Report.report_pass(
- "the disks to detection is empty, ai_block_io will exit."
- )
exit(1)
def __init_detector(self):
@@ -202,16 +196,20 @@ class SlowIODetection:
logging.debug("step3. Report slow io event to sysSentry.")
for slow_io_event in slow_io_event_list:
alarm_content = {
+ "alarm_source": "ai_block_io",
"driver_name": slow_io_event[1],
+ "io_type": slow_io_event[4],
"reason": slow_io_event[2],
"block_stack": slow_io_event[3],
- "io_type": slow_io_event[4],
- "alarm_source": "ai_block_io",
"alarm_type": slow_io_event[5],
- "details": slow_io_event[6],
+ "details": slow_io_event[6]
}
Xalarm.major(alarm_content)
- logging.warning("[SLOW IO] " + str(alarm_content))
+ tmp_alarm_content = alarm_content.copy()
+ del tmp_alarm_content["details"]
+ logging.warning("[SLOW IO] " + str(tmp_alarm_content))
+ logging.warning(f"latency: " + str(alarm_content.get("details").get("latency")))
+ logging.warning(f"iodump: " + str(alarm_content.get("details").get("iodump")))
# Step4等待检测时间
logging.debug("step4. Wait to start next slow io event detection loop.")
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
index 91ec5c6..3049db2 100644
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
@@ -105,21 +105,26 @@ class ConfigParser:
ge=None,
lt=None,
le=None,
+ section=None
):
+ if section is not None:
+ print_key = section + "." + key
+ else:
+ print_key = key
value = config_items.get(key)
if value is None:
logging.warning(
"config of %s not found, the default value %s will be used.",
- key,
+ print_key,
default_value,
)
value = default_value
if not value:
logging.critical(
- "the value of %s is empty, ai_block_io plug will exit.", key
+ "the value of %s is empty, ai_block_io plug will exit.", print_key
)
Report.report_pass(
- f"the value of {key} is empty, ai_block_io plug will exit."
+ f"the value of {print_key} is empty, ai_block_io plug will exit."
)
exit(1)
try:
@@ -127,51 +132,51 @@ class ConfigParser:
except ValueError:
logging.critical(
"the value of %s is not a valid %s, ai_block_io plug will exit.",
- key,
+ print_key,
value_type,
)
Report.report_pass(
- f"the value of {key} is not a valid {value_type}, ai_block_io plug will exit."
+ f"the value of {print_key} is not a valid {value_type}, ai_block_io plug will exit."
)
exit(1)
if gt is not None and value <= gt:
logging.critical(
"the value of %s is not greater than %s, ai_block_io plug will exit.",
- key,
+ print_key,
gt,
)
Report.report_pass(
- f"the value of {key} is not greater than {gt}, ai_block_io plug will exit."
+ f"the value of {print_key} is not greater than {gt}, ai_block_io plug will exit."
)
exit(1)
if ge is not None and value < ge:
logging.critical(
"the value of %s is not greater than or equal to %s, ai_block_io plug will exit.",
- key,
+ print_key,
ge,
)
Report.report_pass(
- f"the value of {key} is not greater than or equal to {ge}, ai_block_io plug will exit."
+ f"the value of {print_key} is not greater than or equal to {ge}, ai_block_io plug will exit."
)
exit(1)
if lt is not None and value >= lt:
logging.critical(
"the value of %s is not less than %s, ai_block_io plug will exit.",
- key,
+ print_key,
lt,
)
Report.report_pass(
- f"the value of {key} is not less than {lt}, ai_block_io plug will exit."
+ f"the value of {print_key} is not less than {lt}, ai_block_io plug will exit."
)
exit(1)
if le is not None and value > le:
logging.critical(
"the value of %s is not less than or equal to %s, ai_block_io plug will exit.",
- key,
+ print_key,
le,
)
Report.report_pass(
- f"the value of {key} is not less than or equal to {le}, ai_block_io plug will exit."
+ f"the value of {print_key} is not less than or equal to {le}, ai_block_io plug will exit."
)
exit(1)
@@ -188,7 +193,7 @@ class ConfigParser:
frequency = self._conf["common"]["period_time"]
ret = check_detect_frequency_is_valid(frequency)
if ret is None:
- log = f"period_time: {frequency} is valid, "\
+ log = f"period_time: {frequency} is invalid, "\
f"Check whether the value range is too large or is not an "\
f"integer multiple of period_time.. exiting..."
Report.report_pass(log)
@@ -202,6 +207,7 @@ class ConfigParser:
self._conf["common"]["disk"] = None
return
disks_to_detection = disks_to_detection.strip()
+ disks_to_detection = disks_to_detection.lower()
if not disks_to_detection:
logging.critical("the value of disk is empty, ai_block_io plug will exit.")
Report.report_pass(
@@ -213,7 +219,18 @@ class ConfigParser:
if len(disk_list) == 1 and disk_list[0] == "default":
self._conf["common"]["disk"] = None
return
- self._conf["common"]["disk"] = disk_list
+ if len(disk_list) > 10:
+ ten_disk_list = disk_list[0:10]
+ other_disk_list = disk_list[10:]
+ logging.warning(f"disk only support maximum is 10, disks: {ten_disk_list} will be retained, other: {other_disk_list} will be ignored.")
+ else:
+ ten_disk_list = disk_list
+ # dict.fromkeys drops duplicates but keeps the configured order; set() would reorder the disks between runs
+ unique_disk_list = list(dict.fromkeys(ten_disk_list))
+ if len(ten_disk_list) > len(unique_disk_list):
+     logging.warning(f"disk exist duplicate, it will be deduplicate, before: {ten_disk_list}, after: {unique_disk_list}")
+ ten_disk_list = unique_disk_list
+ self._conf["common"]["disk"] = ten_disk_list
def _read_train_data_duration(self, items_algorithm: dict):
self._conf["algorithm"]["train_data_duration"] = self._get_config_value(
@@ -244,10 +261,12 @@ class ConfigParser:
def _read_algorithm_type_and_parameter(self, items_algorithm: dict):
algorithm_type = items_algorithm.get("algorithm_type")
- if algorithm_type is not None:
- self._conf["algorithm"]["algorithm_type"] = get_threshold_type_enum(
- algorithm_type
- )
+ if algorithm_type is None:
+ default_algorithm_type = self._conf["algorithm"]["algorithm_type"]
+ logging.warning(f"algorithm_type not found, it will be set default: {default_algorithm_type}")
+ else:
+ self._conf["algorithm"]["algorithm_type"] = get_threshold_type_enum(algorithm_type)
+
if self._conf["algorithm"]["algorithm_type"] is None:
logging.critical(
"the algorithm_type: %s you set is invalid. ai_block_io plug will exit.",
@@ -257,6 +276,7 @@ class ConfigParser:
f"the algorithm_type: {algorithm_type} you set is invalid. ai_block_io plug will exit."
)
exit(1)
+
elif self._conf["algorithm"]["algorithm_type"] == ThresholdType.NSigmaThreshold:
self._conf["algorithm"]["n_sigma_parameter"] = self._get_config_value(
items_algorithm,
@@ -279,9 +299,14 @@ class ConfigParser:
)
def _read_stage(self, items_algorithm: dict):
- stage_str = items_algorithm.get(
- "stage", self.DEFAULT_CONF["common"]["stage"]
- ).strip()
+ stage_str = items_algorithm.get("stage")
+ if stage_str is None:
+ stage_str = self.DEFAULT_CONF["common"]["stage"]
+ logging.warning(f"stage not found, it will be set default: {stage_str}")
+ else:
+ stage_str = stage_str.strip()
+
+ stage_str = stage_str.lower()
stage_list = stage_str.split(",")
stage_list = [stage.strip() for stage in stage_list]
if len(stage_list) == 1 and stage_list[0] == "":
@@ -307,9 +332,14 @@ class ConfigParser:
self._conf["common"]["stage"] = dup_stage_list
def _read_iotype(self, items_algorithm: dict):
- iotype_str = items_algorithm.get(
- "iotype", self.DEFAULT_CONF["common"]["iotype"]
- ).strip()
+ iotype_str = items_algorithm.get("iotype")
+ if iotype_str is None:
+ iotype_str = self.DEFAULT_CONF["common"]["iotype"]
+ logging.warning(f"iotype not found, it will be set default: {iotype_str}")
+ else:
+ iotype_str = iotype_str.strip()
+
+ iotype_str = iotype_str.lower()
iotype_list = iotype_str.split(",")
iotype_list = [iotype.strip() for iotype in iotype_list]
if len(iotype_list) == 1 and iotype_list[0] == "":
@@ -333,6 +363,13 @@ class ConfigParser:
def _read_sliding_window_type(self, items_sliding_window: dict):
sliding_window_type = items_sliding_window.get("win_type")
+
+ if sliding_window_type is None:
+ default_sliding_window_type = self._conf["algorithm"]["win_type"]
+ logging.warning(f"win_type not found, it will be set default: {default_sliding_window_type}")
+ return
+
+ sliding_window_type = sliding_window_type.strip()
if sliding_window_type is not None:
self._conf["algorithm"]["win_type"] = (
get_sliding_window_type_enum(sliding_window_type)
@@ -439,6 +476,7 @@ class ConfigParser:
int,
self.DEFAULT_CONF["latency_sata_ssd"]["read_tot_lim"],
gt=0,
+ section="latency_sata_ssd"
)
self._conf["latency_sata_ssd"]["write_tot_lim"] = self._get_config_value(
items_latency_sata_ssd,
@@ -446,21 +484,32 @@ class ConfigParser:
int,
self.DEFAULT_CONF["latency_sata_ssd"]["write_tot_lim"],
gt=0,
+ section="latency_sata_ssd"
)
self._conf["latency_sata_ssd"]["read_avg_lim"] = self._get_config_value(
items_latency_sata_ssd,
"read_avg_lim",
int,
self.DEFAULT_CONF["latency_sata_ssd"]["read_avg_lim"],
- gt=0
+ gt=0,
+ section="latency_sata_ssd"
)
self._conf["latency_sata_ssd"]["write_avg_lim"] = self._get_config_value(
items_latency_sata_ssd,
"write_avg_lim",
int,
self.DEFAULT_CONF["latency_sata_ssd"]["write_avg_lim"],
- gt=0
+ gt=0,
+ section="latency_sata_ssd"
)
+ if self._conf["latency_sata_ssd"]["read_avg_lim"] >= self._conf["latency_sata_ssd"]["read_tot_lim"]:
+ Report.report_pass("latency_sata_ssd.read_avg_lim must < latency_sata_ssd.read_tot_lim . exiting...")
+ logging.critical("latency_sata_ssd.read_avg_lim must < latency_sata_ssd.read_tot_lim . exiting...")
+ exit(1)
+ if self._conf["latency_sata_ssd"]["write_avg_lim"] >= self._conf["latency_sata_ssd"]["write_tot_lim"]:  # write avg limit must stay strictly below the write tot limit
+     Report.report_pass("latency_sata_ssd.write_avg_lim must < latency_sata_ssd.write_tot_lim . exiting...")
+     logging.critical("latency_sata_ssd.write_avg_lim must < latency_sata_ssd.write_tot_lim . exiting...")
+     exit(1)
else:
Report.report_pass("not found latency_sata_ssd section. exiting...")
logging.critical("not found latency_sata_ssd section. exiting...")
@@ -474,6 +523,7 @@ class ConfigParser:
int,
self.DEFAULT_CONF["latency_nvme_ssd"]["read_tot_lim"],
gt=0,
+ section="latency_nvme_ssd"
)
self._conf["latency_nvme_ssd"]["write_tot_lim"] = self._get_config_value(
items_latency_nvme_ssd,
@@ -481,21 +531,32 @@ class ConfigParser:
int,
self.DEFAULT_CONF["latency_nvme_ssd"]["write_tot_lim"],
gt=0,
+ section="latency_nvme_ssd"
)
self._conf["latency_nvme_ssd"]["read_avg_lim"] = self._get_config_value(
items_latency_nvme_ssd,
"read_avg_lim",
int,
self.DEFAULT_CONF["latency_nvme_ssd"]["read_avg_lim"],
- gt=0
+ gt=0,
+ section="latency_nvme_ssd"
)
self._conf["latency_nvme_ssd"]["write_avg_lim"] = self._get_config_value(
items_latency_nvme_ssd,
"write_avg_lim",
int,
self.DEFAULT_CONF["latency_nvme_ssd"]["write_avg_lim"],
- gt=0
+ gt=0,
+ section="latency_nvme_ssd"
)
+ if self._conf["latency_nvme_ssd"]["read_avg_lim"] >= self._conf["latency_nvme_ssd"]["read_tot_lim"]:
+ Report.report_pass("latency_nvme_ssd.read_avg_lim must < latency_nvme_ssd.read_tot_lim . exiting...")
+ logging.critical("latency_nvme_ssd.read_avg_lim must < latency_nvme_ssd.read_tot_lim . exiting...")
+ exit(1)
+ if self._conf["latency_nvme_ssd"]["write_avg_lim"] >= self._conf["latency_nvme_ssd"]["write_tot_lim"]:
+ Report.report_pass("latency_nvme_ssd.write_avg_lim must < latency_nvme_ssd.write_tot_lim . exiting...")
+ logging.critical("latency_nvme_ssd.write_avg_lim must < latency_nvme_ssd.write_tot_lim . exiting...")
+ exit(1)
else:
Report.report_pass("not found latency_nvme_ssd section. exiting...")
logging.critical("not found latency_nvme_ssd section. exiting...")
@@ -509,6 +570,7 @@ class ConfigParser:
int,
self.DEFAULT_CONF["latency_sata_hdd"]["read_tot_lim"],
gt=0,
+ section="latency_sata_hdd"
)
self._conf["latency_sata_hdd"]["write_tot_lim"] = self._get_config_value(
items_latency_sata_hdd,
@@ -516,21 +578,32 @@ class ConfigParser:
int,
self.DEFAULT_CONF["latency_sata_hdd"]["write_tot_lim"],
gt=0,
+ section="latency_sata_hdd"
)
self._conf["latency_sata_hdd"]["read_avg_lim"] = self._get_config_value(
items_latency_sata_hdd,
"read_avg_lim",
int,
self.DEFAULT_CONF["latency_sata_hdd"]["read_avg_lim"],
- gt=0
+ gt=0,
+ section="latency_sata_hdd"
)
self._conf["latency_sata_hdd"]["write_avg_lim"] = self._get_config_value(
items_latency_sata_hdd,
"write_avg_lim",
int,
self.DEFAULT_CONF["latency_sata_hdd"]["write_avg_lim"],
- gt=0
+ gt=0,
+ section="latency_sata_hdd"
)
+ if self._conf["latency_sata_hdd"]["read_avg_lim"] >= self._conf["latency_sata_hdd"]["read_tot_lim"]:
+ Report.report_pass("latency_sata_hdd.read_avg_lim must < latency_sata_hdd.read_tot_lim . exiting...")
+ logging.critical("latency_sata_hdd.read_avg_lim must < latency_sata_hdd.read_tot_lim . exiting...")
+ exit(1)
+ if self._conf["latency_sata_hdd"]["write_avg_lim"] >= self._conf["latency_sata_hdd"]["write_tot_lim"]:
+ Report.report_pass("latency_sata_hdd.write_avg_lim must < latency_sata_hdd.write_tot_lim . exiting...")
+ logging.critical("latency_sata_hdd.write_avg_lim must < latency_sata_hdd.write_tot_lim . exiting...")
+ exit(1)
else:
Report.report_pass("not found latency_sata_hdd section. exiting...")
logging.critical("not found latency_sata_hdd section. exiting...")
diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py
index e4869d5..2f2d607 100644
--- a/src/python/sentryPlugins/ai_block_io/data_access.py
+++ b/src/python/sentryPlugins/ai_block_io/data_access.py
@@ -67,6 +67,20 @@ def check_detect_frequency_is_valid(period):
return None
+def check_disk_is_available(period_time, disk):
+    """Report whether is_iocollect_valid yields a non-empty JSON payload.
+
+    False on a non-zero "ret", an unparsable message or an empty payload.
+    """
+    reply = is_iocollect_valid(period_time, disk)
+    if reply["ret"] != 0:
+        return False
+    try:
+        return bool(json.loads(reply["message"]))
+    except Exception:
+        return False
+
+
def _get_raw_data(period, disk_list):
return get_io_data(
period,
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
index e3a0952..496e032 100644
--- a/src/python/sentryPlugins/ai_block_io/detector.py
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
@@ -75,6 +75,18 @@ class Detector:
f' sliding_window_type: {self._slidingWindow}')
+def set_to_str(parameter: set):
+    """Return the elements of *parameter* joined by commas.
+
+    An empty set gives ""; the order follows the set's iteration order.
+    Elements must be strings (non-string elements raise TypeError, as the
+    previous concatenation loop did).
+    """
+    # str.join replaces the hand-rolled index loop and needs no special
+    # case for the first element.
+    return ",".join(parameter)
+
+
class DiskDetector:
def __init__(self, disk_name: str):
@@ -124,7 +136,7 @@ class DiskDetector:
alarm_type.add(metric_name.metric_name)
latency_wins, iodump_wins = self.get_detector_list_window()
- details = f"latency: {latency_wins}, iodump: {iodump_wins}"
+ details = {"latency": latency_wins, "iodump": iodump_wins}
io_press = {"throtl", "wbt", "iocost", "bfq"}
driver_slow = {"rq_driver"}
@@ -137,7 +149,7 @@ class DiskDetector:
elif not kernel_slow.isdisjoint(block_stack):
reason = "kernel_slow"
- return True, driver_name, reason, str(block_stack), str(io_type), str(alarm_type), details
+ return True, driver_name, reason, set_to_str(block_stack), set_to_str(io_type), set_to_str(alarm_type), details
def __repr__(self):
msg = f'disk: {self._disk_name}, '
diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py
index 4083c43..ff3fa3b 100644
--- a/src/python/sentryPlugins/ai_block_io/sliding_window.py
+++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py
@@ -107,7 +107,7 @@ class MedianSlidingWindow(SlidingWindow):
if len(self._io_data_queue) < self._queue_length or (self._ai_threshold is None and self._abs_threshold is None):
is_slow_io_event = False
median = np.median(self._io_data_queue)
- if median >= self._ai_threshold:
+ if (self._ai_threshold is not None and median > self._ai_threshold) or (self._abs_threshold is not None and median > self._abs_threshold):
is_slow_io_event = True
return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim
diff --git a/src/python/sentryPlugins/ai_block_io/threshold.py b/src/python/sentryPlugins/ai_block_io/threshold.py
index 600d041..e202bb8 100644
--- a/src/python/sentryPlugins/ai_block_io/threshold.py
+++ b/src/python/sentryPlugins/ai_block_io/threshold.py
@@ -65,9 +65,12 @@ class Threshold:
def __repr__(self):
return "Threshold"
+ def __str__(self):
+ return "Threshold"
+
class AbsoluteThreshold(Threshold):
- def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000):
+ def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs):
super().__init__(data_queue_size, data_queue_update_size)
def push_latest_data_to_queue(self, data):
@@ -76,6 +79,9 @@ class AbsoluteThreshold(Threshold):
def __repr__(self):
return "[AbsoluteThreshold]"
+ def __str__(self):
+ return "absolute"
+
class BoxplotThreshold(Threshold):
def __init__(self, boxplot_parameter: float = 1.5, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs):
@@ -112,6 +118,9 @@ class BoxplotThreshold(Threshold):
def __repr__(self):
return f"[BoxplotThreshold, param is: {self.parameter}, train_size: {self.data_queue.maxsize}, update_size: {self.data_queue_update_size}]"
+ def __str__(self):
+ return "boxplot"
+
class NSigmaThreshold(Threshold):
def __init__(self, n_sigma_parameter: float = 3.0, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs):
@@ -147,6 +156,9 @@ class NSigmaThreshold(Threshold):
def __repr__(self):
return f"[NSigmaThreshold, param is: {self.parameter}, train_size: {self.data_queue.maxsize}, update_size: {self.data_queue_update_size}]"
+ def __str__(self):
+ return "n_sigma"
+
class ThresholdType(Enum):
AbsoluteThreshold = 0
diff --git a/src/python/sentryPlugins/ai_block_io/utils.py b/src/python/sentryPlugins/ai_block_io/utils.py
index d6f4067..7d2390b 100644
--- a/src/python/sentryPlugins/ai_block_io/utils.py
+++ b/src/python/sentryPlugins/ai_block_io/utils.py
@@ -19,8 +19,6 @@ from .io_data import MetricName, IOData
def get_threshold_type_enum(algorithm_type: str):
- if algorithm_type.lower() == "absolute":
- return ThresholdType.AbsoluteThreshold
if algorithm_type.lower() == "boxplot":
return ThresholdType.BoxplotThreshold
if algorithm_type.lower() == "n_sigma":
--
2.23.0

View File

@ -0,0 +1,728 @@
From cedd862d4e4a97a6c4fa13cbff2af452910ea5b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
Date: Thu, 24 Oct 2024 09:39:16 +0800
Subject: [PATCH] ai_block_io support absolute threshold lower limit
---
config/plugins/ai_block_io.ini | 19 +-
.../sentryPlugins/ai_block_io/ai_block_io.py | 36 ++--
.../sentryPlugins/ai_block_io/alarm_report.py | 18 +-
.../ai_block_io/config_parser.py | 168 ++++++++++++------
.../sentryPlugins/ai_block_io/detector.py | 92 ++++++----
.../ai_block_io/sliding_window.py | 21 ++-
6 files changed, 222 insertions(+), 132 deletions(-)
diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini
index 040237d..d0b1e74 100644
--- a/config/plugins/ai_block_io.ini
+++ b/config/plugins/ai_block_io.ini
@@ -2,9 +2,9 @@
level=info
[common]
-slow_io_detect_frequency=1
+period_time=1
disk=default
-stage=bio
+stage=default
iotype=read,write
[algorithm]
@@ -12,22 +12,25 @@ train_data_duration=24
train_update_duration=2
algorithm_type=boxplot
boxplot_parameter=1.5
-n_sigma_parameter=3
-
-[sliding_window]
-sliding_window_type=not_continuous
-window_size=30
-window_minimum_threshold=6
+win_type=not_continuous
+win_size=30
+win_threshold=6
[latency_sata_ssd]
+read_avg_lim=10000
+write_avg_lim=10000
read_tot_lim=50000
write_tot_lim=50000
[latency_nvme_ssd]
+read_avg_lim=300
+write_avg_lim=300
read_tot_lim=500
write_tot_lim=500
[latency_sata_hdd]
+read_avg_lim=15000
+write_avg_lim=15000
read_tot_lim=50000
write_tot_lim=50000
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
index f25e6d5..74f246a 100644
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
@@ -49,7 +49,7 @@ class SlowIODetection:
def __init_detector_name_list(self):
self._disk_list = check_collect_valid(
- self._config_parser.slow_io_detect_frequency
+ self._config_parser.period_time
)
if self._disk_list is None:
Report.report_pass(
@@ -109,7 +109,7 @@ class SlowIODetection:
train_data_duration, train_update_duration = (
self._config_parser.get_train_data_duration_and_train_update_duration()
)
- slow_io_detection_frequency = self._config_parser.slow_io_detect_frequency
+ slow_io_detection_frequency = self._config_parser.period_time
threshold_type = self._config_parser.algorithm_type
data_queue_size, update_size = get_data_queue_size_and_update_size(
train_data_duration, train_update_duration, slow_io_detection_frequency
@@ -131,10 +131,13 @@ class SlowIODetection:
data_queue_size=data_queue_size,
data_queue_update_size=update_size,
)
- abs_threshold = self._config_parser.get_tot_lim(
+ tot_lim = self._config_parser.get_tot_lim(
metric_name.disk_type, metric_name.io_access_type_name
)
- if abs_threshold is None:
+ avg_lim = self._config_parser.get_avg_lim(
+ metric_name.disk_type, metric_name.io_access_type_name
+ )
+ if tot_lim is None:
logging.warning(
"disk %s, disk type %s, io type %s, get tot lim error, so it will be ignored.",
disk,
@@ -145,7 +148,8 @@ class SlowIODetection:
sliding_window_type,
queue_length=window_size,
threshold=window_threshold,
- abs_threshold=abs_threshold,
+ abs_threshold=tot_lim,
+ avg_lim=avg_lim
)
detector = Detector(metric_name, threshold, sliding_window)
disk_detector.add_detector(detector)
@@ -176,7 +180,7 @@ class SlowIODetection:
# Step1获取IO数据
io_data_dict_with_disk_name = get_io_data_from_collect_plug(
- self._config_parser.slow_io_detect_frequency, self._disk_list
+ self._config_parser.period_time, self._disk_list
)
logging.debug(f"step1. Get io data: {str(io_data_dict_with_disk_name)}")
if io_data_dict_with_disk_name is None:
@@ -197,25 +201,21 @@ class SlowIODetection:
# Step3慢IO事件上报
logging.debug("step3. Report slow io event to sysSentry.")
for slow_io_event in slow_io_event_list:
- metric_name: MetricName = slow_io_event[1]
- window_info = slow_io_event[2]
- root_cause = slow_io_event[3]
alarm_content = {
- "driver_name": f"{metric_name.disk_name}",
- "reason": root_cause,
- "block_stack": f"{metric_name.stage_name}",
- "io_type": f"{metric_name.io_access_type_name}",
+ "driver_name": slow_io_event[1],
+ "reason": slow_io_event[2],
+ "block_stack": slow_io_event[3],
+ "io_type": slow_io_event[4],
"alarm_source": "ai_block_io",
- "alarm_type": "latency",
- "details": f"disk type: {metric_name.disk_type}, current window: {window_info[1]}, "
- f"ai threshold: {window_info[2]}, abs threshold: {window_info[3]}.",
+ "alarm_type": slow_io_event[5],
+ "details": slow_io_event[6],
}
Xalarm.major(alarm_content)
- logging.warning(alarm_content)
+ logging.warning("[SLOW IO] " + str(alarm_content))
# Step4等待检测时间
logging.debug("step4. Wait to start next slow io event detection loop.")
- time.sleep(self._config_parser.slow_io_detect_frequency)
+ time.sleep(self._config_parser.period_time)
def main():
diff --git a/src/python/sentryPlugins/ai_block_io/alarm_report.py b/src/python/sentryPlugins/ai_block_io/alarm_report.py
index 92bd6e3..61bb145 100644
--- a/src/python/sentryPlugins/ai_block_io/alarm_report.py
+++ b/src/python/sentryPlugins/ai_block_io/alarm_report.py
@@ -30,17 +30,17 @@ class Report:
@staticmethod
def report_pass(info: str):
report_result(Report.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info}))
- logging.info(f'Report {Report.TASK_NAME} PASS: {info}')
+ logging.debug(f'Report {Report.TASK_NAME} PASS: {info}')
@staticmethod
def report_fail(info: str):
report_result(Report.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info}))
- logging.info(f'Report {Report.TASK_NAME} FAIL: {info}')
+ logging.debug(f'Report {Report.TASK_NAME} FAIL: {info}')
@staticmethod
def report_skip(info: str):
report_result(Report.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info}))
- logging.info(f'Report {Report.TASK_NAME} SKIP: {info}')
+ logging.debug(f'Report {Report.TASK_NAME} SKIP: {info}')
class Xalarm:
@@ -50,31 +50,31 @@ class Xalarm:
def minor(info: dict):
info_str = json.dumps(info)
xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_OCCUR, info_str)
- logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM: {info_str}")
+ logging.debug(f"Report {Xalarm.ALARM_ID} MINOR_ALM: {info_str}")
@staticmethod
def major(info: dict):
info_str = json.dumps(info)
xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_OCCUR, info_str)
- logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM: {info_str}")
+ logging.debug(f"Report {Xalarm.ALARM_ID} MAJOR_ALM: {info_str}")
@staticmethod
def critical(info: dict):
info_str = json.dumps(info)
xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_OCCUR, info_str)
- logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM: {info_str}")
+ logging.debug(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM: {info_str}")
def minor_recover(info: dict):
info_str = json.dumps(info)
xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_RECOVER, info_str)
- logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM Recover: {info_str}")
+ logging.debug(f"Report {Xalarm.ALARM_ID} MINOR_ALM Recover: {info_str}")
def major_recover(info: dict):
info_str = json.dumps(info)
xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_RECOVER, info_str)
- logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM Recover: {info_str}")
+ logging.debug(f"Report {Xalarm.ALARM_ID} MAJOR_ALM Recover: {info_str}")
def critical_recover(info: dict):
info_str = json.dumps(info)
xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_RECOVER, info_str)
- logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM Recover: {info_str}")
+ logging.debug(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM Recover: {info_str}")
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
index 1117939..91ec5c6 100644
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
@@ -52,7 +52,7 @@ class ConfigParser:
DEFAULT_CONF = {
"log": {"level": "info"},
"common": {
- "slow_io_detect_frequency": 1,
+ "period_time": 1,
"disk": None,
"stage": "throtl,wbt,gettag,plug,deadline,hctx,requeue,rq_driver,bio",
"iotype": "read,write",
@@ -63,16 +63,32 @@ class ConfigParser:
"algorithm_type": get_threshold_type_enum("boxplot"),
"boxplot_parameter": 1.5,
"n_sigma_parameter": 3.0,
+ "win_type": get_sliding_window_type_enum("not_continuous"),
+ "win_size": 30,
+ "win_threshold": 6,
},
- "sliding_window": {
- "sliding_window_type": get_sliding_window_type_enum("not_continuous"),
- "window_size": 30,
- "window_minimum_threshold": 6,
+ "latency_sata_ssd": {
+ "read_avg_lim": 10000,
+ "write_avg_lim": 10000,
+ "read_tot_lim": 50000,
+ "write_tot_lim": 50000
},
- "latency_sata_ssd": {"read_tot_lim": 50000, "write_tot_lim": 50000},
- "latency_nvme_ssd": {"read_tot_lim": 500, "write_tot_lim": 500},
- "latency_sata_hdd": {"read_tot_lim": 50000, "write_tot_lim": 50000},
- "iodump": {"read_iodump_lim": 0, "write_iodump_lim": 0}
+ "latency_nvme_ssd": {
+ "read_avg_lim": 300,
+ "write_avg_lim": 300,
+ "read_tot_lim": 500,
+ "write_tot_lim": 500
+ },
+ "latency_sata_hdd": {
+ "read_avg_lim": 15000,
+ "write_avg_lim": 15000,
+ "read_tot_lim": 50000,
+ "write_tot_lim": 50000
+ },
+ "iodump": {
+ "read_iodump_lim": 0,
+ "write_iodump_lim": 0
+ }
}
def __init__(self, config_file_name):
@@ -161,18 +177,18 @@ class ConfigParser:
return value
- def _read_slow_io_detect_frequency(self, items_common: dict):
- self._conf["common"]["slow_io_detect_frequency"] = self._get_config_value(
+ def _read_period_time(self, items_common: dict):
+ self._conf["common"]["period_time"] = self._get_config_value(
items_common,
- "slow_io_detect_frequency",
+ "period_time",
int,
- self.DEFAULT_CONF["common"]["slow_io_detect_frequency"],
+ self.DEFAULT_CONF["common"]["period_time"],
gt=0
)
- frequency = self._conf["common"]["slow_io_detect_frequency"]
+ frequency = self._conf["common"]["period_time"]
ret = check_detect_frequency_is_valid(frequency)
if ret is None:
- log = f"slow io detect frequency: {frequency} is valid, "\
+ log = f"period_time: {frequency} is invalid, "\
f"Check whether the value range is too large or is not an "\
f"integer multiple of period_time.. exiting..."
Report.report_pass(log)
@@ -316,50 +332,41 @@ class ConfigParser:
self._conf["common"]["iotype"] = dup_iotype_list
def _read_sliding_window_type(self, items_sliding_window: dict):
- sliding_window_type = items_sliding_window.get("sliding_window_type")
+ sliding_window_type = items_sliding_window.get("win_type")
if sliding_window_type is not None:
- self._conf["sliding_window"]["sliding_window_type"] = (
+ self._conf["algorithm"]["win_type"] = (
get_sliding_window_type_enum(sliding_window_type)
)
- if self._conf["sliding_window"]["sliding_window_type"] is None:
+ if self._conf["algorithm"]["win_type"] is None:
logging.critical(
- "the sliding_window_type: %s you set is invalid. ai_block_io plug will exit.",
+ "the win_type: %s you set is invalid. ai_block_io plug will exit.",
sliding_window_type,
)
Report.report_pass(
- f"the sliding_window_type: {sliding_window_type} you set is invalid. ai_block_io plug will exit."
+ f"the win_type: {sliding_window_type} you set is invalid. ai_block_io plug will exit."
)
exit(1)
def _read_window_size(self, items_sliding_window: dict):
- self._conf["sliding_window"]["window_size"] = self._get_config_value(
+ self._conf["algorithm"]["win_size"] = self._get_config_value(
items_sliding_window,
- "window_size",
+ "win_size",
int,
- self.DEFAULT_CONF["sliding_window"]["window_size"],
+ self.DEFAULT_CONF["algorithm"]["win_size"],
gt=0,
- le=3600,
+ le=300,
)
def _read_window_minimum_threshold(self, items_sliding_window: dict):
- default_window_minimum_threshold = self.DEFAULT_CONF["sliding_window"][
- "window_minimum_threshold"
- ]
- if (
- default_window_minimum_threshold
- > self._conf["sliding_window"]["window_size"]
- ):
- default_window_minimum_threshold = (
- self._conf["sliding_window"]["window_size"] / 2
- )
- self._conf["sliding_window"]["window_minimum_threshold"] = (
+ default_window_minimum_threshold = self.DEFAULT_CONF["algorithm"]["win_threshold"]
+ self._conf["algorithm"]["win_threshold"] = (
self._get_config_value(
items_sliding_window,
- "window_minimum_threshold",
+ "win_threshold",
int,
default_window_minimum_threshold,
gt=0,
- le=self._conf["sliding_window"]["window_size"],
+ le=self._conf["algorithm"]["win_size"],
)
)
@@ -406,7 +413,7 @@ class ConfigParser:
if con.has_section("common"):
items_common = dict(con.items("common"))
- self._read_slow_io_detect_frequency(items_common)
+ self._read_period_time(items_common)
self._read_disks_to_detect(items_common)
self._read_stage(items_common)
self._read_iotype(items_common)
@@ -420,20 +427,9 @@ class ConfigParser:
self._read_train_data_duration(items_algorithm)
self._read_train_update_duration(items_algorithm)
self._read_algorithm_type_and_parameter(items_algorithm)
- else:
- Report.report_pass("not found algorithm section. exiting...")
- logging.critical("not found algorithm section. exiting...")
- exit(1)
-
- if con.has_section("sliding_window"):
- items_sliding_window = dict(con.items("sliding_window"))
-
- self._read_window_size(items_sliding_window)
- self._read_window_minimum_threshold(items_sliding_window)
- else:
- Report.report_pass("not found sliding_window section. exiting...")
- logging.critical("not found sliding_window section. exiting...")
- exit(1)
+ self._read_sliding_window_type(items_algorithm)
+ self._read_window_size(items_algorithm)
+ self._read_window_minimum_threshold(items_algorithm)
if con.has_section("latency_sata_ssd"):
items_latency_sata_ssd = dict(con.items("latency_sata_ssd"))
@@ -451,6 +447,20 @@ class ConfigParser:
self.DEFAULT_CONF["latency_sata_ssd"]["write_tot_lim"],
gt=0,
)
+ self._conf["latency_sata_ssd"]["read_avg_lim"] = self._get_config_value(
+ items_latency_sata_ssd,
+ "read_avg_lim",
+ int,
+ self.DEFAULT_CONF["latency_sata_ssd"]["read_avg_lim"],
+ gt=0
+ )
+ self._conf["latency_sata_ssd"]["write_avg_lim"] = self._get_config_value(
+ items_latency_sata_ssd,
+ "write_avg_lim",
+ int,
+ self.DEFAULT_CONF["latency_sata_ssd"]["write_avg_lim"],
+ gt=0
+ )
else:
Report.report_pass("not found latency_sata_ssd section. exiting...")
logging.critical("not found latency_sata_ssd section. exiting...")
@@ -472,6 +482,20 @@ class ConfigParser:
self.DEFAULT_CONF["latency_nvme_ssd"]["write_tot_lim"],
gt=0,
)
+ self._conf["latency_nvme_ssd"]["read_avg_lim"] = self._get_config_value(
+ items_latency_nvme_ssd,
+ "read_avg_lim",
+ int,
+ self.DEFAULT_CONF["latency_nvme_ssd"]["read_avg_lim"],
+ gt=0
+ )
+ self._conf["latency_nvme_ssd"]["write_avg_lim"] = self._get_config_value(
+ items_latency_nvme_ssd,
+ "write_avg_lim",
+ int,
+ self.DEFAULT_CONF["latency_nvme_ssd"]["write_avg_lim"],
+ gt=0
+ )
else:
Report.report_pass("not found latency_nvme_ssd section. exiting...")
logging.critical("not found latency_nvme_ssd section. exiting...")
@@ -493,6 +517,20 @@ class ConfigParser:
self.DEFAULT_CONF["latency_sata_hdd"]["write_tot_lim"],
gt=0,
)
+ self._conf["latency_sata_hdd"]["read_avg_lim"] = self._get_config_value(
+ items_latency_sata_hdd,
+ "read_avg_lim",
+ int,
+ self.DEFAULT_CONF["latency_sata_hdd"]["read_avg_lim"],
+ gt=0
+ )
+ self._conf["latency_sata_hdd"]["write_avg_lim"] = self._get_config_value(
+ items_latency_sata_hdd,
+ "write_avg_lim",
+ int,
+ self.DEFAULT_CONF["latency_sata_hdd"]["write_avg_lim"],
+ gt=0
+ )
else:
Report.report_pass("not found latency_sata_hdd section. exiting...")
logging.critical("not found latency_sata_hdd section. exiting...")
@@ -542,6 +580,18 @@ class ConfigParser:
else:
return None
+ def get_avg_lim(self, disk_type, io_type):
+ if io_type == "read":
+ return self._conf.get(
+ f"latency_{DISK_TYPE_MAP.get(disk_type, '')}", {}
+ ).get("read_avg_lim", None)
+ elif io_type == "write":
+ return self._conf.get(
+ f"latency_{DISK_TYPE_MAP.get(disk_type, '')}", {}
+ ).get("write_avg_lim", None)
+ else:
+ return None
+
def get_train_data_duration_and_train_update_duration(self):
return (
self._conf["algorithm"]["train_data_duration"],
@@ -550,13 +600,13 @@ class ConfigParser:
def get_window_size_and_window_minimum_threshold(self):
return (
- self._conf["sliding_window"]["window_size"],
- self._conf["sliding_window"]["window_minimum_threshold"],
+ self._conf["algorithm"]["win_size"],
+ self._conf["algorithm"]["win_threshold"],
)
@property
- def slow_io_detect_frequency(self):
- return self._conf["common"]["slow_io_detect_frequency"]
+ def period_time(self):
+ return self._conf["common"]["period_time"]
@property
def algorithm_type(self):
@@ -564,7 +614,7 @@ class ConfigParser:
@property
def sliding_window_type(self):
- return self._conf["sliding_window"]["sliding_window_type"]
+ return self._conf["algorithm"]["win_type"]
@property
def train_data_duration(self):
@@ -576,11 +626,11 @@ class ConfigParser:
@property
def window_size(self):
- return self._conf["sliding_window"]["window_size"]
+ return self._conf["algorithm"]["win_size"]
@property
def window_minimum_threshold(self):
- return self._conf["sliding_window"]["window_minimum_threshold"]
+ return self._conf["algorithm"]["win_threshold"]
@property
def absolute_threshold(self):
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
index 8536f7a..e3a0952 100644
--- a/src/python/sentryPlugins/ai_block_io/detector.py
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
@@ -28,9 +28,13 @@ class Detector:
self._threshold.attach_observer(self._slidingWindow)
self._count = None
- def get_metric_name(self):
+ @property
+ def metric_name(self):
return self._metric_name
+ def get_sliding_window_data(self):
+ return self._slidingWindow.get_data()
+
def is_slow_io_event(self, io_data_dict_with_disk_name: dict):
if self._count is None:
self._count = datetime.now()
@@ -38,22 +42,27 @@ class Detector:
now_time = datetime.now()
time_diff = (now_time - self._count).total_seconds()
if time_diff >= 60:
- logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.")
+ logging.info(f"({self._metric_name}) 's latest ai threshold is: {self._threshold.get_threshold()}.")
self._count = None
logging.debug(f'enter Detector: {self}')
metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name)
if metric_value is None:
logging.debug('not found metric value, so return None.')
- return (False, False), None, None, None
+ return (False, False), None, None, None, None
logging.debug(f'input metric value: {str(metric_value)}')
self._threshold.push_latest_data_to_queue(metric_value)
detection_result = self._slidingWindow.is_slow_io_event(metric_value)
# 检测到慢周期由Detector负责打印info级别日志
if detection_result[0][1]:
- logging.info(f'[abnormal period happen]: disk info: {self._metric_name}, window: {detection_result[1]}, '
- f'current value: {metric_value}, ai threshold: {detection_result[2]}, '
- f'absolute threshold: {detection_result[3]}')
+ logging.info(f'[abnormal_period]: disk: {self._metric_name.disk_name}, '
+ f'stage: {self._metric_name.stage_name}, '
+ f'iotype: {self._metric_name.io_access_type_name}, '
+ f'metric: {self._metric_name.metric_name}, '
+ f'current value: {metric_value}, '
+ f'ai threshold: {detection_result[2]}, '
+ f'absolute threshold upper limit: {detection_result[3]}, '
+ f'lower limit: {detection_result[4]}')
else:
logging.debug(f'Detection result: {str(detection_result)}')
logging.debug(f'exit Detector: {self}')
@@ -75,41 +84,60 @@ class DiskDetector:
def add_detector(self, detector: Detector):
self._detector_list.append(detector)
+ def get_detector_list_window(self):
+ latency_wins = {"read": {}, "write": {}}
+ iodump_wins = {"read": {}, "write": {}}
+ for detector in self._detector_list:
+ if detector.metric_name.metric_name == 'latency':
+ latency_wins[detector.metric_name.io_access_type_name][detector.metric_name.stage_name] = detector.get_sliding_window_data()
+ elif detector.metric_name.metric_name == 'io_dump':
+ iodump_wins[detector.metric_name.io_access_type_name][detector.metric_name.stage_name] = detector.get_sliding_window_data()
+ return latency_wins, iodump_wins
+
def is_slow_io_event(self, io_data_dict_with_disk_name: dict):
- """
- 根因诊断逻辑只有bio阶段发生异常才认为发生了慢IO事件即bio阶段异常是慢IO事件的必要条件
- 情况一bio异常rq_driver也异常则慢盘
- 情况二bio异常rq_driver无异常且有内核IO栈任意阶段异常则IO栈异常
- 情况三bio异常rq_driver无异常且无内核IO栈任意阶段异常则IO压力大
- 情况四bio异常则UNKNOWN
- """
- diagnosis_info = {"bio": [], "rq_driver": [], "io_stage": []}
+ diagnosis_info = {"bio": [], "rq_driver": [], "kernel_stack": []}
for detector in self._detector_list:
# result返回内容(是否检测到慢IO是否检测到慢周期)、窗口、ai阈值、绝对阈值
# 示例: (False, False), self._io_data_queue, self._ai_threshold, self._abs_threshold
result = detector.is_slow_io_event(io_data_dict_with_disk_name)
if result[0][0]:
- if detector.get_metric_name().stage_name == "bio":
- diagnosis_info["bio"].append((detector.get_metric_name(), result))
- elif detector.get_metric_name().stage_name == "rq_driver":
- diagnosis_info["rq_driver"].append((detector.get_metric_name(), result))
+ if detector.metric_name.stage_name == "bio":
+ diagnosis_info["bio"].append(detector.metric_name)
+ elif detector.metric_name.stage_name == "rq_driver":
+ diagnosis_info["rq_driver"].append(detector.metric_name)
else:
- diagnosis_info["io_stage"].append((detector.get_metric_name(), result))
+ diagnosis_info["kernel_stack"].append(detector.metric_name)
- # 返回内容1是否检测到慢IO事件、2MetricName、3滑动窗口及阈值、4慢IO事件根因
- root_cause = None
if len(diagnosis_info["bio"]) == 0:
- return False, None, None, None
- elif len(diagnosis_info["rq_driver"]) != 0:
- root_cause = "[Root Cause: disk slow]"
- elif len(diagnosis_info["io_stage"]) != 0:
- stage_list = []
- for io_stage in diagnosis_info["io_stage"]:
- stage_list.append(io_stage[0].stage_name)
- root_cause = f"[Root Cause: io stage slow, stage: {stage_list}]"
- if root_cause is None:
- root_cause = "[Root Cause: high io pressure]"
- return True, diagnosis_info["bio"][0][0], diagnosis_info["bio"][0][1], root_cause
+ return False, None, None, None, None, None, None
+
+ driver_name = self._disk_name
+ reason = "unknown"
+ block_stack = set()
+ io_type = set()
+ alarm_type = set()
+
+ for key, value in diagnosis_info.items():
+ for metric_name in value:
+ block_stack.add(metric_name.stage_name)
+ io_type.add(metric_name.io_access_type_name)
+ alarm_type.add(metric_name.metric_name)
+
+ latency_wins, iodump_wins = self.get_detector_list_window()
+ details = f"latency: {latency_wins}, iodump: {iodump_wins}"
+
+ io_press = {"throtl", "wbt", "iocost", "bfq"}
+ driver_slow = {"rq_driver"}
+ kernel_slow = {"gettag", "plug", "deadline", "hctx", "requeue"}
+
+ if not io_press.isdisjoint(block_stack):
+ reason = "io_press"
+ elif not driver_slow.isdisjoint(block_stack):
+ reason = "driver_slow"
+ elif not kernel_slow.isdisjoint(block_stack):
+ reason = "kernel_slow"
+
+ return True, driver_name, reason, str(block_stack), str(io_type), str(alarm_type), details
def __repr__(self):
msg = f'disk: {self._disk_name}, '
diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py
index cebe41f..4083c43 100644
--- a/src/python/sentryPlugins/ai_block_io/sliding_window.py
+++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py
@@ -21,11 +21,12 @@ class SlidingWindowType(Enum):
class SlidingWindow:
- def __init__(self, queue_length: int, threshold: int, abs_threshold: int = None):
+ def __init__(self, queue_length: int, threshold: int, abs_threshold: int = None, avg_lim: int = None):
self._queue_length = queue_length
self._queue_threshold = threshold
self._ai_threshold = None
self._abs_threshold = abs_threshold
+ self._avg_lim = avg_lim
self._io_data_queue = []
self._io_data_queue_abnormal_tag = []
@@ -35,8 +36,13 @@ class SlidingWindow:
self._io_data_queue_abnormal_tag.pop(0)
self._io_data_queue.append(data)
tag = False
- if ((self._ai_threshold is not None and data > self._ai_threshold) or
- (self._abs_threshold is not None and data > self._abs_threshold)):
+ if self._avg_lim is not None and data < self._avg_lim:
+ tag = False
+ self._io_data_queue_abnormal_tag.append(tag)
+ return tag
+ if self._ai_threshold is not None and data > self._ai_threshold:
+ tag = True
+ if self._abs_threshold is not None and data > self._abs_threshold:
tag = True
self._io_data_queue_abnormal_tag.append(tag)
return tag
@@ -52,6 +58,9 @@ class SlidingWindow:
def is_slow_io_event(self, data):
return False, None, None, None
+ def get_data(self):
+ return self._io_data_queue
+
def __repr__(self):
return "[SlidingWindow]"
@@ -64,7 +73,7 @@ class NotContinuousSlidingWindow(SlidingWindow):
is_slow_io_event = False
if self._io_data_queue_abnormal_tag.count(True) >= self._queue_threshold:
is_slow_io_event = True
- return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold
+ return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim
def __repr__(self):
return f"[NotContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]"
@@ -85,7 +94,7 @@ class ContinuousSlidingWindow(SlidingWindow):
break
else:
consecutive_count = 0
- return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold
+ return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim
def __repr__(self):
return f"[ContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]"
@@ -100,7 +109,7 @@ class MedianSlidingWindow(SlidingWindow):
median = np.median(self._io_data_queue)
if median >= self._ai_threshold:
is_slow_io_event = True
- return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold
+ return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim
def __repr__(self):
return f"[MedianSlidingWindow, window size: {self._queue_length}]"
--
2.23.0

View File

@ -4,7 +4,7 @@
Summary: System Inspection Framework Summary: System Inspection Framework
Name: sysSentry Name: sysSentry
Version: 1.0.2 Version: 1.0.2
Release: 59 Release: 60
License: Mulan PSL v2 License: Mulan PSL v2
Group: System Environment/Daemons Group: System Environment/Daemons
Source0: https://gitee.com/openeuler/sysSentry/releases/download/v%{version}/%{name}-%{version}.tar.gz Source0: https://gitee.com/openeuler/sysSentry/releases/download/v%{version}/%{name}-%{version}.tar.gz
@ -79,6 +79,9 @@ Patch66: fix-excessive-CPU-usage.patch
Patch67: fix-uint8-bug-and-change-isolation-default-value.patch Patch67: fix-uint8-bug-and-change-isolation-default-value.patch
Patch68: fix-write-file-return-code-bug.patch Patch68: fix-write-file-return-code-bug.patch
Patch69: change-avg_block_io-config.patch Patch69: change-avg_block_io-config.patch
Patch70: ai_block_io-support-absolute-threshold-lower-limit.patch
Patch71: ai_block_io-fix-some-config-parameters-parse-bug.patch
Patch72: update-nvme-config.patch
BuildRequires: cmake gcc-c++ BuildRequires: cmake gcc-c++
BuildRequires: python3 python3-setuptools BuildRequires: python3 python3-setuptools
@ -372,6 +375,12 @@ rm -rf %{buildroot}
%attr(0550,root,root) %{python3_sitelib}/syssentry/bmc_alarm.py %attr(0550,root,root) %{python3_sitelib}/syssentry/bmc_alarm.py
%changelog %changelog
* Tue Nov 5 2024 zhangnan <zhangnan134@huawei.com> - 1.0.2-60
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:update nvme config
* Tue Nov 5 2024 gaoruoshu <gaoruoshu@huawei.com> - 1.0.2-59 * Tue Nov 5 2024 gaoruoshu <gaoruoshu@huawei.com> - 1.0.2-59
- Type:bugfix - Type:bugfix
- CVE:NA - CVE:NA

51
update-nvme-config.patch Normal file
View File

@ -0,0 +1,51 @@
From f50b4e1b7f5fa38b1930349b1a9a905eb5307ab7 Mon Sep 17 00:00:00 2001
From: znzjugod <zhangnan134@huawei.com>
Date: Tue, 5 Nov 2024 11:47:56 +0800
Subject: [PATCH] update nvme config
---
config/plugins/ai_block_io.ini | 8 ++++----
src/python/sentryPlugins/ai_block_io/config_parser.py | 8 ++++----
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini
index d0b1e74..69f44ba 100644
--- a/config/plugins/ai_block_io.ini
+++ b/config/plugins/ai_block_io.ini
@@ -23,10 +23,10 @@ read_tot_lim=50000
write_tot_lim=50000
[latency_nvme_ssd]
-read_avg_lim=300
-write_avg_lim=300
-read_tot_lim=500
-write_tot_lim=500
+read_avg_lim=10000
+write_avg_lim=10000
+read_tot_lim=50000
+write_tot_lim=50000
[latency_sata_hdd]
read_avg_lim=15000
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
index 3049db2..1bbb609 100644
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
@@ -74,10 +74,10 @@ class ConfigParser:
"write_tot_lim": 50000
},
"latency_nvme_ssd": {
- "read_avg_lim": 300,
- "write_avg_lim": 300,
- "read_tot_lim": 500,
- "write_tot_lim": 500
+ "read_avg_lim": 10000,
+ "write_avg_lim": 10000,
+ "read_tot_lim": 50000,
+ "write_tot_lim": 50000
},
"latency_sata_hdd": {
"read_avg_lim": 15000,
--
2.45.2