sysSentry/refactor-config.py-and-bugfix-uncorrect-slow-io-repo.patch
gaoruoshu 7e035b92d0 refactor config.py and bugfix uncorrect slow io report
A get_io_data failure won't stop avg_block_io, and deleting disks is not supported

Signed-off-by: gaoruoshu <gaoruoshu@huawei.com>
2024-10-15 21:40:07 +08:00

567 lines
21 KiB
Diff
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

From d5cb115a97e27c8270e8fb385fb3914af9ba3c34 Mon Sep 17 00:00:00 2001
From: gaoruoshu <gaoruoshu@huawei.com>
Date: Tue, 15 Oct 2024 10:00:07 +0000
Subject: [PATCH] refactor config.py and fix incorrect slow io report
Signed-off-by: gaoruoshu <gaoruoshu@huawei.com>
---
.../avg_block_io/avg_block_io.py | 155 ++-----------
.../sentryPlugins/avg_block_io/config.py | 208 ++++++++++++++++++
.../sentryPlugins/avg_block_io/module_conn.py | 9 +-
.../sentryPlugins/avg_block_io/utils.py | 72 ------
4 files changed, 238 insertions(+), 206 deletions(-)
create mode 100644 src/python/sentryPlugins/avg_block_io/config.py
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
index f3ade09..cd47919 100644
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
@@ -13,132 +13,13 @@ import signal
import configparser
import time
+from .config import read_config_log, read_config_common, read_config_algorithm, read_config_latency, read_config_iodump, read_config_stage
from .stage_window import IoWindow, IoDumpWindow
from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name
-from .utils import update_avg_and_check_abnormal, get_log_level, get_section_value
-from sentryCollector.collect_plugin import Disk_Type
+from .utils import update_avg_and_check_abnormal
CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini"
-def log_invalid_keys(not_in_list, keys_name, config_list, default_list):
- """print invalid log"""
- if config_list and not_in_list:
- logging.warning("{} in common.{} are not valid, set {}={}".format(not_in_list, keys_name, keys_name, default_list))
- elif config_list == ["default"]:
- logging.warning("Default {} use {}".format(keys_name, default_list))
-
-
-def read_config_common(config):
- """read config file, get [common] section value"""
- if not config.has_section("common"):
- report_alarm_fail("Cannot find common section in config file")
-
- try:
- disk_name = config.get("common", "disk")
- disk = [] if disk_name == "default" else disk_name.split(",")
- except configparser.NoOptionError:
- disk = []
- logging.warning("Unset common.disk, set to default")
-
- try:
- stage_name = config.get("common", "stage")
- stage = [] if stage_name == "default" else stage_name.split(",")
- except configparser.NoOptionError:
- stage = []
- logging.warning("Unset common.stage, set to default")
-
- if len(disk) > 10:
- logging.warning("Too many common.disks, record only max 10 disks")
- disk = disk[:10]
-
- try:
- iotype_name = config.get("common", "iotype").split(",")
- iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write']]
- err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write']]
-
- if err_iotype:
- report_alarm_fail("Invalid common.iotype config")
-
- except configparser.NoOptionError:
- iotype_list = ["read", "write"]
- logging.warning("Unset common.iotype, set to read,write")
-
- try:
- period_time = int(config.get("common", "period_time"))
- if not (1 <= period_time <= 300):
- raise ValueError("Invalid period_time")
- except ValueError:
- report_alarm_fail("Invalid common.period_time")
- except configparser.NoOptionError:
- period_time = 1
- logging.warning("Unset common.period_time, use 1s as default")
-
- return period_time, disk, stage, iotype_list
-
-
-def read_config_algorithm(config):
- """read config file, get [algorithm] section value"""
- if not config.has_section("algorithm"):
- report_alarm_fail("Cannot find algorithm section in config file")
-
- try:
- win_size = int(config.get("algorithm", "win_size"))
- if not (1 <= win_size <= 300):
- raise ValueError("Invalid algorithm.win_size")
- except ValueError:
- report_alarm_fail("Invalid algorithm.win_size config")
- except configparser.NoOptionError:
- win_size = 30
- logging.warning("Unset algorithm.win_size, use 30 as default")
-
- try:
- win_threshold = int(config.get("algorithm", "win_threshold"))
- if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size:
- raise ValueError("Invalid algorithm.win_threshold")
- except ValueError:
- report_alarm_fail("Invalid algorithm.win_threshold config")
- except configparser.NoOptionError:
- win_threshold = 6
- logging.warning("Unset algorithm.win_threshold, use 6 as default")
-
- return win_size, win_threshold
-
-
-def read_config_latency(config):
- """read config file, get [latency_xxx] section value"""
- common_param = {}
- for type_name in Disk_Type:
- section_name = f"latency_{Disk_Type[type_name]}"
- if not config.has_section(section_name):
- report_alarm_fail(f"Cannot find {section_name} section in config file")
-
- common_param[Disk_Type[type_name]] = get_section_value(section_name, config)
- return common_param
-
-
-def read_config_iodump(config):
- """read config file, get [iodump] section value"""
- common_param = {}
- section_name = "iodump"
- if not config.has_section(section_name):
- report_alarm_fail(f"Cannot find {section_name} section in config file")
-
- return get_section_value(section_name, config)
-
-
-def read_config_stage(config, stage, iotype_list, curr_disk_type):
- """read config file, get [STAGE_NAME_diskType] section value"""
- res = {}
- section_name = f"{stage}_{curr_disk_type}"
- if not config.has_section(section_name):
- return res
-
- for key in config[section_name]:
- if config[stage][key].isdecimal():
- res[key] = int(config[stage][key])
-
- return res
-
def init_io_win(io_dic, config, common_param):
"""initialize windows of latency, iodump, and dict of avg_value"""
@@ -192,24 +73,33 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage):
disk_list = [key for key in all_disk_set if key in config_disk]
not_in_disk_list = [key for key in config_disk if key not in all_disk_set]
+ if not config_disk and not not_in_disk_list:
+ disk_list = [key for key in all_disk_set]
+
+ if not disk_list:
+ report_alarm_fail("Cannot get valid disk name")
+
+ disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list
+
+ if not config_disk:
+ logging.info(f"Default common.disk using disk={disk_list}")
+ elif sorted(disk_list) != sorted(config_disk):
+ logging.warning(f"Set common.disk to {disk_list}")
+
stage_list = [key for key in all_stage_set if key in config_stage]
not_in_stage_list = [key for key in config_stage if key not in all_stage_set]
if not_in_stage_list:
report_alarm_fail(f"Invalid common.stage_list config, cannot set {not_in_stage_list}")
- if not config_disk and not not_in_disk_list:
- disk_list = [key for key in all_disk_set]
-
- if not config_stage and not not_in_stage_list:
+ if not config_stage:
stage_list = [key for key in all_stage_set]
- disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list
-
- if not stage_list or not disk_list:
- report_alarm_fail("Cannot get valid disk name or stage name.")
+ if not stage_list:
+ report_alarm_fail("Cannot get valid stage name.")
- log_invalid_keys(not_in_disk_list, 'disk', config_disk, disk_list)
+ if not config_stage:
+ logging.info(f"Default common.stage using stage={stage_list}")
return disk_list, stage_list
@@ -254,9 +144,8 @@ def main():
signal.signal(signal.SIGINT, sig_handler)
signal.signal(signal.SIGTERM, sig_handler)
- log_level = get_log_level(CONFIG_FILE)
+ log_level = read_config_log(CONFIG_FILE)
log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
-
logging.basicConfig(level=log_level, format=log_format)
# 初始化配置读取
@@ -274,6 +163,8 @@ def main():
# 采集模块对接is_iocollect_valid()
io_dic["disk_list"], io_dic["stage_list"] = get_valid_disk_stage_list(io_dic, disk, stage)
+ logging.debug(f"disk={io_dic['disk_list']}, stage={io_dic['stage_list']}")
+
if "bio" not in io_dic["stage_list"]:
report_alarm_fail("Cannot run avg_block_io without bio stage")
diff --git a/src/python/sentryPlugins/avg_block_io/config.py b/src/python/sentryPlugins/avg_block_io/config.py
new file mode 100644
index 0000000..c8f45ce
--- /dev/null
+++ b/src/python/sentryPlugins/avg_block_io/config.py
@@ -0,0 +1,208 @@
+import configparser
+import logging
+import os
+
+from .module_conn import report_alarm_fail
+from sentryCollector.collect_plugin import Disk_Type
+
+
+# Section and option name of the [log] section.
+CONF_LOG = 'log'
+CONF_LOG_LEVEL = 'level'
+# Maps a lower-case level name from the config file to a logging level.
+LogLevel = {
+ "debug": logging.DEBUG,
+ "info": logging.INFO,
+ "warning": logging.WARNING,
+ "error": logging.ERROR,
+ "critical": logging.CRITICAL
+}
+
+# Section and option names of the [common] section.
+CONF_COMMON = 'common'
+CONF_COMMON_DISK = 'disk'
+CONF_COMMON_STAGE = 'stage'
+CONF_COMMON_IOTYPE = 'iotype'
+CONF_COMMON_PER_TIME = 'period_time'
+
+# Section and option names of the [algorithm] section.
+CONF_ALGO = 'algorithm'
+CONF_ALGO_SIZE = 'win_size'
+CONF_ALGO_THRE = 'win_threshold'
+
+# CONF_LATENCY is a section-name template filled in with a disk type name;
+# CONF_IODUMP is the name of the [iodump] section.
+CONF_LATENCY = 'latency_{}'
+CONF_IODUMP = 'iodump'
+
+
+# Fallback values used when an option is missing from the config file,
+# keyed first by section name and then by option name.
+DEFAULT_PARAM = {
+ CONF_LOG: {
+ CONF_LOG_LEVEL: 'info'
+ }, CONF_COMMON: {
+ CONF_COMMON_DISK: 'default',
+ CONF_COMMON_STAGE: 'default',
+ CONF_COMMON_IOTYPE: 'read,write',
+ CONF_COMMON_PER_TIME: 1
+ }, CONF_ALGO: {
+ CONF_ALGO_SIZE: 30,
+ CONF_ALGO_THRE: 6
+ }, 'latency_nvme_ssd': {
+ 'read_avg_lim': 300,
+ 'write_avg_lim': 300,
+ 'read_avg_time': 3,
+ 'write_avg_time': 3,
+ 'read_tot_lim': 500,
+ 'write_tot_lim': 500,
+ }, 'latency_sata_ssd' : {
+ 'read_avg_lim': 10000,
+ 'write_avg_lim': 10000,
+ 'read_avg_time': 3,
+ 'write_avg_time': 3,
+ 'read_tot_lim': 50000,
+ 'write_tot_lim': 50000,
+ }, 'latency_sata_hdd' : {
+ 'read_avg_lim': 15000,
+ 'write_avg_lim': 15000,
+ 'read_avg_time': 3,
+ 'write_avg_time': 3,
+ 'read_tot_lim': 50000,
+ 'write_tot_lim': 50000
+ }, CONF_IODUMP: {
+ 'read_iodump_lim': 0,
+ 'write_iodump_lim': 0
+ }
+}
+
+
+def get_section_value(section_name, config):
+    """Collect the integer options of one config section.
+
+    Every key listed in DEFAULT_PARAM[section_name] is looked up in
+    config[section_name]. A value that is present must be a decimal string
+    and is converted to int, otherwise report_alarm_fail() is called. A
+    missing key takes its DEFAULT_PARAM value and a warning is logged.
+
+    Returns a dict mapping each key to its resolved value.
+    """
+    section = config[section_name]
+    values = {}
+    for key, default_value in DEFAULT_PARAM[section_name].items():
+        if key not in section:
+            values[key] = default_value
+            logging.warning(f"Unset {section_name}.{key} in config file, use {values[key]} as default")
+            continue
+
+        raw_value = section[key]
+        if not raw_value.isdecimal():
+            report_alarm_fail(f"Invalid {section_name}.{key} config.")
+        values[key] = int(raw_value)
+    return values
+
+
+def read_config_log(filename):
+    """Read the [log] section and return the configured logging level.
+
+    The default level from DEFAULT_PARAM is returned when the file does not
+    exist, cannot be parsed, has no log.level option, or names a level that
+    is not a key of LogLevel. The level name is matched case-insensitively.
+    """
+    default_name = DEFAULT_PARAM[CONF_LOG][CONF_LOG_LEVEL]
+    default_level = LogLevel.get(default_name)
+    if not os.path.exists(filename):
+        return default_level
+
+    config = configparser.ConfigParser()
+    try:
+        # Parse errors fall back to the default level instead of
+        # propagating, as the replaced get_log_level() did.
+        config.read(filename)
+        level_name = config.get(CONF_LOG, CONF_LOG_LEVEL, fallback=default_name)
+    except configparser.Error:
+        return default_level
+
+    return LogLevel.get(level_name.lower(), default_level)
+
+
+def read_config_common(config):
+    """Read the [common] section of the config file.
+
+    Returns:
+        tuple (period_time, disk, stage, iotype_list):
+        period_time is an int within [1, 300]; disk and stage are
+        lower-cased, comma-split lists where an empty list stands for
+        "default" (option unset or set to "default"), with disk cut to at
+        most 10 entries; iotype_list is a list holding 'read' and/or 'write'.
+
+    report_alarm_fail() is called when the section is missing, when iotype
+    names anything other than read/write, or when period_time is not an
+    integer within [1, 300].
+    """
+    if not config.has_section(CONF_COMMON):
+        report_alarm_fail(f"Cannot find {CONF_COMMON} section in config file")
+
+    try:
+        disk_name = config.get(CONF_COMMON, CONF_COMMON_DISK).lower()
+        disk = [] if disk_name == "default" else disk_name.split(",")
+    except configparser.NoOptionError:
+        disk = []
+        logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_DISK}, set to default")
+
+    try:
+        stage_name = config.get(CONF_COMMON, CONF_COMMON_STAGE).lower()
+        stage = [] if stage_name == "default" else stage_name.split(",")
+    except configparser.NoOptionError:
+        stage = []
+        logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_STAGE}, set to default")
+
+    if len(disk) > 10:
+        logging.warning(f"Too many {CONF_COMMON}.disks, record only max 10 disks")
+        disk = disk[:10]
+
+    valid_iotype = ('read', 'write')
+    try:
+        iotype_name = config.get(CONF_COMMON, CONF_COMMON_IOTYPE).lower().split(",")
+        iotype_list = [rw for rw in iotype_name if rw in valid_iotype]
+        err_iotype = [rw for rw in iotype_name if rw not in valid_iotype]
+
+        if err_iotype:
+            report_alarm_fail(f"Invalid {CONF_COMMON}.{CONF_COMMON_IOTYPE} config")
+
+    except configparser.NoOptionError:
+        # The default is stored as a comma-separated string, while the
+        # configured branch above yields a list; split it so both agree.
+        default_iotype = DEFAULT_PARAM[CONF_COMMON][CONF_COMMON_IOTYPE]
+        iotype_list = default_iotype.split(",")
+        logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_IOTYPE}, use {default_iotype} as default")
+
+    try:
+        period_time = int(config.get(CONF_COMMON, CONF_COMMON_PER_TIME))
+        if not (1 <= period_time <= 300):
+            raise ValueError("Invalid period_time")
+    except ValueError:
+        report_alarm_fail(f"Invalid {CONF_COMMON}.{CONF_COMMON_PER_TIME}")
+    except configparser.NoOptionError:
+        period_time = DEFAULT_PARAM[CONF_COMMON][CONF_COMMON_PER_TIME]
+        logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_PER_TIME}, use {period_time} as default")
+
+    return period_time, disk, stage, iotype_list
+
+
+def read_config_algorithm(config):
+    """Read the [algorithm] section and return (win_size, win_threshold).
+
+    win_size must be an integer within [1, 300]. win_threshold must be an
+    integer within [1, 300] that does not exceed win_size. An invalid value
+    triggers report_alarm_fail(); an unset option takes its DEFAULT_PARAM
+    value and a warning is logged.
+    """
+    if not config.has_section(CONF_ALGO):
+        report_alarm_fail(f"Cannot find {CONF_ALGO} section in config file")
+
+    try:
+        win_size = int(config.get(CONF_ALGO, CONF_ALGO_SIZE))
+        if win_size < 1 or win_size > 300:
+            raise ValueError(f"Invalid {CONF_ALGO}.{CONF_ALGO_SIZE}")
+    except configparser.NoOptionError:
+        win_size = DEFAULT_PARAM[CONF_ALGO][CONF_ALGO_SIZE]
+        logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_SIZE}, use {win_size} as default")
+    except ValueError:
+        report_alarm_fail(f"Invalid {CONF_ALGO}.{CONF_ALGO_SIZE} config")
+
+    try:
+        win_threshold = int(config.get(CONF_ALGO, CONF_ALGO_THRE))
+        if not (1 <= win_threshold <= 300 and win_threshold <= win_size):
+            raise ValueError(f"Invalid {CONF_ALGO}.{CONF_ALGO_THRE}")
+    except configparser.NoOptionError:
+        # NOTE(review): the default threshold is not checked against
+        # win_size on this path - confirm that is intended.
+        win_threshold = DEFAULT_PARAM[CONF_ALGO][CONF_ALGO_THRE]
+        logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_THRE}, use {win_threshold} as default")
+    except ValueError:
+        report_alarm_fail(f"Invalid {CONF_ALGO}.{CONF_ALGO_THRE} config")
+
+    return win_size, win_threshold
+
+
+def read_config_latency(config):
+    """Read every [latency_<disk type>] section.
+
+    For each entry of Disk_Type the matching section is required;
+    report_alarm_fail() is called when it is missing. Returns a dict keyed
+    by Disk_Type[type_name] whose values are the dicts produced by
+    get_section_value() for that section.
+    """
+    latency_param = {}
+    for type_name in Disk_Type:
+        disk_type = Disk_Type[type_name]
+        section_name = CONF_LATENCY.format(disk_type)
+        section_missing = not config.has_section(section_name)
+        if section_missing:
+            report_alarm_fail(f"Cannot find {section_name} section in config file")
+
+        latency_param[disk_type] = get_section_value(section_name, config)
+    return latency_param
+
+
+def read_config_iodump(config):
+    """Read the [iodump] section.
+
+    report_alarm_fail() is called when the section is missing. Returns the
+    dict produced by get_section_value() for the section.
+    """
+    section_missing = not config.has_section(CONF_IODUMP)
+    if section_missing:
+        report_alarm_fail(f"Cannot find {CONF_IODUMP} section in config file")
+
+    iodump_param = get_section_value(CONF_IODUMP, config)
+    return iodump_param
+
+
+def read_config_stage(config, stage, iotype_list, curr_disk_type):
+    """Read the optional [<stage>_<curr_disk_type>] section.
+
+    Returns a dict of the section's options whose values are decimal
+    strings, converted to int; options with any other value are skipped.
+    Returns an empty dict when the section does not exist.
+    iotype_list is accepted but not used here.
+    """
+    section_name = f"{stage}_{curr_disk_type}"
+    if not config.has_section(section_name):
+        return {}
+
+    # Values must come from the same section whose keys are iterated.
+    # Indexing config[stage] raised KeyError unless a bare [<stage>]
+    # section happened to exist.
+    section = config[section_name]
+    return {key: int(value) for key, value in section.items() if value.isdecimal()}
diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
index 8d6f429..cbdaad4 100644
--- a/src/python/sentryPlugins/avg_block_io/module_conn.py
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
@@ -29,12 +29,16 @@ def sig_handler(signum, _f):
def avg_get_io_data(io_dic):
"""get_io_data from sentryCollector"""
+ logging.debug(f"send to sentryCollector get_io_data: period={io_dic['period_time']}, "
+ f"disk={io_dic['disk_list']}, stage={io_dic['stage_list']}, iotype={io_dic['iotype_list']}")
res = get_io_data(io_dic["period_time"], io_dic["disk_list"], io_dic["stage_list"], io_dic["iotype_list"])
return check_result_validation(res, 'get io data')
def avg_is_iocollect_valid(io_dic, config_disk, config_stage):
"""is_iocollect_valid from sentryCollector"""
+ logging.debug(f"send to sentryCollector is_iocollect_valid: period={io_dic['period_time']}, "
+ f"disk={config_disk}, stage={config_stage}")
res = is_iocollect_valid(io_dic["period_time"], config_disk, config_stage)
return check_result_validation(res, 'check config validation')
@@ -79,7 +83,7 @@ def process_report_data(disk_name, rw, io_data):
# io press
ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq']
for stage_name in ctrl_stage:
- abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data)
+ abnormal, abnormal_list = is_abnormal((disk_name, stage_name, rw), io_data)
if not abnormal:
continue
msg["reason"] = "IO press"
@@ -117,6 +121,7 @@ def process_report_data(disk_name, rw, io_data):
def get_disk_type_by_name(disk_name):
+ logging.debug(f"send to sentryCollector get_disk_type: disk_name={disk_name}")
res = get_disk_type(disk_name)
disk_type_str = check_result_validation(get_disk_type(disk_name), f'Invalid disk type {disk_name}')
try:
@@ -126,4 +131,4 @@ def get_disk_type_by_name(disk_name):
except ValueError:
report_alarm_fail(f"Failed to get disk type for {disk_name}")
- return Disk_Type[curr_disk_type]
\ No newline at end of file
+ return Disk_Type[curr_disk_type]
diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py
index c381c07..1bfd4e8 100644
--- a/src/python/sentryPlugins/avg_block_io/utils.py
+++ b/src/python/sentryPlugins/avg_block_io/utils.py
@@ -8,84 +8,12 @@
# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
# PURPOSE.
# See the Mulan PSL v2 for more details.
-import configparser
import logging
import os
AVG_VALUE = 0
AVG_COUNT = 1
-CONF_LOG = 'log'
-CONF_LOG_LEVEL = 'level'
-LogLevel = {
- "debug": logging.DEBUG,
- "info": logging.INFO,
- "warning": logging.WARNING,
- "error": logging.ERROR,
- "critical": logging.CRITICAL
-}
-
-
-DEFAULT_PARAM = {
- 'latency_nvme_ssd': {
- 'read_avg_lim': 300,
- 'write_avg_lim': 300,
- 'read_avg_time': 3,
- 'write_avg_time': 3,
- 'read_tot_lim': 500,
- 'write_tot_lim': 500,
- }, 'latency_sata_ssd' : {
- 'read_avg_lim': 10000,
- 'write_avg_lim': 10000,
- 'read_avg_time': 3,
- 'write_avg_time': 3,
- 'read_tot_lim': 50000,
- 'write_tot_lim': 50000,
- }, 'latency_sata_hdd' : {
- 'read_avg_lim': 15000,
- 'write_avg_lim': 15000,
- 'read_avg_time': 3,
- 'write_avg_time': 3,
- 'read_tot_lim': 50000,
- 'write_tot_lim': 50000
- }, 'iodump': {
- 'read_iodump_lim': 0,
- 'write_iodump_lim': 0
- }
-}
-
-
-def get_section_value(section_name, config):
- common_param = {}
- config_sec = config[section_name]
- for config_key in DEFAULT_PARAM[section_name]:
- if config_key in config_sec:
- if not config_sec[config_key].isdecimal():
- report_alarm_fail(f"Invalid {section_name}.{config_key} config.")
- common_param[config_key] = int(config_sec[config_key])
- else:
- logging.warning(f"Unset {section_name}.{config_key} in config file, use {DEFAULT_PARAM[section_name][config_key]} as default")
- common_param[config_key] = DEFAULT_PARAM[section_name][config_key]
- return common_param
-
-
-def get_log_level(filename):
- if not os.path.exists(filename):
- return logging.INFO
-
- try:
- config = configparser.ConfigParser()
- config.read(filename)
- if not config.has_option(CONF_LOG, CONF_LOG_LEVEL):
- return logging.INFO
- log_level = config.get(CONF_LOG, CONF_LOG_LEVEL)
-
- if log_level.lower() in LogLevel:
- return LogLevel.get(log_level.lower())
- return logging.INFO
- except configparser.Error:
- return logging.INFO
-
def get_nested_value(data, keys):
"""get data from nested dict"""
--
2.27.0