From e7c1b0095e16369fb09ae62ffa3158be5e8893a1 Mon Sep 17 00:00:00 2001 From: gaoruoshu Date: Fri, 11 Oct 2024 10:48:35 +0800 Subject: [PATCH] diff disk type use diff config --- config/plugins/avg_block_io.ini | 26 +++- src/python/sentryCollector/collect_plugin.py | 6 + .../avg_block_io/avg_block_io.py | 144 ++++++++---------- .../sentryPlugins/avg_block_io/module_conn.py | 19 ++- .../sentryPlugins/avg_block_io/utils.py | 43 ++++++ 5 files changed, 146 insertions(+), 92 deletions(-) diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini index 858db18..5c4b9b0 100644 --- a/config/plugins/avg_block_io.ini +++ b/config/plugins/avg_block_io.ini @@ -11,13 +11,29 @@ period_time=1 win_size=30 win_threshold=6 -[latency] -read_avg_lim=10 -write_avg_lim=10 +[latency_nvme_ssd] +read_avg_lim=300 +write_avg_lim=300 read_avg_time=3 write_avg_time=3 -read_tot_lim=50 -write_tot_lim=50 +read_tot_lim=500 +write_tot_lim=500 + +[latency_sata_ssd] +read_avg_lim=10000 +write_avg_lim=10000 +read_avg_time=3 +write_avg_time=3 +read_tot_lim=50000 +write_tot_lim=50000 + +[latency_sata_hdd] +read_avg_lim=15000 +write_avg_lim=15000 +read_avg_time=3 +write_avg_time=3 +read_tot_lim=50000 +write_tot_lim=50000 [iodump] read_iodump_lim=0 diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py index 31bf11b..bec405a 100644 --- a/src/python/sentryCollector/collect_plugin.py +++ b/src/python/sentryCollector/collect_plugin.py @@ -79,6 +79,12 @@ class DiskType(): TYPE_SATA_SSD = 1 TYPE_SATA_HDD = 2 +Disk_Type = { + DiskType.TYPE_NVME_SSD: "nvme_ssd", + DiskType.TYPE_SATA_SSD: "sata_ssd", + DiskType.TYPE_SATA_HDD: "sata_hdd" +} + def client_send_and_recv(request_data, data_str_len, protocol): """client socket send and recv message""" try: diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py index cf2ded3..fdad995 100644 --- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py +++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py @@ -14,8 +14,9 @@ import configparser import time from .stage_window import IoWindow, IoDumpWindow -from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler -from .utils import update_avg_and_check_abnormal, get_log_level +from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name +from .utils import update_avg_and_check_abnormal, get_log_level, get_section_value +from sentryCollector.collect_plugin import Disk_Type CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini" @@ -37,44 +38,40 @@ def read_config_common(config): disk = [] if disk_name == "default" else disk_name.split(",") except configparser.NoOptionError: disk = [] - logging.warning("Unset disk, set to default") + logging.warning("Unset common.disk, set to default") try: stage_name = config.get("common", "stage") stage = [] if stage_name == "default" else stage_name.split(",") except configparser.NoOptionError: stage = [] - logging.warning("Unset stage, set to read,write") + logging.warning("Unset common.stage, set to default") if len(disk) > 10: - logging.warning("Too many disks, record only max 10 disks") + logging.warning("Too many common.disks, record only max 10 disks") disk = disk[:10] try: iotype_name = config.get("common", "iotype").split(",") - iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write', 'flush', 'discard']] - err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write', 'flush', 'discard']] + iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write']] + err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write']] - if iotype_list in [None, []]: - iotype_list = ["read", "write"] - except configparser.NoOptionError: - iotype = ["read", "write"] - logging.warning("Unset iotype, set to default") + if err_iotype: + report_alarm_fail("Invalid common.iotype config") - if err_iotype: - logging.warning("{} in common.iotype are not valid, set iotype={}".format(err_iotype, iotype_list)) - + except configparser.NoOptionError: + iotype_list = ["read", "write"] + logging.warning("Unset common.iotype, set to read,write") try: period_time = int(config.get("common", "period_time")) if not (1 <= period_time <= 300): raise ValueError("Invalid period_time") except ValueError: - period_time = 1 - logging.warning("Invalid period_time, set to 1s") + report_alarm_fail("Invalid common.period_time") except configparser.NoOptionError: period_time = 1 - logging.warning("Unset period_time, use 1s as default") + logging.warning("Unset common.period_time, use 1s as default") return period_time, disk, stage, iotype_list @@ -87,76 +84,56 @@ def read_config_algorithm(config): try: win_size = int(config.get("algorithm", "win_size")) if not (1 <= win_size <= 300): - raise ValueError("Invalid win_size") + raise ValueError("Invalid algorithm.win_size") except ValueError: - win_size = 30 - logging.warning("Invalid win_size, set to 30") + report_alarm_fail("Invalid algorithm.win_size config") except configparser.NoOptionError: win_size = 30 - logging.warning("Unset win_size, use 30 as default") + logging.warning("Unset algorithm.win_size, use 30 as default") try: win_threshold = int(config.get("algorithm", "win_threshold")) if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size: - raise ValueError("Invalid win_threshold") + raise ValueError("Invalid algorithm.win_threshold") except ValueError: - win_threshold = 6 - logging.warning("Invalid win_threshold, set to 6") + report_alarm_fail("Invalid algorithm.win_threshold config") except configparser.NoOptionError: win_threshold = 6 - logging.warning("Unset win_threshold, use 6 as default") + logging.warning("Unset algorithm.win_threshold, use 6 as default") return win_size, win_threshold -def read_config_lat_iodump(io_dic, config): - """read config file, get [latency] [iodump] section value""" +def read_config_latency(config): + """read config file, get [latency_xxx] section value""" common_param = {} - lat_sec = None - if not config.has_section("latency"): - logging.warning("Cannot find latency section in config file") - else: - lat_sec = config["latency"] - - iodump_sec = None - if not config.has_section("iodump"): - logging.warning("Cannot find iodump section in config file") - else: - iodump_sec = config["iodump"] - - if not lat_sec and not iodump_sec: - return common_param - - for io_type in io_dic["iotype_list"]: - common_param[io_type] = {} - - latency_keys = { - "avg_lim": "{}_avg_lim".format(io_type), - "avg_time": "{}_avg_time".format(io_type), - "tot_lim": "{}_tot_lim".format(io_type), - } - iodump_key = "{}_iodump_lim".format(io_type) + for type_name in Disk_Type: + section_name = f"latency_{Disk_Type[type_name]}" + if not config.has_section(section_name): + report_alarm_fail(f"Cannot find {section_name} section in config file") - if iodump_sec and iodump_key in iodump_sec and iodump_sec[iodump_key].isdecimal(): - common_param[io_type][iodump_key] = int(iodump_sec[iodump_key]) + common_param[Disk_Type[type_name]] = get_section_value(section_name, config) + return common_param - if not lat_sec: - continue - for key_suffix, key_template in latency_keys.items(): - if key_template in lat_sec and lat_sec[key_template].isdecimal(): - common_param[io_type][key_template] = int(lat_sec[key_template]) +def read_config_iodump(config): + """read config file, get [iodump] section value""" + common_param = {} + section_name = "iodump" + if not config.has_section(section_name): + report_alarm_fail(f"Cannot find {section_name} section in config file") - return common_param + return get_section_value(section_name, config) -def read_config_stage(config, stage, iotype_list): - """read config file, get [STAGE_NAME] section value""" +def read_config_stage(config, stage, iotype_list, curr_disk_type): + """read config file, get [STAGE_NAME_diskType] section value""" res = {} - if not stage in config: + section_name = f"{stage}_{curr_disk_type}" + if not config.has_section(section_name): return res - for key in config[stage]: + for key in config[section_name]: if config[stage][key].isdecimal(): res[key] = int(config[stage][key]) @@ -171,11 +148,12 @@ def init_io_win(io_dic, config, common_param): for disk_name in io_dic["disk_list"]: io_data[disk_name] = {} io_avg_value[disk_name] = {} + curr_disk_type = get_disk_type_by_name(disk_name) for stage_name in io_dic["stage_list"]: io_data[disk_name][stage_name] = {} io_avg_value[disk_name][stage_name] = {} - # step3. 解析stage配置 - curr_stage_param = read_config_stage(config, stage_name, iotype_list) + # 解析stage配置 + curr_stage_param = read_config_stage(config, stage_name, iotype_list, curr_disk_type) for rw in iotype_list: io_data[disk_name][stage_name][rw] = {} io_avg_value[disk_name][stage_name][rw] = [0, 0] @@ -187,10 +165,10 @@ def init_io_win(io_dic, config, common_param): iodump_lim_key = "{}_iodump_lim".format(rw) # 获取值,优先从 curr_stage_param 获取,如果不存在,则从 common_param 获取 - avg_lim_value = curr_stage_param.get(avg_lim_key, common_param.get(rw, {}).get(avg_lim_key)) - avg_time_value = curr_stage_param.get(avg_time_key, common_param.get(rw, {}).get(avg_time_key)) - tot_lim_value = curr_stage_param.get(tot_lim_key, common_param.get(rw, {}).get(tot_lim_key)) - iodump_lim_value = curr_stage_param.get(iodump_lim_key, common_param.get(rw, {}).get(iodump_lim_key)) + avg_lim_value = curr_stage_param.get(avg_lim_key, common_param.get(curr_disk_type, {}).get(avg_lim_key)) + avg_time_value = curr_stage_param.get(avg_time_key, common_param.get(curr_disk_type, {}).get(avg_time_key)) + tot_lim_value = curr_stage_param.get(tot_lim_key, common_param.get(curr_disk_type, {}).get(tot_lim_key)) + iodump_lim_value = curr_stage_param.get(iodump_lim_key, common_param.get("iodump", {}).get(iodump_lim_key)) if avg_lim_value and avg_time_value and tot_lim_value: io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value) @@ -217,28 +195,21 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage): stage_list = [key for key in all_stage_set if key in config_stage] not_in_stage_list = [key for key in config_stage if key not in all_stage_set] - if not config_disk: + if not_in_stage_list: + report_alarm_fail(f"Invalid common.stage_list config, cannot set {not_in_stage_list}") + + if not config_disk and not not_in_disk_list: disk_list = [key for key in all_disk_set] - if not config_stage: + if not config_stage and not not_in_stage_list: stage_list = [key for key in all_stage_set] disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list - stage_list = stage_list[:15] if len(stage_list) > 15 else stage_list - - if config_disk and not disk_list: - logging.warning("Cannot get valid disk by disk={}, set to default".format(config_disk)) - disk_list, stage_list = get_valid_disk_stage_list(io_dic, [], config_stage) - - if config_stage and not stage_list: - logging.warning("Cannot get valid stage by stage={}, set to default".format(config_stage)) - disk_list, stage_list = get_valid_disk_stage_list(io_dic, config_disk, []) if not stage_list or not disk_list: report_alarm_fail("Cannot get valid disk name or stage name.") log_invalid_keys(not_in_disk_list, 'disk', config_disk, disk_list) - log_invalid_keys(not_in_stage_list, 'stage', config_stage, stage_list) return disk_list, stage_list @@ -310,8 +281,13 @@ def main(): # step1. 解析公共配置 --- algorithm io_dic["win_size"], io_dic["win_threshold"] = read_config_algorithm(config) - # step2. 循环创建窗口 - common_param = read_config_lat_iodump(io_dic, config) + # step2. 解析公共配置 --- latency_xxx + common_param = read_config_latency(config) + + # step3. 解析公共配置 --- iodump + common_param['iodump'] = read_config_iodump(config) + + # step4. 循环创建窗口 io_data, io_avg_value = init_io_win(io_dic, config, common_param) main_loop(io_dic, io_data, io_avg_value) diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py index 40b3fcc..8d6f429 100644 --- a/src/python/sentryPlugins/avg_block_io/module_conn.py +++ b/src/python/sentryPlugins/avg_block_io/module_conn.py @@ -14,7 +14,7 @@ import sys import time from .utils import is_abnormal, get_win_data, log_slow_win -from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages +from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages, get_disk_type, Disk_Type from syssentry.result import ResultLevel, report_result from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR @@ -51,7 +51,7 @@ def check_result_validation(res, reason): try: json_data = json.loads(res['message']) except json.JSONDecodeError: - err_msg = "Failed to {}: invalid return message".format(reason) + err_msg = f"Failed to {reason}: invalid return message" report_alarm_fail(err_msg) return json_data @@ -60,7 +60,7 @@ def check_result_validation(res, reason): def report_alarm_fail(alarm_info): """report result to xalarmd""" report_result(TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": alarm_info})) - logging.error(alarm_info) + logging.critical(alarm_info) sys.exit(1) @@ -114,3 +114,16 @@ def process_report_data(disk_name, rw, io_data): log_slow_win(msg, "unknown") xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) + + +def get_disk_type_by_name(disk_name): + res = get_disk_type(disk_name) + disk_type_str = check_result_validation(get_disk_type(disk_name), f'Invalid disk type {disk_name}') + try: + curr_disk_type = int(disk_type_str) + if curr_disk_type not in Disk_Type: + raise ValueError + except ValueError: + report_alarm_fail(f"Failed to get disk type for {disk_name}") + + return Disk_Type[curr_disk_type] \ No newline at end of file diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py index 3b7f027..cef1edd 100644 --- a/src/python/sentryPlugins/avg_block_io/utils.py +++ b/src/python/sentryPlugins/avg_block_io/utils.py @@ -26,6 +26,49 @@ LogLevel = { } +DEFAULT_PARAM = { + 'latency_nvme_ssd': { + 'read_avg_lim': 300, + 'write_avg_lim': 300, + 'read_avg_time': 3, + 'write_avg_time': 3, + 'read_tot_lim': 500, + 'write_tot_lim': 500, + }, 'latency_sata_ssd' : { + 'read_avg_lim': 10000, + 'write_avg_lim': 10000, + 'read_avg_time': 3, + 'write_avg_time': 3, + 'read_tot_lim': 50000, + 'write_tot_lim': 50000, + }, 'latency_sata_hdd' : { + 'read_avg_lim': 15000, + 'write_avg_lim': 15000, + 'read_avg_time': 3, + 'write_avg_time': 3, + 'read_tot_lim': 50000, + 'write_tot_lim': 50000 + }, 'iodump': { + 'read_iodump_lim': 0, + 'write_iodump_lim': 0 + } +} + + +def get_section_value(section_name, config): + common_param = {} + config_sec = config[section_name] + for config_key in DEFAULT_PARAM[section_name]: + if config_key in config_sec: + if not config_sec[config_key].isdecimal(): + report_alarm_fail(f"Invalid {section_name}.{config_key} config.") + common_param[config_key] = int(config_sec[config_key]) + else: + logging.warning(f"Unset {section_name}.{config_key} in config file, use {DEFAULT_PARAM[section_name][config_key]} as default") + common_param[config_key] = DEFAULT_PARAM[section_name][config_key] + return common_param + + def get_log_level(filename): if not os.path.exists(filename): return logging.INFO -- 2.27.0