From a8418093bb37482da7ccaac0c950f2ed8d0ba2fa Mon Sep 17 00:00:00 2001 From: gaoruoshu Date: Thu, 10 Oct 2024 15:07:29 +0800 Subject: [PATCH] add log for improving maintainability --- .../avg_block_io/avg_block_io.py | 4 +- .../sentryPlugins/avg_block_io/module_conn.py | 57 ++++++++++------- .../avg_block_io/stage_window.py | 8 +++ .../sentryPlugins/avg_block_io/utils.py | 63 +++++++++++++++++-- 4 files changed, 103 insertions(+), 29 deletions(-) diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py index 26a60c5..cf2ded3 100644 --- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py +++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py @@ -194,11 +194,11 @@ def init_io_win(io_dic, config, common_param): if avg_lim_value and avg_time_value and tot_lim_value: io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value) - logging.debug("Successfully create {}-{}-{} latency window".format(disk_name, stage_name, rw)) + logging.debug("Successfully create {}-{}-{}-latency window".format(disk_name, stage_name, rw)) if iodump_lim_value is not None: io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value) - logging.debug("Successfully create {}-{}-{} iodump window".format(disk_name, stage_name, rw)) + logging.debug("Successfully create {}-{}-{}-iodump window".format(disk_name, stage_name, rw)) return io_data, io_avg_value diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py index 2fc5a83..40b3fcc 100644 --- a/src/python/sentryPlugins/avg_block_io/module_conn.py +++ b/src/python/sentryPlugins/avg_block_io/module_conn.py @@ -13,7 +13,7 @@ import logging import sys import time -from .utils import is_abnormal +from .utils import is_abnormal, get_win_data, log_slow_win from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages from syssentry.result import ResultLevel, report_result from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR @@ -66,36 +66,51 @@ def report_alarm_fail(alarm_info): def process_report_data(disk_name, rw, io_data): """check abnormal window and report to xalarm""" - if not is_abnormal((disk_name, 'bio', rw), io_data): + abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data) + if not abnormal: return - msg = {"alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw} + msg = { + "alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw, + "reason": "unknown", "block_stack": "bio", "alarm_type": abnormal_list, + "details": get_win_data(disk_name, rw, io_data) + } + # io press ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq'] for stage_name in ctrl_stage: - if is_abnormal((disk_name, stage_name, rw), io_data): - msg["reason"] = "IO press slow" - msg["block_stack"] = f"bio,{stage_name}" - logging.warning("{} - {} report IO press slow".format(disk_name, rw)) - xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) - return - - if is_abnormal((disk_name, 'rq_driver', rw), io_data): + abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data) + if not abnormal: + continue + msg["reason"] = "IO press" + msg["block_stack"] = f"bio,{stage_name}" + msg["alarm_type"] = abnormal_list + log_slow_win(msg, "IO press") + xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) + return + + # driver slow + abnormal, abnormal_list = is_abnormal((disk_name, 'rq_driver', rw), io_data) + if abnormal: msg["reason"] = "driver slow" msg["block_stack"] = "bio,rq_driver" - logging.warning("{} - {} report driver slow".format(disk_name, rw)) + msg["alarm_type"] = abnormal_list + log_slow_win(msg, "driver slow") xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) return + # kernel slow kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue'] for stage_name in kernel_stage: - if is_abnormal((disk_name, stage_name, rw), io_data): - msg["reason"] = "kernel slow" - msg["block_stack"] = f"bio,{stage_name}" - logging.warning("{} - {} report kernel slow".format(disk_name, rw)) - xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) - return - msg["reason"] = "unknown" - msg["block_stack"] = "bio" - logging.warning("{} - {} report UNKNOWN slow".format(disk_name, rw)) + abnormal, abnormal_list = is_abnormal((disk_name, stage_name, rw), io_data) + if not abnormal: + continue + msg["reason"] = "kernel slow" + msg["block_stack"] = f"bio,{stage_name}" + msg["alarm_type"] = abnormal_list + log_slow_win(msg, "kernel slow") + xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) + return + + log_slow_win(msg, "unknown") xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py index 9b0ce79..5113782 100644 --- a/src/python/sentryPlugins/avg_block_io/stage_window.py +++ b/src/python/sentryPlugins/avg_block_io/stage_window.py @@ -14,6 +14,11 @@ class AbnormalWindowBase: self.window_size = window_size self.window_threshold = window_threshold self.abnormal_window = [False] * window_size + self.window_data = [-1] * window_size + + def append_new_data(self, ab_res): + self.window_data.pop(0) + self.window_data.append(ab_res) def append_new_period(self, ab_res, avg_val=0): self.abnormal_window.pop(0) @@ -25,6 +30,9 @@ class AbnormalWindowBase: def is_abnormal_window(self): return sum(self.abnormal_window) > self.window_threshold + def window_data_to_string(self): + return ",".join(str(x) for x in self.window_data) + class IoWindow(AbnormalWindowBase): def __init__(self, window_size=10, window_threshold=7, abnormal_multiple=5, abnormal_multiple_lim=30, abnormal_time=40): diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py index 2de9a46..3b7f027 100644 --- a/src/python/sentryPlugins/avg_block_io/utils.py +++ b/src/python/sentryPlugins/avg_block_io/utils.py @@ -65,15 +65,32 @@ def set_nested_value(data, keys, value): return True +def get_win_data(disk_name, rw, io_data): + """get latency and iodump win data""" + latency = '' + iodump = '' + for stage_name in io_data[disk_name]: + if 'latency' in io_data[disk_name][stage_name][rw]: + latency_list = io_data[disk_name][stage_name][rw]['latency'].window_data_to_string() + latency += f'{stage_name}: [{latency_list}], ' + if 'iodump' in io_data[disk_name][stage_name][rw]: + iodump_list = io_data[disk_name][stage_name][rw]['iodump'].window_data_to_string() + iodump += f'{stage_name}: [{iodump_list}], ' + return {"latency": latency[:-2], "iodump": iodump[:-2]} + + def is_abnormal(io_key, io_data): """check if latency and iodump win abnormal""" + abnormal_list = '' for key in ['latency', 'iodump']: all_keys = get_nested_value(io_data, io_key) if all_keys and key in all_keys: win = get_nested_value(io_data, io_key + (key,)) if win and win.is_abnormal_window(): - return True - return False + abnormal_list += key + ', ' + if not abnormal_list: + return False, abnormal_list + return True, abnormal_list[:-2] def update_io_avg(old_avg, period_value, win_size): @@ -87,8 +104,8 @@ def update_io_avg(old_avg, period_value, win_size): return [new_avg_value, new_avg_count] -def update_io_data(old_avg, period_value, win_size, io_data, io_key): - """update data of latency and iodump window""" +def update_io_period(old_avg, period_value, io_data, io_key): + """update period of latency and iodump window""" all_wins = get_nested_value(io_data, io_key) if all_wins and "latency" in all_wins: io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_period(period_value[0], old_avg[AVG_VALUE]) @@ -96,20 +113,54 @@ def update_io_data(old_avg, period_value, win_size, io_data, io_key): io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_period(period_value[1]) +def update_io_data(period_value, io_data, io_key): + """update data of latency and iodump window""" + all_wins = get_nested_value(io_data, io_key) + if all_wins and "latency" in all_wins: + io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_data(period_value[0]) + if all_wins and "iodump" in all_wins: + io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_data(period_value[1]) + + +def log_abnormal_period(old_avg, period_value, io_data, io_key): + """record log of abnormal period""" + all_wins = get_nested_value(io_data, io_key) + if all_wins and "latency" in all_wins: + if all_wins["latency"].is_abnormal_period(period_value[0], old_avg[AVG_VALUE]): + logging.info(f"[abnormal_period] disk: {io_key[0]}, stage: {io_key[1]}, iotype: {io_key[2]}, " + f"type: latency, avg: {round(old_avg[AVG_VALUE], 3)}, curr_val: {period_value[0]}") + if all_wins and "iodump" in all_wins: + if all_wins["iodump"].is_abnormal_period(period_value[1]): + logging.info(f"[abnormal_period] disk: {io_key[0]}, stage: {io_key[1]}, iotype: {io_key[2]}, " + f"type: iodump, curr_val: {period_value[1]}") + + +def log_slow_win(msg, reason): + """record log of slow win""" + logging.warning(f"[SLOW IO] disk: {msg['driver_name']}, stage: {msg['block_stack']}, " + f"iotype: {msg['io_type']}, type: {msg['alarm_type']}, reason: {reason}") + logging.info(f"latency: {msg['details']['latency']}") + logging.info(f"iodump: {msg['details']['iodump']}") + + def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data): """update avg and check abonrmal, return true if win_size full""" period_value = get_nested_value(data, io_key) old_avg = get_nested_value(io_avg_value, io_key) # 更新avg数据 + update_io_data(period_value, io_data, io_key) if old_avg[AVG_COUNT] < win_size: set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size)) return False + # 打印异常周期数据 + log_abnormal_period(old_avg, period_value, io_data, io_key) + # 更新win数据 -- 判断异常周期 - update_io_data(old_avg, period_value, win_size, io_data, io_key) + update_io_period(old_avg, period_value, io_data, io_key) all_wins = get_nested_value(io_data, io_key) - if all_wins and 'latency' not in all_wins: + if not all_wins or 'latency' not in all_wins: return True period = get_nested_value(io_data, io_key + ("latency",)) if period and period.is_abnormal_period(period_value[0], old_avg[AVG_VALUE]): -- 2.27.0