sysSentry/add-log-for-improving-maintainability.patch

252 lines
12 KiB
Diff
Raw Normal View History

From a8418093bb37482da7ccaac0c950f2ed8d0ba2fa Mon Sep 17 00:00:00 2001
From: gaoruoshu <gaoruoshu@huawei.com>
Date: Thu, 10 Oct 2024 15:07:29 +0800
Subject: [PATCH] add log for improving maintainability
---
.../avg_block_io/avg_block_io.py | 4 +-
.../sentryPlugins/avg_block_io/module_conn.py | 57 ++++++++++-------
.../avg_block_io/stage_window.py | 8 +++
.../sentryPlugins/avg_block_io/utils.py | 63 +++++++++++++++++--
4 files changed, 103 insertions(+), 29 deletions(-)
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
index 26a60c5..cf2ded3 100644
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
@@ -194,11 +194,11 @@ def init_io_win(io_dic, config, common_param):
if avg_lim_value and avg_time_value and tot_lim_value:
io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value)
- logging.debug("Successfully create {}-{}-{} latency window".format(disk_name, stage_name, rw))
+ logging.debug("Successfully create {}-{}-{}-latency window".format(disk_name, stage_name, rw))
if iodump_lim_value is not None:
io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value)
- logging.debug("Successfully create {}-{}-{} iodump window".format(disk_name, stage_name, rw))
+ logging.debug("Successfully create {}-{}-{}-iodump window".format(disk_name, stage_name, rw))
return io_data, io_avg_value
diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
index 2fc5a83..40b3fcc 100644
--- a/src/python/sentryPlugins/avg_block_io/module_conn.py
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
@@ -13,7 +13,7 @@ import logging
import sys
import time
-from .utils import is_abnormal
+from .utils import is_abnormal, get_win_data, log_slow_win
from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages
from syssentry.result import ResultLevel, report_result
from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR
@@ -66,36 +66,51 @@ def report_alarm_fail(alarm_info):
def process_report_data(disk_name, rw, io_data):
"""check abnormal window and report to xalarm"""
- if not is_abnormal((disk_name, 'bio', rw), io_data):
+ abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data)
+ if not abnormal:
return
- msg = {"alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw}
+ msg = {
+ "alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw,
+ "reason": "unknown", "block_stack": "bio", "alarm_type": abnormal_list,
+ "details": get_win_data(disk_name, rw, io_data)
+ }
+ # io press
ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq']
for stage_name in ctrl_stage:
- if is_abnormal((disk_name, stage_name, rw), io_data):
- msg["reason"] = "IO press slow"
- msg["block_stack"] = f"bio,{stage_name}"
- logging.warning("{} - {} report IO press slow".format(disk_name, rw))
- xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
- return
-
- if is_abnormal((disk_name, 'rq_driver', rw), io_data):
+ abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data)
+ if not abnormal:
+ continue
+ msg["reason"] = "IO press"
+ msg["block_stack"] = f"bio,{stage_name}"
+ msg["alarm_type"] = abnormal_list
+ log_slow_win(msg, "IO press")
+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
+ return
+
+ # driver slow
+ abnormal, abnormal_list = is_abnormal((disk_name, 'rq_driver', rw), io_data)
+ if abnormal:
msg["reason"] = "driver slow"
msg["block_stack"] = "bio,rq_driver"
- logging.warning("{} - {} report driver slow".format(disk_name, rw))
+ msg["alarm_type"] = abnormal_list
+ log_slow_win(msg, "driver slow")
xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
return
+ # kernel slow
kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue']
for stage_name in kernel_stage:
- if is_abnormal((disk_name, stage_name, rw), io_data):
- msg["reason"] = "kernel slow"
- msg["block_stack"] = f"bio,{stage_name}"
- logging.warning("{} - {} report kernel slow".format(disk_name, rw))
- xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
- return
- msg["reason"] = "unknown"
- msg["block_stack"] = "bio"
- logging.warning("{} - {} report UNKNOWN slow".format(disk_name, rw))
+ abnormal, abnormal_list = is_abnormal((disk_name, stage_name, rw), io_data)
+ if not abnormal:
+ continue
+ msg["reason"] = "kernel slow"
+ msg["block_stack"] = f"bio,{stage_name}"
+ msg["alarm_type"] = abnormal_list
+ log_slow_win(msg, "kernel slow")
+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
+ return
+
+ log_slow_win(msg, "unknown")
xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py
index 9b0ce79..5113782 100644
--- a/src/python/sentryPlugins/avg_block_io/stage_window.py
+++ b/src/python/sentryPlugins/avg_block_io/stage_window.py
@@ -14,6 +14,11 @@ class AbnormalWindowBase:
self.window_size = window_size
self.window_threshold = window_threshold
self.abnormal_window = [False] * window_size
+ self.window_data = [-1] * window_size
+
+ def append_new_data(self, ab_res):
+ self.window_data.pop(0)
+ self.window_data.append(ab_res)
def append_new_period(self, ab_res, avg_val=0):
self.abnormal_window.pop(0)
@@ -25,6 +30,9 @@ class AbnormalWindowBase:
def is_abnormal_window(self):
return sum(self.abnormal_window) > self.window_threshold
+ def window_data_to_string(self):
+ return ",".join(str(x) for x in self.window_data)
+
class IoWindow(AbnormalWindowBase):
def __init__(self, window_size=10, window_threshold=7, abnormal_multiple=5, abnormal_multiple_lim=30, abnormal_time=40):
diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py
index 2de9a46..3b7f027 100644
--- a/src/python/sentryPlugins/avg_block_io/utils.py
+++ b/src/python/sentryPlugins/avg_block_io/utils.py
@@ -65,15 +65,32 @@ def set_nested_value(data, keys, value):
return True
+def get_win_data(disk_name, rw, io_data):
+ """get latency and iodump win data"""
+ latency = ''
+ iodump = ''
+ for stage_name in io_data[disk_name]:
+ if 'latency' in io_data[disk_name][stage_name][rw]:
+ latency_list = io_data[disk_name][stage_name][rw]['latency'].window_data_to_string()
+ latency += f'{stage_name}: [{latency_list}], '
+ if 'iodump' in io_data[disk_name][stage_name][rw]:
+ iodump_list = io_data[disk_name][stage_name][rw]['iodump'].window_data_to_string()
+ iodump += f'{stage_name}: [{iodump_list}], '
+ return {"latency": latency[:-2], "iodump": iodump[:-2]}
+
+
def is_abnormal(io_key, io_data):
"""check if latency and iodump win abnormal"""
+ abnormal_list = ''
for key in ['latency', 'iodump']:
all_keys = get_nested_value(io_data, io_key)
if all_keys and key in all_keys:
win = get_nested_value(io_data, io_key + (key,))
if win and win.is_abnormal_window():
- return True
- return False
+ abnormal_list += key + ', '
+ if not abnormal_list:
+ return False, abnormal_list
+ return True, abnormal_list[:-2]
def update_io_avg(old_avg, period_value, win_size):
@@ -87,8 +104,8 @@ def update_io_avg(old_avg, period_value, win_size):
return [new_avg_value, new_avg_count]
-def update_io_data(old_avg, period_value, win_size, io_data, io_key):
- """update data of latency and iodump window"""
+def update_io_period(old_avg, period_value, io_data, io_key):
+ """update period of latency and iodump window"""
all_wins = get_nested_value(io_data, io_key)
if all_wins and "latency" in all_wins:
io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_period(period_value[0], old_avg[AVG_VALUE])
@@ -96,20 +113,54 @@ def update_io_data(old_avg, period_value, win_size, io_data, io_key):
io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_period(period_value[1])
+def update_io_data(period_value, io_data, io_key):
+ """update data of latency and iodump window"""
+ all_wins = get_nested_value(io_data, io_key)
+ if all_wins and "latency" in all_wins:
+ io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_data(period_value[0])
+ if all_wins and "iodump" in all_wins:
+ io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_data(period_value[1])
+
+
+def log_abnormal_period(old_avg, period_value, io_data, io_key):
+ """record log of abnormal period"""
+ all_wins = get_nested_value(io_data, io_key)
+ if all_wins and "latency" in all_wins:
+ if all_wins["latency"].is_abnormal_period(period_value[0], old_avg[AVG_VALUE]):
+ logging.info(f"[abnormal_period] disk: {io_key[0]}, stage: {io_key[1]}, iotype: {io_key[2]}, "
+ f"type: latency, avg: {round(old_avg[AVG_VALUE], 3)}, curr_val: {period_value[0]}")
+ if all_wins and "iodump" in all_wins:
+ if all_wins["iodump"].is_abnormal_period(period_value[1]):
+ logging.info(f"[abnormal_period] disk: {io_key[0]}, stage: {io_key[1]}, iotype: {io_key[2]}, "
+ f"type: iodump, curr_val: {period_value[1]}")
+
+
+def log_slow_win(msg, reason):
+ """record log of slow win"""
+ logging.warning(f"[SLOW IO] disk: {msg['driver_name']}, stage: {msg['block_stack']}, "
+ f"iotype: {msg['io_type']}, type: {msg['alarm_type']}, reason: {reason}")
+ logging.info(f"latency: {msg['details']['latency']}")
+ logging.info(f"iodump: {msg['details']['iodump']}")
+
+
def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data):
"""update avg and check abonrmal, return true if win_size full"""
period_value = get_nested_value(data, io_key)
old_avg = get_nested_value(io_avg_value, io_key)
# 更新avg数据
+ update_io_data(period_value, io_data, io_key)
if old_avg[AVG_COUNT] < win_size:
set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size))
return False
+ # 打印异常周期数据
+ log_abnormal_period(old_avg, period_value, io_data, io_key)
+
# 更新win数据 -- 判断异常周期
- update_io_data(old_avg, period_value, win_size, io_data, io_key)
+ update_io_period(old_avg, period_value, io_data, io_key)
all_wins = get_nested_value(io_data, io_key)
- if all_wins and 'latency' not in all_wins:
+ if not all_wins or 'latency' not in all_wins:
return True
period = get_nested_value(io_data, io_key + ("latency",))
if period and period.is_abnormal_period(period_value[0], old_avg[AVG_VALUE]):
--
2.27.0