252 lines
12 KiB
Diff
252 lines
12 KiB
Diff
From a8418093bb37482da7ccaac0c950f2ed8d0ba2fa Mon Sep 17 00:00:00 2001
|
|
From: gaoruoshu <gaoruoshu@huawei.com>
|
|
Date: Thu, 10 Oct 2024 15:07:29 +0800
|
|
Subject: [PATCH] add log for improving maintainability
|
|
|
|
---
|
|
.../avg_block_io/avg_block_io.py | 4 +-
|
|
.../sentryPlugins/avg_block_io/module_conn.py | 57 ++++++++++-------
|
|
.../avg_block_io/stage_window.py | 8 +++
|
|
.../sentryPlugins/avg_block_io/utils.py | 63 +++++++++++++++++--
|
|
4 files changed, 103 insertions(+), 29 deletions(-)
|
|
|
|
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
|
index 26a60c5..cf2ded3 100644
|
|
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
|
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
|
@@ -194,11 +194,11 @@ def init_io_win(io_dic, config, common_param):
|
|
|
|
if avg_lim_value and avg_time_value and tot_lim_value:
|
|
io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value)
|
|
- logging.debug("Successfully create {}-{}-{} latency window".format(disk_name, stage_name, rw))
|
|
+ logging.debug("Successfully create {}-{}-{}-latency window".format(disk_name, stage_name, rw))
|
|
|
|
if iodump_lim_value is not None:
|
|
io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value)
|
|
- logging.debug("Successfully create {}-{}-{} iodump window".format(disk_name, stage_name, rw))
|
|
+ logging.debug("Successfully create {}-{}-{}-iodump window".format(disk_name, stage_name, rw))
|
|
return io_data, io_avg_value
|
|
|
|
|
|
diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
|
|
index 2fc5a83..40b3fcc 100644
|
|
--- a/src/python/sentryPlugins/avg_block_io/module_conn.py
|
|
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
|
|
@@ -13,7 +13,7 @@ import logging
|
|
import sys
|
|
import time
|
|
|
|
-from .utils import is_abnormal
|
|
+from .utils import is_abnormal, get_win_data, log_slow_win
|
|
from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages
|
|
from syssentry.result import ResultLevel, report_result
|
|
from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR
|
|
@@ -66,36 +66,51 @@ def report_alarm_fail(alarm_info):
|
|
|
|
def process_report_data(disk_name, rw, io_data):
|
|
"""check abnormal window and report to xalarm"""
|
|
- if not is_abnormal((disk_name, 'bio', rw), io_data):
|
|
+ abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data)
|
|
+ if not abnormal:
|
|
return
|
|
|
|
- msg = {"alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw}
|
|
+ msg = {
|
|
+ "alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw,
|
|
+ "reason": "unknown", "block_stack": "bio", "alarm_type": abnormal_list,
|
|
+ "details": get_win_data(disk_name, rw, io_data)
|
|
+ }
|
|
|
|
+ # io press
|
|
ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq']
|
|
for stage_name in ctrl_stage:
|
|
- if is_abnormal((disk_name, stage_name, rw), io_data):
|
|
- msg["reason"] = "IO press slow"
|
|
- msg["block_stack"] = f"bio,{stage_name}"
|
|
- logging.warning("{} - {} report IO press slow".format(disk_name, rw))
|
|
- xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
|
|
- return
|
|
-
|
|
- if is_abnormal((disk_name, 'rq_driver', rw), io_data):
|
|
+ abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data)
|
|
+ if not abnormal:
|
|
+ continue
|
|
+ msg["reason"] = "IO press"
|
|
+ msg["block_stack"] = f"bio,{stage_name}"
|
|
+ msg["alarm_type"] = abnormal_list
|
|
+ log_slow_win(msg, "IO press")
|
|
+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
|
|
+ return
|
|
+
|
|
+ # driver slow
|
|
+ abnormal, abnormal_list = is_abnormal((disk_name, 'rq_driver', rw), io_data)
|
|
+ if abnormal:
|
|
msg["reason"] = "driver slow"
|
|
msg["block_stack"] = "bio,rq_driver"
|
|
- logging.warning("{} - {} report driver slow".format(disk_name, rw))
|
|
+ msg["alarm_type"] = abnormal_list
|
|
+ log_slow_win(msg, "driver slow")
|
|
xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
|
|
return
|
|
|
|
+ # kernel slow
|
|
kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue']
|
|
for stage_name in kernel_stage:
|
|
- if is_abnormal((disk_name, stage_name, rw), io_data):
|
|
- msg["reason"] = "kernel slow"
|
|
- msg["block_stack"] = f"bio,{stage_name}"
|
|
- logging.warning("{} - {} report kernel slow".format(disk_name, rw))
|
|
- xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
|
|
- return
|
|
- msg["reason"] = "unknown"
|
|
- msg["block_stack"] = "bio"
|
|
- logging.warning("{} - {} report UNKNOWN slow".format(disk_name, rw))
|
|
+ abnormal, abnormal_list = is_abnormal((disk_name, stage_name, rw), io_data)
|
|
+ if not abnormal:
|
|
+ continue
|
|
+ msg["reason"] = "kernel slow"
|
|
+ msg["block_stack"] = f"bio,{stage_name}"
|
|
+ msg["alarm_type"] = abnormal_list
|
|
+ log_slow_win(msg, "kernel slow")
|
|
+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
|
|
+ return
|
|
+
|
|
+ log_slow_win(msg, "unknown")
|
|
xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
|
|
diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py
|
|
index 9b0ce79..5113782 100644
|
|
--- a/src/python/sentryPlugins/avg_block_io/stage_window.py
|
|
+++ b/src/python/sentryPlugins/avg_block_io/stage_window.py
|
|
@@ -14,6 +14,11 @@ class AbnormalWindowBase:
|
|
self.window_size = window_size
|
|
self.window_threshold = window_threshold
|
|
self.abnormal_window = [False] * window_size
|
|
+ self.window_data = [-1] * window_size
|
|
+
|
|
+ def append_new_data(self, ab_res):
|
|
+ self.window_data.pop(0)
|
|
+ self.window_data.append(ab_res)
|
|
|
|
def append_new_period(self, ab_res, avg_val=0):
|
|
self.abnormal_window.pop(0)
|
|
@@ -25,6 +30,9 @@ class AbnormalWindowBase:
|
|
def is_abnormal_window(self):
|
|
return sum(self.abnormal_window) > self.window_threshold
|
|
|
|
+ def window_data_to_string(self):
|
|
+ return ",".join(str(x) for x in self.window_data)
|
|
+
|
|
|
|
class IoWindow(AbnormalWindowBase):
|
|
def __init__(self, window_size=10, window_threshold=7, abnormal_multiple=5, abnormal_multiple_lim=30, abnormal_time=40):
|
|
diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py
|
|
index 2de9a46..3b7f027 100644
|
|
--- a/src/python/sentryPlugins/avg_block_io/utils.py
|
|
+++ b/src/python/sentryPlugins/avg_block_io/utils.py
|
|
@@ -65,15 +65,32 @@ def set_nested_value(data, keys, value):
|
|
return True
|
|
|
|
|
|
+def get_win_data(disk_name, rw, io_data):
|
|
+ """get latency and iodump win data"""
|
|
+ latency = ''
|
|
+ iodump = ''
|
|
+ for stage_name in io_data[disk_name]:
|
|
+ if 'latency' in io_data[disk_name][stage_name][rw]:
|
|
+ latency_list = io_data[disk_name][stage_name][rw]['latency'].window_data_to_string()
|
|
+ latency += f'{stage_name}: [{latency_list}], '
|
|
+ if 'iodump' in io_data[disk_name][stage_name][rw]:
|
|
+ iodump_list = io_data[disk_name][stage_name][rw]['iodump'].window_data_to_string()
|
|
+ iodump += f'{stage_name}: [{iodump_list}], '
|
|
+ return {"latency": latency[:-2], "iodump": iodump[:-2]}
|
|
+
|
|
+
|
|
def is_abnormal(io_key, io_data):
|
|
"""check if latency and iodump win abnormal"""
|
|
+ abnormal_list = ''
|
|
for key in ['latency', 'iodump']:
|
|
all_keys = get_nested_value(io_data, io_key)
|
|
if all_keys and key in all_keys:
|
|
win = get_nested_value(io_data, io_key + (key,))
|
|
if win and win.is_abnormal_window():
|
|
- return True
|
|
- return False
|
|
+ abnormal_list += key + ', '
|
|
+ if not abnormal_list:
|
|
+ return False, abnormal_list
|
|
+ return True, abnormal_list[:-2]
|
|
|
|
|
|
def update_io_avg(old_avg, period_value, win_size):
|
|
@@ -87,8 +104,8 @@ def update_io_avg(old_avg, period_value, win_size):
|
|
return [new_avg_value, new_avg_count]
|
|
|
|
|
|
-def update_io_data(old_avg, period_value, win_size, io_data, io_key):
|
|
- """update data of latency and iodump window"""
|
|
+def update_io_period(old_avg, period_value, io_data, io_key):
|
|
+ """update period of latency and iodump window"""
|
|
all_wins = get_nested_value(io_data, io_key)
|
|
if all_wins and "latency" in all_wins:
|
|
io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_period(period_value[0], old_avg[AVG_VALUE])
|
|
@@ -96,20 +113,54 @@ def update_io_data(old_avg, period_value, win_size, io_data, io_key):
|
|
io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_period(period_value[1])
|
|
|
|
|
|
+def update_io_data(period_value, io_data, io_key):
|
|
+ """update data of latency and iodump window"""
|
|
+ all_wins = get_nested_value(io_data, io_key)
|
|
+ if all_wins and "latency" in all_wins:
|
|
+ io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_data(period_value[0])
|
|
+ if all_wins and "iodump" in all_wins:
|
|
+ io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_data(period_value[1])
|
|
+
|
|
+
|
|
+def log_abnormal_period(old_avg, period_value, io_data, io_key):
|
|
+ """record log of abnormal period"""
|
|
+ all_wins = get_nested_value(io_data, io_key)
|
|
+ if all_wins and "latency" in all_wins:
|
|
+ if all_wins["latency"].is_abnormal_period(period_value[0], old_avg[AVG_VALUE]):
|
|
+ logging.info(f"[abnormal_period] disk: {io_key[0]}, stage: {io_key[1]}, iotype: {io_key[2]}, "
|
|
+ f"type: latency, avg: {round(old_avg[AVG_VALUE], 3)}, curr_val: {period_value[0]}")
|
|
+ if all_wins and "iodump" in all_wins:
|
|
+ if all_wins["iodump"].is_abnormal_period(period_value[1]):
|
|
+ logging.info(f"[abnormal_period] disk: {io_key[0]}, stage: {io_key[1]}, iotype: {io_key[2]}, "
|
|
+ f"type: iodump, curr_val: {period_value[1]}")
|
|
+
|
|
+
|
|
+def log_slow_win(msg, reason):
|
|
+ """record log of slow win"""
|
|
+ logging.warning(f"[SLOW IO] disk: {msg['driver_name']}, stage: {msg['block_stack']}, "
|
|
+ f"iotype: {msg['io_type']}, type: {msg['alarm_type']}, reason: {reason}")
|
|
+ logging.info(f"latency: {msg['details']['latency']}")
|
|
+ logging.info(f"iodump: {msg['details']['iodump']}")
|
|
+
|
|
+
|
|
def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data):
|
|
"""update avg and check abonrmal, return true if win_size full"""
|
|
period_value = get_nested_value(data, io_key)
|
|
old_avg = get_nested_value(io_avg_value, io_key)
|
|
|
|
# 更新avg数据
|
|
+ update_io_data(period_value, io_data, io_key)
|
|
if old_avg[AVG_COUNT] < win_size:
|
|
set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size))
|
|
return False
|
|
|
|
+ # 打印异常周期数据
|
|
+ log_abnormal_period(old_avg, period_value, io_data, io_key)
|
|
+
|
|
# 更新win数据 -- 判断异常周期
|
|
- update_io_data(old_avg, period_value, win_size, io_data, io_key)
|
|
+ update_io_period(old_avg, period_value, io_data, io_key)
|
|
all_wins = get_nested_value(io_data, io_key)
|
|
- if all_wins and 'latency' not in all_wins:
|
|
+ if not all_wins or 'latency' not in all_wins:
|
|
return True
|
|
period = get_nested_value(io_data, io_key + ("latency",))
|
|
if period and period.is_abnormal_period(period_value[0], old_avg[AVG_VALUE]):
|
|
--
|
|
2.27.0
|
|
|