!73 update collect and avg_block_io

From: @zhuofeng6 
Reviewed-by: @gaoruoshu 
Signed-off-by: @gaoruoshu
This commit is contained in:
openeuler-ci-bot 2024-10-11 01:23:40 +00:00 committed by Gitee
commit 129ab8dfc3
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
3 changed files with 437 additions and 1 deletions

View File

@ -0,0 +1,176 @@
From c2ffc679eddda5d78362612d89a9319d268da7e3 Mon Sep 17 00:00:00 2001
From: zhuofeng <zhuofeng2@huawei.com>
Date: Thu, 10 Oct 2024 20:17:34 +0800
Subject: [PATCH] add get_disk_type and fix some bugs
---
service/sentryCollector.service | 2 +-
src/python/sentryCollector/collect_io.py | 16 ++++-
src/python/sentryCollector/collect_plugin.py | 68 +++++++++++++++++++-
3 files changed, 81 insertions(+), 5 deletions(-)
diff --git a/service/sentryCollector.service b/service/sentryCollector.service
index 4ee07d5..e09ddb3 100644
--- a/service/sentryCollector.service
+++ b/service/sentryCollector.service
@@ -1,5 +1,5 @@
[Unit]
-Description = Collection module added for sysSentry and kernel lock-free collection
+Description = Collection module added for sysSentry
[Service]
ExecStart=/usr/bin/python3 /usr/bin/sentryCollector
diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py
index 8780648..6699a90 100644
--- a/src/python/sentryCollector/collect_io.py
+++ b/src/python/sentryCollector/collect_io.py
@@ -116,7 +116,7 @@ class CollectIo():
return 0
if finish <= 0 or lat_time <= 0:
return 0
- value = lat_time / finish / 1000 / 1000
+ value = lat_time / finish / 1000
if value.is_integer():
return int(value)
else:
@@ -124,11 +124,17 @@ class CollectIo():
def get_io_length(self, curr_stage_value, last_stage_value, category):
try:
- finish = int(curr_stage_value[category * 3 + IoStatus.FINISH]) - int(last_stage_value[category * 3 + IoStatus.FINISH])
+ lat_time = (int(curr_stage_value[category * 3 + IoStatus.LATENCY]) - int(last_stage_value[category * 3 + IoStatus.LATENCY]))
except ValueError as e:
logging.error("get_io_length convert to int failed, %s", e)
return 0
- value = finish / self.period_time / 1000 / 1000
+ if lat_time <= 0:
+ return 0
+ # ns convert us
+ lat_time = lat_time / 1000
+ # s convert us
+ period_time = self.period_time * 1000 * 1000
+ value = lat_time / period_time
if value.is_integer():
return int(value)
else:
@@ -141,6 +147,8 @@ class CollectIo():
with open(io_dump_file, 'r') as file:
for line in file:
count += line.count('.op=' + Io_Category[category])
+ if count > 0:
+ logging.info(f"io_dump info : {disk_name}, {stage}, {category}, {count}")
except FileNotFoundError:
logging.error("The file %s does not exist.", io_dump_file)
return count
@@ -223,6 +231,8 @@ class CollectIo():
if self.get_blk_io_hierarchy(disk_name, stage_list) < 0:
continue
self.append_period_lat(disk_name, stage_list)
+
+ logging.debug(f"no-lock collect data : {IO_GLOBAL_DATA}")
elapsed_time = time.time() - start_time
sleep_time = self.period_time - elapsed_time
diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py
index 3e2cf4c..31bf11b 100644
--- a/src/python/sentryCollector/collect_plugin.py
+++ b/src/python/sentryCollector/collect_plugin.py
@@ -16,6 +16,7 @@ import json
import socket
import logging
import re
+import os
COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock"
@@ -58,6 +59,8 @@ class ResultMessage():
RESULT_EXCEED_LIMIT = 4 # the parameter length exceeds the limit.
RESULT_PARSE_FAILED = 5 # parse failed
RESULT_INVALID_CHAR = 6 # invalid char
+ RESULT_DISK_NOEXIST = 7 # disk does not exist
+ RESULT_DISK_TYPE_MISMATCH= 8 # disk type mismatch
Result_Messages = {
ResultMessage.RESULT_SUCCEED: "Succeed",
@@ -66,9 +69,15 @@ Result_Messages = {
ResultMessage.RESULT_INVALID_LENGTH: "Invalid parameter length",
ResultMessage.RESULT_EXCEED_LIMIT: "The parameter length exceeds the limit",
ResultMessage.RESULT_PARSE_FAILED: "Parse failed",
- ResultMessage.RESULT_INVALID_CHAR: "Invalid char"
+ ResultMessage.RESULT_INVALID_CHAR: "Invalid char",
+ ResultMessage.RESULT_DISK_NOEXIST: "Disk is not exist",
+ ResultMessage.RESULT_DISK_TYPE_MISMATCH: "Disk type mismatch"
}
+class DiskType():
+ TYPE_NVME_SSD = 0
+ TYPE_SATA_SSD = 1
+ TYPE_SATA_HDD = 2
def client_send_and_recv(request_data, data_str_len, protocol):
"""client socket send and recv message"""
@@ -273,3 +282,60 @@ def inter_get_io_data(period, disk_list, stage, iotype):
result['message'] = result_message
return result
+def get_disk_type(disk):
+ result = {}
+ result['ret'] = ResultMessage.RESULT_UNKNOWN
+ result['message'] = ""
+ if not disk:
+ logging.error("param is invalid")
+ result['ret'] = ResultMessage.RESULT_NOT_PARAM
+ return result
+ if len(disk) <= 0 or len(disk) > LIMIT_DISK_CHAR_LEN:
+ logging.error("invalid disk length")
+ result['ret'] = ResultMessage.RESULT_INVALID_LENGTH
+ return result
+ pattern = r'^[a-zA-Z0-9_-]+$'
+ if not re.match(pattern, disk):
+ logging.error("%s is invalid char", disk)
+ result['ret'] = ResultMessage.RESULT_INVALID_CHAR
+ return result
+
+ base_path = '/sys/block'
+ all_disk = []
+ for disk_name in os.listdir(base_path):
+ all_disk.append(disk_name)
+
+ if disk not in all_disk:
+ logging.error("disk %s is not exist", disk)
+ result['ret'] = ResultMessage.RESULT_DISK_NOEXIST
+ return result
+
+ if disk[0:4] == "nvme":
+ result['message'] = str(DiskType.TYPE_NVME_SSD)
+ elif disk[0:2] == "sd":
+ disk_file = '/sys/block/{}/queue/rotational'.format(disk)
+ try:
+ with open(disk_file, 'r') as file:
+ num = int(file.read())
+ if num == 0: # rotational == 0: non-rotational media (SSD)
+ result['message'] = str(DiskType.TYPE_SATA_SSD)
+ elif num == 1: # rotational == 1: spinning media (HDD)
+ result['message'] = str(DiskType.TYPE_SATA_HDD)
+ else:
+ logging.error("disk %s is not support, num = %d", disk, num)
+ result['ret'] = ResultMessage.RESULT_DISK_TYPE_MISMATCH
+ return result
+ except FileNotFoundError:
+ logging.error("The disk_file [%s] does not exist", disk_file)
+ result['ret'] = ResultMessage.RESULT_DISK_NOEXIST
+ return result
+ except Exception as e:
+ logging.error("open disk_file %s happen an error: %s", disk_file, e)
+ return result
+ else:
+ logging.error("disk %s is not support", disk)
+ result['ret'] = ResultMessage.RESULT_DISK_TYPE_MISMATCH
+ return result
+
+ result['ret'] = ResultMessage.RESULT_SUCCEED
+ return result
\ No newline at end of file
--
2.33.0

View File

@ -0,0 +1,251 @@
From a8418093bb37482da7ccaac0c950f2ed8d0ba2fa Mon Sep 17 00:00:00 2001
From: gaoruoshu <gaoruoshu@huawei.com>
Date: Thu, 10 Oct 2024 15:07:29 +0800
Subject: [PATCH] add log for improving maintainability
---
.../avg_block_io/avg_block_io.py | 4 +-
.../sentryPlugins/avg_block_io/module_conn.py | 57 ++++++++++-------
.../avg_block_io/stage_window.py | 8 +++
.../sentryPlugins/avg_block_io/utils.py | 63 +++++++++++++++++--
4 files changed, 103 insertions(+), 29 deletions(-)
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
index 26a60c5..cf2ded3 100644
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
@@ -194,11 +194,11 @@ def init_io_win(io_dic, config, common_param):
if avg_lim_value and avg_time_value and tot_lim_value:
io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value)
- logging.debug("Successfully create {}-{}-{} latency window".format(disk_name, stage_name, rw))
+ logging.debug("Successfully create {}-{}-{}-latency window".format(disk_name, stage_name, rw))
if iodump_lim_value is not None:
io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value)
- logging.debug("Successfully create {}-{}-{} iodump window".format(disk_name, stage_name, rw))
+ logging.debug("Successfully create {}-{}-{}-iodump window".format(disk_name, stage_name, rw))
return io_data, io_avg_value
diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
index 2fc5a83..40b3fcc 100644
--- a/src/python/sentryPlugins/avg_block_io/module_conn.py
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
@@ -13,7 +13,7 @@ import logging
import sys
import time
-from .utils import is_abnormal
+from .utils import is_abnormal, get_win_data, log_slow_win
from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages
from syssentry.result import ResultLevel, report_result
from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR
@@ -66,36 +66,51 @@ def report_alarm_fail(alarm_info):
def process_report_data(disk_name, rw, io_data):
"""check abnormal window and report to xalarm"""
- if not is_abnormal((disk_name, 'bio', rw), io_data):
+ abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data)
+ if not abnormal:
return
- msg = {"alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw}
+ msg = {
+ "alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw,
+ "reason": "unknown", "block_stack": "bio", "alarm_type": abnormal_list,
+ "details": get_win_data(disk_name, rw, io_data)
+ }
+ # io press
ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq']
for stage_name in ctrl_stage:
- if is_abnormal((disk_name, stage_name, rw), io_data):
- msg["reason"] = "IO press slow"
- msg["block_stack"] = f"bio,{stage_name}"
- logging.warning("{} - {} report IO press slow".format(disk_name, rw))
- xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
- return
-
- if is_abnormal((disk_name, 'rq_driver', rw), io_data):
+ abnormal, abnormal_list = is_abnormal((disk_name, stage_name, rw), io_data)
+ if not abnormal:
+ continue
+ msg["reason"] = "IO press"
+ msg["block_stack"] = f"bio,{stage_name}"
+ msg["alarm_type"] = abnormal_list
+ log_slow_win(msg, "IO press")
+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
+ return
+
+ # driver slow
+ abnormal, abnormal_list = is_abnormal((disk_name, 'rq_driver', rw), io_data)
+ if abnormal:
msg["reason"] = "driver slow"
msg["block_stack"] = "bio,rq_driver"
- logging.warning("{} - {} report driver slow".format(disk_name, rw))
+ msg["alarm_type"] = abnormal_list
+ log_slow_win(msg, "driver slow")
xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
return
+ # kernel slow
kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue']
for stage_name in kernel_stage:
- if is_abnormal((disk_name, stage_name, rw), io_data):
- msg["reason"] = "kernel slow"
- msg["block_stack"] = f"bio,{stage_name}"
- logging.warning("{} - {} report kernel slow".format(disk_name, rw))
- xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
- return
- msg["reason"] = "unknown"
- msg["block_stack"] = "bio"
- logging.warning("{} - {} report UNKNOWN slow".format(disk_name, rw))
+ abnormal, abnormal_list = is_abnormal((disk_name, stage_name, rw), io_data)
+ if not abnormal:
+ continue
+ msg["reason"] = "kernel slow"
+ msg["block_stack"] = f"bio,{stage_name}"
+ msg["alarm_type"] = abnormal_list
+ log_slow_win(msg, "kernel slow")
+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
+ return
+
+ log_slow_win(msg, "unknown")
xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py
index 9b0ce79..5113782 100644
--- a/src/python/sentryPlugins/avg_block_io/stage_window.py
+++ b/src/python/sentryPlugins/avg_block_io/stage_window.py
@@ -14,6 +14,11 @@ class AbnormalWindowBase:
self.window_size = window_size
self.window_threshold = window_threshold
self.abnormal_window = [False] * window_size
+ self.window_data = [-1] * window_size
+
+ def append_new_data(self, ab_res):
+ self.window_data.pop(0)
+ self.window_data.append(ab_res)
def append_new_period(self, ab_res, avg_val=0):
self.abnormal_window.pop(0)
@@ -25,6 +30,9 @@ class AbnormalWindowBase:
def is_abnormal_window(self):
return sum(self.abnormal_window) > self.window_threshold
+ def window_data_to_string(self):
+ return ",".join(str(x) for x in self.window_data)
+
class IoWindow(AbnormalWindowBase):
def __init__(self, window_size=10, window_threshold=7, abnormal_multiple=5, abnormal_multiple_lim=30, abnormal_time=40):
diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py
index 2de9a46..3b7f027 100644
--- a/src/python/sentryPlugins/avg_block_io/utils.py
+++ b/src/python/sentryPlugins/avg_block_io/utils.py
@@ -65,15 +65,32 @@ def set_nested_value(data, keys, value):
return True
+def get_win_data(disk_name, rw, io_data):
+ """get latency and iodump win data"""
+ latency = ''
+ iodump = ''
+ for stage_name in io_data[disk_name]:
+ if 'latency' in io_data[disk_name][stage_name][rw]:
+ latency_list = io_data[disk_name][stage_name][rw]['latency'].window_data_to_string()
+ latency += f'{stage_name}: [{latency_list}], '
+ if 'iodump' in io_data[disk_name][stage_name][rw]:
+ iodump_list = io_data[disk_name][stage_name][rw]['iodump'].window_data_to_string()
+ iodump += f'{stage_name}: [{iodump_list}], '
+ return {"latency": latency[:-2], "iodump": iodump[:-2]}
+
+
def is_abnormal(io_key, io_data):
"""check if latency and iodump win abnormal"""
+ abnormal_list = ''
for key in ['latency', 'iodump']:
all_keys = get_nested_value(io_data, io_key)
if all_keys and key in all_keys:
win = get_nested_value(io_data, io_key + (key,))
if win and win.is_abnormal_window():
- return True
- return False
+ abnormal_list += key + ', '
+ if not abnormal_list:
+ return False, abnormal_list
+ return True, abnormal_list[:-2]
def update_io_avg(old_avg, period_value, win_size):
@@ -87,8 +104,8 @@ def update_io_avg(old_avg, period_value, win_size):
return [new_avg_value, new_avg_count]
-def update_io_data(old_avg, period_value, win_size, io_data, io_key):
- """update data of latency and iodump window"""
+def update_io_period(old_avg, period_value, io_data, io_key):
+ """update period of latency and iodump window"""
all_wins = get_nested_value(io_data, io_key)
if all_wins and "latency" in all_wins:
io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_period(period_value[0], old_avg[AVG_VALUE])
@@ -96,20 +113,54 @@ def update_io_data(old_avg, period_value, win_size, io_data, io_key):
io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_period(period_value[1])
+def update_io_data(period_value, io_data, io_key):
+ """update data of latency and iodump window"""
+ all_wins = get_nested_value(io_data, io_key)
+ if all_wins and "latency" in all_wins:
+ io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_data(period_value[0])
+ if all_wins and "iodump" in all_wins:
+ io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_data(period_value[1])
+
+
+def log_abnormal_period(old_avg, period_value, io_data, io_key):
+ """record log of abnormal period"""
+ all_wins = get_nested_value(io_data, io_key)
+ if all_wins and "latency" in all_wins:
+ if all_wins["latency"].is_abnormal_period(period_value[0], old_avg[AVG_VALUE]):
+ logging.info(f"[abnormal_period] disk: {io_key[0]}, stage: {io_key[1]}, iotype: {io_key[2]}, "
+ f"type: latency, avg: {round(old_avg[AVG_VALUE], 3)}, curr_val: {period_value[0]}")
+ if all_wins and "iodump" in all_wins:
+ if all_wins["iodump"].is_abnormal_period(period_value[1]):
+ logging.info(f"[abnormal_period] disk: {io_key[0]}, stage: {io_key[1]}, iotype: {io_key[2]}, "
+ f"type: iodump, curr_val: {period_value[1]}")
+
+
+def log_slow_win(msg, reason):
+ """record log of slow win"""
+ logging.warning(f"[SLOW IO] disk: {msg['driver_name']}, stage: {msg['block_stack']}, "
+ f"iotype: {msg['io_type']}, type: {msg['alarm_type']}, reason: {reason}")
+ logging.info(f"latency: {msg['details']['latency']}")
+ logging.info(f"iodump: {msg['details']['iodump']}")
+
+
def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data):
"""update avg and check abonrmal, return true if win_size full"""
period_value = get_nested_value(data, io_key)
old_avg = get_nested_value(io_avg_value, io_key)
# 更新avg数据
+ update_io_data(period_value, io_data, io_key)
if old_avg[AVG_COUNT] < win_size:
set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size))
return False
+ # log the abnormal period data
+ log_abnormal_period(old_avg, period_value, io_data, io_key)
+
# 更新win数据 -- 判断异常周期
- update_io_data(old_avg, period_value, win_size, io_data, io_key)
+ update_io_period(old_avg, period_value, io_data, io_key)
all_wins = get_nested_value(io_data, io_key)
- if all_wins and 'latency' not in all_wins:
+ if not all_wins or 'latency' not in all_wins:
return True
period = get_nested_value(io_data, io_key + ("latency",))
if period and period.is_abnormal_period(period_value[0], old_avg[AVG_VALUE]):
--
2.27.0

View File

@ -4,7 +4,7 @@
Summary: System Inspection Framework Summary: System Inspection Framework
Name: sysSentry Name: sysSentry
Version: 1.0.2 Version: 1.0.2
Release: 29 Release: 30
License: Mulan PSL v2 License: Mulan PSL v2
Group: System Environment/Daemons Group: System Environment/Daemons
Source0: https://gitee.com/openeuler/sysSentry/releases/download/v%{version}/%{name}-%{version}.tar.gz Source0: https://gitee.com/openeuler/sysSentry/releases/download/v%{version}/%{name}-%{version}.tar.gz
@ -42,6 +42,8 @@ Patch29: change-alarm-length.patch
Patch30: add-detail-time.patch Patch30: add-detail-time.patch
Patch31: xalarm-add-alarm-msg-length-to-8192.patch Patch31: xalarm-add-alarm-msg-length-to-8192.patch
Patch32: ai_block_io-adapt-alarm-module.patch Patch32: ai_block_io-adapt-alarm-module.patch
Patch33: add-log-for-improving-maintainability.patch
Patch34: add-get_disk_type-and-fix-some-bugs.patch
BuildRequires: cmake gcc-c++ BuildRequires: cmake gcc-c++
BuildRequires: python3 python3-setuptools BuildRequires: python3 python3-setuptools
@ -286,6 +288,13 @@ rm -rf %{buildroot}
%attr(0550,root,root) %{python3_sitelib}/sentryPlugins/ai_block_io %attr(0550,root,root) %{python3_sitelib}/sentryPlugins/ai_block_io
%changelog %changelog
* Thu Oct 10 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-30
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:add get_disk_type and fix some bugs
add log for improving maintainability
* Thu Oct 10 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-29 * Thu Oct 10 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-29
- Type:requirement - Type:requirement
- CVE:NA - CVE:NA