init the new version 1.0.3

Signed-off-by: zhuofeng <1107893276@qq.com>
zhuofeng 2025-01-20 08:29:08 +00:00 committed by zhuofeng
parent d65453dccd
commit 9de4c5a236
84 changed files with 31 additions and 18528 deletions


@@ -1,35 +0,0 @@
From 3e2721852ad1f8047ad219a5ab6c68fd4c9d6f5c Mon Sep 17 00:00:00 2001
From: shixuantong <shixuantong1@huawei.com>
Date: Wed, 24 Jul 2024 16:17:54 +0800
Subject: [PATCH] Fix the problem that function cpu_report_result() is called
more than once
When a task is running and the user executes "sentryctl stop cpu_sentry", cpu_report_result() is called twice, which causes the result log to be printed twice.
---
src/python/syssentry/cpu_sentry.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py
index 7e77654..3c4d58d 100644
--- a/src/python/syssentry/cpu_sentry.py
+++ b/src/python/syssentry/cpu_sentry.py
@@ -133,6 +133,7 @@ class CpuSentry:
result_level = self.send_result.get("result", ResultLevel.FAIL)
report_result(task_name, result_level, details)
+ self.init_send_result()
def kill_process(signum, _f, cpu_sentry_obj):
"""kill process by 'pkill -9'"""
@@ -179,6 +180,6 @@ def main():
cpu_sentry_task.send_result["result"] = ResultLevel.FAIL
cpu_sentry_task.send_result["details"]["code"] = 1004
cpu_sentry_task.send_result["details"]["msg"] = "run cmd [%s] raise Error" % cpu_sentry_task_cmd
- finally:
cpu_sentry_task.cpu_report_result()
- cpu_sentry_task.init_send_result()
+ else:
+ cpu_sentry_task.cpu_report_result()
--
2.27.0
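
The switch from `finally` to `else` is the whole fix: with `finally`, the report ran after the `except` branch had already filled in and reported the failure, so a stopped task logged its result twice. A minimal sketch of the corrected control flow (the task object and its `run()` method are illustrative stand-ins for the real code):

```python
def run_and_report(cpu_sentry_task):
    try:
        cpu_sentry_task.run()  # hypothetical stand-in for the inspection body
    except RuntimeError:
        # failure path: record the error, then report exactly once
        cpu_sentry_task.send_result["result"] = "FAIL"
        cpu_sentry_task.cpu_report_result()
    else:
        # success path: report exactly once; cpu_report_result() now also
        # calls init_send_result(), resetting the buffer for the next run
        cpu_sentry_task.cpu_report_result()
```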


@@ -1,36 +0,0 @@
# sysSentry
#### Description
sysSentry is a system inspection framework used to manage system inspection tasks.
#### Software Architecture
Software architecture description
#### Installation
1. xxxx
2. xxxx
3. xxxx
#### Instructions
1. xxxx
2. xxxx
3. xxxx
#### Contribution
1. Fork the repository
2. Create Feat_xxx branch
3. Commit your code
4. Create Pull Request
#### Gitee Feature
1. You can use Readme\_XXX.md to support different languages, such as Readme\_en.md, Readme\_zh.md
2. Gitee blog [blog.gitee.com](https://blog.gitee.com)
3. Explore open source project [https://gitee.com/explore](https://gitee.com/explore)
4. The most valuable open source project [GVP](https://gitee.com/gvp)
5. The manual of Gitee [https://gitee.com/help](https://gitee.com/help)
6. The most popular members [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/)

README.md

@ -1,110 +0,0 @@
# sysSentry
#### Introduction
sysSentry is a system inspection framework used to manage system inspection tasks.
#### Software Architecture
1. The framework supports the x86 and aarch64 architectures.
2. Architecture support differs from plugin to plugin; see the documentation on docs.openeuler.org for details.
#### Installation
1. Install the inspection framework
```shell
[root@openEuler ~]# yum install -y sysSentry
```
2. Start the inspection framework
```shell
[root@openEuler ~]# systemctl start sentryCollector
[root@openEuler ~]# systemctl start xalarmd
[root@openEuler ~]# systemctl start sysSentry
```
3. Install & reload inspection plugins
Step 1. Install the inspection plugins you need
```shell
yum install <plugin_name>
```
Currently supported plugins:
- cpu_sentry -- CPU inspection; supported on 22.03-LTS-SP4, aarch64 architecture with 920F chips
- avg_block_io -- slow-I/O detection with average thresholds; supported on 20.03-LTS-SP4, x86 and aarch64 architectures
- ai_block_io -- slow-I/O detection with AI thresholds; supported on 20.03-LTS-SP4, x86 and aarch64 architectures
Step 2. Reload the inspection plugin
```shell
[root@openEuler ~]# sentryctl reload <plugin_name>
```
#### Usage
sysSentry provides the sentryctl command for managing inspection plugins. It can start and stop inspection tasks, query their running state, and query the information they report.
1. Start a specified inspection task
```shell
[root@openEuler ~]# sentryctl start <module_name>
```
2. Stop a specified inspection task
```shell
[root@openEuler ~]# sentryctl stop <module_name>
```
3. List all loaded inspection tasks and their states
```shell
[root@openEuler ~]# sentryctl list
```
4. Query the state of a specified inspection task
```shell
[root@openEuler ~]# sentryctl status <module_name>
```
An inspection task is always in one of four states, described below:
| State | Description |
| ------- | ------------------------------------------------------------ |
| RUNNING | The inspection task is currently running |
| WAITING | Only period-type tasks can enter this state; the task is waiting to be scheduled for its next run |
| EXITED | The task has not run yet, or a oneshot-type task has finished |
| FAILED | The task failed to start, or did not exit normally |
5. Reload the configuration of a specified inspection task
After modifying a task's configuration file /etc/sysSentry/tasks/<module_name>.mod, reload it with:
```shell
[root@openEuler ~]# sentryctl reload <module_name>
```
6. Query the alarm information of a specified task
```shell
[root@openEuler ~]# sentryctl get_alarm <module_name> [options]
```
The optional parameters are as follows (a usage example follows this list):
| Parameter | Description |
| -------------------------------------- | ------------------------------------------------------------ |
| -s TIME_RANGE, --time_range TIME_RANGE | Show alarms from the given time span; TIME_RANGE is an integer number of seconds in the range 1-15 |
| -d, --detailed | Print detailed alarm information |
7. Query the inspection result of a specified task
```shell
sentryctl get_result <module_name>
```
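
For example, to print detailed alarms reported by a loaded task within the last 10 seconds (the task name is illustrative):
```shell
[root@openEuler ~]# sentryctl get_alarm avg_block_io -s 10 -d
```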
#### Contribution
1. Fork the repository
2. Create a Feat_xxx branch
3. Commit your code
4. Create a Pull Request


@@ -1,32 +0,0 @@
From 91aa47999030503fda4935d4cc238b82d6842238 Mon Sep 17 00:00:00 2001
From: shixuantong <shixuantong1@huawei.com>
Date: Sun, 11 Aug 2024 18:36:23 +0800
Subject: [PATCH] Remove ANSI escape sequences
---
src/python/syssentry/cpu_sentry.py | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py
index 9287e2f..99af127 100644
--- a/src/python/syssentry/cpu_sentry.py
+++ b/src/python/syssentry/cpu_sentry.py
@@ -97,7 +97,14 @@ class CpuSentry:
if "ERROR" in stdout:
self.send_result["result"] = ResultLevel.FAIL
self.send_result["details"]["code"] = 1004
- self.send_result["details"]["msg"] = stdout.split("\n")[0]
+
+ # Remove ANSI escape sequences
+ error_info = stdout.split("\n")[0]
+ if error_info.startswith("\u001b"):
+ ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])'
+ error_info = re.sub(ansi_escape, '', error_info)
+
+ self.send_result["details"]["msg"] = error_info
return
out_split = stdout.split("\n")
--
2.33.0
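
The regular expression matches CSI sequences such as `\x1b[1;31m` (bold red) and `\x1b[0m` (reset). A standalone check of the same pattern, with an illustrative error line:

```python
import re

ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])'
error_info = "\x1b[1;31mERROR\x1b[0m: run cmd failed"
if error_info.startswith("\u001b"):
    error_info = re.sub(ansi_escape, '', error_info)
print(error_info)  # -> ERROR: run cmd failed
```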

File diff suppressed because it is too large.


@@ -1,39 +0,0 @@
From abf36bf0351efde388c089245aed9f6d8d2e6d3b Mon Sep 17 00:00:00 2001
From: luckky <guodashun1@huawei.com>
Date: Wed, 6 Nov 2024 11:42:53 +0800
Subject: [PATCH] add boundary check for settings
1. add two boundary checks for page_isolation_threshold and hbm_online_repair_log_level
(0 <= page_isolation_threshold)
(0(LOG_DEBUG) <= hbm_online_repair_log_level <= 3(LOG_ERROR))
---
src/c/hbm_online_repair/hbm_online_repair.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c
index 943f201..00c9c0b 100644
--- a/src/c/hbm_online_repair/hbm_online_repair.c
+++ b/src/c/hbm_online_repair/hbm_online_repair.c
@@ -89,6 +89,9 @@ void hbm_param_init(void)
if (ret < 0) {
global_level_setting = DEFAULT_LOG_LEVEL;
log(LOG_WARNING, "Get log level from config failed, set the default value %d\n", DEFAULT_LOG_LEVEL);
+ } else if (global_level_setting < LOG_DEBUG || global_level_setting > LOG_ERROR) {
+ log(LOG_WARNING, "The log level value %d in config is out of range, set the default value %d\n", global_level_setting, DEFAULT_LOG_LEVEL);
+ global_level_setting = DEFAULT_LOG_LEVEL;
} else {
log(LOG_INFO, "log level: %d\n", global_level_setting);
}
@@ -98,6 +101,9 @@ void hbm_param_init(void)
if (ret < 0) {
page_isolation_threshold = DEFAULT_PAGE_ISOLATION_THRESHOLD;
log(LOG_WARNING, "Get page_isolation_threshold from config failed, set the default value %d\n", DEFAULT_PAGE_ISOLATION_THRESHOLD);
+ } else if (page_isolation_threshold < 0) {
+ log(LOG_WARNING, "The page_isolation_threshold %d in config is out of range, set the default value %d\n", page_isolation_threshold, DEFAULT_PAGE_ISOLATION_THRESHOLD);
+ page_isolation_threshold = DEFAULT_PAGE_ISOLATION_THRESHOLD;
} else {
log(LOG_INFO, "page_isolation_threshold: %d\n", page_isolation_threshold);
}
--
2.43.0
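
Both hunks follow the same validate-or-fall-back pattern: read the value, and if it is out of range, log a warning and substitute the default. A Python rendering of the idea (constants follow the commit message; the C code above is the authoritative version):

```python
import logging

LOG_DEBUG, LOG_ERROR = 0, 3   # valid log-level range per the commit message
DEFAULT_LOG_LEVEL = 2         # illustrative default

def checked_level(value: int) -> int:
    """Return value if it lies in [LOG_DEBUG, LOG_ERROR], else the default."""
    if value < LOG_DEBUG or value > LOG_ERROR:
        logging.warning("log level %d out of range, using default %d",
                        value, DEFAULT_LOG_LEVEL)
        return DEFAULT_LOG_LEVEL
    return value
```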

File diff suppressed because it is too large.


@@ -1,31 +0,0 @@
From eca8c542875aef5cfbf947d697c4b644490d1c05 Mon Sep 17 00:00:00 2001
From: zhuofeng <zhuofeng2@huawei.com>
Date: Fri, 30 Aug 2024 19:58:41 +0800
Subject: [PATCH] add deleted code to plugin rasdaemon
---
src/python/syssentry/syssentry.py | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py
index 32b81e3..3d5cb8d 100644
--- a/src/python/syssentry/syssentry.py
+++ b/src/python/syssentry/syssentry.py
@@ -462,6 +462,14 @@ def main_loop():
epoll_fd.register(cpu_alarm_fd.fileno(), select.EPOLLIN)
logging.debug("start main loop")
+ # onstart_tasks_handle()
+ for task_type in TasksMap.tasks_dict:
+ for task_name in TasksMap.tasks_dict.get(task_type):
+ task = TasksMap.tasks_dict.get(task_type).get(task_name)
+ if not task:
+ continue
+ task.onstart_handle()
+
while True:
try:
events_list = epoll_fd.poll(SERVER_EPOLL_TIMEOUT)
--
2.33.0


@@ -1,32 +0,0 @@
From 9ecd4c2c9c9f9578f5ec4780360dc67b182b384a Mon Sep 17 00:00:00 2001
From: jinsaihang <jinsaihang@h-partners.com>
Date: Wed, 9 Oct 2024 08:09:04 +0000
Subject: [PATCH 2/2] add detail time
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
---
src/python/syssentry/alarm.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
index 74a2716..d5337d3 100644
--- a/src/python/syssentry/alarm.py
+++ b/src/python/syssentry/alarm.py
@@ -118,11 +118,13 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di
logging.debug(f"get_alarm_result: final alarm_list of {alarm_id} has {len(alarm_list)} elements")
def xalarm_to_dict(alarm_info: Xalarm) -> dict:
+ timestamp = alarm_info.timetamp.tv_sec + alarm_info.timetamp.tv_usec / 1000000
+ dt_object = datetime.fromtimestamp(int(timestamp))
return {
'alarm_id': xalarm_getid(alarm_info),
'alarm_type': xalarm_gettype(alarm_info),
'alarm_level': xalarm_getlevel(alarm_info),
- 'timetamp': xalarm_gettime(alarm_info),
+ 'timestamp': dt_object.strftime("%Y-%m-%d %H:%M:%S"),
'msg1': xalarm_getdesc(alarm_info)
}
--
2.27.0
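
The fix converts the raw timeval pair into a formatted local time instead of exposing the misspelled `timetamp` field directly. The conversion in isolation (the timeval values are illustrative):

```python
from datetime import datetime

tv_sec, tv_usec = 1728460144, 250000             # example timeval from an alarm
timestamp = tv_sec + tv_usec / 1000000
dt_object = datetime.fromtimestamp(int(timestamp))
print(dt_object.strftime("%Y-%m-%d %H:%M:%S"))   # local time, second precision
```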


@@ -1,176 +0,0 @@
From c2ffc679eddda5d78362612d89a9319d268da7e3 Mon Sep 17 00:00:00 2001
From: zhuofeng <zhuofeng2@huawei.com>
Date: Thu, 10 Oct 2024 20:17:34 +0800
Subject: [PATCH] add get_disk_type and fix some bugs
---
service/sentryCollector.service | 2 +-
src/python/sentryCollector/collect_io.py | 16 ++++-
src/python/sentryCollector/collect_plugin.py | 68 +++++++++++++++++++-
3 files changed, 81 insertions(+), 5 deletions(-)
diff --git a/service/sentryCollector.service b/service/sentryCollector.service
index 4ee07d5..e09ddb3 100644
--- a/service/sentryCollector.service
+++ b/service/sentryCollector.service
@@ -1,5 +1,5 @@
[Unit]
-Description = Collection module added for sysSentry and kernel lock-free collection
+Description = Collection module added for sysSentry
[Service]
ExecStart=/usr/bin/python3 /usr/bin/sentryCollector
diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py
index 8780648..6699a90 100644
--- a/src/python/sentryCollector/collect_io.py
+++ b/src/python/sentryCollector/collect_io.py
@@ -116,7 +116,7 @@ class CollectIo():
return 0
if finish <= 0 or lat_time <= 0:
return 0
- value = lat_time / finish / 1000 / 1000
+ value = lat_time / finish / 1000
if value.is_integer():
return int(value)
else:
@@ -124,11 +124,17 @@ class CollectIo():
def get_io_length(self, curr_stage_value, last_stage_value, category):
try:
- finish = int(curr_stage_value[category * 3 + IoStatus.FINISH]) - int(last_stage_value[category * 3 + IoStatus.FINISH])
+ lat_time = (int(curr_stage_value[category * 3 + IoStatus.LATENCY]) - int(last_stage_value[category * 3 + IoStatus.LATENCY]))
except ValueError as e:
logging.error("get_io_length convert to int failed, %s", e)
return 0
- value = finish / self.period_time / 1000 / 1000
+ if lat_time <= 0:
+ return 0
+ # convert ns to us
+ lat_time = lat_time / 1000
+ # convert s to us
+ period_time = self.period_time * 1000 * 1000
+ value = lat_time / period_time
if value.is_integer():
return int(value)
else:
@@ -141,6 +147,8 @@ class CollectIo():
with open(io_dump_file, 'r') as file:
for line in file:
count += line.count('.op=' + Io_Category[category])
+ if count > 0:
+ logging.info(f"io_dump info : {disk_name}, {stage}, {category}, {count}")
except FileNotFoundError:
logging.error("The file %s does not exist.", io_dump_file)
return count
@@ -223,6 +231,8 @@ class CollectIo():
if self.get_blk_io_hierarchy(disk_name, stage_list) < 0:
continue
self.append_period_lat(disk_name, stage_list)
+
+ logging.debug(f"no-lock collect data : {IO_GLOBAL_DATA}")
elapsed_time = time.time() - start_time
sleep_time = self.period_time - elapsed_time
diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py
index 3e2cf4c..31bf11b 100644
--- a/src/python/sentryCollector/collect_plugin.py
+++ b/src/python/sentryCollector/collect_plugin.py
@@ -16,6 +16,7 @@ import json
import socket
import logging
import re
+import os
COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock"
@@ -58,6 +59,8 @@ class ResultMessage():
RESULT_EXCEED_LIMIT = 4 # the parameter length exceeds the limit.
RESULT_PARSE_FAILED = 5 # parse failed
RESULT_INVALID_CHAR = 6 # invalid char
+ RESULT_DISK_NOEXIST = 7 # disk does not exist
+ RESULT_DISK_TYPE_MISMATCH = 8 # disk type mismatch
Result_Messages = {
ResultMessage.RESULT_SUCCEED: "Succeed",
@@ -66,9 +69,15 @@ Result_Messages = {
ResultMessage.RESULT_INVALID_LENGTH: "Invalid parameter length",
ResultMessage.RESULT_EXCEED_LIMIT: "The parameter length exceeds the limit",
ResultMessage.RESULT_PARSE_FAILED: "Parse failed",
- ResultMessage.RESULT_INVALID_CHAR: "Invalid char"
+ ResultMessage.RESULT_INVALID_CHAR: "Invalid char",
+ ResultMessage.RESULT_DISK_NOEXIST: "Disk does not exist",
+ ResultMessage.RESULT_DISK_TYPE_MISMATCH: "Disk type mismatch"
}
+class DiskType():
+ TYPE_NVME_SSD = 0
+ TYPE_SATA_SSD = 1
+ TYPE_SATA_HDD = 2
def client_send_and_recv(request_data, data_str_len, protocol):
"""client socket send and recv message"""
@@ -273,3 +282,60 @@ def inter_get_io_data(period, disk_list, stage, iotype):
result['message'] = result_message
return result
+def get_disk_type(disk):
+ result = {}
+ result['ret'] = ResultMessage.RESULT_UNKNOWN
+ result['message'] = ""
+ if not disk:
+ logging.error("param is invalid")
+ result['ret'] = ResultMessage.RESULT_NOT_PARAM
+ return result
+ if len(disk) <= 0 or len(disk) > LIMIT_DISK_CHAR_LEN:
+ logging.error("invalid disk length")
+ result['ret'] = ResultMessage.RESULT_INVALID_LENGTH
+ return result
+ pattern = r'^[a-zA-Z0-9_-]+$'
+ if not re.match(pattern, disk):
+ logging.error("%s is invalid char", disk)
+ result['ret'] = ResultMessage.RESULT_INVALID_CHAR
+ return result
+
+ base_path = '/sys/block'
+ all_disk = []
+ for disk_name in os.listdir(base_path):
+ all_disk.append(disk_name)
+
+ if disk not in all_disk:
+ logging.error("disk %s is not exist", disk)
+ result['ret'] = ResultMessage.RESULT_DISK_NOEXIST
+ return result
+
+ if disk[0:4] == "nvme":
+ result['message'] = str(DiskType.TYPE_NVME_SSD)
+ elif disk[0:2] == "sd":
+ disk_file = '/sys/block/{}/queue/rotational'.format(disk)
+ try:
+ with open(disk_file, 'r') as file:
+ num = int(file.read())
+ if num == 1:
+ result['message'] = str(DiskType.TYPE_SATA_HDD)
+ elif num == 0:
+ result['message'] = str(DiskType.TYPE_SATA_SSD)
+ else:
+ logging.error("disk %s is not support, num = %d", disk, num)
+ result['ret'] = ResultMessage.RESULT_DISK_TYPE_MISMATCH
+ return result
+ except FileNotFoundError:
+ logging.error("The disk_file [%s] does not exist", disk_file)
+ result['ret'] = ResultMessage.RESULT_DISK_NOEXIST
+ return result
+ except Exception as e:
+ logging.error("open disk_file %s happen an error: %s", disk_file, e)
+ return result
+ else:
+ logging.error("disk %s is not support", disk)
+ result['ret'] = ResultMessage.RESULT_DISK_TYPE_MISMATCH
+ return result
+
+ result['ret'] = ResultMessage.RESULT_SUCCEED
+ return result
\ No newline at end of file
--
2.33.0
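
Assuming the module is on the import path, the new helper could be exercised like this (the disk name is illustrative):

```python
from sentryCollector.collect_plugin import ResultMessage, get_disk_type

res = get_disk_type("sda")
if res['ret'] == ResultMessage.RESULT_SUCCEED:
    # 'message' carries a DiskType value as a string, e.g. "0" for an NVMe SSD
    print("disk type:", res['message'])
else:
    print("query failed, ret =", res['ret'])
```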

File diff suppressed because it is too large.


@@ -1,251 +0,0 @@
From a8418093bb37482da7ccaac0c950f2ed8d0ba2fa Mon Sep 17 00:00:00 2001
From: gaoruoshu <gaoruoshu@huawei.com>
Date: Thu, 10 Oct 2024 15:07:29 +0800
Subject: [PATCH] add log for improving maintainability
---
.../avg_block_io/avg_block_io.py | 4 +-
.../sentryPlugins/avg_block_io/module_conn.py | 57 ++++++++++-------
.../avg_block_io/stage_window.py | 8 +++
.../sentryPlugins/avg_block_io/utils.py | 63 +++++++++++++++++--
4 files changed, 103 insertions(+), 29 deletions(-)
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
index 26a60c5..cf2ded3 100644
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
@@ -194,11 +194,11 @@ def init_io_win(io_dic, config, common_param):
if avg_lim_value and avg_time_value and tot_lim_value:
io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value)
- logging.debug("Successfully create {}-{}-{} latency window".format(disk_name, stage_name, rw))
+ logging.debug("Successfully create {}-{}-{}-latency window".format(disk_name, stage_name, rw))
if iodump_lim_value is not None:
io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value)
- logging.debug("Successfully create {}-{}-{} iodump window".format(disk_name, stage_name, rw))
+ logging.debug("Successfully create {}-{}-{}-iodump window".format(disk_name, stage_name, rw))
return io_data, io_avg_value
diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
index 2fc5a83..40b3fcc 100644
--- a/src/python/sentryPlugins/avg_block_io/module_conn.py
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
@@ -13,7 +13,7 @@ import logging
import sys
import time
-from .utils import is_abnormal
+from .utils import is_abnormal, get_win_data, log_slow_win
from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages
from syssentry.result import ResultLevel, report_result
from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR
@@ -66,36 +66,51 @@ def report_alarm_fail(alarm_info):
def process_report_data(disk_name, rw, io_data):
"""check abnormal window and report to xalarm"""
- if not is_abnormal((disk_name, 'bio', rw), io_data):
+ abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data)
+ if not abnormal:
return
- msg = {"alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw}
+ msg = {
+ "alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw,
+ "reason": "unknown", "block_stack": "bio", "alarm_type": abnormal_list,
+ "details": get_win_data(disk_name, rw, io_data)
+ }
+ # io press
ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq']
for stage_name in ctrl_stage:
- if is_abnormal((disk_name, stage_name, rw), io_data):
- msg["reason"] = "IO press slow"
- msg["block_stack"] = f"bio,{stage_name}"
- logging.warning("{} - {} report IO press slow".format(disk_name, rw))
- xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
- return
-
- if is_abnormal((disk_name, 'rq_driver', rw), io_data):
+ abnormal, abnormal_list = is_abnormal((disk_name, stage_name, rw), io_data)
+ if not abnormal:
+ continue
+ msg["reason"] = "IO press"
+ msg["block_stack"] = f"bio,{stage_name}"
+ msg["alarm_type"] = abnormal_list
+ log_slow_win(msg, "IO press")
+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
+ return
+
+ # driver slow
+ abnormal, abnormal_list = is_abnormal((disk_name, 'rq_driver', rw), io_data)
+ if abnormal:
msg["reason"] = "driver slow"
msg["block_stack"] = "bio,rq_driver"
- logging.warning("{} - {} report driver slow".format(disk_name, rw))
+ msg["alarm_type"] = abnormal_list
+ log_slow_win(msg, "driver slow")
xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
return
+ # kernel slow
kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue']
for stage_name in kernel_stage:
- if is_abnormal((disk_name, stage_name, rw), io_data):
- msg["reason"] = "kernel slow"
- msg["block_stack"] = f"bio,{stage_name}"
- logging.warning("{} - {} report kernel slow".format(disk_name, rw))
- xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
- return
- msg["reason"] = "unknown"
- msg["block_stack"] = "bio"
- logging.warning("{} - {} report UNKNOWN slow".format(disk_name, rw))
+ abnormal, abnormal_list = is_abnormal((disk_name, stage_name, rw), io_data)
+ if not abnormal:
+ continue
+ msg["reason"] = "kernel slow"
+ msg["block_stack"] = f"bio,{stage_name}"
+ msg["alarm_type"] = abnormal_list
+ log_slow_win(msg, "kernel slow")
+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
+ return
+
+ log_slow_win(msg, "unknown")
xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py
index 9b0ce79..5113782 100644
--- a/src/python/sentryPlugins/avg_block_io/stage_window.py
+++ b/src/python/sentryPlugins/avg_block_io/stage_window.py
@@ -14,6 +14,11 @@ class AbnormalWindowBase:
self.window_size = window_size
self.window_threshold = window_threshold
self.abnormal_window = [False] * window_size
+ self.window_data = [-1] * window_size
+
+ def append_new_data(self, ab_res):
+ self.window_data.pop(0)
+ self.window_data.append(ab_res)
def append_new_period(self, ab_res, avg_val=0):
self.abnormal_window.pop(0)
@@ -25,6 +30,9 @@ class AbnormalWindowBase:
def is_abnormal_window(self):
return sum(self.abnormal_window) > self.window_threshold
+ def window_data_to_string(self):
+ return ",".join(str(x) for x in self.window_data)
+
class IoWindow(AbnormalWindowBase):
def __init__(self, window_size=10, window_threshold=7, abnormal_multiple=5, abnormal_multiple_lim=30, abnormal_time=40):
diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py
index 2de9a46..3b7f027 100644
--- a/src/python/sentryPlugins/avg_block_io/utils.py
+++ b/src/python/sentryPlugins/avg_block_io/utils.py
@@ -65,15 +65,32 @@ def set_nested_value(data, keys, value):
return True
+def get_win_data(disk_name, rw, io_data):
+ """get latency and iodump win data"""
+ latency = ''
+ iodump = ''
+ for stage_name in io_data[disk_name]:
+ if 'latency' in io_data[disk_name][stage_name][rw]:
+ latency_list = io_data[disk_name][stage_name][rw]['latency'].window_data_to_string()
+ latency += f'{stage_name}: [{latency_list}], '
+ if 'iodump' in io_data[disk_name][stage_name][rw]:
+ iodump_list = io_data[disk_name][stage_name][rw]['iodump'].window_data_to_string()
+ iodump += f'{stage_name}: [{iodump_list}], '
+ return {"latency": latency[:-2], "iodump": iodump[:-2]}
+
+
def is_abnormal(io_key, io_data):
"""check if latency and iodump win abnormal"""
+ abnormal_list = ''
for key in ['latency', 'iodump']:
all_keys = get_nested_value(io_data, io_key)
if all_keys and key in all_keys:
win = get_nested_value(io_data, io_key + (key,))
if win and win.is_abnormal_window():
- return True
- return False
+ abnormal_list += key + ', '
+ if not abnormal_list:
+ return False, abnormal_list
+ return True, abnormal_list[:-2]
def update_io_avg(old_avg, period_value, win_size):
@@ -87,8 +104,8 @@ def update_io_avg(old_avg, period_value, win_size):
return [new_avg_value, new_avg_count]
-def update_io_data(old_avg, period_value, win_size, io_data, io_key):
- """update data of latency and iodump window"""
+def update_io_period(old_avg, period_value, io_data, io_key):
+ """update period of latency and iodump window"""
all_wins = get_nested_value(io_data, io_key)
if all_wins and "latency" in all_wins:
io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_period(period_value[0], old_avg[AVG_VALUE])
@@ -96,20 +113,54 @@ def update_io_data(old_avg, period_value, win_size, io_data, io_key):
io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_period(period_value[1])
+def update_io_data(period_value, io_data, io_key):
+ """update data of latency and iodump window"""
+ all_wins = get_nested_value(io_data, io_key)
+ if all_wins and "latency" in all_wins:
+ io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_data(period_value[0])
+ if all_wins and "iodump" in all_wins:
+ io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_data(period_value[1])
+
+
+def log_abnormal_period(old_avg, period_value, io_data, io_key):
+ """record log of abnormal period"""
+ all_wins = get_nested_value(io_data, io_key)
+ if all_wins and "latency" in all_wins:
+ if all_wins["latency"].is_abnormal_period(period_value[0], old_avg[AVG_VALUE]):
+ logging.info(f"[abnormal_period] disk: {io_key[0]}, stage: {io_key[1]}, iotype: {io_key[2]}, "
+ f"type: latency, avg: {round(old_avg[AVG_VALUE], 3)}, curr_val: {period_value[0]}")
+ if all_wins and "iodump" in all_wins:
+ if all_wins["iodump"].is_abnormal_period(period_value[1]):
+ logging.info(f"[abnormal_period] disk: {io_key[0]}, stage: {io_key[1]}, iotype: {io_key[2]}, "
+ f"type: iodump, curr_val: {period_value[1]}")
+
+
+def log_slow_win(msg, reason):
+ """record log of slow win"""
+ logging.warning(f"[SLOW IO] disk: {msg['driver_name']}, stage: {msg['block_stack']}, "
+ f"iotype: {msg['io_type']}, type: {msg['alarm_type']}, reason: {reason}")
+ logging.info(f"latency: {msg['details']['latency']}")
+ logging.info(f"iodump: {msg['details']['iodump']}")
+
+
def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data):
"""update avg and check abonrmal, return true if win_size full"""
period_value = get_nested_value(data, io_key)
old_avg = get_nested_value(io_avg_value, io_key)
# update the avg data
+ update_io_data(period_value, io_data, io_key)
if old_avg[AVG_COUNT] < win_size:
set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size))
return False
+ # log the abnormal-period data
+ log_abnormal_period(old_avg, period_value, io_data, io_key)
+
# update the window data -- detect abnormal periods
- update_io_data(old_avg, period_value, win_size, io_data, io_key)
+ update_io_period(old_avg, period_value, io_data, io_key)
all_wins = get_nested_value(io_data, io_key)
- if all_wins and 'latency' not in all_wins:
+ if not all_wins or 'latency' not in all_wins:
return True
period = get_nested_value(io_data, io_key + ("latency",))
if period and period.is_abnormal_period(period_value[0], old_avg[AVG_VALUE]):
--
2.27.0
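
The windows now record the raw per-period values next to the abnormal flags, so a slow-I/O alarm can carry the data that triggered it. A reduced sketch of that structure (simplified from the patch's AbnormalWindowBase):

```python
class Window:
    """Keep the last `size` period values plus their abnormal flags."""

    def __init__(self, size=30, threshold=6):
        self.threshold = threshold
        self.flags = [False] * size   # one abnormal flag per period
        self.data = [-1] * size       # raw value per period, for alarm details

    def append(self, value, abnormal):
        self.data.pop(0)
        self.data.append(value)
        self.flags.pop(0)
        self.flags.append(abnormal)

    def is_abnormal_window(self):
        # abnormal when more than `threshold` of the stored periods were bad
        return sum(self.flags) > self.threshold

    def window_data_to_string(self):
        return ",".join(str(x) for x in self.data)
```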


@@ -1,24 +0,0 @@
From ef3aad0ca57d35b0a4fe29a0205596021bae0227 Mon Sep 17 00:00:00 2001
From: caixiaomeng <caixiaomeng2@.com>
Date: Fri, 11 Oct 2024 17:59:54 +0800
Subject: [PATCH] add log for xalarm when sending msg and clean invalid client
socket
---
src/python/xalarm/xalarm_transfer.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/python/xalarm/xalarm_transfer.py b/src/python/xalarm/xalarm_transfer.py
index 42137d8..9e867cc 100644
--- a/src/python/xalarm/xalarm_transfer.py
+++ b/src/python/xalarm/xalarm_transfer.py
@@ -117,4 +117,5 @@ def transmit_alarm(server_sock, epoll, fd_to_socket, bin_data):
epoll.unregister(fileno)
fd_to_socket[fileno].close()
del fd_to_socket[fileno]
+ logging.info(f"cleaned up connection {fileno} for client lost connection.")
--
2.27.0


@@ -1,522 +0,0 @@
From c1ab550a3f817826ac6f279de97e6d3820901275 Mon Sep 17 00:00:00 2001
From: gaoruoshu <gaoruoshu@huawei.com>
Date: Fri, 27 Sep 2024 14:10:18 +0800
Subject: [PATCH] add log level and change log format
---
config/collector.conf | 5 ++-
config/inspect.conf | 5 ++-
config/plugins/avg_block_io.ini | 5 ++-
config/xalarm.conf | 3 ++
src/python/sentryCollector/collect_config.py | 29 ++++++++++++++++
src/python/sentryCollector/collect_io.py | 15 ++-------
src/python/sentryCollector/collect_plugin.py | 32 +++++++++---------
src/python/sentryCollector/collectd.py | 6 ++--
.../avg_block_io/avg_block_io.py | 7 ++--
.../sentryPlugins/avg_block_io/utils.py | 32 ++++++++++++++++++
src/python/syssentry/sentry_config.py | 28 ++++++++++++++++
src/python/syssentry/syssentry.py | 7 ++--
src/python/xalarm/xalarm_config.py | 33 +++++++++++++++++--
src/python/xalarm/xalarm_daemon.py | 7 ++--
14 files changed, 172 insertions(+), 42 deletions(-)
diff --git a/config/collector.conf b/config/collector.conf
index 9baa086..56b0ed1 100644
--- a/config/collector.conf
+++ b/config/collector.conf
@@ -4,4 +4,7 @@ modules=io
[io]
period_time=1
max_save=10
-disk=default
\ No newline at end of file
+disk=default
+
+[log]
+level=info
\ No newline at end of file
diff --git a/config/inspect.conf b/config/inspect.conf
index 071cca1..f451d9e 100644
--- a/config/inspect.conf
+++ b/config/inspect.conf
@@ -1,2 +1,5 @@
[inspect]
-Interval=3
\ No newline at end of file
+Interval=3
+
+[log]
+level=info
diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini
index bc33dde..858db18 100644
--- a/config/plugins/avg_block_io.ini
+++ b/config/plugins/avg_block_io.ini
@@ -1,8 +1,11 @@
+[log]
+level=info
+
[common]
disk=default
stage=default
iotype=read,write
-period_time=1
+period_time=1
[algorithm]
win_size=30
diff --git a/config/xalarm.conf b/config/xalarm.conf
index 14c6d39..323d2dd 100644
--- a/config/xalarm.conf
+++ b/config/xalarm.conf
@@ -1,2 +1,5 @@
[filter]
id_mask = 1001-1128
+
+[log]
+level=info
diff --git a/src/python/sentryCollector/collect_config.py b/src/python/sentryCollector/collect_config.py
index 0fdd9f0..5aa38ec 100644
--- a/src/python/sentryCollector/collect_config.py
+++ b/src/python/sentryCollector/collect_config.py
@@ -32,6 +32,35 @@ CONF_IO_PERIOD_TIME_DEFAULT = 1
CONF_IO_MAX_SAVE_DEFAULT = 10
CONF_IO_DISK_DEFAULT = "default"
+# log
+CONF_LOG = 'log'
+CONF_LOG_LEVEL = 'level'
+LogLevel = {
+ "debug": logging.DEBUG,
+ "info": logging.INFO,
+ "warning": logging.WARNING,
+ "error": logging.ERROR,
+ "critical": logging.CRITICAL
+}
+
+
+def get_log_level(filename=COLLECT_CONF_PATH):
+ if not os.path.exists(filename):
+ return logging.INFO
+
+ try:
+ config = configparser.ConfigParser()
+ config.read(filename)
+ if not config.has_option(CONF_LOG, CONF_LOG_LEVEL):
+ return logging.INFO
+ log_level = config.get(CONF_LOG, CONF_LOG_LEVEL)
+ if log_level.lower() in LogLevel:
+ return LogLevel.get(log_level.lower())
+ return logging.INFO
+ except configparser.Error:
+ return logging.INFO
+
+
class CollectConfig:
def __init__(self, filename=COLLECT_CONF_PATH):
diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py
index 9c8dae7..019d174 100644
--- a/src/python/sentryCollector/collect_io.py
+++ b/src/python/sentryCollector/collect_io.py
@@ -163,18 +163,6 @@ class CollectIo():
logging.error("An error occurred2: %s", e)
return column_names
- def task_loop(self):
- if self.stop_event.is_set():
- logging.info("collect io thread exit")
- return
-
- for disk_name, stage_list in self.disk_map_stage.items():
- if self.get_blk_io_hierarchy(disk_name, stage_list) < 0:
- continue
- self.append_period_lat(disk_name, stage_list)
-
- threading.Timer(self.period_time, self.task_loop).start()
-
def is_kernel_avaliable(self):
base_path = '/sys/kernel/debug/block'
all_disk = []
@@ -191,6 +179,9 @@ class CollectIo():
if file_name == 'stats':
all_disk.append(disk_name)
+ if self.loop_all:
+ self.disk_list = all_disk
+
for disk_name in self.disk_list:
if not self.loop_all and disk_name not in all_disk:
logging.warning("the %s disk not exist!", disk_name)
diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py
index 1faa5e3..3e2cf4c 100644
--- a/src/python/sentryCollector/collect_plugin.py
+++ b/src/python/sentryCollector/collect_plugin.py
@@ -75,14 +75,14 @@ def client_send_and_recv(request_data, data_str_len, protocol):
try:
client_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
except socket.error:
- print("collect_plugin: client create socket error")
+ logging.error("collect_plugin: client create socket error")
return None
try:
client_socket.connect(COLLECT_SOCKET_PATH)
except OSError:
client_socket.close()
- print("collect_plugin: client connect error")
+ logging.error("collect_plugin: client connect error")
return None
req_data_len = len(request_data)
@@ -94,23 +94,23 @@ def client_send_and_recv(request_data, data_str_len, protocol):
res_data = res_data.decode()
except (OSError, UnicodeError):
client_socket.close()
- print("collect_plugin: client communicate error")
+ logging.error("collect_plugin: client communicate error")
return None
res_magic = res_data[:CLT_MSG_MAGIC_LEN]
if res_magic != "RES":
- print("res msg format error")
+ logging.error("res msg format error")
return None
protocol_str = res_data[CLT_MSG_MAGIC_LEN:CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN]
try:
protocol_id = int(protocol_str)
except ValueError:
- print("recv msg protocol id is invalid %s", protocol_str)
+ logging.error("recv msg protocol id is invalid %s", protocol_str)
return None
if protocol_id >= ClientProtocol.PRO_END:
- print("protocol id is invalid")
+ logging.error("protocol id is invalid")
return None
try:
@@ -119,7 +119,7 @@ def client_send_and_recv(request_data, data_str_len, protocol):
res_msg_data = res_msg_data.decode()
return res_msg_data
except (OSError, ValueError, UnicodeError):
- print("collect_plugin: client recv res msg error")
+ logging.error("collect_plugin: client recv res msg error")
finally:
client_socket.close()
@@ -128,30 +128,30 @@ def client_send_and_recv(request_data, data_str_len, protocol):
def validate_parameters(param, len_limit, char_limit):
ret = ResultMessage.RESULT_SUCCEED
if not param:
- print("param is invalid")
+ logging.error("param is invalid, param = %s", param)
ret = ResultMessage.RESULT_NOT_PARAM
return [False, ret]
if not isinstance(param, list):
- print(f"{param} is not list type.")
+ logging.error("%s is not list type.", param)
ret = ResultMessage.RESULT_NOT_PARAM
return [False, ret]
if len(param) <= 0:
- print(f"{param} length is 0.")
+ logging.error("%s length is 0.", param)
ret = ResultMessage.RESULT_INVALID_LENGTH
return [False, ret]
pattern = r'^[a-zA-Z0-9_-]+$'
for info in param:
if not re.match(pattern, info):
- print(f"{info} is invalid char")
+ logging.error("%s is invalid char", info)
ret = ResultMessage.RESULT_INVALID_CHAR
return [False, ret]
# length of len_limit is exceeded, keep len_limit
if len(param) > len_limit:
- print(f"{param} length more than {len_limit}, keep the first {len_limit}")
+ logging.error("%s length more than %d, keep the first %d", param, len_limit, len_limit)
param[:] = param[0:len_limit]
# only keep elements under the char_limit length
@@ -202,13 +202,13 @@ def inter_is_iocollect_valid(period, disk_list=None, stage=None):
request_message = json.dumps(req_msg_struct)
result_message = client_send_and_recv(request_message, CLT_MSG_LEN_LEN, ClientProtocol.IS_IOCOLLECT_VALID)
if not result_message:
- print("collect_plugin: client_send_and_recv failed")
+ logging.error("collect_plugin: client_send_and_recv failed")
return result
try:
json.loads(result_message)
except json.JSONDecodeError:
- print("is_iocollect_valid: json decode error")
+ logging.error("is_iocollect_valid: json decode error")
result['ret'] = ResultMessage.RESULT_PARSE_FAILED
return result
@@ -260,12 +260,12 @@ def inter_get_io_data(period, disk_list, stage, iotype):
request_message = json.dumps(req_msg_struct)
result_message = client_send_and_recv(request_message, CLT_MSG_LEN_LEN, ClientProtocol.GET_IO_DATA)
if not result_message:
- print("collect_plugin: client_send_and_recv failed")
+ logging.error("collect_plugin: client_send_and_recv failed")
return result
try:
json.loads(result_message)
except json.JSONDecodeError:
- print("get_io_data: json decode error")
+ logging.error("get_io_data: json decode error")
result['ret'] = ResultMessage.RESULT_PARSE_FAILED
return result
diff --git a/src/python/sentryCollector/collectd.py b/src/python/sentryCollector/collectd.py
index d9d8862..33f4b04 100644
--- a/src/python/sentryCollector/collectd.py
+++ b/src/python/sentryCollector/collectd.py
@@ -26,7 +26,7 @@ import threading
from .collect_io import CollectIo
from .collect_server import CollectServer
-from .collect_config import CollectConfig
+from .collect_config import CollectConfig, get_log_level
SENTRY_RUN_DIR = "/var/run/sysSentry"
COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock"
@@ -57,7 +57,9 @@ def main():
os.mkdir(SENTRY_RUN_DIR)
os.chmod(SENTRY_RUN_DIR, mode=SENTRY_RUN_DIR_PERM)
- logging.basicConfig(filename=COLLECT_LOG_FILE, level=logging.INFO)
+ log_level = get_log_level()
+ log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
+ logging.basicConfig(filename=COLLECT_LOG_FILE, level=log_level, format=log_format)
os.chmod(COLLECT_LOG_FILE, 0o600)
try:
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
index ac35be2..b6b3b28 100644
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
@@ -15,7 +15,7 @@ import time
from .stage_window import IoWindow, IoDumpWindow
from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler
-from .utils import update_avg_and_check_abnormal
+from .utils import update_avg_and_check_abnormal, get_log_level
CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini"
@@ -283,7 +283,10 @@ def main():
signal.signal(signal.SIGINT, sig_handler)
signal.signal(signal.SIGTERM, sig_handler)
- logging.basicConfig(level=logging.INFO)
+ log_level = get_log_level(CONFIG_FILE)
+ log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
+
+ logging.basicConfig(level=log_level, format=log_format)
# initialize config reading
config = configparser.ConfigParser(comment_prefixes=('#', ';'))
diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py
index 54ed080..2de9a46 100644
--- a/src/python/sentryPlugins/avg_block_io/utils.py
+++ b/src/python/sentryPlugins/avg_block_io/utils.py
@@ -8,9 +8,41 @@
# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
# PURPOSE.
# See the Mulan PSL v2 for more details.
+import configparser
+import logging
+import os
+
AVG_VALUE = 0
AVG_COUNT = 1
+CONF_LOG = 'log'
+CONF_LOG_LEVEL = 'level'
+LogLevel = {
+ "debug": logging.DEBUG,
+ "info": logging.INFO,
+ "warning": logging.WARNING,
+ "error": logging.ERROR,
+ "critical": logging.CRITICAL
+}
+
+
+def get_log_level(filename):
+ if not os.path.exists(filename):
+ return logging.INFO
+
+ try:
+ config = configparser.ConfigParser()
+ config.read(filename)
+ if not config.has_option(CONF_LOG, CONF_LOG_LEVEL):
+ return logging.INFO
+ log_level = config.get(CONF_LOG, CONF_LOG_LEVEL)
+
+ if log_level.lower() in LogLevel:
+ return LogLevel.get(log_level.lower())
+ return logging.INFO
+ except configparser.Error:
+ return logging.INFO
+
def get_nested_value(data, keys):
"""get data from nested dict"""
diff --git a/src/python/syssentry/sentry_config.py b/src/python/syssentry/sentry_config.py
index a0e7b79..1169887 100644
--- a/src/python/syssentry/sentry_config.py
+++ b/src/python/syssentry/sentry_config.py
@@ -21,6 +21,34 @@ import sys
DEFAULT_INSPECT_DELAY = 3
INSPECT_CONF_PATH = "/etc/sysSentry/inspect.conf"
+CONF_LOG = 'log'
+CONF_LOG_LEVEL = 'level'
+LogLevel = {
+ "debug": logging.DEBUG,
+ "info": logging.INFO,
+ "warning": logging.WARNING,
+ "error": logging.ERROR,
+ "critical": logging.CRITICAL
+}
+
+
+def get_log_level(filename=INSPECT_CONF_PATH):
+ if not os.path.exists(filename):
+ return logging.INFO
+
+ try:
+ config = configparser.ConfigParser()
+ config.read(filename)
+ if not config.has_option(CONF_LOG, CONF_LOG_LEVEL):
+ return logging.INFO
+ log_level = config.get(CONF_LOG, CONF_LOG_LEVEL)
+
+ if log_level.lower() in LogLevel:
+ return LogLevel.get(log_level.lower())
+ return logging.INFO
+ except configparser.Error:
+ return logging.INFO
+
class SentryConfig:
"""
diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py
index 776971f..9ef0203 100644
--- a/src/python/syssentry/syssentry.py
+++ b/src/python/syssentry/syssentry.py
@@ -23,7 +23,7 @@ import fcntl
import select
-from .sentry_config import SentryConfig
+from .sentry_config import SentryConfig, get_log_level
from .task_map import TasksMap
from .global_values import SENTRY_RUN_DIR, CTL_SOCKET_PATH, SENTRY_RUN_DIR_PERM
@@ -563,7 +563,10 @@ def main():
os.mkdir(SENTRY_RUN_DIR)
os.chmod(SENTRY_RUN_DIR, mode=SENTRY_RUN_DIR_PERM)
- logging.basicConfig(filename=SYSSENTRY_LOG_FILE, level=logging.INFO)
+ log_level = get_log_level()
+ log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
+
+ logging.basicConfig(filename=SYSSENTRY_LOG_FILE, level=log_level, format=log_format)
os.chmod(SYSSENTRY_LOG_FILE, 0o600)
if not chk_and_set_pidfile():
diff --git a/src/python/xalarm/xalarm_config.py b/src/python/xalarm/xalarm_config.py
index 8e56d10..754a816 100644
--- a/src/python/xalarm/xalarm_config.py
+++ b/src/python/xalarm/xalarm_config.py
@@ -15,9 +15,10 @@ Create: 2023-11-02
"""
import re
+import os
import dataclasses
import logging
-from configparser import ConfigParser
+import configparser
MAIN_CONFIG_PATH = '/etc/sysSentry/xalarm.conf'
@@ -27,6 +28,34 @@ MIN_ID_NUMBER = 1001
MAX_ID_NUMBER = 1128
MAX_ID_MASK_CAPACITY = 128
+# log
+CONF_LOG = 'log'
+CONF_LOG_LEVEL = 'level'
+LogLevel = {
+ "debug": logging.DEBUG,
+ "info": logging.INFO,
+ "warning": logging.WARNING,
+ "error": logging.ERROR,
+ "critical": logging.CRITICAL
+}
+
+
+def get_log_level(filename=MAIN_CONFIG_PATH):
+ if not os.path.exists(filename):
+ return logging.INFO
+
+ try:
+ config = configparser.ConfigParser()
+ config.read(filename)
+ if not config.has_option(CONF_LOG, CONF_LOG_LEVEL):
+ return logging.INFO
+ log_level = config.get(CONF_LOG, CONF_LOG_LEVEL)
+ if log_level.lower() in LogLevel:
+ return LogLevel.get(log_level.lower())
+ return logging.INFO
+ except configparser.Error:
+ return logging.INFO
+
@dataclasses.dataclass
class AlarmConfig:
@@ -106,7 +135,7 @@ def config_init():
"""
alarm_config = AlarmConfig()
- cfg = ConfigParser()
+ cfg = configparser.ConfigParser()
cfg.read(MAIN_CONFIG_PATH)
id_mask = parse_id_mask(cfg)
diff --git a/src/python/xalarm/xalarm_daemon.py b/src/python/xalarm/xalarm_daemon.py
index 00e8886..3ab211c 100644
--- a/src/python/xalarm/xalarm_daemon.py
+++ b/src/python/xalarm/xalarm_daemon.py
@@ -21,7 +21,7 @@ import signal
import fcntl
import socket
-from .xalarm_config import config_init
+from .xalarm_config import config_init, get_log_level
from .xalarm_server import server_loop, SOCK_FILE
ALARM_DIR = "/var/run/xalarm"
@@ -120,9 +120,10 @@ def alarm_process_create():
os.mkdir(ALARM_DIR)
os.chmod(ALARM_DIR, ALARM_DIR_PERMISSION)
+ log_level = get_log_level()
+ log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
- logging.basicConfig(filename=ALARM_LOGFILE, level=logging.INFO,
- format='%(asctime)s|%(levelname)s| %(message)s')
+ logging.basicConfig(filename=ALARM_LOGFILE, level=log_level, format=log_format)
signal.signal(signal.SIGTERM, signal_handler)
--
2.23.0
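
All four copies of get_log_level behave identically: a missing file, a missing [log] section, or an unrecognized value falls back to logging.INFO. Wiring one of them up looks roughly like this (the log-file path is illustrative):

```python
import logging
from sentryCollector.collect_config import get_log_level

level = get_log_level("/etc/sysSentry/collector.conf")  # INFO when absent/invalid
logging.basicConfig(
    filename="/var/log/sysSentry/collector.log",        # illustrative path
    level=level,
    format="%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s",
)
```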


@@ -1,104 +0,0 @@
From 0a4bd4097690bee7250676a0c262a830c7a8fbcf Mon Sep 17 00:00:00 2001
From: jinsaihang <jinsaihang@h-partners.com>
Date: Fri, 11 Oct 2024 15:35:43 +0800
Subject: [PATCH] add time_range, alarm_id and alarm_clear_time parameter validation
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
---
sysSentry-1.0.2/src/python/syssentry/alarm.py | 19 +++++++++++++++++++
.../src/python/syssentry/load_mods.py | 6 ++----
.../src/python/syssentry/sentryctl | 4 +++-
3 files changed, 24 insertions(+), 5 deletions(-)
diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
index d5337d3..43c1065 100644
--- a/src/python/syssentry/alarm.py
+++ b/src/python/syssentry/alarm.py
@@ -18,6 +18,7 @@ from datetime import datetime
import time
import logging
import json
+import sys
from xalarm.register_xalarm import xalarm_register,xalarm_getid,xalarm_getlevel,xalarm_gettype,xalarm_gettime,xalarm_getdesc
from xalarm.xalarm_api import Xalarm
@@ -41,9 +42,15 @@ id_base = 1001
clientId = -1
MILLISECONDS_UNIT_SECONDS = 1000
+MAX_NUM_OF_ALARM_ID = 128
+MIN_ALARM_ID = 1001
+MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1)
def update_alarm_list(alarm_info: Xalarm):
alarm_id = xalarm_getid(alarm_info)
+ if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID:
+ logging.warnning(f"Invalid alarm_id {alarm_id}")
+ return
timestamp = xalarm_gettime(alarm_info)
if not timestamp:
logging.error("Retrieve timestamp failed")
@@ -77,7 +84,19 @@ def alarm_register():
logging.info(f"alarm_register: {task_name} is registered")
task = TasksMap.tasks_dict[task_type][task_name]
alarm_id = task.alarm_id
+ if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID:
+ logging.warnning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm")
+ continue
alarm_clear_time = task.alarm_clear_time
+ try:
+ alarm_clear_time = int(alarm_clear_time)
+ if alarm_clear_time <= 0:
+ raise ValueError("Not a positive integer")
+ if alarm_clear_time > sys.maxsize:
+ raise ValueError("Exceeds maximum value for int")
+ except (ValueError, OverflowError, TypeError) as e:
+ logging.warnning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm")
+ continue
alarm_list_dict[alarm_id] = []
task_alarm_id_dict[task_name] = alarm_id
if alarm_id not in alarm_id_clear_time_dict:
diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py
index ae05e57..7daf17d 100644
--- a/src/python/syssentry/load_mods.py
+++ b/src/python/syssentry/load_mods.py
@@ -203,11 +203,9 @@ def parse_mod_conf(mod_name, mod_conf):
if not (MIN_ALARM_ID <= task.alarm_id <= MAX_ALARM_ID):
raise ValueError("Invalid alarm_id")
except ValueError:
- task.alarm_id = -1
- logging.warning("Invalid alarm_id, set to -1")
+ logging.warning("Invalid alarm_id")
except configparser.NoOptionError:
- task.alarm_id = -1
- logging.warning("Unset alarm_id and alarm_clear_time, use -1 and 15s as default")
+ logging.warning("Unset alarm_clear_time, use 15s as default")
if CONF_ONSTART in mod_conf.options(CONF_TASK):
is_onstart = (mod_conf.get(CONF_TASK, CONF_ONSTART) == 'yes')
diff --git a/src/python/syssentry/sentryctl b/src/python/syssentry/sentryctl
index 3de93d0..c2e3cef 100644
--- a/src/python/syssentry/sentryctl
+++ b/src/python/syssentry/sentryctl
@@ -136,7 +136,7 @@ if __name__ == '__main__':
parser_get_result.add_argument('task_name')
parser_get_alarm = subparsers.add_parser('get_alarm', help='get task alarm')
parser_get_alarm.add_argument('task_name')
- parser_get_alarm.add_argument('-s', '--time_range', type=str, default=DEFAULT_ALARM_TIME_RANGE, help='Specified time range')
+ parser_get_alarm.add_argument('-s', '--time_range', type=int, default=DEFAULT_ALARM_TIME_RANGE, help='Specified time range')
parser_get_alarm.add_argument('-d', '--detailed', action='store_true', help='Print Detailed Information')
parser_list = subparsers.add_parser('list', help='show all loaded task mod')
@@ -153,6 +153,8 @@ if __name__ == '__main__':
elif client_args.cmd_type == 'get_result':
req_msg_struct = {"type": "get_result", "data": client_args.task_name}
elif client_args.cmd_type == 'get_alarm':
+ if not isinstance(client_args.time_range, int) or client_args.time_range <= 0:
+ print(f"time_range is not a positive integer: {client_args.time_range}")
req_msg_struct = {
"type": "get_alarm",
"data": {
--
2.27.0
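
Both new checks are plain range/type guards; extracted from the daemon they amount to the following (constants copied from the patch):

```python
import sys

MAX_NUM_OF_ALARM_ID = 128
MIN_ALARM_ID = 1001
MAX_ALARM_ID = MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1

def valid_alarm_id(alarm_id: int) -> bool:
    return MIN_ALARM_ID <= alarm_id <= MAX_ALARM_ID

def valid_alarm_clear_time(value) -> bool:
    try:
        value = int(value)
    except (TypeError, ValueError):
        return False
    return 0 < value <= sys.maxsize
```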


@@ -1,678 +0,0 @@
From a18ea2e94fef78334a56dce1ea3f67ee649732f3 Mon Sep 17 00:00:00 2001
From: PshySimon <caixiaomeng2@huawei.com>
Date: Thu, 26 Sep 2024 16:12:25 +0800
Subject: [PATCH] add pyxalarm and pySentryNotify, add multi-user support for
xalarmd and adapt libxalarm
---
src/libso/xalarm/register_xalarm.c | 41 ++----
src/libso/xalarm/register_xalarm.h | 10 +-
src/python/xalarm/register_xalarm.py | 192 +++++++++++++++++++++++++++
src/python/xalarm/sentry_notify.py | 71 ++++++++++
src/python/xalarm/xalarm_api.py | 18 ++-
src/python/xalarm/xalarm_server.py | 40 +++++-
src/python/xalarm/xalarm_transfer.py | 96 ++++++++++++--
7 files changed, 408 insertions(+), 60 deletions(-)
create mode 100644 src/python/xalarm/register_xalarm.py
create mode 100644 src/python/xalarm/sentry_notify.py
diff --git a/src/libso/xalarm/register_xalarm.c b/src/libso/xalarm/register_xalarm.c
index 152c078..21a419f 100644
--- a/src/libso/xalarm/register_xalarm.c
+++ b/src/libso/xalarm/register_xalarm.c
@@ -35,7 +35,7 @@
#define ALARM_SOCKET_PERMISSION 0700
#define TIME_UNIT_MILLISECONDS 1000
-#define MAX_PARAS_LEN 511
+#define MAX_PARAS_LEN 1023
#define MIN_ALARM_ID 1001
#define MAX_ALARM_ID (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1)
@@ -91,7 +91,7 @@ static int create_unix_socket(const char *path)
return -1;
}
- fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+ fd = socket(AF_UNIX, SOCK_STREAM, 0);
if (fd < 0) {
printf("socket failed:%s\n", strerror(errno));
return -1;
@@ -103,14 +103,6 @@ static int create_unix_socket(const char *path)
goto release_socket;
}
- if (access(PATH_REG_ALARM, F_OK) == 0) {
- ret = unlink(PATH_REG_ALARM);
- if (ret != 0) {
- printf("unlink register socket file failed\n");
- goto release_socket;
- }
- }
-
if (access(DIR_XALARM, F_OK) == -1) {
if (mkdir(DIR_XALARM, ALARM_DIR_PERMISSION) == -1) {
printf("mkdir %s failed\n", DIR_XALARM);
@@ -120,32 +112,22 @@ static int create_unix_socket(const char *path)
if (memset(&alarm_addr, 0, sizeof(alarm_addr)) == NULL) {
printf("create_unix_socket: memset alarm_addr failed, ret: %d\n", ret);
- goto remove_dir;
+ goto release_socket;
}
alarm_addr.sun_family = AF_UNIX;
strncpy(alarm_addr.sun_path, path, sizeof(alarm_addr.sun_path) - 1);
- if (bind(fd, (struct sockaddr *)&alarm_addr, sizeof(alarm_addr.sun_family) + strlen(alarm_addr.sun_path)) < 0) {
- printf("bind socket failed:%s\n", strerror(errno));
- goto remove_dir;
+ if (connect(fd, (struct sockaddr*)&alarm_addr, sizeof(alarm_addr)) == -1) {
+ printf("create_unix_socket: connect alarm_addr failed, ret: %d\n", ret);
+ goto release_socket;
}
if (chmod(path, ALARM_SOCKET_PERMISSION) < 0) {
printf("chmod %s failed: %s\n", path, strerror(errno));
- goto unlink_sockfile;
+ goto release_socket;
}
return fd;
-unlink_sockfile:
- ret = unlink(PATH_REG_ALARM);
- if (ret != 0) {
- printf("unlink register socket file failed\n");
- }
-remove_dir:
- ret = rmdir(DIR_XALARM);
- if (ret != 0) {
- printf("rmdir %s failed: %s\n", path, strerror(errno));
- }
release_socket:
(void)close(fd);
@@ -271,8 +253,6 @@ int xalarm_Register(alarm_callback_func callback, struct alarm_subscription_info
void xalarm_UnRegister(int client_id)
{
- int ret;
-
if (!g_register_info.is_registered) {
printf("%s: alarm has not registered\n", __func__);
return;
@@ -292,10 +272,6 @@ void xalarm_UnRegister(int client_id)
if (g_register_info.register_fd != -1) {
(void)close(g_register_info.register_fd);
g_register_info.register_fd = -1;
- ret = unlink(PATH_REG_ALARM);
- if (ret != 0) {
- printf("%s: unlink register socket file failed\n", __func__);
- }
}
memset(g_register_info.alarm_enable_bitmap, 0, MAX_NUM_OF_ALARM_ID * sizeof(char));
@@ -357,7 +333,7 @@ int xalarm_Report(unsigned short usAlarmId, unsigned char ucAlarmLevel,
struct sockaddr_un alarm_addr;
if ((usAlarmId < MIN_ALARM_ID || usAlarmId > MAX_ALARM_ID) ||
- (ucAlarmLevel < ALARM_LEVEL_FATAL || ucAlarmLevel > ALARM_LEVEL_DEBUG) ||
+ (ucAlarmLevel < MINOR_ALM || ucAlarmLevel > CRITICAL_ALM) ||
(ucAlarmType < ALARM_TYPE_OCCUR || ucAlarmType > ALARM_TYPE_RECOVER)) {
fprintf(stderr, "%s: alarm info invalid\n", __func__);
return -1;
@@ -666,3 +642,4 @@ int report_result(const char *task_name, enum RESULT_LEVEL result_level, const c
return RETURE_CODE_SUCCESS;
}
+
diff --git a/src/libso/xalarm/register_xalarm.h b/src/libso/xalarm/register_xalarm.h
index 1f26c6a..fef9482 100644
--- a/src/libso/xalarm/register_xalarm.h
+++ b/src/libso/xalarm/register_xalarm.h
@@ -11,7 +11,7 @@
#include <sys/time.h>
#include <stdbool.h>
-#define ALARM_INFO_MAX_PARAS_LEN 512
+#define ALARM_INFO_MAX_PARAS_LEN 1024
#define MAX_STRERROR_SIZE 1024
#define MAX_ALARM_TYEPS 1024
#define MIN_ALARM_ID 1001
@@ -19,11 +19,9 @@
#define MEMORY_ALARM_ID 1001
-#define ALARM_LEVEL_FATAL 1
-#define ALARM_LEVEL_ERROR 2
-#define ALARM_LEVEL_WARNING 3
-#define ALARM_LEVEL_INFO 4
-#define ALARM_LEVEL_DEBUG 5
+#define MINOR_ALM 1
+#define MAJOR_ALM 2
+#define CRITICAL_ALM 3
#define ALARM_TYPE_OCCUR 1
#define ALARM_TYPE_RECOVER 2
diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py
new file mode 100644
index 0000000..e58343d
--- /dev/null
+++ b/src/python/xalarm/register_xalarm.py
@@ -0,0 +1,192 @@
+import os
+import sys
+import socket
+import logging
+import threading
+import time
+import fcntl
+import inspect
+from struct import error as StructParseError
+
+from .xalarm_api import Xalarm, alarm_bin2stu
+
+
+ALARM_REPORT_LEN = 1048
+MAX_NUM_OF_ALARM_ID = 128
+MIN_ALARM_ID = 1001
+MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1)
+DIR_XALARM = "/var/run/xalarm"
+PATH_REG_ALARM = "/var/run/xalarm/alarm"
+PATH_REPORT_ALARM = "/var/run/xalarm/report"
+ALARM_DIR_PERMISSION = 0o0750
+ALARM_REG_SOCK_PERMISSION = 0o0700
+ALARM_SOCKET_PERMISSION = 0o0700
+TIME_UNIT_MILLISECONDS = 1000
+ALARM_REGISTER_INFO = None
+
+
+class AlarmRegister:
+ def __init__(self, id_filter: list[bool], callback: callable):
+ self.id_filter = id_filter
+ self.callback = callback
+ self.socket = self.create_unix_socket()
+ self.is_registered = True
+ self.thread = threading.Thread(target=self.alarm_recv)
+ self.thread_should_stop = False
+
+ def check_params(self) -> bool:
+ if (len(self.id_filter) != MAX_NUM_OF_ALARM_ID):
+ sys.stderr.write("check_params: invalid param id_filter\n")
+ return False
+
+ sig = inspect.signature(self.callback)
+ if len(sig.parameters) != 1:
+ sys.stderr.write("check_params: invalid param callback\n")
+ return False
+
+ if self.socket is None:
+ sys.stderr.write("check_params: scoket create failed\n")
+ return False
+ return True
+
+ def set_id_filter(self, id_filter: list[bool]) -> bool:
+ if (len(id_filter) > MAX_NUM_OF_ALARM_ID):
+ sys.stderr.write("set_id_filter: invalid param id_filter\n")
+ return False
+ self.id_filter = id_filter
+ return True
+
+ def id_is_registered(self, alarm_id) -> bool:
+ if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID:
+ return False
+ return self.id_filter[alarm_id - MIN_ALARM_ID]
+
+ def put_alarm_info(self, alarm_info: Xalarm) -> None:
+ if not self.callback or not alarm_info:
+ return
+ if not self.id_is_registered(alarm_info.alarm_id):
+ return
+ self.callback(alarm_info)
+
+ def create_unix_socket(self) -> socket.socket:
+ try:
+ sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+ sock.setblocking(False)
+
+ if not os.access(DIR_XALARM, os.F_OK):
+ os.makedirs(DIR_XALARM)
+ os.chmod(DIR_XALARM, ALARM_DIR_PERMISSION)
+
+ sock.connect(PATH_REG_ALARM)
+ return sock
+ except (IOError, OSError, FileNotFoundError) as e:
+ sock.close()
+ sys.stderr.write(f"create_unix_socket: create socket error:{e}\n")
+ return None
+
+ def alarm_recv(self):
+ while not self.thread_should_stop:
+ try:
+ data = self.socket.recv(ALARM_REPORT_LEN)
+ if not data:
+ sys.stderr.write("connection closed by xalarmd, maybe connections reach max num or service stopped.\n")
+ self.thread_should_stop = True
+ break
+ if len(data) != ALARM_REPORT_LEN:
+ sys.stderr.write(f"server receive report msg length wrong {len(data)}\n")
+ continue
+
+ alarm_info = alarm_bin2stu(data)
+ self.put_alarm_info(alarm_info)
+ except (BlockingIOError) as e:
+ time.sleep(0.1)
+ except (ConnectionResetError, ConnectionAbortedError, BrokenPipeError):
+ sys.stderr.write("Connection closed by the server.\n")
+ self.thread_should_stop = True
+ except (ValueError, StructParseError, InterruptedError) as e:
+ sys.stderr.write(f"{e}\n")
+ except Exception as e:
+ sys.stderr.write(f"{e}\n")
+ self.thread_should_stop = True
+
+ def start_thread(self) -> None:
+ self.thread.daemon = True
+ self.thread.start()
+
+ def stop_thread(self) -> None:
+ self.thread_should_stop = True
+ self.thread.join()
+ self.socket.close()
+
+
+def xalarm_register(callback: callable, id_filter: list[bool]) -> int:
+ global ALARM_REGISTER_INFO
+
+ if ALARM_REGISTER_INFO is not None:
+ sys.stderr.write("xalarm_register: alarm has registered\n")
+ return -1
+
+ ALARM_REGISTER_INFO = AlarmRegister(id_filter, callback)
+ if not ALARM_REGISTER_INFO.check_params():
+ return -1
+
+ ALARM_REGISTER_INFO.start_thread()
+
+ return 0
+
+
+def xalarm_unregister(clientId: int) -> None:
+ global ALARM_REGISTER_INFO
+ if clientId < 0:
+ sys.stderr.write("xalarm_unregister: invalid client\n")
+ return
+
+ if ALARM_REGISTER_INFO is None:
+ sys.stderr.write("xalarm_unregister: alarm has not registered\n")
+ return
+
+ ALARM_REGISTER_INFO.stop_thread()
+ ALARM_REGISTER_INFO = None
+
+
+def xalarm_upgrade(clientId: int, id_filter: list[bool]) -> None:
+    global ALARM_REGISTER_INFO
+    if clientId < 0:
+        sys.stderr.write("xalarm_upgrade: invalid client\n")
+        return
+    if ALARM_REGISTER_INFO is None:
+        sys.stderr.write("xalarm_upgrade: alarm has not been registered\n")
+        return
+    ALARM_REGISTER_INFO.set_id_filter(id_filter)
+
+
+def xalarm_getid(alarm_info: Xalarm) -> int:
+ if not alarm_info:
+ return 0
+ return alarm_info.alarm_id
+
+
+def xalarm_getlevel(alarm_info: Xalarm) -> int:
+ if not alarm_info:
+ return 0
+ return alarm_info.alarm_level
+
+
+def xalarm_gettype(alarm_info: Xalarm) -> int:
+ if not alarm_info:
+ return 0
+ return alarm_info.alarm_type
+
+
+def xalarm_gettime(alarm_info: Xalarm) -> int:
+ if not alarm_info:
+ return 0
+    return alarm_info.timetamp.tv_sec * TIME_UNIT_MILLISECONDS + alarm_info.timetamp.tv_usec // TIME_UNIT_MILLISECONDS
+
+def xalarm_getdesc(alarm_info: Xalarm) -> str:
+ if not alarm_info:
+ return None
+ try:
+ desc_str = alarm_info.msg1.rstrip(b'\x00').decode('utf-8')
+ except UnicodeError:
+ desc_str = None
+ return desc_str
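
For orientation, a minimal usage sketch of the registration API above (assuming the module is importable as `xalarm.register_xalarm`; the callback and the all-true filter are illustrative):

```python
from xalarm.register_xalarm import (
    xalarm_register, xalarm_unregister, xalarm_getid, xalarm_getdesc,
    MAX_NUM_OF_ALARM_ID,
)

def on_alarm(alarm_info):
    # The callback receives one Xalarm instance per forwarded alarm.
    print(f"alarm {xalarm_getid(alarm_info)}: {xalarm_getdesc(alarm_info)}")

# Subscribe to every alarm id in the supported range (1001..1128).
client_id = xalarm_register(on_alarm, [True] * MAX_NUM_OF_ALARM_ID)
if client_id < 0:
    raise RuntimeError("xalarm_register failed")
try:
    pass  # main work; alarms arrive on the background receiver thread
finally:
    xalarm_unregister(client_id)
```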
diff --git a/src/python/xalarm/sentry_notify.py b/src/python/xalarm/sentry_notify.py
new file mode 100644
index 0000000..a19e5b3
--- /dev/null
+++ b/src/python/xalarm/sentry_notify.py
@@ -0,0 +1,71 @@
+import os
+import sys
+import time
+import socket
+from struct import error as StructParseError
+
+from .xalarm_api import alarm_stu2bin, Xalarm
+
+MAX_NUM_OF_ALARM_ID = 128
+MIN_ALARM_ID = 1001
+MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1)
+
+MINOR_ALM = 1
+MAJOR_ALM = 2
+CRITICAL_ALM = 3
+
+ALARM_TYPE_OCCUR = 1
+ALARM_TYPE_RECOVER = 2
+
+MAX_PUC_PARAS_LEN = 1024
+
+DIR_XALARM = "/var/run/xalarm"
+PATH_REPORT_ALARM = "/var/run/xalarm/report"
+ALARM_DIR_PERMISSION = 0o750
+ALARM_SOCKET_PERMISSION = 0o700
+
+
+def check_params(alarm_id, alarm_level, alarm_type, puc_paras) -> bool:
+ if not os.path.exists(DIR_XALARM):
+        sys.stderr.write(f"check_params: {DIR_XALARM} does not exist\n")
+ return False
+
+ if not os.path.exists(PATH_REPORT_ALARM):
+        sys.stderr.write(f"check_params: {PATH_REPORT_ALARM} does not exist\n")
+ return False
+
+ if (alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID or
+ alarm_level < MINOR_ALM or alarm_level > CRITICAL_ALM or
+ alarm_type < ALARM_TYPE_OCCUR or alarm_type > ALARM_TYPE_RECOVER):
+ sys.stderr.write("check_params: alarm info invalid\n")
+ return False
+
+ if len(puc_paras) >= MAX_PUC_PARAS_LEN:
+ sys.stderr.write(f"check_params: alarm msg should be less than {MAX_PUC_PARAS_LEN}\n")
+ return False
+
+ return True
+
+def xalarm_report(alarm_id, alarm_level, alarm_type, puc_paras) -> bool:
+ if not check_params(alarm_id, alarm_level, alarm_type, puc_paras):
+ return False
+
+    sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
+    try:
+ current_time = time.time()
+ current_time_seconds = int(current_time)
+ current_microseconds = int((current_time - current_time_seconds) * 1_000_000)
+ alarm_info = Xalarm(alarm_id, alarm_type, alarm_level,
+ current_time_seconds, current_microseconds, puc_paras)
+
+ sock.sendto(alarm_stu2bin(alarm_info), PATH_REPORT_ALARM)
+ except (FileNotFoundError, StructParseError, socket.error, OSError, UnicodeError) as e:
+        sys.stderr.write(f"xalarm_report: error occurred when sending msg: {e}\n")
+ return False
+ finally:
+ sock.close()
+
+ return True
+
+
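
A hedged sketch of reporting through this interface (alarm id 1002 and the JSON payload convention are illustrative, mirroring how the sentry plugins use it):

```python
import json

from xalarm.sentry_notify import xalarm_report, MAJOR_ALM, ALARM_TYPE_OCCUR

# The payload is an arbitrary string shorter than MAX_PUC_PARAS_LEN;
# JSON is the convention the sentry plugins follow.
payload = json.dumps({"alarm_source": "demo_plugin", "reason": "example"})
if not xalarm_report(1002, MAJOR_ALM, ALARM_TYPE_OCCUR, payload):
    print("report failed: check that xalarmd is running")
```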
diff --git a/src/python/xalarm/xalarm_api.py b/src/python/xalarm/xalarm_api.py
index 94d7638..99eabf5 100644
--- a/src/python/xalarm/xalarm_api.py
+++ b/src/python/xalarm/xalarm_api.py
@@ -23,6 +23,7 @@ ALARM_LEVELS = (1, 2, 3, 4, 5)
ALARM_SOCK_PATH = "/var/run/xalarm/report"
MIN_ALARM_ID = 1001
MAX_ALARM_ID = 1128
+MAX_MSG_LEN = 1024
@dataclasses.dataclass
@@ -97,15 +98,15 @@ class Xalarm:
def msg1(self, msg):
"""msg1 setter
"""
- if len(msg) > 512:
- raise ValueError("msg1 length must below 255")
+ if len(msg) > MAX_MSG_LEN:
+            raise ValueError(f"msg1 length must be below {MAX_MSG_LEN}")
self._msg1 = msg
def alarm_bin2stu(bin_data):
"""alarm binary to struct
"""
- struct_data = struct.unpack("@HBBll512s", bin_data)
+ struct_data = struct.unpack(f"@HBBll{MAX_MSG_LEN}s", bin_data)
alarm_info = Xalarm(1001, 2, 1, 0, 0, "")
alarm_info.alarm_id = struct_data[0]
@@ -116,3 +117,14 @@ def alarm_bin2stu(bin_data):
alarm_info.msg1 = struct_data[5]
return alarm_info
+
+
+def alarm_stu2bin(alarm_info: Xalarm):
+ return struct.pack(
+ f'@HBBll{MAX_MSG_LEN}s',
+ alarm_info.alarm_id,
+ alarm_info.alarm_level,
+ alarm_info.alarm_type,
+ alarm_info.timetamp.tv_sec,
+ alarm_info.timetamp.tv_usec,
+ alarm_info.msg1.encode('utf-8'))
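
To make the `@HBBll{MAX_MSG_LEN}s` wire format above concrete, a standalone round-trip sketch using only the struct module (sizes assume a 64-bit native ABI, where the format occupies 1048 bytes, matching ALARM_REPORT_LEN):

```python
import struct

MAX_MSG_LEN = 1024
FMT = f"@HBBll{MAX_MSG_LEN}s"  # id, level, type, tv_sec, tv_usec, msg

packed = struct.pack(FMT, 1002, 2, 1, 1700000000, 250000, b"hello")
print(struct.calcsize(FMT))  # 1048 on LP64 platforms

alarm_id, level, a_type, tv_sec, tv_usec, msg = struct.unpack(FMT, packed)
print(alarm_id, level, a_type, msg.rstrip(b"\x00").decode("utf-8"))
```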
diff --git a/src/python/xalarm/xalarm_server.py b/src/python/xalarm/xalarm_server.py
index 84db273..fcaf393 100644
--- a/src/python/xalarm/xalarm_server.py
+++ b/src/python/xalarm/xalarm_server.py
@@ -17,16 +17,20 @@ Create: 2023-11-02
import socket
import os
import logging
+import select
+import threading
from struct import error as StructParseError
from .xalarm_api import alarm_bin2stu
-from .xalarm_transfer import check_filter, transmit_alarm
+from .xalarm_transfer import check_filter, transmit_alarm, wait_for_connection
ALARM_DIR = "/var/run/xalarm"
+USER_RECV_SOCK = "/var/run/xalarm/alarm"
SOCK_FILE = "/var/run/xalarm/report"
-ALARM_REPORT_LEN = 536
+ALARM_REPORT_LEN = 1048
ALARM_DIR_PERMISSION = 0o750
+ALARM_LISTEN_QUEUE_LEN = 5
def clear_sock_path():
@@ -37,6 +41,8 @@ def clear_sock_path():
os.chmod(ALARM_DIR, ALARM_DIR_PERMISSION)
if os.path.exists(SOCK_FILE):
os.unlink(SOCK_FILE)
+ if os.path.exists(USER_RECV_SOCK):
+ os.unlink(USER_RECV_SOCK)
def server_loop(alarm_config):
@@ -49,6 +55,21 @@ def server_loop(alarm_config):
sock.bind(SOCK_FILE)
os.chmod(SOCK_FILE, 0o600)
+ alarm_sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+ alarm_sock.bind(USER_RECV_SOCK)
+ os.chmod(USER_RECV_SOCK, 0o600)
+ alarm_sock.listen(ALARM_LISTEN_QUEUE_LEN)
+ alarm_sock.setblocking(False)
+
+ epoll = select.epoll()
+ epoll.register(alarm_sock.fileno(), select.EPOLLIN)
+ fd_to_socket = {alarm_sock.fileno(): alarm_sock,}
+ thread_should_stop = False
+
+ thread = threading.Thread(target=wait_for_connection, args=(alarm_sock, epoll, fd_to_socket, thread_should_stop))
+ thread.daemon = True
+ thread.start()
+
while True:
try:
data, _ = sock.recvfrom(ALARM_REPORT_LEN)
@@ -58,14 +79,21 @@ def server_loop(alarm_config):
logging.debug("server receive report msg length wrong %d",
len(data))
continue
-
alarm_info = alarm_bin2stu(data)
logging.debug("server bin2stu msg")
if not check_filter(alarm_info, alarm_config):
continue
+ transmit_alarm(alarm_sock, epoll, fd_to_socket, data)
+ except Exception as e:
+ logging.error(f"Error server:{e}")
+
+ thread_should_stop = True
+ thread.join()
- transmit_alarm(data)
- except (ValueError, StructParseError):
- pass
+ epoll.unregister(alarm_sock.fileno())
+ epoll.close()
+ alarm_sock.close()
+ os.unlink(USER_RECV_SOCK)
sock.close()
+
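
Worth noting in the loop above: thread_should_stop is a plain bool, and rebinding it after the loop does not change the value the worker thread already received as an argument. threading.Event is the usual primitive for a shared stop flag; a minimal sketch of that pattern (names illustrative, not the patch's code):

```python
import threading
import time

def worker(stop_event: threading.Event):
    # Poll until asked to stop; the Event object is safely shared.
    while not stop_event.is_set():
        time.sleep(0.1)

stop_event = threading.Event()
t = threading.Thread(target=worker, args=(stop_event,), daemon=True)
t.start()
time.sleep(0.5)   # stand-in for the server's main loop
stop_event.set()  # the worker observes this on its next check
t.join()
```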
diff --git a/src/python/xalarm/xalarm_transfer.py b/src/python/xalarm/xalarm_transfer.py
index b590b43..42137d8 100644
--- a/src/python/xalarm/xalarm_transfer.py
+++ b/src/python/xalarm/xalarm_transfer.py
@@ -16,10 +16,12 @@ Create: 2023-11-02
import socket
import logging
+import select
-USER_RECV_SOCK = "/var/run/xalarm/alarm"
MIN_ID_NUMBER = 1001
MAX_ID_NUMBER = 1128
+MAX_CONNECTION_NUM = 100
+TEST_CONNECT_BUFFER_SIZE = 32
def check_filter(alarm_info, alarm_filter):
@@ -35,16 +37,84 @@ def check_filter(alarm_info, alarm_filter):
return True
-def transmit_alarm(bin_data):
- """forward alarm message
+def cleanup_closed_connections(server_sock, epoll, fd_to_socket):
"""
- sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
- try:
- sock.sendto(bin_data, USER_RECV_SOCK)
- logging.debug("transfer alarm success")
- except ConnectionRefusedError:
- logging.debug("transfer sendto failed")
- except FileNotFoundError:
- logging.debug("transfer sendto failed")
- finally:
- sock.close()
+ clean invalid client socket connections saved in 'fd_to_socket'
+ :param server_sock: server socket instance of alarm
+ :param epoll: epoll instance, used to unregister invalid client connections
+ :param fd_to_socket: dict instance, used to hold client connections and server connections
+ """
+ to_remove = []
+ for fileno, connection in fd_to_socket.items():
+ if connection is server_sock:
+ continue
+ try:
+ # test whether connection still alive, use MSG_DONTWAIT to avoid blocking thread
+ # use MSG_PEEK to avoid consuming buffer data
+ data = connection.recv(TEST_CONNECT_BUFFER_SIZE, socket.MSG_DONTWAIT | socket.MSG_PEEK)
+ if not data:
+ to_remove.append(fileno)
+ except BlockingIOError:
+ pass
+ except (ConnectionResetError, ConnectionAbortedError, BrokenPipeError):
+ to_remove.append(fileno)
+
+ for fileno in to_remove:
+ epoll.unregister(fileno)
+ fd_to_socket[fileno].close()
+ del fd_to_socket[fileno]
+ logging.info(f"cleaned up connection {fileno} for client lost connection.")
+
+
+def wait_for_connection(server_sock, epoll, fd_to_socket, thread_should_stop):
+ """
+ thread function for catch and save client connection
+ :param server_sock: server socket instance of alarm
+ :param epoll: epoll instance, used to unregister invalid client connections
+ :param fd_to_socket: dict instance, used to hold client connections and server connections
+ :param thread_should_stop: bool instance
+ """
+ while not thread_should_stop:
+ try:
+ events = epoll.poll(1)
+
+ for fileno, event in events:
+ if fileno == server_sock.fileno():
+ connection, client_address = server_sock.accept()
+ # if reach max connection, cleanup closed connections
+ if len(fd_to_socket) - 1 >= MAX_CONNECTION_NUM:
+ cleanup_closed_connections(server_sock, epoll, fd_to_socket)
+ # if connections still reach max num, close this connection automatically
+ if len(fd_to_socket) - 1 >= MAX_CONNECTION_NUM:
+ logging.info(f"connection reach max num of {MAX_CONNECTION_NUM}, closed current connection!")
+ connection.close()
+ continue
+ epoll.register(connection.fileno(), select.EPOLLOUT)
+ fd_to_socket[connection.fileno()] = connection
+ except socket.error as e:
+ logging.debug(f"socket error, reason is {e}")
+ break
+ except (KeyError, OSError, ValueError) as e:
+ logging.debug(f"wait for connection failed {e}")
+
+
+def transmit_alarm(server_sock, epoll, fd_to_socket, bin_data):
+ """
+ this function is to broadcast alarm data to client, if fail to send data, remove connections held by fd_to_socket
+ :param server_sock: server socket instance of alarm
+ :param epoll: epoll instance, used to unregister invalid client connections
+ :param fd_to_socket: dict instance, used to hold client connections and server connections
+ :param bin_data: binary instance, alarm info data in C-style struct format defined in xalarm_api.py
+ """
+ to_remove = []
+ for fileno, connection in fd_to_socket.items():
+ if connection is not server_sock:
+ try:
+ connection.sendall(bin_data)
+ except (BrokenPipeError, ConnectionResetError):
+ to_remove.append(fileno)
+ for fileno in to_remove:
+ epoll.unregister(fileno)
+ fd_to_socket[fileno].close()
+ del fd_to_socket[fileno]
+
--
2.27.0
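
The liveness probe used in cleanup_closed_connections above, isolated as a sketch (a zero-byte result from a MSG_PEEK read means the peer closed the connection; BlockingIOError means the peer is alive but idle):

```python
import socket

def connection_is_alive(conn: socket.socket) -> bool:
    """Probe a connected socket without consuming buffered data."""
    try:
        data = conn.recv(32, socket.MSG_DONTWAIT | socket.MSG_PEEK)
        return bool(data)          # b"" -> orderly shutdown by the peer
    except BlockingIOError:
        return True                # nothing to read, but still connected
    except (ConnectionResetError, ConnectionAbortedError, BrokenPipeError):
        return False
```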
From 8fa9389a85763831ea85d94f179a305d7f95d585 Mon Sep 17 00:00:00 2001
From: jinsaihang <jinsaihang@h-partners.com>
Date: Sun, 29 Sep 2024 02:04:52 +0000
Subject: [PATCH] Add alarm event query feature: sentryctl get_alarm <module_name> -s <time_range> -d
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
---
src/python/syssentry/alarm.py | 142 ++++++++++++++++++
.../src/python/syssentry/callbacks.py | 17 +++
.../src/python/syssentry/global_values.py | 4 +
.../src/python/syssentry/load_mods.py | 16 ++
.../src/python/syssentry/sentryctl | 20 ++-
.../src/python/syssentry/syssentry.py | 13 +-
.../src/python/syssentry/task_map.py | 5 +-
7 files changed, 212 insertions(+), 5 deletions(-)
create mode 100644 src/python/syssentry/alarm.py
diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
new file mode 100644
index 0000000..74a2716
--- /dev/null
+++ b/src/python/syssentry/alarm.py
@@ -0,0 +1,142 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+# PURPOSE.
+# See the Mulan PSL v2 for more details.
+
+"""
+use for report alarm
+"""
+import threading
+from typing import Dict, List
+from datetime import datetime
+import time
+import logging
+import json
+
+from xalarm.register_xalarm import xalarm_register,xalarm_getid,xalarm_getlevel,xalarm_gettype,xalarm_gettime,xalarm_getdesc
+from xalarm.xalarm_api import Xalarm
+
+from .global_values import InspectTask
+from .task_map import TasksMap
+
+# Alarm ID mapping: key is the plugin name, value is the alarm ID (int)
+task_alarm_id_dict: Dict[str, int] = {}
+
+# 告警老化时间字典key为告警IDvalue为老化时间类型为数字单位为秒
+alarm_id_clear_time_dict: Dict[int, int] = {}
+
+# Alarm event lists: key is the alarm ID, value is the list of alarm events for that ID
+alarm_list_dict: Dict[int, List[Xalarm]] = {}
+# Lock protecting the alarm event lists
+alarm_list_lock = threading.Lock()
+
+id_filter = []
+id_base = 1001
+clientId = -1
+
+MILLISECONDS_UNIT_SECONDS = 1000
+
+def update_alarm_list(alarm_info: Xalarm):
+ alarm_id = xalarm_getid(alarm_info)
+ timestamp = xalarm_gettime(alarm_info)
+ if not timestamp:
+ logging.error("Retrieve timestamp failed")
+ return
+ alarm_list_lock.acquire()
+ try:
+ # new alarm is inserted into list head
+ if alarm_id not in alarm_list_dict:
+ logging.warning(f"update_alarm_list: alarm_id {alarm_id} not found in alarm_list_dict")
+ return
+ alarm_list = alarm_list_dict[alarm_id]
+
+ alarm_list.insert(0, alarm_info)
+ # clear alarm_info older than clear time threshold
+ clear_index = -1
+ clear_time = alarm_id_clear_time_dict[alarm_id]
+ for i in range(len(alarm_list)):
+ if (timestamp - xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > clear_time:
+ clear_index = i
+ break
+ if clear_index >= 0:
+ alarm_list_dict[alarm_id] = alarm_list[:clear_index]
+ finally:
+ alarm_list_lock.release()
+
+def alarm_register():
+    logging.debug("alarm_register: enter")
+    # Initialize the alarm ID mapping and the alarm aging time dict
+ for task_type in TasksMap.tasks_dict:
+ for task_name in TasksMap.tasks_dict[task_type]:
+ logging.info(f"alarm_register: {task_name} is registered")
+ task = TasksMap.tasks_dict[task_type][task_name]
+ alarm_id = task.alarm_id
+ alarm_clear_time = task.alarm_clear_time
+ alarm_list_dict[alarm_id] = []
+ task_alarm_id_dict[task_name] = alarm_id
+ if alarm_id not in alarm_id_clear_time_dict:
+ alarm_id_clear_time_dict[alarm_id] = alarm_clear_time
+ else:
+ alarm_id_clear_time_dict[alarm_id] = max(alarm_clear_time, alarm_id_clear_time_dict[alarm_id])
+    # Register the alarm callback
+ id_filter = [True] * 128
+ clientId = xalarm_register(update_alarm_list, id_filter)
+ if clientId < 0:
+        logging.info('register xalarm: failed')
+ return clientId
+ logging.info('register xalarm: success')
+ return clientId
+
+def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Dict]:
+ alarm_list_lock.acquire()
+ try:
+ if task_name not in task_alarm_id_dict:
+ logging.debug("task_name does not exist")
+ return []
+ alarm_id = task_alarm_id_dict[task_name]
+ if alarm_id not in alarm_list_dict:
+ logging.debug("alarm_id does not exist")
+ return []
+ alarm_list = alarm_list_dict[alarm_id]
+ logging.debug(f"get_alarm_result: alarm_list of {alarm_id} has {len(alarm_list)} elements")
+ # clear alarm_info older than clear time threshold
+ stop_index = -1
+ timestamp = int(datetime.now().timestamp())
+ for i in range(len(alarm_list)):
+ logging.debug(f"timestamp, alarm_list[{i}].timestamp: {timestamp}, {xalarm_gettime(alarm_list[i])}")
+ if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > int(time_range):
+ stop_index = i
+ break
+ if stop_index >= 0:
+ alarm_list = alarm_list[:stop_index]
+ logging.debug(f"get_alarm_result: final alarm_list of {alarm_id} has {len(alarm_list)} elements")
+
+ def xalarm_to_dict(alarm_info: Xalarm) -> dict:
+ return {
+ 'alarm_id': xalarm_getid(alarm_info),
+ 'alarm_type': xalarm_gettype(alarm_info),
+ 'alarm_level': xalarm_getlevel(alarm_info),
+ 'timetamp': xalarm_gettime(alarm_info),
+ 'msg1': xalarm_getdesc(alarm_info)
+ }
+
+ alarm_list = [xalarm_to_dict(alarm) for alarm in alarm_list]
+
+        # strip details unless detailed output was requested
+ for alarm in alarm_list:
+ alarm_info = alarm['msg1']
+ alarm_info = json.loads(alarm_info)
+ if not detailed:
+ if 'details' in alarm_info:
+ alarm_info.pop('details', None)
+ alarm.pop('msg1', None)
+ alarm['alarm_info'] = alarm_info
+ return alarm_list
+ finally:
+ alarm_list_lock.release()
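
A small self-contained sketch of the aging rule used in update_alarm_list above: event timestamps are in milliseconds, clear times in seconds, and events older than the clear time relative to the newest one are truncated (names mirror the module; nothing is imported):

```python
MILLISECONDS_UNIT_SECONDS = 1000
clear_time = 15  # seconds, as in DEFAULT_ALARM_CLEAR_TIME

# Newest-first event timestamps in milliseconds.
events_ms = [100_000, 99_000, 90_000, 80_000]
newest = events_ms[0]

kept = [t for t in events_ms
        if (newest - t) / MILLISECONDS_UNIT_SECONDS <= clear_time]
print(kept)  # [100000, 99000, 90000]; the 80_000 event is 20s old, aged out
```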
diff --git a/src/python/syssentry/callbacks.py b/src/python/syssentry/callbacks.py
index b38b381..6ec2c29 100644
--- a/src/python/syssentry/callbacks.py
+++ b/src/python/syssentry/callbacks.py
@@ -18,6 +18,7 @@ import logging
from .task_map import TasksMap, ONESHOT_TYPE, PERIOD_TYPE
from .mod_status import EXITED_STATUS, RUNNING_STATUS, WAITING_STATUS, set_runtime_status
+from .alarm import get_alarm_result
def task_get_status(mod_name):
@@ -41,6 +42,22 @@ def task_get_result(mod_name):
return "success", task.get_result()
+def task_get_alarm(data):
+ """get alarm by mod name"""
+ task_name = data['task_name']
+ time_range = data['time_range']
+ try:
+ detailed = data['detailed']
+ except KeyError:
+ logging.debug("Key 'detailed' does not exist in the dictionary")
+ detailed = None
+ task = TasksMap.get_task_by_name(task_name)
+ if not task:
+ return "failed", f"cannot find task by name {task_name}"
+ if not task.load_enabled:
+ return "failed", f"mod {task_name} is not enabled"
+
+ return "success", get_alarm_result(task_name, time_range, detailed)
def task_stop(mod_name):
"""stop by mod name"""
diff --git a/src/python/syssentry/global_values.py b/src/python/syssentry/global_values.py
index 483d544..b123b2d 100644
--- a/src/python/syssentry/global_values.py
+++ b/src/python/syssentry/global_values.py
@@ -27,6 +27,7 @@ CTL_SOCKET_PATH = "/var/run/sysSentry/control.sock"
SYSSENTRY_CONF_PATH = "/etc/sysSentry"
INSPECT_CONF_PATH = "/etc/sysSentry/inspect.conf"
TASK_LOG_DIR = "/var/log/sysSentry"
+DEFAULT_ALARM_CLEAR_TIME = 15
SENTRY_RUN_DIR_PERM = 0o750
@@ -76,6 +77,9 @@ class InspectTask:
self.env_file = ""
# start mode
self.conflict = "up"
+ # alarm id
+ self.alarm_id = -1
+ self.alarm_clear_time = DEFAULT_ALARM_CLEAR_TIME
def start(self):
"""
diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py
index 48d7e66..ae05e57 100644
--- a/src/python/syssentry/load_mods.py
+++ b/src/python/syssentry/load_mods.py
@@ -24,6 +24,7 @@ from .task_map import TasksMap, ONESHOT_TYPE, PERIOD_TYPE
from .cron_process import PeriodTask
from .mod_status import set_task_status
+from xalarm.register_xalarm import MIN_ALARM_ID, MAX_ALARM_ID
ONESHOT_CONF = 'oneshot'
PERIOD_CONF = 'period'
@@ -41,6 +42,8 @@ CONF_TASK_RESTART = 'task_restart'
CONF_ONSTART = 'onstart'
CONF_ENV_FILE = 'env_file'
CONF_CONFLICT = 'conflict'
+CONF_ALARM_ID = 'alarm_id'
+CONF_ALARM_CLEAR_TIME = 'alarm_clear_time'
MOD_FILE_SUFFIX = '.mod'
MOD_SUFFIX_LEN = 4
@@ -194,6 +197,18 @@ def parse_mod_conf(mod_name, mod_conf):
task.heartbeat_interval = heartbeat_interval
task.load_enabled = is_enabled
+ try:
+ task.alarm_id = int(mod_conf.get(CONF_TASK, CONF_ALARM_ID))
+ task.alarm_clear_time = int(mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME))
+ if not (MIN_ALARM_ID <= task.alarm_id <= MAX_ALARM_ID):
+ raise ValueError("Invalid alarm_id")
+ except ValueError:
+ task.alarm_id = -1
+ logging.warning("Invalid alarm_id, set to -1")
+ except configparser.NoOptionError:
+ task.alarm_id = -1
+        logging.warning("alarm_id or alarm_clear_time not set, using defaults -1 and 15s")
+
if CONF_ONSTART in mod_conf.options(CONF_TASK):
is_onstart = (mod_conf.get(CONF_TASK, CONF_ONSTART) == 'yes')
if task_type == PERIOD_CONF:
@@ -327,3 +342,4 @@ def reload_single_mod(mod_name):
res, ret = reload_mod_by_name(mod_name)
return res, ret
+
diff --git a/src/python/syssentry/sentryctl b/src/python/syssentry/sentryctl
index e94491f..675c17a 100644
--- a/src/python/syssentry/sentryctl
+++ b/src/python/syssentry/sentryctl
@@ -25,6 +25,7 @@ MAX_PARAM_LENGTH = 256
RESULT_MSG_DATA_LEN = 4
CTL_MSG_LEN_LEN = 3
+DEFAULT_ALARM_TIME_RANGE = 10
def status_output_format(res_data):
"""format output"""
@@ -57,6 +58,8 @@ def res_output_handle(res_struct, req_type):
status_output_format(res_struct['data'])
elif req_type == 'get_result':
result_output_format(res_struct['data'])
+ elif req_type == 'get_alarm':
+ result_output_format(res_struct['data'])
elif res_struct['ret'] == "failed":
print(res_struct['data'])
@@ -75,6 +78,7 @@ def client_send_and_recv(request_data, data_str_len):
print("sentryctl: client creat socket error")
return None
+ # connect to syssentry
try:
client_socket.connect(CTL_SOCKET_PATH)
except OSError:
@@ -82,6 +86,7 @@ def client_send_and_recv(request_data, data_str_len):
print("sentryctl: client connect error")
return None
+ # msg: CTL{len}{data}
req_data_len = len(request_data)
request_msg = "CTL" + str(req_data_len).zfill(3) + request_data
@@ -94,8 +99,8 @@ def client_send_and_recv(request_data, data_str_len):
print("sentryctl: client communicate error")
return None
+ # res: RES{len}{data}
res_magic = res_data[:3]
-
if res_magic != "RES":
print("res msg format error")
return None
@@ -128,6 +133,10 @@ if __name__ == '__main__':
parser_status.add_argument('task_name')
parser_get_result = subparsers.add_parser('get_result', help='get task result')
parser_get_result.add_argument('task_name')
+ parser_get_alarm = subparsers.add_parser('get_alarm', help='get task alarm')
+ parser_get_alarm.add_argument('task_name')
+ parser_get_alarm.add_argument('-s', '--time_range', type=str, default=DEFAULT_ALARM_TIME_RANGE, help='Specified time range')
+ parser_get_alarm.add_argument('-d', '--detailed', action='store_true', help='Print Detailed Information')
parser_list = subparsers.add_parser('list', help='show all loaded task mod')
client_args = parser.parse_args()
@@ -142,6 +151,15 @@ if __name__ == '__main__':
req_msg_struct = {"type": "get_status", "data": client_args.task_name}
elif client_args.cmd_type == 'get_result':
req_msg_struct = {"type": "get_result", "data": client_args.task_name}
+ elif client_args.cmd_type == 'get_alarm':
+ req_msg_struct = {
+ "type": "get_alarm",
+ "data": {
+ 'task_name': client_args.task_name,
+ 'time_range': client_args.time_range,
+ 'detailed': client_args.detailed,
+ }
+ }
elif client_args.cmd_type == 'reload':
req_msg_struct = {"type": "reload", "data": client_args.task_name}
else:
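
The `CTL{len}{data}` / `RES{len}{data}` framing used above, sketched as a standalone client (paths and constants mirror the file; error handling and short-read loops are trimmed for brevity):

```python
import json
import socket

CTL_SOCKET_PATH = "/var/run/sysSentry/control.sock"
CTL_MSG_LEN_LEN = 3

req = json.dumps({"type": "get_alarm",
                  "data": {"task_name": "ai_block_io",
                           "time_range": 10, "detailed": False}})
msg = "CTL" + str(len(req)).zfill(CTL_MSG_LEN_LEN) + req

with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s:
    s.connect(CTL_SOCKET_PATH)
    s.sendall(msg.encode())
    head = s.recv(3 + CTL_MSG_LEN_LEN).decode()  # "RES" + 3-digit length
    assert head[:3] == "RES"
    print(json.loads(s.recv(int(head[3:])).decode()))
```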
diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py
index 9ef0203..c2dee85 100644
--- a/src/python/syssentry/syssentry.py
+++ b/src/python/syssentry/syssentry.py
@@ -28,7 +28,7 @@ from .sentry_config import SentryConfig, get_log_level
from .task_map import TasksMap
from .global_values import SENTRY_RUN_DIR, CTL_SOCKET_PATH, SENTRY_RUN_DIR_PERM
from .cron_process import period_tasks_handle
-from .callbacks import mod_list_show, task_start, task_get_status, task_stop, task_get_result
+from .callbacks import mod_list_show, task_start, task_get_status, task_stop, task_get_result, task_get_alarm
from .mod_status import get_task_by_pid, set_runtime_status
from .load_mods import load_tasks, reload_single_mod
from .heartbeat import (heartbeat_timeout_chk, heartbeat_fd_create,
@@ -36,7 +36,11 @@ from .heartbeat import (heartbeat_timeout_chk, heartbeat_fd_create,
from .result import RESULT_MSG_HEAD_LEN, RESULT_MSG_MAGIC_LEN, RESULT_MAGIC
from .result import RESULT_LEVEL_ERR_MSG_DICT, ResultLevel
from .utils import get_current_time_string
+from .alarm import alarm_register
+from xalarm.register_xalarm import xalarm_unregister
+
+clientId = -1
CPU_EXIST = True
try:
@@ -62,6 +66,7 @@ type_func = {
'stop': task_stop,
'get_status': task_get_status,
'get_result': task_get_result,
+ 'get_alarm': task_get_alarm,
'reload': reload_single_mod
}
@@ -107,11 +112,12 @@ def msg_data_process(msg_data):
        return "Invalid cmd type"
cmd_param = data_struct['data']
- logging.debug("msg_data_process cmd_type:%s cmd_param:%s", cmd_type, cmd_param)
+ logging.debug("msg_data_process cmd_type:%s cmd_param:%s", cmd_type, str(cmd_param))
if cmd_type in type_func:
ret, res_data = type_func[cmd_type](cmd_param)
else:
ret, res_data = type_func_void[cmd_type]()
+    logging.debug("msg_data_process res_data:%s", str(res_data))
res_msg_struct = {"ret": ret, "data": res_data}
res_msg = json.dumps(res_msg_struct)
@@ -584,10 +590,13 @@ def main():
_ = SentryConfig.init_param()
TasksMap.init_task_map()
load_tasks()
+ clientId = alarm_register()
main_loop()
except Exception:
logging.error('%s', traceback.format_exc())
finally:
+ if clientId != -1:
+ xalarm_unregister(clientId)
release_pidfile()
diff --git a/src/python/syssentry/task_map.py b/src/python/syssentry/task_map.py
index 70aa19d..27e97ff 100644
--- a/src/python/syssentry/task_map.py
+++ b/src/python/syssentry/task_map.py
@@ -13,16 +13,16 @@
tasks map class and initialize function.
"""
import logging
+from typing import Dict
ONESHOT_TYPE = "ONESHOT"
PERIOD_TYPE = "PERIOD"
TASKS_MAP = None
-
class TasksMap:
"""task map class"""
- tasks_dict = {}
+ tasks_dict: Dict[str, Dict] = {}
@classmethod
def init_task_map(cls):
@@ -65,3 +65,4 @@ class TasksMap:
logging.debug("getting task by name: %s", res)
break
return res
+
--
2.27.0
From 4fa9b250f56dc3f4f431fc091e25d8f2558a9bb2 Mon Sep 17 00:00:00 2001
From: caixiaomeng <caixiaomeng2@.com>
Date: Fri, 11 Oct 2024 18:12:21 +0800
Subject: [PATCH] add xalarm cleanup of invalid server sockets periodically
---
src/python/xalarm/xalarm_server.py | 20 +++++++++++++++-----
src/python/xalarm/xalarm_transfer.py | 8 ++++++++
2 files changed, 23 insertions(+), 5 deletions(-)
diff --git a/src/python/xalarm/xalarm_server.py b/src/python/xalarm/xalarm_server.py
index 2882609..f90a0e2 100644
--- a/src/python/xalarm/xalarm_server.py
+++ b/src/python/xalarm/xalarm_server.py
@@ -22,7 +22,12 @@ import threading
from struct import error as StructParseError
from .xalarm_api import alarm_bin2stu
-from .xalarm_transfer import check_filter, transmit_alarm, wait_for_connection
+from .xalarm_transfer import (
+ check_filter,
+ transmit_alarm,
+ wait_for_connection,
+ peroid_task_to_cleanup_connections
+)
ALARM_DIR = "/var/run/xalarm"
@@ -66,9 +71,13 @@ def server_loop(alarm_config):
fd_to_socket = {alarm_sock.fileno(): alarm_sock,}
thread_should_stop = False
- thread = threading.Thread(target=wait_for_connection, args=(alarm_sock, epoll, fd_to_socket, thread_should_stop))
- thread.daemon = True
- thread.start()
+ conn_thread = threading.Thread(target=wait_for_connection, args=(alarm_sock, epoll, fd_to_socket, thread_should_stop))
+ conn_thread.daemon = True
+ conn_thread.start()
+
+ cleanup_thread = threading.Thread(target=peroid_task_to_cleanup_connections, args=(alarm_sock, epoll, fd_to_socket, thread_should_stop))
+ cleanup_thread.daemon = True
+ cleanup_thread.start()
while True:
try:
@@ -88,7 +97,8 @@ def server_loop(alarm_config):
logging.error(f"Error server:{e}")
thread_should_stop = True
- thread.join()
+ conn_thread.join()
+ cleanup_thread.join()
epoll.unregister(alarm_sock.fileno())
epoll.close()
diff --git a/src/python/xalarm/xalarm_transfer.py b/src/python/xalarm/xalarm_transfer.py
index 90dccbc..75807e0 100644
--- a/src/python/xalarm/xalarm_transfer.py
+++ b/src/python/xalarm/xalarm_transfer.py
@@ -17,11 +17,13 @@ Create: 2023-11-02
import socket
import logging
import select
+from time import sleep
MIN_ID_NUMBER = 1001
MAX_ID_NUMBER = 1128
MAX_CONNECTION_NUM = 100
TEST_CONNECT_BUFFER_SIZE = 32
+PEROID_SCANN_TIME = 60
def check_filter(alarm_info, alarm_filter):
@@ -66,6 +68,12 @@ def cleanup_closed_connections(server_sock, epoll, fd_to_socket):
logging.info(f"cleaned up connection {fileno} for client lost connection.")
+def peroid_task_to_cleanup_connections(server_sock, epoll, fd_to_socket, thread_should_stop):
+ while not thread_should_stop:
+ sleep(PEROID_SCANN_TIME)
+ cleanup_closed_connections(server_sock, epoll, fd_to_socket)
+
+
def wait_for_connection(server_sock, epoll, fd_to_socket, thread_should_stop):
"""
thread function for catch and save client connection
--
2.27.0
From 367f8ab8a5ad26d80caf1bc4529c79d279ef0fb1 Mon Sep 17 00:00:00 2001
From: 贺有志 <1037617413@qq.com>
Date: Thu, 10 Oct 2024 17:21:48 +0800
Subject: [PATCH] ai_block_io adapt alarm module
---
config/tasks/ai_block_io.mod | 4 +-
.../sentryPlugins/ai_block_io/ai_block_io.py | 28 +++++---
.../sentryPlugins/ai_block_io/alarm_report.py | 65 ++++++++++++++-----
.../sentryPlugins/ai_block_io/data_access.py | 5 +-
.../sentryPlugins/ai_block_io/detector.py | 2 +-
5 files changed, 73 insertions(+), 31 deletions(-)
diff --git a/config/tasks/ai_block_io.mod b/config/tasks/ai_block_io.mod
index 1971d7d..82f4f0b 100644
--- a/config/tasks/ai_block_io.mod
+++ b/config/tasks/ai_block_io.mod
@@ -2,4 +2,6 @@
enabled=yes
task_start=/usr/bin/python3 /usr/bin/ai_block_io
task_stop=pkill -f /usr/bin/ai_block_io
-type=oneshot
\ No newline at end of file
+type=oneshot
+alarm_id=1002
+alarm_clear_time=5
\ No newline at end of file
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
index 3b00ef3..77104a9 100644
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
@@ -20,14 +20,14 @@ from .utils import get_data_queue_size_and_update_size
from .config_parser import ConfigParser
from .data_access import get_io_data_from_collect_plug, check_collect_valid
from .io_data import MetricName
-from .alarm_report import AlarmReport
+from .alarm_report import Xalarm, Report
CONFIG_FILE = "/etc/sysSentry/plugins/ai_block_io.ini"
def sig_handler(signum, frame):
logging.info("receive signal: %d", signum)
- AlarmReport().report_fail(f"receive signal: {signum}")
+ Report.report_pass(f"receive signal: {signum}, exiting...")
exit(signum)
@@ -44,6 +44,10 @@ class SlowIODetection:
def __init_detector_name_list(self):
self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency())
+ if self._disk_list is None:
+            Report.report_pass("get available disk error, please check if the collector plugin is enabled. exiting...")
+ exit(1)
+
logging.info(f"ai_block_io plug has found disks: {self._disk_list}")
disks_to_detection: list = self._config_parser.get_disks_to_detection()
        # Case 1: None -> enable detection on all available disks
@@ -101,7 +105,8 @@ class SlowIODetection:
)
logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}')
if io_data_dict_with_disk_name is None:
- continue
+                Report.report_pass("get io data error, please check if the collector plugin is enabled. exiting...")
+ exit(1)
            # Step 2: slow IO detection
logging.debug('step2. Start to detection slow io event.')
@@ -117,13 +122,16 @@ class SlowIODetection:
for slow_io_event in slow_io_event_list:
metric_name: MetricName = slow_io_event[0]
result = slow_io_event[1]
- alarm_content = (f"disk {metric_name.get_disk_name()} has slow io event. "
- f"stage is: {metric_name.get_stage_name()}, "
- f"io access type is: {metric_name.get_io_access_type_name()}, "
- f"metric is: {metric_name.get_metric_name()}, "
- f"current window is: {result[1]}, "
- f"threshold is: {result[2]}")
- AlarmReport.report_major_alm(alarm_content)
+ alarm_content = {
+ "driver_name": f"{metric_name.get_disk_name()}",
+ "reason": "disk_slow",
+ "block_stack": f"{metric_name.get_stage_name()}",
+ "io_type": f"{metric_name.get_io_access_type_name()}",
+ "alarm_source": "ai_block_io",
+ "alarm_type": "latency",
+ "details": f"current window is: {result[1]}, threshold is: {result[2]}.",
+ }
+ Xalarm.major(alarm_content)
logging.warning(alarm_content)
            # Step 4: wait for the next detection cycle
diff --git a/src/python/sentryPlugins/ai_block_io/alarm_report.py b/src/python/sentryPlugins/ai_block_io/alarm_report.py
index 230c8cd..92bd6e3 100644
--- a/src/python/sentryPlugins/ai_block_io/alarm_report.py
+++ b/src/python/sentryPlugins/ai_block_io/alarm_report.py
@@ -9,41 +9,72 @@
# PURPOSE.
# See the Mulan PSL v2 for more details.
-from syssentry.result import ResultLevel, report_result
import logging
import json
+from xalarm.sentry_notify import (
+ xalarm_report,
+ MINOR_ALM,
+ MAJOR_ALM,
+ CRITICAL_ALM,
+ ALARM_TYPE_OCCUR,
+ ALARM_TYPE_RECOVER,
+)
+
+from syssentry.result import ResultLevel, report_result
+
-class AlarmReport:
+class Report:
TASK_NAME = "ai_block_io"
@staticmethod
def report_pass(info: str):
- report_result(AlarmReport.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info}))
- logging.info(f'Report {AlarmReport.TASK_NAME} PASS: {info}')
+ report_result(Report.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info}))
+ logging.info(f'Report {Report.TASK_NAME} PASS: {info}')
@staticmethod
def report_fail(info: str):
- report_result(AlarmReport.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info}))
- logging.info(f'Report {AlarmReport.TASK_NAME} FAIL: {info}')
+ report_result(Report.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info}))
+ logging.info(f'Report {Report.TASK_NAME} FAIL: {info}')
@staticmethod
def report_skip(info: str):
- report_result(AlarmReport.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info}))
- logging.info(f'Report {AlarmReport.TASK_NAME} SKIP: {info}')
+ report_result(Report.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info}))
+ logging.info(f'Report {Report.TASK_NAME} SKIP: {info}')
+
+
+class Xalarm:
+ ALARM_ID = 1002
@staticmethod
- def report_minor_alm(info: str):
- report_result(AlarmReport.TASK_NAME, ResultLevel.MINOR_ALM, json.dumps({"msg": info}))
- logging.info(f'Report {AlarmReport.TASK_NAME} MINOR_ALM: {info}')
+ def minor(info: dict):
+ info_str = json.dumps(info)
+ xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_OCCUR, info_str)
+ logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM: {info_str}")
@staticmethod
- def report_major_alm(info: str):
- report_result(AlarmReport.TASK_NAME, ResultLevel.MAJOR_ALM, json.dumps({"msg": info}))
- logging.info(f'Report {AlarmReport.TASK_NAME} MAJOR_ALM: {info}')
+ def major(info: dict):
+ info_str = json.dumps(info)
+ xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_OCCUR, info_str)
+ logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM: {info_str}")
@staticmethod
- def report_critical_alm(info: str):
- report_result(AlarmReport.TASK_NAME, ResultLevel.CRITICAL_ALM, json.dumps({"msg": info}))
- logging.info(f'Report {AlarmReport.TASK_NAME} CRITICAL_ALM: {info}')
+ def critical(info: dict):
+ info_str = json.dumps(info)
+ xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_OCCUR, info_str)
+ logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM: {info_str}")
+
+    @staticmethod
+    def minor_recover(info: dict):
+ info_str = json.dumps(info)
+ xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_RECOVER, info_str)
+ logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM Recover: {info_str}")
+
+    @staticmethod
+    def major_recover(info: dict):
+ info_str = json.dumps(info)
+ xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_RECOVER, info_str)
+ logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM Recover: {info_str}")
+
+    @staticmethod
+    def critical_recover(info: dict):
+ info_str = json.dumps(info)
+ xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_RECOVER, info_str)
+ logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM Recover: {info_str}")
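
A short usage sketch of this wrapper from a plugin's point of view (the payload keys mirror the ai_block_io convention shown earlier; they are not enforced by the API):

```python
from alarm_report import Xalarm  # inside the plugin package: from .alarm_report import Xalarm

event = {
    "alarm_source": "ai_block_io",
    "driver_name": "sda",
    "reason": "disk_slow",
}
Xalarm.major(event)          # raise: MAJOR_ALM + ALARM_TYPE_OCCUR
# ... once latency returns to normal ...
Xalarm.major_recover(event)  # clear: MAJOR_ALM + ALARM_TYPE_RECOVER
```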
diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py
index 01c5315..c7679cd 100644
--- a/src/python/sentryPlugins/ai_block_io/data_access.py
+++ b/src/python/sentryPlugins/ai_block_io/data_access.py
@@ -42,10 +42,11 @@ def check_collect_valid(period):
data = json.loads(data_raw["message"])
except Exception as e:
logging.warning(f"get io data failed, {e}")
- return []
+ return None
return [k for k in data.keys()]
else:
- return []
+ logging.warning(f"get io data failed, return {data_raw}")
+ return None
def _get_raw_data(period, disk_list):
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
index a48144f..0ed282b 100644
--- a/src/python/sentryPlugins/ai_block_io/detector.py
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
@@ -35,7 +35,7 @@ class Detector:
self._count += 1
if self._count % 15 == 0:
self._count = 0
- logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.")
+ logging.debug(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.")
logging.debug(f'enter Detector: {self}')
metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name)
if metric_value is None:
--
2.23.0
From 1e13bc31ae3aa94f36aa124eefdfc8773221eacd Mon Sep 17 00:00:00 2001
From: 贺有志 <1037617413@qq.com>
Date: Mon, 14 Oct 2024 23:16:46 +0800
Subject: [PATCH] ai_block_io fix some bugs
---
.../sentryPlugins/ai_block_io/ai_block_io.py | 1 +
.../ai_block_io/config_parser.py | 20 ++++++++++---------
.../sentryPlugins/ai_block_io/detector.py | 18 ++++++++++++-----
.../sentryPlugins/ai_block_io/io_data.py | 2 +-
.../sentryPlugins/ai_block_io/threshold.py | 17 +++++++++-------
5 files changed, 36 insertions(+), 22 deletions(-)
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
index dd661a1..4eecd43 100644
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
@@ -55,6 +55,7 @@ class SlowIODetection:
Report.report_pass(
"get available disk error, please check if the collector plug is enable. exiting..."
)
+ logging.critical("get available disk error, please check if the collector plug is enable. exiting...")
exit(1)
logging.info(f"ai_block_io plug has found disks: {self._disk_list}")
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
index 3388cd4..7b0cd29 100644
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
@@ -190,7 +190,7 @@ class ConfigParser:
self._conf["common"]["disk"] = disk_list
def _read_train_data_duration(self, items_algorithm: dict):
- self._conf["common"]["train_data_duration"] = self._get_config_value(
+ self._conf["algorithm"]["train_data_duration"] = self._get_config_value(
items_algorithm,
"train_data_duration",
float,
@@ -203,17 +203,17 @@ class ConfigParser:
default_train_update_duration = self.DEFAULT_CONF["algorithm"][
"train_update_duration"
]
- if default_train_update_duration > self._conf["common"]["train_data_duration"]:
+ if default_train_update_duration > self._conf["algorithm"]["train_data_duration"]:
default_train_update_duration = (
- self._conf["common"]["train_data_duration"] / 2
+ self._conf["algorithm"]["train_data_duration"] / 2
)
- self._conf["common"]["train_update_duration"] = self._get_config_value(
+ self._conf["algorithm"]["train_update_duration"] = self._get_config_value(
items_algorithm,
"train_update_duration",
float,
default_train_update_duration,
gt=0,
- le=self._conf["common"]["train_data_duration"],
+ le=self._conf["algorithm"]["train_data_duration"],
)
def _read_algorithm_type_and_parameter(self, items_algorithm: dict):
@@ -401,6 +401,8 @@ class ConfigParser:
self._read_stage(items_common)
self._read_iotype(items_common)
else:
+ self._conf["common"]["stage"] = ALL_STAGE_LIST
+ self._conf["common"]["iotype"] = ALL_IOTPYE_LIST
logging.warning(
"common section parameter not found, it will be set to default value."
)
@@ -511,8 +513,8 @@ class ConfigParser:
def get_train_data_duration_and_train_update_duration(self):
return (
- self._conf["common"]["train_data_duration"],
- self._conf["common"]["train_update_duration"],
+ self._conf["algorithm"]["train_data_duration"],
+ self._conf["algorithm"]["train_update_duration"],
)
def get_window_size_and_window_minimum_threshold(self):
@@ -535,11 +537,11 @@ class ConfigParser:
@property
def train_data_duration(self):
- return self._conf["common"]["train_data_duration"]
+ return self._conf["algorithm"]["train_data_duration"]
@property
def train_update_duration(self):
- return self._conf["common"]["train_update_duration"]
+ return self._conf["algorithm"]["train_update_duration"]
@property
def window_size(self):
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
index 87bd1dd..5b21714 100644
--- a/src/python/sentryPlugins/ai_block_io/detector.py
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
@@ -9,6 +9,7 @@
# PURPOSE.
# See the Mulan PSL v2 for more details.
import logging
+from datetime import datetime
from .io_data import MetricName
from .threshold import Threshold
@@ -21,18 +22,25 @@ class Detector:
def __init__(self, metric_name: MetricName, threshold: Threshold, sliding_window: SlidingWindow):
self._metric_name = metric_name
self._threshold = threshold
+        # keep the metric name so threshold updates can be logged together with it
+ self._threshold.set_metric_name(self._metric_name)
self._slidingWindow = sliding_window
self._threshold.attach_observer(self._slidingWindow)
- self._count = 0
+ self._count = None
def get_metric_name(self):
return self._metric_name
def is_slow_io_event(self, io_data_dict_with_disk_name: dict):
- self._count += 1
- if self._count % 15 == 0:
- self._count = 0
- logging.debug(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.")
+ if self._count is None:
+ self._count = datetime.now()
+ else:
+ now_time = datetime.now()
+ time_diff = (now_time - self._count).total_seconds()
+ if time_diff >= 60:
+ logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.")
+ self._count = None
+
logging.debug(f'enter Detector: {self}')
metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name)
if metric_value is None:
diff --git a/src/python/sentryPlugins/ai_block_io/io_data.py b/src/python/sentryPlugins/ai_block_io/io_data.py
index d341b55..6042911 100644
--- a/src/python/sentryPlugins/ai_block_io/io_data.py
+++ b/src/python/sentryPlugins/ai_block_io/io_data.py
@@ -48,7 +48,7 @@ class IOData:
@dataclass(frozen=True)
class MetricName:
disk_name: str
- disk_type: str
+ disk_type: int
stage_name: str
io_access_type_name: str
metric_name: str
diff --git a/src/python/sentryPlugins/ai_block_io/threshold.py b/src/python/sentryPlugins/ai_block_io/threshold.py
index 3b7a5a8..600d041 100644
--- a/src/python/sentryPlugins/ai_block_io/threshold.py
+++ b/src/python/sentryPlugins/ai_block_io/threshold.py
@@ -23,11 +23,6 @@ class ThresholdState(Enum):
class Threshold:
- threshold = None
- data_queue: queue.Queue = None
- data_queue_update_size: int = None
- new_data_size: int = None
- threshold_state: ThresholdState = None
def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000):
self._observer = None
@@ -36,12 +31,16 @@ class Threshold:
self.new_data_size = 0
self.threshold_state = ThresholdState.INIT
self.threshold = math.inf
+ self.metric_name = None
def set_threshold(self, threshold):
self.threshold = threshold
self.threshold_state = ThresholdState.START
self.notify_observer()
+ def set_metric_name(self, metric_name):
+ self.metric_name = metric_name
+
def get_threshold(self):
if self.threshold_state == ThresholdState.INIT:
return None
@@ -84,6 +83,7 @@ class BoxplotThreshold(Threshold):
self.parameter = boxplot_parameter
def _update_threshold(self):
+ old_threshold = self.threshold
data = list(self.data_queue.queue)
q1 = np.percentile(data, 25)
q3 = np.percentile(data, 75)
@@ -91,6 +91,7 @@ class BoxplotThreshold(Threshold):
self.threshold = q3 + self.parameter * iqr
if self.threshold_state == ThresholdState.INIT:
self.threshold_state = ThresholdState.START
+        logging.info(f"MetricName: [{self.metric_name}]'s threshold updated: {old_threshold} -> {self.threshold}")
self.notify_observer()
def push_latest_data_to_queue(self, data):
@@ -109,7 +110,7 @@ class BoxplotThreshold(Threshold):
self.new_data_size = 0
def __repr__(self):
- return f"[BoxplotThreshold, param is: {self.parameter}]"
+ return f"[BoxplotThreshold, param is: {self.parameter}, train_size: {self.data_queue.maxsize}, update_size: {self.data_queue_update_size}]"
class NSigmaThreshold(Threshold):
@@ -118,12 +119,14 @@ class NSigmaThreshold(Threshold):
self.parameter = n_sigma_parameter
def _update_threshold(self):
+ old_threshold = self.threshold
data = list(self.data_queue.queue)
mean = np.mean(data)
std = np.std(data)
self.threshold = mean + self.parameter * std
if self.threshold_state == ThresholdState.INIT:
self.threshold_state = ThresholdState.START
+        logging.info(f"MetricName: [{self.metric_name}]'s threshold updated: {old_threshold} -> {self.threshold}")
self.notify_observer()
def push_latest_data_to_queue(self, data):
@@ -142,7 +145,7 @@ class NSigmaThreshold(Threshold):
self.new_data_size = 0
def __repr__(self):
- return f"[NSigmaThreshold, param is: {self.parameter}]"
+ return f"[NSigmaThreshold, param is: {self.parameter}, train_size: {self.data_queue.maxsize}, update_size: {self.data_queue_update_size}]"
class ThresholdType(Enum):
--
2.23.0
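
For reference, the boxplot rule that _update_threshold applies, as a standalone computation (numpy only; the patch takes the multiplier from its constructor, 1.5 here is just the conventional default):

```python
import numpy as np

data = [1.0, 1.2, 0.9, 1.1, 1.3, 5.0]  # sample latency window
q1 = np.percentile(data, 25)
q3 = np.percentile(data, 75)
iqr = q3 - q1                  # interquartile range
threshold = q3 + 1.5 * iqr     # values above this are flagged as slow IO
print(float(threshold))
```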
From f3a0738061e852c8125513f6222b4a5d6ea73270 Mon Sep 17 00:00:00 2001
From: 贺有志 <1037617413@qq.com>
Date: Fri, 25 Oct 2024 15:34:25 +0800
Subject: [PATCH] ai_block_io fix some config parameters parse bug
---
.../sentryPlugins/ai_block_io/ai_block_io.py | 70 +++++----
.../ai_block_io/config_parser.py | 135 ++++++++++++++----
.../sentryPlugins/ai_block_io/data_access.py | 14 ++
.../sentryPlugins/ai_block_io/detector.py | 16 ++-
.../ai_block_io/sliding_window.py | 2 +-
.../sentryPlugins/ai_block_io/threshold.py | 14 +-
src/python/sentryPlugins/ai_block_io/utils.py | 2 -
7 files changed, 180 insertions(+), 73 deletions(-)
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
index 74f246a..14f740d 100644
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
@@ -23,6 +23,7 @@ from .data_access import (
get_io_data_from_collect_plug,
check_collect_valid,
get_disk_type,
+ check_disk_is_available
)
from .io_data import MetricName
from .alarm_report import Xalarm, Report
@@ -31,14 +32,14 @@ CONFIG_FILE = "/etc/sysSentry/plugins/ai_block_io.ini"
def sig_handler(signum, frame):
- logging.info("receive signal: %d", signum)
Report.report_pass(f"receive signal: {signum}, exiting...")
+ logging.info("Finished ai_block_io plugin running.")
exit(signum)
class SlowIODetection:
_config_parser = None
- _disk_list = None
+ _disk_list = []
_detector_name_list = defaultdict(list)
_disk_detectors = {}
@@ -48,32 +49,30 @@ class SlowIODetection:
self.__init_detector()
def __init_detector_name_list(self):
- self._disk_list = check_collect_valid(
- self._config_parser.period_time
- )
- if self._disk_list is None:
- Report.report_pass(
- "get available disk error, please check if the collector plug is enable. exiting..."
- )
- logging.critical("get available disk error, please check if the collector plug is enable. exiting...")
- exit(1)
-
- logging.info(f"ai_block_io plug has found disks: {self._disk_list}")
disks: list = self._config_parser.disks_to_detection
stages: list = self._config_parser.stage
iotypes: list = self._config_parser.iotype
-        # Case 1: None -> enable detection on all disks
-        # Case 2: not None and empty -> detect no disks
-        # Case 3: otherwise -> take the intersection with available disks
+
if disks is None:
- logging.warning(
- "you not specify any disk or use default, so ai_block_io will enable all available disk."
- )
- for disk in self._disk_list:
- if disks is not None:
- if disk not in disks:
- continue
- disks.remove(disk)
+            logging.warning("no disk specified or 'default' used, so ai_block_io will enable all available disks.")
+ all_available_disk_list = check_collect_valid(self._config_parser.period_time)
+ if all_available_disk_list is None:
+                Report.report_pass("get available disk error, please check if the collector plugin is enabled. exiting...")
+                logging.critical("get available disk error, please check if the collector plugin is enabled. exiting...")
+ exit(1)
+ if len(all_available_disk_list) == 0:
+                Report.report_pass("no available disk found. exiting...")
+                logging.critical("no available disk found. exiting...")
+ exit(1)
+ disks = all_available_disk_list
+            logging.info(f"available disk list is as follows: {disks}.")
+
+ for disk in disks:
+ tmp_disk = [disk]
+ ret = check_disk_is_available(self._config_parser.period_time, tmp_disk)
+ if not ret:
+ logging.warning(f"disk: {disk} is not available, it will be ignored.")
+ continue
disk_type_result = get_disk_type(disk)
if disk_type_result["ret"] == 0 and disk_type_result["message"] in (
@@ -89,20 +88,15 @@ class SlowIODetection:
disk_type_result,
)
continue
+ self._disk_list.append(disk)
for stage in stages:
for iotype in iotypes:
self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "latency"))
self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "io_dump"))
- if disks:
- logging.warning(
- "disks: %s not in available disk list, so they will be ignored.",
- disks,
- )
+
if not self._detector_name_list:
+ Report.report_pass("the disks to detection is empty, ai_block_io will exit.")
logging.critical("the disks to detection is empty, ai_block_io will exit.")
- Report.report_pass(
- "the disks to detection is empty, ai_block_io will exit."
- )
exit(1)
def __init_detector(self):
@@ -202,16 +196,20 @@ class SlowIODetection:
logging.debug("step3. Report slow io event to sysSentry.")
for slow_io_event in slow_io_event_list:
alarm_content = {
+ "alarm_source": "ai_block_io",
"driver_name": slow_io_event[1],
+ "io_type": slow_io_event[4],
"reason": slow_io_event[2],
"block_stack": slow_io_event[3],
- "io_type": slow_io_event[4],
- "alarm_source": "ai_block_io",
"alarm_type": slow_io_event[5],
- "details": slow_io_event[6],
+ "details": slow_io_event[6]
}
Xalarm.major(alarm_content)
- logging.warning("[SLOW IO] " + str(alarm_content))
+ tmp_alarm_content = alarm_content.copy()
+ del tmp_alarm_content["details"]
+ logging.warning("[SLOW IO] " + str(tmp_alarm_content))
+            logging.warning("latency: " + str(alarm_content.get("details").get("latency")))
+            logging.warning("iodump: " + str(alarm_content.get("details").get("iodump")))
            # Step 4: wait for the next detection cycle
logging.debug("step4. Wait to start next slow io event detection loop.")
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
index 91ec5c6..3049db2 100644
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
@@ -105,21 +105,26 @@ class ConfigParser:
ge=None,
lt=None,
le=None,
+ section=None
):
+ if section is not None:
+ print_key = section + "." + key
+ else:
+ print_key = key
value = config_items.get(key)
if value is None:
logging.warning(
"config of %s not found, the default value %s will be used.",
- key,
+ print_key,
default_value,
)
value = default_value
if not value:
logging.critical(
- "the value of %s is empty, ai_block_io plug will exit.", key
+ "the value of %s is empty, ai_block_io plug will exit.", print_key
)
Report.report_pass(
- f"the value of {key} is empty, ai_block_io plug will exit."
+ f"the value of {print_key} is empty, ai_block_io plug will exit."
)
exit(1)
try:
@@ -127,51 +132,51 @@ class ConfigParser:
except ValueError:
logging.critical(
"the value of %s is not a valid %s, ai_block_io plug will exit.",
- key,
+ print_key,
value_type,
)
Report.report_pass(
- f"the value of {key} is not a valid {value_type}, ai_block_io plug will exit."
+ f"the value of {print_key} is not a valid {value_type}, ai_block_io plug will exit."
)
exit(1)
if gt is not None and value <= gt:
logging.critical(
"the value of %s is not greater than %s, ai_block_io plug will exit.",
- key,
+ print_key,
gt,
)
Report.report_pass(
- f"the value of {key} is not greater than {gt}, ai_block_io plug will exit."
+ f"the value of {print_key} is not greater than {gt}, ai_block_io plug will exit."
)
exit(1)
if ge is not None and value < ge:
logging.critical(
"the value of %s is not greater than or equal to %s, ai_block_io plug will exit.",
- key,
+ print_key,
ge,
)
Report.report_pass(
- f"the value of {key} is not greater than or equal to {ge}, ai_block_io plug will exit."
+ f"the value of {print_key} is not greater than or equal to {ge}, ai_block_io plug will exit."
)
exit(1)
if lt is not None and value >= lt:
logging.critical(
"the value of %s is not less than %s, ai_block_io plug will exit.",
- key,
+ print_key,
lt,
)
Report.report_pass(
- f"the value of {key} is not less than {lt}, ai_block_io plug will exit."
+ f"the value of {print_key} is not less than {lt}, ai_block_io plug will exit."
)
exit(1)
if le is not None and value > le:
logging.critical(
"the value of %s is not less than or equal to %s, ai_block_io plug will exit.",
- key,
+ print_key,
le,
)
Report.report_pass(
- f"the value of {key} is not less than or equal to {le}, ai_block_io plug will exit."
+ f"the value of {print_key} is not less than or equal to {le}, ai_block_io plug will exit."
)
exit(1)
@@ -188,7 +193,7 @@ class ConfigParser:
frequency = self._conf["common"]["period_time"]
ret = check_detect_frequency_is_valid(frequency)
if ret is None:
- log = f"period_time: {frequency} is valid, "\
+ log = f"period_time: {frequency} is invalid, "\
f"Check whether the value range is too large or is not an "\
f"integer multiple of period_time.. exiting..."
Report.report_pass(log)
@@ -202,6 +207,7 @@ class ConfigParser:
self._conf["common"]["disk"] = None
return
disks_to_detection = disks_to_detection.strip()
+ disks_to_detection = disks_to_detection.lower()
if not disks_to_detection:
logging.critical("the value of disk is empty, ai_block_io plug will exit.")
Report.report_pass(
@@ -213,7 +219,18 @@ class ConfigParser:
if len(disk_list) == 1 and disk_list[0] == "default":
self._conf["common"]["disk"] = None
return
- self._conf["common"]["disk"] = disk_list
+ if len(disk_list) > 10:
+ ten_disk_list = disk_list[0:10]
+ other_disk_list = disk_list[10:]
+            logging.warning(f"at most 10 disks are supported; disks: {ten_disk_list} will be retained, others: {other_disk_list} will be ignored.")
+ else:
+ ten_disk_list = disk_list
+ set_ten_disk_list = set(ten_disk_list)
+ if len(ten_disk_list) > len(set_ten_disk_list):
+ tmp = ten_disk_list
+ ten_disk_list = list(set_ten_disk_list)
+            logging.warning(f"duplicate disks found, deduplicating: before: {tmp}, after: {ten_disk_list}")
+ self._conf["common"]["disk"] = ten_disk_list
def _read_train_data_duration(self, items_algorithm: dict):
self._conf["algorithm"]["train_data_duration"] = self._get_config_value(
@@ -244,10 +261,12 @@ class ConfigParser:
def _read_algorithm_type_and_parameter(self, items_algorithm: dict):
algorithm_type = items_algorithm.get("algorithm_type")
- if algorithm_type is not None:
- self._conf["algorithm"]["algorithm_type"] = get_threshold_type_enum(
- algorithm_type
- )
+ if algorithm_type is None:
+ default_algorithm_type = self._conf["algorithm"]["algorithm_type"]
+            logging.warning(f"algorithm_type not found, it will be set to the default: {default_algorithm_type}")
+ else:
+ self._conf["algorithm"]["algorithm_type"] = get_threshold_type_enum(algorithm_type)
+
if self._conf["algorithm"]["algorithm_type"] is None:
logging.critical(
"the algorithm_type: %s you set is invalid. ai_block_io plug will exit.",
@@ -257,6 +276,7 @@ class ConfigParser:
f"the algorithm_type: {algorithm_type} you set is invalid. ai_block_io plug will exit."
)
exit(1)
+
elif self._conf["algorithm"]["algorithm_type"] == ThresholdType.NSigmaThreshold:
self._conf["algorithm"]["n_sigma_parameter"] = self._get_config_value(
items_algorithm,
@@ -279,9 +299,14 @@ class ConfigParser:
)
def _read_stage(self, items_algorithm: dict):
- stage_str = items_algorithm.get(
- "stage", self.DEFAULT_CONF["common"]["stage"]
- ).strip()
+ stage_str = items_algorithm.get("stage")
+ if stage_str is None:
+ stage_str = self.DEFAULT_CONF["common"]["stage"]
+            logging.warning(f"stage not found, it will be set to the default: {stage_str}")
+ else:
+ stage_str = stage_str.strip()
+
+ stage_str = stage_str.lower()
stage_list = stage_str.split(",")
stage_list = [stage.strip() for stage in stage_list]
if len(stage_list) == 1 and stage_list[0] == "":
@@ -307,9 +332,14 @@ class ConfigParser:
self._conf["common"]["stage"] = dup_stage_list
def _read_iotype(self, items_algorithm: dict):
- iotype_str = items_algorithm.get(
- "iotype", self.DEFAULT_CONF["common"]["iotype"]
- ).strip()
+ iotype_str = items_algorithm.get("iotype")
+ if iotype_str is None:
+ iotype_str = self.DEFAULT_CONF["common"]["iotype"]
+ logging.warning(f"iotype not found, it will be set default: {iotype_str}")
+ else:
+ iotype_str = iotype_str.strip()
+
+ iotype_str = iotype_str.lower()
iotype_list = iotype_str.split(",")
iotype_list = [iotype.strip() for iotype in iotype_list]
if len(iotype_list) == 1 and iotype_list[0] == "":
@@ -333,6 +363,13 @@ class ConfigParser:
def _read_sliding_window_type(self, items_sliding_window: dict):
sliding_window_type = items_sliding_window.get("win_type")
+
+ if sliding_window_type is None:
+ default_sliding_window_type = self._conf["algorithm"]["win_type"]
+ logging.warning(f"win_type not found, it will be set default: {default_sliding_window_type}")
+ return
+
+ sliding_window_type = sliding_window_type.strip()
if sliding_window_type is not None:
self._conf["algorithm"]["win_type"] = (
get_sliding_window_type_enum(sliding_window_type)
@@ -439,6 +476,7 @@ class ConfigParser:
int,
self.DEFAULT_CONF["latency_sata_ssd"]["read_tot_lim"],
gt=0,
+ section="latency_sata_ssd"
)
self._conf["latency_sata_ssd"]["write_tot_lim"] = self._get_config_value(
items_latency_sata_ssd,
@@ -446,21 +484,32 @@ class ConfigParser:
int,
self.DEFAULT_CONF["latency_sata_ssd"]["write_tot_lim"],
gt=0,
+ section="latency_sata_ssd"
)
self._conf["latency_sata_ssd"]["read_avg_lim"] = self._get_config_value(
items_latency_sata_ssd,
"read_avg_lim",
int,
self.DEFAULT_CONF["latency_sata_ssd"]["read_avg_lim"],
- gt=0
+ gt=0,
+ section="latency_sata_ssd"
)
self._conf["latency_sata_ssd"]["write_avg_lim"] = self._get_config_value(
items_latency_sata_ssd,
"write_avg_lim",
int,
self.DEFAULT_CONF["latency_sata_ssd"]["write_avg_lim"],
- gt=0
+ gt=0,
+ section="latency_sata_ssd"
)
+ if self._conf["latency_sata_ssd"]["read_avg_lim"] >= self._conf["latency_sata_ssd"]["read_tot_lim"]:
+ Report.report_pass("latency_sata_ssd.read_avg_lim must < latency_sata_ssd.read_tot_lim . exiting...")
+ logging.critical("latency_sata_ssd.read_avg_lim must < latency_sata_ssd.read_tot_lim . exiting...")
+ exit(1)
+ if self._conf["latency_sata_ssd"]["write_avg_lim"] >= self._conf["latency_sata_ssd"]["write_tot_lim"]:
+ Report.report_pass("latency_sata_ssd.write_avg_lim must < latency_sata_ssd.write_tot_lim . exiting...")
+ logging.critical("latency_sata_ssd.read_avg_lim must < latency_sata_ssd.read_tot_lim . exiting...")
+ exit(1)
else:
Report.report_pass("not found latency_sata_ssd section. exiting...")
logging.critical("not found latency_sata_ssd section. exiting...")
@@ -474,6 +523,7 @@ class ConfigParser:
int,
self.DEFAULT_CONF["latency_nvme_ssd"]["read_tot_lim"],
gt=0,
+ section="latency_nvme_ssd"
)
self._conf["latency_nvme_ssd"]["write_tot_lim"] = self._get_config_value(
items_latency_nvme_ssd,
@@ -481,21 +531,32 @@ class ConfigParser:
int,
self.DEFAULT_CONF["latency_nvme_ssd"]["write_tot_lim"],
gt=0,
+ section="latency_nvme_ssd"
)
self._conf["latency_nvme_ssd"]["read_avg_lim"] = self._get_config_value(
items_latency_nvme_ssd,
"read_avg_lim",
int,
self.DEFAULT_CONF["latency_nvme_ssd"]["read_avg_lim"],
- gt=0
+ gt=0,
+ section="latency_nvme_ssd"
)
self._conf["latency_nvme_ssd"]["write_avg_lim"] = self._get_config_value(
items_latency_nvme_ssd,
"write_avg_lim",
int,
self.DEFAULT_CONF["latency_nvme_ssd"]["write_avg_lim"],
- gt=0
+ gt=0,
+ section="latency_nvme_ssd"
)
+ if self._conf["latency_nvme_ssd"]["read_avg_lim"] >= self._conf["latency_nvme_ssd"]["read_tot_lim"]:
+ Report.report_pass("latency_nvme_ssd.read_avg_lim must < latency_nvme_ssd.read_tot_lim . exiting...")
+ logging.critical("latency_nvme_ssd.read_avg_lim must < latency_nvme_ssd.read_tot_lim . exiting...")
+ exit(1)
+ if self._conf["latency_nvme_ssd"]["write_avg_lim"] >= self._conf["latency_nvme_ssd"]["write_tot_lim"]:
+ Report.report_pass("latency_nvme_ssd.write_avg_lim must < latency_nvme_ssd.write_tot_lim . exiting...")
+ logging.critical("latency_nvme_ssd.write_avg_lim must < latency_nvme_ssd.write_tot_lim . exiting...")
+ exit(1)
else:
Report.report_pass("not found latency_nvme_ssd section. exiting...")
logging.critical("not found latency_nvme_ssd section. exiting...")
@@ -509,6 +570,7 @@ class ConfigParser:
int,
self.DEFAULT_CONF["latency_sata_hdd"]["read_tot_lim"],
gt=0,
+ section="latency_sata_hdd"
)
self._conf["latency_sata_hdd"]["write_tot_lim"] = self._get_config_value(
items_latency_sata_hdd,
@@ -516,21 +578,32 @@ class ConfigParser:
int,
self.DEFAULT_CONF["latency_sata_hdd"]["write_tot_lim"],
gt=0,
+ section="latency_sata_hdd"
)
self._conf["latency_sata_hdd"]["read_avg_lim"] = self._get_config_value(
items_latency_sata_hdd,
"read_avg_lim",
int,
self.DEFAULT_CONF["latency_sata_hdd"]["read_avg_lim"],
- gt=0
+ gt=0,
+ section="latency_sata_hdd"
)
self._conf["latency_sata_hdd"]["write_avg_lim"] = self._get_config_value(
items_latency_sata_hdd,
"write_avg_lim",
int,
self.DEFAULT_CONF["latency_sata_hdd"]["write_avg_lim"],
- gt=0
+ gt=0,
+ section="latency_sata_hdd"
)
+ if self._conf["latency_sata_hdd"]["read_avg_lim"] >= self._conf["latency_sata_hdd"]["read_tot_lim"]:
+ Report.report_pass("latency_sata_hdd.read_avg_lim must < latency_sata_hdd.read_tot_lim . exiting...")
+ logging.critical("latency_sata_hdd.read_avg_lim must < latency_sata_hdd.read_tot_lim . exiting...")
+ exit(1)
+ if self._conf["latency_sata_hdd"]["write_avg_lim"] >= self._conf["latency_sata_hdd"]["write_tot_lim"]:
+ Report.report_pass("latency_sata_hdd.write_avg_lim must < latency_sata_hdd.write_tot_lim . exiting...")
+ logging.critical("latency_sata_hdd.write_avg_lim must < latency_sata_hdd.write_tot_lim . exiting...")
+ exit(1)
else:
Report.report_pass("not found latency_sata_hdd section. exiting...")
logging.critical("not found latency_sata_hdd section. exiting...")
diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py
index e4869d5..2f2d607 100644
--- a/src/python/sentryPlugins/ai_block_io/data_access.py
+++ b/src/python/sentryPlugins/ai_block_io/data_access.py
@@ -67,6 +67,20 @@ def check_detect_frequency_is_valid(period):
return None
+def check_disk_is_available(period_time, disk):
+ data_raw = is_iocollect_valid(period_time, disk)
+ if data_raw["ret"] == 0:
+ try:
+ data = json.loads(data_raw["message"])
+ except Exception as e:
+ return False
+ if not data:
+ return False
+ return True
+ else:
+ return False
+
+
def _get_raw_data(period, disk_list):
return get_io_data(
period,
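check_disk_is_available() above accepts a disk only when the collector answers ret == 0 with a non-empty JSON payload. A self-contained sketch of the same logic, with is_iocollect_valid() stubbed out (the real one lives in the collector module; this sketch assumes it returns {"ret": int, "message": json-string}):

```python
import json

def is_iocollect_valid(period_time, disk):
    # stub standing in for the collector call, for illustration only
    return {"ret": 0, "message": json.dumps({"sda": ["bio", "rq_driver"]})}

def check_disk_is_available(period_time, disk) -> bool:
    data_raw = is_iocollect_valid(period_time, disk)
    if data_raw["ret"] != 0:
        return False
    try:
        data = json.loads(data_raw["message"])
    except (ValueError, TypeError):
        return False
    return bool(data)

print(check_disk_is_available(1, "sda"))  # True with the stub above
```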
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
index e3a0952..496e032 100644
--- a/src/python/sentryPlugins/ai_block_io/detector.py
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
@@ -75,6 +75,18 @@ class Detector:
f' sliding_window_type: {self._slidingWindow}')
+def set_to_str(parameter: set):
+ ret = ""
+ parameter = list(parameter)
+ length = len(parameter)
+ for i in range(length):
+ if i == 0:
+ ret += parameter[i]
+ else:
+ ret += "," + parameter[i]
+ return ret
+
+
class DiskDetector:
def __init__(self, disk_name: str):
@@ -124,7 +136,7 @@ class DiskDetector:
alarm_type.add(metric_name.metric_name)
latency_wins, iodump_wins = self.get_detector_list_window()
- details = f"latency: {latency_wins}, iodump: {iodump_wins}"
+ details = {"latency": latency_wins, "iodump": iodump_wins}
io_press = {"throtl", "wbt", "iocost", "bfq"}
driver_slow = {"rq_driver"}
@@ -137,7 +149,7 @@ class DiskDetector:
elif not kernel_slow.isdisjoint(block_stack):
reason = "kernel_slow"
- return True, driver_name, reason, str(block_stack), str(io_type), str(alarm_type), details
+ return True, driver_name, reason, set_to_str(block_stack), set_to_str(io_type), set_to_str(alarm_type), details
def __repr__(self):
msg = f'disk: {self._disk_name}, '
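set_to_str() above concatenates set members by hand. For reference, a behavior-equivalent one-liner; sorted() is added here only to make the output deterministic, since set iteration order is arbitrary:

```python
def set_to_str(parameter: set) -> str:
    # join the members with commas, no trailing separator
    return ",".join(sorted(parameter))

print(set_to_str({"rq_driver", "bio"}))  # bio,rq_driver
```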
diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py
index 4083c43..ff3fa3b 100644
--- a/src/python/sentryPlugins/ai_block_io/sliding_window.py
+++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py
@@ -107,7 +107,7 @@ class MedianSlidingWindow(SlidingWindow):
if len(self._io_data_queue) < self._queue_length or (self._ai_threshold is None and self._abs_threshold is None):
is_slow_io_event = False
median = np.median(self._io_data_queue)
- if median >= self._ai_threshold:
+ if (self._ai_threshold is not None and median > self._ai_threshold) or (self._abs_threshold is not None and median > self._abs_threshold):
is_slow_io_event = True
return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim
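The MedianSlidingWindow fix above matters because io_dump detectors carry only an absolute threshold, so _ai_threshold can be None and the old `median >= self._ai_threshold` comparison would raise a TypeError. A minimal sketch of the corrected check:

```python
import numpy as np

def median_is_slow(queue, ai_threshold=None, abs_threshold=None) -> bool:
    # consult each threshold only when it is actually configured
    median = np.median(queue)
    return ((ai_threshold is not None and median > ai_threshold) or
            (abs_threshold is not None and median > abs_threshold))

print(median_is_slow([10, 20, 30], abs_threshold=15))  # True, no AI threshold needed
```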
diff --git a/src/python/sentryPlugins/ai_block_io/threshold.py b/src/python/sentryPlugins/ai_block_io/threshold.py
index 600d041..e202bb8 100644
--- a/src/python/sentryPlugins/ai_block_io/threshold.py
+++ b/src/python/sentryPlugins/ai_block_io/threshold.py
@@ -65,9 +65,12 @@ class Threshold:
def __repr__(self):
return "Threshold"
+ def __str__(self):
+ return "Threshold"
+
class AbsoluteThreshold(Threshold):
- def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000):
+ def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs):
super().__init__(data_queue_size, data_queue_update_size)
def push_latest_data_to_queue(self, data):
@@ -76,6 +79,9 @@ class AbsoluteThreshold(Threshold):
def __repr__(self):
return "[AbsoluteThreshold]"
+ def __str__(self):
+ return "absolute"
+
class BoxplotThreshold(Threshold):
def __init__(self, boxplot_parameter: float = 1.5, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs):
@@ -112,6 +118,9 @@ class BoxplotThreshold(Threshold):
def __repr__(self):
return f"[BoxplotThreshold, param is: {self.parameter}, train_size: {self.data_queue.maxsize}, update_size: {self.data_queue_update_size}]"
+ def __str__(self):
+ return "boxplot"
+
class NSigmaThreshold(Threshold):
def __init__(self, n_sigma_parameter: float = 3.0, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs):
@@ -147,6 +156,9 @@ class NSigmaThreshold(Threshold):
def __repr__(self):
return f"[NSigmaThreshold, param is: {self.parameter}, train_size: {self.data_queue.maxsize}, update_size: {self.data_queue_update_size}]"
+ def __str__(self):
+ return "n_sigma"
+
class ThresholdType(Enum):
AbsoluteThreshold = 0
diff --git a/src/python/sentryPlugins/ai_block_io/utils.py b/src/python/sentryPlugins/ai_block_io/utils.py
index d6f4067..7d2390b 100644
--- a/src/python/sentryPlugins/ai_block_io/utils.py
+++ b/src/python/sentryPlugins/ai_block_io/utils.py
@@ -19,8 +19,6 @@ from .io_data import MetricName, IOData
def get_threshold_type_enum(algorithm_type: str):
- if algorithm_type.lower() == "absolute":
- return ThresholdType.AbsoluteThreshold
if algorithm_type.lower() == "boxplot":
return ThresholdType.BoxplotThreshold
if algorithm_type.lower() == "n_sigma":
--
2.23.0
View File

@@ -1,98 +0,0 @@
From 8e4f39897dc8dc51cfa0bbf24667be1688876c15 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
Date: Mon, 21 Oct 2024 14:18:20 +0800
Subject: [PATCH] ai_block_io lack section exit
---
.../ai_block_io/config_parser.py | 40 +++++++++----------
1 file changed, 20 insertions(+), 20 deletions(-)
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
index 7b0cd29..447eccd 100644
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
@@ -401,11 +401,9 @@ class ConfigParser:
self._read_stage(items_common)
self._read_iotype(items_common)
else:
- self._conf["common"]["stage"] = ALL_STAGE_LIST
- self._conf["common"]["iotype"] = ALL_IOTPYE_LIST
- logging.warning(
- "common section parameter not found, it will be set to default value."
- )
+ Report.report_pass("not found common section. exiting...")
+ logging.critical("not found common section. exiting...")
+ exit(1)
if con.has_section("algorithm"):
items_algorithm = dict(con.items("algorithm"))
@@ -413,9 +411,9 @@ class ConfigParser:
self._read_train_update_duration(items_algorithm)
self._read_algorithm_type_and_parameter(items_algorithm)
else:
- logging.warning(
- "algorithm section parameter not found, it will be set to default value."
- )
+ Report.report_pass("not found algorithm section. exiting...")
+ logging.critical("not found algorithm section. exiting...")
+ exit(1)
if con.has_section("sliding_window"):
items_sliding_window = dict(con.items("sliding_window"))
@@ -423,9 +421,9 @@ class ConfigParser:
self._read_window_size(items_sliding_window)
self._read_window_minimum_threshold(items_sliding_window)
else:
- logging.warning(
- "sliding_window section parameter not found, it will be set to default value."
- )
+ Report.report_pass("not found sliding_window section. exiting...")
+ logging.critical("not found sliding_window section. exiting...")
+ exit(1)
if con.has_section("latency_sata_ssd"):
items_latency_sata_ssd = dict(con.items("latency_sata_ssd"))
@@ -444,9 +442,10 @@ class ConfigParser:
gt=0,
)
else:
- logging.warning(
- "latency_sata_ssd section parameter not found, it will be set to default value."
- )
+ Report.report_pass("not found latency_sata_ssd section. exiting...")
+ logging.critical("not found latency_sata_ssd section. exiting...")
+ exit(1)
+
if con.has_section("latency_nvme_ssd"):
items_latency_nvme_ssd = dict(con.items("latency_nvme_ssd"))
self._conf["latency_nvme_ssd"]["read_tot_lim"] = self._get_config_value(
@@ -464,9 +463,10 @@ class ConfigParser:
gt=0,
)
else:
- logging.warning(
- "latency_nvme_ssd section parameter not found, it will be set to default value."
- )
+ Report.report_pass("not found latency_nvme_ssd section. exiting...")
+ logging.critical("not found latency_nvme_ssd section. exiting...")
+ exit(1)
+
if con.has_section("latency_sata_hdd"):
items_latency_sata_hdd = dict(con.items("latency_sata_hdd"))
self._conf["latency_sata_hdd"]["read_tot_lim"] = self._get_config_value(
@@ -484,9 +484,9 @@ class ConfigParser:
gt=0,
)
else:
- logging.warning(
- "latency_sata_hdd section parameter not found, it will be set to default value."
- )
+ Report.report_pass("not found latency_sata_hdd section. exiting...")
+ logging.critical("not found latency_sata_hdd section. exiting...")
+ exit(1)
self.__print_all_config_value()
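Every branch of this patch replaces a warn-and-use-defaults fallback with a fatal error. The repeated pattern, condensed into one helper (Report.report_pass stubbed with print; the section list is taken from the hunks above):

```python
import configparser
import logging
import sys

REQUIRED_SECTIONS = ("common", "algorithm", "sliding_window",
                     "latency_sata_ssd", "latency_nvme_ssd", "latency_sata_hdd")

def check_required_sections(con: configparser.ConfigParser) -> None:
    for section in REQUIRED_SECTIONS:
        if not con.has_section(section):
            print(f"not found {section} section. exiting...")  # Report stub
            logging.critical("not found %s section. exiting...", section)
            sys.exit(1)

con = configparser.ConfigParser()
con.read_string("[common]\nperiod_time=1\n")
check_required_sections(con)  # exits here: algorithm section is missing
```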
--
2.23.0
View File

@@ -1,728 +0,0 @@
From cedd862d4e4a97a6c4fa13cbff2af452910ea5b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
Date: Thu, 24 Oct 2024 09:39:16 +0800
Subject: [PATCH] ai_block_io support absolute threshold lower limit
---
config/plugins/ai_block_io.ini | 19 +-
.../sentryPlugins/ai_block_io/ai_block_io.py | 36 ++--
.../sentryPlugins/ai_block_io/alarm_report.py | 18 +-
.../ai_block_io/config_parser.py | 168 ++++++++++++------
.../sentryPlugins/ai_block_io/detector.py | 92 ++++++----
.../ai_block_io/sliding_window.py | 21 ++-
6 files changed, 222 insertions(+), 132 deletions(-)
diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini
index 040237d..d0b1e74 100644
--- a/config/plugins/ai_block_io.ini
+++ b/config/plugins/ai_block_io.ini
@@ -2,9 +2,9 @@
level=info
[common]
-slow_io_detect_frequency=1
+period_time=1
disk=default
-stage=bio
+stage=default
iotype=read,write
[algorithm]
@@ -12,22 +12,25 @@ train_data_duration=24
train_update_duration=2
algorithm_type=boxplot
boxplot_parameter=1.5
-n_sigma_parameter=3
-
-[sliding_window]
-sliding_window_type=not_continuous
-window_size=30
-window_minimum_threshold=6
+win_type=not_continuous
+win_size=30
+win_threshold=6
[latency_sata_ssd]
+read_avg_lim=10000
+write_avg_lim=10000
read_tot_lim=50000
write_tot_lim=50000
[latency_nvme_ssd]
+read_avg_lim=300
+write_avg_lim=300
read_tot_lim=500
write_tot_lim=500
[latency_sata_hdd]
+read_avg_lim=15000
+write_avg_lim=15000
read_tot_lim=50000
write_tot_lim=50000
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
index f25e6d5..74f246a 100644
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
@@ -49,7 +49,7 @@ class SlowIODetection:
def __init_detector_name_list(self):
self._disk_list = check_collect_valid(
- self._config_parser.slow_io_detect_frequency
+ self._config_parser.period_time
)
if self._disk_list is None:
Report.report_pass(
@@ -109,7 +109,7 @@ class SlowIODetection:
train_data_duration, train_update_duration = (
self._config_parser.get_train_data_duration_and_train_update_duration()
)
- slow_io_detection_frequency = self._config_parser.slow_io_detect_frequency
+ slow_io_detection_frequency = self._config_parser.period_time
threshold_type = self._config_parser.algorithm_type
data_queue_size, update_size = get_data_queue_size_and_update_size(
train_data_duration, train_update_duration, slow_io_detection_frequency
@@ -131,10 +131,13 @@ class SlowIODetection:
data_queue_size=data_queue_size,
data_queue_update_size=update_size,
)
- abs_threshold = self._config_parser.get_tot_lim(
+ tot_lim = self._config_parser.get_tot_lim(
metric_name.disk_type, metric_name.io_access_type_name
)
- if abs_threshold is None:
+ avg_lim = self._config_parser.get_avg_lim(
+ metric_name.disk_type, metric_name.io_access_type_name
+ )
+ if tot_lim is None:
logging.warning(
"disk %s, disk type %s, io type %s, get tot lim error, so it will be ignored.",
disk,
@@ -145,7 +148,8 @@ class SlowIODetection:
sliding_window_type,
queue_length=window_size,
threshold=window_threshold,
- abs_threshold=abs_threshold,
+ abs_threshold=tot_lim,
+ avg_lim=avg_lim
)
detector = Detector(metric_name, threshold, sliding_window)
disk_detector.add_detector(detector)
@@ -176,7 +180,7 @@ class SlowIODetection:
# Step1: get IO data
io_data_dict_with_disk_name = get_io_data_from_collect_plug(
- self._config_parser.slow_io_detect_frequency, self._disk_list
+ self._config_parser.period_time, self._disk_list
)
logging.debug(f"step1. Get io data: {str(io_data_dict_with_disk_name)}")
if io_data_dict_with_disk_name is None:
@@ -197,25 +201,21 @@ class SlowIODetection:
# Step3: report slow IO events
logging.debug("step3. Report slow io event to sysSentry.")
for slow_io_event in slow_io_event_list:
- metric_name: MetricName = slow_io_event[1]
- window_info = slow_io_event[2]
- root_cause = slow_io_event[3]
alarm_content = {
- "driver_name": f"{metric_name.disk_name}",
- "reason": root_cause,
- "block_stack": f"{metric_name.stage_name}",
- "io_type": f"{metric_name.io_access_type_name}",
+ "driver_name": slow_io_event[1],
+ "reason": slow_io_event[2],
+ "block_stack": slow_io_event[3],
+ "io_type": slow_io_event[4],
"alarm_source": "ai_block_io",
- "alarm_type": "latency",
- "details": f"disk type: {metric_name.disk_type}, current window: {window_info[1]}, "
- f"ai threshold: {window_info[2]}, abs threshold: {window_info[3]}.",
+ "alarm_type": slow_io_event[5],
+ "details": slow_io_event[6],
}
Xalarm.major(alarm_content)
- logging.warning(alarm_content)
+ logging.warning("[SLOW IO] " + str(alarm_content))
# Step4: wait for the next detection period
logging.debug("step4. Wait to start next slow io event detection loop.")
- time.sleep(self._config_parser.slow_io_detect_frequency)
+ time.sleep(self._config_parser.period_time)
def main():
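The report loop now unpacks the flat tuple returned by DiskDetector.is_slow_io_event() positionally: (hit, driver_name, reason, block_stack, io_type, alarm_type, details). A sketch with made-up values showing how the fields land in the alarm payload:

```python
# hypothetical event tuple, in the order the detector returns it
slow_io_event = (True, "sda", "driver_slow", "bio,rq_driver", "read",
                 "latency", {"latency": {"read": {}}, "iodump": {"read": {}}})
alarm_content = {
    "driver_name": slow_io_event[1],
    "reason": slow_io_event[2],
    "block_stack": slow_io_event[3],
    "io_type": slow_io_event[4],
    "alarm_source": "ai_block_io",
    "alarm_type": slow_io_event[5],
    "details": slow_io_event[6],
}
print("[SLOW IO] " + str(alarm_content))
```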
diff --git a/src/python/sentryPlugins/ai_block_io/alarm_report.py b/src/python/sentryPlugins/ai_block_io/alarm_report.py
index 92bd6e3..61bb145 100644
--- a/src/python/sentryPlugins/ai_block_io/alarm_report.py
+++ b/src/python/sentryPlugins/ai_block_io/alarm_report.py
@@ -30,17 +30,17 @@ class Report:
@staticmethod
def report_pass(info: str):
report_result(Report.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info}))
- logging.info(f'Report {Report.TASK_NAME} PASS: {info}')
+ logging.debug(f'Report {Report.TASK_NAME} PASS: {info}')
@staticmethod
def report_fail(info: str):
report_result(Report.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info}))
- logging.info(f'Report {Report.TASK_NAME} FAIL: {info}')
+ logging.debug(f'Report {Report.TASK_NAME} FAIL: {info}')
@staticmethod
def report_skip(info: str):
report_result(Report.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info}))
- logging.info(f'Report {Report.TASK_NAME} SKIP: {info}')
+ logging.debug(f'Report {Report.TASK_NAME} SKIP: {info}')
class Xalarm:
@@ -50,31 +50,31 @@ class Xalarm:
def minor(info: dict):
info_str = json.dumps(info)
xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_OCCUR, info_str)
- logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM: {info_str}")
+ logging.debug(f"Report {Xalarm.ALARM_ID} MINOR_ALM: {info_str}")
@staticmethod
def major(info: dict):
info_str = json.dumps(info)
xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_OCCUR, info_str)
- logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM: {info_str}")
+ logging.debug(f"Report {Xalarm.ALARM_ID} MAJOR_ALM: {info_str}")
@staticmethod
def critical(info: dict):
info_str = json.dumps(info)
xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_OCCUR, info_str)
- logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM: {info_str}")
+ logging.debug(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM: {info_str}")
def minor_recover(info: dict):
info_str = json.dumps(info)
xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_RECOVER, info_str)
- logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM Recover: {info_str}")
+ logging.debug(f"Report {Xalarm.ALARM_ID} MINOR_ALM Recover: {info_str}")
def major_recover(info: dict):
info_str = json.dumps(info)
xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_RECOVER, info_str)
- logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM Recover: {info_str}")
+ logging.debug(f"Report {Xalarm.ALARM_ID} MAJOR_ALM Recover: {info_str}")
def critical_recover(info: dict):
info_str = json.dumps(info)
xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_RECOVER, info_str)
- logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM Recover: {info_str}")
+ logging.debug(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM Recover: {info_str}")
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
index 1117939..91ec5c6 100644
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
@@ -52,7 +52,7 @@ class ConfigParser:
DEFAULT_CONF = {
"log": {"level": "info"},
"common": {
- "slow_io_detect_frequency": 1,
+ "period_time": 1,
"disk": None,
"stage": "throtl,wbt,gettag,plug,deadline,hctx,requeue,rq_driver,bio",
"iotype": "read,write",
@@ -63,16 +63,32 @@ class ConfigParser:
"algorithm_type": get_threshold_type_enum("boxplot"),
"boxplot_parameter": 1.5,
"n_sigma_parameter": 3.0,
+ "win_type": get_sliding_window_type_enum("not_continuous"),
+ "win_size": 30,
+ "win_threshold": 6,
},
- "sliding_window": {
- "sliding_window_type": get_sliding_window_type_enum("not_continuous"),
- "window_size": 30,
- "window_minimum_threshold": 6,
+ "latency_sata_ssd": {
+ "read_avg_lim": 10000,
+ "write_avg_lim": 10000,
+ "read_tot_lim": 50000,
+ "write_tot_lim": 50000
},
- "latency_sata_ssd": {"read_tot_lim": 50000, "write_tot_lim": 50000},
- "latency_nvme_ssd": {"read_tot_lim": 500, "write_tot_lim": 500},
- "latency_sata_hdd": {"read_tot_lim": 50000, "write_tot_lim": 50000},
- "iodump": {"read_iodump_lim": 0, "write_iodump_lim": 0}
+ "latency_nvme_ssd": {
+ "read_avg_lim": 300,
+ "write_avg_lim": 300,
+ "read_tot_lim": 500,
+ "write_tot_lim": 500
+ },
+ "latency_sata_hdd": {
+ "read_avg_lim": 15000,
+ "write_avg_lim": 15000,
+ "read_tot_lim": 50000,
+ "write_tot_lim": 50000
+ },
+ "iodump": {
+ "read_iodump_lim": 0,
+ "write_iodump_lim": 0
+ }
}
def __init__(self, config_file_name):
@@ -161,18 +177,18 @@ class ConfigParser:
return value
- def _read_slow_io_detect_frequency(self, items_common: dict):
- self._conf["common"]["slow_io_detect_frequency"] = self._get_config_value(
+ def _read_period_time(self, items_common: dict):
+ self._conf["common"]["period_time"] = self._get_config_value(
items_common,
- "slow_io_detect_frequency",
+ "period_time",
int,
- self.DEFAULT_CONF["common"]["slow_io_detect_frequency"],
+ self.DEFAULT_CONF["common"]["period_time"],
gt=0
)
- frequency = self._conf["common"]["slow_io_detect_frequency"]
+ frequency = self._conf["common"]["period_time"]
ret = check_detect_frequency_is_valid(frequency)
if ret is None:
- log = f"slow io detect frequency: {frequency} is valid, "\
+ log = f"period_time: {frequency} is valid, "\
f"Check whether the value range is too large or is not an "\
f"integer multiple of period_time.. exiting..."
Report.report_pass(log)
@@ -316,50 +332,41 @@ class ConfigParser:
self._conf["common"]["iotype"] = dup_iotype_list
def _read_sliding_window_type(self, items_sliding_window: dict):
- sliding_window_type = items_sliding_window.get("sliding_window_type")
+ sliding_window_type = items_sliding_window.get("win_type")
if sliding_window_type is not None:
- self._conf["sliding_window"]["sliding_window_type"] = (
+ self._conf["algorithm"]["win_type"] = (
get_sliding_window_type_enum(sliding_window_type)
)
- if self._conf["sliding_window"]["sliding_window_type"] is None:
+ if self._conf["algorithm"]["win_type"] is None:
logging.critical(
- "the sliding_window_type: %s you set is invalid. ai_block_io plug will exit.",
+ "the win_type: %s you set is invalid. ai_block_io plug will exit.",
sliding_window_type,
)
Report.report_pass(
- f"the sliding_window_type: {sliding_window_type} you set is invalid. ai_block_io plug will exit."
+ f"the win_type: {sliding_window_type} you set is invalid. ai_block_io plug will exit."
)
exit(1)
def _read_window_size(self, items_sliding_window: dict):
- self._conf["sliding_window"]["window_size"] = self._get_config_value(
+ self._conf["algorithm"]["win_size"] = self._get_config_value(
items_sliding_window,
- "window_size",
+ "win_size",
int,
- self.DEFAULT_CONF["sliding_window"]["window_size"],
+ self.DEFAULT_CONF["algorithm"]["win_size"],
gt=0,
- le=3600,
+ le=300,
)
def _read_window_minimum_threshold(self, items_sliding_window: dict):
- default_window_minimum_threshold = self.DEFAULT_CONF["sliding_window"][
- "window_minimum_threshold"
- ]
- if (
- default_window_minimum_threshold
- > self._conf["sliding_window"]["window_size"]
- ):
- default_window_minimum_threshold = (
- self._conf["sliding_window"]["window_size"] / 2
- )
- self._conf["sliding_window"]["window_minimum_threshold"] = (
+ default_window_minimum_threshold = self.DEFAULT_CONF["algorithm"]["win_threshold"]
+ self._conf["algorithm"]["win_threshold"] = (
self._get_config_value(
items_sliding_window,
- "window_minimum_threshold",
+ "win_threshold",
int,
default_window_minimum_threshold,
gt=0,
- le=self._conf["sliding_window"]["window_size"],
+ le=self._conf["algorithm"]["win_size"],
)
)
@@ -406,7 +413,7 @@ class ConfigParser:
if con.has_section("common"):
items_common = dict(con.items("common"))
- self._read_slow_io_detect_frequency(items_common)
+ self._read_period_time(items_common)
self._read_disks_to_detect(items_common)
self._read_stage(items_common)
self._read_iotype(items_common)
@@ -420,20 +427,9 @@ class ConfigParser:
self._read_train_data_duration(items_algorithm)
self._read_train_update_duration(items_algorithm)
self._read_algorithm_type_and_parameter(items_algorithm)
- else:
- Report.report_pass("not found algorithm section. exiting...")
- logging.critical("not found algorithm section. exiting...")
- exit(1)
-
- if con.has_section("sliding_window"):
- items_sliding_window = dict(con.items("sliding_window"))
-
- self._read_window_size(items_sliding_window)
- self._read_window_minimum_threshold(items_sliding_window)
- else:
- Report.report_pass("not found sliding_window section. exiting...")
- logging.critical("not found sliding_window section. exiting...")
- exit(1)
+ self._read_sliding_window_type(items_algorithm)
+ self._read_window_size(items_algorithm)
+ self._read_window_minimum_threshold(items_algorithm)
if con.has_section("latency_sata_ssd"):
items_latency_sata_ssd = dict(con.items("latency_sata_ssd"))
@@ -451,6 +447,20 @@ class ConfigParser:
self.DEFAULT_CONF["latency_sata_ssd"]["write_tot_lim"],
gt=0,
)
+ self._conf["latency_sata_ssd"]["read_avg_lim"] = self._get_config_value(
+ items_latency_sata_ssd,
+ "read_avg_lim",
+ int,
+ self.DEFAULT_CONF["latency_sata_ssd"]["read_avg_lim"],
+ gt=0
+ )
+ self._conf["latency_sata_ssd"]["write_avg_lim"] = self._get_config_value(
+ items_latency_sata_ssd,
+ "write_avg_lim",
+ int,
+ self.DEFAULT_CONF["latency_sata_ssd"]["write_avg_lim"],
+ gt=0
+ )
else:
Report.report_pass("not found latency_sata_ssd section. exiting...")
logging.critical("not found latency_sata_ssd section. exiting...")
@@ -472,6 +482,20 @@ class ConfigParser:
self.DEFAULT_CONF["latency_nvme_ssd"]["write_tot_lim"],
gt=0,
)
+ self._conf["latency_nvme_ssd"]["read_avg_lim"] = self._get_config_value(
+ items_latency_nvme_ssd,
+ "read_avg_lim",
+ int,
+ self.DEFAULT_CONF["latency_nvme_ssd"]["read_avg_lim"],
+ gt=0
+ )
+ self._conf["latency_nvme_ssd"]["write_avg_lim"] = self._get_config_value(
+ items_latency_nvme_ssd,
+ "write_avg_lim",
+ int,
+ self.DEFAULT_CONF["latency_nvme_ssd"]["write_avg_lim"],
+ gt=0
+ )
else:
Report.report_pass("not found latency_nvme_ssd section. exiting...")
logging.critical("not found latency_nvme_ssd section. exiting...")
@@ -493,6 +517,20 @@ class ConfigParser:
self.DEFAULT_CONF["latency_sata_hdd"]["write_tot_lim"],
gt=0,
)
+ self._conf["latency_sata_hdd"]["read_avg_lim"] = self._get_config_value(
+ items_latency_sata_hdd,
+ "read_avg_lim",
+ int,
+ self.DEFAULT_CONF["latency_sata_hdd"]["read_avg_lim"],
+ gt=0
+ )
+ self._conf["latency_sata_hdd"]["write_avg_lim"] = self._get_config_value(
+ items_latency_sata_hdd,
+ "write_avg_lim",
+ int,
+ self.DEFAULT_CONF["latency_sata_hdd"]["write_avg_lim"],
+ gt=0
+ )
else:
Report.report_pass("not found latency_sata_hdd section. exiting...")
logging.critical("not found latency_sata_hdd section. exiting...")
@@ -542,6 +580,18 @@ class ConfigParser:
else:
return None
+ def get_avg_lim(self, disk_type, io_type):
+ if io_type == "read":
+ return self._conf.get(
+ f"latency_{DISK_TYPE_MAP.get(disk_type, '')}", {}
+ ).get("read_avg_lim", None)
+ elif io_type == "write":
+ return self._conf.get(
+ f"latency_{DISK_TYPE_MAP.get(disk_type, '')}", {}
+ ).get("write_avg_lim", None)
+ else:
+ return None
+
def get_train_data_duration_and_train_update_duration(self):
return (
self._conf["algorithm"]["train_data_duration"],
@@ -550,13 +600,13 @@ class ConfigParser:
def get_window_size_and_window_minimum_threshold(self):
return (
- self._conf["sliding_window"]["window_size"],
- self._conf["sliding_window"]["window_minimum_threshold"],
+ self._conf["algorithm"]["win_size"],
+ self._conf["algorithm"]["win_threshold"],
)
@property
- def slow_io_detect_frequency(self):
- return self._conf["common"]["slow_io_detect_frequency"]
+ def period_time(self):
+ return self._conf["common"]["period_time"]
@property
def algorithm_type(self):
@@ -564,7 +614,7 @@ class ConfigParser:
@property
def sliding_window_type(self):
- return self._conf["sliding_window"]["sliding_window_type"]
+ return self._conf["algorithm"]["win_type"]
@property
def train_data_duration(self):
@@ -576,11 +626,11 @@ class ConfigParser:
@property
def window_size(self):
- return self._conf["sliding_window"]["window_size"]
+ return self._conf["algorithm"]["win_size"]
@property
def window_minimum_threshold(self):
- return self._conf["sliding_window"]["window_minimum_threshold"]
+ return self._conf["algorithm"]["win_threshold"]
@property
def absolute_threshold(self):
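get_avg_lim() (like the existing get_tot_lim()) resolves the right latency_* section through DISK_TYPE_MAP. A sketch of that lookup; the map contents below are inferred from the section names in this patch, not copied from the real module:

```python
DISK_TYPE_MAP = {0: "nvme_ssd", 1: "sata_ssd", 2: "sata_hdd"}  # assumed values

def get_avg_lim(conf: dict, disk_type, io_type: str):
    if io_type not in ("read", "write"):
        return None
    section = f"latency_{DISK_TYPE_MAP.get(disk_type, '')}"
    return conf.get(section, {}).get(f"{io_type}_avg_lim")

conf = {"latency_nvme_ssd": {"read_avg_lim": 300, "write_avg_lim": 300}}
print(get_avg_lim(conf, 0, "read"))  # 300
```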
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
index 8536f7a..e3a0952 100644
--- a/src/python/sentryPlugins/ai_block_io/detector.py
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
@@ -28,9 +28,13 @@ class Detector:
self._threshold.attach_observer(self._slidingWindow)
self._count = None
- def get_metric_name(self):
+ @property
+ def metric_name(self):
return self._metric_name
+ def get_sliding_window_data(self):
+ return self._slidingWindow.get_data()
+
def is_slow_io_event(self, io_data_dict_with_disk_name: dict):
if self._count is None:
self._count = datetime.now()
@@ -38,22 +42,27 @@ class Detector:
now_time = datetime.now()
time_diff = (now_time - self._count).total_seconds()
if time_diff >= 60:
- logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.")
+ logging.info(f"({self._metric_name}) 's latest ai threshold is: {self._threshold.get_threshold()}.")
self._count = None
logging.debug(f'enter Detector: {self}')
metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name)
if metric_value is None:
logging.debug('not found metric value, so return None.')
- return (False, False), None, None, None
+ return (False, False), None, None, None, None
logging.debug(f'input metric value: {str(metric_value)}')
self._threshold.push_latest_data_to_queue(metric_value)
detection_result = self._slidingWindow.is_slow_io_event(metric_value)
# when an abnormal period is detected, Detector prints the info-level log
if detection_result[0][1]:
- logging.info(f'[abnormal period happen]: disk info: {self._metric_name}, window: {detection_result[1]}, '
- f'current value: {metric_value}, ai threshold: {detection_result[2]}, '
- f'absolute threshold: {detection_result[3]}')
+ logging.info(f'[abnormal_period]: disk: {self._metric_name.disk_name}, '
+ f'stage: {self._metric_name.stage_name}, '
+ f'iotype: {self._metric_name.io_access_type_name}, '
+ f'metric: {self._metric_name.metric_name}, '
+ f'current value: {metric_value}, '
+ f'ai threshold: {detection_result[2]}, '
+ f'absolute threshold upper limit: {detection_result[3]}, '
+ f'lower limit: {detection_result[4]}')
else:
logging.debug(f'Detection result: {str(detection_result)}')
logging.debug(f'exit Detector: {self}')
@@ -75,41 +84,60 @@ class DiskDetector:
def add_detector(self, detector: Detector):
self._detector_list.append(detector)
+ def get_detector_list_window(self):
+ latency_wins = {"read": {}, "write": {}}
+ iodump_wins = {"read": {}, "write": {}}
+ for detector in self._detector_list:
+ if detector.metric_name.metric_name == 'latency':
+ latency_wins[detector.metric_name.io_access_type_name][detector.metric_name.stage_name] = detector.get_sliding_window_data()
+ elif detector.metric_name.metric_name == 'io_dump':
+ iodump_wins[detector.metric_name.io_access_type_name][detector.metric_name.stage_name] = detector.get_sliding_window_data()
+ return latency_wins, iodump_wins
+
def is_slow_io_event(self, io_data_dict_with_disk_name: dict):
- """
- Root cause diagnosis: a slow IO event is reported only when the bio stage is abnormal, i.e. a bio anomaly is a necessary condition.
- Case 1: bio abnormal and rq_driver abnormal -> slow disk.
- Case 2: bio abnormal, rq_driver normal, and some kernel IO stack stage abnormal -> IO stack slow.
- Case 3: bio abnormal, rq_driver normal, and no kernel IO stack stage abnormal -> high IO pressure.
- Case 4: bio abnormal -> UNKNOWN.
- """
- diagnosis_info = {"bio": [], "rq_driver": [], "io_stage": []}
+ diagnosis_info = {"bio": [], "rq_driver": [], "kernel_stack": []}
for detector in self._detector_list:
# result contains: (slow IO detected, abnormal period detected), window, ai threshold, absolute threshold
# example: (False, False), self._io_data_queue, self._ai_threshold, self._abs_threshold
result = detector.is_slow_io_event(io_data_dict_with_disk_name)
if result[0][0]:
- if detector.get_metric_name().stage_name == "bio":
- diagnosis_info["bio"].append((detector.get_metric_name(), result))
- elif detector.get_metric_name().stage_name == "rq_driver":
- diagnosis_info["rq_driver"].append((detector.get_metric_name(), result))
+ if detector.metric_name.stage_name == "bio":
+ diagnosis_info["bio"].append(detector.metric_name)
+ elif detector.metric_name.stage_name == "rq_driver":
+ diagnosis_info["rq_driver"].append(detector.metric_name)
else:
- diagnosis_info["io_stage"].append((detector.get_metric_name(), result))
+ diagnosis_info["kernel_stack"].append(detector.metric_name)
- # returns: 1. whether a slow IO event was detected, 2. MetricName, 3. sliding window and thresholds, 4. root cause of the slow IO event
- root_cause = None
if len(diagnosis_info["bio"]) == 0:
- return False, None, None, None
- elif len(diagnosis_info["rq_driver"]) != 0:
- root_cause = "[Root Cause: disk slow]"
- elif len(diagnosis_info["io_stage"]) != 0:
- stage_list = []
- for io_stage in diagnosis_info["io_stage"]:
- stage_list.append(io_stage[0].stage_name)
- root_cause = f"[Root Cause: io stage slow, stage: {stage_list}]"
- if root_cause is None:
- root_cause = "[Root Cause: high io pressure]"
- return True, diagnosis_info["bio"][0][0], diagnosis_info["bio"][0][1], root_cause
+ return False, None, None, None, None, None, None
+
+ driver_name = self._disk_name
+ reason = "unknown"
+ block_stack = set()
+ io_type = set()
+ alarm_type = set()
+
+ for key, value in diagnosis_info.items():
+ for metric_name in value:
+ block_stack.add(metric_name.stage_name)
+ io_type.add(metric_name.io_access_type_name)
+ alarm_type.add(metric_name.metric_name)
+
+ latency_wins, iodump_wins = self.get_detector_list_window()
+ details = f"latency: {latency_wins}, iodump: {iodump_wins}"
+
+ io_press = {"throtl", "wbt", "iocost", "bfq"}
+ driver_slow = {"rq_driver"}
+ kernel_slow = {"gettag", "plug", "deadline", "hctx", "requeue"}
+
+ if not io_press.isdisjoint(block_stack):
+ reason = "io_press"
+ elif not driver_slow.isdisjoint(block_stack):
+ reason = "driver_slow"
+ elif not kernel_slow.isdisjoint(block_stack):
+ reason = "kernel_slow"
+
+ return True, driver_name, reason, str(block_stack), str(io_type), str(alarm_type), details
def __repr__(self):
msg = f'disk: {self._disk_name}, '
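The root-cause rules above reduce to set intersections over the abnormal stages, checked in priority order io_press, then driver_slow, then kernel_slow. A compact runnable restatement:

```python
IO_PRESS = {"throtl", "wbt", "iocost", "bfq"}
DRIVER_SLOW = {"rq_driver"}
KERNEL_SLOW = {"gettag", "plug", "deadline", "hctx", "requeue"}

def classify(block_stack: set) -> str:
    # non-empty intersection == "not isdisjoint" in the patch above
    if block_stack & IO_PRESS:
        return "io_press"
    if block_stack & DRIVER_SLOW:
        return "driver_slow"
    if block_stack & KERNEL_SLOW:
        return "kernel_slow"
    return "unknown"

print(classify({"bio", "rq_driver"}))  # driver_slow
```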
diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py
index cebe41f..4083c43 100644
--- a/src/python/sentryPlugins/ai_block_io/sliding_window.py
+++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py
@@ -21,11 +21,12 @@ class SlidingWindowType(Enum):
class SlidingWindow:
- def __init__(self, queue_length: int, threshold: int, abs_threshold: int = None):
+ def __init__(self, queue_length: int, threshold: int, abs_threshold: int = None, avg_lim: int = None):
self._queue_length = queue_length
self._queue_threshold = threshold
self._ai_threshold = None
self._abs_threshold = abs_threshold
+ self._avg_lim = avg_lim
self._io_data_queue = []
self._io_data_queue_abnormal_tag = []
@@ -35,8 +36,13 @@ class SlidingWindow:
self._io_data_queue_abnormal_tag.pop(0)
self._io_data_queue.append(data)
tag = False
- if ((self._ai_threshold is not None and data > self._ai_threshold) or
- (self._abs_threshold is not None and data > self._abs_threshold)):
+ if self._avg_lim is not None and data < self._avg_lim:
+ tag = False
+ self._io_data_queue_abnormal_tag.append(tag)
+ return tag
+ if self._ai_threshold is not None and data > self._ai_threshold:
+ tag = True
+ if self._abs_threshold is not None and data > self._abs_threshold:
tag = True
self._io_data_queue_abnormal_tag.append(tag)
return tag
@@ -52,6 +58,9 @@ class SlidingWindow:
def is_slow_io_event(self, data):
return False, None, None, None
+ def get_data(self):
+ return self._io_data_queue
+
def __repr__(self):
return "[SlidingWindow]"
@@ -64,7 +73,7 @@ class NotContinuousSlidingWindow(SlidingWindow):
is_slow_io_event = False
if self._io_data_queue_abnormal_tag.count(True) >= self._queue_threshold:
is_slow_io_event = True
- return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold
+ return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim
def __repr__(self):
return f"[NotContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]"
@@ -85,7 +94,7 @@ class ContinuousSlidingWindow(SlidingWindow):
break
else:
consecutive_count = 0
- return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold
+ return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim
def __repr__(self):
return f"[ContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]"
@@ -100,7 +109,7 @@ class MedianSlidingWindow(SlidingWindow):
median = np.median(self._io_data_queue)
if median >= self._ai_threshold:
is_slow_io_event = True
- return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold
+ return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim
def __repr__(self):
return f"[MedianSlidingWindow, window size: {self._queue_length}]"
--
2.23.0
View File

@@ -1,200 +0,0 @@
From db97139c411e86d6dc07fe0e91ae38c1bef17a8d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
Date: Tue, 22 Oct 2024 16:37:52 +0800
Subject: [PATCH] ai_block_io support iodump
---
config/plugins/ai_block_io.ini | 6 +-
.../sentryPlugins/ai_block_io/ai_block_io.py | 75 ++++++++++++-------
.../ai_block_io/config_parser.py | 30 ++++++++
.../ai_block_io/sliding_window.py | 4 +-
4 files changed, 84 insertions(+), 31 deletions(-)
diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini
index 422cfa3..040237d 100644
--- a/config/plugins/ai_block_io.ini
+++ b/config/plugins/ai_block_io.ini
@@ -29,4 +29,8 @@ write_tot_lim=500
[latency_sata_hdd]
read_tot_lim=50000
-write_tot_lim=50000
\ No newline at end of file
+write_tot_lim=50000
+
+[iodump]
+read_iodump_lim=0
+write_iodump_lim=0
\ No newline at end of file
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
index 4eecd43..f25e6d5 100644
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
@@ -15,7 +15,7 @@ import logging
from collections import defaultdict
from .detector import Detector, DiskDetector
-from .threshold import ThresholdFactory
+from .threshold import ThresholdFactory, ThresholdType
from .sliding_window import SlidingWindowFactory
from .utils import get_data_queue_size_and_update_size
from .config_parser import ConfigParser
@@ -91,9 +91,8 @@ class SlowIODetection:
continue
for stage in stages:
for iotype in iotypes:
- self._detector_name_list[disk].append(
- MetricName(disk, disk_type, stage, iotype, "latency")
- )
+ self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "latency"))
+ self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "io_dump"))
if disks:
logging.warning(
"disks: %s not in available disk list, so they will be ignored.",
@@ -123,31 +122,51 @@ class SlowIODetection:
for disk, metric_name_list in self._detector_name_list.items():
disk_detector = DiskDetector(disk)
for metric_name in metric_name_list:
- threshold = ThresholdFactory().get_threshold(
- threshold_type,
- boxplot_parameter=self._config_parser.boxplot_parameter,
- n_sigma_paramter=self._config_parser.n_sigma_parameter,
- data_queue_size=data_queue_size,
- data_queue_update_size=update_size,
- )
- abs_threshold = self._config_parser.get_tot_lim(
- metric_name.disk_type, metric_name.io_access_type_name
- )
- if abs_threshold is None:
- logging.warning(
- "disk %s, disk type %s, io type %s, get tot lim error, so it will be ignored.",
- disk,
- metric_name.disk_type,
- metric_name.io_access_type_name,
+
+ if metric_name.metric_name == 'latency':
+ threshold = ThresholdFactory().get_threshold(
+ threshold_type,
+ boxplot_parameter=self._config_parser.boxplot_parameter,
+ n_sigma_paramter=self._config_parser.n_sigma_parameter,
+ data_queue_size=data_queue_size,
+ data_queue_update_size=update_size,
)
- sliding_window = SlidingWindowFactory().get_sliding_window(
- sliding_window_type,
- queue_length=window_size,
- threshold=window_threshold,
- abs_threshold=abs_threshold,
- )
- detector = Detector(metric_name, threshold, sliding_window)
- disk_detector.add_detector(detector)
+ abs_threshold = self._config_parser.get_tot_lim(
+ metric_name.disk_type, metric_name.io_access_type_name
+ )
+ if abs_threshold is None:
+ logging.warning(
+ "disk %s, disk type %s, io type %s, get tot lim error, so it will be ignored.",
+ disk,
+ metric_name.disk_type,
+ metric_name.io_access_type_name,
+ )
+ sliding_window = SlidingWindowFactory().get_sliding_window(
+ sliding_window_type,
+ queue_length=window_size,
+ threshold=window_threshold,
+ abs_threshold=abs_threshold,
+ )
+ detector = Detector(metric_name, threshold, sliding_window)
+ disk_detector.add_detector(detector)
+ continue
+
+ elif metric_name.metric_name == 'io_dump':
+ threshold = ThresholdFactory().get_threshold(ThresholdType.AbsoluteThreshold)
+ abs_threshold = None
+ if metric_name.io_access_type_name == 'read':
+ abs_threshold = self._config_parser.read_iodump_lim
+ elif metric_name.io_access_type_name == 'write':
+ abs_threshold = self._config_parser.write_iodump_lim
+ sliding_window = SlidingWindowFactory().get_sliding_window(
+ sliding_window_type,
+ queue_length=window_size,
+ threshold=window_threshold
+ )
+ detector = Detector(metric_name, threshold, sliding_window)
+ threshold.set_threshold(abs_threshold)
+ disk_detector.add_detector(detector)
+
logging.info(f"disk: [{disk}] add detector:\n [{disk_detector}]")
self._disk_detectors[disk] = disk_detector
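The loop above now routes each MetricName by metric type: latency detectors keep the trained boxplot/n_sigma threshold with tot_lim as an absolute ceiling, while io_dump detectors use a plain AbsoluteThreshold fed from the iodump limits. A toy sketch of just that routing (the real code builds Threshold and SlidingWindow objects):

```python
def route_detector(metric_name: str, io_type: str, cfg: dict):
    if metric_name == "latency":
        return cfg["algorithm_type"], cfg[f"{io_type}_tot_lim"]
    if metric_name == "io_dump":
        return "absolute", cfg[f"{io_type}_iodump_lim"]
    raise ValueError(f"unknown metric: {metric_name}")

cfg = {"algorithm_type": "boxplot", "read_tot_lim": 500, "read_iodump_lim": 0}
print(route_detector("latency", "read", cfg))   # ('boxplot', 500)
print(route_detector("io_dump", "read", cfg))   # ('absolute', 0)
```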
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
index 274a31e..1117939 100644
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
@@ -72,6 +72,7 @@ class ConfigParser:
"latency_sata_ssd": {"read_tot_lim": 50000, "write_tot_lim": 50000},
"latency_nvme_ssd": {"read_tot_lim": 500, "write_tot_lim": 500},
"latency_sata_hdd": {"read_tot_lim": 50000, "write_tot_lim": 50000},
+ "iodump": {"read_iodump_lim": 0, "write_iodump_lim": 0}
}
def __init__(self, config_file_name):
@@ -497,6 +498,27 @@ class ConfigParser:
logging.critical("not found latency_sata_hdd section. exiting...")
exit(1)
+ if con.has_section("iodump"):
+ items_iodump = dict(con.items("iodump"))
+ self._conf["iodump"]["read_iodump_lim"] = self._get_config_value(
+ items_iodump,
+ "read_iodump_lim",
+ int,
+ self.DEFAULT_CONF["iodump"]["read_iodump_lim"],
+ ge=0
+ )
+ self._conf["iodump"]["write_iodump_lim"] = self._get_config_value(
+ items_iodump,
+ "write_iodump_lim",
+ int,
+ self.DEFAULT_CONF["iodump"]["write_iodump_lim"],
+ ge=0
+ )
+ else:
+ Report.report_pass("not found iodump section. exiting...")
+ logging.critical("not found iodump section. exiting...")
+ exit(1)
+
self.__print_all_config_value()
def __repr__(self) -> str:
@@ -587,3 +609,11 @@ class ConfigParser:
@property
def n_sigma_parameter(self):
return self._conf["algorithm"]["n_sigma_parameter"]
+
+ @property
+ def read_iodump_lim(self):
+ return self._conf["iodump"]["read_iodump_lim"]
+
+ @property
+ def write_iodump_lim(self):
+ return self._conf["iodump"]["write_iodump_lim"]
\ No newline at end of file
diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py
index d7c402a..cebe41f 100644
--- a/src/python/sentryPlugins/ai_block_io/sliding_window.py
+++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py
@@ -35,8 +35,8 @@ class SlidingWindow:
self._io_data_queue_abnormal_tag.pop(0)
self._io_data_queue.append(data)
tag = False
- if ((self._ai_threshold is not None and data >= self._ai_threshold) or
- (self._abs_threshold is not None and data >= self._abs_threshold)):
+ if ((self._ai_threshold is not None and data > self._ai_threshold) or
+ (self._abs_threshold is not None and data > self._abs_threshold)):
tag = True
self._io_data_queue_abnormal_tag.append(tag)
return tag
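The `>=` to `>` change above makes both comparisons strict, so a sample exactly equal to a threshold is no longer tagged abnormal. A quick check of the boundary:

```python
abs_threshold = 500
print([value > abs_threshold for value in (499, 500, 501)])
# [False, False, True] -- 500 itself no longer counts
```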
--
2.23.0
View File

@@ -1,906 +0,0 @@
From 13dc3712b4530a312aa43610f7696a4a62f30e96 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
Date: Fri, 11 Oct 2024 21:50:32 +0800
Subject: [PATCH] ai_block_io support stage and iotype
---
config/plugins/ai_block_io.ini | 7 +-
.../sentryPlugins/ai_block_io/ai_block_io.py | 126 +++--
.../ai_block_io/config_parser.py | 471 +++++++++++++-----
.../sentryPlugins/ai_block_io/data_access.py | 11 +-
.../sentryPlugins/ai_block_io/detector.py | 25 +
src/python/sentryPlugins/ai_block_io/utils.py | 3 +-
6 files changed, 453 insertions(+), 190 deletions(-)
diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini
index 01ce266..a814d52 100644
--- a/config/plugins/ai_block_io.ini
+++ b/config/plugins/ai_block_io.ini
@@ -1,7 +1,12 @@
+[log]
+level=info
+
[common]
absolute_threshold=40
slow_io_detect_frequency=1
-log_level=info
+disk=default
+stage=bio
+iotype=read,write
[algorithm]
train_data_duration=24
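This patch widens detection from the hard-coded bio read/write pair to every (disk, stage, iotype) combination. A sketch of that expansion; the namedtuple stands in for the plugin's MetricName class:

```python
from collections import namedtuple

MetricName = namedtuple("MetricName",
                        "disk_name stage_name io_access_type_name metric_name")

disks, stages, iotypes = ["sda"], ["bio", "rq_driver"], ["read", "write"]
detector_name_list = {}
for disk in disks:
    for stage in stages:
        for iotype in iotypes:
            detector_name_list.setdefault(disk, []).append(
                MetricName(disk, stage, iotype, "latency"))

print(len(detector_name_list["sda"]))  # 4 detectors for one disk
```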
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
index 77104a9..e1052ec 100644
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
@@ -13,7 +13,7 @@ import time
import signal
import logging
-from .detector import Detector
+from .detector import Detector, DiskDetector
from .threshold import ThresholdFactory, AbsoluteThreshold
from .sliding_window import SlidingWindowFactory
from .utils import get_data_queue_size_and_update_size
@@ -34,8 +34,8 @@ def sig_handler(signum, frame):
class SlowIODetection:
_config_parser = None
_disk_list = None
- _detector_name_list = []
- _detectors = {}
+ _detector_name_list = {}
+ _disk_detectors = {}
def __init__(self, config_parser: ConfigParser):
self._config_parser = config_parser
@@ -43,85 +43,101 @@ class SlowIODetection:
self.__init_detector()
def __init_detector_name_list(self):
- self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency())
+ self._disk_list = check_collect_valid(self._config_parser.slow_io_detect_frequency)
if self._disk_list is None:
Report.report_pass("get available disk error, please check if the collector plug is enable. exiting...")
exit(1)
logging.info(f"ai_block_io plug has found disks: {self._disk_list}")
- disks_to_detection: list = self._config_parser.get_disks_to_detection()
+ disks: list = self._config_parser.disks_to_detection
+ stages: list = self._config_parser.stage
+ iotypes: list = self._config_parser.iotype
# Case 1: None -> enable detection on all available disks
# Case 2: not None and len == 0 -> do not start detection on any disk
# Case 3: len > 0 -> use the intersection with the available disks
- if disks_to_detection is None:
+ if disks is None:
logging.warning("you not specify any disk or use default, so ai_block_io will enable all available disk.")
for disk in self._disk_list:
- self._detector_name_list.append(MetricName(disk, "bio", "read", "latency"))
- self._detector_name_list.append(MetricName(disk, "bio", "write", "latency"))
- elif len(disks_to_detection) == 0:
- logging.warning('please attention: conf file not specify any disk to detection, so it will not start ai block io.')
+ for stage in stages:
+ for iotype in iotypes:
+ if disk not in self._detector_name_list:
+ self._detector_name_list[disk] = []
+ self._detector_name_list[disk].append(MetricName(disk, stage, iotype, "latency"))
else:
- for disk_to_detection in disks_to_detection:
- if disk_to_detection in self._disk_list:
- self._detector_name_list.append(MetricName(disk_to_detection, "bio", "read", "latency"))
- self._detector_name_list.append(MetricName(disk_to_detection, "bio", "write", "latency"))
+ for disk in disks:
+ if disk in self._disk_list:
+ for stage in stages:
+ for iotype in iotypes:
+ if disk not in self._detector_name_list:
+ self._detector_name_list[disk] = []
+ self._detector_name_list[disk].append(MetricName(disk, stage, iotype, "latency"))
else:
- logging.warning(f"disk[{disk_to_detection}] not in available disk list, so it will be ignored.")
- logging.info(f'start to detection follow disk and it\'s metric: {self._detector_name_list}')
+ logging.warning("disk: [%s] not in available disk list, so it will be ignored.", disk)
+ if len(self._detector_name_list) == 0:
+ logging.critical("the disks to detection is empty, ai_block_io will exit.")
+ Report.report_pass("the disks to detection is empty, ai_block_io will exit.")
+ exit(1)
def __init_detector(self):
- train_data_duration, train_update_duration = (self._config_parser.
- get_train_data_duration_and_train_update_duration())
- slow_io_detection_frequency = self._config_parser.get_slow_io_detect_frequency()
- threshold_type = self._config_parser.get_algorithm_type()
- data_queue_size, update_size = get_data_queue_size_and_update_size(train_data_duration,
- train_update_duration,
- slow_io_detection_frequency)
- sliding_window_type = self._config_parser.get_sliding_window_type()
- window_size, window_threshold = self._config_parser.get_window_size_and_window_minimum_threshold()
-
- for detector_name in self._detector_name_list:
- threshold = ThresholdFactory().get_threshold(threshold_type,
- boxplot_parameter=self._config_parser.get_boxplot_parameter(),
- n_sigma_paramter=self._config_parser.get_n_sigma_parameter(),
- data_queue_size=data_queue_size,
- data_queue_update_size=update_size)
- sliding_window = SlidingWindowFactory().get_sliding_window(sliding_window_type, queue_length=window_size,
- threshold=window_threshold)
- detector = Detector(detector_name, threshold, sliding_window)
- # initialize the threshold for the absolute-threshold algorithm
- if isinstance(threshold, AbsoluteThreshold):
- threshold.set_threshold(self._config_parser.get_absolute_threshold())
- self._detectors[detector_name] = detector
- logging.info(f"add detector: {detector}")
+ train_data_duration, train_update_duration = (
+ self._config_parser.get_train_data_duration_and_train_update_duration()
+ )
+ slow_io_detection_frequency = self._config_parser.slow_io_detect_frequency
+ threshold_type = self._config_parser.algorithm_type
+ data_queue_size, update_size = get_data_queue_size_and_update_size(
+ train_data_duration, train_update_duration, slow_io_detection_frequency
+ )
+ sliding_window_type = self._config_parser.sliding_window_type
+ window_size, window_threshold = (self._config_parser.get_window_size_and_window_minimum_threshold())
+
+ for disk, metric_name_list in self._detector_name_list.items():
+ threshold = ThresholdFactory().get_threshold(
+ threshold_type,
+ boxplot_parameter=self._config_parser.boxplot_parameter,
+ n_sigma_paramter=self._config_parser.n_sigma_parameter,
+ data_queue_size=data_queue_size,
+ data_queue_update_size=update_size,
+ )
+ sliding_window = SlidingWindowFactory().get_sliding_window(
+ sliding_window_type,
+ queue_length=window_size,
+ threshold=window_threshold,
+ )
+ disk_detector = DiskDetector(disk)
+ for metric_name in metric_name_list:
+ detector = Detector(metric_name, threshold, sliding_window)
+ disk_detector.add_detector(detector)
+ logging.info(f'disk: [{disk}] add detector:\n [{disk_detector}]')
+ self._disk_detectors[disk] = disk_detector
def launch(self):
while True:
- logging.debug('step0. AI threshold slow io event detection is looping.')
+ logging.debug("step0. AI threshold slow io event detection is looping.")
# Step1: get IO data
io_data_dict_with_disk_name = get_io_data_from_collect_plug(
- self._config_parser.get_slow_io_detect_frequency(), self._disk_list
+ self._config_parser.slow_io_detect_frequency, self._disk_list
)
- logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}')
+ logging.debug(f"step1. Get io data: {str(io_data_dict_with_disk_name)}")
if io_data_dict_with_disk_name is None:
- Report.report_pass("get io data error, please check if the collector plug is enable. exitting...")
+ Report.report_pass(
+ "get io data error, please check if the collector plug is enable. exitting..."
+ )
exit(1)
# Step2: slow IO detection
- logging.debug('step2. Start to detection slow io event.')
+ logging.debug("step2. Start to detection slow io event.")
slow_io_event_list = []
- for metric_name, detector in self._detectors.items():
- result = detector.is_slow_io_event(io_data_dict_with_disk_name)
+ for disk, disk_detector in self._disk_detectors.items():
+ result = disk_detector.is_slow_io_event(io_data_dict_with_disk_name)
if result[0]:
- slow_io_event_list.append((detector.get_metric_name(), result))
- logging.debug('step2. End to detection slow io event.')
+ slow_io_event_list.append(result)
+ logging.debug("step2. End to detection slow io event.")
# Step3: report slow IO events
- logging.debug('step3. Report slow io event to sysSentry.')
+ logging.debug("step3. Report slow io event to sysSentry.")
for slow_io_event in slow_io_event_list:
- metric_name: MetricName = slow_io_event[0]
- result = slow_io_event[1]
+ metric_name: MetricName = slow_io_event[1]
alarm_content = {
"driver_name": f"{metric_name.get_disk_name()}",
"reason": "disk_slow",
@@ -129,14 +145,14 @@ class SlowIODetection:
"io_type": f"{metric_name.get_io_access_type_name()}",
"alarm_source": "ai_block_io",
"alarm_type": "latency",
- "details": f"current window is: {result[1]}, threshold is: {result[2]}.",
+ "details": f"current window is: {slow_io_event[2]}, threshold is: {slow_io_event[3]}.",
}
Xalarm.major(alarm_content)
logging.warning(alarm_content)
# Step 4: wait for the next detection interval
- logging.debug('step4. Wait to start next slow io event detection loop.')
- time.sleep(self._config_parser.get_slow_io_detect_frequency())
+ logging.debug("step4. Wait to start next slow io event detection loop.")
+ time.sleep(self._config_parser.slow_io_detect_frequency)
def main():
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
index 354c122..a357766 100644
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
@@ -9,44 +9,60 @@
# PURPOSE.
# See the Mulan PSL v2 for more details.
+import os
import configparser
import logging
+from .alarm_report import Report
from .threshold import ThresholdType
from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_log_level
LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
+ALL_STAGE_LIST = ['throtl', 'wbt', 'gettag', 'plug', 'deadline', 'hctx', 'requeue', 'rq_driver', 'bio']
+ALL_IOTPYE_LIST = ['read', 'write']
+
def init_log_format(log_level: str):
logging.basicConfig(level=get_log_level(log_level.lower()), format=LOG_FORMAT)
- if log_level.lower() not in ('info', 'warning', 'error', 'debug'):
- logging.warning(f'the log_level: {log_level} you set is invalid, use default value: info.')
+ if log_level.lower() not in ("info", "warning", "error", "debug"):
+ logging.warning(
+ f"the log_level: {log_level} you set is invalid, use default value: info."
+ )
class ConfigParser:
DEFAULT_ABSOLUTE_THRESHOLD = 40
DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1
- DEFAULT_LOG_LEVEL = 'info'
+ DEFAULT_LOG_LEVEL = "info"
+
+ DEFAULT_STAGE = 'throtl,wbt,gettag,plug,deadline,hctx,requeue,rq_driver,bio'
+ DEFAULT_IOTYPE = 'read,write'
- DEFAULT_ALGORITHM_TYPE = 'boxplot'
+ DEFAULT_ALGORITHM_TYPE = "boxplot"
DEFAULT_TRAIN_DATA_DURATION = 24
DEFAULT_TRAIN_UPDATE_DURATION = 2
DEFAULT_BOXPLOT_PARAMETER = 1.5
DEFAULT_N_SIGMA_PARAMETER = 3
- DEFAULT_SLIDING_WINDOW_TYPE = 'not_continuous'
+ DEFAULT_SLIDING_WINDOW_TYPE = "not_continuous"
DEFAULT_WINDOW_SIZE = 30
DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6
def __init__(self, config_file_name):
self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD
- self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY
+ self.__slow_io_detect_frequency = (
+ ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY
+ )
self.__log_level = ConfigParser.DEFAULT_LOG_LEVEL
self.__disks_to_detection = None
+ self.__stage = ConfigParser.DEFAULT_STAGE
+ self.__iotype = ConfigParser.DEFAULT_IOTYPE
- self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE
+ self.__algorithm_type = get_threshold_type_enum(
+ ConfigParser.DEFAULT_ALGORITHM_TYPE
+ )
self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION
self.__train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION
self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER
@@ -58,199 +74,398 @@ class ConfigParser:
self.__config_file_name = config_file_name
- def __read_absolute_threshold(self, items_common: dict):
+ def _get_config_value(
+ self,
+ config_items: dict,
+ key: str,
+ value_type,
+ default_value=None,
+ gt=None,
+ ge=None,
+ lt=None,
+ le=None,
+ ):
+ value = config_items.get(key)
+ if value is None:
+ logging.warning(
+ "config of %s not found, the default value %s will be used.",
+ key,
+ default_value,
+ )
+ value = default_value
+ if not value:
+ logging.critical(
+ "the value of %s is empty, ai_block_io plug will exit.", key
+ )
+ Report.report_pass(
+ f"the value of {key} is empty, ai_block_io plug will exit."
+ )
+ exit(1)
try:
- self.__absolute_threshold = float(items_common.get('absolute_threshold',
- ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD))
- if self.__absolute_threshold <= 0:
- logging.warning(
- f'the_absolute_threshold: {self.__absolute_threshold} you set is invalid, use default value: {ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD}.')
- self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD
+ value = value_type(value)
except ValueError:
- self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD
- logging.warning(
- f'the_absolute_threshold type conversion has error, use default value: {self.__absolute_threshold}.')
+ logging.critical(
+ "the value of %s is not a valid %s, ai_block_io plug will exit.",
+ key,
+ value_type,
+ )
+ Report.report_pass(
+ f"the value of {key} is not a valid {value_type}, ai_block_io plug will exit."
+ )
+ exit(1)
+ if gt is not None and value <= gt:
+ logging.critical(
+ "the value of %s is not greater than %s, ai_block_io plug will exit.",
+ key,
+ gt,
+ )
+ Report.report_pass(
+ f"the value of {key} is not greater than {gt}, ai_block_io plug will exit."
+ )
+ exit(1)
+ if ge is not None and value < ge:
+ logging.critical(
+ "the value of %s is not greater than or equal to %s, ai_block_io plug will exit.",
+ key,
+ ge,
+ )
+ Report.report_pass(
+ f"the value of {key} is not greater than or equal to {ge}, ai_block_io plug will exit."
+ )
+ exit(1)
+ if lt is not None and value >= lt:
+ logging.critical(
+ "the value of %s is not less than %s, ai_block_io plug will exit.",
+ key,
+ lt,
+ )
+ Report.report_pass(
+ f"the value of {key} is not less than {lt}, ai_block_io plug will exit."
+ )
+ exit(1)
+ if le is not None and value > le:
+ logging.critical(
+ "the value of %s is not less than or equal to %s, ai_block_io plug will exit.",
+ key,
+ le,
+ )
+ Report.report_pass(
+ f"the value of {key} is not less than or equal to {le}, ai_block_io plug will exit."
+ )
+ exit(1)
+
+ return value
+
+ def __read_absolute_threshold(self, items_common: dict):
+ self.__absolute_threshold = self._get_config_value(
+ items_common,
+ "absolute_threshold",
+ float,
+ ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD,
+ gt=0,
+ )
def __read__slow_io_detect_frequency(self, items_common: dict):
- try:
- self.__slow_io_detect_frequency = int(items_common.get('slow_io_detect_frequency',
- ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY))
- if self.__slow_io_detect_frequency < 1 or self.__slow_io_detect_frequency > 10:
- logging.warning(
- f'the slow_io_detect_frequency: {self.__slow_io_detect_frequency} you set is invalid, use default value: {ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY}.')
- self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY
- except ValueError:
- self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY
- logging.warning(f'slow_io_detect_frequency type conversion has error, use default value: {self.__slow_io_detect_frequency}.')
+ self.__slow_io_detect_frequency = self._get_config_value(
+ items_common,
+ "slow_io_detect_frequency",
+ int,
+ ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY,
+ gt=0,
+ le=300,
+ )
def __read__disks_to_detect(self, items_common: dict):
- disks_to_detection = items_common.get('disk')
+ disks_to_detection = items_common.get("disk")
if disks_to_detection is None:
- logging.warning(f'config of disk not found, the default value will be used.')
+ logging.warning("config of disk not found, the default value will be used.")
self.__disks_to_detection = None
return
- disk_list = disks_to_detection.split(',')
- if len(disk_list) == 0 or (len(disk_list) == 1 and disk_list[0] == ''):
- logging.warning("you don't specify any disk.")
- self.__disks_to_detection = []
- return
- if len(disk_list) == 1 and disk_list[0] == 'default':
+ disks_to_detection = disks_to_detection.strip()
+ if not disks_to_detection:
+ logging.critical("the value of disk is empty, ai_block_io plug will exit.")
+ Report.report_pass(
+ "the value of disk is empty, ai_block_io plug will exit."
+ )
+ exit(1)
+ disk_list = disks_to_detection.split(",")
+ if len(disk_list) == 1 and disk_list[0] == "default":
self.__disks_to_detection = None
return
self.__disks_to_detection = disk_list
def __read__train_data_duration(self, items_algorithm: dict):
- try:
- self.__train_data_duration = float(items_algorithm.get('train_data_duration',
- ConfigParser.DEFAULT_TRAIN_DATA_DURATION))
- if self.__train_data_duration <= 0 or self.__train_data_duration > 720:
- logging.warning(
- f'the train_data_duration: {self.__train_data_duration} you set is invalid, use default value: {ConfigParser.DEFAULT_TRAIN_DATA_DURATION}.')
- self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION
- except ValueError:
- self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION
- logging.warning(f'the train_data_duration type conversion has error, use default value: {self.__train_data_duration}.')
+ self.__train_data_duration = self._get_config_value(
+ items_algorithm,
+ "train_data_duration",
+ float,
+ ConfigParser.DEFAULT_TRAIN_DATA_DURATION,
+ gt=0,
+ le=720,
+ )
def __read__train_update_duration(self, items_algorithm: dict):
default_train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION
if default_train_update_duration > self.__train_data_duration:
default_train_update_duration = self.__train_data_duration / 2
-
- try:
- self.__train_update_duration = float(items_algorithm.get('train_update_duration',
- ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION))
- if self.__train_update_duration <= 0 or self.__train_update_duration > self.__train_data_duration:
- logging.warning(
- f'the train_update_duration: {self.__train_update_duration} you set is invalid, use default value: {default_train_update_duration}.')
- self.__train_update_duration = default_train_update_duration
- except ValueError:
- self.__train_update_duration = default_train_update_duration
- logging.warning(f'the train_update_duration type conversion has error, use default value: {self.__train_update_duration}.')
+ self.__train_update_duration = self._get_config_value(
+ items_algorithm,
+ "train_update_duration",
+ float,
+ default_train_update_duration,
+ gt=0,
+ le=self.__train_data_duration,
+ )
def __read__algorithm_type_and_parameter(self, items_algorithm: dict):
- algorithm_type = items_algorithm.get('algorithm_type', ConfigParser.DEFAULT_ALGORITHM_TYPE)
+ algorithm_type = items_algorithm.get(
+ "algorithm_type", ConfigParser.DEFAULT_ALGORITHM_TYPE
+ )
self.__algorithm_type = get_threshold_type_enum(algorithm_type)
+ if self.__algorithm_type is None:
+ logging.critical(
+ "the algorithm_type: %s you set is invalid. ai_block_io plug will exit.",
+ algorithm_type,
+ )
+ Report.report_pass(
+ f"the algorithm_type: {algorithm_type} you set is invalid. ai_block_io plug will exit."
+ )
+ exit(1)
if self.__algorithm_type == ThresholdType.NSigmaThreshold:
- try:
- self.__n_sigma_parameter = float(items_algorithm.get('n_sigma_parameter',
- ConfigParser.DEFAULT_N_SIGMA_PARAMETER))
- if self.__n_sigma_parameter <= 0 or self.__n_sigma_parameter > 10:
- logging.warning(
- f'the n_sigma_parameter: {self.__n_sigma_parameter} you set is invalid, use default value: {ConfigParser.DEFAULT_N_SIGMA_PARAMETER}.')
- self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER
- except ValueError:
- self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER
- logging.warning(f'the n_sigma_parameter type conversion has error, use default value: {self.__n_sigma_parameter}.')
+ self.__n_sigma_parameter = self._get_config_value(
+ items_algorithm,
+ "n_sigma_parameter",
+ float,
+ ConfigParser.DEFAULT_N_SIGMA_PARAMETER,
+ gt=0,
+ le=10,
+ )
elif self.__algorithm_type == ThresholdType.BoxplotThreshold:
- try:
- self.__boxplot_parameter = float(items_algorithm.get('boxplot_parameter',
- ConfigParser.DEFAULT_BOXPLOT_PARAMETER))
- if self.__boxplot_parameter <= 0 or self.__boxplot_parameter > 10:
- logging.warning(
- f'the boxplot_parameter: {self.__boxplot_parameter} you set is invalid, use default value: {ConfigParser.DEFAULT_BOXPLOT_PARAMETER}.')
- self.__n_sigma_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER
- except ValueError:
- self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER
- logging.warning(f'the boxplot_parameter type conversion has error, use default value: {self.__boxplot_parameter}.')
+ self.__boxplot_parameter = self._get_config_value(
+ items_algorithm,
+ "boxplot_parameter",
+ float,
+ ConfigParser.DEFAULT_BOXPLOT_PARAMETER,
+ gt=0,
+ le=10,
+ )
+
+ def __read__stage(self, items_algorithm: dict):
+ stage_str = items_algorithm.get('stage', ConfigParser.DEFAULT_STAGE)
+ stage_list = stage_str.split(',')
+ if len(stage_list) == 1 and stage_list[0] == '':
+ logging.critical('stage value is not allowed to be empty, exiting...')
+ exit(1)
+ if len(stage_list) == 1 and stage_list[0] == 'default':
+ logging.warning(f'stage will use the default value: {ConfigParser.DEFAULT_STAGE}')
+ self.__stage = ALL_STAGE_LIST
+ return
+ for stage in stage_list:
+ if stage not in ALL_STAGE_LIST:
+ logging.critical(f'stage: {stage} is not a valid stage, ai_block_io will exit...')
+ exit(1)
+ dup_stage_list = set(stage_list)
+ if 'bio' not in dup_stage_list:
+ logging.critical('stage must contain the bio stage, exiting...')
+ exit(1)
+ self.__stage = dup_stage_list
+
+ def __read__iotype(self, items_algorithm: dict):
+ iotype_str = items_algorithm.get('iotype', ConfigParser.DEFAULT_IOTYPE)
+ iotype_list = iotype_str.split(',')
+ if len(iotype_list) == 1 and iotype_list[0] == '':
+ logging.critical('iotype value is not allowed to be empty, exiting...')
+ exit(1)
+ if len(iotype_list) == 1 and iotype_list[0] == 'default':
+ logging.warning(f'iotype will use the default value: {ConfigParser.DEFAULT_IOTYPE}')
+ self.__iotype = ALL_IOTPYE_LIST
+ return
+ for iotype in iotype_list:
+ if iotype not in ALL_IOTPYE_LIST:
+ logging.critical(f'iotype: {iotype} is not a valid iotype, ai_block_io will exit...')
+ exit(1)
+ dup_iotype_list = set(iotype_list)
+ self.__iotype = dup_iotype_list
def __read__window_size(self, items_sliding_window: dict):
- try:
- self.__window_size = int(items_sliding_window.get('window_size',
- ConfigParser.DEFAULT_WINDOW_SIZE))
- if self.__window_size < 1 or self.__window_size > 3600:
- logging.warning(
- f'the window_size: {self.__window_size} you set is invalid, use default value: {ConfigParser.DEFAULT_WINDOW_SIZE}.')
- self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE
- except ValueError:
- self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE
- logging.warning(f'window_size type conversion has error, use default value: {self.__window_size}.')
+ self.__window_size = self._get_config_value(
+ items_sliding_window,
+ "window_size",
+ int,
+ ConfigParser.DEFAULT_WINDOW_SIZE,
+ gt=0,
+ le=3600,
+ )
def __read__window_minimum_threshold(self, items_sliding_window: dict):
default_window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD
if default_window_minimum_threshold > self.__window_size:
default_window_minimum_threshold = self.__window_size / 2
- try:
- self.__window_minimum_threshold = (
- int(items_sliding_window.get('window_minimum_threshold',
- ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD)))
- if self.__window_minimum_threshold < 1 or self.__window_minimum_threshold > self.__window_size:
- logging.warning(
- f'the window_minimum_threshold: {self.__window_minimum_threshold} you set is invalid, use default value: {default_window_minimum_threshold}.')
- self.__window_minimum_threshold = default_window_minimum_threshold
- except ValueError:
- self.__window_minimum_threshold = default_window_minimum_threshold
- logging.warning(f'window_minimum_threshold type conversion has error, use default value: {self.__window_minimum_threshold}.')
+ self.__window_minimum_threshold = self._get_config_value(
+ items_sliding_window,
+ "window_minimum_threshold",
+ int,
+ default_window_minimum_threshold,
+ gt=0,
+ le=self.__window_size,
+ )
def read_config_from_file(self):
+ if not os.path.exists(self.__config_file_name):
+ init_log_format(self.__log_level)
+ logging.critical(
+ "config file %s not found, ai_block_io plug will exit.",
+ self.__config_file_name,
+ )
+ Report.report_pass(
+ f"config file {self.__config_file_name} not found, ai_block_io plug will exit."
+ )
+ exit(1)
+
con = configparser.ConfigParser()
try:
- con.read(self.__config_file_name, encoding='utf-8')
+ con.read(self.__config_file_name, encoding="utf-8")
except configparser.Error as e:
init_log_format(self.__log_level)
- logging.critical(f'config file read error: {e}, ai_block_io plug will exit.')
+ logging.critical(
+ f"config file read error: %s, ai_block_io plug will exit.", e
+ )
+ Report.report_pass(
+ f"config file read error: {e}, ai_block_io plug will exit."
+ )
exit(1)
- if con.has_section('common'):
- items_common = dict(con.items('common'))
- self.__log_level = items_common.get('log_level', ConfigParser.DEFAULT_LOG_LEVEL)
+ if con.has_section('log'):
+ items_log = dict(con.items('log'))
+ # case 1: no log section, use the default value
+ # case 2: log section exists but the value is empty or invalid, use the default value
+ # case 3: log section exists and the value is valid, use it
+ self.__log_level = items_log.get('level', ConfigParser.DEFAULT_LOG_LEVEL)
init_log_format(self.__log_level)
+ else:
+ init_log_format(self.__log_level)
+ logging.warning(f"log section parameter not found, it will be set to default value.")
+
+ if con.has_section("common"):
+ items_common = dict(con.items("common"))
self.__read_absolute_threshold(items_common)
self.__read__slow_io_detect_frequency(items_common)
self.__read__disks_to_detect(items_common)
+ self.__read__stage(items_common)
+ self.__read__iotype(items_common)
else:
- init_log_format(self.__log_level)
- logging.warning("common section parameter not found, it will be set to default value.")
+ logging.warning(
+ "common section parameter not found, it will be set to default value."
+ )
- if con.has_section('algorithm'):
- items_algorithm = dict(con.items('algorithm'))
+ if con.has_section("algorithm"):
+ items_algorithm = dict(con.items("algorithm"))
self.__read__train_data_duration(items_algorithm)
self.__read__train_update_duration(items_algorithm)
self.__read__algorithm_type_and_parameter(items_algorithm)
else:
- logging.warning("algorithm section parameter not found, it will be set to default value.")
-
- if con.has_section('sliding_window'):
- items_sliding_window = dict(con.items('sliding_window'))
- sliding_window_type = items_sliding_window.get('sliding_window_type',
- ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE)
- self.__sliding_window_type = get_sliding_window_type_enum(sliding_window_type)
+ logging.warning(
+ "algorithm section parameter not found, it will be set to default value."
+ )
+
+ if con.has_section("sliding_window"):
+ items_sliding_window = dict(con.items("sliding_window"))
+ sliding_window_type = items_sliding_window.get(
+ "sliding_window_type", ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE
+ )
+ self.__sliding_window_type = get_sliding_window_type_enum(
+ sliding_window_type
+ )
self.__read__window_size(items_sliding_window)
self.__read__window_minimum_threshold(items_sliding_window)
else:
- logging.warning("sliding_window section parameter not found, it will be set to default value.")
+ logging.warning(
+ "sliding_window section parameter not found, it will be set to default value."
+ )
self.__print_all_config_value()
+ def __repr__(self):
+ config_str = {
+ 'log.level': self.__log_level,
+ 'common.absolute_threshold': self.__absolute_threshold,
+ 'common.slow_io_detect_frequency': self.__slow_io_detect_frequency,
+ 'common.disk': self.__disks_to_detection,
+ 'common.stage': self.__stage,
+ 'common.iotype': self.__iotype,
+ 'algorithm.train_data_duration': self.__train_data_duration,
+ 'algorithm.train_update_duration': self.__train_update_duration,
+ 'algorithm.algorithm_type': self.__algorithm_type,
+ 'algorithm.boxplot_parameter': self.__boxplot_parameter,
+ 'algorithm.n_sigma_parameter': self.__n_sigma_parameter,
+ 'sliding_window.sliding_window_type': self.__sliding_window_type,
+ 'sliding_window.window_size': self.__window_size,
+ 'sliding_window.window_minimum_threshold': self.__window_minimum_threshold
+ }
+ return str(config_str)
+
def __print_all_config_value(self):
- pass
+ logging.info(f"all config is follow:\n {self}")
+
+ def get_train_data_duration_and_train_update_duration(self):
+ return self.__train_data_duration, self.__train_update_duration
- def get_slow_io_detect_frequency(self):
+ def get_window_size_and_window_minimum_threshold(self):
+ return self.__window_size, self.__window_minimum_threshold
+
+ @property
+ def slow_io_detect_frequency(self):
return self.__slow_io_detect_frequency
- def get_algorithm_type(self):
+ @property
+ def algorithm_type(self):
return self.__algorithm_type
- def get_sliding_window_type(self):
+ @property
+ def sliding_window_type(self):
return self.__sliding_window_type
- def get_train_data_duration_and_train_update_duration(self):
- return self.__train_data_duration, self.__train_update_duration
+ @property
+ def train_data_duration(self):
+ return self.__train_data_duration
- def get_window_size_and_window_minimum_threshold(self):
- return self.__window_size, self.__window_minimum_threshold
+ @property
+ def train_update_duration(self):
+ return self.__train_update_duration
+
+ @property
+ def window_size(self):
+ return self.__window_size
- def get_absolute_threshold(self):
+ @property
+ def window_minimum_threshold(self):
+ return self.__window_minimum_threshold
+
+ @property
+ def absolute_threshold(self):
return self.__absolute_threshold
- def get_log_level(self):
+ @property
+ def log_level(self):
return self.__log_level
- def get_disks_to_detection(self):
+ @property
+ def disks_to_detection(self):
return self.__disks_to_detection
- def get_boxplot_parameter(self):
+ @property
+ def stage(self):
+ return self.__stage
+
+ @property
+ def iotype(self):
+ return self.__iotype
+
+ @property
+ def boxplot_parameter(self):
return self.__boxplot_parameter
- def get_n_sigma_parameter(self):
+ @property
+ def n_sigma_parameter(self):
return self.__n_sigma_parameter
diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py
index c7679cd..ed997e6 100644
--- a/src/python/sentryPlugins/ai_block_io/data_access.py
+++ b/src/python/sentryPlugins/ai_block_io/data_access.py
@@ -41,11 +41,14 @@ def check_collect_valid(period):
try:
data = json.loads(data_raw["message"])
except Exception as e:
- logging.warning(f"get io data failed, {e}")
+ logging.warning(f"get valid devices failed, occur exception: {e}")
+ return None
+ if not data:
+ logging.warning(f"get valid devices failed, return {data_raw}")
return None
return [k for k in data.keys()]
else:
- logging.warning(f"get io data failed, return {data_raw}")
+ logging.warning(f"get valid devices failed, return {data_raw}")
return None
@@ -60,7 +63,7 @@ def _get_raw_data(period, disk_list):
def _get_io_stage_data(data):
io_stage_data = IOStageData()
- for data_type in ('read', 'write', 'flush', 'discard'):
+ for data_type in ("read", "write", "flush", "discard"):
if data_type in data:
getattr(io_stage_data, data_type).latency = data[data_type][0]
getattr(io_stage_data, data_type).io_dump = data[data_type][1]
@@ -87,7 +90,7 @@ def get_io_data_from_collect_plug(period, disk_list):
getattr(disk_ret, k)
setattr(disk_ret, k, _get_io_stage_data(v))
except AttributeError:
- logging.debug(f'no attr {k}')
+ logging.debug(f"no attr {k}")
continue
ret[disk] = disk_ret
return ret
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
index 0ed282b..e710ddd 100644
--- a/src/python/sentryPlugins/ai_block_io/detector.py
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
@@ -53,3 +53,28 @@ class Detector:
f' io_type_name: {self._metric_name.get_io_access_type_name()},'
f' metric_name: {self._metric_name.get_metric_name()}, threshold_type: {self._threshold},'
f' sliding_window_type: {self._slidingWindow}')
+
+
+class DiskDetector:
+
+ def __init__(self, disk_name: str):
+ self._disk_name = disk_name
+ self._detector_list = []
+
+ def add_detector(self, detector: Detector):
+ self._detector_list.append(detector)
+
+ def is_slow_io_event(self, io_data_dict_with_disk_name: dict):
+ # a slow IO event is reported as soon as the bio stage is abnormal
+ # todo: root cause diagnosis
+ for detector in self._detector_list:
+ result = detector.is_slow_io_event(io_data_dict_with_disk_name)
+ if result[0] and detector.get_metric_name().get_stage_name() == 'bio':
+ return result[0], detector.get_metric_name(), result[1], result[2]
+ return False, None, None, None
+
+ def __repr__(self):
+ msg = f'disk: {self._disk_name}, '
+ for detector in self._detector_list:
+ msg += f'\n detector: [{detector}]'
+ return msg
diff --git a/src/python/sentryPlugins/ai_block_io/utils.py b/src/python/sentryPlugins/ai_block_io/utils.py
index 8dbba06..0ed37b9 100644
--- a/src/python/sentryPlugins/ai_block_io/utils.py
+++ b/src/python/sentryPlugins/ai_block_io/utils.py
@@ -25,8 +25,7 @@ def get_threshold_type_enum(algorithm_type: str):
return ThresholdType.BoxplotThreshold
if algorithm_type.lower() == 'n_sigma':
return ThresholdType.NSigmaThreshold
- logging.warning(f"the algorithm type: {algorithm_type} you set is invalid, use default value: boxplot")
- return ThresholdType.BoxplotThreshold
+ return None
def get_sliding_window_type_enum(sliding_window_type: str):
--
2.23.0
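
Editor's note: the patch above replaces nine per-option try/except blocks with a single bounded-value helper. Below is a minimal standalone sketch of that pattern; `report_and_exit` and `get_config_value` are illustrative stand-ins for the plugin's `Report.report_pass` + `exit(1)` path and `_get_config_value`, not its real API.

```python
import logging

def report_and_exit(msg: str):
    # stand-in for Report.report_pass(msg) followed by exit(1)
    logging.critical(msg)
    raise SystemExit(1)

def get_config_value(items: dict, key: str, value_type, default=None,
                     gt=None, ge=None, lt=None, le=None):
    value = items.get(key)
    if value is None:
        logging.warning("config of %s not found, the default value %s will be used.",
                        key, default)
        value = default
    try:
        value = value_type(value)
    except (TypeError, ValueError):
        report_and_exit(f"the value of {key} is not a valid {value_type}.")
    # one bound check per optional keyword, mirroring the patch
    checks = ((gt, lambda v, b: v <= b, "greater than"),
              (ge, lambda v, b: v < b, "greater than or equal to"),
              (lt, lambda v, b: v >= b, "less than"),
              (le, lambda v, b: v > b, "less than or equal to"))
    for bound, violates, label in checks:
        if bound is not None and violates(value, bound):
            report_and_exit(f"the value of {key} is not {label} {bound}.")
    return value

# mirrors __read__slow_io_detect_frequency: an int in (0, 300]
freq = get_config_value({"slow_io_detect_frequency": "5"},
                        "slow_io_detect_frequency", int, default=1, gt=0, le=300)
```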

View File

@ -1,73 +0,0 @@
From 7d5ad8f2dd87432b8f46ea5002400ee46cb6756a Mon Sep 17 00:00:00 2001
From: gaoruoshu <gaoruoshu@huawei.com>
Date: Wed, 9 Oct 2024 14:22:38 +0800
Subject: [PATCH] avg_block_io send alarm to xalarmd
---
config/tasks/avg_block_io.mod | 2 ++
.../sentryPlugins/avg_block_io/module_conn.py | 23 +++++++++++++++----
2 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/config/tasks/avg_block_io.mod b/config/tasks/avg_block_io.mod
index b9b6f34..bcd063b 100644
--- a/config/tasks/avg_block_io.mod
+++ b/config/tasks/avg_block_io.mod
@@ -3,3 +3,5 @@ enabled=yes
task_start=/usr/bin/python3 /usr/bin/avg_block_io
task_stop=pkill -f /usr/bin/avg_block_io
type=oneshot
+alarm_id=1002
+alarm_clear_time=5
diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
index 0da4208..2fc5a83 100644
--- a/src/python/sentryPlugins/avg_block_io/module_conn.py
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
@@ -16,6 +16,7 @@ import time
from .utils import is_abnormal
from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages
from syssentry.result import ResultLevel, report_result
+from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR
TASK_NAME = "avg_block_io"
@@ -68,19 +69,33 @@ def process_report_data(disk_name, rw, io_data):
if not is_abnormal((disk_name, 'bio', rw), io_data):
return
+ msg = {"alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw}
+
ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq']
for stage_name in ctrl_stage:
if is_abnormal((disk_name, stage_name, rw), io_data):
- logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw))
+ msg["reason"] = "IO press slow"
+ msg["block_stack"] = f"bio,{stage_name}"
+ logging.warning("{} - {} report IO press slow".format(disk_name, rw))
+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
return
if is_abnormal((disk_name, 'rq_driver', rw), io_data):
- logging.warning("{} - {} - {} report driver".format(time.ctime(), disk_name, rw))
+ msg["reason"] = "driver slow"
+ msg["block_stack"] = "bio,rq_driver"
+ logging.warning("{} - {} report driver slow".format(disk_name, rw))
+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
return
kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue']
for stage_name in kernel_stage:
if is_abnormal((disk_name, stage_name, rw), io_data):
- logging.warning("{} - {} - {} report kernel".format(time.ctime(), disk_name, rw))
+ msg["reason"] = "kernel slow"
+ msg["block_stack"] = f"bio,{stage_name}"
+ logging.warning("{} - {} report kernel slow".format(disk_name, rw))
+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
return
- logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw))
+ msg["reason"] = "unknown"
+ msg["block_stack"] = "bio"
+ logging.warning("{} - {} report UNKNOWN slow".format(disk_name, rw))
+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
--
2.33.0
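
Editor's note: a condensed sketch of the alarm path this patch adds. The payload fields and the `xalarm_report` call mirror the diff above; only the wrapper function name is invented here.

```python
import json
from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR

def report_slow(disk_name: str, rw: str, reason: str, stage_name: str):
    # payload fields match those assembled in process_report_data above
    msg = {
        "alarm_source": "avg_block_io",
        "driver_name": disk_name,
        "io_type": rw,
        "reason": reason,                  # e.g. "IO press slow", "driver slow"
        "block_stack": f"bio,{stage_name}",
    }
    # 1002 matches the alarm_id registered in avg_block_io.mod
    xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
```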

View File

@ -1,34 +0,0 @@
From 7d5ad8f2dd87432b8f46ea5002400ee46cb6756a Mon Sep 17 00:00:00 2001
From: gaoruoshu <gaoruoshu@huawei.com>
Date: Wed, 9 Oct 2024 14:22:38 +0800
Subject: [PATCH] bugfix typo
---
src/python/sentryPlugins/avg_block_io/avg_block_io.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
index b6b3b28..26a60c5 100644
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
@@ -114,7 +114,7 @@ def read_config_lat_iodump(io_dic, config):
common_param = {}
lat_sec = None
if not config.has_section("latency"):
- logging.warning("Cannot find algorithm section in config file")
+ logging.warning("Cannot find latency section in config file")
else:
lat_sec = config["latency"]
@@ -122,7 +122,7 @@ def read_config_lat_iodump(io_dic, config):
if not config.has_section("iodump"):
logging.warning("Cannot find iodump section in config file")
else:
- lat_sec = config["iodump"]
+ iodump_sec = config["iodump"]
if not lat_sec and not iodump_sec:
return common_param
--
2.27.0

View File

@ -1,56 +0,0 @@
From 67439c0040b1fb0614ac009bf53062e9ec2880aa Mon Sep 17 00:00:00 2001
From: jinsaihang <jinsaihang@h-partners.com>
Date: Wed, 9 Oct 2024 11:55:35 +0800
Subject: [PATCH 1/2] change alarm length
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
---
src/python/syssentry/sentryctl | 3 +++
src/python/syssentry/syssentry.py | 3 +++
2 files changed, 6 insertions(+)
diff --git a/src/python/syssentry/sentryctl b/src/python/syssentry/sentryctl
index 675c17a..3de93d0 100644
--- a/src/python/syssentry/sentryctl
+++ b/src/python/syssentry/sentryctl
@@ -25,6 +25,7 @@ MAX_PARAM_LENGTH = 256
RESULT_MSG_DATA_LEN = 4
CTL_MSG_LEN_LEN = 3
+ALARM_MSG_DATA_LEN = 6
DEFAULT_ALARM_TIME_RANGE = 10
def status_output_format(res_data):
@@ -173,6 +174,8 @@ if __name__ == '__main__':
request_message = json.dumps(req_msg_struct)
if client_args.cmd_type == 'get_result':
result_message = client_send_and_recv(request_message, RESULT_MSG_DATA_LEN)
+ elif client_args.cmd_type == 'get_alarm':
+ result_message = client_send_and_recv(request_message, ALARM_MSG_DATA_LEN)
else:
result_message = client_send_and_recv(request_message, CTL_MSG_LEN_LEN)
if not result_message:
diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py
index c2dee85..ea09095 100644
--- a/src/python/syssentry/syssentry.py
+++ b/src/python/syssentry/syssentry.py
@@ -56,6 +56,7 @@ CTL_MSG_MAGIC_LEN = 3
CTL_MSG_LEN_LEN = 3
CTL_MAGIC = "CTL"
RES_MAGIC = "RES"
+ALARM_MSG_DATA_LEN = 6
CTL_LISTEN_QUEUE_LEN = 5
SERVER_EPOLL_TIMEOUT = 0.3
@@ -256,6 +257,8 @@ def server_recv(server_socket: socket.socket):
res_head = RES_MAGIC
if cmd_type == "get_result":
res_data_len = str(len(res_data)).zfill(RESULT_MSG_HEAD_LEN - RESULT_MSG_MAGIC_LEN)
+ elif cmd_type == "get_alarm":
+ res_data_len = str(len(res_data)).zfill(ALARM_MSG_DATA_LEN)
else:
res_data_len = str(len(res_data)).zfill(CTL_MSG_MAGIC_LEN)
res_head += res_data_len
--
2.27.0
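
Editor's note: the fixed-width header scheme behind these constants, sketched for illustration. The 3-byte magic and zero-padded `zfill` length follow the diff above; treating the reply as magic + length + JSON body is an assumption drawn from the surrounding code.

```python
RES_MAGIC = "RES"
CTL_MSG_LEN_LEN = 3      # control replies: 3-digit length field
ALARM_MSG_DATA_LEN = 6   # get_alarm replies can exceed 999 bytes, so 6 digits

def frame(body: str, len_width: int) -> str:
    # e.g. frame('{"alarms": []}', 6) -> 'RES000014{"alarms": []}'
    return RES_MAGIC + str(len(body)).zfill(len_width) + body

def unframe(message: str, len_width: int) -> str:
    assert message.startswith(RES_MAGIC)
    body_len = int(message[len(RES_MAGIC):len(RES_MAGIC) + len_width])
    return message[len(RES_MAGIC) + len_width:][:body_len]

framed = frame('{"alarms": []}', ALARM_MSG_DATA_LEN)
assert unframe(framed, ALARM_MSG_DATA_LEN) == '{"alarms": []}'
```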

View File

@ -1,55 +0,0 @@
From aaff413d6954003a3c21af21003c3bc134f940e2 Mon Sep 17 00:00:00 2001
From: gaoruoshu <gaoruoshu@huawei.com>
Date: Tue, 5 Nov 2024 10:31:10 +0800
Subject: [PATCH] change avg_block_io config
---
config/plugins/avg_block_io.ini | 8 ++++----
.../src/python/sentryPlugins/avg_block_io/config.py | 8 ++++----
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini
index 5c4b9b0..3b4ee33 100644
--- a/config/plugins/avg_block_io.ini
+++ b/config/plugins/avg_block_io.ini
@@ -12,12 +12,12 @@ win_size=30
win_threshold=6
[latency_nvme_ssd]
-read_avg_lim=300
-write_avg_lim=300
+read_avg_lim=10000
+write_avg_lim=10000
read_avg_time=3
write_avg_time=3
-read_tot_lim=500
-write_tot_lim=500
+read_tot_lim=50000
+write_tot_lim=50000
[latency_sata_ssd]
read_avg_lim=10000
diff --git a/src/python/sentryPlugins/avg_block_io/config.py b/src/python/sentryPlugins/avg_block_io/config.py
index c8f45ce..c1e8ab1 100644
--- a/src/python/sentryPlugins/avg_block_io/config.py
+++ b/src/python/sentryPlugins/avg_block_io/config.py
@@ -42,12 +42,12 @@ DEFAULT_PARAM = {
CONF_ALGO_SIZE: 30,
CONF_ALGO_THRE: 6
}, 'latency_nvme_ssd': {
- 'read_avg_lim': 300,
- 'write_avg_lim': 300,
+ 'read_avg_lim': 10000,
+ 'write_avg_lim': 10000,
'read_avg_time': 3,
'write_avg_time': 3,
- 'read_tot_lim': 500,
- 'write_tot_lim': 500,
+ 'read_tot_lim': 50000,
+ 'write_tot_lim': 50000,
}, 'latency_sata_ssd' : {
'read_avg_lim': 10000,
'write_avg_lim': 10000,
--
2.39.5 (Apple Git-154)

View File

@ -1,36 +0,0 @@
From 8cc13a422ed29e48b0c5b86b2da2a5dc8ad4aa59 Mon Sep 17 00:00:00 2001
From: zhuofeng <zhuofeng6@huawei.com>
Date: Fri, 13 Dec 2024 11:20:55 +0800
Subject: [PATCH] change status of period task and sort mod file
---
src/python/syssentry/cron_process.py | 1 +
src/python/syssentry/load_mods.py | 1 +
2 files changed, 2 insertions(+)
diff --git a/src/python/syssentry/cron_process.py b/src/python/syssentry/cron_process.py
index 50780b3..5543d67 100644
--- a/src/python/syssentry/cron_process.py
+++ b/src/python/syssentry/cron_process.py
@@ -144,6 +144,7 @@ def period_tasks_handle():
if not task.onstart:
logging.debug("period onstart not enabled, task: %s", task.name)
+ task.runtime_status = EXITED_STATUS
continue
if task.runtime_status == WAITING_STATUS and \
diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py
index 48d7e66..5be5540 100644
--- a/src/python/syssentry/load_mods.py
+++ b/src/python/syssentry/load_mods.py
@@ -224,6 +224,7 @@ def load_tasks():
return "failed", ""
mod_files = os.listdir(TASKS_STORAGE_PATH)
+ mod_files.sort()
for mod_file in mod_files:
logging.debug("find mod, path is %s", mod_file)
if not mod_file.endswith(MOD_FILE_SUFFIX):
--
2.33.0

View File

@ -1,41 +0,0 @@
From 6e98b2e5008ffabfda8d1c10778717f972b54398 Mon Sep 17 00:00:00 2001
From: jwolf <523083921@qq.com>
Date: Mon, 22 Jul 2024 14:58:27 +0800
Subject: [PATCH] cpu_utility and cpu_patrol must be an integer
---
src/c/catcli/catlib/cli_param_checker.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/src/c/catcli/catlib/cli_param_checker.c b/src/c/catcli/catlib/cli_param_checker.c
index a1aa636..e400428 100644
--- a/src/c/catcli/catlib/cli_param_checker.c
+++ b/src/c/catcli/catlib/cli_param_checker.c
@@ -2,6 +2,7 @@
#include <sys/un.h>
#include <regex.h>
#include <stdbool.h>
+#include <string.h>
#include <limits.h>
#include <unistd.h>
#include "cli_common.h"
@@ -13,7 +14,7 @@
void checkset_cpu_usage_percentage(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs)
{
long cpu_utility = strtol(getopt_optarg, NULL, DECIMAL);
- if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX) {
+ if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX || strchr(getopt_optarg, '.') != NULL) {
strncpy(errs->patrol_module_err,
"\"cpu_utility \" must be an integer greater in the range (0,100],correct \"-u, --cpu_utility\"\n", MAX_ERR_LEN);
}
@@ -68,7 +69,7 @@ void checkset_cpulist(char *getopt_optarg, catcli_request_body *p_request_body,
void checkset_patrol_time(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs)
{
long second = strtol(getopt_optarg, NULL, DECIMAL);
- if (second <= 0 || second > INT_MAX) {
+ if (second <= 0 || second > INT_MAX || strchr(getopt_optarg, '.') != NULL) {
strncpy(errs->patrol_time_err,
"\"patrol_second\" must be a number in the range of (0,INT_MAX] ,correct \"-t, --patrol_second\"\n",
MAX_ERR_LEN);
--
Gitee
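
Editor's note: the point of the added `strchr(optarg, '.')` check is that `strtol("80.5", ...)` silently parses the leading 80, so fractional input used to pass validation. A Python illustration of the rule the C patch enforces; the function name and exceptions are illustrative.

```python
def parse_strict_int(text: str, low: int, high: int) -> int:
    # reject fractional input up front, like the strchr(optarg, '.') check
    if "." in text:
        raise ValueError(f"{text!r} must be an integer, not a float")
    value = int(text)  # raises ValueError for other non-numeric input
    if not (low < value <= high):
        raise ValueError(f"{value} is out of range ({low},{high}]")
    return value

parse_strict_int("80", 0, 100)      # ok: cpu_utility in (0,100]
# parse_strict_int("80.5", 0, 100)  # now rejected instead of truncating to 80
```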

View File

@ -1,430 +0,0 @@
From e7c1b0095e16369fb09ae62ffa3158be5e8893a1 Mon Sep 17 00:00:00 2001
From: gaoruoshu <gaoruoshu@huawei.com>
Date: Fri, 11 Oct 2024 10:48:35 +0800
Subject: [PATCH] diff disk type use diff config
---
config/plugins/avg_block_io.ini | 26 +++-
src/python/sentryCollector/collect_plugin.py | 6 +
.../avg_block_io/avg_block_io.py | 144 ++++++++----------
.../sentryPlugins/avg_block_io/module_conn.py | 19 ++-
.../sentryPlugins/avg_block_io/utils.py | 43 ++++++
5 files changed, 146 insertions(+), 92 deletions(-)
diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini
index 858db18..5c4b9b0 100644
--- a/config/plugins/avg_block_io.ini
+++ b/config/plugins/avg_block_io.ini
@@ -11,13 +11,29 @@ period_time=1
win_size=30
win_threshold=6
-[latency]
-read_avg_lim=10
-write_avg_lim=10
+[latency_nvme_ssd]
+read_avg_lim=300
+write_avg_lim=300
read_avg_time=3
write_avg_time=3
-read_tot_lim=50
-write_tot_lim=50
+read_tot_lim=500
+write_tot_lim=500
+
+[latency_sata_ssd]
+read_avg_lim=10000
+write_avg_lim=10000
+read_avg_time=3
+write_avg_time=3
+read_tot_lim=50000
+write_tot_lim=50000
+
+[latency_sata_hdd]
+read_avg_lim=15000
+write_avg_lim=15000
+read_avg_time=3
+write_avg_time=3
+read_tot_lim=50000
+write_tot_lim=50000
[iodump]
read_iodump_lim=0
diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py
index 31bf11b..bec405a 100644
--- a/src/python/sentryCollector/collect_plugin.py
+++ b/src/python/sentryCollector/collect_plugin.py
@@ -79,6 +79,12 @@ class DiskType():
TYPE_SATA_SSD = 1
TYPE_SATA_HDD = 2
+Disk_Type = {
+ DiskType.TYPE_NVME_SSD: "nvme_ssd",
+ DiskType.TYPE_SATA_SSD: "sata_ssd",
+ DiskType.TYPE_SATA_HDD: "sata_hdd"
+}
+
def client_send_and_recv(request_data, data_str_len, protocol):
"""client socket send and recv message"""
try:
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
index cf2ded3..fdad995 100644
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
@@ -14,8 +14,9 @@ import configparser
import time
from .stage_window import IoWindow, IoDumpWindow
-from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler
-from .utils import update_avg_and_check_abnormal, get_log_level
+from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name
+from .utils import update_avg_and_check_abnormal, get_log_level, get_section_value
+from sentryCollector.collect_plugin import Disk_Type
CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini"
@@ -37,44 +38,40 @@ def read_config_common(config):
disk = [] if disk_name == "default" else disk_name.split(",")
except configparser.NoOptionError:
disk = []
- logging.warning("Unset disk, set to default")
+ logging.warning("Unset common.disk, set to default")
try:
stage_name = config.get("common", "stage")
stage = [] if stage_name == "default" else stage_name.split(",")
except configparser.NoOptionError:
stage = []
- logging.warning("Unset stage, set to read,write")
+ logging.warning("Unset common.stage, set to default")
if len(disk) > 10:
- logging.warning("Too many disks, record only max 10 disks")
+ logging.warning("Too many common.disks, record only max 10 disks")
disk = disk[:10]
try:
iotype_name = config.get("common", "iotype").split(",")
- iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write', 'flush', 'discard']]
- err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write', 'flush', 'discard']]
+ iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write']]
+ err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write']]
- if iotype_list in [None, []]:
- iotype_list = ["read", "write"]
- except configparser.NoOptionError:
- iotype = ["read", "write"]
- logging.warning("Unset iotype, set to default")
+ if err_iotype:
+ report_alarm_fail("Invalid common.iotype config")
- if err_iotype:
- logging.warning("{} in common.iotype are not valid, set iotype={}".format(err_iotype, iotype_list))
-
+ except configparser.NoOptionError:
+ iotype_list = ["read", "write"]
+ logging.warning("Unset common.iotype, set to read,write")
try:
period_time = int(config.get("common", "period_time"))
if not (1 <= period_time <= 300):
raise ValueError("Invalid period_time")
except ValueError:
- period_time = 1
- logging.warning("Invalid period_time, set to 1s")
+ report_alarm_fail("Invalid common.period_time")
except configparser.NoOptionError:
period_time = 1
- logging.warning("Unset period_time, use 1s as default")
+ logging.warning("Unset common.period_time, use 1s as default")
return period_time, disk, stage, iotype_list
@@ -87,76 +84,56 @@ def read_config_algorithm(config):
try:
win_size = int(config.get("algorithm", "win_size"))
if not (1 <= win_size <= 300):
- raise ValueError("Invalid win_size")
+ raise ValueError("Invalid algorithm.win_size")
except ValueError:
- win_size = 30
- logging.warning("Invalid win_size, set to 30")
+ report_alarm_fail("Invalid algorithm.win_size config")
except configparser.NoOptionError:
win_size = 30
- logging.warning("Unset win_size, use 30 as default")
+ logging.warning("Unset algorithm.win_size, use 30 as default")
try:
win_threshold = int(config.get("algorithm", "win_threshold"))
if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size:
- raise ValueError("Invalid win_threshold")
+ raise ValueError("Invalid algorithm.win_threshold")
except ValueError:
- win_threshold = 6
- logging.warning("Invalid win_threshold, set to 6")
+ report_alarm_fail("Invalid algorithm.win_threshold config")
except configparser.NoOptionError:
win_threshold = 6
- logging.warning("Unset win_threshold, use 6 as default")
+ logging.warning("Unset algorithm.win_threshold, use 6 as default")
return win_size, win_threshold
-def read_config_lat_iodump(io_dic, config):
- """read config file, get [latency] [iodump] section value"""
+def read_config_latency(config):
+ """read config file, get [latency_xxx] section value"""
common_param = {}
- lat_sec = None
- if not config.has_section("latency"):
- logging.warning("Cannot find latency section in config file")
- else:
- lat_sec = config["latency"]
-
- iodump_sec = None
- if not config.has_section("iodump"):
- logging.warning("Cannot find iodump section in config file")
- else:
- iodump_sec = config["iodump"]
-
- if not lat_sec and not iodump_sec:
- return common_param
-
- for io_type in io_dic["iotype_list"]:
- common_param[io_type] = {}
-
- latency_keys = {
- "avg_lim": "{}_avg_lim".format(io_type),
- "avg_time": "{}_avg_time".format(io_type),
- "tot_lim": "{}_tot_lim".format(io_type),
- }
- iodump_key = "{}_iodump_lim".format(io_type)
+ for type_name in Disk_Type:
+ section_name = f"latency_{Disk_Type[type_name]}"
+ if not config.has_section(section_name):
+ report_alarm_fail(f"Cannot find {section_name} section in config file")
- if iodump_sec and iodump_key in iodump_sec and iodump_sec[iodump_key].isdecimal():
- common_param[io_type][iodump_key] = int(iodump_sec[iodump_key])
+ common_param[Disk_Type[type_name]] = get_section_value(section_name, config)
+ return common_param
- if not lat_sec:
- continue
- for key_suffix, key_template in latency_keys.items():
- if key_template in lat_sec and lat_sec[key_template].isdecimal():
- common_param[io_type][key_template] = int(lat_sec[key_template])
+def read_config_iodump(config):
+ """read config file, get [iodump] section value"""
+ common_param = {}
+ section_name = "iodump"
+ if not config.has_section(section_name):
+ report_alarm_fail(f"Cannot find {section_name} section in config file")
- return common_param
+ return get_section_value(section_name, config)
-def read_config_stage(config, stage, iotype_list):
- """read config file, get [STAGE_NAME] section value"""
+def read_config_stage(config, stage, iotype_list, curr_disk_type):
+ """read config file, get [STAGE_NAME_diskType] section value"""
res = {}
- if not stage in config:
+ section_name = f"{stage}_{curr_disk_type}"
+ if not config.has_section(section_name):
return res
- for key in config[stage]:
+ for key in config[section_name]:
if config[section_name][key].isdecimal():
res[key] = int(config[section_name][key])
@@ -171,11 +148,12 @@ def init_io_win(io_dic, config, common_param):
for disk_name in io_dic["disk_list"]:
io_data[disk_name] = {}
io_avg_value[disk_name] = {}
+ curr_disk_type = get_disk_type_by_name(disk_name)
for stage_name in io_dic["stage_list"]:
io_data[disk_name][stage_name] = {}
io_avg_value[disk_name][stage_name] = {}
- # step3. parse stage config
- curr_stage_param = read_config_stage(config, stage_name, iotype_list)
+ # parse stage config
+ curr_stage_param = read_config_stage(config, stage_name, iotype_list, curr_disk_type)
for rw in iotype_list:
io_data[disk_name][stage_name][rw] = {}
io_avg_value[disk_name][stage_name][rw] = [0, 0]
@@ -187,10 +165,10 @@ def init_io_win(io_dic, config, common_param):
iodump_lim_key = "{}_iodump_lim".format(rw)
# get the value, preferring curr_stage_param; fall back to common_param if absent
- avg_lim_value = curr_stage_param.get(avg_lim_key, common_param.get(rw, {}).get(avg_lim_key))
- avg_time_value = curr_stage_param.get(avg_time_key, common_param.get(rw, {}).get(avg_time_key))
- tot_lim_value = curr_stage_param.get(tot_lim_key, common_param.get(rw, {}).get(tot_lim_key))
- iodump_lim_value = curr_stage_param.get(iodump_lim_key, common_param.get(rw, {}).get(iodump_lim_key))
+ avg_lim_value = curr_stage_param.get(avg_lim_key, common_param.get(curr_disk_type, {}).get(avg_lim_key))
+ avg_time_value = curr_stage_param.get(avg_time_key, common_param.get(curr_disk_type, {}).get(avg_time_key))
+ tot_lim_value = curr_stage_param.get(tot_lim_key, common_param.get(curr_disk_type, {}).get(tot_lim_key))
+ iodump_lim_value = curr_stage_param.get(iodump_lim_key, common_param.get("iodump", {}).get(iodump_lim_key))
if avg_lim_value and avg_time_value and tot_lim_value:
io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value)
@@ -217,28 +195,21 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage):
stage_list = [key for key in all_stage_set if key in config_stage]
not_in_stage_list = [key for key in config_stage if key not in all_stage_set]
- if not config_disk:
+ if not_in_stage_list:
+ report_alarm_fail(f"Invalid common.stage_list config, cannot set {not_in_stage_list}")
+
+ if not config_disk and not not_in_disk_list:
disk_list = [key for key in all_disk_set]
- if not config_stage:
+ if not config_stage and not not_in_stage_list:
stage_list = [key for key in all_stage_set]
disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list
- stage_list = stage_list[:15] if len(stage_list) > 15 else stage_list
-
- if config_disk and not disk_list:
- logging.warning("Cannot get valid disk by disk={}, set to default".format(config_disk))
- disk_list, stage_list = get_valid_disk_stage_list(io_dic, [], config_stage)
-
- if config_stage and not stage_list:
- logging.warning("Cannot get valid stage by stage={}, set to default".format(config_stage))
- disk_list, stage_list = get_valid_disk_stage_list(io_dic, config_disk, [])
if not stage_list or not disk_list:
report_alarm_fail("Cannot get valid disk name or stage name.")
log_invalid_keys(not_in_disk_list, 'disk', config_disk, disk_list)
- log_invalid_keys(not_in_stage_list, 'stage', config_stage, stage_list)
return disk_list, stage_list
@@ -310,8 +281,13 @@ def main():
# step1. parse common config --- algorithm
io_dic["win_size"], io_dic["win_threshold"] = read_config_algorithm(config)
- # step2. create windows in a loop
- common_param = read_config_lat_iodump(io_dic, config)
+ # step2. parse common config --- latency_xxx
+ common_param = read_config_latency(config)
+
+ # step3. parse common config --- iodump
+ common_param['iodump'] = read_config_iodump(config)
+
+ # step4. create windows in a loop
io_data, io_avg_value = init_io_win(io_dic, config, common_param)
main_loop(io_dic, io_data, io_avg_value)
diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
index 40b3fcc..8d6f429 100644
--- a/src/python/sentryPlugins/avg_block_io/module_conn.py
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
@@ -14,7 +14,7 @@ import sys
import time
from .utils import is_abnormal, get_win_data, log_slow_win
-from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages
+from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages, get_disk_type, Disk_Type
from syssentry.result import ResultLevel, report_result
from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR
@@ -51,7 +51,7 @@ def check_result_validation(res, reason):
try:
json_data = json.loads(res['message'])
except json.JSONDecodeError:
- err_msg = "Failed to {}: invalid return message".format(reason)
+ err_msg = f"Failed to {reason}: invalid return message"
report_alarm_fail(err_msg)
return json_data
@@ -60,7 +60,7 @@ def check_result_validation(res, reason):
def report_alarm_fail(alarm_info):
"""report result to xalarmd"""
report_result(TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": alarm_info}))
- logging.error(alarm_info)
+ logging.critical(alarm_info)
sys.exit(1)
@@ -114,3 +114,16 @@ def process_report_data(disk_name, rw, io_data):
log_slow_win(msg, "unknown")
xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
+
+
+def get_disk_type_by_name(disk_name):
+ res = get_disk_type(disk_name)
+ disk_type_str = check_result_validation(res, f'Invalid disk type {disk_name}')
+ try:
+ curr_disk_type = int(disk_type_str)
+ if curr_disk_type not in Disk_Type:
+ raise ValueError
+ except ValueError:
+ report_alarm_fail(f"Failed to get disk type for {disk_name}")
+
+ return Disk_Type[curr_disk_type]
\ No newline at end of file
diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py
index 3b7f027..cef1edd 100644
--- a/src/python/sentryPlugins/avg_block_io/utils.py
+++ b/src/python/sentryPlugins/avg_block_io/utils.py
@@ -26,6 +26,49 @@ LogLevel = {
}
+DEFAULT_PARAM = {
+ 'latency_nvme_ssd': {
+ 'read_avg_lim': 300,
+ 'write_avg_lim': 300,
+ 'read_avg_time': 3,
+ 'write_avg_time': 3,
+ 'read_tot_lim': 500,
+ 'write_tot_lim': 500,
+ }, 'latency_sata_ssd' : {
+ 'read_avg_lim': 10000,
+ 'write_avg_lim': 10000,
+ 'read_avg_time': 3,
+ 'write_avg_time': 3,
+ 'read_tot_lim': 50000,
+ 'write_tot_lim': 50000,
+ }, 'latency_sata_hdd' : {
+ 'read_avg_lim': 15000,
+ 'write_avg_lim': 15000,
+ 'read_avg_time': 3,
+ 'write_avg_time': 3,
+ 'read_tot_lim': 50000,
+ 'write_tot_lim': 50000
+ }, 'iodump': {
+ 'read_iodump_lim': 0,
+ 'write_iodump_lim': 0
+ }
+}
+
+
+def get_section_value(section_name, config):
+ common_param = {}
+ config_sec = config[section_name]
+ for config_key in DEFAULT_PARAM[section_name]:
+ if config_key in config_sec:
+ if not config_sec[config_key].isdecimal():
+ report_alarm_fail(f"Invalid {section_name}.{config_key} config.")
+ common_param[config_key] = int(config_sec[config_key])
+ else:
+ logging.warning(f"Unset {section_name}.{config_key} in config file, use {DEFAULT_PARAM[section_name][config_key]} as default")
+ common_param[config_key] = DEFAULT_PARAM[section_name][config_key]
+ return common_param
+
+
def get_log_level(filename):
if not os.path.exists(filename):
return logging.INFO
--
2.27.0
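
Editor's note: a trimmed sketch of the `get_section_value` fallback added above — explicit decimal values win; a present-but-non-decimal value is fatal; a missing key falls back to `DEFAULT_PARAM`. `SystemExit` stands in for the plugin's `report_alarm_fail`, and `DEFAULT_PARAM` is cut down to one section.

```python
import configparser
import logging

DEFAULT_PARAM = {  # trimmed to one section for the sketch
    "latency_nvme_ssd": {"read_avg_lim": 300, "read_tot_lim": 500},
}

def get_section_value(section_name: str, config: configparser.ConfigParser) -> dict:
    common_param = {}
    config_sec = config[section_name]
    for key, default in DEFAULT_PARAM[section_name].items():
        if key in config_sec:
            if not config_sec[key].isdecimal():
                raise SystemExit(f"Invalid {section_name}.{key} config.")
            common_param[key] = int(config_sec[key])
        else:
            logging.warning("Unset %s.%s, use %s as default",
                            section_name, key, default)
            common_param[key] = default
    return common_param
```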

View File

@ -1,29 +0,0 @@
From 41bf507ca6cbbdf5e646a405de6b8d5b9be4bd28 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
Date: Wed, 16 Oct 2024 17:20:01 +0800
Subject: [PATCH] enrich alert info about kernel stack
---
src/python/sentryPlugins/ai_block_io/detector.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
index ed8b64a..8536f7a 100644
--- a/src/python/sentryPlugins/ai_block_io/detector.py
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
@@ -103,8 +103,10 @@ class DiskDetector:
elif len(diagnosis_info["rq_driver"]) != 0:
root_cause = "[Root Cause: disk slow]"
elif len(diagnosis_info["io_stage"]) != 0:
- stage = diagnosis_info["io_stage"][0][1].stage_name
- root_cause = f"[Root Cause: io stage slow, stage: {stage}]"
+ stage_list = []
+ for io_stage in diagnosis_info["io_stage"]:
+ stage_list.append(io_stage[0].stage_name)
+ root_cause = f"[Root Cause: io stage slow, stage: {stage_list}]"
if root_cause is None:
root_cause = "[Root Cause: high io pressure]"
return True, diagnosis_info["bio"][0][0], diagnosis_info["bio"][0][1], root_cause
--
2.23.0
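
Editor's note: a minimal runnable sketch of the widened message above — every abnormal io stage is now named in the root cause, not just the first. The namedtuple and the shape of `diagnosis_info` are illustrative stand-ins for the plugin's metric objects; each `io_stage` entry is assumed to be a (metric, window) pair as in the new loop.

```python
from collections import namedtuple

Metric = namedtuple("Metric", "stage_name")

diagnosis_info = {"io_stage": [(Metric("gettag"), None), (Metric("wbt"), None)]}

stage_list = [io_stage[0].stage_name for io_stage in diagnosis_info["io_stage"]]
root_cause = f"[Root Cause: io stage slow, stage: {stage_list}]"
print(root_cause)  # [Root Cause: io stage slow, stage: ['gettag', 'wbt']]
```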

View File

@ -1,572 +0,0 @@
From acb77d6a69aa9269b0f691613bef53efd0c01e53 Mon Sep 17 00:00:00 2001
From: gaoruoshu <gaoruoshu@huawei.com>
Date: Thu, 12 Sep 2024 11:31:34 +0800
Subject: [PATCH 2/2] add avg_block_io plugin
---
config/plugins/avg_block_io.ini | 21 ++
config/tasks/avg_block_io.mod | 5 +
src/python/sentryPlugins/__init__.py | 0
.../sentryPlugins/avg_block_io/__init__.py | 0
.../avg_block_io/avg_block_io.py | 257 ++++++++++++++++++
.../sentryPlugins/avg_block_io/module_conn.py | 86 ++++++
.../avg_block_io/stage_window.py | 47 ++++
.../sentryPlugins/avg_block_io/utils.py | 86 ++++++
8 files changed, 502 insertions(+)
create mode 100644 config/plugins/avg_block_io.ini
create mode 100644 config/tasks/avg_block_io.mod
create mode 100644 src/python/sentryPlugins/__init__.py
create mode 100644 src/python/sentryPlugins/avg_block_io/__init__.py
create mode 100644 src/python/sentryPlugins/avg_block_io/avg_block_io.py
create mode 100644 src/python/sentryPlugins/avg_block_io/module_conn.py
create mode 100644 src/python/sentryPlugins/avg_block_io/stage_window.py
create mode 100644 src/python/sentryPlugins/avg_block_io/utils.py
diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini
new file mode 100644
index 0000000..bc33dde
--- /dev/null
+++ b/config/plugins/avg_block_io.ini
@@ -0,0 +1,21 @@
+[common]
+disk=default
+stage=default
+iotype=read,write
+period_time=1
+
+[algorithm]
+win_size=30
+win_threshold=6
+
+[latency]
+read_avg_lim=10
+write_avg_lim=10
+read_avg_time=3
+write_avg_time=3
+read_tot_lim=50
+write_tot_lim=50
+
+[iodump]
+read_iodump_lim=0
+write_iodump_lim=0
diff --git a/config/tasks/avg_block_io.mod b/config/tasks/avg_block_io.mod
new file mode 100644
index 0000000..814c483
--- /dev/null
+++ b/config/tasks/avg_block_io.mod
@@ -0,0 +1,5 @@
+[common]
+enabled=yes
+task_start=/usr/bin/python3 /usr/bin/avg_block_io
+task_stop=pkill avg_block_io
+type=oneshot
\ No newline at end of file
diff --git a/src/python/sentryPlugins/__init__.py b/src/python/sentryPlugins/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/python/sentryPlugins/avg_block_io/__init__.py b/src/python/sentryPlugins/avg_block_io/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
new file mode 100644
index 0000000..ff2071d
--- /dev/null
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
@@ -0,0 +1,257 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+# PURPOSE.
+# See the Mulan PSL v2 for more details.
+import logging
+import signal
+import configparser
+import time
+
+from .stage_window import IoWindow, IoDumpWindow
+from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler
+from .utils import update_avg_and_check_abnormal
+
+CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini"
+
+def log_invalid_keys(not_in_list, keys_name, config_list, default_list):
+ """print invalid log"""
+ if config_list and default_list:
+ logging.warning("{} in common.{} are not valid, set {}={}".format(not_in_list, keys_name, keys_name, default_list))
+ elif config_list == ["default"]:
+ logging.warning("Default {} use {}".format(keys_name, default_list))
+
+
+def read_config_common(config):
+ """read config file, get [common] section value"""
+ try:
+ common_sec = config['common']
+    except KeyError:
+ report_alarm_fail("Cannot find common section in config file")
+
+ try:
+ period_time = int(common_sec.get("period_time", 1))
+ if not (1 <= period_time <= 300):
+ raise ValueError("Invalid period_time")
+ except ValueError:
+ period_time = 1
+ logging.warning("Invalid period_time, set to 1s")
+
+ disk = common_sec.get('disk').split(",") if common_sec.get('disk') not in [None, 'default'] else []
+ stage = common_sec.get('stage').split(",") if common_sec.get('stage') not in [None, 'default'] else []
+
+ if len(disk) > 10:
+ logging.warning("Too many disks, record only max 10 disks")
+ disk = disk[:10]
+
+ iotype = common_sec.get('iotype', 'read,write').split(",")
+ iotype_list = [rw.lower() for rw in iotype if rw.lower() in ['read', 'write', 'flush', 'discard']]
+ err_iotype = [rw for rw in iotype if rw.lower() not in ['read', 'write', 'flush', 'discard']]
+
+ if err_iotype:
+ logging.warning("{} in common.iotype are not valid, set iotype={}".format(err_iotype, iotype_list))
+
+ return period_time, disk, stage, iotype_list
+
+
+def read_config_algorithm(config):
+ """read config file, get [algorithm] section value"""
+ if not config.has_section("algorithm"):
+ report_alarm_fail("Cannot find algorithm section in config file")
+
+ try:
+ win_size = int(config.get("algorithm", "win_size"))
+ if not (1 <= win_size <= 300):
+ raise ValueError("Invalid win_size")
+ win_threshold = int(config.get("algorithm", "win_threshold"))
+ if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size:
+ raise ValueError("Invalid win_threshold")
+ except ValueError:
+ report_alarm_fail("Invalid win_threshold or win_size")
+
+ return win_size, win_threshold
+
+
+def read_config_lat_iodump(io_dic, config):
+ """read config file, get [latency] [iodump] section value"""
+ common_param = {}
+ for io_type in io_dic["iotype_list"]:
+ common_param[io_type] = {}
+
+ latency_keys = {
+ "avg_lim": "{}_avg_lim".format(io_type),
+ "avg_time": "{}_avg_time".format(io_type),
+ "tot_lim": "{}_tot_lim".format(io_type),
+ }
+ iodump_key = "{}_iodump_lim".format(io_type)
+
+        for key_template in latency_keys.values():
+ if key_template in config["latency"] and config["latency"][key_template].isdecimal():
+ common_param[io_type][key_template] = int(config["latency"][key_template])
+
+ if iodump_key in config["iodump"] and config["iodump"][iodump_key].isdecimal():
+ common_param[io_type][iodump_key] = int(config["iodump"][iodump_key])
+
+ return common_param
+
+
+def read_config_stage(config, stage, iotype_list):
+ """read config file, get [STAGE_NAME] section value"""
+ res = {}
+    if stage not in config:
+ return res
+
+ for key in config[stage]:
+ if config[stage][key].isdecimal():
+ res[key] = int(config[stage][key])
+
+ return res
+
+
+def init_io_win(io_dic, config, common_param):
+ """initialize windows of latency, iodump, and dict of avg_value"""
+ iotype_list = io_dic["iotype_list"]
+ io_data = {}
+ io_avg_value = {}
+ for disk_name in io_dic["disk_list"]:
+ io_data[disk_name] = {}
+ io_avg_value[disk_name] = {}
+ for stage_name in io_dic["stage_list"]:
+ io_data[disk_name][stage_name] = {}
+ io_avg_value[disk_name][stage_name] = {}
+            # step 3: parse the per-stage config section
+ curr_stage_param = read_config_stage(config, stage_name, iotype_list)
+ for rw in iotype_list:
+ io_data[disk_name][stage_name][rw] = {}
+ io_avg_value[disk_name][stage_name][rw] = [0, 0]
+
+                # create latency and iodump windows for each rw type
+ avg_lim_key = "{}_avg_lim".format(rw)
+ avg_time_key = "{}_avg_time".format(rw)
+ tot_lim_key = "{}_tot_lim".format(rw)
+ iodump_lim_key = "{}_iodump_lim".format(rw)
+
+                # fetch values, preferring curr_stage_param and falling back to common_param
+ avg_lim_value = curr_stage_param.get(avg_lim_key, common_param.get(rw, {}).get(avg_lim_key))
+ avg_time_value = curr_stage_param.get(avg_time_key, common_param.get(rw, {}).get(avg_time_key))
+ tot_lim_value = curr_stage_param.get(tot_lim_key, common_param.get(rw, {}).get(tot_lim_key))
+ iodump_lim_value = curr_stage_param.get(iodump_lim_key, common_param.get(rw, {}).get(iodump_lim_key))
+
+ if avg_lim_value and avg_time_value and tot_lim_value:
+ io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value)
+
+ if iodump_lim_value is not None:
+ io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value)
+ return io_data, io_avg_value
+
+
+def get_valid_disk_stage_list(io_dic, config_disk, config_stage):
+ """get disk_list and stage_list by sentryCollector"""
+ json_data = avg_is_iocollect_valid(io_dic, config_disk, config_stage)
+
+ all_disk_set = json_data.keys()
+ all_stage_set = set()
+ for disk_stage_list in json_data.values():
+ all_stage_set.update(disk_stage_list)
+
+ disk_list = [key for key in config_disk if key in all_disk_set]
+ not_in_disk_list = [key for key in config_disk if key not in all_disk_set]
+
+ stage_list = [key for key in config_stage if key in all_stage_set]
+ not_in_stage_list = [key for key in config_stage if key not in all_stage_set]
+
+ if not config_disk:
+ disk_list = [key for key in all_disk_set]
+
+ if not config_stage:
+ stage_list = [key for key in all_stage_set]
+
+ if config_disk and not disk_list:
+ logging.warning("Cannot get valid disk by disk={}, set to default".format(config_disk))
+ disk_list, stage_list = get_valid_disk_stage_list(io_dic, [], config_stage)
+
+ if config_stage and not stage_list:
+ logging.warning("Cannot get valid stage by stage={}, set to default".format(config_stage))
+ disk_list, stage_list = get_valid_disk_stage_list(io_dic, config_disk, [])
+
+ if not stage_list or not disk_list:
+ report_alarm_fail("Cannot get valid disk name or stage name.")
+
+ log_invalid_keys(not_in_disk_list, 'disk', config_disk, disk_list)
+ log_invalid_keys(not_in_stage_list, 'stage', config_stage, stage_list)
+
+ return disk_list, stage_list
+
+
+def main_loop(io_dic, io_data, io_avg_value):
+ """main loop of avg_block_io"""
+ period_time = io_dic["period_time"]
+ disk_list = io_dic["disk_list"]
+ stage_list = io_dic["stage_list"]
+ iotype_list = io_dic["iotype_list"]
+ win_size = io_dic["win_size"]
+    # start the main loop
+ while True:
+        # sleep for period_time seconds
+ time.sleep(period_time)
+
+        # query the collect module for this period's data
+ curr_period_data = avg_get_io_data(io_dic)
+
+        # process the period data
+ reach_size = False
+ for disk_name in disk_list:
+ for stage_name in stage_list:
+ for rw in iotype_list:
+ if disk_name in curr_period_data and stage_name in curr_period_data[disk_name] and rw in curr_period_data[disk_name][stage_name]:
+ io_key = (disk_name, stage_name, rw)
+ reach_size = update_avg_and_check_abnormal(curr_period_data, io_key, win_size, io_avg_value, io_data)
+
+        # skip alarm checks until the window holds win_size samples
+ if not reach_size:
+ continue
+
+        # check for abnormal windows and abnormal scenarios
+ for disk_name in disk_list:
+ for rw in iotype_list:
+ process_report_data(disk_name, rw, io_data)
+
+
+def main():
+ """main func"""
+    # register handlers for stop signals -2/-15 (SIGINT/SIGTERM)
+ signal.signal(signal.SIGINT, sig_handler)
+ signal.signal(signal.SIGTERM, sig_handler)
+
+    # initialize config parsing
+ config = configparser.ConfigParser(comment_prefixes=('#', ';'))
+ try:
+ config.read(CONFIG_FILE)
+ except configparser.Error:
+ report_alarm_fail("Failed to read config file")
+
+ io_dic = {}
+
+    # read the config file -- [common] section
+ io_dic["period_time"], disk, stage, io_dic["iotype_list"] = read_config_common(config)
+
+    # query the collect module via is_iocollect_valid()
+ io_dic["disk_list"], io_dic["stage_list"] = get_valid_disk_stage_list(io_dic, disk, stage)
+
+ if "bio" not in io_dic["stage_list"]:
+ report_alarm_fail("Cannot run avg_block_io without bio stage")
+
+    # initialize windows -- match config entries against the is_iocollect_valid() results
+    # step 1: parse the common [algorithm] section
+ io_dic["win_size"], io_dic["win_threshold"] = read_config_algorithm(config)
+
+    # step 2: create the windows in a loop
+ common_param = read_config_lat_iodump(io_dic, config)
+ io_data, io_avg_value = init_io_win(io_dic, config, common_param)
+
+ main_loop(io_dic, io_data, io_avg_value)
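Note: `read_config_stage()` above lets a per-stage config section override the generic `[latency]`/`[iodump]` limits. A minimal sketch of that resolution order, using a hypothetical `[bio]` section and made-up values (not part of the patch):

```python
# Sketch only: a hypothetical [bio] section overriding the [latency] default.
import configparser

config = configparser.ConfigParser()
config.read_string("""
[latency]
read_tot_lim=50

[bio]
read_tot_lim=80
""")

stage_param = {k: int(v) for k, v in config["bio"].items() if v.isdecimal()}
common_param = {"read_tot_lim": 50}
# mirrors: curr_stage_param.get(key, common_param.get(rw, {}).get(key))
tot_lim = stage_param.get("read_tot_lim", common_param.get("read_tot_lim"))
print(tot_lim)  # 80 -- the stage section wins
```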
diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
new file mode 100644
index 0000000..caa0191
--- /dev/null
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
@@ -0,0 +1,86 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+# PURPOSE.
+# See the Mulan PSL v2 for more details.
+import json
+import logging
+import sys
+import time
+
+from .utils import is_abnormal
+from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages
+from syssentry.result import ResultLevel, report_result
+
+
+TASK_NAME = "avg_block_io"
+
+def sig_handler(signum, _f):
+ """stop avg_block_io"""
+ report_result(TASK_NAME, ResultLevel.PASS, json.dumps({}))
+ logging.info("Finished avg_block_io plugin running.")
+ sys.exit(0)
+
+def avg_get_io_data(io_dic):
+ """get_io_data from sentryCollector"""
+ res = get_io_data(io_dic["period_time"], io_dic["disk_list"], io_dic["stage_list"], io_dic["iotype_list"])
+ return check_result_validation(res, 'get io data')
+
+
+def avg_is_iocollect_valid(io_dic, config_disk, config_stage):
+ """is_iocollect_valid from sentryCollector"""
+ res = is_iocollect_valid(io_dic["period_time"], config_disk, config_stage)
+ return check_result_validation(res, 'check config validation')
+
+
+def check_result_validation(res, reason):
+ """check validation of result from sentryCollector"""
+    if 'ret' not in res or 'message' not in res:
+ err_msg = "Failed to {}: Cannot connect to sentryCollector.".format(reason)
+ report_alarm_fail(err_msg)
+ if res['ret'] != 0:
+ err_msg = "Failed to {}: {}".format(reason, Result_Messages[res['ret']])
+ report_alarm_fail(err_msg)
+
+ try:
+ json_data = json.loads(res['message'])
+ except json.JSONDecodeError:
+ err_msg = "Failed to {}: invalid return message".format(reason)
+ report_alarm_fail(err_msg)
+
+ return json_data
+
+
+def report_alarm_fail(alarm_info):
+ """report result to xalarmd"""
+ report_result(TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": alarm_info}))
+ logging.error(alarm_info)
+ sys.exit(1)
+
+
+def process_report_data(disk_name, rw, io_data):
+ """check abnormal window and report to xalarm"""
+ if not is_abnormal((disk_name, 'bio', rw), io_data):
+ return
+
+ ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq']
+ for stage_name in ctrl_stage:
+ if is_abnormal((disk_name, stage_name, rw), io_data):
+ logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw))
+ return
+
+ if is_abnormal((disk_name, 'rq_driver', rw), io_data):
+ logging.warning("{} - {} - {} report driver".format(time.ctime(), disk_name, rw))
+ return
+
+ kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue']
+ for stage_name in kernel_stage:
+ if is_abnormal((disk_name, stage_name, rw), io_data):
+ logging.warning("{} - {} - {} report kernel".format(time.ctime(), disk_name, rw))
+ return
+ logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw))
diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py
new file mode 100644
index 0000000..9b0ce79
--- /dev/null
+++ b/src/python/sentryPlugins/avg_block_io/stage_window.py
@@ -0,0 +1,47 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+# PURPOSE.
+# See the Mulan PSL v2 for more details.
+
+class AbnormalWindowBase:
+ def __init__(self, window_size=10, window_threshold=7):
+ self.window_size = window_size
+ self.window_threshold = window_threshold
+ self.abnormal_window = [False] * window_size
+
+ def append_new_period(self, ab_res, avg_val=0):
+ self.abnormal_window.pop(0)
+ if self.is_abnormal_period(ab_res, avg_val):
+ self.abnormal_window.append(True)
+ else:
+ self.abnormal_window.append(False)
+
+ def is_abnormal_window(self):
+ return sum(self.abnormal_window) > self.window_threshold
+
+
+class IoWindow(AbnormalWindowBase):
+ def __init__(self, window_size=10, window_threshold=7, abnormal_multiple=5, abnormal_multiple_lim=30, abnormal_time=40):
+ super().__init__(window_size, window_threshold)
+ self.abnormal_multiple = abnormal_multiple
+ self.abnormal_multiple_lim = abnormal_multiple_lim
+ self.abnormal_time = abnormal_time
+
+ def is_abnormal_period(self, value, avg_val):
+ return (value > avg_val * self.abnormal_multiple and value > self.abnormal_multiple_lim) or \
+ (value > self.abnormal_time)
+
+
+class IoDumpWindow(AbnormalWindowBase):
+ def __init__(self, window_size=10, window_threshold=7, abnormal_time=40):
+ super().__init__(window_size, window_threshold)
+ self.abnormal_time = abnormal_time
+
+ def is_abnormal_period(self, value, avg_val=0):
+ return value > self.abnormal_time
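Note: a hedged usage sketch of the window classes above (not part of the patch). A period is abnormal when latency exceeds both `avg_val * abnormal_multiple` and `abnormal_multiple_lim`, or exceeds `abnormal_time` outright; the window trips once more than `window_threshold` of the last `window_size` periods were abnormal.

```python
from sentryPlugins.avg_block_io.stage_window import IoWindow

win = IoWindow(window_size=5, window_threshold=2,
               abnormal_multiple=3, abnormal_multiple_lim=10, abnormal_time=50)

for latency in [8, 9, 60, 55, 70]:   # synthetic per-period latencies
    win.append_new_period(latency, avg_val=9)

print(win.is_abnormal_window())       # True: 3 of the last 5 periods abnormal
```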
diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py
new file mode 100644
index 0000000..54ed080
--- /dev/null
+++ b/src/python/sentryPlugins/avg_block_io/utils.py
@@ -0,0 +1,86 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+# PURPOSE.
+# See the Mulan PSL v2 for more details.
+AVG_VALUE = 0
+AVG_COUNT = 1
+
+
+def get_nested_value(data, keys):
+ """get data from nested dict"""
+ for key in keys:
+ if key in data:
+ data = data[key]
+ else:
+ return None
+ return data
+
+
+def set_nested_value(data, keys, value):
+ """set data to nested dict"""
+ for key in keys[:-1]:
+ if key in data:
+ data = data[key]
+ else:
+ return False
+ data[keys[-1]] = value
+ return True
+
+
+def is_abnormal(io_key, io_data):
+ """check if latency and iodump win abnormal"""
+ for key in ['latency', 'iodump']:
+ all_keys = get_nested_value(io_data, io_key)
+ if all_keys and key in all_keys:
+ win = get_nested_value(io_data, io_key + (key,))
+ if win and win.is_abnormal_window():
+ return True
+ return False
+
+
+def update_io_avg(old_avg, period_value, win_size):
+ """update average of latency window"""
+ if old_avg[AVG_COUNT] < win_size:
+ new_avg_count = old_avg[AVG_COUNT] + 1
+ new_avg_value = (old_avg[AVG_VALUE] * old_avg[AVG_COUNT] + period_value[0]) / new_avg_count
+ else:
+ new_avg_count = old_avg[AVG_COUNT]
+ new_avg_value = (old_avg[AVG_VALUE] * (old_avg[AVG_COUNT] - 1) + period_value[0]) / new_avg_count
+ return [new_avg_value, new_avg_count]
+
+
+def update_io_data(old_avg, period_value, win_size, io_data, io_key):
+ """update data of latency and iodump window"""
+ all_wins = get_nested_value(io_data, io_key)
+ if all_wins and "latency" in all_wins:
+ io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_period(period_value[0], old_avg[AVG_VALUE])
+ if all_wins and "iodump" in all_wins:
+ io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_period(period_value[1])
+
+
+def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data):
+ """update avg and check abonrmal, return true if win_size full"""
+ period_value = get_nested_value(data, io_key)
+ old_avg = get_nested_value(io_avg_value, io_key)
+
+    # update the running average
+ if old_avg[AVG_COUNT] < win_size:
+ set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size))
+ return False
+
+    # update window data -- judge whether this period is abnormal
+ update_io_data(old_avg, period_value, win_size, io_data, io_key)
+ all_wins = get_nested_value(io_data, io_key)
+ if all_wins and 'latency' not in all_wins:
+ return True
+ period = get_nested_value(io_data, io_key + ("latency",))
+ if period and period.is_abnormal_period(period_value[0], old_avg[AVG_VALUE]):
+ return True
+ set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size))
+ return True
--
2.33.0
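Note on `update_io_avg()` in utils.py above: while the window is still filling it maintains a plain cumulative mean; once `win_size` samples have been seen it keeps the count fixed and re-weights, so old data decays like an exponential moving average. A small worked example of the same arithmetic:

```python
AVG_VALUE, AVG_COUNT = 0, 1

def update_io_avg(old_avg, period_value, win_size):
    # identical arithmetic to utils.py above
    if old_avg[AVG_COUNT] < win_size:
        new_count = old_avg[AVG_COUNT] + 1
        new_value = (old_avg[AVG_VALUE] * old_avg[AVG_COUNT] + period_value[0]) / new_count
    else:
        new_count = old_avg[AVG_COUNT]
        new_value = (old_avg[AVG_VALUE] * (old_avg[AVG_COUNT] - 1) + period_value[0]) / new_count
    return [new_value, new_count]

avg = [0, 0]
for v in [10, 20, 30]:
    avg = update_io_avg(avg, (v, 0), win_size=3)
print(avg)  # [20.0, 3] -- plain mean while the window fills

avg = update_io_avg(avg, (50, 0), win_size=3)
print(avg)  # [30.0, 3] -- (20*2 + 50) / 3, old data decays
```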

View File

@ -1,33 +0,0 @@
From ac9ce326dee20edde2451946e34ea9a13bd8c338 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
Date: Wed, 16 Oct 2024 11:50:46 +0800
Subject: [PATCH] fix ai_block_io root cause bug
---
src/python/sentryPlugins/ai_block_io/detector.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
index 5b21714..ed8b64a 100644
--- a/src/python/sentryPlugins/ai_block_io/detector.py
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
@@ -101,12 +101,12 @@ class DiskDetector:
if len(diagnosis_info["bio"]) == 0:
return False, None, None, None
elif len(diagnosis_info["rq_driver"]) != 0:
- root_cause = "[Root Causedisk slow]"
+ root_cause = "[Root Cause: disk slow]"
elif len(diagnosis_info["io_stage"]) != 0:
- stage = diagnosis_info["io_stage"][0][1].get_stage_name()
- root_cause = f"[Root Causeio stage slow, stage: {stage}]"
+ stage = diagnosis_info["io_stage"][0][1].stage_name
+ root_cause = f"[Root Cause: io stage slow, stage: {stage}]"
if root_cause is None:
- root_cause = "[Root Causehigh io pressure]"
+ root_cause = "[Root Cause: high io pressure]"
return True, diagnosis_info["bio"][0][0], diagnosis_info["bio"][0][1], root_cause
def __repr__(self):
--
2.23.0
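Note: the patch above fixes the attribute access (`.stage_name`) and the separators in the root-cause strings. The decision priority it encodes, as a standalone sketch (names taken from the diff context):

```python
def pick_root_cause(diagnosis_info: dict):
    # gate: no bio-layer hit means no slow-IO event at all
    if len(diagnosis_info["bio"]) == 0:
        return None
    # rq_driver findings outrank io_stage findings
    if len(diagnosis_info["rq_driver"]) != 0:
        return "[Root Cause: disk slow]"
    if len(diagnosis_info["io_stage"]) != 0:
        stage = diagnosis_info["io_stage"][0][1].stage_name
        return f"[Root Cause: io stage slow, stage: {stage}]"
    # anything else falls through to IO pressure
    return "[Root Cause: high io pressure]"
```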

View File

@ -1,832 +0,0 @@
From 35ba8fe8e241c5e3508c5dadc82a777065a5cc4d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
Date: Mon, 30 Sep 2024 00:15:29 +0800
Subject: [PATCH] fix ai_block_io some issues
---
..._slow_io_detection.ini => ai_block_io.ini} | 6 +-
config/tasks/ai_block_io.mod | 5 +
.../tasks/ai_threshold_slow_io_detection.mod | 5 -
...ow_io_detection.py => test_ai_block_io.py} | 0
.../README.md | 0
.../__init__.py | 0
.../ai_block_io.py} | 57 ++--
.../alarm_report.py | 2 +-
.../ai_block_io/config_parser.py | 256 ++++++++++++++++++
.../data_access.py | 3 +
.../detector.py | 17 +-
.../io_data.py | 0
.../sliding_window.py | 0
.../threshold.py | 13 +-
.../utils.py | 15 +-
.../config_parser.py | 141 ----------
src/python/setup.py | 2 +-
17 files changed, 336 insertions(+), 186 deletions(-)
rename config/plugins/{ai_threshold_slow_io_detection.ini => ai_block_io.ini} (66%)
create mode 100644 config/tasks/ai_block_io.mod
delete mode 100644 config/tasks/ai_threshold_slow_io_detection.mod
rename selftest/test/{test_ai_threshold_slow_io_detection.py => test_ai_block_io.py} (100%)
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/README.md (100%)
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/__init__.py (100%)
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection/slow_io_detection.py => ai_block_io/ai_block_io.py} (66%)
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/alarm_report.py (98%)
create mode 100644 src/python/sentryPlugins/ai_block_io/config_parser.py
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/data_access.py (99%)
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/detector.py (77%)
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/io_data.py (100%)
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/sliding_window.py (100%)
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/threshold.py (92%)
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/utils.py (86%)
delete mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py
diff --git a/config/plugins/ai_threshold_slow_io_detection.ini b/config/plugins/ai_block_io.ini
similarity index 66%
rename from config/plugins/ai_threshold_slow_io_detection.ini
rename to config/plugins/ai_block_io.ini
index 44eb928..01ce266 100644
--- a/config/plugins/ai_threshold_slow_io_detection.ini
+++ b/config/plugins/ai_block_io.ini
@@ -4,9 +4,9 @@ slow_io_detect_frequency=1
log_level=info
[algorithm]
-train_data_duration=0.1
-train_update_duration=0.02
-algorithm_type=n_sigma
+train_data_duration=24
+train_update_duration=2
+algorithm_type=boxplot
boxplot_parameter=1.5
n_sigma_parameter=3
diff --git a/config/tasks/ai_block_io.mod b/config/tasks/ai_block_io.mod
new file mode 100644
index 0000000..1971d7d
--- /dev/null
+++ b/config/tasks/ai_block_io.mod
@@ -0,0 +1,5 @@
+[common]
+enabled=yes
+task_start=/usr/bin/python3 /usr/bin/ai_block_io
+task_stop=pkill -f /usr/bin/ai_block_io
+type=oneshot
\ No newline at end of file
diff --git a/config/tasks/ai_threshold_slow_io_detection.mod b/config/tasks/ai_threshold_slow_io_detection.mod
deleted file mode 100644
index 2729f72..0000000
--- a/config/tasks/ai_threshold_slow_io_detection.mod
+++ /dev/null
@@ -1,5 +0,0 @@
-[common]
-enabled=yes
-task_start=/usr/bin/python3 /usr/bin/ai_threshold_slow_io_detection
-task_stop=pkill -f /usr/bin/ai_threshold_slow_io_detection
-type=oneshot
\ No newline at end of file
diff --git a/selftest/test/test_ai_threshold_slow_io_detection.py b/selftest/test/test_ai_block_io.py
similarity index 100%
rename from selftest/test/test_ai_threshold_slow_io_detection.py
rename to selftest/test/test_ai_block_io.py
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md b/src/python/sentryPlugins/ai_block_io/README.md
similarity index 100%
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md
rename to src/python/sentryPlugins/ai_block_io/README.md
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py b/src/python/sentryPlugins/ai_block_io/__init__.py
similarity index 100%
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py
rename to src/python/sentryPlugins/ai_block_io/__init__.py
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
similarity index 66%
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py
rename to src/python/sentryPlugins/ai_block_io/ai_block_io.py
index 43cf770..31b8a97 100644
--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
@@ -23,7 +23,7 @@ from .data_access import get_io_data_from_collect_plug, check_collect_valid
from .io_data import MetricName
from .alarm_report import AlarmReport
-CONFIG_FILE = "/etc/sysSentry/plugins/ai_threshold_slow_io_detection.ini"
+CONFIG_FILE = "/etc/sysSentry/plugins/ai_block_io.ini"
def sig_handler(signum, frame):
@@ -40,34 +40,48 @@ class SlowIODetection:
def __init__(self, config_parser: ConfigParser):
self._config_parser = config_parser
- self.__set_log_format()
self.__init_detector_name_list()
self.__init_detector()
- def __set_log_format(self):
- log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
- log_level = get_log_level(self._config_parser.get_log_level())
- logging.basicConfig(level=log_level, format=log_format)
-
def __init_detector_name_list(self):
self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency())
- for disk in self._disk_list:
- self._detector_name_list.append(MetricName(disk, "bio", "read", "latency"))
- self._detector_name_list.append(MetricName(disk, "bio", "write", "latency"))
+ disks_to_detection: list = self._config_parser.get_disks_to_detection()
+        # case 1: None -> enable detection on all disks
+        # case 2: not None and len == 0 -> do not detect any disk
+        # case 3: len > 0 -> use the intersection with the collected disk list
+ if disks_to_detection is None:
+ for disk in self._disk_list:
+ self._detector_name_list.append(MetricName(disk, "bio", "read", "latency"))
+ self._detector_name_list.append(MetricName(disk, "bio", "write", "latency"))
+ elif len(disks_to_detection) == 0:
+            logging.warning('please note: the config file does not specify any disk to detect, '
+                            'so ai_block_io will not start.')
+ else:
+ disks_name_to_detection = []
+ for disk_name_to_detection in disks_to_detection:
+ disks_name_to_detection.append(disk_name_to_detection.get_disk_name())
+ disk_intersection = [disk for disk in self._disk_list if disk in disks_name_to_detection]
+ for disk in disk_intersection:
+ self._detector_name_list.append(MetricName(disk, "bio", "read", "latency"))
+ self._detector_name_list.append(MetricName(disk, "bio", "write", "latency"))
+        logging.info(f'starting detection on the following disks and metrics: {self._detector_name_list}')
def __init_detector(self):
train_data_duration, train_update_duration = (self._config_parser.
get_train_data_duration_and_train_update_duration())
slow_io_detection_frequency = self._config_parser.get_slow_io_detect_frequency()
- threshold_type = get_threshold_type_enum(self._config_parser.get_algorithm_type())
+ threshold_type = self._config_parser.get_algorithm_type()
data_queue_size, update_size = get_data_queue_size_and_update_size(train_data_duration,
train_update_duration,
slow_io_detection_frequency)
- sliding_window_type = get_sliding_window_type_enum(self._config_parser.get_sliding_window_type())
+ sliding_window_type = self._config_parser.get_sliding_window_type()
window_size, window_threshold = self._config_parser.get_window_size_and_window_minimum_threshold()
for detector_name in self._detector_name_list:
- threshold = ThresholdFactory().get_threshold(threshold_type, data_queue_size=data_queue_size,
+ threshold = ThresholdFactory().get_threshold(threshold_type,
+ boxplot_parameter=self._config_parser.get_boxplot_parameter(),
+                                                     n_sigma_parameter=self._config_parser.get_n_sigma_parameter(),
+ data_queue_size=data_queue_size,
data_queue_update_size=update_size)
sliding_window = SlidingWindowFactory().get_sliding_window(sliding_window_type, queue_length=window_size,
threshold=window_threshold)
@@ -89,6 +103,7 @@ class SlowIODetection:
logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}')
if io_data_dict_with_disk_name is None:
continue
+
# Step2: slow IO detection
logging.debug('step2. Start to detection slow io event.')
slow_io_event_list = []
@@ -103,13 +118,14 @@ class SlowIODetection:
for slow_io_event in slow_io_event_list:
metric_name: MetricName = slow_io_event[0]
result = slow_io_event[1]
- AlarmReport.report_major_alm(f"disk {metric_name.get_disk_name()} has slow io event."
- f"stage: {metric_name.get_metric_name()},"
- f"type: {metric_name.get_io_access_type_name()},"
- f"metric: {metric_name.get_metric_name()},"
- f"current window: {result[1]},"
- f"threshold: {result[2]}")
- logging.error(f"slow io event happen: {str(slow_io_event)}")
+ alarm_content = (f"disk {metric_name.get_disk_name()} has slow io event. "
+ f"stage is: {metric_name.get_stage_name()}, "
+ f"io access type is: {metric_name.get_io_access_type_name()}, "
+ f"metric is: {metric_name.get_metric_name()}, "
+ f"current window is: {result[1]}, "
+ f"threshold is: {result[2]}")
+ AlarmReport.report_major_alm(alarm_content)
+ logging.warning(alarm_content)
# Step4: wait before the next detection loop
logging.debug('step4. Wait to start next slow io event detection loop.')
@@ -120,6 +136,7 @@ def main():
# Step1: register signal handlers
signal.signal(signal.SIGINT, sig_handler)
signal.signal(signal.SIGTERM, sig_handler)
+
# Step2: checkpoint recovery
# todo:
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py b/src/python/sentryPlugins/ai_block_io/alarm_report.py
similarity index 98%
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py
rename to src/python/sentryPlugins/ai_block_io/alarm_report.py
index 3f4f34e..230c8cd 100644
--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py
+++ b/src/python/sentryPlugins/ai_block_io/alarm_report.py
@@ -15,7 +15,7 @@ import json
class AlarmReport:
- TASK_NAME = "SLOW_IO_DETECTION"
+ TASK_NAME = "ai_block_io"
@staticmethod
def report_pass(info: str):
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
new file mode 100644
index 0000000..632391d
--- /dev/null
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
@@ -0,0 +1,256 @@
+# coding: utf-8
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
+# sysSentry is licensed under the Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+# PURPOSE.
+# See the Mulan PSL v2 for more details.
+
+import configparser
+import json
+import logging
+
+from .io_data import MetricName
+from .threshold import ThresholdType
+from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_log_level
+
+LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
+
+
+def init_log_format(log_level: str):
+ logging.basicConfig(level=get_log_level(log_level), format=LOG_FORMAT)
+
+
+class ConfigParser:
+ DEFAULT_ABSOLUTE_THRESHOLD = 40
+ DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1
+ DEFAULT_LOG_LEVEL = 'info'
+
+ DEFAULT_ALGORITHM_TYPE = 'boxplot'
+ DEFAULT_TRAIN_DATA_DURATION = 24
+ DEFAULT_TRAIN_UPDATE_DURATION = 2
+ DEFAULT_BOXPLOT_PARAMETER = 1.5
+ DEFAULT_N_SIGMA_PARAMETER = 3
+
+ DEFAULT_SLIDING_WINDOW_TYPE = 'not_continuous'
+ DEFAULT_WINDOW_SIZE = 30
+ DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6
+
+ def __init__(self, config_file_name):
+ self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD
+ self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY
+ self.__log_level = ConfigParser.DEFAULT_LOG_LEVEL
+ self.__disks_to_detection: list = []
+
+ self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE
+        self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION
+ self.__train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION
+ self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER
+ self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER
+
+ self.__sliding_window_type = ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE
+ self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE
+ self.__window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD
+
+ self.__config_file_name = config_file_name
+
+ def __read_absolute_threshold(self, items_common: dict):
+ try:
+ self.__absolute_threshold = float(items_common.get('absolute_threshold',
+ ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD))
+ if self.__absolute_threshold <= 0:
+ logging.warning(
+                f'the absolute_threshold: {self.__absolute_threshold} you set is invalid, use default value: {ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD}.')
+ self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD
+ except ValueError:
+ self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD
+ logging.warning(
+                f'the absolute_threshold type conversion has error, use default value: {self.__absolute_threshold}.')
+
+ def __read__slow_io_detect_frequency(self, items_common: dict):
+ try:
+ self.__slow_io_detect_frequency = int(items_common.get('slow_io_detect_frequency',
+ ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY))
+ if self.__slow_io_detect_frequency < 1 or self.__slow_io_detect_frequency > 10:
+ logging.warning(
+ f'the slow_io_detect_frequency: {self.__slow_io_detect_frequency} you set is invalid, use default value: {ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY}.')
+ self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY
+ except ValueError:
+ self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY
+ logging.warning(f'slow_io_detect_frequency type conversion has error, use default value: {self.__slow_io_detect_frequency}.')
+
+ def __read__disks_to_detect(self, items_common: dict):
+ disks_to_detection = items_common.get('disks_to_detect')
+ if disks_to_detection is None:
+            logging.warning('config of disks_to_detect not found, the default value will be used.')
+ self.__disks_to_detection = None
+ return
+ try:
+ disks_to_detection_list = json.loads(disks_to_detection)
+ for disk_to_detection in disks_to_detection_list:
+ disk_name = disk_to_detection.get('disk_name', None)
+ stage_name = disk_to_detection.get('stage_name', None)
+ io_access_type_name = disk_to_detection.get('io_access_type_name', None)
+ metric_name = disk_to_detection.get('metric_name', None)
+ if not (disk_name is None or stage_name is None or io_access_type_name is None or metric_name is None):
+ metric_name_object = MetricName(disk_name, stage_name, io_access_type_name, metric_name)
+ self.__disks_to_detection.append(metric_name_object)
+ else:
+                    logging.warning(f'invalid entry in disks_to_detect config: {disk_to_detection}, it will be ignored.')
+ except json.decoder.JSONDecodeError as e:
+            logging.warning(f'disks_to_detect config is invalid: {e}, it will be ignored and the default value used.')
+ self.__disks_to_detection = None
+
+ def __read__train_data_duration(self, items_algorithm: dict):
+ try:
+ self.__train_data_duration = float(items_algorithm.get('train_data_duration',
+ ConfigParser.DEFAULT_TRAIN_DATA_DURATION))
+ if self.__train_data_duration <= 0 or self.__train_data_duration > 720:
+ logging.warning(
+ f'the train_data_duration: {self.__train_data_duration} you set is invalid, use default value: {ConfigParser.DEFAULT_TRAIN_DATA_DURATION}.')
+ self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION
+ except ValueError:
+ self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION
+ logging.warning(f'the train_data_duration type conversion has error, use default value: {self.__train_data_duration}.')
+
+ def __read__train_update_duration(self, items_algorithm: dict):
+ default_train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION
+ if default_train_update_duration > self.__train_data_duration:
+ default_train_update_duration = self.__train_data_duration / 2
+
+ try:
+ self.__train_update_duration = float(items_algorithm.get('train_update_duration',
+ ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION))
+ if self.__train_update_duration <= 0 or self.__train_update_duration > self.__train_data_duration:
+ logging.warning(
+ f'the train_update_duration: {self.__train_update_duration} you set is invalid, use default value: {default_train_update_duration}.')
+ self.__train_update_duration = default_train_update_duration
+ except ValueError:
+ self.__train_update_duration = default_train_update_duration
+ logging.warning(f'the train_update_duration type conversion has error, use default value: {self.__train_update_duration}.')
+
+ def __read__algorithm_type_and_parameter(self, items_algorithm: dict):
+ algorithm_type = items_algorithm.get('algorithm_type', ConfigParser.DEFAULT_ALGORITHM_TYPE)
+ self.__algorithm_type = get_threshold_type_enum(algorithm_type)
+
+ if self.__algorithm_type == ThresholdType.NSigmaThreshold:
+ try:
+ self.__n_sigma_parameter = float(items_algorithm.get('n_sigma_parameter',
+ ConfigParser.DEFAULT_N_SIGMA_PARAMETER))
+ if self.__n_sigma_parameter <= 0 or self.__n_sigma_parameter > 10:
+ logging.warning(
+ f'the n_sigma_parameter: {self.__n_sigma_parameter} you set is invalid, use default value: {ConfigParser.DEFAULT_N_SIGMA_PARAMETER}.')
+ self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER
+ except ValueError:
+ self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER
+ logging.warning(f'the n_sigma_parameter type conversion has error, use default value: {self.__n_sigma_parameter}.')
+ elif self.__algorithm_type == ThresholdType.BoxplotThreshold:
+ try:
+ self.__boxplot_parameter = float(items_algorithm.get('boxplot_parameter',
+ ConfigParser.DEFAULT_BOXPLOT_PARAMETER))
+ if self.__boxplot_parameter <= 0 or self.__boxplot_parameter > 10:
+ logging.warning(
+ f'the boxplot_parameter: {self.__boxplot_parameter} you set is invalid, use default value: {ConfigParser.DEFAULT_BOXPLOT_PARAMETER}.')
+                    self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER
+ except ValueError:
+ self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER
+ logging.warning(f'the boxplot_parameter type conversion has error, use default value: {self.__boxplot_parameter}.')
+
+ def __read__window_size(self, items_sliding_window: dict):
+ try:
+ self.__window_size = int(items_sliding_window.get('window_size',
+ ConfigParser.DEFAULT_WINDOW_SIZE))
+ if self.__window_size < 1 or self.__window_size > 3600:
+ logging.warning(
+ f'the window_size: {self.__window_size} you set is invalid, use default value: {ConfigParser.DEFAULT_WINDOW_SIZE}.')
+ self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE
+ except ValueError:
+ self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE
+ logging.warning(f'window_size type conversion has error, use default value: {self.__window_size}.')
+
+ def __read__window_minimum_threshold(self, items_sliding_window: dict):
+ default_window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD
+ if default_window_minimum_threshold > self.__window_size:
+ default_window_minimum_threshold = self.__window_size / 2
+ try:
+ self.__window_minimum_threshold = (
+ int(items_sliding_window.get('window_minimum_threshold',
+ ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD)))
+ if self.__window_minimum_threshold < 1 or self.__window_minimum_threshold > self.__window_size:
+ logging.warning(
+ f'the window_minimum_threshold: {self.__window_minimum_threshold} you set is invalid, use default value: {default_window_minimum_threshold}.')
+ self.__window_minimum_threshold = default_window_minimum_threshold
+ except ValueError:
+ self.__window_minimum_threshold = default_window_minimum_threshold
+ logging.warning(f'window_minimum_threshold type conversion has error, use default value: {self.__window_minimum_threshold}.')
+
+ def read_config_from_file(self):
+ con = configparser.ConfigParser()
+ con.read(self.__config_file_name, encoding='utf-8')
+
+ if con.has_section('common'):
+ items_common = dict(con.items('common'))
+ self.__log_level = items_common.get('log_level', ConfigParser.DEFAULT_LOG_LEVEL)
+ init_log_format(self.__log_level)
+ self.__read_absolute_threshold(items_common)
+ self.__read__slow_io_detect_frequency(items_common)
+ self.__read__disks_to_detect(items_common)
+ else:
+ init_log_format(self.__log_level)
+ logging.warning("common section parameter not found, it will be set to default value.")
+
+ if con.has_section('algorithm'):
+ items_algorithm = dict(con.items('algorithm'))
+ self.__read__train_data_duration(items_algorithm)
+ self.__read__train_update_duration(items_algorithm)
+ self.__read__algorithm_type_and_parameter(items_algorithm)
+ else:
+ logging.warning("algorithm section parameter not found, it will be set to default value.")
+
+ if con.has_section('sliding_window'):
+ items_sliding_window = dict(con.items('sliding_window'))
+ sliding_window_type = items_sliding_window.get('sliding_window_type',
+ ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE)
+ self.__sliding_window_type = get_sliding_window_type_enum(sliding_window_type)
+ self.__read__window_size(items_sliding_window)
+ self.__read__window_minimum_threshold(items_sliding_window)
+ else:
+ logging.warning("sliding_window section parameter not found, it will be set to default value.")
+
+ self.__print_all_config_value()
+
+ def __print_all_config_value(self):
+ pass
+
+ def get_slow_io_detect_frequency(self):
+ return self.__slow_io_detect_frequency
+
+ def get_algorithm_type(self):
+ return self.__algorithm_type
+
+ def get_sliding_window_type(self):
+ return self.__sliding_window_type
+
+ def get_train_data_duration_and_train_update_duration(self):
+ return self.__train_data_duration, self.__train_update_duration
+
+ def get_window_size_and_window_minimum_threshold(self):
+ return self.__window_size, self.__window_minimum_threshold
+
+ def get_absolute_threshold(self):
+ return self.__absolute_threshold
+
+ def get_log_level(self):
+ return self.__log_level
+
+ def get_disks_to_detection(self):
+ return self.__disks_to_detection
+
+ def get_boxplot_parameter(self):
+ return self.__boxplot_parameter
+
+ def get_n_sigma_parameter(self):
+ return self.__n_sigma_parameter
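Note: a minimal usage sketch of the `ConfigParser` class added above; the path matches the `CONFIG_FILE` constant used by ai_block_io.py.

```python
from sentryPlugins.ai_block_io.config_parser import ConfigParser

cp = ConfigParser("/etc/sysSentry/plugins/ai_block_io.ini")
cp.read_config_from_file()   # each missing section falls back to defaults
print(cp.get_algorithm_type())
print(cp.get_window_size_and_window_minimum_threshold())
```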
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py
similarity index 99%
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py
rename to src/python/sentryPlugins/ai_block_io/data_access.py
index d9f3460..01c5315 100644
--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py
+++ b/src/python/sentryPlugins/ai_block_io/data_access.py
@@ -17,6 +17,8 @@ from sentryCollector.collect_plugin import (
get_io_data,
is_iocollect_valid,
)
+
+
from .io_data import IOStageData, IOData
COLLECT_STAGES = [
@@ -32,6 +34,7 @@ COLLECT_STAGES = [
"iocost",
]
+
def check_collect_valid(period):
data_raw = is_iocollect_valid(period)
if data_raw["ret"] == 0:
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
similarity index 77%
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py
rename to src/python/sentryPlugins/ai_block_io/detector.py
index eda9825..bcf62cb 100644
--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
@@ -26,19 +26,26 @@ class Detector:
self._threshold = threshold
self._slidingWindow = sliding_window
self._threshold.attach_observer(self._slidingWindow)
+ self._count = 0
def get_metric_name(self):
return self._metric_name
def is_slow_io_event(self, io_data_dict_with_disk_name: dict):
- logging.debug(f'Enter Detector: {self}')
+ self._count += 1
+ if self._count % 15 == 0:
+ self._count = 0
+ logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.")
+ logging.debug(f'enter Detector: {self}')
metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name)
- if metric_value > 1e-6:
- logging.debug(f'Input metric value: {str(metric_value)}')
- self._threshold.push_latest_data_to_queue(metric_value)
+ if metric_value is None:
+            logging.debug('metric value not found, skip this detection round.')
+ return False, None, None
+ logging.debug(f'input metric value: {str(metric_value)}')
+ self._threshold.push_latest_data_to_queue(metric_value)
detection_result = self._slidingWindow.is_slow_io_event(metric_value)
logging.debug(f'Detection result: {str(detection_result)}')
- logging.debug(f'Exit Detector: {self}')
+ logging.debug(f'exit Detector: {self}')
return detection_result
def __repr__(self):
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py b/src/python/sentryPlugins/ai_block_io/io_data.py
similarity index 100%
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py
rename to src/python/sentryPlugins/ai_block_io/io_data.py
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py
similarity index 100%
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py
rename to src/python/sentryPlugins/ai_block_io/sliding_window.py
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py b/src/python/sentryPlugins/ai_block_io/threshold.py
similarity index 92%
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py
rename to src/python/sentryPlugins/ai_block_io/threshold.py
index 9e1ca7b..ff85d85 100644
--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py
+++ b/src/python/sentryPlugins/ai_block_io/threshold.py
@@ -79,9 +79,9 @@ class AbsoluteThreshold(Threshold):
class BoxplotThreshold(Threshold):
- def __init__(self, parameter: float = 1.5, data_queue_size: int = 10000, data_queue_update_size: int = 1000):
+ def __init__(self, boxplot_parameter: float = 1.5, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs):
super().__init__(data_queue_size, data_queue_update_size)
- self.parameter = parameter
+ self.parameter = boxplot_parameter
def _update_threshold(self):
data = list(self.data_queue.queue)
@@ -94,6 +94,8 @@ class BoxplotThreshold(Threshold):
self.notify_observer()
def push_latest_data_to_queue(self, data):
+ if data < 1e-6:
+ return
try:
self.data_queue.put(data, block=False)
except queue.Full:
@@ -111,9 +113,9 @@ class BoxplotThreshold(Threshold):
class NSigmaThreshold(Threshold):
- def __init__(self, parameter: float = 2.0, data_queue_size: int = 10000, data_queue_update_size: int = 1000):
+ def __init__(self, n_sigma_parameter: float = 3.0, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs):
super().__init__(data_queue_size, data_queue_update_size)
- self.parameter = parameter
+ self.parameter = n_sigma_parameter
def _update_threshold(self):
data = list(self.data_queue.queue)
@@ -125,6 +127,8 @@ class NSigmaThreshold(Threshold):
self.notify_observer()
def push_latest_data_to_queue(self, data):
+ if data < 1e-6:
+ return
try:
self.data_queue.put(data, block=False)
except queue.Full:
@@ -157,4 +161,3 @@ class ThresholdFactory:
return NSigmaThreshold(*args, **kwargs)
else:
raise ValueError(f"Invalid threshold type: {threshold_type}")
-
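Note: the hunks above show only the edges of the threshold classes; the `_update_threshold()` bodies are not in this diff. For reference, a sketch of the standard formulas the two algorithm names refer to (my assumption, not code from the patch):

```python
import statistics

def boxplot_threshold(data, parameter=1.5):
    q1, _, q3 = statistics.quantiles(data, n=4)   # quartiles
    return q3 + parameter * (q3 - q1)             # Q3 + k * IQR

def n_sigma_threshold(data, parameter=3.0):
    return statistics.mean(data) + parameter * statistics.stdev(data)

sample = [1.0, 1.2, 0.9, 1.1, 5.0]
print(boxplot_threshold(sample))   # upper fence for slow-IO latency
print(n_sigma_threshold(sample))
```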
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py b/src/python/sentryPlugins/ai_block_io/utils.py
similarity index 86%
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py
rename to src/python/sentryPlugins/ai_block_io/utils.py
index f66e5ed..8dbba06 100644
--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py
+++ b/src/python/sentryPlugins/ai_block_io/utils.py
@@ -8,13 +8,16 @@
# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
# PURPOSE.
# See the Mulan PSL v2 for more details.
+
import logging
from dataclasses import asdict
+
from .threshold import ThresholdType
from .sliding_window import SlidingWindowType
from .io_data import MetricName, IOData
+
def get_threshold_type_enum(algorithm_type: str):
if algorithm_type.lower() == 'absolute':
return ThresholdType.AbsoluteThreshold
@@ -22,7 +25,7 @@ def get_threshold_type_enum(algorithm_type: str):
return ThresholdType.BoxplotThreshold
if algorithm_type.lower() == 'n_sigma':
return ThresholdType.NSigmaThreshold
- logging.info('not found correct algorithm type, use default: boxplot.')
+ logging.warning(f"the algorithm type: {algorithm_type} you set is invalid, use default value: boxplot")
return ThresholdType.BoxplotThreshold
@@ -33,7 +36,7 @@ def get_sliding_window_type_enum(sliding_window_type: str):
return SlidingWindowType.ContinuousSlidingWindow
if sliding_window_type.lower() == 'median':
return SlidingWindowType.MedianSlidingWindow
- logging.info('not found correct sliding window type, use default: not_continuous.')
+ logging.warning(f"the sliding window type: {sliding_window_type} you set is invalid, use default value: not_continuous")
return SlidingWindowType.NotContinuousSlidingWindow
@@ -62,6 +65,8 @@ def get_log_level(log_level: str):
return logging.INFO
elif log_level.lower() == 'warning':
return logging.WARNING
- elif log_level.lower() == 'fatal':
- return logging.FATAL
- return None
+ elif log_level.lower() == 'error':
+ return logging.ERROR
+ elif log_level.lower() == 'critical':
+ return logging.CRITICAL
+ return logging.INFO
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py
deleted file mode 100644
index cd4e6f1..0000000
--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# coding: utf-8
-# Copyright (c) 2024 Huawei Technologies Co., Ltd.
-# sysSentry is licensed under the Mulan PSL v2.
-# You can use this software according to the terms and conditions of the Mulan PSL v2.
-# You may obtain a copy of Mulan PSL v2 at:
-# http://license.coscl.org.cn/MulanPSL2
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
-# PURPOSE.
-# See the Mulan PSL v2 for more details.
-
-import configparser
-import logging
-
-
-class ConfigParser:
-
- DEFAULT_ABSOLUTE_THRESHOLD = 40
- DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1
- DEFAULT_LOG_LEVEL = 'info'
- DEFAULT_TRAIN_DATA_DURATION = 24
- DEFAULT_TRAIN_UPDATE_DURATION = 2
- DEFAULT_ALGORITHM_TYPE = 'boxplot'
- DEFAULT_N_SIGMA_PARAMETER = 3
- DEFAULT_BOXPLOT_PARAMETER = 1.5
- DEFAULT_SLIDING_WINDOW_TYPE = 'not_continuous'
- DEFAULT_WINDOW_SIZE = 30
- DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6
-
- def __init__(self, config_file_name):
- self.__boxplot_parameter = None
- self.__window_minimum_threshold = None
- self.__window_size = None
- self.__sliding_window_type = None
- self.__n_sigma_parameter = None
- self.__algorithm_type = None
- self.__train_update_duration = None
- self.__log_level = None
- self.__slow_io_detect_frequency = None
- self.__absolute_threshold = None
- self.__train_data_duration = None
- self.__config_file_name = config_file_name
-
- def read_config_from_file(self):
-
- con = configparser.ConfigParser()
- con.read(self.__config_file_name, encoding='utf-8')
-
- items_common = dict(con.items('common'))
- items_algorithm = dict(con.items('algorithm'))
- items_sliding_window = dict(con.items('sliding_window'))
-
- try:
- self.__absolute_threshold = int(items_common.get('absolute_threshold',
- ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD))
- except ValueError:
- self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD
- logging.warning('absolute threshold type conversion has error, use default value.')
-
- try:
- self.__slow_io_detect_frequency = int(items_common.get('slow_io_detect_frequency',
- ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY))
- except ValueError:
- self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY
- logging.warning('slow_io_detect_frequency type conversion has error, use default value.')
-
- self.__log_level = items_common.get('log_level', ConfigParser.DEFAULT_LOG_LEVEL)
-
- try:
- self.__train_data_duration = float(items_algorithm.get('train_data_duration',
- ConfigParser.DEFAULT_TRAIN_DATA_DURATION))
- except ValueError:
- self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION
- logging.warning('train_data_duration type conversion has error, use default value.')
-
- try:
- self.__train_update_duration = float(items_algorithm.get('train_update_duration',
- ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION))
- except ValueError:
- self.__train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION
- logging.warning('train_update_duration type conversion has error, use default value.')
-
- try:
- self.__algorithm_type = items_algorithm.get('algorithm_type', ConfigParser.DEFAULT_ALGORITHM_TYPE)
- except ValueError:
- self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE
- logging.warning('algorithmType type conversion has error, use default value.')
-
- if self.__algorithm_type == 'n_sigma':
- try:
- self.__n_sigma_parameter = float(items_algorithm.get('n_sigma_parameter',
- ConfigParser.DEFAULT_N_SIGMA_PARAMETER))
- except ValueError:
- self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER
- logging.warning('n_sigma_parameter type conversion has error, use default value.')
- elif self.__algorithm_type == 'boxplot':
- try:
- self.__boxplot_parameter = float(items_algorithm.get('boxplot_parameter',
- ConfigParser.DEFAULT_BOXPLOT_PARAMETER))
- except ValueError:
- self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER
- logging.warning('boxplot_parameter type conversion has error, use default value.')
-
- self.__sliding_window_type = items_sliding_window.get('sliding_window_type',
- ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE)
-
- try:
- self.__window_size = int(items_sliding_window.get('window_size',
- ConfigParser.DEFAULT_WINDOW_SIZE))
- except ValueError:
- self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE
- logging.warning('window_size type conversion has error, use default value.')
-
- try:
- self.__window_minimum_threshold = (
- int(items_sliding_window.get('window_minimum_threshold',
- ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD)))
- except ValueError:
- self.__window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD
- logging.warning('window_minimum_threshold type conversion has error, use default value.')
-
- def get_slow_io_detect_frequency(self):
- return self.__slow_io_detect_frequency
-
- def get_algorithm_type(self):
- return self.__algorithm_type
-
- def get_sliding_window_type(self):
- return self.__sliding_window_type
-
- def get_train_data_duration_and_train_update_duration(self):
- return self.__train_data_duration, self.__train_update_duration
-
- def get_window_size_and_window_minimum_threshold(self):
- return self.__window_size, self.__window_minimum_threshold
-
- def get_absolute_threshold(self):
- return self.__absolute_threshold
-
- def get_log_level(self):
- return self.__log_level
diff --git a/src/python/setup.py b/src/python/setup.py
index dac6481..9e26a10 100644
--- a/src/python/setup.py
+++ b/src/python/setup.py
@@ -34,7 +34,7 @@ setup(
'xalarmd=xalarm.xalarm_daemon:alarm_process_create',
'sentryCollector=sentryCollector.collectd:main',
'avg_block_io=sentryPlugins.avg_block_io.avg_block_io:main',
- 'ai_threshold_slow_io_detection=sentryPlugins.ai_threshold_slow_io_detection.slow_io_detection:main'
+ 'ai_block_io=sentryPlugins.ai_block_io.ai_block_io:main'
]
},
)
--
2.23.0

View File

@ -1,48 +0,0 @@
From fe1bb401c1f77860616e74c1dbf5fe6aa862b17d Mon Sep 17 00:00:00 2001
From: jinsaihang <jinsaihang@h-partners.com>
Date: Sat, 26 Oct 2024 07:18:16 +0000
Subject: [PATCH] fix alarm_info newline break error
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
---
sysSentry-1.0.2/src/python/syssentry/alarm.py | 23 +++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
index 2575307..b35a126 100644
--- a/src/python/syssentry/alarm.py
+++ b/src/python/syssentry/alarm.py
@@ -180,7 +180,30 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di
if 'details' in alarm_info:
alarm_info.pop('details', None)
alarm.pop('msg1', None)
+
+ # dump each {key,value} of details in one line
+ if 'details' in alarm_info and isinstance(alarm_info['details'], dict):
+ for key in alarm_info['details']:
+ alarm_info['details'][key] = json.dumps(alarm_info['details'][key], indent=None)
+
alarm['alarm_info'] = alarm_info
+ alarm_list = [alarm for alarm in alarm_list if 'alarm_source' in alarm['alarm_info'] and alarm['alarm_info']['alarm_source'] == task_name]
+
+ alarm_level_mapping = {
+ 1: 'MINOR_ALM',
+ 2: 'MAJOR_ALM',
+ 3: 'CRITICAL_ALM'
+ }
+
+ alarm_type_mapping = {
+ 1: 'ALARM_TYPE_OCCUR',
+ 2: 'ALARM_TYPE_RECOVER'
+ }
+
+ for alarm in alarm_list:
+ alarm['alarm_level'] = alarm_level_mapping.get(alarm['alarm_level'], 'UNKNOWN_LEVEL')
+ alarm['alarm_type'] = alarm_type_mapping.get(alarm['alarm_type'], 'UNKNOWN_TYPE')
return alarm_list
+
finally:
alarm_list_lock.release()
--
2.27.0
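Note: a small sketch of the numeric-to-name mapping the patch above adds to `get_alarm_result()`:

```python
alarm_level_mapping = {1: 'MINOR_ALM', 2: 'MAJOR_ALM', 3: 'CRITICAL_ALM'}
alarm_type_mapping = {1: 'ALARM_TYPE_OCCUR', 2: 'ALARM_TYPE_RECOVER'}

alarm = {'alarm_level': 2, 'alarm_type': 1}
alarm['alarm_level'] = alarm_level_mapping.get(alarm['alarm_level'], 'UNKNOWN_LEVEL')
alarm['alarm_type'] = alarm_type_mapping.get(alarm['alarm_type'], 'UNKNOWN_TYPE')
print(alarm)   # {'alarm_level': 'MAJOR_ALM', 'alarm_type': 'ALARM_TYPE_OCCUR'}
```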

View File

@ -1,323 +0,0 @@
From e6eb39799b3ca15fb385c572863417ea26bdfa66 Mon Sep 17 00:00:00 2001
From: zhuofeng <zhuofeng2@huawei.com>
Date: Wed, 25 Sep 2024 11:03:29 +0800
Subject: [PATCH] fix-bug-step-2-about-collect-module-and-avg-block-io
---
src/python/sentryCollector/collect_config.py | 11 ++-
src/python/sentryCollector/collect_io.py | 25 ++---
src/python/sentryCollector/collect_plugin.py | 6 +-
src/python/sentryCollector/collect_server.py | 1 -
src/python/sentryCollector/collectd.py | 4 +-
.../avg_block_io/avg_block_io.py | 92 ++++++++++++++-----
6 files changed, 96 insertions(+), 43 deletions(-)
diff --git a/src/python/sentryCollector/collect_config.py b/src/python/sentryCollector/collect_config.py
index b6cc75c..0fdd9f0 100644
--- a/src/python/sentryCollector/collect_config.py
+++ b/src/python/sentryCollector/collect_config.py
@@ -49,14 +49,14 @@ class CollectConfig:
self.config = configparser.ConfigParser()
self.config.read(self.filename)
except configparser.Error:
- logging.error("collectd configure file read failed")
+ logging.error("collect configure file read failed")
return
try:
common_config = self.config[CONF_COMMON]
- modules_str = common_config[CONF_MODULES]
+ modules_str = common_config[CONF_MODULES].lower()
# remove space
- modules_list = modules_str.replace(" ", "").split(',')
+ modules_list = set(modules_str.replace(" ", "").split(','))
except KeyError as e:
logging.error("read config data failed, %s", e)
return
@@ -98,7 +98,7 @@ class CollectConfig:
CONF_IO, CONF_IO_MAX_SAVE, CONF_IO_MAX_SAVE_DEFAULT)
result_io_config[CONF_IO_MAX_SAVE] = CONF_IO_MAX_SAVE_DEFAULT
# disk
- disk = io_map_value.get(CONF_IO_DISK)
+ disk = io_map_value.get(CONF_IO_DISK).lower()
if disk:
disk_str = disk.replace(" ", "")
pattern = r'^[a-zA-Z0-9-_,]+$'
@@ -106,12 +106,13 @@ class CollectConfig:
logging.warning("module_name = %s section, field = %s is incorrect, use default %s",
CONF_IO, CONF_IO_DISK, CONF_IO_DISK_DEFAULT)
disk_str = CONF_IO_DISK_DEFAULT
+ disk_str = ",".join(set(disk_str.split(',')))
result_io_config[CONF_IO_DISK] = disk_str
else:
logging.warning("module_name = %s section, field = %s is incorrect, use default %s",
CONF_IO, CONF_IO_DISK, CONF_IO_DISK_DEFAULT)
result_io_config[CONF_IO_DISK] = CONF_IO_DISK_DEFAULT
- logging.info("config get_io_config: %s", result_io_config)
+ logging.debug("config get_io_config: %s", result_io_config)
return result_io_config
def get_common_config(self):
diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py
index 104b734..9c8dae7 100644
--- a/src/python/sentryCollector/collect_io.py
+++ b/src/python/sentryCollector/collect_io.py
@@ -177,10 +177,8 @@ class CollectIo():
def is_kernel_avaliable(self):
base_path = '/sys/kernel/debug/block'
+ all_disk = []
for disk_name in os.listdir(base_path):
- if not self.loop_all and disk_name not in self.disk_list:
- continue
-
disk_path = os.path.join(base_path, disk_name)
blk_io_hierarchy_path = os.path.join(disk_path, 'blk_io_hierarchy')
@@ -190,12 +188,18 @@ class CollectIo():
for file_name in os.listdir(blk_io_hierarchy_path):
file_path = os.path.join(blk_io_hierarchy_path, file_name)
-
if file_name == 'stats':
- stage_list = self.extract_first_column(file_path)
- self.disk_map_stage[disk_name] = stage_list
- self.window_value[disk_name] = {}
- IO_GLOBAL_DATA[disk_name] = {}
+ all_disk.append(disk_name)
+
+ for disk_name in self.disk_list:
+ if not self.loop_all and disk_name not in all_disk:
+ logging.warning("the %s disk not exist!", disk_name)
+ continue
+ stats_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/stats'.format(disk_name)
+ stage_list = self.extract_first_column(stats_file)
+ self.disk_map_stage[disk_name] = stage_list
+ self.window_value[disk_name] = {}
+ IO_GLOBAL_DATA[disk_name] = {}
return len(IO_GLOBAL_DATA) != 0
@@ -203,7 +207,7 @@ class CollectIo():
logging.info("collect io thread start")
if not self.is_kernel_avaliable() or len(self.disk_map_stage) == 0:
- logging.warning("no disks meet the requirements. collect io thread exits")
+ logging.warning("no disks meet the requirements. collect io thread exit")
return
for disk_name, stage_list in self.disk_map_stage.items():
@@ -239,5 +243,4 @@ class CollectIo():
# set stop event, notify thread exit
def stop_thread(self):
- logging.debug("collect io thread is preparing to exit")
- self.stop_event.set()
+ self.stop_event.set()
diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py
index 9132473..1faa5e3 100644
--- a/src/python/sentryCollector/collect_plugin.py
+++ b/src/python/sentryCollector/collect_plugin.py
@@ -10,7 +10,7 @@
# See the Mulan PSL v2 for more details.
"""
-collcet plugin
+collect plugin
"""
import json
import socket
@@ -75,7 +75,7 @@ def client_send_and_recv(request_data, data_str_len, protocol):
try:
client_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
except socket.error:
- print("collect_plugin: client creat socket error")
+ print("collect_plugin: client create socket error")
return None
try:
@@ -128,7 +128,7 @@ def client_send_and_recv(request_data, data_str_len, protocol):
def validate_parameters(param, len_limit, char_limit):
ret = ResultMessage.RESULT_SUCCEED
if not param:
- print("parm is invalid")
+ print("param is invalid")
ret = ResultMessage.RESULT_NOT_PARAM
return [False, ret]
diff --git a/src/python/sentryCollector/collect_server.py b/src/python/sentryCollector/collect_server.py
index bab4e56..11d1af0 100644
--- a/src/python/sentryCollector/collect_server.py
+++ b/src/python/sentryCollector/collect_server.py
@@ -281,5 +281,4 @@ class CollectServer():
pass
def stop_thread(self):
- logging.debug("collect listen thread is preparing to exit")
self.stop_event.set()
diff --git a/src/python/sentryCollector/collectd.py b/src/python/sentryCollector/collectd.py
index 3a836df..d9d8862 100644
--- a/src/python/sentryCollector/collectd.py
+++ b/src/python/sentryCollector/collectd.py
@@ -79,7 +79,7 @@ def main():
for info in module_list:
class_name = Module_Map_Class.get(info)
if not class_name:
- logging.info("%s correspond to class is not exists", info)
+ logging.info("%s correspond to class is not exist", info)
continue
cn = class_name(module_config)
collect_thread = threading.Thread(target=cn.main_loop)
@@ -94,4 +94,4 @@ def main():
finally:
pass
- logging.info("All threads have finished. Main thread is exiting.")
\ No newline at end of file
+ logging.info("all threads have finished. main thread exit.")
\ No newline at end of file
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
index 73f0b22..ac35be2 100644
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
@@ -28,33 +28,53 @@ def log_invalid_keys(not_in_list, keys_name, config_list, default_list):
def read_config_common(config):
- """read config file, get [common] section value"""
- try:
- common_sec = config['common']
- except configparser.NoSectionError:
+ """read config file, get [common] section value"""
+ if not config.has_section("common"):
report_alarm_fail("Cannot find common section in config file")
try:
- period_time = int(common_sec.get("period_time", 1))
- if not (1 <= period_time <= 300):
- raise ValueError("Invalid period_time")
- except ValueError:
- period_time = 1
- logging.warning("Invalid period_time, set to 1s")
+ disk_name = config.get("common", "disk")
+ disk = [] if disk_name == "default" else disk_name.split(",")
+ except configparser.NoOptionError:
+ disk = []
+ logging.warning("Unset disk, set to default")
- disk = common_sec.get('disk').split(",") if common_sec.get('disk') not in [None, 'default'] else []
- stage = common_sec.get('stage').split(",") if common_sec.get('stage') not in [None, 'default'] else []
+ try:
+ stage_name = config.get("common", "stage")
+ stage = [] if stage_name == "default" else stage_name.split(",")
+ except configparser.NoOptionError:
+ stage = []
+ logging.warning("Unset stage, set to read,write")
if len(disk) > 10:
logging.warning("Too many disks, record only max 10 disks")
disk = disk[:10]
- iotype = common_sec.get('iotype', 'read,write').split(",")
- iotype_list = [rw.lower() for rw in iotype if rw.lower() in ['read', 'write', 'flush', 'discard']]
- err_iotype = [rw for rw in iotype if rw.lower() not in ['read', 'write', 'flush', 'discard']]
+ try:
+ iotype_name = config.get("common", "iotype").split(",")
+ iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write', 'flush', 'discard']]
+ err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write', 'flush', 'discard']]
+
+ if iotype_list in [None, []]:
+ iotype_list = ["read", "write"]
+ except configparser.NoOptionError:
+ iotype = ["read", "write"]
+ logging.warning("Unset iotype, set to default")
if err_iotype:
logging.warning("{} in common.iotype are not valid, set iotype={}".format(err_iotype, iotype_list))
+
+
+ try:
+ period_time = int(config.get("common", "period_time"))
+ if not (1 <= period_time <= 300):
+ raise ValueError("Invalid period_time")
+ except ValueError:
+ period_time = 1
+ logging.warning("Invalid period_time, set to 1s")
+ except configparser.NoOptionError:
+ period_time = 1
+ logging.warning("Unset period_time, use 1s as default")
return period_time, disk, stage, iotype_list
@@ -68,11 +88,23 @@ def read_config_algorithm(config):
win_size = int(config.get("algorithm", "win_size"))
if not (1 <= win_size <= 300):
raise ValueError("Invalid win_size")
+ except ValueError:
+ win_size = 30
+ logging.warning("Invalid win_size, set to 30")
+ except configparser.NoOptionError:
+ win_size = 30
+ logging.warning("Unset win_size, use 30 as default")
+
+ try:
win_threshold = int(config.get("algorithm", "win_threshold"))
if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size:
raise ValueError("Invalid win_threshold")
except ValueError:
- report_alarm_fail("Invalid win_threshold or win_size")
+ win_threshold = 6
+ logging.warning("Invalid win_threshold, set to 6")
+ except configparser.NoOptionError:
+ win_threshold = 6
+ logging.warning("Unset win_threshold, use 6 as default")
return win_size, win_threshold
@@ -80,6 +112,21 @@ def read_config_algorithm(config):
def read_config_lat_iodump(io_dic, config):
"""read config file, get [latency] [iodump] section value"""
common_param = {}
+ lat_sec = None
+ if not config.has_section("latency"):
+ logging.warning("Cannot find algorithm section in config file")
+ else:
+ lat_sec = config["latency"]
+
+ iodump_sec = None
+ if not config.has_section("iodump"):
+ logging.warning("Cannot find iodump section in config file")
+ else:
+ iodump_sec = config["iodump"]
+
+ if not lat_sec and not iodump_sec:
+ return common_param
+
for io_type in io_dic["iotype_list"]:
common_param[io_type] = {}
@@ -90,13 +137,16 @@ def read_config_lat_iodump(io_dic, config):
}
iodump_key = "{}_iodump_lim".format(io_type)
+ if iodump_sec and iodump_key in iodump_sec and iodump_sec[iodump_key].isdecimal():
+ common_param[io_type][iodump_key] = int(iodump_sec[iodump_key])
+
+ if not lat_sec:
+ continue
+
for key_suffix, key_template in latency_keys.items():
- if key_template in config["latency"] and config["latency"][key_template].isdecimal():
- common_param[io_type][key_template] = int(config["latency"][key_template])
+ if key_template in lat_sec and lat_sec[key_template].isdecimal():
+ common_param[io_type][key_template] = int(lat_sec[key_template])
- if iodump_key in config["iodump"] and config["iodump"][iodump_key].isdecimal():
- common_param[io_type][iodump_key] = int(config["iodump"][iodump_key])
-
return common_param
--
2.33.0
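A recurring pattern in this patch is splitting the old catch-all error handling into two distinct cases: the option is present but malformed (ValueError) versus absent (configparser.NoOptionError). A minimal standalone sketch of that layered fallback, using an in-memory config:

```python
import configparser
import logging

config = configparser.ConfigParser()
config.read_string("[common]\nperiod_time = abc\n")  # malformed on purpose

try:
    period_time = int(config.get("common", "period_time"))
    if not 1 <= period_time <= 300:
        raise ValueError("out of range")
except ValueError:
    period_time = 1          # present but unusable -> warn loudly
    logging.warning("Invalid period_time, set to 1s")
except configparser.NoOptionError:
    period_time = 1          # simply absent -> fall back to the default
    logging.warning("Unset period_time, use 1s as default")

print(period_time)  # -> 1
```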
View File
@ -1,243 +0,0 @@
From c9f62e01f09a56743ccc3e470f273875ab22ac5f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
Date: Wed, 9 Oct 2024 16:19:52 +0800
Subject: [PATCH] fix config relative some issues
---
.../sentryPlugins/ai_block_io/README.md | 1 -
.../sentryPlugins/ai_block_io/ai_block_io.py | 21 +++++-----
.../ai_block_io/config_parser.py | 42 +++++++++----------
.../sentryPlugins/ai_block_io/detector.py | 2 +-
.../ai_block_io/sliding_window.py | 8 ++--
.../sentryPlugins/ai_block_io/threshold.py | 6 +--
6 files changed, 39 insertions(+), 41 deletions(-)
diff --git a/src/python/sentryPlugins/ai_block_io/README.md b/src/python/sentryPlugins/ai_block_io/README.md
index f9b8388..95c1111 100644
--- a/src/python/sentryPlugins/ai_block_io/README.md
+++ b/src/python/sentryPlugins/ai_block_io/README.md
@@ -1,2 +1 @@
# slow_io_detection
-
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
index 31b8a97..3b00ef3 100644
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
@@ -16,8 +16,7 @@ import logging
from .detector import Detector
from .threshold import ThresholdFactory, AbsoluteThreshold
from .sliding_window import SlidingWindowFactory
-from .utils import (get_threshold_type_enum, get_sliding_window_type_enum, get_data_queue_size_and_update_size,
- get_log_level)
+from .utils import get_data_queue_size_and_update_size
from .config_parser import ConfigParser
from .data_access import get_io_data_from_collect_plug, check_collect_valid
from .io_data import MetricName
@@ -45,25 +44,25 @@ class SlowIODetection:
def __init_detector_name_list(self):
self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency())
+ logging.info(f"ai_block_io plug has found disks: {self._disk_list}")
disks_to_detection: list = self._config_parser.get_disks_to_detection()
# Case 1: None -> enable detection on all available disks
# Case 2: not None and len == 0 -> do not start detection on any disk
# Case 3: len != 0 -> detect the intersection with the available disks
if disks_to_detection is None:
+ logging.warning("you not specify any disk or use default, so ai_block_io will enable all available disk.")
for disk in self._disk_list:
self._detector_name_list.append(MetricName(disk, "bio", "read", "latency"))
self._detector_name_list.append(MetricName(disk, "bio", "write", "latency"))
elif len(disks_to_detection) == 0:
- logging.warning('please attention: conf file not specify any disk to detection, '
- 'so it will not start ai block io.')
+ logging.warning('please attention: conf file not specify any disk to detection, so it will not start ai block io.')
else:
- disks_name_to_detection = []
- for disk_name_to_detection in disks_to_detection:
- disks_name_to_detection.append(disk_name_to_detection.get_disk_name())
- disk_intersection = [disk for disk in self._disk_list if disk in disks_name_to_detection]
- for disk in disk_intersection:
- self._detector_name_list.append(MetricName(disk, "bio", "read", "latency"))
- self._detector_name_list.append(MetricName(disk, "bio", "write", "latency"))
+ for disk_to_detection in disks_to_detection:
+ if disk_to_detection in self._disk_list:
+ self._detector_name_list.append(MetricName(disk_to_detection, "bio", "read", "latency"))
+ self._detector_name_list.append(MetricName(disk_to_detection, "bio", "write", "latency"))
+ else:
+ logging.warning(f"disk[{disk_to_detection}] not in available disk list, so it will be ignored.")
logging.info(f'start to detection follow disk and it\'s metric: {self._detector_name_list}')
def __init_detector(self):
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
index 632391d..354c122 100644
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
@@ -10,18 +10,19 @@
# See the Mulan PSL v2 for more details.
import configparser
-import json
import logging
-from .io_data import MetricName
from .threshold import ThresholdType
from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_log_level
+
LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
def init_log_format(log_level: str):
- logging.basicConfig(level=get_log_level(log_level), format=LOG_FORMAT)
+ logging.basicConfig(level=get_log_level(log_level.lower()), format=LOG_FORMAT)
+ if log_level.lower() not in ('info', 'warning', 'error', 'debug'):
+ logging.warning(f'the log_level: {log_level} you set is invalid, use default value: info.')
class ConfigParser:
@@ -43,7 +44,7 @@ class ConfigParser:
self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD
self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY
self.__log_level = ConfigParser.DEFAULT_LOG_LEVEL
- self.__disks_to_detection: list = []
+ self.__disks_to_detection = None
self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE
self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION
@@ -83,26 +84,20 @@ class ConfigParser:
logging.warning(f'slow_io_detect_frequency type conversion has error, use default value: {self.__slow_io_detect_frequency}.')
def __read__disks_to_detect(self, items_common: dict):
- disks_to_detection = items_common.get('disks_to_detect')
+ disks_to_detection = items_common.get('disk')
if disks_to_detection is None:
- logging.warning(f'config of disks_to_detect not found, the default value be used.')
+ logging.warning(f'config of disk not found, the default value will be used.')
self.__disks_to_detection = None
return
- try:
- disks_to_detection_list = json.loads(disks_to_detection)
- for disk_to_detection in disks_to_detection_list:
- disk_name = disk_to_detection.get('disk_name', None)
- stage_name = disk_to_detection.get('stage_name', None)
- io_access_type_name = disk_to_detection.get('io_access_type_name', None)
- metric_name = disk_to_detection.get('metric_name', None)
- if not (disk_name is None or stage_name is None or io_access_type_name is None or metric_name is None):
- metric_name_object = MetricName(disk_name, stage_name, io_access_type_name, metric_name)
- self.__disks_to_detection.append(metric_name_object)
- else:
- logging.warning(f'config of disks_to_detect\'s some part has some error: {disk_to_detection}, it will be ignored.')
- except json.decoder.JSONDecodeError as e:
- logging.warning(f'config of disks_to_detect is error: {e}, it will be ignored and default value be used.')
+ disk_list = disks_to_detection.split(',')
+ if len(disk_list) == 0 or (len(disk_list) == 1 and disk_list[0] == ''):
+ logging.warning("you don't specify any disk.")
+ self.__disks_to_detection = []
+ return
+ if len(disk_list) == 1 and disk_list[0] == 'default':
self.__disks_to_detection = None
+ return
+ self.__disks_to_detection = disk_list
def __read__train_data_duration(self, items_algorithm: dict):
try:
@@ -189,7 +184,12 @@ class ConfigParser:
def read_config_from_file(self):
con = configparser.ConfigParser()
- con.read(self.__config_file_name, encoding='utf-8')
+ try:
+ con.read(self.__config_file_name, encoding='utf-8')
+ except configparser.Error as e:
+ init_log_format(self.__log_level)
+ logging.critical(f'config file read error: {e}, ai_block_io plug will exit.')
+ exit(1)
if con.has_section('common'):
items_common = dict(con.items('common'))
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
index bcf62cb..a48144f 100644
--- a/src/python/sentryPlugins/ai_block_io/detector.py
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
@@ -50,6 +50,6 @@ class Detector:
def __repr__(self):
return (f'disk_name: {self._metric_name.get_disk_name()}, stage_name: {self._metric_name.get_stage_name()},'
- f' access_type_name: {self._metric_name.get_io_access_type_name()},'
+ f' io_type_name: {self._metric_name.get_io_access_type_name()},'
f' metric_name: {self._metric_name.get_metric_name()}, threshold_type: {self._threshold},'
f' sliding_window_type: {self._slidingWindow}')
diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py
index d395d48..89191e5 100644
--- a/src/python/sentryPlugins/ai_block_io/sliding_window.py
+++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py
@@ -52,7 +52,7 @@ class SlidingWindow:
return False, None, None
def __repr__(self):
- return "SlidingWindow"
+ return "[SlidingWindow]"
class NotContinuousSlidingWindow(SlidingWindow):
@@ -65,7 +65,7 @@ class NotContinuousSlidingWindow(SlidingWindow):
return False, self._io_data_queue, self._ai_threshold
def __repr__(self):
- return "NotContinuousSlidingWindow"
+ return f"[NotContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]"
class ContinuousSlidingWindow(SlidingWindow):
@@ -84,7 +84,7 @@ class ContinuousSlidingWindow(SlidingWindow):
return False, self._io_data_queue, self._ai_threshold
def __repr__(self):
- return "ContinuousSlidingWindow"
+ return f"[ContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]"
class MedianSlidingWindow(SlidingWindow):
@@ -98,7 +98,7 @@ class MedianSlidingWindow(SlidingWindow):
return False, self._io_data_queue, self._ai_threshold
def __repr__(self):
- return "MedianSlidingWindow"
+ return f"[MedianSlidingWindow, window size: {self._queue_length}]"
class SlidingWindowFactory:
diff --git a/src/python/sentryPlugins/ai_block_io/threshold.py b/src/python/sentryPlugins/ai_block_io/threshold.py
index ff85d85..3b7a5a8 100644
--- a/src/python/sentryPlugins/ai_block_io/threshold.py
+++ b/src/python/sentryPlugins/ai_block_io/threshold.py
@@ -75,7 +75,7 @@ class AbsoluteThreshold(Threshold):
pass
def __repr__(self):
- return "AbsoluteThreshold"
+ return "[AbsoluteThreshold]"
class BoxplotThreshold(Threshold):
@@ -109,7 +109,7 @@ class BoxplotThreshold(Threshold):
self.new_data_size = 0
def __repr__(self):
- return "BoxplotThreshold"
+ return f"[BoxplotThreshold, param is: {self.parameter}]"
class NSigmaThreshold(Threshold):
@@ -142,7 +142,7 @@ class NSigmaThreshold(Threshold):
self.new_data_size = 0
def __repr__(self):
- return "NSigmaThreshold"
+ return f"[NSigmaThreshold, param is: {self.parameter}]"
class ThresholdType(Enum):
--
2.23.0
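The config_parser rewrite replaces the JSON disk specification with a plain comma-separated string plus a default sentinel. A sketch of the three cases the new parser distinguishes; the None/empty-list semantics below follow the comments in the hunk:

```python
def parse_disk_option(value):
    """None -> detect all disks, [] -> detect none, list -> detect these."""
    if value is None:          # option absent from the config file
        return None
    disks = value.split(",")
    if disks == [""]:          # option present but empty
        return []
    if disks == ["default"]:   # explicit sentinel, same as absent
        return None
    return disks

assert parse_disk_option(None) is None
assert parse_disk_option("") == []
assert parse_disk_option("default") is None
assert parse_disk_option("sda,sdb") == ["sda", "sdb"]
```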
View File
@ -1,37 +0,0 @@
From 65ceade489c4018c3f315104d70be0550a28d9d9 Mon Sep 17 00:00:00 2001
From: shixuantong <shixuantong1@huawei.com>
Date: Wed, 11 Sep 2024 10:23:41 +0800
Subject: [PATCH] fix configparser.InterpolationSyntaxError
---
src/python/syssentry/sentry_config.py | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/src/python/syssentry/sentry_config.py b/src/python/syssentry/sentry_config.py
index 01f3df8..a0e7b79 100644
--- a/src/python/syssentry/sentry_config.py
+++ b/src/python/syssentry/sentry_config.py
@@ -103,14 +103,18 @@ class CpuPluginsParamsConfig:
"""read config file"""
config_param_section_args = {}
if os.path.exists(self.config_file):
- self.config.read(self.config_file)
try:
+ self.config.read(self.config_file)
config_param_section_args = dict(self.config[self.param_section_name])
- except (ValueError, KeyError):
+ except (ValueError, KeyError, configparser.InterpolationSyntaxError):
config_param_section_args = {}
+ logging.error("Failed to parse cpu_sentry.ini!")
return config_param_section_args
def join_cpu_start_cmd(self, cpu_param_dict: dict) -> str:
+ if not cpu_param_dict:
+ return ""
+
cpu_list = cpu_param_dict.get("cpu_list", "default")
if cpu_list == "default":
cpu_list = CpuPluginsParamsConfig.get_cpu_info()
--
2.27.0
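Background for this fix: ConfigParser applies %-interpolation by default, so a bare % in a value raises InterpolationSyntaxError when the value is read, which is why read() and the section access both belong inside the try. An alternative, sketched here, is to disable interpolation outright:

```python
import configparser

ini = "[args]\ncpu_list = default\npattern = 100%\n"

strict = configparser.ConfigParser()   # BasicInterpolation by default
try:
    strict.read_string(ini)
    print(strict["args"]["pattern"])   # '%' not followed by '%'/'(' -> error
except configparser.InterpolationSyntaxError as err:
    print("interpolation failed:", err)

# interpolation=None makes the parser treat '%' as a literal character.
raw = configparser.ConfigParser(interpolation=None)
raw.read_string(ini)
print(raw["args"]["pattern"])  # -> 100%
```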
View File
@ -1,25 +0,0 @@
From 370b22b032dce9290eebca1cf8d48bd155164b6a Mon Sep 17 00:00:00 2001
From: shixuantong <shixuantong1@huawei.com>
Date: Wed, 24 Jul 2024 17:53:58 +0800
Subject: [PATCH] fix error handling
---
src/python/syssentry/cpu_sentry.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py
index 3c4d58d..d0bafa8 100644
--- a/src/python/syssentry/cpu_sentry.py
+++ b/src/python/syssentry/cpu_sentry.py
@@ -87,7 +87,7 @@ class CpuSentry:
}
def handle_cpu_output(self, stdout: str):
- if "<ERROR>" in stdout:
+ if "ERROR" in stdout:
self.send_result["result"] = ResultLevel.FAIL
self.send_result["details"]["code"] = 1004
self.send_result["details"]["msg"] = stdout.split("\n")[0]
--
2.27.0
View File
@ -1,41 +0,0 @@
From 815537382fc0d5164fe57b0d984ca4a1ed8254ea Mon Sep 17 00:00:00 2001
From: jinsaihang <jinsaihang@h-partners.com>
Date: Thu, 31 Oct 2024 16:00:50 +0800
Subject: [PATCH] excessive CPU usage
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
---
sysSentry-1.0.2/src/python/xalarm/xalarm_transfer.py | 3 ---
1 file changed, 3 deletions(-)
diff --git a/src/python/xalarm/xalarm_transfer.py b/src/python/xalarm/xalarm_transfer.py
index b072007..4bebe5d 100644
--- a/src/python/xalarm/xalarm_transfer.py
+++ b/src/python/xalarm/xalarm_transfer.py
@@ -62,7 +62,6 @@ def cleanup_closed_connections(server_sock, epoll, fd_to_socket):
to_remove.append(fileno)
for fileno in to_remove:
- epoll.unregister(fileno)
fd_to_socket[fileno].close()
del fd_to_socket[fileno]
logging.info(f"cleaned up connection {fileno} for client lost connection.")
@@ -97,7 +96,6 @@ def wait_for_connection(server_sock, epoll, fd_to_socket, thread_should_stop):
logging.info(f"connection reach max num of {MAX_CONNECTION_NUM}, closed current connection!")
connection.close()
continue
- epoll.register(connection.fileno(), select.EPOLLOUT)
fd_to_socket[connection.fileno()] = connection
except socket.error as e:
logging.debug(f"socket error, reason is {e}")
@@ -122,7 +120,6 @@ def transmit_alarm(server_sock, epoll, fd_to_socket, bin_data):
except (BrokenPipeError, ConnectionResetError):
to_remove.append(fileno)
for fileno in to_remove:
- epoll.unregister(fileno)
fd_to_socket[fileno].close()
del fd_to_socket[fileno]
logging.info(f"cleaned up connection {fileno} for client lost connection.")
--
2.27.0
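Why removing epoll.register(..., EPOLLOUT) cures the CPU burn: a connected socket's send buffer is almost always writable, so a level-triggered EPOLLOUT registration makes every poll return immediately and the event loop spins. A minimal Linux-only sketch of the difference:

```python
import select
import socket

a, b = socket.socketpair()
epoll = select.epoll()

# Level-triggered EPOLLOUT: the send buffer has room, so poll() reports the
# fd immediately, every time -> a loop built on this spins the CPU.
epoll.register(a.fileno(), select.EPOLLOUT)
print(epoll.poll(timeout=0))   # -> [(fd, EPOLLOUT)] with no work to do

# EPOLLIN only fires when there is actually something to read.
epoll.modify(a.fileno(), select.EPOLLIN)
print(epoll.poll(timeout=0))   # -> [] until the peer sends data
b.send(b"alarm")
print(epoll.poll(timeout=0))   # -> [(fd, EPOLLIN)]

epoll.close()
a.close()
b.close()
```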
View File
@ -1,70 +0,0 @@
From a06ad0c944b093a71f49cc9fccd5097c1493ca5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
Date: Mon, 21 Oct 2024 17:31:32 +0800
Subject: [PATCH] fix frequency param check bug
---
.../sentryPlugins/ai_block_io/config_parser.py | 13 +++++++++++--
.../sentryPlugins/ai_block_io/data_access.py | 14 ++++++++++++++
2 files changed, 25 insertions(+), 2 deletions(-)
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
index 447eccd..274a31e 100644
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
@@ -16,6 +16,7 @@ import logging
from .alarm_report import Report
from .threshold import ThresholdType
from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_log_level
+from .data_access import check_detect_frequency_is_valid
LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
@@ -165,9 +166,17 @@ class ConfigParser:
"slow_io_detect_frequency",
int,
self.DEFAULT_CONF["common"]["slow_io_detect_frequency"],
- gt=0,
- le=300,
+ gt=0
)
+ frequency = self._conf["common"]["slow_io_detect_frequency"]
+ ret = check_detect_frequency_is_valid(frequency)
+ if ret is None:
+ log = f"slow io detect frequency: {frequency} is valid, "\
+ f"Check whether the value range is too large or is not an "\
+ f"integer multiple of period_time.. exiting..."
+ Report.report_pass(log)
+ logging.critical(log)
+ exit(1)
def _read_disks_to_detect(self, items_common: dict):
disks_to_detection = items_common.get("disk")
diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py
index 1bc5ed8..e4869d5 100644
--- a/src/python/sentryPlugins/ai_block_io/data_access.py
+++ b/src/python/sentryPlugins/ai_block_io/data_access.py
@@ -53,6 +53,20 @@ def check_collect_valid(period):
return None
+def check_detect_frequency_is_valid(period):
+ data_raw = is_iocollect_valid(period)
+ if data_raw["ret"] == 0:
+ try:
+ data = json.loads(data_raw["message"])
+ except Exception as e:
+ return None
+ if not data:
+ return None
+ return [k for k in data.keys()]
+ else:
+ return None
+
+
def _get_raw_data(period, disk_list):
return get_io_data(
period,
--
2.23.0
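The new check asks the collector whether the requested frequency is serviceable and fails closed on any bad answer. A sketch of that shape with the collector call stubbed out; the stub's reply format {"ret": ..., "message": ...} mirrors what the hunk expects but is otherwise an assumption:

```python
import json

def is_iocollect_valid_stub(period):
    # Stand-in for the real collector query; assumed reply format only.
    if period <= 300:
        return {"ret": 0, "message": '{"sda": ["bio"]}'}
    return {"ret": 1, "message": ""}

def check_detect_frequency_is_valid(period):
    data_raw = is_iocollect_valid_stub(period)
    if data_raw["ret"] != 0:
        return None                     # collector rejected the period
    try:
        data = json.loads(data_raw["message"])
    except ValueError:
        return None                     # unparseable reply -> fail closed
    if not data:
        return None                     # empty reply -> nothing to detect
    return list(data.keys())

print(check_detect_frequency_is_valid(1))    # -> ['sda']
print(check_detect_frequency_is_valid(999))  # -> None
```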
View File
@ -1,36 +0,0 @@
From 8f28a40ffd7dc7aa969a7bfc0a170ed0c8f03bce Mon Sep 17 00:00:00 2001
From: jinsaihang <jinsaihang@h-partners.com>
Date: Tue, 22 Oct 2024 20:28:59 +0800
Subject: [PATCH] fix get_alarm error
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
---
sysSentry-1.0.2/src/python/syssentry/alarm.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
index c3f2ee1..2575307 100644
--- a/src/python/syssentry/alarm.py
+++ b/src/python/syssentry/alarm.py
@@ -139,8 +139,6 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di
return []
alarm_id = task_alarm_id_dict[task_name]
clear_time = alarm_id_clear_time_dict[alarm_id]
- if clear_time < int(time_range):
- return []
if alarm_id not in alarm_list_dict:
logging.debug("alarm_id does not exist")
return []
@@ -154,6 +152,9 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di
if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > time_range:
stop_index = i
break
+ if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > clear_time:
+ stop_index = i
+ break
if stop_index >= 0:
alarm_list = alarm_list[:stop_index]
logging.debug(f"get_alarm_result: final alarm_list of {alarm_id} has {len(alarm_list)} elements")
--
2.27.0
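The fix replaces the up-front clear_time rejection with a per-entry cutoff: the (newest-first) alarm list is truncated at the first entry older than either the query window or the retention time. A sketch with plain timestamps:

```python
import time

def filter_alarms(alarms, time_range, clear_time):
    """alarms: newest-first list of unix timestamps (seconds)."""
    now = time.time()
    stop_index = -1
    for i, ts in enumerate(alarms):
        # Cut at the first entry older than the query window OR the
        # alarm's configured retention (clear_time).
        if now - ts > time_range or now - ts > clear_time:
            stop_index = i
            break
    return alarms[:stop_index] if stop_index >= 0 else alarms

now = time.time()
alarms = [now - 10, now - 50, now - 500]
print(len(filter_alarms(alarms, time_range=100, clear_time=300)))  # -> 2
```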
View File
@ -1,508 +0,0 @@
From 85d6dae9d7c6148f2699ef7da7d2d784043a2ee1 Mon Sep 17 00:00:00 2001
From: luckky <guodashun1@huawei.com>
Date: Wed, 30 Oct 2024 10:41:11 +0800
Subject: [PATCH] fix hbm online repair notice and efi create
---
src/c/hbm_online_repair/hbm_online_repair.c | 5 +-
.../non-standard-hbm-repair.c | 194 +++++++++---------
.../non-standard-hbm-repair.h | 2 +-
src/c/hbm_online_repair/ras-events.c | 1 -
.../ras-non-standard-handler.c | 33 +--
.../ras-non-standard-handler.h | 1 +
6 files changed, 116 insertions(+), 120 deletions(-)
diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c
index 3ace206..b3b2742 100644
--- a/src/c/hbm_online_repair/hbm_online_repair.c
+++ b/src/c/hbm_online_repair/hbm_online_repair.c
@@ -127,10 +127,7 @@ int main(int argc, char *argv[])
return -1;
}
- ret = init_all_flash();
- if (ret < 0) {
- log(LOG_ERROR, "flash writer init failed\n");
- }
+ get_flash_total_size();
handle_ras_events(ras);
diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c
index b175e14..f26d8ae 100644
--- a/src/c/hbm_online_repair/non-standard-hbm-repair.c
+++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c
@@ -15,7 +15,7 @@
#include "non-standard-hbm-repair.h"
extern int page_isolation_threshold;
-size_t total_size = 0;
+size_t flash_total_size = 0;
struct hisi_common_error_section {
uint32_t val_bits;
uint8_t version;
@@ -122,28 +122,58 @@ static void parse_fault_addr_info(struct fault_addr_info* info_struct, unsigned
info_struct->crc8 = (uint32_t)fault_addr;
}
-static bool variable_existed(char *name, char *guid)
+static bool is_variable_existing(char *name, char *guid)
{
+ char filename[PATH_MAX];
+ snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid);
+
+ return access(filename, F_OK | R_OK) == 0;
+}
+
+static size_t get_var_size(char *name, char *guid) {
char filename[PATH_MAX];
int fd;
+ struct stat stat;
snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid);
// open var file
fd = open(filename, O_RDONLY);
if (fd < 0) {
- log(LOG_WARNING, "open file %s failed\n", filename);
- return false;
+ log(LOG_WARNING, "open %s failed\n", filename);
+ goto err;
+ }
+ // read stat
+ if (fstat(fd, &stat) != 0) {
+ log(LOG_WARNING, "fstat %s failed\n", filename);
+ goto err;
}
close(fd);
- return true;
+ return stat.st_size;
+err:
+ if (fd >= 0)
+ close(fd);
+ return (size_t)-1;
}
-static uint32_t read_variable_attribute(char *name, char *guid) {
+void get_flash_total_size() {
+ for (int i = 0; i < FLASH_ENTRY_NUM; i++) {
+ if (is_variable_existing(flash_names[i], flash_guids[i])) {
+ flash_total_size += get_var_size(flash_names[i], flash_guids[i]);
+ }
+ }
+ // check total entry size
+ log(LOG_DEBUG, "current fault info total size: %luKB, flash max threshold: %uKB\n",
+ flash_total_size / KB_SIZE, MAX_VAR_SIZE / KB_SIZE);
+ if (flash_total_size > MAX_VAR_SIZE) {
+ log(LOG_WARNING, "fault info storage %zu reach threshold, cannot save new record\n", flash_total_size);
+ }
+}
+
+static int read_variable_attribute(char *name, char *guid, uint32_t *attribute) {
char filename[PATH_MAX];
int fd;
size_t readsize;
- uint32_t attribute = (uint32_t)-1;
snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid);
@@ -151,17 +181,18 @@ static uint32_t read_variable_attribute(char *name, char *guid) {
fd = open(filename, O_RDONLY);
if (fd < 0) {
log(LOG_ERROR, "open %s failed\n", filename);
- return attribute;
+ return -1;
}
// read attributes from first 4 bytes
- readsize = read(fd, &attribute, sizeof(uint32_t));
+ readsize = read(fd, attribute, sizeof(uint32_t));
if (readsize != sizeof(uint32_t)) {
log(LOG_ERROR, "read attribute of %s failed\n", filename);
+ return -1;
}
close(fd);
- return attribute;
+ return 0;
}
static int efivarfs_set_mutable(char *name, char *guid, bool mutable)
@@ -205,8 +236,8 @@ err:
return -1;
}
-static int write_variable(char *name, char *guid, void *value, unsigned long size, uint32_t attribute) {
- int fd, mode;
+static int write_variable(char *name, char *guid, void *value, unsigned long size, uint32_t attribute, bool is_existing) {
+ int fd = -1, mode;
size_t writesize;
void *buffer;
unsigned long total;
@@ -225,16 +256,13 @@ static int write_variable(char *name, char *guid, void *value, unsigned long siz
memcpy(buffer + sizeof(uint32_t), value, size);
// change attr
- if (efivarfs_set_mutable(name, guid, 1) != 0) {
+ if (is_existing && efivarfs_set_mutable(name, guid, 1) != 0) {
log(LOG_ERROR, "set mutable for %s failed\n", filename);
goto err;
}
mode = O_WRONLY;
- if (attribute & EFI_VARIABLE_APPEND_WRITE)
- mode |= O_APPEND;
- else
- mode |= O_CREAT;
+ mode |= is_existing ? O_APPEND : O_CREAT;
// open var file
fd = open(filename, mode, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
@@ -252,7 +280,7 @@ static int write_variable(char *name, char *guid, void *value, unsigned long siz
close(fd);
free(buffer);
- if (efivarfs_set_mutable(name, guid, 0) != 0) {
+ if (is_existing && efivarfs_set_mutable(name, guid, 0) != 0) {
log(LOG_ERROR, "set immutable for %s failed\n", filename);
}
return 0;
@@ -261,86 +289,21 @@ err:
close(fd);
if (buffer)
free(buffer);
- if (efivarfs_set_mutable(name, guid, 0) != 0) {
+ if (is_existing && efivarfs_set_mutable(name, guid, 0) != 0) {
log(LOG_ERROR, "set immutable for %s failed\n", filename);
}
return -1;
}
-static int append_variable(char *name, char *guid, void *data, unsigned long size) {
- // prepare append attribute
- uint32_t attribute = read_variable_attribute(name, guid);
- if (attribute == (uint32_t)-1) {
- log(LOG_ERROR, "read %s-%s attribute failed\n", name, guid);
- return -1;
- }
- attribute |= EFI_VARIABLE_APPEND_WRITE;
-
- return write_variable(name, guid, data, size, attribute);
-}
-
-static size_t get_var_size(char *name, char *guid) {
- char filename[PATH_MAX];
- int fd;
- struct stat stat;
-
- snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid);
-
- // open var file
- fd = open(filename, O_RDONLY);
- if (fd < 0) {
- log(LOG_WARNING, "open %s failed\n", filename);
- goto err;
- }
- // read stat
- if (fstat(fd, &stat) != 0) {
- log(LOG_WARNING, "fstat %s failed\n", filename);
- goto err;
- }
- close(fd);
- return stat.st_size;
-err:
- if (fd >= 0)
- close(fd);
- return (size_t)-1;
-}
-
-int init_all_flash() {
- for (int i = 0; i < FLASH_ENTRY_NUM; i++) {
- // check existed entry
- if (variable_existed(flash_names[i], flash_guids[i])) {
- total_size += get_var_size(flash_names[i], flash_guids[i]);
- continue;
- }
- // create new entry
- uint32_t attribute = EFI_VARIABLE_NON_VOLATILE |
- EFI_VARIABLE_BOOTSERVICE_ACCESS |
- EFI_VARIABLE_RUNTIME_ACCESS;
- char *data = "";
- unsigned long size = 1;
- int ret = write_variable(flash_names[i], flash_guids[i], data, size, attribute);
- if (ret) {
- log(LOG_ERROR, "init %s-%s failed, fault info storage funtion not enabled\n", flash_names[i], flash_guids[i]);
- return -1;
- }
- total_size += sizeof(uint32_t) + 1;
- }
- // check total entry size
- log(LOG_DEBUG, "current fault info total size: %luKB, flash max threshold: %uKB\n",
- total_size / KB_SIZE, MAX_VAR_SIZE / KB_SIZE);
- if (total_size > MAX_VAR_SIZE) {
- log(LOG_ERROR, "fault info storage reach threshold, cannot save new record\n");
- }
- return 0;
-}
-
static int write_fault_info_to_flash(const struct hisi_common_error_section *err) {
int ret, guid_index;
uint32_t reg_size;
uint64_t fault_addr;
+ bool is_existing;
+ uint32_t attribute = -1;
// check flash usage threshold
- if (total_size + sizeof(uint64_t) > MAX_VAR_SIZE) {
+ if (flash_total_size + sizeof(uint64_t) > MAX_VAR_SIZE) {
log(LOG_WARNING, "fault info storage reach threshold, cannot save new record into flash\n");
return -1;
}
@@ -359,14 +322,29 @@ static int write_fault_info_to_flash(const struct hisi_common_error_section *err
log(LOG_ERROR, "invalid fault info\n");
return -1;
}
+
+ // judge if the efivar is existing to set the attribute
+ is_existing = is_variable_existing(flash_names[guid_index], flash_guids[guid_index]);
+ attribute = EFI_VARIABLE_NON_VOLATILE |
+ EFI_VARIABLE_BOOTSERVICE_ACCESS |
+ EFI_VARIABLE_RUNTIME_ACCESS;
+ if (is_existing) {
+ ret = read_variable_attribute(flash_names[guid_index], flash_guids[guid_index], &attribute);
+ if (ret < 0) {
+ log(LOG_ERROR, "read variable %s-%s attribute failed, stop writing\n", flash_names[guid_index], flash_guids[guid_index]);
+ return -1;
+ }
+ attribute |= EFI_VARIABLE_APPEND_WRITE;
+ }
+
// record physical addr in flash
- ret = append_variable(flash_names[guid_index], flash_guids[guid_index], &fault_addr, sizeof(uint64_t));
+ ret = write_variable(flash_names[guid_index], flash_guids[guid_index], &fault_addr, sizeof(uint64_t), attribute, is_existing);
if (ret < 0) {
- log(LOG_ERROR, "append to %s-%s failed\n", flash_names[guid_index], flash_guids[guid_index]);
+ log(LOG_ERROR, "write to %s-%s failed\n", flash_names[guid_index], flash_guids[guid_index]);
return -1;
}
- total_size += sizeof(uint64_t);
- log(LOG_INFO, "write hbm fault info to flash success\n");
+ flash_total_size += sizeof(uint64_t);
+ log(LOG_INFO, "write hbm fault info to flash %s-%s success\n", flash_names[guid_index], flash_guids[guid_index]);
return 0;
}
@@ -421,7 +399,7 @@ static int get_hardware_corrupted_size()
return hardware_corrupted_size;
}
-static uint8_t get_repair_result_code(int ret)
+static uint8_t get_repair_failed_result_code(int ret)
{
if (ret == -ENOSPC) {
return REPAIR_FAILED_NO_RESOURCE;
@@ -582,11 +560,11 @@ static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err)
static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr)
{
int ret;
- if (repair_ret < 0) {
+ if (repair_ret <= 0) {
log(LOG_WARNING, "HBM %s: Keep page (0x%llx) offline\n", is_acls ? "ACLS" : "SPPR", paddr);
/* not much we can do about errors here */
(void)write_file("/sys/kernel/page_eject", "remove_page", paddr);
- return get_repair_result_code(repair_ret);
+ return get_repair_failed_result_code(repair_ret);
}
ret = write_file("/sys/kernel/page_eject", "online_page", paddr);
@@ -615,9 +593,13 @@ static uint8_t hbmc_hbm_repair(const struct hisi_common_error_section *err, char
err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_PSUE_ACLS;
ret = write_file(path, is_acls ? "acls_query" : "sppr_query", paddr);
- if (ret < 0) {
- notice_BMC(err, get_repair_result_code(ret));
- log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR");
+
+ /* Only positive num means the error is supported to repair */
+ if (ret <= 0) {
+ if (ret != -ENXIO) {
+ notice_BMC(err, get_repair_failed_result_code(ret));
+ log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR");
+ }
return ret;
}
@@ -642,8 +624,9 @@ static uint8_t hbmc_hbm_repair(const struct hisi_common_error_section *err, char
all_online_success = false;
}
}
- if (ret < 0) {
- notice_BMC(err, get_repair_result_code(ret));
+ /* The ret is from the acls/sppr repair, and only positive num means the error is repaired successfully */
+ if (ret <= 0) {
+ notice_BMC(err, get_repair_failed_result_code(ret));
return ret;
} else if (all_online_success) {
notice_BMC(err, ISOLATE_REPAIR_ONLINE_SUCCESS);
@@ -698,7 +681,7 @@ static void hbm_repair_handler(const struct hisi_common_error_section *err)
struct dirent *dent;
DIR *dir;
int ret;
- bool find_device = false, find_hbm_mem = false;
+ bool find_device = false, find_hbm_mem = false, addr_in_hbm_device = false;
ret = hbmc_hbm_page_isolate(err);
if (ret < 0) {
@@ -723,10 +706,13 @@ static void hbm_repair_handler(const struct hisi_common_error_section *err)
if (hbmc_get_memory_type(path) == HBM_HBM_MEMORY) {
find_hbm_mem = true;
ret = hbmc_hbm_repair(err, path);
- if (ret != -ENXIO)
+ if (ret != -ENXIO) {
+ addr_in_hbm_device = true;
break;
+ }
}
}
+
if (!find_device) {
log(LOG_ERROR, "Repair driver is not loaded, skip error, error_type is %u\n",
err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK);
@@ -735,6 +721,10 @@ static void hbm_repair_handler(const struct hisi_common_error_section *err)
log(LOG_ERROR, "No HBM device memory type found, skip error, error_type is %u\n",
err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK);
notice_BMC(err, REPAIR_FAILED_OTHER_REASON);
+ } else if (!addr_in_hbm_device) {
+ log(LOG_ERROR, "Err addr is not in device, skip error, error_type is %u\n",
+ err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK);
+ notice_BMC(err, REPAIR_FAILED_INVALID_PARAM);
}
closedir(dir);
@@ -769,7 +759,7 @@ static bool hbm_repair_validate(const struct hisi_common_error_section *err)
(err->reg_array_size == HBM_CACHE_ARRAY_SIZE);
if (!(is_acls_valid || is_sppr_valid || is_cache_mode)) {
- log(LOG_DEBUG, "err type (%u) is unknown or address array length (%u) is invalid\n",
+ log(LOG_WARNING, "err type (%u) is unknown or address array length (%u) is invalid\n",
hbm_repair_reg_type, err->reg_array_size);
return false;
}
diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.h b/src/c/hbm_online_repair/non-standard-hbm-repair.h
index 7e8e448..ecb04fe 100644
--- a/src/c/hbm_online_repair/non-standard-hbm-repair.h
+++ b/src/c/hbm_online_repair/non-standard-hbm-repair.h
@@ -84,6 +84,6 @@
#define FLASH_ENTRY_NUM 8
#define KB_SIZE 1024
-extern int init_all_flash();
+extern void get_flash_total_size();
#endif
diff --git a/src/c/hbm_online_repair/ras-events.c b/src/c/hbm_online_repair/ras-events.c
index 0b12329..4d281ad 100644
--- a/src/c/hbm_online_repair/ras-events.c
+++ b/src/c/hbm_online_repair/ras-events.c
@@ -348,7 +348,6 @@ static int read_ras_event_all_cpus(struct pcpu_data *pdata,
"Error on CPU %i\n", i);
warnonce[i]++;
}
- continue;
}
if (!(fds[i].revents & POLLIN)) {
count_nready++;
diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.c b/src/c/hbm_online_repair/ras-non-standard-handler.c
index 1d1fd04..48ffa70 100644
--- a/src/c/hbm_online_repair/ras-non-standard-handler.c
+++ b/src/c/hbm_online_repair/ras-non-standard-handler.c
@@ -7,17 +7,21 @@
#include "ras-non-standard-handler.h"
#include "logger.h"
-static char *uuid_le(const char *uu)
+static int uuid_le(const char *uu, char* uuid)
{
- static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")];
if (!uu) {
log(LOG_ERROR, "uuid_le failed: uu is empty");
- return uuid;
+ return -1;
}
size_t uu_len = strlen(uu);
- if (uu_len < SECTION_TYPE_UUID_LEN) {
- log(LOG_ERROR, "uuid_le failed: uu is too short");
- return uuid;
+ if (uu_len != SECTION_TYPE_UUID_LEN) {
+ log(LOG_ERROR, "uuid_le failed: uu len is incorrect");
+ return -1;
+ }
+ size_t uuid_len = strlen(uuid);
+ if (uuid_len != strlen(UUID_STR_TYPE)) {
+ log(LOG_ERROR, "uuid_le failed: uuid len is incorrect");
+ return -1;
}
char *p = uuid;
@@ -38,7 +42,7 @@ static char *uuid_le(const char *uu)
*p = 0;
- return uuid;
+ return 0;
}
int ras_non_standard_event_handler(struct trace_seq *s,
@@ -52,15 +56,20 @@ int ras_non_standard_event_handler(struct trace_seq *s,
ev.sec_type = tep_get_field_raw(s, event, "sec_type",
record, &len, 1);
if(!ev.sec_type) {
- log(LOG_WARNING, "get event section type failed");
+ log(LOG_WARNING, "get event section type failed\n");
return -1;
}
trace_seq_printf(s, "\n");
- trace_seq_printf(s, "sec_type: %s\n", uuid_le(ev.sec_type));
+ char uuid[sizeof(UUID_STR_TYPE)] = UUID_STR_TYPE;
+ if (uuid_le(ev.sec_type, uuid) < 0) {
+ log(LOG_WARNING, "get uuid failed\n");
+ return -1;
+ }
+ trace_seq_printf(s, "sec_type: %s\n", uuid);
if (tep_get_field_val(s, event, "len", record, &val, 1) < 0) {
- log(LOG_WARNING, "tep get field val failed");
+ log(LOG_WARNING, "tep get field val failed\n");
return -1;
}
@@ -69,11 +78,11 @@ int ras_non_standard_event_handler(struct trace_seq *s,
ev.error = tep_get_field_raw(s, event, "buf", record, &len, 1);
if(!ev.error || ev.length != len) {
- log(LOG_WARNING, "get event error failed");
+ log(LOG_WARNING, "get event error failed\n");
return -1;
}
- if (strcmp(uuid_le(ev.sec_type), HISI_COMMON_SECTION_TYPE_UUID) == 0) {
+ if (strcmp(uuid, HISI_COMMON_SECTION_TYPE_UUID) == 0) {
decode_hisi_common_section(&ev);
}
diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.h b/src/c/hbm_online_repair/ras-non-standard-handler.h
index 0272dc1..15a37ee 100644
--- a/src/c/hbm_online_repair/ras-non-standard-handler.h
+++ b/src/c/hbm_online_repair/ras-non-standard-handler.h
@@ -7,6 +7,7 @@
#define BIT(nr) (1UL << (nr))
#define SECTION_TYPE_UUID_LEN 16
+#define UUID_STR_TYPE "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
#define HISI_COMMON_SECTION_TYPE_UUID "c8b328a8-9917-4af6-9a13-2e08ab2e7586"
struct ras_non_standard_event {
--
2.43.0
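One detail worth spelling out from this C patch: each efivarfs file starts with a 4-byte little-endian attribute mask, and appending a record means reading that mask back and OR-ing in EFI_VARIABLE_APPEND_WRITE before writing. A hedged Python sketch of the layout; the variable path is a placeholder, and the bit values follow the UEFI spec:

```python
import struct

# Attribute bits from the UEFI spec.
EFI_VARIABLE_NON_VOLATILE = 0x01
EFI_VARIABLE_BOOTSERVICE_ACCESS = 0x02
EFI_VARIABLE_RUNTIME_ACCESS = 0x04
EFI_VARIABLE_APPEND_WRITE = 0x40

def read_attribute(path):
    """First 4 bytes of an efivarfs file are a little-endian attribute mask."""
    with open(path, "rb") as f:
        (attr,) = struct.unpack("<I", f.read(4))
    return attr

# Hypothetical variable path; real files live under /sys/firmware/efi/efivars.
# attr = read_attribute("/sys/firmware/efi/efivars/SomeVar-<guid>")
attr = (EFI_VARIABLE_NON_VOLATILE
        | EFI_VARIABLE_BOOTSERVICE_ACCESS
        | EFI_VARIABLE_RUNTIME_ACCESS)
# To append a record instead of replacing the payload, OR in APPEND_WRITE
# before writing attr + data back, as the patched write_variable() does.
attr |= EFI_VARIABLE_APPEND_WRITE
print(hex(attr))  # -> 0x47
```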
View File
@ -1,25 +0,0 @@
From 6307a1ff4068a541658e3312ca938c6fdd9a5c1a Mon Sep 17 00:00:00 2001
From: zhuofeng <zhuofeng2@huawei.com>
Date: Sat, 12 Oct 2024 14:51:51 +0800
Subject: [PATCH] fix io_dump for collect module
---
src/python/sentryCollector/collect_io.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py
index d734734..11c9d9a 100644
--- a/src/python/sentryCollector/collect_io.py
+++ b/src/python/sentryCollector/collect_io.py
@@ -154,7 +154,7 @@ class CollectIo():
try:
with open(io_dump_file, 'r') as file:
for line in file:
- count += line.count('.op=' + Io_Category[category])
+ count += line.count('.op=' + Io_Category[category].upper())
if count > 0:
logging.info(f"io_dump info : {disk_name}, {stage}, {category}, {count}")
except FileNotFoundError:
--
2.33.0
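The one-line fix uppercases the category name before matching because the kernel's io_dump output records operations in capitals (.op=READ) while the plugin keeps its category table lower-case. A tiny sketch, assuming Io_Category is that lower-case table:

```python
Io_Category = ["read", "write", "flush", "discard"]  # assumed table layout
line = "req ... .op=READ .start=123"                 # made-up io_dump line

category = 0  # index of "read"
# The match is case-sensitive, hence .upper() to align with kernel output.
count = line.count(".op=" + Io_Category[category].upper())
print(count)  # -> 1
```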
View File
@ -1,53 +0,0 @@
From 878bcf61467bfd9d015a8089a8367f4333ba76f6 Mon Sep 17 00:00:00 2001
From: PshySimon <caixiaomeng2@huawei.com>
Date: Wed, 9 Oct 2024 10:20:34 +0800
Subject: [PATCH] fix python 3.7 not support list[bool] type
---
src/python/xalarm/register_xalarm.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py
index e58343d..6756b1b 100644
--- a/src/python/xalarm/register_xalarm.py
+++ b/src/python/xalarm/register_xalarm.py
@@ -26,7 +26,7 @@ ALARM_REGISTER_INFO = None
class AlarmRegister:
- def __init__(self, id_filter: list[bool], callback: callable):
+ def __init__(self, id_filter: list, callback: callable):
self.id_filter = id_filter
self.callback = callback
self.socket = self.create_unix_socket()
@@ -49,7 +49,7 @@ class AlarmRegister:
return False
return True
- def set_id_filter(self, id_filter: list[bool]) -> bool:
+ def set_id_filter(self, id_filter: list) -> bool:
if (len(id_filter) > MAX_NUM_OF_ALARM_ID):
sys.stderr.write("set_id_filter: invalid param id_filter\n")
return False
@@ -118,7 +118,7 @@ class AlarmRegister:
self.socket.close()
-def xalarm_register(callback: callable, id_filter: list[bool]) -> int:
+def xalarm_register(callback: callable, id_filter: list) -> int:
global ALARM_REGISTER_INFO
if ALARM_REGISTER_INFO is not None:
@@ -148,7 +148,7 @@ def xalarm_unregister(clientId: int) -> None:
ALARM_REGISTER_INFO = None
-def xalarm_upgrade(clientId: int, id_filter: list[bool]) -> None:
+def xalarm_upgrade(clientId: int, id_filter: list) -> None:
global ALARM_REGISTER_INFO
if clientId < 0:
sys.stderr.write("xalarm_unregister: invalid client\n")
--
2.27.0
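Background: subscripting the built-in list type in annotations (list[bool]) is a Python 3.9 feature (PEP 585); on 3.7 the def statement itself raises TypeError because annotations are evaluated at definition time. The patch drops the subscript; typing.List[bool], sketched below, is the spelling that keeps the type information on 3.7:

```python
from typing import List

# typing.List is subscriptable on Python 3.7, where `list[bool]` in an
# annotation raises TypeError as soon as the def statement executes.
def xalarm_register_demo(callback: callable, id_filter: List[bool]) -> int:
    return sum(1 for enabled in id_filter if enabled)

print(xalarm_register_demo(print, [True, False, True]))  # -> 2
```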
View File
@ -1,36 +0,0 @@
From e8e4fa5fd9e78508567782e17b7b1cb6ace3ef0d Mon Sep 17 00:00:00 2001
From: shixuantong <shixuantong1@huawei.com>
Date: Fri, 26 Jul 2024 15:59:42 +0800
Subject: [PATCH] fix result when process output is None
---
src/python/syssentry/cpu_sentry.py | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py
index d0bafa8..9287e2f 100644
--- a/src/python/syssentry/cpu_sentry.py
+++ b/src/python/syssentry/cpu_sentry.py
@@ -87,11 +87,19 @@ class CpuSentry:
}
def handle_cpu_output(self, stdout: str):
+ if not stdout:
+ logging.error("%s process output is None, it may be killed!", LOW_LEVEL_INSPECT_CMD)
+ self.send_result["result"] = ResultLevel.FAIL
+ self.send_result["details"]["code"] = 1005
+ self.send_result["details"]["msg"] = "cpu_sentry task is killed!"
+ return
+
if "ERROR" in stdout:
self.send_result["result"] = ResultLevel.FAIL
self.send_result["details"]["code"] = 1004
self.send_result["details"]["msg"] = stdout.split("\n")[0]
return
+
out_split = stdout.split("\n")
isolated_cores_number = 0
found_fault_cores_list = []
--
2.27.0
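This guard makes handle_cpu_output defensive about a killed child process: empty output gets its own failure code before any stdout.split() can blow up. A minimal sketch of the guard ordering; result codes 1004/1005 are copied from the hunks above, and the surrounding dict shape is simplified:

```python
def handle_cpu_output(stdout):
    result = {"result": "PASS", "details": {}}
    if not stdout:                        # child was killed: no output at all
        result["result"] = "FAIL"
        result["details"] = {"code": 1005, "msg": "cpu_sentry task is killed!"}
        return result
    if "ERROR" in stdout:                 # tool ran but reported a failure
        result["result"] = "FAIL"
        result["details"] = {"code": 1004, "msg": stdout.split("\n")[0]}
        return result
    return result                         # normal parsing would continue here

print(handle_cpu_output(None)["details"]["code"])               # -> 1005
print(handle_cpu_output("ERROR: bad core")["details"]["code"])  # -> 1004
```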
View File
@ -1,226 +0,0 @@
From dea58a559f3dbad3dbce3b681639ee89c20b1cee Mon Sep 17 00:00:00 2001
From: zhuofeng <zhuofeng2@huawei.com>
Date: Fri, 20 Sep 2024 14:35:39 +0800
Subject: [PATCH] fix some about collect module and avg block io
---
config/tasks/avg_block_io.mod | 4 ++--
src/python/sentryCollector/collect_io.py | 18 +++++++++++-------
src/python/sentryCollector/collect_plugin.py | 17 ++++++++---------
src/python/sentryCollector/collect_server.py | 6 +++---
src/python/sentryCollector/collectd.py | 2 --
.../sentryPlugins/avg_block_io/avg_block_io.py | 13 ++++++++++---
6 files changed, 34 insertions(+), 26 deletions(-)
diff --git a/config/tasks/avg_block_io.mod b/config/tasks/avg_block_io.mod
index 814c483..b9b6f34 100644
--- a/config/tasks/avg_block_io.mod
+++ b/config/tasks/avg_block_io.mod
@@ -1,5 +1,5 @@
[common]
enabled=yes
task_start=/usr/bin/python3 /usr/bin/avg_block_io
-task_stop=pkill avg_block_io
-type=oneshot
\ No newline at end of file
+task_stop=pkill -f /usr/bin/avg_block_io
+type=oneshot
diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py
index b826dc4..104b734 100644
--- a/src/python/sentryCollector/collect_io.py
+++ b/src/python/sentryCollector/collect_io.py
@@ -175,8 +175,7 @@ class CollectIo():
threading.Timer(self.period_time, self.task_loop).start()
- def main_loop(self):
- logging.info("collect io thread start")
+ def is_kernel_avaliable(self):
base_path = '/sys/kernel/debug/block'
for disk_name in os.listdir(base_path):
if not self.loop_all and disk_name not in self.disk_list:
@@ -198,8 +197,13 @@ class CollectIo():
self.window_value[disk_name] = {}
IO_GLOBAL_DATA[disk_name] = {}
- if len(self.disk_map_stage) == 0:
- logging.warning("no disks meet the requirements. the thread exits")
+ return len(IO_GLOBAL_DATA) != 0
+
+ def main_loop(self):
+ logging.info("collect io thread start")
+
+ if not self.is_kernel_avaliable() or len(self.disk_map_stage) == 0:
+ logging.warning("no disks meet the requirements. collect io thread exits")
return
for disk_name, stage_list in self.disk_map_stage.items():
@@ -213,7 +217,7 @@ class CollectIo():
start_time = time.time()
if self.stop_event.is_set():
- logging.info("collect io thread exit")
+ logging.debug("collect io thread exit")
return
for disk_name, stage_list in self.disk_map_stage.items():
@@ -227,7 +231,7 @@ class CollectIo():
continue
while sleep_time > 1:
if self.stop_event.is_set():
- logging.info("collect io thread exit")
+ logging.debug("collect io thread exit")
return
time.sleep(1)
sleep_time -= 1
@@ -235,5 +239,5 @@ class CollectIo():
# set stop event, notify thread exit
def stop_thread(self):
- logging.info("collect io thread is preparing to exit")
+ logging.debug("collect io thread is preparing to exit")
self.stop_event.set()
diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py
index 49ce0a8..9132473 100644
--- a/src/python/sentryCollector/collect_plugin.py
+++ b/src/python/sentryCollector/collect_plugin.py
@@ -142,22 +142,21 @@ def validate_parameters(param, len_limit, char_limit):
ret = ResultMessage.RESULT_INVALID_LENGTH
return [False, ret]
- if len(param) > len_limit:
- print(f"{param} length more than {len_limit}")
- ret = ResultMessage.RESULT_EXCEED_LIMIT
- return [False, ret]
-
pattern = r'^[a-zA-Z0-9_-]+$'
for info in param:
- if len(info) > char_limit:
- print(f"{info} length more than {char_limit}")
- ret = ResultMessage.RESULT_EXCEED_LIMIT
- return [False, ret]
if not re.match(pattern, info):
print(f"{info} is invalid char")
ret = ResultMessage.RESULT_INVALID_CHAR
return [False, ret]
+ # length of len_limit is exceeded, keep len_limit
+ if len(param) > len_limit:
+ print(f"{param} length more than {len_limit}, keep the first {len_limit}")
+ param[:] = param[0:len_limit]
+
+ # only keep elements under the char_limit length
+ param[:] = [elem for elem in param if len(elem) <= char_limit]
+
return [True, ret]
def is_iocollect_valid(period, disk_list=None, stage=None):
diff --git a/src/python/sentryCollector/collect_server.py b/src/python/sentryCollector/collect_server.py
index fa49781..bab4e56 100644
--- a/src/python/sentryCollector/collect_server.py
+++ b/src/python/sentryCollector/collect_server.py
@@ -256,7 +256,7 @@ class CollectServer():
def server_loop(self):
"""main loop"""
- logging.info("collect server thread start")
+ logging.info("collect listen thread start")
server_fd = self.server_fd_create()
if not server_fd:
return
@@ -267,7 +267,7 @@ class CollectServer():
logging.debug("start server_loop loop")
while True:
if self.stop_event.is_set():
- logging.info("collect server thread exit")
+ logging.debug("collect listen thread exit")
server_fd = None
return
try:
@@ -281,5 +281,5 @@ class CollectServer():
pass
def stop_thread(self):
- logging.info("collect server thread is preparing to exit")
+ logging.debug("collect listen thread is preparing to exit")
self.stop_event.set()
diff --git a/src/python/sentryCollector/collectd.py b/src/python/sentryCollector/collectd.py
index b77c642..3a836df 100644
--- a/src/python/sentryCollector/collectd.py
+++ b/src/python/sentryCollector/collectd.py
@@ -49,7 +49,6 @@ def sig_handler(signum, _f):
Thread_List[i][0].stop_thread()
remove_sock_file()
- sys.exit(0)
def main():
"""main
@@ -64,7 +63,6 @@ def main():
try:
signal.signal(signal.SIGINT, sig_handler)
signal.signal(signal.SIGTERM, sig_handler)
- signal.signal(signal.SIGHUP, sig_handler)
logging.info("finish main parse_args")
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
index ff2071d..73f0b22 100644
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
@@ -21,7 +21,7 @@ CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini"
def log_invalid_keys(not_in_list, keys_name, config_list, default_list):
"""print invalid log"""
- if config_list and default_list:
+ if config_list and not_in_list:
logging.warning("{} in common.{} are not valid, set {}={}".format(not_in_list, keys_name, keys_name, default_list))
elif config_list == ["default"]:
logging.warning("Default {} use {}".format(keys_name, default_list))
@@ -144,9 +144,11 @@ def init_io_win(io_dic, config, common_param):
if avg_lim_value and avg_time_value and tot_lim_value:
io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value)
+ logging.debug("Successfully create {}-{}-{} latency window".format(disk_name, stage_name, rw))
if iodump_lim_value is not None:
io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value)
+ logging.debug("Successfully create {}-{}-{} iodump window".format(disk_name, stage_name, rw))
return io_data, io_avg_value
@@ -159,10 +161,10 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage):
for disk_stage_list in json_data.values():
all_stage_set.update(disk_stage_list)
- disk_list = [key for key in config_disk if key in all_disk_set]
+ disk_list = [key for key in all_disk_set if key in config_disk]
not_in_disk_list = [key for key in config_disk if key not in all_disk_set]
- stage_list = [key for key in config_stage if key in all_stage_set]
+ stage_list = [key for key in all_stage_set if key in config_stage]
not_in_stage_list = [key for key in config_stage if key not in all_stage_set]
if not config_disk:
@@ -171,6 +173,9 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage):
if not config_stage:
stage_list = [key for key in all_stage_set]
+ disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list
+ stage_list = stage_list[:15] if len(stage_list) > 15 else stage_list
+
if config_disk and not disk_list:
logging.warning("Cannot get valid disk by disk={}, set to default".format(config_disk))
disk_list, stage_list = get_valid_disk_stage_list(io_dic, [], config_stage)
@@ -228,6 +233,8 @@ def main():
signal.signal(signal.SIGINT, sig_handler)
signal.signal(signal.SIGTERM, sig_handler)
+ logging.basicConfig(level=logging.INFO)
+
# 初始化配置读取
config = configparser.ConfigParser(comment_prefixes=('#', ';'))
try:
--
2.33.0
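The two list-comprehension swaps above are easy to misread: iterating `all_disk_set` (what the collector actually reports) instead of `config_disk` (what the user configured) means only entries the collector knows about survive, and the new slices cap the result at 10 disks and 15 stages. A minimal, self-contained sketch of that filtering, reusing names from the diff:

```python
# A sketch of the post-patch filtering order; all_disk_set/config_disk
# mirror names in the diff, and the limit of 10 comes from the hunk above.
def filter_reported(all_disk_set: set, config_disk: list, limit: int = 10):
    disk_list = [key for key in all_disk_set if key in config_disk]
    not_in_disk_list = [key for key in config_disk if key not in all_disk_set]
    if not config_disk:                  # empty config means "use everything"
        disk_list = list(all_disk_set)
    return disk_list[:limit], not_in_disk_list

print(filter_reported({"sda", "sdb"}, ["sda", "sdx"]))
# (['sda'], ['sdx'])  -> 'sdx' is flagged invalid, 'sda' is kept
```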

View File

@ -1,56 +0,0 @@
From 497b3124f017ce4ae99b34261c4fd5dd2a358f5b Mon Sep 17 00:00:00 2001
From: zhuofeng <zhuofeng2@huawei.com>
Date: Sat, 14 Sep 2024 09:28:00 +0800
Subject: [PATCH] fix syssentry failing to start when cpu_sentry is not
installed
---
src/python/syssentry/syssentry.py | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py
index f93956e..776971f 100644
--- a/src/python/syssentry/syssentry.py
+++ b/src/python/syssentry/syssentry.py
@@ -43,7 +43,6 @@ try:
from .cpu_alarm import cpu_alarm_recv
except ImportError:
CPU_EXIST = False
- logging.debug("Cannot find cpu sentry mod")
INSPECTOR = None
@@ -563,20 +562,21 @@ def main():
if not os.path.exists(SENTRY_RUN_DIR):
os.mkdir(SENTRY_RUN_DIR)
os.chmod(SENTRY_RUN_DIR, mode=SENTRY_RUN_DIR_PERM)
- if not chk_and_set_pidfile():
- logging.error("get pid file lock failed, exist")
- sys.exit(17)
logging.basicConfig(filename=SYSSENTRY_LOG_FILE, level=logging.INFO)
os.chmod(SYSSENTRY_LOG_FILE, 0o600)
+ if not chk_and_set_pidfile():
+ logging.error("get pid file lock failed, exist")
+ sys.exit(17)
+
try:
signal.signal(signal.SIGINT, sig_handler)
signal.signal(signal.SIGTERM, sig_handler)
signal.signal(signal.SIGHUP, sig_handler)
signal.signal(signal.SIGCHLD, sigchld_handler)
- logging.debug("finish main parse_args")
+ logging.info("finish main parse_args")
_ = SentryConfig.init_param()
TasksMap.init_task_map()
@@ -587,3 +587,4 @@ def main():
logging.error('%s', traceback.format_exc())
finally:
release_pidfile()
+
--
2.33.0
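The reordering above matters because logging.basicConfig had not run yet when the pidfile lock failed, so the error message was lost. A minimal sketch of the corrected ordering, with a stub standing in for the real lock helper:

```python
# Sketch only: configure logging first, then take the pidfile lock, so the
# failure path is actually logged. chk_and_set_pidfile is a stand-in here.
import logging
import sys

def startup(log_file: str, chk_and_set_pidfile) -> None:
    logging.basicConfig(filename=log_file, level=logging.INFO)
    if not chk_and_set_pidfile():
        logging.error("get pid file lock failed, exit")
        sys.exit(17)

startup("/tmp/sysSentry-demo.log", lambda: True)  # demo with a stub lock
```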

View File

@ -1,91 +0,0 @@
From 874daf9627c74aa31f1063c250b5477b2eb322e8 Mon Sep 17 00:00:00 2001
From: shixuantong <shixuantong1@huawei.com>
Date: Sat, 28 Dec 2024 11:31:23 +0800
Subject: [PATCH] fix test_ai_block_io fail
---
selftest/test/test_ai_block_io.py | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/selftest/test/test_ai_block_io.py b/selftest/test/test_ai_block_io.py
index c36fef5..58ab096 100644
--- a/selftest/test/test_ai_block_io.py
+++ b/selftest/test/test_ai_block_io.py
@@ -12,9 +12,9 @@
import unittest
import numpy as np
-from sentryPlugins.ai_threshold_slow_io_detection.threshold import AbsoluteThreshold, BoxplotThreshold, NSigmaThreshold
-from sentryPlugins.ai_threshold_slow_io_detection.sliding_window import (NotContinuousSlidingWindow,
- ContinuousSlidingWindow, MedianSlidingWindow)
+from sentryPlugins.ai_block_io.threshold import AbsoluteThreshold, BoxplotThreshold, NSigmaThreshold
+from sentryPlugins.ai_block_io.sliding_window import (NotContinuousSlidingWindow,
+ ContinuousSlidingWindow, MedianSlidingWindow)
def _get_boxplot_threshold(data_list: list, parameter):
@@ -98,11 +98,11 @@ class Test(unittest.TestCase):
for data in data_list1:
boxplot_threshold.push_latest_data_to_queue(data)
result = not_continuous.is_slow_io_event(data)
- self.assertFalse(result[0])
+ self.assertFalse(result[0][0])
self.assertEqual(23.75, boxplot_threshold.get_threshold())
boxplot_threshold.push_latest_data_to_queue(24)
result = not_continuous.is_slow_io_event(24)
- self.assertFalse(result[0])
+ self.assertFalse(result[0][0])
boxplot_threshold.push_latest_data_to_queue(25)
result = not_continuous.is_slow_io_event(25)
self.assertTrue(result[0])
@@ -110,7 +110,7 @@ class Test(unittest.TestCase):
for data in data_list2:
boxplot_threshold.push_latest_data_to_queue(data)
result = not_continuous.is_slow_io_event(data)
- self.assertFalse(result[0])
+ self.assertFalse(result[0][0])
self.assertEqual(25.625, boxplot_threshold.get_threshold())
def test_continuous_sliding_window(self):
@@ -121,14 +121,14 @@ class Test(unittest.TestCase):
for data in data_list:
boxplot_threshold.push_latest_data_to_queue(data)
result = continuous.is_slow_io_event(data)
- self.assertFalse(result[0])
+ self.assertFalse(result[0][0])
self.assertEqual(23.75, boxplot_threshold.get_threshold())
# 没有三个异常点
- self.assertFalse(continuous.is_slow_io_event(25)[0])
+ self.assertFalse(continuous.is_slow_io_event(25)[0][0])
# 不连续的三个异常点
- self.assertFalse(continuous.is_slow_io_event(25)[0])
+ self.assertFalse(continuous.is_slow_io_event(25)[0][0])
# 连续的三个异常点
- self.assertTrue(continuous.is_slow_io_event(25)[0])
+ self.assertTrue(continuous.is_slow_io_event(25)[0][0])
def test_median_sliding_window(self):
median = MedianSlidingWindow(5, 3)
@@ -137,7 +137,7 @@ class Test(unittest.TestCase):
absolute_threshold.set_threshold(24.5)
data_list = [24, 24, 24, 25, 25]
for data in data_list:
- self.assertFalse(median.is_slow_io_event(data)[0])
+ self.assertFalse(median.is_slow_io_event(data)[0][0])
self.assertTrue(median.is_slow_io_event(25)[0])
def test_parse_collect_data(self):
@@ -147,8 +147,8 @@ class Test(unittest.TestCase):
"flush": [9.0, 10.0, 11.0, 12.0],
"discard": [13.0, 14.0, 15.0, 16.0],
}
- from io_data import BaseData
- from data_access import _get_io_stage_data
+ from sentryPlugins.ai_block_io.io_data import BaseData
+ from sentryPlugins.ai_block_io.data_access import _get_io_stage_data
io_data = _get_io_stage_data(collect)
self.assertEqual(
--
2.27.0
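The assertion changes above track an interface change: after the rename to ai_block_io, is_slow_io_event returns a tuple whose first element is itself a tuple, so the boolean moved from result[0] to result[0][0]. A hypothetical shape, inferred only from the test diff and not from the real sliding_window module:

```python
# Hypothetical return shape inferred from the assertions; the real
# ai_block_io sliding windows may carry different payloads.
def is_slow_io_event(data: float, threshold: float = 24.5):
    is_slow = data > threshold
    return (is_slow, "window detail"), ("threshold detail",)

result = is_slow_io_event(25)
assert result[0][0] is True      # the flag now sits one level deeper
```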

View File

@ -1,61 +0,0 @@
From 00ea35472d50faea89c881eb45b6d9d11f6b6632 Mon Sep 17 00:00:00 2001
From: luckky <guodashun1@huawei.com>
Date: Fri, 1 Nov 2024 15:09:57 +0800
Subject: [PATCH] fix uint8 bug and change isolation default value
---
src/c/hbm_online_repair/hbm_online_repair.env | 2 +-
src/c/hbm_online_repair/non-standard-hbm-repair.c | 8 ++++----
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/src/c/hbm_online_repair/hbm_online_repair.env b/src/c/hbm_online_repair/hbm_online_repair.env
index de56079..7166c8d 100644
--- a/src/c/hbm_online_repair/hbm_online_repair.env
+++ b/src/c/hbm_online_repair/hbm_online_repair.env
@@ -1,2 +1,2 @@
HBM_ONLINE_REPAIR_LOG_LEVEL=1
-PAGE_ISOLATION_THRESHOLD=128
+PAGE_ISOLATION_THRESHOLD=3355443
diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c
index f26d8ae..b8dde7a 100644
--- a/src/c/hbm_online_repair/non-standard-hbm-repair.c
+++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c
@@ -359,7 +359,7 @@ static int write_file(char *path, const char *name, unsigned long long value)
fd = open(fname, O_WRONLY);
if (fd < 0) {
- log(LOG_WARNING, "HBM ACLS: Cannot to open '%s': %s\n",
+ log(LOG_WARNING, "HBM: Cannot to open '%s': %s\n",
fname, strerror(errno));
return -errno;
}
@@ -367,7 +367,7 @@ static int write_file(char *path, const char *name, unsigned long long value)
snprintf(buf, sizeof(buf), "0x%llx\n", value);
ret = write(fd, buf, strlen(buf));
if (ret <= 0)
- log(LOG_WARNING, "HBM ACLS: Failed to set %s (0x%llx): %s\n",
+ log(LOG_WARNING, "HBM: Failed to set %s (0x%llx): %s\n",
fname, value, strerror(errno));
close(fd);
@@ -557,7 +557,7 @@ static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err)
return ret < 0 ? ret : 0;
}
-static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr)
+static uint8_t hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr)
{
int ret;
if (repair_ret <= 0) {
@@ -577,7 +577,7 @@ static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsig
}
}
-static uint8_t hbmc_hbm_repair(const struct hisi_common_error_section *err, char *path)
+static int hbmc_hbm_repair(const struct hisi_common_error_section *err, char *path)
{
unsigned long long paddr;
int ret;
--
2.43.0
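The type swap above is the whole uint8 bug: hbmc_hbm_repair returns negative errno values, and storing those in a uint8_t wraps them to large positive numbers, so every downstream `ret < 0` check goes dead. The effect in a few lines of Python:

```python
# Demonstrating why a negative errno must not travel through uint8_t.
import ctypes

ret = -22                              # -EINVAL from the repair path
as_u8 = ctypes.c_uint8(ret).value
print(as_u8, as_u8 < 0)                # 234 False -> error check never fires
```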

View File

@ -1,25 +0,0 @@
From 7baf2815f515c54bc33f41f495ec7c26988b5c44 Mon Sep 17 00:00:00 2001
From: shixuantong <shixuantong1@huawei.com>
Date: Tue, 11 Jun 2024 16:47:46 +0800
Subject: [PATCH] fix version in setup.py
---
src/python/setup.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/python/setup.py b/src/python/setup.py
index 21dbe9f..f96a96e 100644
--- a/src/python/setup.py
+++ b/src/python/setup.py
@@ -17,7 +17,7 @@ from setuptools import setup, find_packages
setup(
name="syssentry",
- version="1.0.1",
+ version="1.0.2",
description="System inspection framework tool set",
packages=find_packages(),
include_package_data=True,
--
2.27.0

View File

@ -1,53 +0,0 @@
From 5be0d121c6fde185d323dc4bcf3026e2c3ee8757 Mon Sep 17 00:00:00 2001
From: jinsaihang <jinsaihang@h-partners.com>
Date: Mon, 14 Oct 2024 11:30:58 +0800
Subject: [PATCH] fix word error
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
---
sysSentry-1.0.2/src/python/syssentry/alarm.py | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
index d012901..bff527c 100644
--- a/src/python/syssentry/alarm.py
+++ b/src/python/syssentry/alarm.py
@@ -49,7 +49,7 @@ MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1)
def update_alarm_list(alarm_info: Xalarm):
alarm_id = xalarm_getid(alarm_info)
if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID:
- logging.warnning(f"Invalid alarm_id {alarm_id}")
+ logging.warning(f"Invalid alarm_id {alarm_id}")
return
timestamp = xalarm_gettime(alarm_info)
if not timestamp:
@@ -97,14 +97,14 @@ def alarm_register():
task = TasksMap.tasks_dict[task_type][task_name]
alarm_id = task.alarm_id
if not check_alarm_id_if_number(alarm_id):
- logging.warnning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm")
+ logging.warning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm")
continue
if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID:
- logging.warnning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm")
+ logging.warning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm")
continue
alarm_clear_time = task.alarm_clear_time
if not check_alarm_clear_time_if_positive_integer(alarm_clear_time):
- logging.warnning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm")
+ logging.warning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm")
continue
try:
alarm_clear_time = int(alarm_clear_time)
@@ -113,7 +113,7 @@ def alarm_register():
if alarm_clear_time > sys.maxsize:
raise ValueError("Exceeds maximum value for int")
except (ValueError, OverflowError, TypeError) as e:
- logging.warnning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm")
+ logging.warning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm")
continue
alarm_list_dict[alarm_id] = []
task_alarm_id_dict[task_name] = alarm_id
--
2.27.0
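The misspelling fixed above is not cosmetic: the logging module has no warnning attribute, so each of those lines would have raised AttributeError the first time its branch executed, instead of emitting a warning:

```python
import logging

try:
    logging.warnning("oops")           # misspelled attribute
except AttributeError as exc:
    print(exc)   # module 'logging' has no attribute 'warnning'
```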

View File

@ -1,69 +0,0 @@
From cea094acea79b88e6458cfa264a03c51f08c72fc Mon Sep 17 00:00:00 2001
From: luckky <guodashun1@huawei.com>
Date: Mon, 4 Nov 2024 20:18:05 +0800
Subject: [PATCH] fix write file return code bug
Map a return code of 0 to -EINVAL to unify return-code handling.
---
.../hbm_online_repair/non-standard-hbm-repair.c | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c
index b8dde7a..97cb9a7 100644
--- a/src/c/hbm_online_repair/non-standard-hbm-repair.c
+++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c
@@ -112,7 +112,7 @@ static void parse_fault_addr_info(struct fault_addr_info* info_struct, unsigned
info_struct->row_id = fault_addr & FAULT_ADDR_ROW_ID_MASK;
fault_addr >>= FAULT_ADDR_ROW_ID_LEN;
info_struct->column_id = fault_addr & FAULT_ADDR_COLUMN_ID_MASK;
- fault_addr >>= FAULT_ADDR_CHANNEL_ID_LEN;
+ fault_addr >>= FAULT_ADDR_COLUMN_ID_LEN;
info_struct->error_type = fault_addr & FAULT_ADDR_ERROR_TYPE_MASK;
fault_addr >>= FAULT_ADDR_ERROR_TYPE_LEN;
info_struct->repair_type = fault_addr & FAULT_ADDR_REPAIR_TYPE_MASK;
@@ -371,7 +371,12 @@ static int write_file(char *path, const char *name, unsigned long long value)
fname, value, strerror(errno));
close(fd);
- return ret > 0 ? 0 : -errno;
+ if (ret == 0) {
+ ret = -EINVAL;
+ } else if (ret < 0) {
+ ret = -errno;
+ }
+ return ret;
}
static int get_hardware_corrupted_size()
@@ -560,7 +565,7 @@ static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err)
static uint8_t hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr)
{
int ret;
- if (repair_ret <= 0) {
+ if (repair_ret < 0) {
log(LOG_WARNING, "HBM %s: Keep page (0x%llx) offline\n", is_acls ? "ACLS" : "SPPR", paddr);
/* not much we can do about errors here */
(void)write_file("/sys/kernel/page_eject", "remove_page", paddr);
@@ -594,8 +599,7 @@ static int hbmc_hbm_repair(const struct hisi_common_error_section *err, char *pa
ret = write_file(path, is_acls ? "acls_query" : "sppr_query", paddr);
- /* Only positive num means the error is supported to repair */
- if (ret <= 0) {
+ if (ret < 0) {
if (ret != -ENXIO) {
notice_BMC(err, get_repair_failed_result_code(ret));
log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR");
@@ -624,8 +628,7 @@ static int hbmc_hbm_repair(const struct hisi_common_error_section *err, char *pa
all_online_success = false;
}
}
- /* The ret is from the acls/sppr repair, and only positive num means the error is repaired successfully */
- if (ret <= 0) {
+ if (ret < 0) {
notice_BMC(err, get_repair_failed_result_code(ret));
return ret;
} else if (all_online_success) {
--
2.43.0
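The write_file change above unifies the convention: success is the positive byte count, a zero-byte write becomes -EINVAL, and an OS error becomes -errno, so callers only ever need `ret < 0`. The same contract sketched in Python:

```python
# Sketch of the unified return-code convention from the patch above.
import errno
import os

def write_value(fd: int, text: str) -> int:
    try:
        n = os.write(fd, text.encode())
    except OSError as exc:
        return -exc.errno                  # failure: negative errno
    return -errno.EINVAL if n == 0 else n  # 0 bytes also counts as failure

r, w = os.pipe()
print(write_value(w, "0x1234\n"))          # 7 on success
```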

View File

@ -1,60 +0,0 @@
From 3eba5dcde10e05e7badc99852f76488e667d56e6 Mon Sep 17 00:00:00 2001
From: caixiaomeng <caixiaomeng2@.com>
Date: Mon, 21 Oct 2024 11:57:37 +0800
Subject: [PATCH] fix xalarm non-uniform log formatting
---
src/python/xalarm/sentry_notify.py | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/src/python/xalarm/sentry_notify.py b/src/python/xalarm/sentry_notify.py
index 5838473..ffe4147 100644
--- a/src/python/xalarm/sentry_notify.py
+++ b/src/python/xalarm/sentry_notify.py
@@ -2,6 +2,7 @@ import os
import sys
import time
import socket
+import logging
from struct import error as StructParseError
from .xalarm_api import alarm_stu2bin, Xalarm
@@ -27,21 +28,21 @@ ALARM_SOCKET_PERMISSION = 0o700
def check_params(alarm_id, alarm_level, alarm_type, puc_paras) -> bool:
if not os.path.exists(DIR_XALARM):
- sys.stderr.write(f"check_params: {DIR_XALARM} not exist, failed\n")
+ logging.error(f"check_params: {DIR_XALARM} not exist, failed")
return False
if not os.path.exists(PATH_REPORT_ALARM):
- sys.stderr.write(f"check_params: {PATH_REPORT_ALARM} not exist, failed\n")
+ logging.error(f"check_params: {PATH_REPORT_ALARM} not exist, failed")
return False
if (alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID or
alarm_level < MINOR_ALM or alarm_level > CRITICAL_ALM or
alarm_type < ALARM_TYPE_OCCUR or alarm_type > ALARM_TYPE_RECOVER):
- sys.stderr.write("check_params: alarm info invalid\n")
+ logging.error("check_params: alarm info invalid")
return False
if len(puc_paras) >= MAX_PUC_PARAS_LEN:
- sys.stderr.write(f"check_params: alarm msg should be less than {MAX_PUC_PARAS_LEN}\n")
+ logging.error(f"check_params: alarm msg should be less than {MAX_PUC_PARAS_LEN}")
return False
return True
@@ -61,7 +62,7 @@ def xalarm_report(alarm_id, alarm_level, alarm_type, puc_paras) -> bool:
sock.sendto(alarm_stu2bin(alarm_info), PATH_REPORT_ALARM)
except (FileNotFoundError, StructParseError, socket.error, OSError, UnicodeError) as e:
- sys.stderr.write(f"check_params: error occurs when sending msg.{e}\n")
+ logging.error(f"error occurs when sending msg.")
return False
finally:
sock.close()
--
2.27.0

View File

@ -1,76 +0,0 @@
From f6a26ea0759f36ebcaebe05d4d24c7234a110c63 Mon Sep 17 00:00:00 2001
From: caixiaomeng <caixiaomeng2@.com>
Date: Fri, 11 Oct 2024 12:12:53 +0800
Subject: [PATCH] fix xalarm_Report not rejecting alarm msgs that exceed the
maximum length
---
src/libso/xalarm/register_xalarm.c | 5 +++++
src/python/xalarm/register_xalarm.py | 6 +++---
src/python/xalarm/sentry_notify.py | 4 ++--
3 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/src/libso/xalarm/register_xalarm.c b/src/libso/xalarm/register_xalarm.c
index 5aff2bc..952a28b 100644
--- a/src/libso/xalarm/register_xalarm.c
+++ b/src/libso/xalarm/register_xalarm.c
@@ -339,6 +339,11 @@ int xalarm_Report(unsigned short usAlarmId, unsigned char ucAlarmLevel,
return -1;
}
+ if (pucParas == NULL || (int)strlen(pucParas) > MAX_PARAS_LEN) {
+ fprintf(stderr, "%s: alarm info invalid\n", __func__);
+ return -1;
+ }
+
if (memset(&info, 0, sizeof(struct alarm_info)) == NULL) {
fprintf(stderr, "%s: memset info failed, ret: %d\n", __func__, ret);
return -1;
diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py
index edd9994..39623bd 100644
--- a/src/python/xalarm/register_xalarm.py
+++ b/src/python/xalarm/register_xalarm.py
@@ -45,7 +45,7 @@ class AlarmRegister:
return False
if self.socket is None:
- sys.stderr.write("check_params: scoket create failed\n")
+ sys.stderr.write("check_params: socket create failed\n")
return False
return True
@@ -151,10 +151,10 @@ def xalarm_unregister(clientId: int) -> None:
def xalarm_upgrade(clientId: int, id_filter: list) -> None:
global ALARM_REGISTER_INFO
if clientId < 0:
- sys.stderr.write("xalarm_unregister: invalid client\n")
+ sys.stderr.write("xalarm_upgrade: invalid client\n")
return
if ALARM_REGISTER_INFO is None:
- sys.stderr.write("xalarm_unregister: alarm has not registered\n")
+ sys.stderr.write("xalarm_upgrade: alarm has not registered\n")
return
ALARM_REGISTER_INFO.id_filter = id_filter
diff --git a/src/python/xalarm/sentry_notify.py b/src/python/xalarm/sentry_notify.py
index c763a24..5838473 100644
--- a/src/python/xalarm/sentry_notify.py
+++ b/src/python/xalarm/sentry_notify.py
@@ -27,11 +27,11 @@ ALARM_SOCKET_PERMISSION = 0o700
def check_params(alarm_id, alarm_level, alarm_type, puc_paras) -> bool:
if not os.path.exists(DIR_XALARM):
- sys.stderr.write(f"check_params: {DIR_XALARM} not exist, failed")
+ sys.stderr.write(f"check_params: {DIR_XALARM} not exist, failed\n")
return False
if not os.path.exists(PATH_REPORT_ALARM):
- sys.stderr.write(f"check_params: {PATH_REPORT_ALARM} not exist, failed")
+ sys.stderr.write(f"check_params: {PATH_REPORT_ALARM} not exist, failed\n")
return False
if (alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID or
--
2.27.0

View File

@ -1,71 +0,0 @@
From 624efd60495403743fc251b7d689d920841e44c8 Mon Sep 17 00:00:00 2001
From: caixiaomeng <caixiaomeng2@.com>
Date: Fri, 11 Oct 2024 17:54:04 +0800
Subject: [PATCH] fix xalarm_upgrade to return a value and fail when the thread
has stopped
---
src/libso/xalarm/register_xalarm.c | 11 ++++++++++-
src/python/xalarm/register_xalarm.py | 10 +++++++---
2 files changed, 17 insertions(+), 4 deletions(-)
diff --git a/src/libso/xalarm/register_xalarm.c b/src/libso/xalarm/register_xalarm.c
index 952a28b..6768242 100644
--- a/src/libso/xalarm/register_xalarm.c
+++ b/src/libso/xalarm/register_xalarm.c
@@ -156,7 +156,11 @@ static void *alarm_recv(void *arg)
continue;
}
printf("recv error len:%d errno:%d\n", recvlen, errno);
- }
+ } else if (recvlen == 0) {
+ printf("connection closed by xalarmd, maybe connections reach max num or service stopped.\n");
+ g_register_info.thread_should_stop = 1;
+ break;
+ }
}
return NULL;
}
@@ -211,6 +215,11 @@ bool xalarm_Upgrade(struct alarm_subscription_info id_filter, int client_id)
printf("%s: invalid args\n", __func__);
return false;
}
+
+ if (g_register_info.thread_should_stop) {
+ printf("%s: upgrade failed, alarm thread has stopped\n", __func__);
+ return false;
+ }
set_alarm_id(id_filter);
return true;
diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py
index 39623bd..2a6dabf 100644
--- a/src/python/xalarm/register_xalarm.py
+++ b/src/python/xalarm/register_xalarm.py
@@ -148,15 +148,19 @@ def xalarm_unregister(clientId: int) -> None:
ALARM_REGISTER_INFO = None
-def xalarm_upgrade(clientId: int, id_filter: list) -> None:
+def xalarm_upgrade(id_filter: list, clientId: int) -> bool:
global ALARM_REGISTER_INFO
if clientId < 0:
sys.stderr.write("xalarm_upgrade: invalid client\n")
- return
+ return False
if ALARM_REGISTER_INFO is None:
sys.stderr.write("xalarm_upgrade: alarm has not registered\n")
- return
+ return False
+ if ALARM_REGISTER_INFO.thread_should_stop:
+ sys.stderr.write("xalarm_upgrade: upgrade failed, alarm thread has stopped\n")
+ return False
ALARM_REGISTER_INFO.id_filter = id_filter
+ return True
def xalarm_getid(alarm_info: Xalarm) -> int:
--
2.27.0
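The Python-side change above turns xalarm_upgrade from a silent no-op into an API with a real contract: it now returns False for a bad client, a missing registration, or a dead receive thread. A sketch of that contract with a stand-in registry object:

```python
# Sketch only; ALARM_REGISTER_INFO in the real module is replaced here by a
# hypothetical registry argument so the example is self-contained.
from types import SimpleNamespace

def xalarm_upgrade(id_filter: list, client_id: int, registry) -> bool:
    if client_id < 0:
        return False
    if registry is None:
        return False                     # never registered
    if registry.thread_should_stop:
        return False                     # recv thread gone; upgrade is moot
    registry.id_filter = id_filter
    return True

registry = SimpleNamespace(thread_should_stop=False, id_filter=[])
print(xalarm_upgrade([True] * 128, 1, registry))   # True
```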

View File

@ -1,26 +0,0 @@
From 132334913c4afebefd6afa835f790fa8a5fbf123 Mon Sep 17 00:00:00 2001
From: jinsaihang <jinsaihang@h-partners.com>
Date: Mon, 28 Oct 2024 09:22:53 +0800
Subject: [PATCH] fix get_alarm -d abnormal display
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
---
sysSentry-1.0.2/src/python/syssentry/alarm.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
index b35a126..e5cc313 100644
--- a/src/python/syssentry/alarm.py
+++ b/src/python/syssentry/alarm.py
@@ -184,7 +184,7 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di
# dump each {key,value} of details in one line
if 'details' in alarm_info and isinstance(alarm_info['details'], dict):
for key in alarm_info['details']:
- alarm_info['details'][key] = json.dumps(alarm_info['details'][key], indent=None)
+ alarm_info['details'][key] = str(alarm_info['details'][key])
alarm['alarm_info'] = alarm_info
alarm_list = [alarm for alarm in alarm_list if 'alarm_source' in alarm['alarm_info'] and alarm['alarm_info']['alarm_source'] == task_name]
--
2.27.0
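The one-line change above fixes a display quirk: json.dumps on a value that is already a string re-quotes (and re-escapes) it, so `get_alarm -d` showed doubled quotes, while str() renders it once:

```python
import json

value = "disk sda slow"
print(json.dumps(value))   # "disk sda slow"  (wrapped in literal quotes)
print(str(value))          # disk sda slow
```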

View File

@ -1,168 +0,0 @@
From b21607fcec4b290bc78c9f6c4a26db1a2df32a66 Mon Sep 17 00:00:00 2001
From: gaoruoshu <gaoruoshu@huawei.com>
Date: Tue, 15 Oct 2024 21:21:10 +0800
Subject: [PATCH] get_io_data failure won't stop avg_block_io; drop unsupported
disks
---
src/python/sentryCollector/collect_plugin.py | 14 ++++-----
.../avg_block_io/avg_block_io.py | 9 ++++--
.../sentryPlugins/avg_block_io/module_conn.py | 31 +++++++++++++------
3 files changed, 35 insertions(+), 19 deletions(-)
diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py
index bec405a..53dddec 100644
--- a/src/python/sentryCollector/collect_plugin.py
+++ b/src/python/sentryCollector/collect_plugin.py
@@ -90,14 +90,14 @@ def client_send_and_recv(request_data, data_str_len, protocol):
try:
client_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
except socket.error:
- logging.error("collect_plugin: client create socket error")
+ logging.debug("collect_plugin: client create socket error")
return None
try:
client_socket.connect(COLLECT_SOCKET_PATH)
except OSError:
client_socket.close()
- logging.error("collect_plugin: client connect error")
+ logging.debug("collect_plugin: client connect error")
return None
req_data_len = len(request_data)
@@ -109,23 +109,23 @@ def client_send_and_recv(request_data, data_str_len, protocol):
res_data = res_data.decode()
except (OSError, UnicodeError):
client_socket.close()
- logging.error("collect_plugin: client communicate error")
+ logging.debug("collect_plugin: client communicate error")
return None
res_magic = res_data[:CLT_MSG_MAGIC_LEN]
if res_magic != "RES":
- logging.error("res msg format error")
+ logging.debug("res msg format error")
return None
protocol_str = res_data[CLT_MSG_MAGIC_LEN:CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN]
try:
protocol_id = int(protocol_str)
except ValueError:
- logging.error("recv msg protocol id is invalid %s", protocol_str)
+ logging.debug("recv msg protocol id is invalid %s", protocol_str)
return None
if protocol_id >= ClientProtocol.PRO_END:
- logging.error("protocol id is invalid")
+ logging.debug("protocol id is invalid")
return None
try:
@@ -134,7 +134,7 @@ def client_send_and_recv(request_data, data_str_len, protocol):
res_msg_data = res_msg_data.decode()
return res_msg_data
except (OSError, ValueError, UnicodeError):
- logging.error("collect_plugin: client recv res msg error")
+ logging.debug("collect_plugin: client recv res msg error")
finally:
client_socket.close()
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
index cd47919..899d517 100644
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
@@ -15,7 +15,7 @@ import time
from .config import read_config_log, read_config_common, read_config_algorithm, read_config_latency, read_config_iodump, read_config_stage
from .stage_window import IoWindow, IoDumpWindow
-from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name
+from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name, check_disk_list_validation
from .utils import update_avg_and_check_abnormal
CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini"
@@ -79,6 +79,8 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage):
if not disk_list:
report_alarm_fail("Cannot get valid disk name")
+ disk_list = check_disk_list_validation(disk_list)
+
disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list
if not config_disk:
@@ -117,7 +119,10 @@ def main_loop(io_dic, io_data, io_avg_value):
time.sleep(period_time)
# 采集模块对接,获取周期数据
- curr_period_data = avg_get_io_data(io_dic)
+ is_success, curr_period_data = avg_get_io_data(io_dic)
+ if not is_success:
+ logging.error(f"{curr_period_data['msg']}")
+ continue
# 处理周期数据
reach_size = False
diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
index cbdaad4..a67ef45 100644
--- a/src/python/sentryPlugins/avg_block_io/module_conn.py
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
@@ -40,25 +40,25 @@ def avg_is_iocollect_valid(io_dic, config_disk, config_stage):
logging.debug(f"send to sentryCollector is_iocollect_valid: period={io_dic['period_time']}, "
f"disk={config_disk}, stage={config_stage}")
res = is_iocollect_valid(io_dic["period_time"], config_disk, config_stage)
- return check_result_validation(res, 'check config validation')
+ is_success, data = check_result_validation(res, 'check config validation')
+ if not is_success:
+ report_alarm_fail(f"{data['msg']}")
+ return data
def check_result_validation(res, reason):
"""check validation of result from sentryCollector"""
if not 'ret' in res or not 'message' in res:
- err_msg = "Failed to {}: Cannot connect to sentryCollector.".format(reason)
- report_alarm_fail(err_msg)
+ return False, {'msg': f"Failed to {reason}: Cannot connect to sentryCollector"}
if res['ret'] != 0:
- err_msg = "Failed to {}: {}".format(reason, Result_Messages[res['ret']])
- report_alarm_fail(err_msg)
+ return False, {'msg': f"Failed to {reason}: {Result_Messages[res['ret']]}"}
try:
json_data = json.loads(res['message'])
except json.JSONDecodeError:
- err_msg = f"Failed to {reason}: invalid return message"
- report_alarm_fail(err_msg)
+ return False, {'msg': f"Failed to {reason}: invalid return message"}
- return json_data
+ return True, json_data
def report_alarm_fail(alarm_info):
@@ -120,10 +120,21 @@ def process_report_data(disk_name, rw, io_data):
xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
+def check_disk_list_validation(disk_list):
+ valid_disk_list = []
+ for disk_name in disk_list:
+ is_success, _ = check_result_validation(get_disk_type(disk_name), "")
+ if not is_success:
+ continue
+ valid_disk_list.append(disk_name)
+ return valid_disk_list
+
+
def get_disk_type_by_name(disk_name):
logging.debug(f"send to sentryCollector get_disk_type: disk_name={disk_name}")
- res = get_disk_type(disk_name)
- disk_type_str = check_result_validation(get_disk_type(disk_name), f'Invalid disk type {disk_name}')
+ is_success, disk_type_str = check_result_validation(get_disk_type(disk_name), f'Invalid disk type {disk_name}')
+ if not is_success:
+ report_alarm_fail(f"{disk_type_str['msg']}")
try:
curr_disk_type = int(disk_type_str)
if curr_disk_type not in Disk_Type:
--
2.27.0
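The key move above is changing check_result_validation from report-and-die to an (is_success, payload) pair, so a transient collector hiccup in one period logs an error and the main loop continues. A condensed sketch of the new shape (the Result_Messages lookup is replaced by the raw code here to stay self-contained):

```python
import json

def check_result_validation(res: dict, reason: str):
    if 'ret' not in res or 'message' not in res:
        return False, {'msg': f"Failed to {reason}: Cannot connect to sentryCollector"}
    if res['ret'] != 0:
        return False, {'msg': f"Failed to {reason}: error code {res['ret']}"}
    try:
        return True, json.loads(res['message'])
    except json.JSONDecodeError:
        return False, {'msg': f"Failed to {reason}: invalid return message"}

ok, data = check_result_validation({'ret': 0, 'message': '{"sda": 1}'}, "get io data")
print(ok, data)   # True {'sda': 1}
```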

View File

@ -1,107 +0,0 @@
From 74f18b0e1fd4f99fa7d1d95e08894b408dcafe51 Mon Sep 17 00:00:00 2001
From: luckky <guodashun1@huawei.com>
Date: Wed, 18 Dec 2024 14:31:04 +0800
Subject: [PATCH] hbm_online_repair: add driver unloading
---
src/c/hbm_online_repair/hbm_online_repair.c | 47 +++++++++++++--------
1 file changed, 29 insertions(+), 18 deletions(-)
diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c
index 00c9c0b..6783485 100644
--- a/src/c/hbm_online_repair/hbm_online_repair.c
+++ b/src/c/hbm_online_repair/hbm_online_repair.c
@@ -11,6 +11,8 @@
#define DEFAULT_LOG_LEVEL LOG_INFO
#define DEFAULT_PAGE_ISOLATION_THRESHOLD 3355443
+#define DRIVER_COMMAND_LEN 32
+
int global_level_setting;
int page_isolation_threshold;
@@ -57,25 +59,31 @@ int execute_command(const char *command)
return -1;
}
- ret = WEXITSTATUS(ret);
+ ret = -WEXITSTATUS(ret);
log(LOG_DEBUG, "command %s exited with status: %d\n", command, ret);
return ret;
}
-int load_required_driver(void)
+int handle_driver(char* driver_name, bool load)
{
int ret;
- ret = execute_command("modprobe hisi_mem_ras 2>&1");
- if (ret < 0) {
- log(LOG_ERROR, "load repair driver failed\n");
- return ret;
- }
- ret = execute_command("modprobe page_eject 2>&1");
- if (ret < 0) {
- log(LOG_ERROR, "load page driver failed\n");
+ char command[DRIVER_COMMAND_LEN];
+
+ snprintf(command, DRIVER_COMMAND_LEN, "%s %s 2>&1", load ? "modprobe" : "rmmod", driver_name);
+ ret = execute_command(command);
+ log(ret < 0 ? LOG_ERROR : LOG_DEBUG, "%s %s %s\n", load ? "load" : "unload", driver_name, ret < 0 ? "failed" : "success");
+ return ret;
+}
+
+int handle_all_drivers(bool load)
+{
+ int ret;
+
+ ret = handle_driver("hisi_mem_ras", load);
+ if (ret < 0)
return ret;
- }
- log(LOG_INFO, "load required driver success\n");
+
+ ret = handle_driver("page_eject", load);
return ret;
}
@@ -116,21 +124,21 @@ int main(int argc, char *argv[])
hbm_param_init();
- ret = load_required_driver();
+ ret = handle_all_drivers(true);
if (ret < 0) {
- log(LOG_DEBUG, "load required driver failed\n");
return ret;
}
struct ras_events *ras = init_trace_instance();
- if (!ras)
- return -1;
+ if (!ras) {
+ ret = -1;
+ goto err_unload;
+ }
ret = toggle_ras_event(ras->tracing, "ras", "non_standard_event", 1);
if (ret < 0) {
log(LOG_WARNING, "unable to enable ras non_standard_event.\n");
- free(ras);
- return -1;
+ goto err_free;
}
get_flash_total_size();
@@ -142,6 +150,9 @@ int main(int argc, char *argv[])
log(LOG_WARNING, "unable to disable ras non_standard_event.\n");
}
+err_free:
free(ras);
+err_unload:
+ handle_all_drivers(false);
return ret;
}
--
2.43.0
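The restructuring above is classic goto-based cleanup: anything that fails after the drivers are loaded must fall through err_unload so the drivers are removed again. The same shape in Python is try/finally, sketched with stand-ins for the C functions:

```python
# Hypothetical stand-ins for the C helpers; only the cleanup shape matters.
def run(handle_all_drivers, init_trace_instance, main_loop) -> int:
    if handle_all_drivers(load=True) < 0:
        return -1                        # nothing loaded, nothing to undo
    try:
        ras = init_trace_instance()
        if ras is None:
            return -1
        return main_loop(ras)
    finally:
        handle_all_drivers(load=False)   # mirrors the err_unload label

print(run(lambda load: 0, lambda: object(), lambda ras: 0))   # 0
```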

View File

@ -1,104 +0,0 @@
From 2135b4e41666d99922eda79e9ee04bbc2b557fea Mon Sep 17 00:00:00 2001
From: zhuofeng <zhuofeng2@huawei.com>
Date: Wed, 16 Oct 2024 12:13:21 +0800
Subject: [PATCH] fix occasional exit of the collect module listen thread
---
src/python/sentryCollector/collect_io.py | 4 +---
src/python/sentryCollector/collect_server.py | 18 ++++++++----------
2 files changed, 9 insertions(+), 13 deletions(-)
diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py
index 5fe1efc..de308b3 100644
--- a/src/python/sentryCollector/collect_io.py
+++ b/src/python/sentryCollector/collect_io.py
@@ -231,9 +231,7 @@ class CollectIo():
if self.get_blk_io_hierarchy(disk_name, stage_list) < 0:
continue
self.append_period_lat(disk_name, stage_list)
-
- logging.debug(f"no-lock collect data : {IO_GLOBAL_DATA}")
-
+
elapsed_time = time.time() - start_time
sleep_time = self.period_time - elapsed_time
if sleep_time < 0:
diff --git a/src/python/sentryCollector/collect_server.py b/src/python/sentryCollector/collect_server.py
index 11d1af0..ad3ac0e 100644
--- a/src/python/sentryCollector/collect_server.py
+++ b/src/python/sentryCollector/collect_server.py
@@ -64,7 +64,7 @@ class CollectServer():
self.io_global_data = IO_GLOBAL_DATA
if len(IO_CONFIG_DATA) == 0:
- logging.error("the collect thread is not started, the data is invalid. ")
+ logging.error("the collect thread is not started, the data is invalid.")
return json.dumps(result_rev)
period_time = IO_CONFIG_DATA[0]
@@ -75,7 +75,7 @@ class CollectServer():
stage_list = json.loads(data_struct['stage'])
if (period < period_time) or (period > period_time * max_save) or (period % period_time):
- logging.error("is_iocollect_valid: period time: %d is invalid", period)
+ logging.error("is_iocollect_valid: period time is invalid, user period: %d, config period_time: %d", period, period_time)
return json.dumps(result_rev)
for disk_name, stage_info in self.io_global_data.items():
@@ -96,7 +96,7 @@ class CollectServer():
self.io_global_data = IO_GLOBAL_DATA
if len(IO_CONFIG_DATA) == 0:
- logging.error("the collect thread is not started, the data is invalid. ")
+ logging.error("the collect thread is not started, the data is invalid.")
return json.dumps(result_rev)
period_time = IO_CONFIG_DATA[0]
max_save = IO_CONFIG_DATA[1]
@@ -107,11 +107,11 @@ class CollectServer():
iotype_list = json.loads(data_struct['iotype'])
if (period < period_time) or (period > period_time * max_save) or (period % period_time):
- logging.error("get_io_data: period time: %d is invalid", period)
+ logging.error("get_io_data: period time is invalid, user period: %d, config period_time: %d", period, period_time)
return json.dumps(result_rev)
collect_index = period // period_time - 1
- logging.debug("period: %d, collect_index: %d", period, collect_index)
+ logging.debug("user period: %d, config period_time: %d, collect_index: %d", period, period_time, collect_index)
for disk_name, stage_info in self.io_global_data.items():
if disk_name not in disk_list:
@@ -124,7 +124,7 @@ class CollectServer():
for iotype_name, iotype_info in iotype_info.items():
if iotype_name not in iotype_list:
continue
- if len(iotype_info) < collect_index:
+ if len(iotype_info) - 1 < collect_index:
continue
result_rev[disk_name][stage_name][iotype_name] = iotype_info[collect_index]
@@ -250,10 +250,8 @@ class CollectServer():
except socket.error:
logging.error("server fd create failed")
server_fd = None
-
return server_fd
-
def server_loop(self):
"""main loop"""
logging.info("collect listen thread start")
@@ -277,8 +275,8 @@ class CollectServer():
self.server_recv(server_fd)
else:
continue
- except socket.error:
- pass
+ except Exception:
+ logging.error('collect listen exception : %s', traceback.format_exc())
def stop_thread(self):
self.stop_event.set()
--
2.33.0
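Besides the broader exception handling, the patch above fixes an off-by-one: an index into iotype_info is valid only when len(iotype_info) - 1 >= collect_index. The old check let an equal length slip through:

```python
data = [10.0, 11.0]          # two collected periods
collect_index = 2            # caller asks for a third
print(len(data) < collect_index)       # False -> old check would index data[2]
print(len(data) - 1 < collect_index)   # True  -> new check skips safely
```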

View File

@ -1,69 +0,0 @@
From edbe32637a939d0788bcbde9211a61cfded436bf Mon Sep 17 00:00:00 2001
From: luckky <guodashun1@huawei.com>
Date: Tue, 5 Nov 2024 17:22:27 +0800
Subject: [PATCH] make debug msg clear
1. Change the page_isolation_threshold default value from 128 (KB) to 3355443 (KB)
to keep it in sync with the .mod file change.
2. Add the specific command to debug messages to make them clearer.
3. Update the syssentry log level and format.
4. Change the interval from 180 to 10 to shorten the restart time.
---
config/tasks/hbm_online_repair.mod | 2 +-
.../src/c/hbm_online_repair/hbm_online_repair.c | 8 ++++----
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/config/tasks/hbm_online_repair.mod b/config/tasks/hbm_online_repair.mod
index 77dd73e..4dcef43 100644
--- a/config/tasks/hbm_online_repair.mod
+++ b/config/tasks/hbm_online_repair.mod
@@ -3,7 +3,7 @@ enabled=yes
task_start=/usr/bin/hbm_online_repair
task_stop=kill $pid
type=period
-interval=180
+interval=10
onstart=yes
env_file=/etc/sysconfig/hbm_online_repair.env
conflict=up
\ No newline at end of file
diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c
index b3b2742..943f201 100644
--- a/src/c/hbm_online_repair/hbm_online_repair.c
+++ b/src/c/hbm_online_repair/hbm_online_repair.c
@@ -9,7 +9,7 @@
#include "non-standard-hbm-repair.h"
#define DEFAULT_LOG_LEVEL LOG_INFO
-#define DEFAULT_PAGE_ISOLATION_THRESHOLD 128
+#define DEFAULT_PAGE_ISOLATION_THRESHOLD 3355443
int global_level_setting;
int page_isolation_threshold;
@@ -44,7 +44,7 @@ int execute_command(const char *command)
}
fgets(buffer, sizeof(buffer), fp);
- log(LOG_DEBUG, "output of command is: %s\n", buffer);
+ log(LOG_DEBUG, "output of command %s is: %s\n", command, buffer);
ret = pclose(fp);
if (ret < 0) {
@@ -53,12 +53,12 @@ int execute_command(const char *command)
}
if (!WIFEXITED(ret)) {
- log(LOG_ERROR, "command did not terminate normally\n");
+ log(LOG_ERROR, "command %s did not terminate normally\n", command);
return -1;
}
ret = WEXITSTATUS(ret);
- log(LOG_DEBUG, "command exited with status: %d\n", ret);
+ log(LOG_DEBUG, "command %s exited with status: %d\n", command, ret);
return ret;
}
--
2.43.0

View File

@ -1,28 +0,0 @@
From b5794ef43f768d7ea9bbbac450deaabbdcff4997 Mon Sep 17 00:00:00 2001
From: zhuofeng <zhuofeng2@huawei.com>
Date: Sat, 12 Oct 2024 17:57:01 +0800
Subject: [PATCH] fix abnormal stack trace when the disk field is not configured
---
src/python/sentryCollector/collect_config.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/python/sentryCollector/collect_config.py b/src/python/sentryCollector/collect_config.py
index 5aa38ec..7ca9898 100644
--- a/src/python/sentryCollector/collect_config.py
+++ b/src/python/sentryCollector/collect_config.py
@@ -127,9 +127,9 @@ class CollectConfig:
CONF_IO, CONF_IO_MAX_SAVE, CONF_IO_MAX_SAVE_DEFAULT)
result_io_config[CONF_IO_MAX_SAVE] = CONF_IO_MAX_SAVE_DEFAULT
# disk
- disk = io_map_value.get(CONF_IO_DISK).lower()
+ disk = io_map_value.get(CONF_IO_DISK)
if disk:
- disk_str = disk.replace(" ", "")
+ disk_str = disk.lower().replace(" ", "")
pattern = r'^[a-zA-Z0-9-_,]+$'
if not re.match(pattern, disk_str):
logging.warning("module_name = %s section, field = %s is incorrect, use default %s",
--
2.33.0
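The abnormal stack in the subject comes from calling .lower() before the None check: .get() returns None when the disk field is missing, and None.lower() raises AttributeError before `if disk:` can run. The corrected order in miniature:

```python
io_map_value = {}                        # config section without a disk field
disk = io_map_value.get("disk")          # None, not an exception
if disk:                                 # guard first ...
    disk_str = disk.lower().replace(" ", "")   # ... then normalize
else:
    disk_str = None
print(disk_str)   # None, instead of AttributeError on NoneType
```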

View File

@ -1,27 +0,0 @@
From 0d3323d13797f3f9d3124e3938787d2573bf249d Mon Sep 17 00:00:00 2001
From: zhangnan <zhangnan134@huawei.com>
Date: Mon, 28 Oct 2024 17:32:49 +0800
Subject: [PATCH] modify logrotate rule
---
config/logrotate | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/config/logrotate b/config/logrotate
index f54e7b3..e855118 100644
--- a/config/logrotate
+++ b/config/logrotate
@@ -1,8 +1,9 @@
/var/log/sysSentry/*.log {
- nocompress
+ compress
missingok
notifempty
copytruncate
rotate 2
size +4096k
+ hourly
}
--
2.33.0

View File

@ -1,125 +0,0 @@
From 91c37cec1639c79b2b5ddcd6b173b4d7aa0ce9db Mon Sep 17 00:00:00 2001
From: jinsaihang <jinsaihang@h-partners.com>
Date: Wed, 16 Oct 2024 14:51:24 +0800
Subject: [PATCH] optimize log printing
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
---
src/python/syssentry/alarm.py | 53 ++++++++++++++++---------------
src/python/syssentry/load_mods.py | 15 +++++----
2 files changed, 35 insertions(+), 33 deletions(-)
diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
index bff527c..c3f2ee1 100644
--- a/src/python/syssentry/alarm.py
+++ b/src/python/syssentry/alarm.py
@@ -76,16 +76,26 @@ def update_alarm_list(alarm_info: Xalarm):
finally:
alarm_list_lock.release()
-def check_alarm_id_if_number(alarm_id):
- if isinstance(alarm_id, int):
- return True
- else:
+def validate_alarm_id(alarm_id):
+ if alarm_id is None:
+ return False
+ try:
+ alarm_id = int(alarm_id)
+ if MIN_ALARM_ID <= alarm_id <= MAX_ALARM_ID:
+ return True
+ else:
+ return False
+ except ValueError:
return False
-def check_alarm_clear_time_if_positive_integer(alarm_clear_time):
- if isinstance(alarm_clear_time, int) and alarm_clear_time > 0:
- return True
- else:
+def validate_alarm_clear_time(alarm_clear_time):
+ try:
+ alarm_clear_time = int(alarm_clear_time)
+ if alarm_clear_time > 0 and alarm_clear_time <= sys.maxsize:
+ return True
+ else:
+ return False
+ except ValueError:
return False
def alarm_register():
@@ -93,34 +103,25 @@ def alarm_register():
# 初始化告警ID映射字典、告警老化时间字典
for task_type in TasksMap.tasks_dict:
for task_name in TasksMap.tasks_dict[task_type]:
- logging.info(f"alarm_register: {task_name} is registered")
task = TasksMap.tasks_dict[task_type][task_name]
- alarm_id = task.alarm_id
- if not check_alarm_id_if_number(alarm_id):
- logging.warning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm")
+ if not validate_alarm_id(task.alarm_id):
+ logging.warning(f"Invalid alarm_id {task.alarm_id}: ignore {task_name} alarm")
continue
- if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID:
- logging.warning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm")
+ if not validate_alarm_clear_time(task.alarm_clear_time):
+ logging.warning(f"Invalid alarm_clear_time {task.alarm_clear_time}: ignore {task_name} alarm")
continue
+ task.alarm_id = int(task.alarm_id)
+ task.alarm_clear_time = int(task.alarm_clear_time)
+ alarm_id = task.alarm_id
alarm_clear_time = task.alarm_clear_time
- if not check_alarm_clear_time_if_positive_integer(alarm_clear_time):
- logging.warning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm")
- continue
- try:
- alarm_clear_time = int(alarm_clear_time)
- if alarm_clear_time <= 0:
- raise ValueError("Not a positive integer")
- if alarm_clear_time > sys.maxsize:
- raise ValueError("Exceeds maximum value for int")
- except (ValueError, OverflowError, TypeError) as e:
- logging.warning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm")
- continue
+
alarm_list_dict[alarm_id] = []
task_alarm_id_dict[task_name] = alarm_id
if alarm_id not in alarm_id_clear_time_dict:
alarm_id_clear_time_dict[alarm_id] = alarm_clear_time
else:
alarm_id_clear_time_dict[alarm_id] = max(alarm_clear_time, alarm_id_clear_time_dict[alarm_id])
+ logging.info(f"alarm_register: {task_name} is registered")
# 注册告警回调
id_filter = [True] * 128
clientId = xalarm_register(update_alarm_list, id_filter)
diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py
index f74f165..78db446 100644
--- a/src/python/syssentry/load_mods.py
+++ b/src/python/syssentry/load_mods.py
@@ -198,15 +198,16 @@ def parse_mod_conf(mod_name, mod_conf):
task.load_enabled = is_enabled
try:
- task.alarm_id = int(mod_conf.get(CONF_TASK, CONF_ALARM_ID))
- task.alarm_clear_time = int(mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME))
- if not (MIN_ALARM_ID <= task.alarm_id <= MAX_ALARM_ID):
- raise ValueError("Invalid alarm_id")
- except ValueError:
task.alarm_id = mod_conf.get(CONF_TASK, CONF_ALARM_ID)
- task.alarm_clear_time = mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME)
except configparser.NoOptionError:
- logging.warning("Unset alarm_clear_time, use 15s as default")
+ task.alarm_id = None
+ logging.warning(f"{mod_name} alarm_id not set, alarm_id is None")
+
+ if task.alarm_id is not None:
+ try:
+ task.alarm_clear_time = mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME)
+ except configparser.NoOptionError:
+ logging.warning(f"{mod_name} not set alarm_clear_time, use 15s as default")
if CONF_ONSTART in mod_conf.options(CONF_TASK):
is_onstart = (mod_conf.get(CONF_TASK, CONF_ONSTART) == 'yes')
--
2.27.0

View File

@ -1,77 +0,0 @@
From cb3d0ea18eed3d48f2753f878d9726f58fe616b1 Mon Sep 17 00:00:00 2001
From: shixuantong <shixuantong1@huawei.com>
Date: Sat, 21 Sep 2024 09:53:42 +0800
Subject: [PATCH] optimize the handling of cat-cli error msg in cpu_sentry
---
src/python/syssentry/cpu_sentry.py | 36 +++++++++++++++++-------------
1 file changed, 21 insertions(+), 15 deletions(-)
diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py
index 99af127..582d4b3 100644
--- a/src/python/syssentry/cpu_sentry.py
+++ b/src/python/syssentry/cpu_sentry.py
@@ -26,6 +26,8 @@ CPU_SENTRY_PARAM_CONFIG = "/etc/sysSentry/plugins/cpu_sentry.ini"
# Inspection commands running at the bottom layer
LOW_LEVEL_INSPECT_CMD = "cat-cli"
+# max length of msg in details
+DETAILS_LOG_MSG_MAX_LEN = 255
class CpuSentry:
"""
@@ -94,22 +96,10 @@ class CpuSentry:
self.send_result["details"]["msg"] = "cpu_sentry task is killed!"
return
- if "ERROR" in stdout:
- self.send_result["result"] = ResultLevel.FAIL
- self.send_result["details"]["code"] = 1004
-
- # Remove ANSI escape sequences
- error_info = stdout.split("\n")[0]
- if error_info.startswith("\u001b"):
- ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])'
- error_info = re.sub(ansi_escape, '', error_info)
-
- self.send_result["details"]["msg"] = error_info
- return
-
out_split = stdout.split("\n")
- isolated_cores_number = 0
+ isolated_cores_number = -1
found_fault_cores_list = []
+ error_msg_list = []
for out_line_i in out_split:
if "handle_patrol_result: Found fault cores" in out_line_i:
cores_number_tmp = out_line_i.split("Found fault cores:")[1]
@@ -121,9 +111,25 @@ class CpuSentry:
elif out_line_i.startswith('<ISOLATED-CORE-LIST>'):
self.send_result["details"]["isolated_cpu_list"] = out_line_i.split(':')[1]
break
+ elif "ERROR" in out_line_i:
+ logging.error("[cat-cli error] - %s\n", out_line_i)
+ error_msg_list.append(out_line_i)
found_fault_cores_number = len(set(found_fault_cores_list))
- if found_fault_cores_number == 0:
+ if isolated_cores_number == -1:
+ self.send_result["result"] = ResultLevel.FAIL
+ self.send_result["details"]["code"] = 1004
+
+ send_error_msg = ""
+ # Remove ANSI escape sequences
+ for error_info in error_msg_list:
+ if error_info.startswith("\u001b"):
+ ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])'
+ error_info = re.sub(ansi_escape, '', error_info)
+ if len(send_error_msg) + len(error_info) < DETAILS_LOG_MSG_MAX_LEN:
+ send_error_msg += error_info
+ self.send_result["details"]["msg"] = send_error_msg
+ elif found_fault_cores_number == 0:
self.send_result["details"]["code"] = 0
self.send_result["result"] = ResultLevel.PASS
elif 0 in found_fault_cores_list:
--
2.27.0
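The rework above stops bailing on the first "ERROR" line: it now collects all cat-cli error lines, strips ANSI escape sequences, and concatenates them under the 255-character cap. That aggregation step, lifted into a sketch with names from the diff:

```python
import re

DETAILS_LOG_MSG_MAX_LEN = 255
ANSI_ESCAPE = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])'

def build_error_msg(error_msg_list):
    send_error_msg = ""
    for error_info in error_msg_list:
        if error_info.startswith("\u001b"):          # drop color codes
            error_info = re.sub(ANSI_ESCAPE, '', error_info)
        if len(send_error_msg) + len(error_info) < DETAILS_LOG_MSG_MAX_LEN:
            send_error_msg += error_info              # keep under the cap
    return send_error_msg

print(build_error_msg(["\u001b[31mERROR: core 3 timeout\u001b[0m",
                       "ERROR: patrol aborted"]))
```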

View File

@ -1,25 +0,0 @@
From 3dda5f68db38b63b1e45a28558a9fcd341c1f945 Mon Sep 17 00:00:00 2001
From: jwolf <523083921@qq.com>
Date: Fri, 20 Sep 2024 15:59:40 +0800
Subject: [PATCH] should be warn-level log
---
src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c
index 9f8d80c..f4f3172 100644
--- a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c
+++ b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c
@@ -23,7 +23,7 @@ static cat_return_t insert_core_to_list(core_list_st *core_list, int coreid)
return CAT_OK;
}
if ((core_list->current_nums == MAX_ISOLATE_CORES_PER_PATROL) || (coreid < 0)) {
- CAT_LOG_E("Insert error, core id(%d)", coreid);
+ CAT_LOG_W("Too many cores need to isolate,do not isolate core(%d)", coreid);
return CAT_ERR;
}
--
2.27.0

View File

@ -1,23 +0,0 @@
From 34febf57060060d1f8262941af49e3beeb1f7f5d Mon Sep 17 00:00:00 2001
From: jwolf <523083921@qq.com>
Date: Fri, 30 Aug 2024 16:59:56 +0800
Subject: [PATCH] param must be integer
---
src/c/catcli/catlib/cli_param_checker.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/c/catcli/catlib/cli_param_checker.c b/src/c/catcli/catlib/cli_param_checker.c
index 5b38402..71edf17 100644
--- a/src/c/catcli/catlib/cli_param_checker.c
+++ b/src/c/catcli/catlib/cli_param_checker.c
@@ -17,6 +17,7 @@ void checkset_cpu_usage_percentage(char *getopt_optarg, catcli_request_body *p_r
if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX || strchr(getopt_optarg, '.') != NULL) {
strncpy(errs->patrol_module_err,
"\"cpu_utility \" must be an integer greater in the range (0,100],correct \"-u, --cpu_utility\"\n", MAX_ERR_LEN);
+ p_request_body->cpu_utility = 0;
} else {
p_request_body->cpu_utility = (int)cpu_utility;
}
--

View File

@ -1,91 +0,0 @@
From 7fa9e80531bb3d4fa587e5fb7a99e3af59feda7e Mon Sep 17 00:00:00 2001
From: jinsaihang <jinsaihang@h-partners.com>
Date: Sat, 12 Oct 2024 16:51:37 +0800
Subject: [PATCH] precise alarm query time
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
---
sysSentry-1.0.2/src/python/syssentry/alarm.py | 25 +++++++++++++++++--
.../src/python/syssentry/load_mods.py | 3 ++-
2 files changed, 25 insertions(+), 3 deletions(-)
diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
index 43c1065..d012901 100644
--- a/src/python/syssentry/alarm.py
+++ b/src/python/syssentry/alarm.py
@@ -76,6 +76,18 @@ def update_alarm_list(alarm_info: Xalarm):
finally:
alarm_list_lock.release()
+def check_alarm_id_if_number(alarm_id):
+ if isinstance(alarm_id, int):
+ return True
+ else:
+ return False
+
+def check_alarm_clear_time_if_positive_integer(alarm_clear_time):
+ if isinstance(alarm_clear_time, int) and alarm_clear_time > 0:
+ return True
+ else:
+ return False
+
def alarm_register():
logging.debug(f"alarm_register: enter")
# 初始化告警ID映射字典、告警老化时间字典
@@ -84,10 +96,16 @@ def alarm_register():
logging.info(f"alarm_register: {task_name} is registered")
task = TasksMap.tasks_dict[task_type][task_name]
alarm_id = task.alarm_id
+ if not check_alarm_id_if_number(alarm_id):
+ logging.warnning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm")
+ continue
if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID:
logging.warnning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm")
continue
alarm_clear_time = task.alarm_clear_time
+ if not check_alarm_clear_time_if_positive_integer(alarm_clear_time):
+ logging.warnning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm")
+ continue
try:
alarm_clear_time = int(alarm_clear_time)
if alarm_clear_time <= 0:
@@ -119,6 +137,9 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di
logging.debug("task_name does not exist")
return []
alarm_id = task_alarm_id_dict[task_name]
+ clear_time = alarm_id_clear_time_dict[alarm_id]
+ if clear_time < int(time_range):
+ return []
if alarm_id not in alarm_list_dict:
logging.debug("alarm_id does not exist")
return []
@@ -126,10 +147,10 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di
logging.debug(f"get_alarm_result: alarm_list of {alarm_id} has {len(alarm_list)} elements")
# clear alarm_info older than clear time threshold
stop_index = -1
- timestamp = int(datetime.now().timestamp())
+ timestamp = datetime.now().timestamp()
for i in range(len(alarm_list)):
logging.debug(f"timestamp, alarm_list[{i}].timestamp: {timestamp}, {xalarm_gettime(alarm_list[i])}")
- if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > int(time_range):
+ if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > time_range:
stop_index = i
break
if stop_index >= 0:
diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py
index 7daf17d..f74f165 100644
--- a/src/python/syssentry/load_mods.py
+++ b/src/python/syssentry/load_mods.py
@@ -203,7 +203,8 @@ def parse_mod_conf(mod_name, mod_conf):
if not (MIN_ALARM_ID <= task.alarm_id <= MAX_ALARM_ID):
raise ValueError("Invalid alarm_id")
except ValueError:
- logging.warning("Invalid alarm_id")
+ task.alarm_id = mod_conf.get(CONF_TASK, CONF_ALARM_ID)
+ task.alarm_clear_time = mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME)
except configparser.NoOptionError:
logging.warning("Unset alarm_clear_time, use 15s as default")
--
2.27.0
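One subtle piece of the hunk above: dropping the int() cast on datetime.now().timestamp() keeps sub-second precision, so an alarm sitting right at the time_range boundary is no longer misclassified by up to a second:

```python
from datetime import datetime

ts = datetime.now().timestamp()
print(ts, int(ts), ts - int(ts))   # the old cast discarded the fraction
```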

View File

@ -1,566 +0,0 @@
From d5cb115a97e27c8270e8fb385fb3914af9ba3c34 Mon Sep 17 00:00:00 2001
From: gaoruoshu <gaoruoshu@huawei.com>
Date: Tue, 15 Oct 2024 10:00:07 +0000
Subject: [PATCH] refactor config.py and fix incorrect slow io report
Signed-off-by: gaoruoshu <gaoruoshu@huawei.com>
---
.../avg_block_io/avg_block_io.py | 155 ++-----------
.../sentryPlugins/avg_block_io/config.py | 208 ++++++++++++++++++
.../sentryPlugins/avg_block_io/module_conn.py | 9 +-
.../sentryPlugins/avg_block_io/utils.py | 72 ------
4 files changed, 238 insertions(+), 206 deletions(-)
create mode 100644 src/python/sentryPlugins/avg_block_io/config.py
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
index f3ade09..cd47919 100644
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
@@ -13,132 +13,13 @@ import signal
import configparser
import time
+from .config import read_config_log, read_config_common, read_config_algorithm, read_config_latency, read_config_iodump, read_config_stage
from .stage_window import IoWindow, IoDumpWindow
from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name
-from .utils import update_avg_and_check_abnormal, get_log_level, get_section_value
-from sentryCollector.collect_plugin import Disk_Type
+from .utils import update_avg_and_check_abnormal
CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini"
-def log_invalid_keys(not_in_list, keys_name, config_list, default_list):
- """print invalid log"""
- if config_list and not_in_list:
- logging.warning("{} in common.{} are not valid, set {}={}".format(not_in_list, keys_name, keys_name, default_list))
- elif config_list == ["default"]:
- logging.warning("Default {} use {}".format(keys_name, default_list))
-
-
-def read_config_common(config):
- """read config file, get [common] section value"""
- if not config.has_section("common"):
- report_alarm_fail("Cannot find common section in config file")
-
- try:
- disk_name = config.get("common", "disk")
- disk = [] if disk_name == "default" else disk_name.split(",")
- except configparser.NoOptionError:
- disk = []
- logging.warning("Unset common.disk, set to default")
-
- try:
- stage_name = config.get("common", "stage")
- stage = [] if stage_name == "default" else stage_name.split(",")
- except configparser.NoOptionError:
- stage = []
- logging.warning("Unset common.stage, set to default")
-
- if len(disk) > 10:
- logging.warning("Too many common.disks, record only max 10 disks")
- disk = disk[:10]
-
- try:
- iotype_name = config.get("common", "iotype").split(",")
- iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write']]
- err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write']]
-
- if err_iotype:
- report_alarm_fail("Invalid common.iotype config")
-
- except configparser.NoOptionError:
- iotype_list = ["read", "write"]
- logging.warning("Unset common.iotype, set to read,write")
-
- try:
- period_time = int(config.get("common", "period_time"))
- if not (1 <= period_time <= 300):
- raise ValueError("Invalid period_time")
- except ValueError:
- report_alarm_fail("Invalid common.period_time")
- except configparser.NoOptionError:
- period_time = 1
- logging.warning("Unset common.period_time, use 1s as default")
-
- return period_time, disk, stage, iotype_list
-
-
-def read_config_algorithm(config):
- """read config file, get [algorithm] section value"""
- if not config.has_section("algorithm"):
- report_alarm_fail("Cannot find algorithm section in config file")
-
- try:
- win_size = int(config.get("algorithm", "win_size"))
- if not (1 <= win_size <= 300):
- raise ValueError("Invalid algorithm.win_size")
- except ValueError:
- report_alarm_fail("Invalid algorithm.win_size config")
- except configparser.NoOptionError:
- win_size = 30
- logging.warning("Unset algorithm.win_size, use 30 as default")
-
- try:
- win_threshold = int(config.get("algorithm", "win_threshold"))
- if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size:
- raise ValueError("Invalid algorithm.win_threshold")
- except ValueError:
- report_alarm_fail("Invalid algorithm.win_threshold config")
- except configparser.NoOptionError:
- win_threshold = 6
- logging.warning("Unset algorithm.win_threshold, use 6 as default")
-
- return win_size, win_threshold
-
-
-def read_config_latency(config):
- """read config file, get [latency_xxx] section value"""
- common_param = {}
- for type_name in Disk_Type:
- section_name = f"latency_{Disk_Type[type_name]}"
- if not config.has_section(section_name):
- report_alarm_fail(f"Cannot find {section_name} section in config file")
-
- common_param[Disk_Type[type_name]] = get_section_value(section_name, config)
- return common_param
-
-
-def read_config_iodump(config):
- """read config file, get [iodump] section value"""
- common_param = {}
- section_name = "iodump"
- if not config.has_section(section_name):
- report_alarm_fail(f"Cannot find {section_name} section in config file")
-
- return get_section_value(section_name, config)
-
-
-def read_config_stage(config, stage, iotype_list, curr_disk_type):
- """read config file, get [STAGE_NAME_diskType] section value"""
- res = {}
- section_name = f"{stage}_{curr_disk_type}"
- if not config.has_section(section_name):
- return res
-
- for key in config[section_name]:
- if config[stage][key].isdecimal():
- res[key] = int(config[stage][key])
-
- return res
-
def init_io_win(io_dic, config, common_param):
"""initialize windows of latency, iodump, and dict of avg_value"""
@@ -192,24 +73,33 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage):
disk_list = [key for key in all_disk_set if key in config_disk]
not_in_disk_list = [key for key in config_disk if key not in all_disk_set]
+ if not config_disk and not not_in_disk_list:
+ disk_list = [key for key in all_disk_set]
+
+ if not disk_list:
+ report_alarm_fail("Cannot get valid disk name")
+
+ disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list
+
+ if not config_disk:
+ logging.info(f"Default common.disk using disk={disk_list}")
+ elif sorted(disk_list) != sorted(config_disk):
+ logging.warning(f"Set common.disk to {disk_list}")
+
stage_list = [key for key in all_stage_set if key in config_stage]
not_in_stage_list = [key for key in config_stage if key not in all_stage_set]
if not_in_stage_list:
report_alarm_fail(f"Invalid common.stage_list config, cannot set {not_in_stage_list}")
- if not config_disk and not not_in_disk_list:
- disk_list = [key for key in all_disk_set]
-
- if not config_stage and not not_in_stage_list:
+ if not config_stage:
stage_list = [key for key in all_stage_set]
- disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list
-
- if not stage_list or not disk_list:
- report_alarm_fail("Cannot get valid disk name or stage name.")
+ if not stage_list:
+ report_alarm_fail("Cannot get valid stage name.")
- log_invalid_keys(not_in_disk_list, 'disk', config_disk, disk_list)
+ if not config_stage:
+ logging.info(f"Default common.stage using stage={stage_list}")
return disk_list, stage_list
@@ -254,9 +144,8 @@ def main():
signal.signal(signal.SIGINT, sig_handler)
signal.signal(signal.SIGTERM, sig_handler)
- log_level = get_log_level(CONFIG_FILE)
+ log_level = read_config_log(CONFIG_FILE)
log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
-
logging.basicConfig(level=log_level, format=log_format)
# initialize configuration loading
@@ -274,6 +163,8 @@ def main():
# interface with the collection module via is_iocollect_valid()
io_dic["disk_list"], io_dic["stage_list"] = get_valid_disk_stage_list(io_dic, disk, stage)
+ logging.debug(f"disk={io_dic['disk_list']}, stage={io_dic['stage_list']}")
+
if "bio" not in io_dic["stage_list"]:
report_alarm_fail("Cannot run avg_block_io without bio stage")
diff --git a/src/python/sentryPlugins/avg_block_io/config.py b/src/python/sentryPlugins/avg_block_io/config.py
new file mode 100644
index 0000000..c8f45ce
--- /dev/null
+++ b/src/python/sentryPlugins/avg_block_io/config.py
@@ -0,0 +1,208 @@
+import configparser
+import logging
+import os
+
+from .module_conn import report_alarm_fail
+from sentryCollector.collect_plugin import Disk_Type
+
+
+CONF_LOG = 'log'
+CONF_LOG_LEVEL = 'level'
+LogLevel = {
+ "debug": logging.DEBUG,
+ "info": logging.INFO,
+ "warning": logging.WARNING,
+ "error": logging.ERROR,
+ "critical": logging.CRITICAL
+}
+
+CONF_COMMON = 'common'
+CONF_COMMON_DISK = 'disk'
+CONF_COMMON_STAGE = 'stage'
+CONF_COMMON_IOTYPE = 'iotype'
+CONF_COMMON_PER_TIME = 'period_time'
+
+CONF_ALGO = 'algorithm'
+CONF_ALGO_SIZE = 'win_size'
+CONF_ALGO_THRE = 'win_threshold'
+
+CONF_LATENCY = 'latency_{}'
+CONF_IODUMP = 'iodump'
+
+
+DEFAULT_PARAM = {
+ CONF_LOG: {
+ CONF_LOG_LEVEL: 'info'
+ }, CONF_COMMON: {
+ CONF_COMMON_DISK: 'default',
+ CONF_COMMON_STAGE: 'default',
+ CONF_COMMON_IOTYPE: 'read,write',
+ CONF_COMMON_PER_TIME: 1
+ }, CONF_ALGO: {
+ CONF_ALGO_SIZE: 30,
+ CONF_ALGO_THRE: 6
+ }, 'latency_nvme_ssd': {
+ 'read_avg_lim': 300,
+ 'write_avg_lim': 300,
+ 'read_avg_time': 3,
+ 'write_avg_time': 3,
+ 'read_tot_lim': 500,
+ 'write_tot_lim': 500,
+ }, 'latency_sata_ssd' : {
+ 'read_avg_lim': 10000,
+ 'write_avg_lim': 10000,
+ 'read_avg_time': 3,
+ 'write_avg_time': 3,
+ 'read_tot_lim': 50000,
+ 'write_tot_lim': 50000,
+ }, 'latency_sata_hdd' : {
+ 'read_avg_lim': 15000,
+ 'write_avg_lim': 15000,
+ 'read_avg_time': 3,
+ 'write_avg_time': 3,
+ 'read_tot_lim': 50000,
+ 'write_tot_lim': 50000
+ }, CONF_IODUMP: {
+ 'read_iodump_lim': 0,
+ 'write_iodump_lim': 0
+ }
+}
+
+
+def get_section_value(section_name, config):
+ common_param = {}
+ config_sec = config[section_name]
+ for config_key in DEFAULT_PARAM[section_name]:
+ if config_key in config_sec:
+ if not config_sec[config_key].isdecimal():
+ report_alarm_fail(f"Invalid {section_name}.{config_key} config.")
+ common_param[config_key] = int(config_sec[config_key])
+ else:
+ common_param[config_key] = DEFAULT_PARAM[section_name][config_key]
+ logging.warning(f"Unset {section_name}.{config_key} in config file, use {common_param[config_key]} as default")
+ return common_param
+
+
+def read_config_log(filename):
+ """read config file, get [log] section value"""
+ default_log_level = DEFAULT_PARAM[CONF_LOG][CONF_LOG_LEVEL]
+ if not os.path.exists(filename):
+ return LogLevel.get(default_log_level)
+
+ config = configparser.ConfigParser()
+ config.read(filename)
+
+ log_level = config.get(CONF_LOG, CONF_LOG_LEVEL, fallback=default_log_level)
+ if log_level.lower() in LogLevel:
+ return LogLevel.get(log_level.lower())
+ return LogLevel.get(default_log_level)
+
+
+def read_config_common(config):
+ """read config file, get [common] section value"""
+ if not config.has_section(CONF_COMMON):
+ report_alarm_fail(f"Cannot find {CONF_COMMON} section in config file")
+
+ try:
+ disk_name = config.get(CONF_COMMON, CONF_COMMON_DISK).lower()
+ disk = [] if disk_name == "default" else disk_name.split(",")
+ except configparser.NoOptionError:
+ disk = []
+ logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_DISK}, set to default")
+
+ try:
+ stage_name = config.get(CONF_COMMON, CONF_COMMON_STAGE).lower()
+ stage = [] if stage_name == "default" else stage_name.split(",")
+ except configparser.NoOptionError:
+ stage = []
+ logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_STAGE}, set to default")
+
+ if len(disk) > 10:
+ logging.warning(f"Too many {CONF_COMMON}.disks, record only max 10 disks")
+ disk = disk[:10]
+
+ try:
+ iotype_name = config.get(CONF_COMMON, CONF_COMMON_IOTYPE).lower().split(",")
+ iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write']]
+ err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write']]
+
+ if err_iotype:
+ report_alarm_fail(f"Invalid {CONF_COMMON}.{CONF_COMMON_IOTYPE} config")
+
+ except configparser.NoOptionError:
+ iotype_list = DEFAULT_PARAM[CONF_COMMON][CONF_COMMON_IOTYPE].split(",")
+ logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_IOTYPE}, use {iotupe_list} as default")
+
+ try:
+ period_time = int(config.get(CONF_COMMON, CONF_COMMON_PER_TIME))
+ if not (1 <= period_time <= 300):
+ raise ValueError("Invalid period_time")
+ except ValueError:
+ report_alarm_fail(f"Invalid {CONF_COMMON}.{CONF_COMMON_PER_TIME}")
+ except configparser.NoOptionError:
+ period_time = DEFAULT_PARAM[CONF_COMMON][CONF_COMMON_PER_TIME]
+ logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_PER_TIME}, use {period_time} as default")
+
+ return period_time, disk, stage, iotype_list
+
+
+def read_config_algorithm(config):
+ """read config file, get [algorithm] section value"""
+ if not config.has_section(CONF_ALGO):
+ report_alarm_fail(f"Cannot find {CONF_ALGO} section in config file")
+
+ try:
+ win_size = int(config.get(CONF_ALGO, CONF_ALGO_SIZE))
+ if not (1 <= win_size <= 300):
+ raise ValueError(f"Invalid {CONF_ALGO}.{CONF_ALGO_SIZE}")
+ except ValueError:
+ report_alarm_fail(f"Invalid {CONF_ALGO}.{CONF_ALGO_SIZE} config")
+ except configparser.NoOptionError:
+ win_size = DEFAULT_PARAM[CONF_ALGO][CONF_ALGO_SIZE]
+ logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_SIZE}, use {win_size} as default")
+
+ try:
+ win_threshold = int(config.get(CONF_ALGO, CONF_ALGO_THRE))
+ if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size:
+ raise ValueError(f"Invalid {CONF_ALGO}.{CONF_ALGO_THRE}")
+ except ValueError:
+ report_alarm_fail(f"Invalid {CONF_ALGO}.{CONF_ALGO_THRE} config")
+ except configparser.NoOptionError:
+ win_threshold = DEFAULT_PARAM[CONF_ALGO][CONF_ALGO_THRE]
+ logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_THRE}, use {win_threshold} as default")
+
+ return win_size, win_threshold
+
+
+def read_config_latency(config):
+ """read config file, get [latency_xxx] section value"""
+ common_param = {}
+ for type_name in Disk_Type:
+ section_name = CONF_LATENCY.format(Disk_Type[type_name])
+ if not config.has_section(section_name):
+ report_alarm_fail(f"Cannot find {section_name} section in config file")
+
+ common_param[Disk_Type[type_name]] = get_section_value(section_name, config)
+ return common_param
+
+
+def read_config_iodump(config):
+ """read config file, get [iodump] section value"""
+ if not config.has_section(CONF_IODUMP):
+ report_alarm_fail(f"Cannot find {CONF_IODUMP} section in config file")
+
+ return get_section_value(CONF_IODUMP, config)
+
+
+def read_config_stage(config, stage, iotype_list, curr_disk_type):
+ """read config file, get [STAGE_NAME_diskType] section value"""
+ res = {}
+ section_name = f"{stage}_{curr_disk_type}"
+ if not config.has_section(section_name):
+ return res
+
+ for key in config[section_name]:
+ if config[section_name][key].isdecimal():
+ res[key] = int(config[section_name][key])
+
+ return res
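For orientation, here is a sketch of the /etc/sysSentry/plugins/avg_block_io.ini layout this parser reads, assembled purely from the DEFAULT_PARAM table above (every value shown is the fallback): [common], [algorithm], the three latency_* sections and [iodump] must exist or the plugin aborts via report_alarm_fail(), while [log] and any individual key may be omitted and falls back to its default with a warning.

```ini
[log]
level=info

[common]
disk=default
stage=default
iotype=read,write
period_time=1

[algorithm]
win_size=30
win_threshold=6

# the latency sections must be present; keys left unset take the
# DEFAULT_PARAM values (e.g. read_avg_lim=300 for nvme_ssd)
[latency_nvme_ssd]
[latency_sata_ssd]
[latency_sata_hdd]

[iodump]
read_iodump_lim=0
write_iodump_lim=0
```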
diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
index 8d6f429..cbdaad4 100644
--- a/src/python/sentryPlugins/avg_block_io/module_conn.py
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
@@ -29,12 +29,16 @@ def sig_handler(signum, _f):
def avg_get_io_data(io_dic):
"""get_io_data from sentryCollector"""
+ logging.debug(f"send to sentryCollector get_io_data: period={io_dic['period_time']}, "
+ f"disk={io_dic['disk_list']}, stage={io_dic['stage_list']}, iotype={io_dic['iotype_list']}")
res = get_io_data(io_dic["period_time"], io_dic["disk_list"], io_dic["stage_list"], io_dic["iotype_list"])
return check_result_validation(res, 'get io data')
def avg_is_iocollect_valid(io_dic, config_disk, config_stage):
"""is_iocollect_valid from sentryCollector"""
+ logging.debug(f"send to sentryCollector is_iocollect_valid: period={io_dic['period_time']}, "
+ f"disk={config_disk}, stage={config_stage}")
res = is_iocollect_valid(io_dic["period_time"], config_disk, config_stage)
return check_result_validation(res, 'check config validation')
@@ -79,7 +83,7 @@ def process_report_data(disk_name, rw, io_data):
# io press
ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq']
for stage_name in ctrl_stage:
- abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data)
+ abnormal, abnormal_list = is_abnormal((disk_name, stage_name, rw), io_data)
if not abnormal:
continue
msg["reason"] = "IO press"
@@ -117,6 +121,7 @@ def process_report_data(disk_name, rw, io_data):
def get_disk_type_by_name(disk_name):
+ logging.debug(f"send to sentryCollector get_disk_type: disk_name={disk_name}")
res = get_disk_type(disk_name)
disk_type_str = check_result_validation(res, f'Invalid disk type {disk_name}')
try:
@@ -126,4 +131,4 @@ def get_disk_type_by_name(disk_name):
except ValueError:
report_alarm_fail(f"Failed to get disk type for {disk_name}")
- return Disk_Type[curr_disk_type]
\ No newline at end of file
+ return Disk_Type[curr_disk_type]
diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py
index c381c07..1bfd4e8 100644
--- a/src/python/sentryPlugins/avg_block_io/utils.py
+++ b/src/python/sentryPlugins/avg_block_io/utils.py
@@ -8,84 +8,12 @@
# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
# PURPOSE.
# See the Mulan PSL v2 for more details.
-import configparser
import logging
import os
AVG_VALUE = 0
AVG_COUNT = 1
-CONF_LOG = 'log'
-CONF_LOG_LEVEL = 'level'
-LogLevel = {
- "debug": logging.DEBUG,
- "info": logging.INFO,
- "warning": logging.WARNING,
- "error": logging.ERROR,
- "critical": logging.CRITICAL
-}
-
-
-DEFAULT_PARAM = {
- 'latency_nvme_ssd': {
- 'read_avg_lim': 300,
- 'write_avg_lim': 300,
- 'read_avg_time': 3,
- 'write_avg_time': 3,
- 'read_tot_lim': 500,
- 'write_tot_lim': 500,
- }, 'latency_sata_ssd' : {
- 'read_avg_lim': 10000,
- 'write_avg_lim': 10000,
- 'read_avg_time': 3,
- 'write_avg_time': 3,
- 'read_tot_lim': 50000,
- 'write_tot_lim': 50000,
- }, 'latency_sata_hdd' : {
- 'read_avg_lim': 15000,
- 'write_avg_lim': 15000,
- 'read_avg_time': 3,
- 'write_avg_time': 3,
- 'read_tot_lim': 50000,
- 'write_tot_lim': 50000
- }, 'iodump': {
- 'read_iodump_lim': 0,
- 'write_iodump_lim': 0
- }
-}
-
-
-def get_section_value(section_name, config):
- common_param = {}
- config_sec = config[section_name]
- for config_key in DEFAULT_PARAM[section_name]:
- if config_key in config_sec:
- if not config_sec[config_key].isdecimal():
- report_alarm_fail(f"Invalid {section_name}.{config_key} config.")
- common_param[config_key] = int(config_sec[config_key])
- else:
- logging.warning(f"Unset {section_name}.{config_key} in config file, use {DEFAULT_PARAM[section_name][config_key]} as default")
- common_param[config_key] = DEFAULT_PARAM[section_name][config_key]
- return common_param
-
-
-def get_log_level(filename):
- if not os.path.exists(filename):
- return logging.INFO
-
- try:
- config = configparser.ConfigParser()
- config.read(filename)
- if not config.has_option(CONF_LOG, CONF_LOG_LEVEL):
- return logging.INFO
- log_level = config.get(CONF_LOG, CONF_LOG_LEVEL)
-
- if log_level.lower() in LogLevel:
- return LogLevel.get(log_level.lower())
- return logging.INFO
- except configparser.Error:
- return logging.INFO
-
def get_nested_value(data, keys):
"""get data from nested dict"""
--
2.27.0

View File

@ -1,92 +0,0 @@
From d74076f4b772822de4f5bee1c8a778dd6b1771d2 Mon Sep 17 00:00:00 2001
From: shixuantong <shixuantong1@huawei.com>
Date: Wed, 11 Dec 2024 15:25:33 +0800
Subject: [PATCH] set logrotate
---
config/logrotate | 9 ---------
config/logrotate-sysSentry.conf | 35 +++++++++++++++++++++++++++++++++
src/sh/logrotate-sysSentry.cron | 13 ++++++++++++
3 files changed, 48 insertions(+), 9 deletions(-)
delete mode 100644 config/logrotate
create mode 100644 config/logrotate-sysSentry.conf
create mode 100644 src/sh/logrotate-sysSentry.cron
diff --git a/config/logrotate b/config/logrotate
deleted file mode 100644
index 3dc77f5..0000000
--- a/config/logrotate
+++ /dev/null
@@ -1,9 +0,0 @@
-/var/log/sysSentry/*.log {
- compress
- missingok
- notifempty
- copytruncate
- rotate 2
- size +4096k
- hourly
-}
diff --git a/config/logrotate-sysSentry.conf b/config/logrotate-sysSentry.conf
new file mode 100644
index 0000000..cf5f994
--- /dev/null
+++ b/config/logrotate-sysSentry.conf
@@ -0,0 +1,35 @@
+# keep 4 hours worth of backlogs
+rotate 4
+
+# create new (empty) log files after rotating old ones
+create
+
+# compress log files
+compress
+
+# if a log file does not exist, go on to the next one without an error msg
+missingok
+
+# do not rotate the log if it is empty
+notifempty
+
+copytruncate
+
+# ignore any following matches of a log file.
+# Note that order is significant: later duplicates do not override the first match.
+# require logrotate >= 3.21.0
+ignoreduplicates
+
+/var/log/sysSentry/sysSentry.log {
+ rotate 8
+ size +4096k
+}
+
+/var/log/sysSentry/cpu_sentry.log {
+ rotate 2
+ size +2048k
+}
+
+/var/log/sysSentry/*.log {
+ size +4096k
+}
diff --git a/src/sh/logrotate-sysSentry.cron b/src/sh/logrotate-sysSentry.cron
new file mode 100644
index 0000000..64d02f9
--- /dev/null
+++ b/src/sh/logrotate-sysSentry.cron
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+TMPF=`mktemp /tmp/logrotate-sysSentry.XXXXXXXXX`
+
+/usr/sbin/logrotate /etc/logrotate-sysSentry.conf -v --log=$TMPF -s /var/lib/logrotate-syssentry/logrotate.status
+EXITVALUE=$?
+if [ $EXITVALUE != 0 ]; then
+ /bin/logger -t logrotate "ALERT exited abnormally with [$EXITVALUE], for details, see /var/log/sysSentry/logrotate.log"
+ /bin/logger -t logrotate -f $TMPF
+fi
+rm -rf $TMPF
+rm -rf /var/lib/logrotate-syssentry/logrotate.status
+exit $EXITVALUE
--
2.27.0
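As a quick sanity check for a layout like this, logrotate's debug mode parses the config and reports what it would rotate without touching any log or state file (the paths here are the ones the cron script above uses):

```shell
/usr/sbin/logrotate -d -s /var/lib/logrotate-syssentry/logrotate.status /etc/logrotate-sysSentry.conf
```

Note that the `ignoreduplicates` directive needs logrotate >= 3.21.0, as the comment in the config says; older versions will complain about the unknown directive.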

View File

@ -1,38 +0,0 @@
From 4abad77067557234d938de3914094c80181030c1 Mon Sep 17 00:00:00 2001
From: jwolf <523083921@qq.com>
Date: Fri, 30 Aug 2024 14:30:46 +0800
Subject: [PATCH] must be integer
---
c/catcli/catlib/cli_param_checker.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/src/c/catcli/catlib/cli_param_checker.c b/src/c/catcli/catlib/cli_param_checker.c
index e400428..5b38402 100644
--- a/src/c/catcli/catlib/cli_param_checker.c
+++ b/src/c/catcli/catlib/cli_param_checker.c
@@ -17,8 +17,9 @@ void checkset_cpu_usage_percentage(char *getopt_optarg, catcli_request_body *p_r
if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX || strchr(getopt_optarg, '.') != NULL) {
strncpy(errs->patrol_module_err,
"\"cpu_utility \" must be an integer greater in the range (0,100],correct \"-u, --cpu_utility\"\n", MAX_ERR_LEN);
+ } else {
+ p_request_body->cpu_utility = (int)cpu_utility;
}
- p_request_body->cpu_utility = (int)cpu_utility;
}
void checkset_cpulist(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs)
@@ -73,8 +74,9 @@ void checkset_patrol_time(char *getopt_optarg, catcli_request_body *p_request_bo
strncpy(errs->patrol_time_err,
"\"patrol_second\" must be a number in the range of (0,INT_MAX] ,correct \"-t, --patrol_second\"\n",
MAX_ERR_LEN);
+ } else {
+ p_request_body->patrol_second = (int)second;
}
- p_request_body->patrol_second = (int)second;
}
void checkset_patrol_type(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs)
--
2.27.0

View File

@ -1,155 +0,0 @@
From 3f6e4d12618597b5aab6b0633f1bda800526ea54 Mon Sep 17 00:00:00 2001
From: gaoruoshu <gaoruoshu@huawei.com>
Date: Wed, 14 Aug 2024 21:10:20 +0800
Subject: [PATCH] split cpu_sentry and syssentry
---
src/python/syssentry/cpu_alarm.py | 42 +++++++++++++++++++++++++
src/python/syssentry/syssentry.py | 52 ++++++-------------------------
2 files changed, 52 insertions(+), 42 deletions(-)
diff --git a/src/python/syssentry/cpu_alarm.py b/src/python/syssentry/cpu_alarm.py
index d972c42..0b1642b 100644
--- a/src/python/syssentry/cpu_alarm.py
+++ b/src/python/syssentry/cpu_alarm.py
@@ -1,6 +1,7 @@
import re
import math
import logging
+import socket
from enum import Enum
from .utils import execute_command
@@ -15,6 +16,12 @@ BINARY = 2
MIN_DATA_LEN = 0
MAX_DATA_LEN = 999
+PARAM_REP_LEN = 3
+PARAM_TYPE_LEN = 1
+PARAM_MODULE_LEN = 1
+PARAM_TRANS_TO_LEN = 2
+PARAM_DATA_LEN = 3
+
class Type(Enum):
CE = 0x00
@@ -207,3 +214,38 @@ def check_fixed_param(data, expect):
raise ValueError("expected str param is not valid")
return data
raise NotImplementedError("unexpected param type")
+
+
+def cpu_alarm_recv(server_socket: socket.socket):
+ try:
+ client_socket, _ = server_socket.accept()
+ logging.debug("cpu alarm fd listen ok")
+
+ data = client_socket.recv(PARAM_REP_LEN)
+ check_fixed_param(data, "REP")
+
+ data = client_socket.recv(PARAM_TYPE_LEN)
+ _type = check_fixed_param(data, Type)
+
+ data = client_socket.recv(PARAM_MODULE_LEN)
+ module = check_fixed_param(data, Module)
+
+ data = client_socket.recv(PARAM_TRANS_TO_LEN)
+ trans_to = check_fixed_param(data, TransTo)
+
+ data = client_socket.recv(PARAM_DATA_LEN)
+ data_len = check_fixed_param(data, (MIN_DATA_LEN, MAX_DATA_LEN))
+
+ data = client_socket.recv(data_len)
+
+ command, event_type, socket_id, core_id = parser_cpu_alarm_info(data)
+ except socket.error:
+ logging.error("socket error")
+ return
+ except (ValueError, OSError, UnicodeError, TypeError, NotImplementedError):
+ logging.error("server recv cpu alarm msg failed!")
+ client_socket.close()
+ return
+
+ upload_bmc(_type, module, command, event_type, socket_id, core_id)
+
diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py
index 3d5cb8d..f93956e 100644
--- a/src/python/syssentry/syssentry.py
+++ b/src/python/syssentry/syssentry.py
@@ -36,8 +36,15 @@ from .heartbeat import (heartbeat_timeout_chk, heartbeat_fd_create,
from .result import RESULT_MSG_HEAD_LEN, RESULT_MSG_MAGIC_LEN, RESULT_MAGIC
from .result import RESULT_LEVEL_ERR_MSG_DICT, ResultLevel
from .utils import get_current_time_string
-from .cpu_alarm import (upload_bmc, check_fixed_param, parser_cpu_alarm_info,
- Type, Module, TransTo, MIN_DATA_LEN, MAX_DATA_LEN)
+
+
+CPU_EXIST = True
+try:
+ from .cpu_alarm import cpu_alarm_recv
+except ImportError:
+ CPU_EXIST = False
+ logging.debug("Cannot find cpu sentry mod")
+
INSPECTOR = None
@@ -76,45 +83,6 @@ PID_FILE_FLOCK = None
RESULT_SOCKET_PATH = "/var/run/sysSentry/result.sock"
CPU_ALARM_SOCKET_PATH = "/var/run/sysSentry/report.sock"
-PARAM_REP_LEN = 3
-PARAM_TYPE_LEN = 1
-PARAM_MODULE_LEN = 1
-PARAM_TRANS_TO_LEN = 2
-PARAM_DATA_LEN = 3
-
-
-def cpu_alarm_recv(server_socket: socket.socket):
- try:
- client_socket, _ = server_socket.accept()
- logging.debug("cpu alarm fd listen ok")
-
- data = client_socket.recv(PARAM_REP_LEN)
- check_fixed_param(data, "REP")
-
- data = client_socket.recv(PARAM_TYPE_LEN)
- _type = check_fixed_param(data, Type)
-
- data = client_socket.recv(PARAM_MODULE_LEN)
- module = check_fixed_param(data, Module)
-
- data = client_socket.recv(PARAM_TRANS_TO_LEN)
- trans_to = check_fixed_param(data, TransTo)
-
- data = client_socket.recv(PARAM_DATA_LEN)
- data_len = check_fixed_param(data, (MIN_DATA_LEN, MAX_DATA_LEN))
-
- data = client_socket.recv(data_len)
-
- command, event_type, socket_id, core_id = parser_cpu_alarm_info(data)
- except socket.error:
- logging.error("socket error")
- return
- except (ValueError, OSError, UnicodeError, TypeError, NotImplementedError):
- logging.error("server recv cpu alarm msg failed!")
- client_socket.close()
- return
-
- upload_bmc(_type, module, command, event_type, socket_id, core_id)
def msg_data_process(msg_data):
@@ -480,7 +448,7 @@ def main_loop():
server_result_recv(server_result_fd)
elif event_fd == heartbeat_fd.fileno():
heartbeat_recv(heartbeat_fd)
- elif event_fd == cpu_alarm_fd.fileno():
+ elif CPU_EXIST and event_fd == cpu_alarm_fd.fileno():
cpu_alarm_recv(cpu_alarm_fd)
else:
continue
--
2.33.0
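For context on the receive loop above: the message on /var/run/sysSentry/report.sock is framed as a 3-byte "REP" magic, one type byte, one module byte, two trans_to bytes, and a 3-byte length field bounded by MIN_DATA_LEN..MAX_DATA_LEN, followed by the payload. A hypothetical client sketch follows; the placeholder bytes stand in for the real Type/Module/TransTo enum encodings in cpu_alarm.py, and the 3-digit ASCII length is an assumption suggested by the 0..999 bounds:

```python
import socket

payload = b"..."  # hypothetical body, parsed by parser_cpu_alarm_info()
msg = (
    b"REP"                                  # PARAM_REP_LEN = 3 magic
    + b"\x00"                               # type byte (placeholder)
    + b"\x00"                               # module byte (placeholder)
    + b"\x00\x00"                           # trans_to bytes (placeholder)
    + str(len(payload)).zfill(3).encode()   # PARAM_DATA_LEN = 3 length field
    + payload
)
with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s:
    s.connect("/var/run/sysSentry/report.sock")
    s.sendall(msg)
```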

Binary file not shown.

BIN
sysSentry-1.0.3.tar.gz Normal file

Binary file not shown.

View File

@ -3,101 +3,24 @@
Summary: System Inspection Framework
Name: sysSentry
Version: 1.0.2
Release: 67
Version: 1.0.3
Release: 1
License: Mulan PSL v2
Group: System Environment/Daemons
Source0: https://gitee.com/openeuler/sysSentry/releases/download/v%{version}/%{name}-%{version}.tar.gz
BuildRoot: %{_builddir}/%{name}-root
Patch1: fix-version-in-setup.py.patch
Patch2: Fix-the-problem-that-function-cpu_report_result-is-c.patch
Patch3: fix-error-handling.patch
Patch4: fix-result-when-process-output-is-None.patch
Patch5: cpu_utility-and-cpu_patrol-must-be-an-integer.patch
Patch6: setting-parameters-must-be-integer.patch
Patch7: param-must-be-integer.patch
Patch8: add-deleted-code-to-plugin-rasdaemon.patch
Patch9: Remove-ANSI-escape-sequences.patch
Patch10: split-cpu_sentry-and-syssentry.patch
Patch11: fix-configparser.InterpolationSyntaxError.patch
Patch12: fix-syssentry-fails-to-be-started-when-cpu_sentry-is.patch
Patch13: add-collect-module-to-sysSentry.patch
Patch14: feature-add-avg_block_io-plugin.patch
Patch15: fix-some-about-collect-module-and-avg-block-io.patch
Patch16: add-ai-threshold-slow-io-detection-plugin.patch
Patch17: optimize-the-handing-of-cat-cli-error-msg-in-cpu_sentry.patch
Patch18: over-threshold-should-be-warn-level-log-in-cat-cli.patch
Patch19: fix-bug-step-2-about-collect-module-and-avg-block-io.patch
Patch20: add-log-level-and-change-log-format.patch
Patch21: fix-ai_block_io-some-issues.patch
Patch22: add-pyxalarm-and-pySentryNotify-add-multi-users-supp.patch
Patch23: add-sentryctl-get_alarm-module_name-s-time_range-d.patch
Patch24: fix-python-3.7-not-support-list-bool-type.patch
Patch25: avg_block_io-send-alarm-to-xalarmd.patch
Patch26: bugfix-typo.patch
Patch27: fix-config-relative-some-issues.patch
Patch28: update-log-when-it-is-not-lock-collect.patch
Patch29: change-alarm-length.patch
Patch30: add-detail-time.patch
Patch31: xalarm-add-alarm-msg-length-to-8192.patch
Patch32: ai_block_io-adapt-alarm-module.patch
Patch33: add-log-for-improving-maintainability.patch
Patch34: add-get_disk_type-and-fix-some-bugs.patch
Patch35: diff-disk-type-use-diff-config.patch
Patch36: add-parameter-time_range-alarm_id-and-alarm_clear_ti.patch
Patch37: fix-xalarm_Report-function-not-refuse-alarm-msg-exce.patch
Patch38: fix-xalarm_upgrade-not-return-val-and-fail-when-thre.patch
Patch39: add-log-for-xalarm-when-sending-msg-and-clean-invali.patch
Patch40: add-xalarm-cleanup-invalid-server-socket-peroidly.patch
Patch41: ai_block_io-support-stage-and-iotype.patch
Patch42: fix-io_dump-for-collect-module.patch
Patch43: add-root-cause-analysis.patch
Patch44: update-collect-log.patch
Patch45: modify-abnormal-stack-when-the-disk-field-is-not-con.patch
Patch46: ai_block_io-fix-some-bugs.patch
Patch47: refactor-config.py-and-bugfix-uncorrect-slow-io-repo.patch
Patch48: get_io_data-failed-wont-stop-avg_block_io-and-del-di.patch
Patch49: fix-ai_block_io-root-cause-bug.patch
Patch50: listen-thread-of-collect-module-exits-occasionally.patch
Patch51: precise-alarm-query-time.patch
Patch52: fix-word-error.patch
Patch53: optimize-log-printing.patch
Patch54: enrich-alert-info-about-kernel-stack.patch
Patch55: ai_block_io-lack-section-exit.patch
Patch56: fix-xalarm-non-uniform-log-formatting.patch
Patch57: update-collect-plugin-period-max.patch
Patch58: fix-frequency-param-check-bug.patch
Patch59: ai_block_io-support-iodump.patch
Patch60: fix-get_alarm-error.patch
Patch61: fix-alarm_info-newline-break-error.patch
Patch62: add-hbm-online-repair.patch
Patch63: fix-hbm-online-repair-notice-and-efi-create.patch
Patch64: get_alarm-d-abnomal-display.patch
Patch65: modify-logrotate-rule.patch
Patch66: fix-excessive-CPU-usage.patch
Patch67: fix-uint8-bug-and-change-isolation-default-value.patch
Patch68: fix-write-file-return-code-bug.patch
Patch69: change-avg_block_io-config.patch
Patch70: ai_block_io-support-absolute-threshold-lower-limit.patch
Patch71: ai_block_io-fix-some-config-parameters-parse-bug.patch
Patch72: update-nvme-config.patch
Patch73: make-debug-msg-clear.patch
Patch74: add-boundary-check-for-settings.patch
Patch75: change-status-of-period-task-and-sort-mod-file.patch
Patch76: uniform-avg_block_io-log-and-ai_block_io-log.patch
Patch77: set-logrotate.patch
Patch78: hbm_online_repair-add-unload-driver.patch
Patch79: fix-test_ai_block_io-fail.patch
BuildRequires: cmake gcc-c++
BuildRequires: python3 python3-setuptools
BuildRequires: json-c-devel
BuildRequires: chrpath
# for test
BuildRequires: elfutils-devel clang libbpf-devel bpftool
BuildRequires: python3-numpy python3-pytest
Requires: pyxalarm = %{version}
Requires: libbpf
%define PYTHON_VERSION %{python3_version}
%define PKGVER syssentry-%{version}-py%{PYTHON_VERSION}.egg-info
%description
sysSentry provides framework tools for system inspection.
@ -119,15 +42,6 @@ Provides: libxalarm-devel = %{version}
%description -n libxalarm-devel
This package provides developer tools for the libxalarm.
%package -n cpu_sentry
Summary: CPU fault inspection program
Requires: procps-ng
Recommends: sysSentry = %{version}-%{release}
Recommends: ipmitool
%description -n cpu_sentry
This package provides CPU fault detection
%package -n avg_block_io
Summary: Supports slow I/O detection
Requires: sysSentry = %{version}-%{release}
@ -182,92 +96,10 @@ This package provides hbm_online_repair for the sysSentry.
%autosetup -n %{name}-%{version} -p1
%build
# xalarm
sh build/build.sh -b %{buildroot}%{_libdir}
# sysSentry
pushd src/python
python3 setup.py build
popd
pushd src/c/catcli/catlib
cmake -B ./build/ -S . -D CMAKE_INSTALL_PREFIX=/usr/local -D CMAKE_BUILD_TYPE=Release
pushd build
make
popd
popd
# hbm_online_repair
pushd src/c/hbm_online_repair
make
popd
%make_build
%install
# sysSentry
mkdir -p %{buildroot}%{_bindir}
mkdir -p %{buildroot}%{_unitdir}
mkdir -p %{buildroot}%{_var}/log/sysSentry
install src/python/syssentry/sentryctl %{buildroot}%{_bindir}
install -d -m 700 %{buildroot}/etc/sysSentry/
install -d -m 700 %{buildroot}/etc/sysSentry/tasks/
install -d -m 700 %{buildroot}/etc/sysSentry/plugins/
install -m 600 config/inspect.conf %{buildroot}%{_sysconfdir}/sysSentry
install -m 600 service/sysSentry.service %{buildroot}%{_unitdir}
# rasdaemon
install config/tasks/rasdaemon.mod %{buildroot}/etc/sysSentry/tasks/
# xalarm
sh build/build.sh -i %{buildroot}%{_libdir}
install -m 600 config/xalarm.conf %{buildroot}%{_sysconfdir}/sysSentry
install -d %{buildroot}%{_libdir}
install -d %{buildroot}%{_includedir}/xalarm
install -m 600 service/xalarmd.service %{buildroot}%{_unitdir}
install -m 644 src/libso/xalarm/register_xalarm.h %{buildroot}%{_includedir}/xalarm/register_xalarm.h
# sentryCollector
install -m 600 config/collector.conf %{buildroot}%{_sysconfdir}/sysSentry
install -m 600 service/sentryCollector.service %{buildroot}%{_unitdir}
# cpu sentry
install config/tasks/cpu_sentry.mod %{buildroot}/etc/sysSentry/tasks/
install config/plugins/cpu_sentry.ini %{buildroot}/etc/sysSentry/plugins/cpu_sentry.ini
install src/c/catcli/catlib/build/cat-cli %{buildroot}%{_bindir}/cat-cli
install src/c/catcli/catlib/build/plugin/cpu_patrol/libcpu_patrol.so %{buildroot}%{_libdir}
chrpath -d %{buildroot}%{_bindir}/cat-cli
chrpath -d %{buildroot}%{_libdir}/libcpu_patrol.so
# avg_block_io
install config/tasks/avg_block_io.mod %{buildroot}/etc/sysSentry/tasks/
install config/plugins/avg_block_io.ini %{buildroot}/etc/sysSentry/plugins/avg_block_io.ini
# ai_block_io
install config/tasks/ai_block_io.mod %{buildroot}/etc/sysSentry/tasks/
install config/plugins/ai_block_io.ini %{buildroot}/etc/sysSentry/plugins/ai_block_io.ini
# hbm_online_repair
mkdir -p %{buildroot}/etc/sysconfig/
install config/tasks/hbm_online_repair.mod %{buildroot}/etc/sysSentry/tasks/
install src/c/hbm_online_repair/hbm_online_repair %{buildroot}%{_bindir}
install src/c/hbm_online_repair/hbm_online_repair.env %{buildroot}/etc/sysconfig/hbm_online_repair.env
# logrotate
mkdir -p %{buildroot}%{_localstatedir}/lib/logrotate-syssentry
mkdir -p %{buildroot}%{_sysconfdir}/cron.hourly
install -m 0600 config/logrotate-sysSentry.conf %{buildroot}%{_sysconfdir}/logrotate-sysSentry.conf
install -m 0500 src/sh/logrotate-sysSentry.cron %{buildroot}%{_sysconfdir}/cron.hourly/logrotate-sysSentry
pushd src/python
python3 setup.py install -O1 --root=$RPM_BUILD_ROOT --record=SENTRY_FILES
cat SENTRY_FILES | grep -v register_xalarm.* | grep -v sentry_notify.* > SENTRY_FILES.tmp
mv SENTRY_FILES.tmp SENTRY_FILES
popd
%check
PYTHONPATH=%{buildroot}%{python3_sitelib} %{__python3} -m pytest selftest/test/
%pre
%make_install
%post
/sbin/ldconfig
@ -287,28 +119,36 @@ rm -rf /var/run/sysSentry | :
%postun
/sbin/ldconfig
%clean
rm -rf %{buildroot}
%files -f src/python/SENTRY_FILES
%files
%defattr(0550,root,root)
%dir %attr(0550,root,root) %{python3_sitelib}/xalarm
%attr(0550,root,root) %{python3_sitelib}/xalarm
%attr(0550,root,root) %{python3_sitelib}/syssentry
%attr(0550,root,root) %{python3_sitelib}/%{PKGVER}
%attr(0550,root,root) %{python3_sitelib}/sentryCollector
%attr(0550,root,root) %{python3_sitelib}/sentryPlugins/avg_block_io
%attr(0550,root,root) %{python3_sitelib}/sentryPlugins/ai_block_io
# sysSentry
%attr(0500,root,root) %{_bindir}/sentryctl
%attr(0550,root,root) %{_bindir}/syssentry
%attr(0550,root,root) %{_bindir}/ebpf_collector
%attr(0750,root,root) %config(noreplace) %{_var}/log/sysSentry
%attr(0750,root,root) %config(noreplace) %{_sysconfdir}/sysSentry
%attr(0750,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks
%attr(0750,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/plugins
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/inspect.conf
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks/rasdaemon.mod
%attr(0600,root,root) %{_unitdir}/sysSentry.service
%exclude %{python3_sitelib}/sentryCollector/collect_plugin.py
%exclude %{python3_sitelib}/xalarm/register_xalarm.py
%exclude %{python3_sitelib}/xalarm/sentry_notify.py
%exclude %{python3_sitelib}/syssentry/__pycache__
%exclude %{python3_sitelib}/sentryCollector/__pycache__
%exclude %{python3_sitelib}/xalarm/__pycache__
%exclude %{_sysconfdir}/sysSentry/tasks/ai_block_io.mod
%exclude %{_sysconfdir}/sysSentry/plugins/ai_block_io.ini
%exclude %{_sysconfdir}/sysSentry/tasks/avg_block_io.mod
%exclude %{_sysconfdir}/sysSentry/plugins/avg_block_io.ini
# xalarm
%attr(0550,root,root) %{_bindir}/xalarmd
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/xalarm.conf
@ -316,39 +156,14 @@ rm -rf %{buildroot}
# logrotate
%dir %{_localstatedir}/lib/logrotate-syssentry
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/logrotate-sysSentry.conf
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/logrotate-sysSentry.conf
%attr(0500,root,root) %{_sysconfdir}/cron.hourly/logrotate-sysSentry
# cpu inspection module
%exclude %{_sysconfdir}/sysSentry/tasks/cpu_sentry.mod
%exclude %{_sysconfdir}/sysSentry/plugins/cpu_sentry.ini
%exclude %{_bindir}/cpu_sentry
%exclude %{_bindir}/cat-cli
%exclude %{python3_sitelib}/syssentry/cpu_*
%exclude %{python3_sitelib}/syssentry/*/cpu_*
# avg block io
%exclude %{_sysconfdir}/sysSentry/tasks/avg_block_io.mod
%exclude %{_sysconfdir}/sysSentry/plugins/avg_block_io.ini
%exclude %{_bindir}/avg_block_io
%exclude %{python3_sitelib}/sentryPlugins/*
# ai_block_io
%exclude %{_sysconfdir}/sysSentry/tasks/ai_block_io.mod
%exclude %{_sysconfdir}/sysSentry/plugins/ai_block_io.ini
%exclude %{_bindir}/ai_block_io
%exclude %{python3_sitelib}/sentryPlugins/*
# sentryCollector
%attr(0550,root,root) %{_bindir}/sentryCollector
%attr(0600,root,root) %{_sysconfdir}/sysSentry/collector.conf
%attr(0600,root,root) %{_unitdir}/sentryCollector.service
# pysentry_collect
%exclude %{python3_sitelib}/sentryCollector/collect_plugin.py
%exclude %{python3_sitelib}/sentryCollector/__pycache__/collect_plugin*
# hbm repair module
%exclude %{_sysconfdir}/sysSentry/tasks/hbm_online_repair.mod
%exclude %{python3_sitelib}/syssentry/bmc_*
%exclude %{python3_sitelib}/syssentry/*/bmc_*
@ -357,41 +172,30 @@ rm -rf %{buildroot}
%attr(0550,root,root) %{_libdir}/libxalarm.so
%files -n libxalarm-devel
%dir %{_includedir}/xalarm
%attr(0550,root,root) %{_includedir}/xalarm
%attr(0550,root,root) %{_includedir}/xalarm/register_xalarm.h
%files -n pyxalarm
%attr(0550,root,root) %{python3_sitelib}/xalarm/register_xalarm.py
%attr(0550,root,root) %{python3_sitelib}/xalarm/__pycache__/register_xalarm*
%files -n pysentry_notify
%attr(0550,root,root) %{python3_sitelib}/xalarm/sentry_notify.py
%attr(0550,root,root) %{python3_sitelib}/xalarm/__pycache__/sentry_notify*
%files -n cpu_sentry
%attr(0500,root,root) %{_bindir}/cat-cli
%attr(0500,root,root) %{_bindir}/cpu_sentry
%attr(0550,root,root) %{_libdir}/libcpu_patrol.so
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks/cpu_sentry.mod
%attr(0600,root,root) %{_sysconfdir}/sysSentry/plugins/cpu_sentry.ini
%attr(0550,root,root) %{python3_sitelib}/syssentry/cpu_*
%files -n avg_block_io
%attr(0500,root,root) %{_bindir}/avg_block_io
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks/avg_block_io.mod
%attr(0600,root,root) %{_sysconfdir}/sysSentry/plugins/avg_block_io.ini
%attr(0550,root,root) %{python3_sitelib}/sentryPlugins/avg_block_io
%exclude %{python3_sitelib}/sentryPlugins/avg_block_io/__pycache__
%files -n ai_block_io
%attr(0500,root,root) %{_bindir}/ai_block_io
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks/ai_block_io.mod
%attr(0600,root,root) %{_sysconfdir}/sysSentry/plugins/ai_block_io.ini
%attr(0550,root,root) %{python3_sitelib}/sentryPlugins/ai_block_io
%exclude %{python3_sitelib}/sentryPlugins/ai_block_io/__pycache__
%files -n pysentry_collect
%attr(0550,root,root) %{python3_sitelib}/sentryCollector/collect_plugin.py
%attr(0550,root,root) %{python3_sitelib}/sentryCollector/__pycache__/collect_plugin*
%files -n hbm_online_repair
%attr(0550,root,root) %{_bindir}/hbm_online_repair
@ -400,409 +204,8 @@ rm -rf %{buildroot}
%attr(0550,root,root) %{python3_sitelib}/syssentry/bmc_alarm.py
%changelog
* Sat Dec 28 2024 shixuantong <shixuantong@huawei.com> - 1.0.2-67
* Mon Jan 20 2025 zhuofeng <zhuofeng2@huawei.com> - 1.0.3-1
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:fix test_ai_block_io fail
* Wed Dec 18 2024 luckky <guodashun1@huawei.com> - 1.0.2-66
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC: add boundary check for settings
* Wed Dec 18 2024 shixuantong <shixuantong@huawei.com> - 1.0.2-65
- Type:enhancement
- CVE:NA
- SUG:NA
- DESC:set logrotate
* Wed Dec 18 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-64
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:uniform plugins log
* Fri Dec 13 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-63
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC: change status of period task and sort mod file
* Wed Nov 6 2024 luckky <guodashun1@huawei.com> - 1.0.2-62
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC: add boundary check for settings
* Tue Nov 5 2024 luckky <guodashun1@huawei.com> - 1.0.2-61
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:make debug msg clear
* Tue Nov 5 2024 zhangnan <zhangnan134@huawei.com> - 1.0.2-60
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:update nvme config
* Tue Nov 5 2024 gaoruoshu <gaoruoshu@huawei.com> - 1.0.2-59
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:change avg_block_io config
* Mon Nov 4 2024 luckky <guodashun1@huawei.com> - 1.0.2-58
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:fix write file return code bug
* Fri Nov 1 2024 luckky <guodashun1@huawei.com> - 1.0.2-57
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:fix uint8 bug and change page isolation threshold default value
* Fri Nov 1 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-56
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:fix excessive CPU usage
* Thu Oct 31 2024 zhangnan <zhangnan134@huawei.com> - 1.0.2-55
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:modify logrotate rule
* Wed Oct 30 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-54
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:get_alarm -d abnormal display
* Wed Oct 30 2024 luckky <guodashun1@huawei.com> - 1.0.2-53
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:fix hbm online repair notice and efi create
* Sat Oct 26 2024 luckky <guodashun1@huawei.com> - 1.0.2-52
- Type:requirement
- CVE:NA
- SUG:NA
- DESC:add hbm_online_repair
* Sat Oct 26 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-51
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:fix newline break error
* Sat Oct 26 2024 zhangnan <zhangnan134@huawei.com> - 1.0.2-50
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:remove extra dependency
* Wed Oct 23 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-49
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:fix get_alarm error
* Tue Oct 22 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-48
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:ai_block_io support iodump
* Tue Oct 22 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-47
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:fix frequency param check bug
* Mon Oct 21 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-46
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:update collect plugin period max
* Mon Oct 21 2024 caixiaomeng <caixiaomeng2@huawei.com> - 1.0.2-45
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:ai_block_io lack section exit
* Mon Oct 21 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-44
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:ai_block_io lack section exit
* Wed Oct 16 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-43
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:enrich alert info about kernel stack
* Wed Oct 16 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-42
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:optimize log printing
* Wed Oct 16 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-41
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:listen thread of collect module exits occasionally
* Wed Oct 16 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-40
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:fix ai_block_io root cause bug
* Tue Oct 15 2024 gaoruoshu <gaoruoshu@huawei.com> - 1.0.2-39
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:refactor config.py and bugfix uncorrect slow io report
* Mon Oct 14 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-38
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:ai_block_io fix some bugs
* Sat Oct 12 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-37
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:add pysentry_collect package and update collect log
modify abnormal stack when the disk field is not configured
* Sat Oct 12 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-36
- Type:requirement
- CVE:NA
- SUG:NA
- DESC:add root cause analysis
* Sat Oct 12 2024 zhuofeng <zhangnan134@huawei.com> - 1.0.2-35
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:fix io_dump for collect module
* Fri Oct 11 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-34
- Type:requirement
- CVE:NA
- SUG:NA
- DESC:ai_block_io support stage and iotype
* Fri Oct 11 2024 caixiaomeng <caixiaomeng2@huawei.com> - 1.0.2-33
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:fix xalarm upgrade not return val, not refuse to send msg when length exceeds 8192, cleanup invalid socket periodically
* Fri Oct 11 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-32
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:add parameter validation
* Fri Oct 11 2024 gaoruoshu <gaoruoshu@huawei.com> - 1.0.2-31
- Type:requirement
- CVE:NA
- SUG:NA
- DESC:avg_block_io adapt different type of disk, use different config
* Thu Oct 10 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-30
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:add get_disk_type and fix some bugs
add log for improving maintainability
* Thu Oct 10 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-29
- Type:requirement
- CVE:NA
- SUG:NA
- DESC:ai_block_io adapt alarm module
* Thu Oct 10 2024 caixiaomeng <caixiaomeng2@huawei.com> - 1.0.2-28
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:xalarm add alarm msg length to 8192
* Thu Oct 10 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-27
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:add dependency for sysSentry and avg_block_io
* Thu Oct 10 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-26
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:fix get_alarm length and timestamp
* Wed Oct 9 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-25
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:update log when it is not lock collect
* Wed Oct 9 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-24
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:fix ai_block_io config relative some issues
* Wed Oct 9 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-23
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:avg_block_io send alarm to xalarmd
* Wed Oct 9 2024 caixiaomeng <caixiaomeng2@huawei.com> - 1.0.2-22
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:fix python 3.7 not support list bool type
* Tue Oct 8 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-21
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:add alarm event query function
* Tue Oct 8 2024 caixiaomeng <caixiaomeng2@huawei.com> - 1.0.2-20
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:add pyxalarm and pySentryNotify, add multi users support for xalarmd
* Mon Sep 30 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-19
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:fix ai_block_io some issues
* Fri Sep 27 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-18
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:add log level and change log format
* Wed Sep 25 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-17
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:fix bug step 2 about collect module and avg block io
* Mon Sep 23 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-16
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:optimize the handling of cat-cli error msg in cpu_sentry
over threshold should be warn level log in cat-cli
* Mon Sep 23 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-15
- Type:requirement
- CVE:NA
- SUG:NA
- DESC:add ai threshold slow io detection plugin
* Fri Sep 20 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-14
- Type:requirement
- CVE:NA
- SUG:NA
- DESC:fix some about collect module and avg block io
* Sat Sep 14 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-13
- Type:requirement
- CVE:NA
- SUG:NA
- DESC:add collect module and avg_block_io plugin to sysSentry
* Sat Sep 14 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-12
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:fix syssentry fails to be started when cpu_sentry is not installed
* Wed Sep 11 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-11
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:fix configparser.InterpolationSyntaxError
* Mon Sep 09 2024 caixiaomeng <caixiaomeng2@huawei.com> - 1.0.2-10
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:split cpu_sentry and syssentry
* Mon Sep 02 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-9
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:Remove ANSI escape sequences
* Sat Aug 31 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-8
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:add ipmitool to Recommends for cpu_sentry
* Sat Aug 31 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-7
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:add deleted code to plugin rasdaemon
* Fri Aug 30 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-6
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:setting parameters must be integer
* Wed Aug 28 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-5
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:cpu_utility and cpu_patrol must be an integer
* Fri Jul 26 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-4
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:fix result when process output is None
* Thu Jul 25 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-3
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:Fix the problem that function cpu_report_result() is called more than once
fix error handling
* Tue Jun 18 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-2
- Type:bugfix
- CVE:NA
- SUG:NA
- DESC:delete rpath setting
* Tue Jun 11 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-1
- Type:enhancement
- CVE:NA
- SUG:NA
- DESC:Package init
- DESC:1.0.3 init

View File

@ -1,63 +0,0 @@
From c8f21d1621e96e2c8a239f8028cc9331aa0f8997 Mon Sep 17 00:00:00 2001
From: jinsaihang <jinsaihang@h-partners.com>
Date: Tue, 17 Dec 2024 11:36:11 +0800
Subject: [PATCH] uniform avg_block_io log and ai_block_io log
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
---
src/python/sentryPlugins/ai_block_io/ai_block_io.py | 5 +++++
src/python/sentryPlugins/ai_block_io/detector.py | 8 +++-----
src/python/sentryPlugins/avg_block_io/stage_window.py | 2 +-
3 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
index 14f740d..8075f5f 100644
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
@@ -208,6 +208,11 @@ class SlowIODetection:
tmp_alarm_content = alarm_content.copy()
del tmp_alarm_content["details"]
logging.warning("[SLOW IO] " + str(tmp_alarm_content))
+ logging.warning(f'[SLOW IO] disk: {str(tmp_alarm_content.get("driver_name"))}, '
+ f'stage: {str(tmp_alarm_content.get("driver_name"))}, '
+ f'iotype: {str(tmp_alarm_content.get("io_type"))}, '
+ f'type: {str(tmp_alarm_content.get("alarm_type"))}, '
+ f'reason: {str(tmp_alarm_content.get("reason"))}')
logging.warning(f"latency: " + str(alarm_content.get("details").get("latency")))
logging.warning(f"iodump: " + str(alarm_content.get("details").get("iodump")))
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
index 496e032..27fb7f7 100644
--- a/src/python/sentryPlugins/ai_block_io/detector.py
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
@@ -58,11 +58,9 @@ class Detector:
logging.info(f'[abnormal_period]: disk: {self._metric_name.disk_name}, '
f'stage: {self._metric_name.stage_name}, '
f'iotype: {self._metric_name.io_access_type_name}, '
- f'metric: {self._metric_name.metric_name}, '
- f'current value: {metric_value}, '
- f'ai threshold: {detection_result[2]}, '
- f'absolute threshold upper limit: {detection_result[3]}, '
- f'lower limit: {detection_result[4]}')
+ f'type: {self._metric_name.metric_name}, '
+ f'ai_threshold: {round(detection_result[2], 3)}, '
+ f'curr_val: {metric_value}')
else:
logging.debug(f'Detection result: {str(detection_result)}')
logging.debug(f'exit Detector: {self}')
diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py
index 5113782..587bd49 100644
--- a/src/python/sentryPlugins/avg_block_io/stage_window.py
+++ b/src/python/sentryPlugins/avg_block_io/stage_window.py
@@ -28,7 +28,7 @@ class AbnormalWindowBase:
self.abnormal_window.append(False)
def is_abnormal_window(self):
- return sum(self.abnormal_window) > self.window_threshold
+ return sum(self.abnormal_window) >= self.window_threshold
def window_data_to_string(self):
return ",".join(str(x) for x in self.window_data)
--
2.27.0
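A side note on the stage_window.py hunk: with the avg_block_io defaults (win_size=30, win_threshold=6), the old strict comparison only alarmed once 7 of the 30 tracked periods were abnormal; the patched >= fires at exactly the configured threshold. A minimal illustration:

```python
window_threshold = 6                          # avg_block_io default
abnormal_window = [True] * 6 + [False] * 24   # exactly 6 abnormal periods

assert not (sum(abnormal_window) > window_threshold)  # old check: no alarm
assert sum(abnormal_window) >= window_threshold       # patched check: alarm
```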

View File

@ -1,25 +0,0 @@
From 73f5028fcab08613833c9f2b432f660c70ac264e Mon Sep 17 00:00:00 2001
From: zhuofeng <zhuofeng2@huawei.com>
Date: Sat, 12 Oct 2024 16:06:32 +0800
Subject: [PATCH] update collect log
---
src/python/sentryCollector/collect_io.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py
index 2b10cde..f699c3c 100644
--- a/src/python/sentryCollector/collect_io.py
+++ b/src/python/sentryCollector/collect_io.py
@@ -156,7 +156,7 @@ class CollectIo():
for line in file:
count += line.count('.op=' + Io_Category[category].upper())
if count > 0:
- logging.info(f"io_dump info : {disk_name}, {stage}, {category}, {count}")
+ logging.info(f"io_dump info : {disk_name}, {stage}, {Io_Category[category]}, {count}")
except FileNotFoundError:
logging.error("The file %s does not exist.", io_dump_file)
return count
--
2.33.0

View File

@ -1,44 +0,0 @@
From 4550d9cbbb7e921db168f748e8b1d5d7cc0f8b15 Mon Sep 17 00:00:00 2001
From: zhuofeng <zhuofeng2@huawei.com>
Date: Mon, 21 Oct 2024 17:30:39 +0800
Subject: [PATCH] update collect plugin period max
---
src/python/sentryCollector/collect_plugin.py | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py
index 53dddec..9495d8b 100644
--- a/src/python/sentryCollector/collect_plugin.py
+++ b/src/python/sentryCollector/collect_plugin.py
@@ -45,6 +45,9 @@ LIMIT_IOTYPE_LIST_LEN = 4
LIMIT_PERIOD_MIN_LEN = 1
LIMIT_PERIOD_MAX_LEN = 300
+# max_save
+LIMIT_MAX_SAVE_LEN = 300
+
# interface protocol
class ClientProtocol():
IS_IOCOLLECT_VALID = 0
@@ -189,7 +192,7 @@ def inter_is_iocollect_valid(period, disk_list=None, stage=None):
if not period or not isinstance(period, int):
result['ret'] = ResultMessage.RESULT_NOT_PARAM
return result
- if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN:
+ if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN * LIMIT_MAX_SAVE_LEN:
result['ret'] = ResultMessage.RESULT_INVALID_LENGTH
return result
@@ -246,7 +249,7 @@ def inter_get_io_data(period, disk_list, stage, iotype):
if not isinstance(period, int):
result['ret'] = ResultMessage.RESULT_NOT_PARAM
return result
- if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN:
+ if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN * LIMIT_MAX_SAVE_LEN:
result['ret'] = ResultMessage.RESULT_INVALID_LENGTH
return result
--
2.33.0
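Net effect of the relaxed check: with LIMIT_PERIOD_MIN_LEN = 1, LIMIT_PERIOD_MAX_LEN = 300 and the new LIMIT_MAX_SAVE_LEN = 300, both interfaces now accept a period of up to 300 * 300 = 90000 seconds (25 hours) instead of 300, presumably so callers can span the collector's full retained history rather than a single collection period.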

View File

@ -1,35 +0,0 @@
From ac73565fdb0e4bc544e5308ea0251dd6be410ed9 Mon Sep 17 00:00:00 2001
From: zhuofeng <zhuofeng2@huawei.com>
Date: Wed, 9 Oct 2024 16:37:24 +0800
Subject: [PATCH] update log when it is not lock collect
---
src/python/sentryCollector/collect_io.py | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py
index e45947a..2e75187 100644
--- a/src/python/sentryCollector/collect_io.py
+++ b/src/python/sentryCollector/collect_io.py
@@ -179,13 +179,17 @@ class CollectIo():
blk_io_hierarchy_path = os.path.join(disk_path, 'blk_io_hierarchy')
if not os.path.exists(blk_io_hierarchy_path):
- logging.error("no blk_io_hierarchy directory found in %s, skipping.", disk_name)
+ logging.warning("no blk_io_hierarchy directory found in %s, skipping.", disk_name)
continue
for file_name in os.listdir(blk_io_hierarchy_path):
file_path = os.path.join(blk_io_hierarchy_path, file_name)
if file_name == 'stats':
all_disk.append(disk_name)
+
+ if len(all_disk) == 0:
+ logging.debug("no blk_io_hierarchy disk, it is not lock-free collection")
+ return False
if self.loop_all:
self.disk_list = all_disk
--
2.33.0

View File

@ -1,51 +0,0 @@
From f50b4e1b7f5fa38b1930349b1a9a905eb5307ab7 Mon Sep 17 00:00:00 2001
From: znzjugod <zhangnan134@huawei.com>
Date: Tue, 5 Nov 2024 11:47:56 +0800
Subject: [PATCH] update nvme config
---
config/plugins/ai_block_io.ini | 8 ++++----
src/python/sentryPlugins/ai_block_io/config_parser.py | 8 ++++----
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini
index d0b1e74..69f44ba 100644
--- a/config/plugins/ai_block_io.ini
+++ b/config/plugins/ai_block_io.ini
@@ -23,10 +23,10 @@ read_tot_lim=50000
write_tot_lim=50000
[latency_nvme_ssd]
-read_avg_lim=300
-write_avg_lim=300
-read_tot_lim=500
-write_tot_lim=500
+read_avg_lim=10000
+write_avg_lim=10000
+read_tot_lim=50000
+write_tot_lim=50000
[latency_sata_hdd]
read_avg_lim=15000
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
index 3049db2..1bbb609 100644
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
@@ -74,10 +74,10 @@ class ConfigParser:
"write_tot_lim": 50000
},
"latency_nvme_ssd": {
- "read_avg_lim": 300,
- "write_avg_lim": 300,
- "read_tot_lim": 500,
- "write_tot_lim": 500
+ "read_avg_lim": 10000,
+ "write_avg_lim": 10000,
+ "read_tot_lim": 50000,
+ "write_tot_lim": 50000
},
"latency_sata_hdd": {
"read_avg_lim": 15000,
--
2.45.2

View File

@ -1,112 +0,0 @@
From c95be14eee48e5afb255700c9d67c1d8ef2532dc Mon Sep 17 00:00:00 2001
From: PshySimon <caixiaomeng2@huawei.com>
Date: Thu, 10 Oct 2024 16:15:52 +0800
Subject: [PATCH] xalarm add alarm msg length to 8192
---
src/libso/xalarm/register_xalarm.c | 2 +-
src/libso/xalarm/register_xalarm.h | 2 +-
src/python/xalarm/register_xalarm.py | 2 +-
src/python/xalarm/sentry_notify.py | 2 +-
src/python/xalarm/xalarm_api.py | 8 ++++++--
src/python/xalarm/xalarm_server.py | 2 +-
6 files changed, 11 insertions(+), 7 deletions(-)
diff --git a/src/libso/xalarm/register_xalarm.c b/src/libso/xalarm/register_xalarm.c
index 21a419f..5aff2bc 100644
--- a/src/libso/xalarm/register_xalarm.c
+++ b/src/libso/xalarm/register_xalarm.c
@@ -35,7 +35,7 @@
#define ALARM_SOCKET_PERMISSION 0700
#define TIME_UNIT_MILLISECONDS 1000
-#define MAX_PARAS_LEN 1023
+#define MAX_PARAS_LEN 8191
#define MIN_ALARM_ID 1001
#define MAX_ALARM_ID (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1)
diff --git a/src/libso/xalarm/register_xalarm.h b/src/libso/xalarm/register_xalarm.h
index fef9482..dcf4f03 100644
--- a/src/libso/xalarm/register_xalarm.h
+++ b/src/libso/xalarm/register_xalarm.h
@@ -11,7 +11,7 @@
#include <sys/time.h>
#include <stdbool.h>
-#define ALARM_INFO_MAX_PARAS_LEN 1024
+#define ALARM_INFO_MAX_PARAS_LEN 8192
#define MAX_STRERROR_SIZE 1024
#define MAX_ALARM_TYEPS 1024
#define MIN_ALARM_ID 1001
diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py
index 6756b1b..edd9994 100644
--- a/src/python/xalarm/register_xalarm.py
+++ b/src/python/xalarm/register_xalarm.py
@@ -11,7 +11,7 @@ from struct import error as StructParseError
from .xalarm_api import Xalarm, alarm_bin2stu
-ALARM_REPORT_LEN = 1048
+ALARM_REPORT_LEN = 8216
MAX_NUM_OF_ALARM_ID=128
MIN_ALARM_ID = 1001
MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1)
diff --git a/src/python/xalarm/sentry_notify.py b/src/python/xalarm/sentry_notify.py
index a19e5b3..c763a24 100644
--- a/src/python/xalarm/sentry_notify.py
+++ b/src/python/xalarm/sentry_notify.py
@@ -17,7 +17,7 @@ CRITICAL_ALM = 3
ALARM_TYPE_OCCUR = 1
ALARM_TYPE_RECOVER = 2
-MAX_PUC_PARAS_LEN = 1024
+MAX_PUC_PARAS_LEN = 8192
DIR_XALARM = "/var/run/xalarm"
PATH_REPORT_ALARM = "/var/run/xalarm/report"
diff --git a/src/python/xalarm/xalarm_api.py b/src/python/xalarm/xalarm_api.py
index 99eabf5..863bd02 100644
--- a/src/python/xalarm/xalarm_api.py
+++ b/src/python/xalarm/xalarm_api.py
@@ -23,7 +23,7 @@ ALARM_LEVELS = (1, 2, 3, 4, 5)
ALARM_SOCK_PATH = "/var/run/xalarm/report"
MIN_ALARM_ID = 1001
MAX_ALARM_ID = 1128
-MAX_MSG_LEN = 1024
+MAX_MSG_LEN = 8192
@dataclasses.dataclass
@@ -120,6 +120,10 @@ def alarm_bin2stu(bin_data):
def alarm_stu2bin(alarm_info: Xalarm):
+ alarm_msg = alarm_info.msg1
+ padding_length = MAX_MSG_LEN - len(alarm_msg)
+ if padding_length > 0:
+ alarm_msg = alarm_msg + ('\x00' * padding_length)
return struct.pack(
f'@HBBll{MAX_MSG_LEN}s',
alarm_info.alarm_id,
@@ -127,4 +131,4 @@ def alarm_stu2bin(alarm_info: Xalarm):
alarm_info.alarm_type,
alarm_info.timetamp.tv_sec,
alarm_info.timetamp.tv_usec,
- alarm_info.msg1.encode('utf-8'))
+ alarm_msg.encode('utf-8'))
diff --git a/src/python/xalarm/xalarm_server.py b/src/python/xalarm/xalarm_server.py
index fcaf393..2882609 100644
--- a/src/python/xalarm/xalarm_server.py
+++ b/src/python/xalarm/xalarm_server.py
@@ -28,7 +28,7 @@ from .xalarm_transfer import check_filter, transmit_alarm, wait_for_connection
ALARM_DIR = "/var/run/xalarm"
USER_RECV_SOCK = "/var/run/xalarm/alarm"
SOCK_FILE = "/var/run/xalarm/report"
-ALARM_REPORT_LEN = 1048
+ALARM_REPORT_LEN = 8216
ALARM_DIR_PERMISSION = 0o750
ALARM_LISTEN_QUEUE_LEN = 5
--
2.27.0
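A quick way to verify the new ALARM_REPORT_LEN = 8216 used by register_xalarm.py and xalarm_server.py: under native (@) alignment on a 64-bit build, the two longs force 4 bytes of padding after the H/B/B header fields, so the packed record is 2+1+1+4+8+8+8192 bytes:

```python
import struct

# header (H=2, B=1, B=1, 4 bytes alignment padding) + 2 longs + 8192-byte msg
assert struct.calcsize('@HBBll8192s') == 8216
```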