init the new version-1.0.3
Signed-off-by: zhuofeng <1107893276@qq.com>
This commit is contained in:
parent
d65453dccd
commit
9de4c5a236
@ -1,35 +0,0 @@
|
||||
From 3e2721852ad1f8047ad219a5ab6c68fd4c9d6f5c Mon Sep 17 00:00:00 2001
|
||||
From: shixuantong <shixuantong1@huawei.com>
|
||||
Date: Wed, 24 Jul 2024 16:17:54 +0800
|
||||
Subject: [PATCH] Fix the problem that function cpu_report_result() is called
|
||||
more than once
|
||||
|
||||
when task is running, user to exec "sentryctl stop cpu_sentry", cpu_report_result() will be called twice. This will cause the log to be printed twice
|
||||
---
|
||||
src/python/syssentry/cpu_sentry.py | 5 +++--
|
||||
1 file changed, 3 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py
|
||||
index 7e77654..3c4d58d 100644
|
||||
--- a/src/python/syssentry/cpu_sentry.py
|
||||
+++ b/src/python/syssentry/cpu_sentry.py
|
||||
@@ -133,6 +133,7 @@ class CpuSentry:
|
||||
|
||||
result_level = self.send_result.get("result", ResultLevel.FAIL)
|
||||
report_result(task_name, result_level, details)
|
||||
+ self.init_send_result()
|
||||
|
||||
def kill_process(signum, _f, cpu_sentry_obj):
|
||||
"""kill process by 'pkill -9'"""
|
||||
@@ -179,6 +180,6 @@ def main():
|
||||
cpu_sentry_task.send_result["result"] = ResultLevel.FAIL
|
||||
cpu_sentry_task.send_result["details"]["code"] = 1004
|
||||
cpu_sentry_task.send_result["details"]["msg"] = "run cmd [%s] raise Error" % cpu_sentry_task_cmd
|
||||
- finally:
|
||||
cpu_sentry_task.cpu_report_result()
|
||||
- cpu_sentry_task.init_send_result()
|
||||
+ else:
|
||||
+ cpu_sentry_task.cpu_report_result()
|
||||
--
|
||||
2.27.0
|
||||
|
||||
36
README.en.md
36
README.en.md
@ -1,36 +0,0 @@
|
||||
# sysSentry
|
||||
|
||||
#### Description
|
||||
sysSentry is a system inspection framework used to manage system inspection tasks.
|
||||
|
||||
#### Software Architecture
|
||||
Software architecture description
|
||||
|
||||
#### Installation
|
||||
|
||||
1. xxxx
|
||||
2. xxxx
|
||||
3. xxxx
|
||||
|
||||
#### Instructions
|
||||
|
||||
1. xxxx
|
||||
2. xxxx
|
||||
3. xxxx
|
||||
|
||||
#### Contribution
|
||||
|
||||
1. Fork the repository
|
||||
2. Create Feat_xxx branch
|
||||
3. Commit your code
|
||||
4. Create Pull Request
|
||||
|
||||
|
||||
#### Gitee Feature
|
||||
|
||||
1. You can use Readme\_XXX.md to support different languages, such as Readme\_en.md, Readme\_zh.md
|
||||
2. Gitee blog [blog.gitee.com](https://blog.gitee.com)
|
||||
3. Explore open source project [https://gitee.com/explore](https://gitee.com/explore)
|
||||
4. The most valuable open source project [GVP](https://gitee.com/gvp)
|
||||
5. The manual of Gitee [https://gitee.com/help](https://gitee.com/help)
|
||||
6. The most popular members [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/)
|
||||
110
README.md
110
README.md
@ -1,110 +0,0 @@
|
||||
# sysSentry
|
||||
|
||||
#### 介绍
|
||||
sysSentry is a system inspection framework used to manage system inspection tasks.
|
||||
|
||||
#### 软件架构
|
||||
1. 框架:支持x86和aarch64架构
|
||||
2. 插件:不同的差距支持架构不同,请参考docs.openeuler.org中的内容
|
||||
|
||||
|
||||
#### 安装教程
|
||||
|
||||
1. 安装巡检框架
|
||||
```shell
|
||||
[root@openEuler ~]# yum install -y sysSentry
|
||||
```
|
||||
2. 启动巡检框架
|
||||
```shell
|
||||
[root@openEuler ~]# systemctl start sentryCollector
|
||||
[root@openEuler ~]# systemctl start xalarmd
|
||||
[root@openEuler ~]# systemctl start sysSentry
|
||||
```
|
||||
3. 安装&重载巡检插件
|
||||
step1. 安装用户需要的巡检插件
|
||||
```shell
|
||||
yum install <插件名>
|
||||
```
|
||||
当前支持插件有:
|
||||
- cpu_sentry -- cpu巡检,支持22.03-LTS-SP4版本,aarch64架构,920F芯片使用
|
||||
- avg_block_io -- 平均阈值慢io检测,支持20.03-LTS-SP4版本,x86及aarch64架构
|
||||
- ai_block_io -- AI阈值慢io检测,支持20.03-LTS-SP4版本,x86及aarch64架构
|
||||
|
||||
step2. 重载巡检插件
|
||||
```shell
|
||||
[root@openEuler ~]# sentryctl reload <插件名>
|
||||
```
|
||||
|
||||
#### 使用说明
|
||||
|
||||
sysSentry提供了用于管理巡检插件的命令 -- sentryctl,可以用于启动/停止巡检插件任务、查看巡检插件运行状态、查看巡检插件上报信息等功能。
|
||||
|
||||
1. 启动指定巡检任务
|
||||
|
||||
```shell
|
||||
[root@openEuler ~]# sentryctl start <module_name>
|
||||
```
|
||||
|
||||
2. 终止指定巡检任务
|
||||
|
||||
```shell
|
||||
[root@openEuler ~]# sentryctl stop <module_name>
|
||||
```
|
||||
|
||||
3. 列出所有已加载的巡检任务及状态
|
||||
|
||||
```shell
|
||||
[root@openEuler ~]# sentryctl list
|
||||
```
|
||||
|
||||
4. 查询指定巡检任务的状态
|
||||
|
||||
```shell
|
||||
[root@openEuler ~]# sentryctl status <module_name>
|
||||
```
|
||||
|
||||
巡检任务共存在四种状态,每种状态的回显信息及对应介绍如下:
|
||||
|
||||
| 状态 | 描述 |
|
||||
| ------- | ------------------------------------------------------------ |
|
||||
| RUNNING | 巡检任务正在运行 |
|
||||
| WAITING | 仅period类型巡检任务可设置此状态,表示period巡检任务等待下一次被调度执行 |
|
||||
| EXITED | 巡检任务尚未执行,或者oneshot类型的巡检任务执行结束处于此状态 |
|
||||
| FAILED | 巡检任务未拉起成功,或者巡检任务未正常退出 |
|
||||
|
||||
5. 重载指定巡检任务的配置
|
||||
|
||||
当用户修改了巡检任务的配置文件/etc/sysSentry/tasks/<module_name>.mod时,可通过以下命令重载配置文件:
|
||||
|
||||
```shell
|
||||
[root@openEuler ~]# sentryctl reload <module_name>
|
||||
```
|
||||
|
||||
6. 查询指定任务的告警信息
|
||||
|
||||
```shell
|
||||
[root@openEuler ~]# sentryctl get_alarm <module_name> [options]
|
||||
```
|
||||
|
||||
options可选参数及释义如下:
|
||||
|
||||
| 参数 | 描述 |
|
||||
| -------------------------------------- | ------------------------------------------------------------ |
|
||||
| -s TIME_RANGE, --time_range TIME_RANGE | 展示用户指定时间长度内的告警信息,TIME_RANGE为整形,单位秒,范围为1~15 |
|
||||
| -d, --detailed | 打印详细告警信息 |
|
||||
|
||||
7. 查询指定巡检任务的巡检结果
|
||||
|
||||
```shell
|
||||
sentryctl get_result <module_name>
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
#### 参与贡献
|
||||
|
||||
1. Fork 本仓库
|
||||
2. 新建 Feat_xxx 分支
|
||||
3. 提交代码
|
||||
4. 新建 Pull Request
|
||||
@ -1,32 +0,0 @@
|
||||
From 91aa47999030503fda4935d4cc238b82d6842238 Mon Sep 17 00:00:00 2001
|
||||
From: shixuantong <shixuantong1@huawei.com>
|
||||
Date: Sun, 11 Aug 2024 18:36:23 +0800
|
||||
Subject: [PATCH] Remove ANSI escape sequences
|
||||
|
||||
---
|
||||
src/python/syssentry/cpu_sentry.py | 9 ++++++++-
|
||||
1 file changed, 8 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py
|
||||
index 9287e2f..99af127 100644
|
||||
--- a/src/python/syssentry/cpu_sentry.py
|
||||
+++ b/src/python/syssentry/cpu_sentry.py
|
||||
@@ -97,7 +97,14 @@ class CpuSentry:
|
||||
if "ERROR" in stdout:
|
||||
self.send_result["result"] = ResultLevel.FAIL
|
||||
self.send_result["details"]["code"] = 1004
|
||||
- self.send_result["details"]["msg"] = stdout.split("\n")[0]
|
||||
+
|
||||
+ # Remove ANSI escape sequences
|
||||
+ error_info = stdout.split("\n")[0]
|
||||
+ if error_info.startswith("\u001b"):
|
||||
+ ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])'
|
||||
+ error_info = re.sub(ansi_escape, '', error_info)
|
||||
+
|
||||
+ self.send_result["details"]["msg"] = error_info
|
||||
return
|
||||
|
||||
out_split = stdout.split("\n")
|
||||
--
|
||||
2.33.0
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,39 +0,0 @@
|
||||
From abf36bf0351efde388c089245aed9f6d8d2e6d3b Mon Sep 17 00:00:00 2001
|
||||
From: luckky <guodashun1@huawei.com>
|
||||
Date: Wed, 6 Nov 2024 11:42:53 +0800
|
||||
Subject: [PATCH] add boundary check for settings
|
||||
1. add two boundary checks for page_isolation_threshold and hbm_online_repair_log_level
|
||||
(0 <= page_isolation_threshold)
|
||||
(0(LOG_DEBUG) <= hbm_online_repair_log_level <= 3(LOG_ERROR))
|
||||
|
||||
---
|
||||
src/c/hbm_online_repair/hbm_online_repair.c | 6 ++++++
|
||||
1 file changed, 6 insertions(+)
|
||||
|
||||
diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c
|
||||
index 943f201..00c9c0b 100644
|
||||
--- a/src/c/hbm_online_repair/hbm_online_repair.c
|
||||
+++ b/src/c/hbm_online_repair/hbm_online_repair.c
|
||||
@@ -89,6 +89,9 @@ void hbm_param_init(void)
|
||||
if (ret < 0) {
|
||||
global_level_setting = DEFAULT_LOG_LEVEL;
|
||||
log(LOG_WARNING, "Get log level from config failed, set the default value %d\n", DEFAULT_LOG_LEVEL);
|
||||
+ } else if (global_level_setting < LOG_DEBUG || global_level_setting > LOG_ERROR) {
|
||||
+ log(LOG_WARNING, "The log level value %d in config is out of range, set the default value %d\n", global_level_setting, DEFAULT_LOG_LEVEL);
|
||||
+ global_level_setting = DEFAULT_LOG_LEVEL;
|
||||
} else {
|
||||
log(LOG_INFO, "log level: %d\n", global_level_setting);
|
||||
}
|
||||
@@ -98,6 +101,9 @@ void hbm_param_init(void)
|
||||
if (ret < 0) {
|
||||
page_isolation_threshold = DEFAULT_PAGE_ISOLATION_THRESHOLD;
|
||||
log(LOG_WARNING, "Get page_isolation_threshold from config failed, set the default value %d\n", DEFAULT_PAGE_ISOLATION_THRESHOLD);
|
||||
+ } else if (page_isolation_threshold < 0) {
|
||||
+ log(LOG_WARNING, "The page_isolation_threshold %d in config is out of range, set the default value %d\n", page_isolation_threshold, DEFAULT_PAGE_ISOLATION_THRESHOLD);
|
||||
+ page_isolation_threshold = DEFAULT_PAGE_ISOLATION_THRESHOLD;
|
||||
} else {
|
||||
log(LOG_INFO, "page_isolation_threshold: %d\n", page_isolation_threshold);
|
||||
}
|
||||
--
|
||||
2.43.0
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,31 +0,0 @@
|
||||
From eca8c542875aef5cfbf947d697c4b644490d1c05 Mon Sep 17 00:00:00 2001
|
||||
From: zhuofeng <zhuofeng2@huawei.com>
|
||||
Date: Fri, 30 Aug 2024 19:58:41 +0800
|
||||
Subject: [PATCH] add deleted code to plugin rasdaemon
|
||||
|
||||
---
|
||||
src/python/syssentry/syssentry.py | 8 ++++++++
|
||||
1 file changed, 8 insertions(+)
|
||||
|
||||
diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py
|
||||
index 32b81e3..3d5cb8d 100644
|
||||
--- a/src/python/syssentry/syssentry.py
|
||||
+++ b/src/python/syssentry/syssentry.py
|
||||
@@ -462,6 +462,14 @@ def main_loop():
|
||||
epoll_fd.register(cpu_alarm_fd.fileno(), select.EPOLLIN)
|
||||
|
||||
logging.debug("start main loop")
|
||||
+ # onstart_tasks_handle()
|
||||
+ for task_type in TasksMap.tasks_dict:
|
||||
+ for task_name in TasksMap.tasks_dict.get(task_type):
|
||||
+ task = TasksMap.tasks_dict.get(task_type).get(task_name)
|
||||
+ if not task:
|
||||
+ continue
|
||||
+ task.onstart_handle()
|
||||
+
|
||||
while True:
|
||||
try:
|
||||
events_list = epoll_fd.poll(SERVER_EPOLL_TIMEOUT)
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -1,32 +0,0 @@
|
||||
From 9ecd4c2c9c9f9578f5ec4780360dc67b182b384a Mon Sep 17 00:00:00 2001
|
||||
From: jinsaihang <jinsaihang@h-partners.com>
|
||||
Date: Wed, 9 Oct 2024 08:09:04 +0000
|
||||
Subject: [PATCH 2/2] add detail time
|
||||
|
||||
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
|
||||
---
|
||||
src/python/syssentry/alarm.py | 4 +++-
|
||||
1 file changed, 3 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
|
||||
index 74a2716..d5337d3 100644
|
||||
--- a/src/python/syssentry/alarm.py
|
||||
+++ b/src/python/syssentry/alarm.py
|
||||
@@ -118,11 +118,13 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di
|
||||
logging.debug(f"get_alarm_result: final alarm_list of {alarm_id} has {len(alarm_list)} elements")
|
||||
|
||||
def xalarm_to_dict(alarm_info: Xalarm) -> dict:
|
||||
+ timestamp = alarm_info.timetamp.tv_sec + alarm_info.timetamp.tv_usec / 1000000
|
||||
+ dt_object = datetime.fromtimestamp(int(timestamp))
|
||||
return {
|
||||
'alarm_id': xalarm_getid(alarm_info),
|
||||
'alarm_type': xalarm_gettype(alarm_info),
|
||||
'alarm_level': xalarm_getlevel(alarm_info),
|
||||
- 'timetamp': xalarm_gettime(alarm_info),
|
||||
+ 'timestamp': dt_object.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
'msg1': xalarm_getdesc(alarm_info)
|
||||
}
|
||||
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,176 +0,0 @@
|
||||
From c2ffc679eddda5d78362612d89a9319d268da7e3 Mon Sep 17 00:00:00 2001
|
||||
From: zhuofeng <zhuofeng2@huawei.com>
|
||||
Date: Thu, 10 Oct 2024 20:17:34 +0800
|
||||
Subject: [PATCH] add get_disk_type and fix some bugs
|
||||
|
||||
---
|
||||
service/sentryCollector.service | 2 +-
|
||||
src/python/sentryCollector/collect_io.py | 16 ++++-
|
||||
src/python/sentryCollector/collect_plugin.py | 68 +++++++++++++++++++-
|
||||
3 files changed, 81 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/service/sentryCollector.service b/service/sentryCollector.service
|
||||
index 4ee07d5..e09ddb3 100644
|
||||
--- a/service/sentryCollector.service
|
||||
+++ b/service/sentryCollector.service
|
||||
@@ -1,5 +1,5 @@
|
||||
[Unit]
|
||||
-Description = Collection module added for sysSentry and kernel lock-free collection
|
||||
+Description = Collection module added for sysSentry
|
||||
|
||||
[Service]
|
||||
ExecStart=/usr/bin/python3 /usr/bin/sentryCollector
|
||||
diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py
|
||||
index 8780648..6699a90 100644
|
||||
--- a/src/python/sentryCollector/collect_io.py
|
||||
+++ b/src/python/sentryCollector/collect_io.py
|
||||
@@ -116,7 +116,7 @@ class CollectIo():
|
||||
return 0
|
||||
if finish <= 0 or lat_time <= 0:
|
||||
return 0
|
||||
- value = lat_time / finish / 1000 / 1000
|
||||
+ value = lat_time / finish / 1000
|
||||
if value.is_integer():
|
||||
return int(value)
|
||||
else:
|
||||
@@ -124,11 +124,17 @@ class CollectIo():
|
||||
|
||||
def get_io_length(self, curr_stage_value, last_stage_value, category):
|
||||
try:
|
||||
- finish = int(curr_stage_value[category * 3 + IoStatus.FINISH]) - int(last_stage_value[category * 3 + IoStatus.FINISH])
|
||||
+ lat_time = (int(curr_stage_value[category * 3 + IoStatus.LATENCY]) - int(last_stage_value[category * 3 + IoStatus.LATENCY]))
|
||||
except ValueError as e:
|
||||
logging.error("get_io_length convert to int failed, %s", e)
|
||||
return 0
|
||||
- value = finish / self.period_time / 1000 / 1000
|
||||
+ if lat_time <= 0:
|
||||
+ return 0
|
||||
+ # ns convert us
|
||||
+ lat_time = lat_time / 1000
|
||||
+ # s convert us
|
||||
+ period_time = self.period_time * 1000 * 1000
|
||||
+ value = lat_time / period_time
|
||||
if value.is_integer():
|
||||
return int(value)
|
||||
else:
|
||||
@@ -141,6 +147,8 @@ class CollectIo():
|
||||
with open(io_dump_file, 'r') as file:
|
||||
for line in file:
|
||||
count += line.count('.op=' + Io_Category[category])
|
||||
+ if count > 0:
|
||||
+ logging.info(f"io_dump info : {disk_name}, {stage}, {category}, {count}")
|
||||
except FileNotFoundError:
|
||||
logging.error("The file %s does not exist.", io_dump_file)
|
||||
return count
|
||||
@@ -223,6 +231,8 @@ class CollectIo():
|
||||
if self.get_blk_io_hierarchy(disk_name, stage_list) < 0:
|
||||
continue
|
||||
self.append_period_lat(disk_name, stage_list)
|
||||
+
|
||||
+ logging.debug(f"no-lock collect data : {IO_GLOBAL_DATA}")
|
||||
|
||||
elapsed_time = time.time() - start_time
|
||||
sleep_time = self.period_time - elapsed_time
|
||||
diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py
|
||||
index 3e2cf4c..31bf11b 100644
|
||||
--- a/src/python/sentryCollector/collect_plugin.py
|
||||
+++ b/src/python/sentryCollector/collect_plugin.py
|
||||
@@ -16,6 +16,7 @@ import json
|
||||
import socket
|
||||
import logging
|
||||
import re
|
||||
+import os
|
||||
|
||||
COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock"
|
||||
|
||||
@@ -58,6 +59,8 @@ class ResultMessage():
|
||||
RESULT_EXCEED_LIMIT = 4 # the parameter length exceeds the limit.
|
||||
RESULT_PARSE_FAILED = 5 # parse failed
|
||||
RESULT_INVALID_CHAR = 6 # invalid char
|
||||
+ RESULT_DISK_NOEXIST = 7 # disk is not exist
|
||||
+ RESULT_DISK_TYPE_MISMATCH= 8 # disk type mismatch
|
||||
|
||||
Result_Messages = {
|
||||
ResultMessage.RESULT_SUCCEED: "Succeed",
|
||||
@@ -66,9 +69,15 @@ Result_Messages = {
|
||||
ResultMessage.RESULT_INVALID_LENGTH: "Invalid parameter length",
|
||||
ResultMessage.RESULT_EXCEED_LIMIT: "The parameter length exceeds the limit",
|
||||
ResultMessage.RESULT_PARSE_FAILED: "Parse failed",
|
||||
- ResultMessage.RESULT_INVALID_CHAR: "Invalid char"
|
||||
+ ResultMessage.RESULT_INVALID_CHAR: "Invalid char",
|
||||
+ ResultMessage.RESULT_DISK_NOEXIST: "Disk is not exist",
|
||||
+ ResultMessage.RESULT_DISK_TYPE_MISMATCH: "Disk type mismatch"
|
||||
}
|
||||
|
||||
+class DiskType():
|
||||
+ TYPE_NVME_SSD = 0
|
||||
+ TYPE_SATA_SSD = 1
|
||||
+ TYPE_SATA_HDD = 2
|
||||
|
||||
def client_send_and_recv(request_data, data_str_len, protocol):
|
||||
"""client socket send and recv message"""
|
||||
@@ -273,3 +282,60 @@ def inter_get_io_data(period, disk_list, stage, iotype):
|
||||
result['message'] = result_message
|
||||
return result
|
||||
|
||||
+def get_disk_type(disk):
|
||||
+ result = {}
|
||||
+ result['ret'] = ResultMessage.RESULT_UNKNOWN
|
||||
+ result['message'] = ""
|
||||
+ if not disk:
|
||||
+ logging.error("param is invalid")
|
||||
+ result['ret'] = ResultMessage.RESULT_NOT_PARAM
|
||||
+ return result
|
||||
+ if len(disk) <= 0 or len(disk) > LIMIT_DISK_CHAR_LEN:
|
||||
+ logging.error("invalid disk length")
|
||||
+ result['ret'] = ResultMessage.RESULT_INVALID_LENGTH
|
||||
+ return result
|
||||
+ pattern = r'^[a-zA-Z0-9_-]+$'
|
||||
+ if not re.match(pattern, disk):
|
||||
+ logging.error("%s is invalid char", disk)
|
||||
+ result['ret'] = ResultMessage.RESULT_INVALID_CHAR
|
||||
+ return result
|
||||
+
|
||||
+ base_path = '/sys/block'
|
||||
+ all_disk = []
|
||||
+ for disk_name in os.listdir(base_path):
|
||||
+ all_disk.append(disk_name)
|
||||
+
|
||||
+ if disk not in all_disk:
|
||||
+ logging.error("disk %s is not exist", disk)
|
||||
+ result['ret'] = ResultMessage.RESULT_DISK_NOEXIST
|
||||
+ return result
|
||||
+
|
||||
+ if disk[0:4] == "nvme":
|
||||
+ result['message'] = str(DiskType.TYPE_NVME_SSD)
|
||||
+ elif disk[0:2] == "sd":
|
||||
+ disk_file = '/sys/block/{}/queue/rotational'.format(disk)
|
||||
+ try:
|
||||
+ with open(disk_file, 'r') as file:
|
||||
+ num = int(file.read())
|
||||
+ if num == 1:
|
||||
+ result['message'] = str(DiskType.TYPE_SATA_SSD)
|
||||
+ elif num == 0:
|
||||
+ result['message'] = str(DiskType.TYPE_SATA_HDD)
|
||||
+ else:
|
||||
+ logging.error("disk %s is not support, num = %d", disk, num)
|
||||
+ result['ret'] = ResultMessage.RESULT_DISK_TYPE_MISMATCH
|
||||
+ return result
|
||||
+ except FileNotFoundError:
|
||||
+ logging.error("The disk_file [%s] does not exist", disk_file)
|
||||
+ result['ret'] = ResultMessage.RESULT_DISK_NOEXIST
|
||||
+ return result
|
||||
+ except Exception as e:
|
||||
+ logging.error("open disk_file %s happen an error: %s", disk_file, e)
|
||||
+ return result
|
||||
+ else:
|
||||
+ logging.error("disk %s is not support", disk)
|
||||
+ result['ret'] = ResultMessage.RESULT_DISK_TYPE_MISMATCH
|
||||
+ return result
|
||||
+
|
||||
+ result['ret'] = ResultMessage.RESULT_SUCCEED
|
||||
+ return result
|
||||
\ No newline at end of file
|
||||
--
|
||||
2.33.0
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,251 +0,0 @@
|
||||
From a8418093bb37482da7ccaac0c950f2ed8d0ba2fa Mon Sep 17 00:00:00 2001
|
||||
From: gaoruoshu <gaoruoshu@huawei.com>
|
||||
Date: Thu, 10 Oct 2024 15:07:29 +0800
|
||||
Subject: [PATCH] add log for improving maintainability
|
||||
|
||||
---
|
||||
.../avg_block_io/avg_block_io.py | 4 +-
|
||||
.../sentryPlugins/avg_block_io/module_conn.py | 57 ++++++++++-------
|
||||
.../avg_block_io/stage_window.py | 8 +++
|
||||
.../sentryPlugins/avg_block_io/utils.py | 63 +++++++++++++++++--
|
||||
4 files changed, 103 insertions(+), 29 deletions(-)
|
||||
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
index 26a60c5..cf2ded3 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
@@ -194,11 +194,11 @@ def init_io_win(io_dic, config, common_param):
|
||||
|
||||
if avg_lim_value and avg_time_value and tot_lim_value:
|
||||
io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value)
|
||||
- logging.debug("Successfully create {}-{}-{} latency window".format(disk_name, stage_name, rw))
|
||||
+ logging.debug("Successfully create {}-{}-{}-latency window".format(disk_name, stage_name, rw))
|
||||
|
||||
if iodump_lim_value is not None:
|
||||
io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value)
|
||||
- logging.debug("Successfully create {}-{}-{} iodump window".format(disk_name, stage_name, rw))
|
||||
+ logging.debug("Successfully create {}-{}-{}-iodump window".format(disk_name, stage_name, rw))
|
||||
return io_data, io_avg_value
|
||||
|
||||
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||
index 2fc5a83..40b3fcc 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||
@@ -13,7 +13,7 @@ import logging
|
||||
import sys
|
||||
import time
|
||||
|
||||
-from .utils import is_abnormal
|
||||
+from .utils import is_abnormal, get_win_data, log_slow_win
|
||||
from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages
|
||||
from syssentry.result import ResultLevel, report_result
|
||||
from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR
|
||||
@@ -66,36 +66,51 @@ def report_alarm_fail(alarm_info):
|
||||
|
||||
def process_report_data(disk_name, rw, io_data):
|
||||
"""check abnormal window and report to xalarm"""
|
||||
- if not is_abnormal((disk_name, 'bio', rw), io_data):
|
||||
+ abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data)
|
||||
+ if not abnormal:
|
||||
return
|
||||
|
||||
- msg = {"alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw}
|
||||
+ msg = {
|
||||
+ "alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw,
|
||||
+ "reason": "unknown", "block_stack": "bio", "alarm_type": abnormal_list,
|
||||
+ "details": get_win_data(disk_name, rw, io_data)
|
||||
+ }
|
||||
|
||||
+ # io press
|
||||
ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq']
|
||||
for stage_name in ctrl_stage:
|
||||
- if is_abnormal((disk_name, stage_name, rw), io_data):
|
||||
- msg["reason"] = "IO press slow"
|
||||
- msg["block_stack"] = f"bio,{stage_name}"
|
||||
- logging.warning("{} - {} report IO press slow".format(disk_name, rw))
|
||||
- xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
|
||||
- return
|
||||
-
|
||||
- if is_abnormal((disk_name, 'rq_driver', rw), io_data):
|
||||
+ abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data)
|
||||
+ if not abnormal:
|
||||
+ continue
|
||||
+ msg["reason"] = "IO press"
|
||||
+ msg["block_stack"] = f"bio,{stage_name}"
|
||||
+ msg["alarm_type"] = abnormal_list
|
||||
+ log_slow_win(msg, "IO press")
|
||||
+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
|
||||
+ return
|
||||
+
|
||||
+ # driver slow
|
||||
+ abnormal, abnormal_list = is_abnormal((disk_name, 'rq_driver', rw), io_data)
|
||||
+ if abnormal:
|
||||
msg["reason"] = "driver slow"
|
||||
msg["block_stack"] = "bio,rq_driver"
|
||||
- logging.warning("{} - {} report driver slow".format(disk_name, rw))
|
||||
+ msg["alarm_type"] = abnormal_list
|
||||
+ log_slow_win(msg, "driver slow")
|
||||
xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
|
||||
return
|
||||
|
||||
+ # kernel slow
|
||||
kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue']
|
||||
for stage_name in kernel_stage:
|
||||
- if is_abnormal((disk_name, stage_name, rw), io_data):
|
||||
- msg["reason"] = "kernel slow"
|
||||
- msg["block_stack"] = f"bio,{stage_name}"
|
||||
- logging.warning("{} - {} report kernel slow".format(disk_name, rw))
|
||||
- xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
|
||||
- return
|
||||
- msg["reason"] = "unknown"
|
||||
- msg["block_stack"] = "bio"
|
||||
- logging.warning("{} - {} report UNKNOWN slow".format(disk_name, rw))
|
||||
+ abnormal, abnormal_list = is_abnormal((disk_name, stage_name, rw), io_data)
|
||||
+ if not abnormal:
|
||||
+ continue
|
||||
+ msg["reason"] = "kernel slow"
|
||||
+ msg["block_stack"] = f"bio,{stage_name}"
|
||||
+ msg["alarm_type"] = abnormal_list
|
||||
+ log_slow_win(msg, "kernel slow")
|
||||
+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
|
||||
+ return
|
||||
+
|
||||
+ log_slow_win(msg, "unknown")
|
||||
xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py
|
||||
index 9b0ce79..5113782 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/stage_window.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/stage_window.py
|
||||
@@ -14,6 +14,11 @@ class AbnormalWindowBase:
|
||||
self.window_size = window_size
|
||||
self.window_threshold = window_threshold
|
||||
self.abnormal_window = [False] * window_size
|
||||
+ self.window_data = [-1] * window_size
|
||||
+
|
||||
+ def append_new_data(self, ab_res):
|
||||
+ self.window_data.pop(0)
|
||||
+ self.window_data.append(ab_res)
|
||||
|
||||
def append_new_period(self, ab_res, avg_val=0):
|
||||
self.abnormal_window.pop(0)
|
||||
@@ -25,6 +30,9 @@ class AbnormalWindowBase:
|
||||
def is_abnormal_window(self):
|
||||
return sum(self.abnormal_window) > self.window_threshold
|
||||
|
||||
+ def window_data_to_string(self):
|
||||
+ return ",".join(str(x) for x in self.window_data)
|
||||
+
|
||||
|
||||
class IoWindow(AbnormalWindowBase):
|
||||
def __init__(self, window_size=10, window_threshold=7, abnormal_multiple=5, abnormal_multiple_lim=30, abnormal_time=40):
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py
|
||||
index 2de9a46..3b7f027 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/utils.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/utils.py
|
||||
@@ -65,15 +65,32 @@ def set_nested_value(data, keys, value):
|
||||
return True
|
||||
|
||||
|
||||
+def get_win_data(disk_name, rw, io_data):
|
||||
+ """get latency and iodump win data"""
|
||||
+ latency = ''
|
||||
+ iodump = ''
|
||||
+ for stage_name in io_data[disk_name]:
|
||||
+ if 'latency' in io_data[disk_name][stage_name][rw]:
|
||||
+ latency_list = io_data[disk_name][stage_name][rw]['latency'].window_data_to_string()
|
||||
+ latency += f'{stage_name}: [{latency_list}], '
|
||||
+ if 'iodump' in io_data[disk_name][stage_name][rw]:
|
||||
+ iodump_list = io_data[disk_name][stage_name][rw]['iodump'].window_data_to_string()
|
||||
+ iodump += f'{stage_name}: [{iodump_list}], '
|
||||
+ return {"latency": latency[:-2], "iodump": iodump[:-2]}
|
||||
+
|
||||
+
|
||||
def is_abnormal(io_key, io_data):
|
||||
"""check if latency and iodump win abnormal"""
|
||||
+ abnormal_list = ''
|
||||
for key in ['latency', 'iodump']:
|
||||
all_keys = get_nested_value(io_data, io_key)
|
||||
if all_keys and key in all_keys:
|
||||
win = get_nested_value(io_data, io_key + (key,))
|
||||
if win and win.is_abnormal_window():
|
||||
- return True
|
||||
- return False
|
||||
+ abnormal_list += key + ', '
|
||||
+ if not abnormal_list:
|
||||
+ return False, abnormal_list
|
||||
+ return True, abnormal_list[:-2]
|
||||
|
||||
|
||||
def update_io_avg(old_avg, period_value, win_size):
|
||||
@@ -87,8 +104,8 @@ def update_io_avg(old_avg, period_value, win_size):
|
||||
return [new_avg_value, new_avg_count]
|
||||
|
||||
|
||||
-def update_io_data(old_avg, period_value, win_size, io_data, io_key):
|
||||
- """update data of latency and iodump window"""
|
||||
+def update_io_period(old_avg, period_value, io_data, io_key):
|
||||
+ """update period of latency and iodump window"""
|
||||
all_wins = get_nested_value(io_data, io_key)
|
||||
if all_wins and "latency" in all_wins:
|
||||
io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_period(period_value[0], old_avg[AVG_VALUE])
|
||||
@@ -96,20 +113,54 @@ def update_io_data(old_avg, period_value, win_size, io_data, io_key):
|
||||
io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_period(period_value[1])
|
||||
|
||||
|
||||
+def update_io_data(period_value, io_data, io_key):
|
||||
+ """update data of latency and iodump window"""
|
||||
+ all_wins = get_nested_value(io_data, io_key)
|
||||
+ if all_wins and "latency" in all_wins:
|
||||
+ io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_data(period_value[0])
|
||||
+ if all_wins and "iodump" in all_wins:
|
||||
+ io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_data(period_value[1])
|
||||
+
|
||||
+
|
||||
+def log_abnormal_period(old_avg, period_value, io_data, io_key):
|
||||
+ """record log of abnormal period"""
|
||||
+ all_wins = get_nested_value(io_data, io_key)
|
||||
+ if all_wins and "latency" in all_wins:
|
||||
+ if all_wins["latency"].is_abnormal_period(period_value[0], old_avg[AVG_VALUE]):
|
||||
+ logging.info(f"[abnormal_period] disk: {io_key[0]}, stage: {io_key[1]}, iotype: {io_key[2]}, "
|
||||
+ f"type: latency, avg: {round(old_avg[AVG_VALUE], 3)}, curr_val: {period_value[0]}")
|
||||
+ if all_wins and "iodump" in all_wins:
|
||||
+ if all_wins["iodump"].is_abnormal_period(period_value[1]):
|
||||
+ logging.info(f"[abnormal_period] disk: {io_key[0]}, stage: {io_key[1]}, iotype: {io_key[2]}, "
|
||||
+ f"type: iodump, curr_val: {period_value[1]}")
|
||||
+
|
||||
+
|
||||
+def log_slow_win(msg, reason):
|
||||
+ """record log of slow win"""
|
||||
+ logging.warning(f"[SLOW IO] disk: {msg['driver_name']}, stage: {msg['block_stack']}, "
|
||||
+ f"iotype: {msg['io_type']}, type: {msg['alarm_type']}, reason: {reason}")
|
||||
+ logging.info(f"latency: {msg['details']['latency']}")
|
||||
+ logging.info(f"iodump: {msg['details']['iodump']}")
|
||||
+
|
||||
+
|
||||
def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data):
|
||||
"""update avg and check abonrmal, return true if win_size full"""
|
||||
period_value = get_nested_value(data, io_key)
|
||||
old_avg = get_nested_value(io_avg_value, io_key)
|
||||
|
||||
# 更新avg数据
|
||||
+ update_io_data(period_value, io_data, io_key)
|
||||
if old_avg[AVG_COUNT] < win_size:
|
||||
set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size))
|
||||
return False
|
||||
|
||||
+ # 打印异常周期数据
|
||||
+ log_abnormal_period(old_avg, period_value, io_data, io_key)
|
||||
+
|
||||
# 更新win数据 -- 判断异常周期
|
||||
- update_io_data(old_avg, period_value, win_size, io_data, io_key)
|
||||
+ update_io_period(old_avg, period_value, io_data, io_key)
|
||||
all_wins = get_nested_value(io_data, io_key)
|
||||
- if all_wins and 'latency' not in all_wins:
|
||||
+ if not all_wins or 'latency' not in all_wins:
|
||||
return True
|
||||
period = get_nested_value(io_data, io_key + ("latency",))
|
||||
if period and period.is_abnormal_period(period_value[0], old_avg[AVG_VALUE]):
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,24 +0,0 @@
|
||||
From ef3aad0ca57d35b0a4fe29a0205596021bae0227 Mon Sep 17 00:00:00 2001
|
||||
From: caixiaomeng <caixiaomeng2@.com>
|
||||
Date: Fri, 11 Oct 2024 17:59:54 +0800
|
||||
Subject: [PATCH] add log for xalarm when sending msg and clean invalid client
|
||||
socket
|
||||
|
||||
---
|
||||
src/python/xalarm/xalarm_transfer.py | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
diff --git a/src/python/xalarm/xalarm_transfer.py b/src/python/xalarm/xalarm_transfer.py
|
||||
index 42137d8..9e867cc 100644
|
||||
--- a/src/python/xalarm/xalarm_transfer.py
|
||||
+++ b/src/python/xalarm/xalarm_transfer.py
|
||||
@@ -117,4 +117,5 @@ def transmit_alarm(server_sock, epoll, fd_to_socket, bin_data):
|
||||
epoll.unregister(fileno)
|
||||
fd_to_socket[fileno].close()
|
||||
del fd_to_socket[fileno]
|
||||
+ logging.info(f"cleaned up connection {fileno} for client lost connection.")
|
||||
|
||||
--
|
||||
2.27.0
|
||||
|
||||
|
||||
@ -1,522 +0,0 @@
|
||||
From c1ab550a3f817826ac6f279de97e6d3820901275 Mon Sep 17 00:00:00 2001
|
||||
From: gaoruoshu <gaoruoshu@huawei.com>
|
||||
Date: Fri, 27 Sep 2024 14:10:18 +0800
|
||||
Subject: [PATCH] add log level and change log format
|
||||
|
||||
---
|
||||
config/collector.conf | 5 ++-
|
||||
config/inspect.conf | 5 ++-
|
||||
config/plugins/avg_block_io.ini | 5 ++-
|
||||
config/xalarm.conf | 3 ++
|
||||
src/python/sentryCollector/collect_config.py | 29 ++++++++++++++++
|
||||
src/python/sentryCollector/collect_io.py | 15 ++-------
|
||||
src/python/sentryCollector/collect_plugin.py | 32 +++++++++---------
|
||||
src/python/sentryCollector/collectd.py | 6 ++--
|
||||
.../avg_block_io/avg_block_io.py | 7 ++--
|
||||
.../sentryPlugins/avg_block_io/utils.py | 32 ++++++++++++++++++
|
||||
src/python/syssentry/sentry_config.py | 28 ++++++++++++++++
|
||||
src/python/syssentry/syssentry.py | 7 ++--
|
||||
src/python/xalarm/xalarm_config.py | 33 +++++++++++++++++--
|
||||
src/python/xalarm/xalarm_daemon.py | 7 ++--
|
||||
14 files changed, 172 insertions(+), 42 deletions(-)
|
||||
|
||||
diff --git a/config/collector.conf b/config/collector.conf
|
||||
index 9baa086..56b0ed1 100644
|
||||
--- a/config/collector.conf
|
||||
+++ b/config/collector.conf
|
||||
@@ -4,4 +4,7 @@ modules=io
|
||||
[io]
|
||||
period_time=1
|
||||
max_save=10
|
||||
-disk=default
|
||||
\ No newline at end of file
|
||||
+disk=default
|
||||
+
|
||||
+[log]
|
||||
+level=info
|
||||
\ No newline at end of file
|
||||
diff --git a/config/inspect.conf b/config/inspect.conf
|
||||
index 071cca1..f451d9e 100644
|
||||
--- a/config/inspect.conf
|
||||
+++ b/config/inspect.conf
|
||||
@@ -1,2 +1,5 @@
|
||||
[inspect]
|
||||
-Interval=3
|
||||
\ No newline at end of file
|
||||
+Interval=3
|
||||
+
|
||||
+[log]
|
||||
+level=info
|
||||
diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini
|
||||
index bc33dde..858db18 100644
|
||||
--- a/config/plugins/avg_block_io.ini
|
||||
+++ b/config/plugins/avg_block_io.ini
|
||||
@@ -1,8 +1,11 @@
|
||||
+[log]
|
||||
+level=info
|
||||
+
|
||||
[common]
|
||||
disk=default
|
||||
stage=default
|
||||
iotype=read,write
|
||||
-period_time=1
|
||||
+period_time=1
|
||||
|
||||
[algorithm]
|
||||
win_size=30
|
||||
diff --git a/config/xalarm.conf b/config/xalarm.conf
|
||||
index 14c6d39..323d2dd 100644
|
||||
--- a/config/xalarm.conf
|
||||
+++ b/config/xalarm.conf
|
||||
@@ -1,2 +1,5 @@
|
||||
[filter]
|
||||
id_mask = 1001-1128
|
||||
+
|
||||
+[log]
|
||||
+level=info
|
||||
diff --git a/src/python/sentryCollector/collect_config.py b/src/python/sentryCollector/collect_config.py
|
||||
index 0fdd9f0..5aa38ec 100644
|
||||
--- a/src/python/sentryCollector/collect_config.py
|
||||
+++ b/src/python/sentryCollector/collect_config.py
|
||||
@@ -32,6 +32,35 @@ CONF_IO_PERIOD_TIME_DEFAULT = 1
|
||||
CONF_IO_MAX_SAVE_DEFAULT = 10
|
||||
CONF_IO_DISK_DEFAULT = "default"
|
||||
|
||||
+# log
|
||||
+CONF_LOG = 'log'
|
||||
+CONF_LOG_LEVEL = 'level'
|
||||
+LogLevel = {
|
||||
+ "debug": logging.DEBUG,
|
||||
+ "info": logging.INFO,
|
||||
+ "warning": logging.WARNING,
|
||||
+ "error": logging.ERROR,
|
||||
+ "critical": logging.CRITICAL
|
||||
+}
|
||||
+
|
||||
+
|
||||
+def get_log_level(filename=COLLECT_CONF_PATH):
|
||||
+ if not os.path.exists(filename):
|
||||
+ return logging.INFO
|
||||
+
|
||||
+ try:
|
||||
+ config = configparser.ConfigParser()
|
||||
+ config.read(filename)
|
||||
+ if not config.has_option(CONF_LOG, CONF_LOG_LEVEL):
|
||||
+ return logging.INFO
|
||||
+ log_level = config.get(CONF_LOG, CONF_LOG_LEVEL)
|
||||
+ if log_level.lower() in LogLevel:
|
||||
+ return LogLevel.get(log_level.lower())
|
||||
+ return logging.INFO
|
||||
+ except configparser.Error:
|
||||
+ return logging.INFO
|
||||
+
|
||||
+
|
||||
class CollectConfig:
|
||||
def __init__(self, filename=COLLECT_CONF_PATH):
|
||||
|
||||
diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py
|
||||
index 9c8dae7..019d174 100644
|
||||
--- a/src/python/sentryCollector/collect_io.py
|
||||
+++ b/src/python/sentryCollector/collect_io.py
|
||||
@@ -163,18 +163,6 @@ class CollectIo():
|
||||
logging.error("An error occurred2: %s", e)
|
||||
return column_names
|
||||
|
||||
- def task_loop(self):
|
||||
- if self.stop_event.is_set():
|
||||
- logging.info("collect io thread exit")
|
||||
- return
|
||||
-
|
||||
- for disk_name, stage_list in self.disk_map_stage.items():
|
||||
- if self.get_blk_io_hierarchy(disk_name, stage_list) < 0:
|
||||
- continue
|
||||
- self.append_period_lat(disk_name, stage_list)
|
||||
-
|
||||
- threading.Timer(self.period_time, self.task_loop).start()
|
||||
-
|
||||
def is_kernel_avaliable(self):
|
||||
base_path = '/sys/kernel/debug/block'
|
||||
all_disk = []
|
||||
@@ -191,6 +179,9 @@ class CollectIo():
|
||||
if file_name == 'stats':
|
||||
all_disk.append(disk_name)
|
||||
|
||||
+ if self.loop_all:
|
||||
+ self.disk_list = all_disk
|
||||
+
|
||||
for disk_name in self.disk_list:
|
||||
if not self.loop_all and disk_name not in all_disk:
|
||||
logging.warning("the %s disk not exist!", disk_name)
|
||||
diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py
|
||||
index 1faa5e3..3e2cf4c 100644
|
||||
--- a/src/python/sentryCollector/collect_plugin.py
|
||||
+++ b/src/python/sentryCollector/collect_plugin.py
|
||||
@@ -75,14 +75,14 @@ def client_send_and_recv(request_data, data_str_len, protocol):
|
||||
try:
|
||||
client_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
||||
except socket.error:
|
||||
- print("collect_plugin: client create socket error")
|
||||
+ logging.error("collect_plugin: client create socket error")
|
||||
return None
|
||||
|
||||
try:
|
||||
client_socket.connect(COLLECT_SOCKET_PATH)
|
||||
except OSError:
|
||||
client_socket.close()
|
||||
- print("collect_plugin: client connect error")
|
||||
+ logging.error("collect_plugin: client connect error")
|
||||
return None
|
||||
|
||||
req_data_len = len(request_data)
|
||||
@@ -94,23 +94,23 @@ def client_send_and_recv(request_data, data_str_len, protocol):
|
||||
res_data = res_data.decode()
|
||||
except (OSError, UnicodeError):
|
||||
client_socket.close()
|
||||
- print("collect_plugin: client communicate error")
|
||||
+ logging.error("collect_plugin: client communicate error")
|
||||
return None
|
||||
|
||||
res_magic = res_data[:CLT_MSG_MAGIC_LEN]
|
||||
if res_magic != "RES":
|
||||
- print("res msg format error")
|
||||
+ logging.error("res msg format error")
|
||||
return None
|
||||
|
||||
protocol_str = res_data[CLT_MSG_MAGIC_LEN:CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN]
|
||||
try:
|
||||
protocol_id = int(protocol_str)
|
||||
except ValueError:
|
||||
- print("recv msg protocol id is invalid %s", protocol_str)
|
||||
+ logging.error("recv msg protocol id is invalid %s", protocol_str)
|
||||
return None
|
||||
|
||||
if protocol_id >= ClientProtocol.PRO_END:
|
||||
- print("protocol id is invalid")
|
||||
+ logging.error("protocol id is invalid")
|
||||
return None
|
||||
|
||||
try:
|
||||
@@ -119,7 +119,7 @@ def client_send_and_recv(request_data, data_str_len, protocol):
|
||||
res_msg_data = res_msg_data.decode()
|
||||
return res_msg_data
|
||||
except (OSError, ValueError, UnicodeError):
|
||||
- print("collect_plugin: client recv res msg error")
|
||||
+ logging.error("collect_plugin: client recv res msg error")
|
||||
finally:
|
||||
client_socket.close()
|
||||
|
||||
@@ -128,30 +128,30 @@ def client_send_and_recv(request_data, data_str_len, protocol):
|
||||
def validate_parameters(param, len_limit, char_limit):
|
||||
ret = ResultMessage.RESULT_SUCCEED
|
||||
if not param:
|
||||
- print("param is invalid")
|
||||
+ logging.error("param is invalid, param = %s", param)
|
||||
ret = ResultMessage.RESULT_NOT_PARAM
|
||||
return [False, ret]
|
||||
|
||||
if not isinstance(param, list):
|
||||
- print(f"{param} is not list type.")
|
||||
+ logging.error("%s is not list type.", param)
|
||||
ret = ResultMessage.RESULT_NOT_PARAM
|
||||
return [False, ret]
|
||||
|
||||
if len(param) <= 0:
|
||||
- print(f"{param} length is 0.")
|
||||
+ logging.error("%s length is 0.", param)
|
||||
ret = ResultMessage.RESULT_INVALID_LENGTH
|
||||
return [False, ret]
|
||||
|
||||
pattern = r'^[a-zA-Z0-9_-]+$'
|
||||
for info in param:
|
||||
if not re.match(pattern, info):
|
||||
- print(f"{info} is invalid char")
|
||||
+ logging.error("%s is invalid char", info)
|
||||
ret = ResultMessage.RESULT_INVALID_CHAR
|
||||
return [False, ret]
|
||||
|
||||
# length of len_limit is exceeded, keep len_limit
|
||||
if len(param) > len_limit:
|
||||
- print(f"{param} length more than {len_limit}, keep the first {len_limit}")
|
||||
+ logging.error("%s length more than %d, keep the first %d", param, len_limit, len_limit)
|
||||
param[:] = param[0:len_limit]
|
||||
|
||||
# only keep elements under the char_limit length
|
||||
@@ -202,13 +202,13 @@ def inter_is_iocollect_valid(period, disk_list=None, stage=None):
|
||||
request_message = json.dumps(req_msg_struct)
|
||||
result_message = client_send_and_recv(request_message, CLT_MSG_LEN_LEN, ClientProtocol.IS_IOCOLLECT_VALID)
|
||||
if not result_message:
|
||||
- print("collect_plugin: client_send_and_recv failed")
|
||||
+ logging.error("collect_plugin: client_send_and_recv failed")
|
||||
return result
|
||||
|
||||
try:
|
||||
json.loads(result_message)
|
||||
except json.JSONDecodeError:
|
||||
- print("is_iocollect_valid: json decode error")
|
||||
+ logging.error("is_iocollect_valid: json decode error")
|
||||
result['ret'] = ResultMessage.RESULT_PARSE_FAILED
|
||||
return result
|
||||
|
||||
@@ -260,12 +260,12 @@ def inter_get_io_data(period, disk_list, stage, iotype):
|
||||
request_message = json.dumps(req_msg_struct)
|
||||
result_message = client_send_and_recv(request_message, CLT_MSG_LEN_LEN, ClientProtocol.GET_IO_DATA)
|
||||
if not result_message:
|
||||
- print("collect_plugin: client_send_and_recv failed")
|
||||
+ logging.error("collect_plugin: client_send_and_recv failed")
|
||||
return result
|
||||
try:
|
||||
json.loads(result_message)
|
||||
except json.JSONDecodeError:
|
||||
- print("get_io_data: json decode error")
|
||||
+ logging.error("get_io_data: json decode error")
|
||||
result['ret'] = ResultMessage.RESULT_PARSE_FAILED
|
||||
return result
|
||||
|
||||
diff --git a/src/python/sentryCollector/collectd.py b/src/python/sentryCollector/collectd.py
|
||||
index d9d8862..33f4b04 100644
|
||||
--- a/src/python/sentryCollector/collectd.py
|
||||
+++ b/src/python/sentryCollector/collectd.py
|
||||
@@ -26,7 +26,7 @@ import threading
|
||||
|
||||
from .collect_io import CollectIo
|
||||
from .collect_server import CollectServer
|
||||
-from .collect_config import CollectConfig
|
||||
+from .collect_config import CollectConfig, get_log_level
|
||||
|
||||
SENTRY_RUN_DIR = "/var/run/sysSentry"
|
||||
COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock"
|
||||
@@ -57,7 +57,9 @@ def main():
|
||||
os.mkdir(SENTRY_RUN_DIR)
|
||||
os.chmod(SENTRY_RUN_DIR, mode=SENTRY_RUN_DIR_PERM)
|
||||
|
||||
- logging.basicConfig(filename=COLLECT_LOG_FILE, level=logging.INFO)
|
||||
+ log_level = get_log_level()
|
||||
+ log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
|
||||
+ logging.basicConfig(filename=COLLECT_LOG_FILE, level=log_level, format=log_format)
|
||||
os.chmod(COLLECT_LOG_FILE, 0o600)
|
||||
|
||||
try:
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
index ac35be2..b6b3b28 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
@@ -15,7 +15,7 @@ import time
|
||||
|
||||
from .stage_window import IoWindow, IoDumpWindow
|
||||
from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler
|
||||
-from .utils import update_avg_and_check_abnormal
|
||||
+from .utils import update_avg_and_check_abnormal, get_log_level
|
||||
|
||||
CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini"
|
||||
|
||||
@@ -283,7 +283,10 @@ def main():
|
||||
signal.signal(signal.SIGINT, sig_handler)
|
||||
signal.signal(signal.SIGTERM, sig_handler)
|
||||
|
||||
- logging.basicConfig(level=logging.INFO)
|
||||
+ log_level = get_log_level(CONFIG_FILE)
|
||||
+ log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
|
||||
+
|
||||
+ logging.basicConfig(level=log_level, format=log_format)
|
||||
|
||||
# 初始化配置读取
|
||||
config = configparser.ConfigParser(comment_prefixes=('#', ';'))
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py
|
||||
index 54ed080..2de9a46 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/utils.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/utils.py
|
||||
@@ -8,9 +8,41 @@
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
|
||||
# PURPOSE.
|
||||
# See the Mulan PSL v2 for more details.
|
||||
+import configparser
|
||||
+import logging
|
||||
+import os
|
||||
+
|
||||
AVG_VALUE = 0
|
||||
AVG_COUNT = 1
|
||||
|
||||
+CONF_LOG = 'log'
|
||||
+CONF_LOG_LEVEL = 'level'
|
||||
+LogLevel = {
|
||||
+ "debug": logging.DEBUG,
|
||||
+ "info": logging.INFO,
|
||||
+ "warning": logging.WARNING,
|
||||
+ "error": logging.ERROR,
|
||||
+ "critical": logging.CRITICAL
|
||||
+}
|
||||
+
|
||||
+
|
||||
+def get_log_level(filename):
|
||||
+ if not os.path.exists(filename):
|
||||
+ return logging.INFO
|
||||
+
|
||||
+ try:
|
||||
+ config = configparser.ConfigParser()
|
||||
+ config.read(filename)
|
||||
+ if not config.has_option(CONF_LOG, CONF_LOG_LEVEL):
|
||||
+ return logging.INFO
|
||||
+ log_level = config.get(CONF_LOG, CONF_LOG_LEVEL)
|
||||
+
|
||||
+ if log_level.lower() in LogLevel:
|
||||
+ return LogLevel.get(log_level.lower())
|
||||
+ return logging.INFO
|
||||
+ except configparser.Error:
|
||||
+ return logging.INFO
|
||||
+
|
||||
|
||||
def get_nested_value(data, keys):
|
||||
"""get data from nested dict"""
|
||||
diff --git a/src/python/syssentry/sentry_config.py b/src/python/syssentry/sentry_config.py
|
||||
index a0e7b79..1169887 100644
|
||||
--- a/src/python/syssentry/sentry_config.py
|
||||
+++ b/src/python/syssentry/sentry_config.py
|
||||
@@ -21,6 +21,34 @@ import sys
|
||||
DEFAULT_INSPECT_DELAY = 3
|
||||
INSPECT_CONF_PATH = "/etc/sysSentry/inspect.conf"
|
||||
|
||||
+CONF_LOG = 'log'
|
||||
+CONF_LOG_LEVEL = 'level'
|
||||
+LogLevel = {
|
||||
+ "debug": logging.DEBUG,
|
||||
+ "info": logging.INFO,
|
||||
+ "warning": logging.WARNING,
|
||||
+ "error": logging.ERROR,
|
||||
+ "critical": logging.CRITICAL
|
||||
+}
|
||||
+
|
||||
+
|
||||
+def get_log_level(filename=INSPECT_CONF_PATH):
|
||||
+ if not os.path.exists(filename):
|
||||
+ return logging.INFO
|
||||
+
|
||||
+ try:
|
||||
+ config = configparser.ConfigParser()
|
||||
+ config.read(filename)
|
||||
+ if not config.has_option(CONF_LOG, CONF_LOG_LEVEL):
|
||||
+ return logging.INFO
|
||||
+ log_level = config.get(CONF_LOG, CONF_LOG_LEVEL)
|
||||
+
|
||||
+ if log_level.lower() in LogLevel:
|
||||
+ return LogLevel.get(log_level.lower())
|
||||
+ return logging.INFO
|
||||
+ except configparser.Error:
|
||||
+ return logging.INFO
|
||||
+
|
||||
|
||||
class SentryConfig:
|
||||
"""
|
||||
diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py
|
||||
index 776971f..9ef0203 100644
|
||||
--- a/src/python/syssentry/syssentry.py
|
||||
+++ b/src/python/syssentry/syssentry.py
|
||||
@@ -23,7 +23,7 @@ import fcntl
|
||||
|
||||
import select
|
||||
|
||||
-from .sentry_config import SentryConfig
|
||||
+from .sentry_config import SentryConfig, get_log_level
|
||||
|
||||
from .task_map import TasksMap
|
||||
from .global_values import SENTRY_RUN_DIR, CTL_SOCKET_PATH, SENTRY_RUN_DIR_PERM
|
||||
@@ -563,7 +563,10 @@ def main():
|
||||
os.mkdir(SENTRY_RUN_DIR)
|
||||
os.chmod(SENTRY_RUN_DIR, mode=SENTRY_RUN_DIR_PERM)
|
||||
|
||||
- logging.basicConfig(filename=SYSSENTRY_LOG_FILE, level=logging.INFO)
|
||||
+ log_level = get_log_level()
|
||||
+ log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
|
||||
+
|
||||
+ logging.basicConfig(filename=SYSSENTRY_LOG_FILE, level=log_level, format=log_format)
|
||||
os.chmod(SYSSENTRY_LOG_FILE, 0o600)
|
||||
|
||||
if not chk_and_set_pidfile():
|
||||
diff --git a/src/python/xalarm/xalarm_config.py b/src/python/xalarm/xalarm_config.py
|
||||
index 8e56d10..754a816 100644
|
||||
--- a/src/python/xalarm/xalarm_config.py
|
||||
+++ b/src/python/xalarm/xalarm_config.py
|
||||
@@ -15,9 +15,10 @@ Create: 2023-11-02
|
||||
"""
|
||||
|
||||
import re
|
||||
+import os
|
||||
import dataclasses
|
||||
import logging
|
||||
-from configparser import ConfigParser
|
||||
+import configparser
|
||||
|
||||
|
||||
MAIN_CONFIG_PATH = '/etc/sysSentry/xalarm.conf'
|
||||
@@ -27,6 +28,34 @@ MIN_ID_NUMBER = 1001
|
||||
MAX_ID_NUMBER = 1128
|
||||
MAX_ID_MASK_CAPACITY = 128
|
||||
|
||||
+# log
|
||||
+CONF_LOG = 'log'
|
||||
+CONF_LOG_LEVEL = 'level'
|
||||
+LogLevel = {
|
||||
+ "debug": logging.DEBUG,
|
||||
+ "info": logging.INFO,
|
||||
+ "warning": logging.WARNING,
|
||||
+ "error": logging.ERROR,
|
||||
+ "critical": logging.CRITICAL
|
||||
+}
|
||||
+
|
||||
+
|
||||
+def get_log_level(filename=MAIN_CONFIG_PATH):
|
||||
+ if not os.path.exists(filename):
|
||||
+ return logging.INFO
|
||||
+
|
||||
+ try:
|
||||
+ config = configparser.ConfigParser()
|
||||
+ config.read(filename)
|
||||
+ if not config.has_option(CONF_LOG, CONF_LOG_LEVEL):
|
||||
+ return logging.INFO
|
||||
+ log_level = config.get(CONF_LOG, CONF_LOG_LEVEL)
|
||||
+ if log_level.lower() in LogLevel:
|
||||
+ return LogLevel.get(log_level.lower())
|
||||
+ return logging.INFO
|
||||
+ except configparser.Error:
|
||||
+ return logging.INFO
|
||||
+
|
||||
|
||||
@dataclasses.dataclass
|
||||
class AlarmConfig:
|
||||
@@ -106,7 +135,7 @@ def config_init():
|
||||
"""
|
||||
alarm_config = AlarmConfig()
|
||||
|
||||
- cfg = ConfigParser()
|
||||
+ cfg = configparser.ConfigParser()
|
||||
cfg.read(MAIN_CONFIG_PATH)
|
||||
|
||||
id_mask = parse_id_mask(cfg)
|
||||
diff --git a/src/python/xalarm/xalarm_daemon.py b/src/python/xalarm/xalarm_daemon.py
|
||||
index 00e8886..3ab211c 100644
|
||||
--- a/src/python/xalarm/xalarm_daemon.py
|
||||
+++ b/src/python/xalarm/xalarm_daemon.py
|
||||
@@ -21,7 +21,7 @@ import signal
|
||||
import fcntl
|
||||
import socket
|
||||
|
||||
-from .xalarm_config import config_init
|
||||
+from .xalarm_config import config_init, get_log_level
|
||||
from .xalarm_server import server_loop, SOCK_FILE
|
||||
|
||||
ALARM_DIR = "/var/run/xalarm"
|
||||
@@ -120,9 +120,10 @@ def alarm_process_create():
|
||||
os.mkdir(ALARM_DIR)
|
||||
os.chmod(ALARM_DIR, ALARM_DIR_PERMISSION)
|
||||
|
||||
+ log_level = get_log_level()
|
||||
+ log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
|
||||
|
||||
- logging.basicConfig(filename=ALARM_LOGFILE, level=logging.INFO,
|
||||
- format='%(asctime)s|%(levelname)s| %(message)s')
|
||||
+ logging.basicConfig(filename=ALARM_LOGFILE, level=log_level, format=log_format)
|
||||
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
|
||||
--
|
||||
2.23.0
|
||||
|
||||
@ -1,104 +0,0 @@
|
||||
From 0a4bd4097690bee7250676a0c262a830c7a8fbcf Mon Sep 17 00:00:00 2001
|
||||
From: jinsaihang <jinsaihang@h-partners.com>
|
||||
Date: Fri, 11 Oct 2024 15:35:43 +0800
|
||||
Subject: [PATCH] add parameter time_range ,alarm_id and alarm_clear_time
|
||||
validation
|
||||
|
||||
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
|
||||
---
|
||||
sysSentry-1.0.2/src/python/syssentry/alarm.py | 19 +++++++++++++++++++
|
||||
.../src/python/syssentry/load_mods.py | 6 ++----
|
||||
.../src/python/syssentry/sentryctl | 4 +++-
|
||||
3 files changed, 24 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
|
||||
index d5337d3..43c1065 100644
|
||||
--- a/src/python/syssentry/alarm.py
|
||||
+++ b/src/python/syssentry/alarm.py
|
||||
@@ -18,6 +18,7 @@ from datetime import datetime
|
||||
import time
|
||||
import logging
|
||||
import json
|
||||
+import sys
|
||||
|
||||
from xalarm.register_xalarm import xalarm_register,xalarm_getid,xalarm_getlevel,xalarm_gettype,xalarm_gettime,xalarm_getdesc
|
||||
from xalarm.xalarm_api import Xalarm
|
||||
@@ -41,9 +42,15 @@ id_base = 1001
|
||||
clientId = -1
|
||||
|
||||
MILLISECONDS_UNIT_SECONDS = 1000
|
||||
+MAX_NUM_OF_ALARM_ID = 128
|
||||
+MIN_ALARM_ID = 1001
|
||||
+MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1)
|
||||
|
||||
def update_alarm_list(alarm_info: Xalarm):
|
||||
alarm_id = xalarm_getid(alarm_info)
|
||||
+ if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID:
|
||||
+ logging.warnning(f"Invalid alarm_id {alarm_id}")
|
||||
+ return
|
||||
timestamp = xalarm_gettime(alarm_info)
|
||||
if not timestamp:
|
||||
logging.error("Retrieve timestamp failed")
|
||||
@@ -77,7 +84,19 @@ def alarm_register():
|
||||
logging.info(f"alarm_register: {task_name} is registered")
|
||||
task = TasksMap.tasks_dict[task_type][task_name]
|
||||
alarm_id = task.alarm_id
|
||||
+ if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID:
|
||||
+ logging.warnning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm")
|
||||
+ continue
|
||||
alarm_clear_time = task.alarm_clear_time
|
||||
+ try:
|
||||
+ alarm_clear_time = int(alarm_clear_time)
|
||||
+ if alarm_clear_time <= 0:
|
||||
+ raise ValueError("Not a positive integer")
|
||||
+ if alarm_clear_time > sys.maxsize:
|
||||
+ raise ValueError("Exceeds maximum value for int")
|
||||
+ except (ValueError, OverflowError, TypeError) as e:
|
||||
+ logging.warnning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm")
|
||||
+ continue
|
||||
alarm_list_dict[alarm_id] = []
|
||||
task_alarm_id_dict[task_name] = alarm_id
|
||||
if alarm_id not in alarm_id_clear_time_dict:
|
||||
diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py
|
||||
index ae05e57..7daf17d 100644
|
||||
--- a/src/python/syssentry/load_mods.py
|
||||
+++ b/src/python/syssentry/load_mods.py
|
||||
@@ -203,11 +203,9 @@ def parse_mod_conf(mod_name, mod_conf):
|
||||
if not (MIN_ALARM_ID <= task.alarm_id <= MAX_ALARM_ID):
|
||||
raise ValueError("Invalid alarm_id")
|
||||
except ValueError:
|
||||
- task.alarm_id = -1
|
||||
- logging.warning("Invalid alarm_id, set to -1")
|
||||
+ logging.warning("Invalid alarm_id")
|
||||
except configparser.NoOptionError:
|
||||
- task.alarm_id = -1
|
||||
- logging.warning("Unset alarm_id and alarm_clear_time, use -1 and 15s as default")
|
||||
+ logging.warning("Unset alarm_clear_time, use 15s as default")
|
||||
|
||||
if CONF_ONSTART in mod_conf.options(CONF_TASK):
|
||||
is_onstart = (mod_conf.get(CONF_TASK, CONF_ONSTART) == 'yes')
|
||||
diff --git a/src/python/syssentry/sentryctl b/src/python/syssentry/sentryctl
|
||||
index 3de93d0..c2e3cef 100644
|
||||
--- a/src/python/syssentry/sentryctl
|
||||
+++ b/src/python/syssentry/sentryctl
|
||||
@@ -136,7 +136,7 @@ if __name__ == '__main__':
|
||||
parser_get_result.add_argument('task_name')
|
||||
parser_get_alarm = subparsers.add_parser('get_alarm', help='get task alarm')
|
||||
parser_get_alarm.add_argument('task_name')
|
||||
- parser_get_alarm.add_argument('-s', '--time_range', type=str, default=DEFAULT_ALARM_TIME_RANGE, help='Specified time range')
|
||||
+ parser_get_alarm.add_argument('-s', '--time_range', type=int, default=DEFAULT_ALARM_TIME_RANGE, help='Specified time range')
|
||||
parser_get_alarm.add_argument('-d', '--detailed', action='store_true', help='Print Detailed Information')
|
||||
parser_list = subparsers.add_parser('list', help='show all loaded task mod')
|
||||
|
||||
@@ -153,6 +153,8 @@ if __name__ == '__main__':
|
||||
elif client_args.cmd_type == 'get_result':
|
||||
req_msg_struct = {"type": "get_result", "data": client_args.task_name}
|
||||
elif client_args.cmd_type == 'get_alarm':
|
||||
+ if not isinstance(client_args.time_range, int) or client_args.time_range <= 0:
|
||||
+ print(f"time_range is not a positive integer: {client_args.time_range}")
|
||||
req_msg_struct = {
|
||||
"type": "get_alarm",
|
||||
"data": {
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,678 +0,0 @@
|
||||
From a18ea2e94fef78334a56dce1ea3f67ee649732f3 Mon Sep 17 00:00:00 2001
|
||||
From: PshySimon <caixiaomeng2@huawei.com>
|
||||
Date: Thu, 26 Sep 2024 16:12:25 +0800
|
||||
Subject: [PATCH] add pyxalarm and pySentryNotify, add multi users support for
|
||||
xalarmd and adapt libxalarm
|
||||
|
||||
---
|
||||
src/libso/xalarm/register_xalarm.c | 41 ++----
|
||||
src/libso/xalarm/register_xalarm.h | 10 +-
|
||||
src/python/xalarm/register_xalarm.py | 192 +++++++++++++++++++++++++++
|
||||
src/python/xalarm/sentry_notify.py | 71 ++++++++++
|
||||
src/python/xalarm/xalarm_api.py | 18 ++-
|
||||
src/python/xalarm/xalarm_server.py | 40 +++++-
|
||||
src/python/xalarm/xalarm_transfer.py | 96 ++++++++++++--
|
||||
7 files changed, 408 insertions(+), 60 deletions(-)
|
||||
create mode 100644 src/python/xalarm/register_xalarm.py
|
||||
create mode 100644 src/python/xalarm/sentry_notify.py
|
||||
|
||||
diff --git a/src/libso/xalarm/register_xalarm.c b/src/libso/xalarm/register_xalarm.c
|
||||
index 152c078..21a419f 100644
|
||||
--- a/src/libso/xalarm/register_xalarm.c
|
||||
+++ b/src/libso/xalarm/register_xalarm.c
|
||||
@@ -35,7 +35,7 @@
|
||||
#define ALARM_SOCKET_PERMISSION 0700
|
||||
#define TIME_UNIT_MILLISECONDS 1000
|
||||
|
||||
-#define MAX_PARAS_LEN 511
|
||||
+#define MAX_PARAS_LEN 1023
|
||||
#define MIN_ALARM_ID 1001
|
||||
#define MAX_ALARM_ID (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1)
|
||||
|
||||
@@ -91,7 +91,7 @@ static int create_unix_socket(const char *path)
|
||||
return -1;
|
||||
}
|
||||
|
||||
- fd = socket(AF_UNIX, SOCK_DGRAM, 0);
|
||||
+ fd = socket(AF_UNIX, SOCK_STREAM, 0);
|
||||
if (fd < 0) {
|
||||
printf("socket failed:%s\n", strerror(errno));
|
||||
return -1;
|
||||
@@ -103,14 +103,6 @@ static int create_unix_socket(const char *path)
|
||||
goto release_socket;
|
||||
}
|
||||
|
||||
- if (access(PATH_REG_ALARM, F_OK) == 0) {
|
||||
- ret = unlink(PATH_REG_ALARM);
|
||||
- if (ret != 0) {
|
||||
- printf("unlink register socket file failed\n");
|
||||
- goto release_socket;
|
||||
- }
|
||||
- }
|
||||
-
|
||||
if (access(DIR_XALARM, F_OK) == -1) {
|
||||
if (mkdir(DIR_XALARM, ALARM_DIR_PERMISSION) == -1) {
|
||||
printf("mkdir %s failed\n", DIR_XALARM);
|
||||
@@ -120,32 +112,22 @@ static int create_unix_socket(const char *path)
|
||||
|
||||
if (memset(&alarm_addr, 0, sizeof(alarm_addr)) == NULL) {
|
||||
printf("create_unix_socket: memset alarm_addr failed, ret: %d\n", ret);
|
||||
- goto remove_dir;
|
||||
+ goto release_socket;
|
||||
}
|
||||
alarm_addr.sun_family = AF_UNIX;
|
||||
strncpy(alarm_addr.sun_path, path, sizeof(alarm_addr.sun_path) - 1);
|
||||
|
||||
- if (bind(fd, (struct sockaddr *)&alarm_addr, sizeof(alarm_addr.sun_family) + strlen(alarm_addr.sun_path)) < 0) {
|
||||
- printf("bind socket failed:%s\n", strerror(errno));
|
||||
- goto remove_dir;
|
||||
+ if (connect(fd, (struct sockaddr*)&alarm_addr, sizeof(alarm_addr)) == -1) {
|
||||
+ printf("create_unix_socket: connect alarm_addr failed, ret: %d\n", ret);
|
||||
+ goto release_socket;
|
||||
}
|
||||
if (chmod(path, ALARM_SOCKET_PERMISSION) < 0) {
|
||||
printf("chmod %s failed: %s\n", path, strerror(errno));
|
||||
- goto unlink_sockfile;
|
||||
+ goto release_socket;
|
||||
}
|
||||
|
||||
return fd;
|
||||
|
||||
-unlink_sockfile:
|
||||
- ret = unlink(PATH_REG_ALARM);
|
||||
- if (ret != 0) {
|
||||
- printf("unlink register socket file failed\n");
|
||||
- }
|
||||
-remove_dir:
|
||||
- ret = rmdir(DIR_XALARM);
|
||||
- if (ret != 0) {
|
||||
- printf("rmdir %s failed: %s\n", path, strerror(errno));
|
||||
- }
|
||||
release_socket:
|
||||
(void)close(fd);
|
||||
|
||||
@@ -271,8 +253,6 @@ int xalarm_Register(alarm_callback_func callback, struct alarm_subscription_info
|
||||
|
||||
void xalarm_UnRegister(int client_id)
|
||||
{
|
||||
- int ret;
|
||||
-
|
||||
if (!g_register_info.is_registered) {
|
||||
printf("%s: alarm has not registered\n", __func__);
|
||||
return;
|
||||
@@ -292,10 +272,6 @@ void xalarm_UnRegister(int client_id)
|
||||
if (g_register_info.register_fd != -1) {
|
||||
(void)close(g_register_info.register_fd);
|
||||
g_register_info.register_fd = -1;
|
||||
- ret = unlink(PATH_REG_ALARM);
|
||||
- if (ret != 0) {
|
||||
- printf("%s: unlink register socket file failed\n", __func__);
|
||||
- }
|
||||
}
|
||||
|
||||
memset(g_register_info.alarm_enable_bitmap, 0, MAX_NUM_OF_ALARM_ID * sizeof(char));
|
||||
@@ -357,7 +333,7 @@ int xalarm_Report(unsigned short usAlarmId, unsigned char ucAlarmLevel,
|
||||
struct sockaddr_un alarm_addr;
|
||||
|
||||
if ((usAlarmId < MIN_ALARM_ID || usAlarmId > MAX_ALARM_ID) ||
|
||||
- (ucAlarmLevel < ALARM_LEVEL_FATAL || ucAlarmLevel > ALARM_LEVEL_DEBUG) ||
|
||||
+ (ucAlarmLevel < MINOR_ALM || ucAlarmLevel > CRITICAL_ALM) ||
|
||||
(ucAlarmType < ALARM_TYPE_OCCUR || ucAlarmType > ALARM_TYPE_RECOVER)) {
|
||||
fprintf(stderr, "%s: alarm info invalid\n", __func__);
|
||||
return -1;
|
||||
@@ -666,3 +642,4 @@ int report_result(const char *task_name, enum RESULT_LEVEL result_level, const c
|
||||
return RETURE_CODE_SUCCESS;
|
||||
}
|
||||
|
||||
+
|
||||
diff --git a/src/libso/xalarm/register_xalarm.h b/src/libso/xalarm/register_xalarm.h
|
||||
index 1f26c6a..fef9482 100644
|
||||
--- a/src/libso/xalarm/register_xalarm.h
|
||||
+++ b/src/libso/xalarm/register_xalarm.h
|
||||
@@ -11,7 +11,7 @@
|
||||
#include <sys/time.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
-#define ALARM_INFO_MAX_PARAS_LEN 512
|
||||
+#define ALARM_INFO_MAX_PARAS_LEN 1024
|
||||
#define MAX_STRERROR_SIZE 1024
|
||||
#define MAX_ALARM_TYEPS 1024
|
||||
#define MIN_ALARM_ID 1001
|
||||
@@ -19,11 +19,9 @@
|
||||
|
||||
#define MEMORY_ALARM_ID 1001
|
||||
|
||||
-#define ALARM_LEVEL_FATAL 1
|
||||
-#define ALARM_LEVEL_ERROR 2
|
||||
-#define ALARM_LEVEL_WARNING 3
|
||||
-#define ALARM_LEVEL_INFO 4
|
||||
-#define ALARM_LEVEL_DEBUG 5
|
||||
+#define MINOR_ALM 1
|
||||
+#define MAJOR_ALM 2
|
||||
+#define CRITICAL_ALM 3
|
||||
|
||||
#define ALARM_TYPE_OCCUR 1
|
||||
#define ALARM_TYPE_RECOVER 2
|
||||
diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py
|
||||
new file mode 100644
|
||||
index 0000000..e58343d
|
||||
--- /dev/null
|
||||
+++ b/src/python/xalarm/register_xalarm.py
|
||||
@@ -0,0 +1,192 @@
|
||||
+import os
|
||||
+import sys
|
||||
+import socket
|
||||
+import logging
|
||||
+import threading
|
||||
+import time
|
||||
+import fcntl
|
||||
+import inspect
|
||||
+from struct import error as StructParseError
|
||||
+
|
||||
+from .xalarm_api import Xalarm, alarm_bin2stu
|
||||
+
|
||||
+
|
||||
+ALARM_REPORT_LEN = 1048
|
||||
+MAX_NUM_OF_ALARM_ID=128
|
||||
+MIN_ALARM_ID = 1001
|
||||
+MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1)
|
||||
+DIR_XALARM = "/var/run/xalarm"
|
||||
+PATH_REG_ALARM = "/var/run/xalarm/alarm"
|
||||
+PATH_REPORT_ALARM = "/var/run/xalarm/report"
|
||||
+ALARM_DIR_PERMISSION = 0o0750
|
||||
+ALARM_REG_SOCK_PERMISSION = 0o0700
|
||||
+ALARM_SOCKET_PERMISSION = 0o0700
|
||||
+TIME_UNIT_MILLISECONDS = 1000
|
||||
+ALARM_REGISTER_INFO = None
|
||||
+
|
||||
+
|
||||
+class AlarmRegister:
|
||||
+ def __init__(self, id_filter: list[bool], callback: callable):
|
||||
+ self.id_filter = id_filter
|
||||
+ self.callback = callback
|
||||
+ self.socket = self.create_unix_socket()
|
||||
+ self.is_registered = True
|
||||
+ self.thread = threading.Thread(target=self.alarm_recv)
|
||||
+ self.thread_should_stop = False
|
||||
+
|
||||
+ def check_params(self) -> bool:
|
||||
+ if (len(self.id_filter) != MAX_NUM_OF_ALARM_ID):
|
||||
+ sys.stderr.write("check_params: invalid param id_filter\n")
|
||||
+ return False
|
||||
+
|
||||
+ sig = inspect.signature(self.callback)
|
||||
+ if len(sig.parameters) != 1:
|
||||
+ sys.stderr.write("check_params: invalid param callback\n")
|
||||
+ return False
|
||||
+
|
||||
+ if self.socket is None:
|
||||
+ sys.stderr.write("check_params: scoket create failed\n")
|
||||
+ return False
|
||||
+ return True
|
||||
+
|
||||
+ def set_id_filter(self, id_filter: list[bool]) -> bool:
|
||||
+ if (len(id_filter) > MAX_NUM_OF_ALARM_ID):
|
||||
+ sys.stderr.write("set_id_filter: invalid param id_filter\n")
|
||||
+ return False
|
||||
+ self.id_filter = id_filter
|
||||
+
|
||||
+ def id_is_registered(self, alarm_id) -> bool:
|
||||
+ if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID:
|
||||
+ return False
|
||||
+ return self.id_filter[alarm_id - MIN_ALARM_ID]
|
||||
+
|
||||
+ def put_alarm_info(self, alarm_info: Xalarm) -> None:
|
||||
+ if not self.callback or not alarm_info:
|
||||
+ return
|
||||
+ if not self.id_is_registered(alarm_info.alarm_id):
|
||||
+ return
|
||||
+ self.callback(alarm_info)
|
||||
+
|
||||
+ def create_unix_socket(self) -> socket.socket:
|
||||
+ try:
|
||||
+ sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
||||
+ sock.setblocking(False)
|
||||
+
|
||||
+ if not os.access(DIR_XALARM, os.F_OK):
|
||||
+ os.makedirs(DIR_XALARM)
|
||||
+ os.chmod(DIR_XALARM, ALARM_DIR_PERMISSION)
|
||||
+
|
||||
+ sock.connect(PATH_REG_ALARM)
|
||||
+ return sock
|
||||
+ except (IOError, OSError, FileNotFoundError) as e:
|
||||
+ sock.close()
|
||||
+ sys.stderr.write(f"create_unix_socket: create socket error:{e}\n")
|
||||
+ return None
|
||||
+
|
||||
+ def alarm_recv(self):
|
||||
+ while not self.thread_should_stop:
|
||||
+ try:
|
||||
+ data = self.socket.recv(ALARM_REPORT_LEN)
|
||||
+ if not data:
|
||||
+ sys.stderr.write("connection closed by xalarmd, maybe connections reach max num or service stopped.\n")
|
||||
+ self.thread_should_stop = True
|
||||
+ break
|
||||
+ if len(data) != ALARM_REPORT_LEN:
|
||||
+ sys.stderr.write(f"server receive report msg length wrong {len(data)}\n")
|
||||
+ continue
|
||||
+
|
||||
+ alarm_info = alarm_bin2stu(data)
|
||||
+ self.put_alarm_info(alarm_info)
|
||||
+ except (BlockingIOError) as e:
|
||||
+ time.sleep(0.1)
|
||||
+ except (ConnectionResetError, ConnectionAbortedError, BrokenPipeError):
|
||||
+ sys.stderr.write("Connection closed by the server.\n")
|
||||
+ self.thread_should_stop = True
|
||||
+ except (ValueError, StructParseError, InterruptedError) as e:
|
||||
+ sys.stderr.write(f"{e}\n")
|
||||
+ except Exception as e:
|
||||
+ sys.stderr.write(f"{e}\n")
|
||||
+ self.thread_should_stop = True
|
||||
+
|
||||
+ def start_thread(self) -> None:
|
||||
+ self.thread.daemon = True
|
||||
+ self.thread.start()
|
||||
+
|
||||
+ def stop_thread(self) -> None:
|
||||
+ self.thread_should_stop = True
|
||||
+ self.thread.join()
|
||||
+ self.socket.close()
|
||||
+
|
||||
+
|
||||
+def xalarm_register(callback: callable, id_filter: list[bool]) -> int:
|
||||
+ global ALARM_REGISTER_INFO
|
||||
+
|
||||
+ if ALARM_REGISTER_INFO is not None:
|
||||
+ sys.stderr.write("xalarm_register: alarm has registered\n")
|
||||
+ return -1
|
||||
+
|
||||
+ ALARM_REGISTER_INFO = AlarmRegister(id_filter, callback)
|
||||
+ if not ALARM_REGISTER_INFO.check_params():
|
||||
+ return -1
|
||||
+
|
||||
+ ALARM_REGISTER_INFO.start_thread()
|
||||
+
|
||||
+ return 0
|
||||
+
|
||||
+
|
||||
+def xalarm_unregister(clientId: int) -> None:
|
||||
+ global ALARM_REGISTER_INFO
|
||||
+ if clientId < 0:
|
||||
+ sys.stderr.write("xalarm_unregister: invalid client\n")
|
||||
+ return
|
||||
+
|
||||
+ if ALARM_REGISTER_INFO is None:
|
||||
+ sys.stderr.write("xalarm_unregister: alarm has not registered\n")
|
||||
+ return
|
||||
+
|
||||
+ ALARM_REGISTER_INFO.stop_thread()
|
||||
+ ALARM_REGISTER_INFO = None
|
||||
+
|
||||
+
|
||||
+def xalarm_upgrade(clientId: int, id_filter: list[bool]) -> None:
|
||||
+ global ALARM_REGISTER_INFO
|
||||
+ if clientId < 0:
|
||||
+ sys.stderr.write("xalarm_unregister: invalid client\n")
|
||||
+ return
|
||||
+ if ALARM_REGISTER_INFO is None:
|
||||
+ sys.stderr.write("xalarm_unregister: alarm has not registered\n")
|
||||
+ return
|
||||
+ ALARM_REGISTER_INFO.id_filter = id_filter
|
||||
+
|
||||
+
|
||||
+def xalarm_getid(alarm_info: Xalarm) -> int:
|
||||
+ if not alarm_info:
|
||||
+ return 0
|
||||
+ return alarm_info.alarm_id
|
||||
+
|
||||
+
|
||||
+def xalarm_getlevel(alarm_info: Xalarm) -> int:
|
||||
+ if not alarm_info:
|
||||
+ return 0
|
||||
+ return alarm_info.alarm_level
|
||||
+
|
||||
+
|
||||
+def xalarm_gettype(alarm_info: Xalarm) -> int:
|
||||
+ if not alarm_info:
|
||||
+ return 0
|
||||
+ return alarm_info.alarm_type
|
||||
+
|
||||
+
|
||||
+def xalarm_gettime(alarm_info: Xalarm) -> int:
|
||||
+ if not alarm_info:
|
||||
+ return 0
|
||||
+ return alarm_info.timetamp.tv_sec * TIME_UNIT_MILLISECONDS + alarm_info.timetamp.tv_usec / TIME_UNIT_MILLISECONDS
|
||||
+
|
||||
+def xalarm_getdesc(alarm_info: Xalarm) -> str:
|
||||
+ if not alarm_info:
|
||||
+ return None
|
||||
+ try:
|
||||
+ desc_str = alarm_info.msg1.rstrip(b'\x00').decode('utf-8')
|
||||
+ except UnicodeError:
|
||||
+ desc_str = None
|
||||
+ return desc_str
|
||||
diff --git a/src/python/xalarm/sentry_notify.py b/src/python/xalarm/sentry_notify.py
|
||||
new file mode 100644
|
||||
index 0000000..a19e5b3
|
||||
--- /dev/null
|
||||
+++ b/src/python/xalarm/sentry_notify.py
|
||||
@@ -0,0 +1,71 @@
|
||||
+import os
|
||||
+import sys
|
||||
+import time
|
||||
+import socket
|
||||
+from struct import error as StructParseError
|
||||
+
|
||||
+from .xalarm_api import alarm_stu2bin, Xalarm
|
||||
+
|
||||
+MAX_NUM_OF_ALARM_ID = 128
|
||||
+MIN_ALARM_ID = 1001
|
||||
+MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1)
|
||||
+
|
||||
+MINOR_ALM = 1
|
||||
+MAJOR_ALM = 2
|
||||
+CRITICAL_ALM = 3
|
||||
+
|
||||
+ALARM_TYPE_OCCUR = 1
|
||||
+ALARM_TYPE_RECOVER = 2
|
||||
+
|
||||
+MAX_PUC_PARAS_LEN = 1024
|
||||
+
|
||||
+DIR_XALARM = "/var/run/xalarm"
|
||||
+PATH_REPORT_ALARM = "/var/run/xalarm/report"
|
||||
+ALARM_DIR_PERMISSION = 0o750
|
||||
+ALARM_SOCKET_PERMISSION = 0o700
|
||||
+
|
||||
+
|
||||
+def check_params(alarm_id, alarm_level, alarm_type, puc_paras) -> bool:
|
||||
+ if not os.path.exists(DIR_XALARM):
|
||||
+ sys.stderr.write(f"check_params: {DIR_XALARM} not exist, failed")
|
||||
+ return False
|
||||
+
|
||||
+ if not os.path.exists(PATH_REPORT_ALARM):
|
||||
+ sys.stderr.write(f"check_params: {PATH_REPORT_ALARM} not exist, failed")
|
||||
+ return False
|
||||
+
|
||||
+ if (alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID or
|
||||
+ alarm_level < MINOR_ALM or alarm_level > CRITICAL_ALM or
|
||||
+ alarm_type < ALARM_TYPE_OCCUR or alarm_type > ALARM_TYPE_RECOVER):
|
||||
+ sys.stderr.write("check_params: alarm info invalid\n")
|
||||
+ return False
|
||||
+
|
||||
+ if len(puc_paras) >= MAX_PUC_PARAS_LEN:
|
||||
+ sys.stderr.write(f"check_params: alarm msg should be less than {MAX_PUC_PARAS_LEN}\n")
|
||||
+ return False
|
||||
+
|
||||
+ return True
|
||||
+
|
||||
+def xalarm_report(alarm_id, alarm_level, alarm_type, puc_paras) -> bool:
|
||||
+ if not check_params(alarm_id, alarm_level, alarm_type, puc_paras):
|
||||
+ return False
|
||||
+
|
||||
+ try:
|
||||
+ sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
|
||||
+
|
||||
+ current_time = time.time()
|
||||
+ current_time_seconds = int(current_time)
|
||||
+ current_microseconds = int((current_time - current_time_seconds) * 1_000_000)
|
||||
+ alarm_info = Xalarm(alarm_id, alarm_type, alarm_level,
|
||||
+ current_time_seconds, current_microseconds, puc_paras)
|
||||
+
|
||||
+ sock.sendto(alarm_stu2bin(alarm_info), PATH_REPORT_ALARM)
|
||||
+ except (FileNotFoundError, StructParseError, socket.error, OSError, UnicodeError) as e:
|
||||
+ sys.stderr.write(f"check_params: error occurs when sending msg.{e}\n")
|
||||
+ return False
|
||||
+ finally:
|
||||
+ sock.close()
|
||||
+
|
||||
+ return True
|
||||
+
|
||||
+
|
||||
diff --git a/src/python/xalarm/xalarm_api.py b/src/python/xalarm/xalarm_api.py
|
||||
index 94d7638..99eabf5 100644
|
||||
--- a/src/python/xalarm/xalarm_api.py
|
||||
+++ b/src/python/xalarm/xalarm_api.py
|
||||
@@ -23,6 +23,7 @@ ALARM_LEVELS = (1, 2, 3, 4, 5)
|
||||
ALARM_SOCK_PATH = "/var/run/xalarm/report"
|
||||
MIN_ALARM_ID = 1001
|
||||
MAX_ALARM_ID = 1128
|
||||
+MAX_MSG_LEN = 1024
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
@@ -97,15 +98,15 @@ class Xalarm:
|
||||
def msg1(self, msg):
|
||||
"""msg1 setter
|
||||
"""
|
||||
- if len(msg) > 512:
|
||||
- raise ValueError("msg1 length must below 255")
|
||||
+ if len(msg) > MAX_MSG_LEN:
|
||||
+ raise ValueError(f"msg1 length must below {MAX_MSG_LEN}")
|
||||
self._msg1 = msg
|
||||
|
||||
|
||||
def alarm_bin2stu(bin_data):
|
||||
"""alarm binary to struct
|
||||
"""
|
||||
- struct_data = struct.unpack("@HBBll512s", bin_data)
|
||||
+ struct_data = struct.unpack(f"@HBBll{MAX_MSG_LEN}s", bin_data)
|
||||
|
||||
alarm_info = Xalarm(1001, 2, 1, 0, 0, "")
|
||||
alarm_info.alarm_id = struct_data[0]
|
||||
@@ -116,3 +117,14 @@ def alarm_bin2stu(bin_data):
|
||||
alarm_info.msg1 = struct_data[5]
|
||||
|
||||
return alarm_info
|
||||
+
|
||||
+
|
||||
+def alarm_stu2bin(alarm_info: Xalarm):
|
||||
+ return struct.pack(
|
||||
+ f'@HBBll{MAX_MSG_LEN}s',
|
||||
+ alarm_info.alarm_id,
|
||||
+ alarm_info.alarm_level,
|
||||
+ alarm_info.alarm_type,
|
||||
+ alarm_info.timetamp.tv_sec,
|
||||
+ alarm_info.timetamp.tv_usec,
|
||||
+ alarm_info.msg1.encode('utf-8'))
|
||||
diff --git a/src/python/xalarm/xalarm_server.py b/src/python/xalarm/xalarm_server.py
|
||||
index 84db273..fcaf393 100644
|
||||
--- a/src/python/xalarm/xalarm_server.py
|
||||
+++ b/src/python/xalarm/xalarm_server.py
|
||||
@@ -17,16 +17,20 @@ Create: 2023-11-02
|
||||
import socket
|
||||
import os
|
||||
import logging
|
||||
+import select
|
||||
+import threading
|
||||
from struct import error as StructParseError
|
||||
|
||||
from .xalarm_api import alarm_bin2stu
|
||||
-from .xalarm_transfer import check_filter, transmit_alarm
|
||||
+from .xalarm_transfer import check_filter, transmit_alarm, wait_for_connection
|
||||
|
||||
|
||||
ALARM_DIR = "/var/run/xalarm"
|
||||
+USER_RECV_SOCK = "/var/run/xalarm/alarm"
|
||||
SOCK_FILE = "/var/run/xalarm/report"
|
||||
-ALARM_REPORT_LEN = 536
|
||||
+ALARM_REPORT_LEN = 1048
|
||||
ALARM_DIR_PERMISSION = 0o750
|
||||
+ALARM_LISTEN_QUEUE_LEN = 5
|
||||
|
||||
|
||||
def clear_sock_path():
|
||||
@@ -37,6 +41,8 @@ def clear_sock_path():
|
||||
os.chmod(ALARM_DIR, ALARM_DIR_PERMISSION)
|
||||
if os.path.exists(SOCK_FILE):
|
||||
os.unlink(SOCK_FILE)
|
||||
+ if os.path.exists(USER_RECV_SOCK):
|
||||
+ os.unlink(USER_RECV_SOCK)
|
||||
|
||||
|
||||
def server_loop(alarm_config):
|
||||
@@ -49,6 +55,21 @@ def server_loop(alarm_config):
|
||||
sock.bind(SOCK_FILE)
|
||||
os.chmod(SOCK_FILE, 0o600)
|
||||
|
||||
+ alarm_sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
||||
+ alarm_sock.bind(USER_RECV_SOCK)
|
||||
+ os.chmod(USER_RECV_SOCK, 0o600)
|
||||
+ alarm_sock.listen(ALARM_LISTEN_QUEUE_LEN)
|
||||
+ alarm_sock.setblocking(False)
|
||||
+
|
||||
+ epoll = select.epoll()
|
||||
+ epoll.register(alarm_sock.fileno(), select.EPOLLIN)
|
||||
+ fd_to_socket = {alarm_sock.fileno(): alarm_sock,}
|
||||
+ thread_should_stop = False
|
||||
+
|
||||
+ thread = threading.Thread(target=wait_for_connection, args=(alarm_sock, epoll, fd_to_socket, thread_should_stop))
|
||||
+ thread.daemon = True
|
||||
+ thread.start()
|
||||
+
|
||||
while True:
|
||||
try:
|
||||
data, _ = sock.recvfrom(ALARM_REPORT_LEN)
|
||||
@@ -58,14 +79,21 @@ def server_loop(alarm_config):
|
||||
logging.debug("server receive report msg length wrong %d",
|
||||
len(data))
|
||||
continue
|
||||
-
|
||||
alarm_info = alarm_bin2stu(data)
|
||||
logging.debug("server bin2stu msg")
|
||||
if not check_filter(alarm_info, alarm_config):
|
||||
continue
|
||||
+ transmit_alarm(alarm_sock, epoll, fd_to_socket, data)
|
||||
+ except Exception as e:
|
||||
+ logging.error(f"Error server:{e}")
|
||||
+
|
||||
+ thread_should_stop = True
|
||||
+ thread.join()
|
||||
|
||||
- transmit_alarm(data)
|
||||
- except (ValueError, StructParseError):
|
||||
- pass
|
||||
+ epoll.unregister(alarm_sock.fileno())
|
||||
+ epoll.close()
|
||||
+ alarm_sock.close()
|
||||
+ os.unlink(USER_RECV_SOCK)
|
||||
|
||||
sock.close()
|
||||
+
|
||||
diff --git a/src/python/xalarm/xalarm_transfer.py b/src/python/xalarm/xalarm_transfer.py
|
||||
index b590b43..42137d8 100644
|
||||
--- a/src/python/xalarm/xalarm_transfer.py
|
||||
+++ b/src/python/xalarm/xalarm_transfer.py
|
||||
@@ -16,10 +16,12 @@ Create: 2023-11-02
|
||||
|
||||
import socket
|
||||
import logging
|
||||
+import select
|
||||
|
||||
-USER_RECV_SOCK = "/var/run/xalarm/alarm"
|
||||
MIN_ID_NUMBER = 1001
|
||||
MAX_ID_NUMBER = 1128
|
||||
+MAX_CONNECTION_NUM = 100
|
||||
+TEST_CONNECT_BUFFER_SIZE = 32
|
||||
|
||||
|
||||
def check_filter(alarm_info, alarm_filter):
|
||||
@@ -35,16 +37,84 @@ def check_filter(alarm_info, alarm_filter):
|
||||
return True
|
||||
|
||||
|
||||
-def transmit_alarm(bin_data):
|
||||
- """forward alarm message
|
||||
+def cleanup_closed_connections(server_sock, epoll, fd_to_socket):
|
||||
"""
|
||||
- sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
|
||||
- try:
|
||||
- sock.sendto(bin_data, USER_RECV_SOCK)
|
||||
- logging.debug("transfer alarm success")
|
||||
- except ConnectionRefusedError:
|
||||
- logging.debug("transfer sendto failed")
|
||||
- except FileNotFoundError:
|
||||
- logging.debug("transfer sendto failed")
|
||||
- finally:
|
||||
- sock.close()
|
||||
+ clean invalid client socket connections saved in 'fd_to_socket'
|
||||
+ :param server_sock: server socket instance of alarm
|
||||
+ :param epoll: epoll instance, used to unregister invalid client connections
|
||||
+ :param fd_to_socket: dict instance, used to hold client connections and server connections
|
||||
+ """
|
||||
+ to_remove = []
|
||||
+ for fileno, connection in fd_to_socket.items():
|
||||
+ if connection is server_sock:
|
||||
+ continue
|
||||
+ try:
|
||||
+ # test whether connection still alive, use MSG_DONTWAIT to avoid blocking thread
|
||||
+ # use MSG_PEEK to avoid consuming buffer data
|
||||
+ data = connection.recv(TEST_CONNECT_BUFFER_SIZE, socket.MSG_DONTWAIT | socket.MSG_PEEK)
|
||||
+ if not data:
|
||||
+ to_remove.append(fileno)
|
||||
+ except BlockingIOError:
|
||||
+ pass
|
||||
+ except (ConnectionResetError, ConnectionAbortedError, BrokenPipeError):
|
||||
+ to_remove.append(fileno)
|
||||
+
|
||||
+ for fileno in to_remove:
|
||||
+ epoll.unregister(fileno)
|
||||
+ fd_to_socket[fileno].close()
|
||||
+ del fd_to_socket[fileno]
|
||||
+ logging.info(f"cleaned up connection {fileno} for client lost connection.")
|
||||
+
|
||||
+
|
||||
+def wait_for_connection(server_sock, epoll, fd_to_socket, thread_should_stop):
|
||||
+ """
|
||||
+ thread function for catch and save client connection
|
||||
+ :param server_sock: server socket instance of alarm
|
||||
+ :param epoll: epoll instance, used to unregister invalid client connections
|
||||
+ :param fd_to_socket: dict instance, used to hold client connections and server connections
|
||||
+ :param thread_should_stop: bool instance
|
||||
+ """
|
||||
+ while not thread_should_stop:
|
||||
+ try:
|
||||
+ events = epoll.poll(1)
|
||||
+
|
||||
+ for fileno, event in events:
|
||||
+ if fileno == server_sock.fileno():
|
||||
+ connection, client_address = server_sock.accept()
|
||||
+ # if reach max connection, cleanup closed connections
|
||||
+ if len(fd_to_socket) - 1 >= MAX_CONNECTION_NUM:
|
||||
+ cleanup_closed_connections(server_sock, epoll, fd_to_socket)
|
||||
+ # if connections still reach max num, close this connection automatically
|
||||
+ if len(fd_to_socket) - 1 >= MAX_CONNECTION_NUM:
|
||||
+ logging.info(f"connection reach max num of {MAX_CONNECTION_NUM}, closed current connection!")
|
||||
+ connection.close()
|
||||
+ continue
|
||||
+ epoll.register(connection.fileno(), select.EPOLLOUT)
|
||||
+ fd_to_socket[connection.fileno()] = connection
|
||||
+ except socket.error as e:
|
||||
+ logging.debug(f"socket error, reason is {e}")
|
||||
+ break
|
||||
+ except (KeyError, OSError, ValueError) as e:
|
||||
+ logging.debug(f"wait for connection failed {e}")
|
||||
+
|
||||
+
|
||||
+def transmit_alarm(server_sock, epoll, fd_to_socket, bin_data):
|
||||
+ """
|
||||
+ this function is to broadcast alarm data to client, if fail to send data, remove connections held by fd_to_socket
|
||||
+ :param server_sock: server socket instance of alarm
|
||||
+ :param epoll: epoll instance, used to unregister invalid client connections
|
||||
+ :param fd_to_socket: dict instance, used to hold client connections and server connections
|
||||
+ :param bin_data: binary instance, alarm info data in C-style struct format defined in xalarm_api.py
|
||||
+ """
|
||||
+ to_remove = []
|
||||
+ for fileno, connection in fd_to_socket.items():
|
||||
+ if connection is not server_sock:
|
||||
+ try:
|
||||
+ connection.sendall(bin_data)
|
||||
+ except (BrokenPipeError, ConnectionResetError):
|
||||
+ to_remove.append(fileno)
|
||||
+ for fileno in to_remove:
|
||||
+ epoll.unregister(fileno)
|
||||
+ fd_to_socket[fileno].close()
|
||||
+ del fd_to_socket[fileno]
|
||||
+
|
||||
--
|
||||
2.27.0
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,438 +0,0 @@
|
||||
From 8fa9389a85763831ea85d94f179a305d7f95d585 Mon Sep 17 00:00:00 2001
|
||||
From: jinsaihang <jinsaihang@h-partners.com>
|
||||
Date: Sun, 29 Sep 2024 02:04:52 +0000
|
||||
Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=91=8A=E8=AD=A6=E4=BA=8B?=
|
||||
=?UTF-8?q?=E4=BB=B6=E6=9F=A5=E8=AF=A2=E5=8A=9F=E8=83=BD=EF=BC=9Asentryctl?=
|
||||
=?UTF-8?q?=20get=5Falarm=20<module=5Fname>=20-s=20<time=5Frange>=20-d?=
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
|
||||
---
|
||||
src/python/syssentry/alarm.py | 142 ++++++++++++++++++
|
||||
.../src/python/syssentry/callbacks.py | 17 +++
|
||||
.../src/python/syssentry/global_values.py | 4 +
|
||||
.../src/python/syssentry/load_mods.py | 16 ++
|
||||
.../src/python/syssentry/sentryctl | 20 ++-
|
||||
.../src/python/syssentry/syssentry.py | 13 +-
|
||||
.../src/python/syssentry/task_map.py | 5 +-
|
||||
7 files changed, 212 insertions(+), 5 deletions(-)
|
||||
create mode 100644 src/python/syssentry/alarm.py
|
||||
|
||||
diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
|
||||
new file mode 100644
|
||||
index 0000000..74a2716
|
||||
--- /dev/null
|
||||
+++ b/src/python/syssentry/alarm.py
|
||||
@@ -0,0 +1,142 @@
|
||||
+# coding: utf-8
|
||||
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
|
||||
+# sysSentry is licensed under the Mulan PSL v2.
|
||||
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
||||
+# You may obtain a copy of Mulan PSL v2 at:
|
||||
+# http://license.coscl.org.cn/MulanPSL2
|
||||
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
|
||||
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
|
||||
+# PURPOSE.
|
||||
+# See the Mulan PSL v2 for more details.
|
||||
+
|
||||
+"""
|
||||
+use for report alarm
|
||||
+"""
|
||||
+import threading
|
||||
+from typing import Dict, List
|
||||
+from datetime import datetime
|
||||
+import time
|
||||
+import logging
|
||||
+import json
|
||||
+
|
||||
+from xalarm.register_xalarm import xalarm_register,xalarm_getid,xalarm_getlevel,xalarm_gettype,xalarm_gettime,xalarm_getdesc
|
||||
+from xalarm.xalarm_api import Xalarm
|
||||
+
|
||||
+from .global_values import InspectTask
|
||||
+from .task_map import TasksMap
|
||||
+
|
||||
+# 告警ID映射字典,key为插件名,value为告警ID(类型为数字)
|
||||
+task_alarm_id_dict: Dict[str, int] = {}
|
||||
+
|
||||
+# 告警老化时间字典,key为告警ID,value为老化时间(类型为数字,单位为秒)
|
||||
+alarm_id_clear_time_dict: Dict[int, int] = {}
|
||||
+
|
||||
+# 告警事件列表,key为告警ID,value为告警ID对应的告警事件列表(类型为list)
|
||||
+alarm_list_dict: Dict[int, List[Xalarm]] = {}
|
||||
+# 告警事件列表锁
|
||||
+alarm_list_lock = threading.Lock()
|
||||
+
|
||||
+id_filter = []
|
||||
+id_base = 1001
|
||||
+clientId = -1
|
||||
+
|
||||
+MILLISECONDS_UNIT_SECONDS = 1000
|
||||
+
|
||||
+def update_alarm_list(alarm_info: Xalarm):
|
||||
+ alarm_id = xalarm_getid(alarm_info)
|
||||
+ timestamp = xalarm_gettime(alarm_info)
|
||||
+ if not timestamp:
|
||||
+ logging.error("Retrieve timestamp failed")
|
||||
+ return
|
||||
+ alarm_list_lock.acquire()
|
||||
+ try:
|
||||
+ # new alarm is inserted into list head
|
||||
+ if alarm_id not in alarm_list_dict:
|
||||
+ logging.warning(f"update_alarm_list: alarm_id {alarm_id} not found in alarm_list_dict")
|
||||
+ return
|
||||
+ alarm_list = alarm_list_dict[alarm_id]
|
||||
+
|
||||
+ alarm_list.insert(0, alarm_info)
|
||||
+ # clear alarm_info older than clear time threshold
|
||||
+ clear_index = -1
|
||||
+ clear_time = alarm_id_clear_time_dict[alarm_id]
|
||||
+ for i in range(len(alarm_list)):
|
||||
+ if (timestamp - xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > clear_time:
|
||||
+ clear_index = i
|
||||
+ break
|
||||
+ if clear_index >= 0:
|
||||
+ alarm_list_dict[alarm_id] = alarm_list[:clear_index]
|
||||
+ finally:
|
||||
+ alarm_list_lock.release()
|
||||
+
|
||||
+def alarm_register():
|
||||
+ logging.debug(f"alarm_register: enter")
|
||||
+ # 初始化告警ID映射字典、告警老化时间字典
|
||||
+ for task_type in TasksMap.tasks_dict:
|
||||
+ for task_name in TasksMap.tasks_dict[task_type]:
|
||||
+ logging.info(f"alarm_register: {task_name} is registered")
|
||||
+ task = TasksMap.tasks_dict[task_type][task_name]
|
||||
+ alarm_id = task.alarm_id
|
||||
+ alarm_clear_time = task.alarm_clear_time
|
||||
+ alarm_list_dict[alarm_id] = []
|
||||
+ task_alarm_id_dict[task_name] = alarm_id
|
||||
+ if alarm_id not in alarm_id_clear_time_dict:
|
||||
+ alarm_id_clear_time_dict[alarm_id] = alarm_clear_time
|
||||
+ else:
|
||||
+ alarm_id_clear_time_dict[alarm_id] = max(alarm_clear_time, alarm_id_clear_time_dict[alarm_id])
|
||||
+ # 注册告警回调
|
||||
+ id_filter = [True] * 128
|
||||
+ clientId = xalarm_register(update_alarm_list, id_filter)
|
||||
+ if clientId < 0:
|
||||
+ logging.info(f'register xalarm: failed')
|
||||
+ return clientId
|
||||
+ logging.info('register xalarm: success')
|
||||
+ return clientId
|
||||
+
|
||||
+def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Dict]:
|
||||
+ alarm_list_lock.acquire()
|
||||
+ try:
|
||||
+ if task_name not in task_alarm_id_dict:
|
||||
+ logging.debug("task_name does not exist")
|
||||
+ return []
|
||||
+ alarm_id = task_alarm_id_dict[task_name]
|
||||
+ if alarm_id not in alarm_list_dict:
|
||||
+ logging.debug("alarm_id does not exist")
|
||||
+ return []
|
||||
+ alarm_list = alarm_list_dict[alarm_id]
|
||||
+ logging.debug(f"get_alarm_result: alarm_list of {alarm_id} has {len(alarm_list)} elements")
|
||||
+ # clear alarm_info older than clear time threshold
|
||||
+ stop_index = -1
|
||||
+ timestamp = int(datetime.now().timestamp())
|
||||
+ for i in range(len(alarm_list)):
|
||||
+ logging.debug(f"timestamp, alarm_list[{i}].timestamp: {timestamp}, {xalarm_gettime(alarm_list[i])}")
|
||||
+ if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > int(time_range):
|
||||
+ stop_index = i
|
||||
+ break
|
||||
+ if stop_index >= 0:
|
||||
+ alarm_list = alarm_list[:stop_index]
|
||||
+ logging.debug(f"get_alarm_result: final alarm_list of {alarm_id} has {len(alarm_list)} elements")
|
||||
+
|
||||
+ def xalarm_to_dict(alarm_info: Xalarm) -> dict:
|
||||
+ return {
|
||||
+ 'alarm_id': xalarm_getid(alarm_info),
|
||||
+ 'alarm_type': xalarm_gettype(alarm_info),
|
||||
+ 'alarm_level': xalarm_getlevel(alarm_info),
|
||||
+ 'timetamp': xalarm_gettime(alarm_info),
|
||||
+ 'msg1': xalarm_getdesc(alarm_info)
|
||||
+ }
|
||||
+
|
||||
+ alarm_list = [xalarm_to_dict(alarm) for alarm in alarm_list]
|
||||
+
|
||||
+ # keep detail
|
||||
+ for alarm in alarm_list:
|
||||
+ alarm_info = alarm['msg1']
|
||||
+ alarm_info = json.loads(alarm_info)
|
||||
+ if not detailed:
|
||||
+ if 'details' in alarm_info:
|
||||
+ alarm_info.pop('details', None)
|
||||
+ alarm.pop('msg1', None)
|
||||
+ alarm['alarm_info'] = alarm_info
|
||||
+ return alarm_list
|
||||
+ finally:
|
||||
+ alarm_list_lock.release()
|
||||
diff --git a/src/python/syssentry/callbacks.py b/src/python/syssentry/callbacks.py
|
||||
index b38b381..6ec2c29 100644
|
||||
--- a/src/python/syssentry/callbacks.py
|
||||
+++ b/src/python/syssentry/callbacks.py
|
||||
@@ -18,6 +18,7 @@ import logging
|
||||
|
||||
from .task_map import TasksMap, ONESHOT_TYPE, PERIOD_TYPE
|
||||
from .mod_status import EXITED_STATUS, RUNNING_STATUS, WAITING_STATUS, set_runtime_status
|
||||
+from .alarm import get_alarm_result
|
||||
|
||||
|
||||
def task_get_status(mod_name):
|
||||
@@ -41,6 +42,22 @@ def task_get_result(mod_name):
|
||||
|
||||
return "success", task.get_result()
|
||||
|
||||
+def task_get_alarm(data):
|
||||
+ """get alarm by mod name"""
|
||||
+ task_name = data['task_name']
|
||||
+ time_range = data['time_range']
|
||||
+ try:
|
||||
+ detailed = data['detailed']
|
||||
+ except KeyError:
|
||||
+ logging.debug("Key 'detailed' does not exist in the dictionary")
|
||||
+ detailed = None
|
||||
+ task = TasksMap.get_task_by_name(task_name)
|
||||
+ if not task:
|
||||
+ return "failed", f"cannot find task by name {task_name}"
|
||||
+ if not task.load_enabled:
|
||||
+ return "failed", f"mod {task_name} is not enabled"
|
||||
+
|
||||
+ return "success", get_alarm_result(task_name, time_range, detailed)
|
||||
|
||||
def task_stop(mod_name):
|
||||
"""stop by mod name"""
|
||||
diff --git a/src/python/syssentry/global_values.py b/src/python/syssentry/global_values.py
|
||||
index 483d544..b123b2d 100644
|
||||
--- a/src/python/syssentry/global_values.py
|
||||
+++ b/src/python/syssentry/global_values.py
|
||||
@@ -27,6 +27,7 @@ CTL_SOCKET_PATH = "/var/run/sysSentry/control.sock"
|
||||
SYSSENTRY_CONF_PATH = "/etc/sysSentry"
|
||||
INSPECT_CONF_PATH = "/etc/sysSentry/inspect.conf"
|
||||
TASK_LOG_DIR = "/var/log/sysSentry"
|
||||
+DEFAULT_ALARM_CLEAR_TIME = 15
|
||||
|
||||
SENTRY_RUN_DIR_PERM = 0o750
|
||||
|
||||
@@ -76,6 +77,9 @@ class InspectTask:
|
||||
self.env_file = ""
|
||||
# start mode
|
||||
self.conflict = "up"
|
||||
+ # alarm id
|
||||
+ self.alarm_id = -1
|
||||
+ self.alarm_clear_time = DEFAULT_ALARM_CLEAR_TIME
|
||||
|
||||
def start(self):
|
||||
"""
|
||||
diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py
|
||||
index 48d7e66..ae05e57 100644
|
||||
--- a/src/python/syssentry/load_mods.py
|
||||
+++ b/src/python/syssentry/load_mods.py
|
||||
@@ -24,6 +24,7 @@ from .task_map import TasksMap, ONESHOT_TYPE, PERIOD_TYPE
|
||||
from .cron_process import PeriodTask
|
||||
from .mod_status import set_task_status
|
||||
|
||||
+from xalarm.register_xalarm import MIN_ALARM_ID, MAX_ALARM_ID
|
||||
ONESHOT_CONF = 'oneshot'
|
||||
PERIOD_CONF = 'period'
|
||||
|
||||
@@ -41,6 +42,8 @@ CONF_TASK_RESTART = 'task_restart'
|
||||
CONF_ONSTART = 'onstart'
|
||||
CONF_ENV_FILE = 'env_file'
|
||||
CONF_CONFLICT = 'conflict'
|
||||
+CONF_ALARM_ID = 'alarm_id'
|
||||
+CONF_ALARM_CLEAR_TIME = 'alarm_clear_time'
|
||||
|
||||
MOD_FILE_SUFFIX = '.mod'
|
||||
MOD_SUFFIX_LEN = 4
|
||||
@@ -194,6 +197,18 @@ def parse_mod_conf(mod_name, mod_conf):
|
||||
task.heartbeat_interval = heartbeat_interval
|
||||
task.load_enabled = is_enabled
|
||||
|
||||
+ try:
|
||||
+ task.alarm_id = int(mod_conf.get(CONF_TASK, CONF_ALARM_ID))
|
||||
+ task.alarm_clear_time = int(mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME))
|
||||
+ if not (MIN_ALARM_ID <= task.alarm_id <= MAX_ALARM_ID):
|
||||
+ raise ValueError("Invalid alarm_id")
|
||||
+ except ValueError:
|
||||
+ task.alarm_id = -1
|
||||
+ logging.warning("Invalid alarm_id, set to -1")
|
||||
+ except configparser.NoOptionError:
|
||||
+ task.alarm_id = -1
|
||||
+ logging.warning("Unset alarm_id and alarm_clear_time, use -1 and 15s as default")
|
||||
+
|
||||
if CONF_ONSTART in mod_conf.options(CONF_TASK):
|
||||
is_onstart = (mod_conf.get(CONF_TASK, CONF_ONSTART) == 'yes')
|
||||
if task_type == PERIOD_CONF:
|
||||
@@ -327,3 +342,4 @@ def reload_single_mod(mod_name):
|
||||
res, ret = reload_mod_by_name(mod_name)
|
||||
|
||||
return res, ret
|
||||
+
|
||||
diff --git a/src/python/syssentry/sentryctl b/src/python/syssentry/sentryctl
|
||||
index e94491f..675c17a 100644
|
||||
--- a/src/python/syssentry/sentryctl
|
||||
+++ b/src/python/syssentry/sentryctl
|
||||
@@ -25,6 +25,7 @@ MAX_PARAM_LENGTH = 256
|
||||
|
||||
RESULT_MSG_DATA_LEN = 4
|
||||
CTL_MSG_LEN_LEN = 3
|
||||
+DEFAULT_ALARM_TIME_RANGE = 10
|
||||
|
||||
def status_output_format(res_data):
|
||||
"""format output"""
|
||||
@@ -57,6 +58,8 @@ def res_output_handle(res_struct, req_type):
|
||||
status_output_format(res_struct['data'])
|
||||
elif req_type == 'get_result':
|
||||
result_output_format(res_struct['data'])
|
||||
+ elif req_type == 'get_alarm':
|
||||
+ result_output_format(res_struct['data'])
|
||||
elif res_struct['ret'] == "failed":
|
||||
print(res_struct['data'])
|
||||
|
||||
@@ -75,6 +78,7 @@ def client_send_and_recv(request_data, data_str_len):
|
||||
print("sentryctl: client creat socket error")
|
||||
return None
|
||||
|
||||
+ # connect to syssentry
|
||||
try:
|
||||
client_socket.connect(CTL_SOCKET_PATH)
|
||||
except OSError:
|
||||
@@ -82,6 +86,7 @@ def client_send_and_recv(request_data, data_str_len):
|
||||
print("sentryctl: client connect error")
|
||||
return None
|
||||
|
||||
+ # msg: CTL{len}{data}
|
||||
req_data_len = len(request_data)
|
||||
request_msg = "CTL" + str(req_data_len).zfill(3) + request_data
|
||||
|
||||
@@ -94,8 +99,8 @@ def client_send_and_recv(request_data, data_str_len):
|
||||
print("sentryctl: client communicate error")
|
||||
return None
|
||||
|
||||
+ # res: RES{len}{data}
|
||||
res_magic = res_data[:3]
|
||||
-
|
||||
if res_magic != "RES":
|
||||
print("res msg format error")
|
||||
return None
|
||||
@@ -128,6 +133,10 @@ if __name__ == '__main__':
|
||||
parser_status.add_argument('task_name')
|
||||
parser_get_result = subparsers.add_parser('get_result', help='get task result')
|
||||
parser_get_result.add_argument('task_name')
|
||||
+ parser_get_alarm = subparsers.add_parser('get_alarm', help='get task alarm')
|
||||
+ parser_get_alarm.add_argument('task_name')
|
||||
+ parser_get_alarm.add_argument('-s', '--time_range', type=str, default=DEFAULT_ALARM_TIME_RANGE, help='Specified time range')
|
||||
+ parser_get_alarm.add_argument('-d', '--detailed', action='store_true', help='Print Detailed Information')
|
||||
parser_list = subparsers.add_parser('list', help='show all loaded task mod')
|
||||
|
||||
client_args = parser.parse_args()
|
||||
@@ -142,6 +151,15 @@ if __name__ == '__main__':
|
||||
req_msg_struct = {"type": "get_status", "data": client_args.task_name}
|
||||
elif client_args.cmd_type == 'get_result':
|
||||
req_msg_struct = {"type": "get_result", "data": client_args.task_name}
|
||||
+ elif client_args.cmd_type == 'get_alarm':
|
||||
+ req_msg_struct = {
|
||||
+ "type": "get_alarm",
|
||||
+ "data": {
|
||||
+ 'task_name': client_args.task_name,
|
||||
+ 'time_range': client_args.time_range,
|
||||
+ 'detailed': client_args.detailed,
|
||||
+ }
|
||||
+ }
|
||||
elif client_args.cmd_type == 'reload':
|
||||
req_msg_struct = {"type": "reload", "data": client_args.task_name}
|
||||
else:
|
||||
diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py
|
||||
index 9ef0203..c2dee85 100644
|
||||
--- a/src/python/syssentry/syssentry.py
|
||||
+++ b/src/python/syssentry/syssentry.py
|
||||
@@ -28,7 +28,7 @@ from .sentry_config import SentryConfig, get_log_level
|
||||
from .task_map import TasksMap
|
||||
from .global_values import SENTRY_RUN_DIR, CTL_SOCKET_PATH, SENTRY_RUN_DIR_PERM
|
||||
from .cron_process import period_tasks_handle
|
||||
-from .callbacks import mod_list_show, task_start, task_get_status, task_stop, task_get_result
|
||||
+from .callbacks import mod_list_show, task_start, task_get_status, task_stop, task_get_result, task_get_alarm
|
||||
from .mod_status import get_task_by_pid, set_runtime_status
|
||||
from .load_mods import load_tasks, reload_single_mod
|
||||
from .heartbeat import (heartbeat_timeout_chk, heartbeat_fd_create,
|
||||
@@ -36,7 +36,11 @@ from .heartbeat import (heartbeat_timeout_chk, heartbeat_fd_create,
|
||||
from .result import RESULT_MSG_HEAD_LEN, RESULT_MSG_MAGIC_LEN, RESULT_MAGIC
|
||||
from .result import RESULT_LEVEL_ERR_MSG_DICT, ResultLevel
|
||||
from .utils import get_current_time_string
|
||||
+from .alarm import alarm_register
|
||||
|
||||
+from xalarm.register_xalarm import xalarm_unregister
|
||||
+
|
||||
+clientId = -1
|
||||
|
||||
CPU_EXIST = True
|
||||
try:
|
||||
@@ -62,6 +66,7 @@ type_func = {
|
||||
'stop': task_stop,
|
||||
'get_status': task_get_status,
|
||||
'get_result': task_get_result,
|
||||
+ 'get_alarm': task_get_alarm,
|
||||
'reload': reload_single_mod
|
||||
}
|
||||
|
||||
@@ -107,11 +112,12 @@ def msg_data_process(msg_data):
|
||||
return "Invaild cmd type"
|
||||
|
||||
cmd_param = data_struct['data']
|
||||
- logging.debug("msg_data_process cmd_type:%s cmd_param:%s", cmd_type, cmd_param)
|
||||
+ logging.debug("msg_data_process cmd_type:%s cmd_param:%s", cmd_type, str(cmd_param))
|
||||
if cmd_type in type_func:
|
||||
ret, res_data = type_func[cmd_type](cmd_param)
|
||||
else:
|
||||
ret, res_data = type_func_void[cmd_type]()
|
||||
+ logging.debug("msg_data_process res_data:%s",str(res_data))
|
||||
res_msg_struct = {"ret": ret, "data": res_data}
|
||||
res_msg = json.dumps(res_msg_struct)
|
||||
|
||||
@@ -584,10 +590,13 @@ def main():
|
||||
_ = SentryConfig.init_param()
|
||||
TasksMap.init_task_map()
|
||||
load_tasks()
|
||||
+ clientId = alarm_register()
|
||||
main_loop()
|
||||
|
||||
except Exception:
|
||||
logging.error('%s', traceback.format_exc())
|
||||
finally:
|
||||
+ if clientId != -1:
|
||||
+ xalarm_unregister(clientId)
|
||||
release_pidfile()
|
||||
|
||||
diff --git a/src/python/syssentry/task_map.py b/src/python/syssentry/task_map.py
|
||||
index 70aa19d..27e97ff 100644
|
||||
--- a/src/python/syssentry/task_map.py
|
||||
+++ b/src/python/syssentry/task_map.py
|
||||
@@ -13,16 +13,16 @@
|
||||
tasks map class and initialize function.
|
||||
"""
|
||||
import logging
|
||||
+from typing import Dict
|
||||
|
||||
ONESHOT_TYPE = "ONESHOT"
|
||||
PERIOD_TYPE = "PERIOD"
|
||||
|
||||
TASKS_MAP = None
|
||||
|
||||
-
|
||||
class TasksMap:
|
||||
"""task map class"""
|
||||
- tasks_dict = {}
|
||||
+ tasks_dict: Dict[str, Dict] = {}
|
||||
|
||||
@classmethod
|
||||
def init_task_map(cls):
|
||||
@@ -65,3 +65,4 @@ class TasksMap:
|
||||
logging.debug("getting task by name: %s", res)
|
||||
break
|
||||
return res
|
||||
+
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,90 +0,0 @@
|
||||
From 4fa9b250f56dc3f4f431fc091e25d8f2558a9bb2 Mon Sep 17 00:00:00 2001
|
||||
From: caixiaomeng <caixiaomeng2@.com>
|
||||
Date: Fri, 11 Oct 2024 18:12:21 +0800
|
||||
Subject: [PATCH] add xalarm cleanup invalid server socket peroidly
|
||||
|
||||
---
|
||||
src/python/xalarm/xalarm_server.py | 20 +++++++++++++++-----
|
||||
src/python/xalarm/xalarm_transfer.py | 8 ++++++++
|
||||
2 files changed, 23 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/src/python/xalarm/xalarm_server.py b/src/python/xalarm/xalarm_server.py
|
||||
index 2882609..f90a0e2 100644
|
||||
--- a/src/python/xalarm/xalarm_server.py
|
||||
+++ b/src/python/xalarm/xalarm_server.py
|
||||
@@ -22,7 +22,12 @@ import threading
|
||||
from struct import error as StructParseError
|
||||
|
||||
from .xalarm_api import alarm_bin2stu
|
||||
-from .xalarm_transfer import check_filter, transmit_alarm, wait_for_connection
|
||||
+from .xalarm_transfer import (
|
||||
+ check_filter,
|
||||
+ transmit_alarm,
|
||||
+ wait_for_connection,
|
||||
+ peroid_task_to_cleanup_connections
|
||||
+)
|
||||
|
||||
|
||||
ALARM_DIR = "/var/run/xalarm"
|
||||
@@ -66,9 +71,13 @@ def server_loop(alarm_config):
|
||||
fd_to_socket = {alarm_sock.fileno(): alarm_sock,}
|
||||
thread_should_stop = False
|
||||
|
||||
- thread = threading.Thread(target=wait_for_connection, args=(alarm_sock, epoll, fd_to_socket, thread_should_stop))
|
||||
- thread.daemon = True
|
||||
- thread.start()
|
||||
+ conn_thread = threading.Thread(target=wait_for_connection, args=(alarm_sock, epoll, fd_to_socket, thread_should_stop))
|
||||
+ conn_thread.daemon = True
|
||||
+ conn_thread.start()
|
||||
+
|
||||
+ cleanup_thread = threading.Thread(target=peroid_task_to_cleanup_connections, args=(alarm_sock, epoll, fd_to_socket, thread_should_stop))
|
||||
+ cleanup_thread.daemon = True
|
||||
+ cleanup_thread.start()
|
||||
|
||||
while True:
|
||||
try:
|
||||
@@ -88,7 +97,8 @@ def server_loop(alarm_config):
|
||||
logging.error(f"Error server:{e}")
|
||||
|
||||
thread_should_stop = True
|
||||
- thread.join()
|
||||
+ conn_thread.join()
|
||||
+ cleanup_thread.join()
|
||||
|
||||
epoll.unregister(alarm_sock.fileno())
|
||||
epoll.close()
|
||||
diff --git a/src/python/xalarm/xalarm_transfer.py b/src/python/xalarm/xalarm_transfer.py
|
||||
index 90dccbc..75807e0 100644
|
||||
--- a/src/python/xalarm/xalarm_transfer.py
|
||||
+++ b/src/python/xalarm/xalarm_transfer.py
|
||||
@@ -17,11 +17,13 @@ Create: 2023-11-02
|
||||
import socket
|
||||
import logging
|
||||
import select
|
||||
+from time import sleep
|
||||
|
||||
MIN_ID_NUMBER = 1001
|
||||
MAX_ID_NUMBER = 1128
|
||||
MAX_CONNECTION_NUM = 100
|
||||
TEST_CONNECT_BUFFER_SIZE = 32
|
||||
+PEROID_SCANN_TIME = 60
|
||||
|
||||
|
||||
def check_filter(alarm_info, alarm_filter):
|
||||
@@ -66,6 +68,12 @@ def cleanup_closed_connections(server_sock, epoll, fd_to_socket):
|
||||
logging.info(f"cleaned up connection {fileno} for client lost connection.")
|
||||
|
||||
|
||||
+def peroid_task_to_cleanup_connections(server_sock, epoll, fd_to_socket, thread_should_stop):
|
||||
+ while not thread_should_stop:
|
||||
+ sleep(PEROID_SCANN_TIME)
|
||||
+ cleanup_closed_connections(server_sock, epoll, fd_to_socket)
|
||||
+
|
||||
+
|
||||
def wait_for_connection(server_sock, epoll, fd_to_socket, thread_should_stop):
|
||||
"""
|
||||
thread function for catch and save client connection
|
||||
--
|
||||
2.27.0
|
||||
|
||||
|
||||
@ -1,221 +0,0 @@
|
||||
From 367f8ab8a5ad26d80caf1bc4529c79d279ef0fb1 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
|
||||
Date: Thu, 10 Oct 2024 17:21:48 +0800
|
||||
Subject: [PATCH] ai_block_io adapt alarm module
|
||||
|
||||
---
|
||||
config/tasks/ai_block_io.mod | 4 +-
|
||||
.../sentryPlugins/ai_block_io/ai_block_io.py | 28 +++++---
|
||||
.../sentryPlugins/ai_block_io/alarm_report.py | 65 ++++++++++++++-----
|
||||
.../sentryPlugins/ai_block_io/data_access.py | 5 +-
|
||||
.../sentryPlugins/ai_block_io/detector.py | 2 +-
|
||||
5 files changed, 73 insertions(+), 31 deletions(-)
|
||||
|
||||
diff --git a/config/tasks/ai_block_io.mod b/config/tasks/ai_block_io.mod
|
||||
index 1971d7d..82f4f0b 100644
|
||||
--- a/config/tasks/ai_block_io.mod
|
||||
+++ b/config/tasks/ai_block_io.mod
|
||||
@@ -2,4 +2,6 @@
|
||||
enabled=yes
|
||||
task_start=/usr/bin/python3 /usr/bin/ai_block_io
|
||||
task_stop=pkill -f /usr/bin/ai_block_io
|
||||
-type=oneshot
|
||||
\ No newline at end of file
|
||||
+type=oneshot
|
||||
+alarm_id=1002
|
||||
+alarm_clear_time=5
|
||||
\ No newline at end of file
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
index 3b00ef3..77104a9 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
@@ -20,14 +20,14 @@ from .utils import get_data_queue_size_and_update_size
|
||||
from .config_parser import ConfigParser
|
||||
from .data_access import get_io_data_from_collect_plug, check_collect_valid
|
||||
from .io_data import MetricName
|
||||
-from .alarm_report import AlarmReport
|
||||
+from .alarm_report import Xalarm, Report
|
||||
|
||||
CONFIG_FILE = "/etc/sysSentry/plugins/ai_block_io.ini"
|
||||
|
||||
|
||||
def sig_handler(signum, frame):
|
||||
logging.info("receive signal: %d", signum)
|
||||
- AlarmReport().report_fail(f"receive signal: {signum}")
|
||||
+ Report.report_pass(f"receive signal: {signum}, exiting...")
|
||||
exit(signum)
|
||||
|
||||
|
||||
@@ -44,6 +44,10 @@ class SlowIODetection:
|
||||
|
||||
def __init_detector_name_list(self):
|
||||
self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency())
|
||||
+ if self._disk_list is None:
|
||||
+ Report.report_pass("get available disk error, please check if the collector plug is enable. exiting...")
|
||||
+ exit(1)
|
||||
+
|
||||
logging.info(f"ai_block_io plug has found disks: {self._disk_list}")
|
||||
disks_to_detection: list = self._config_parser.get_disks_to_detection()
|
||||
# 情况1:None,则启用所有磁盘检测
|
||||
@@ -101,7 +105,8 @@ class SlowIODetection:
|
||||
)
|
||||
logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}')
|
||||
if io_data_dict_with_disk_name is None:
|
||||
- continue
|
||||
+ Report.report_pass("get io data error, please check if the collector plug is enable. exitting...")
|
||||
+ exit(1)
|
||||
|
||||
# Step2:慢IO检测
|
||||
logging.debug('step2. Start to detection slow io event.')
|
||||
@@ -117,13 +122,16 @@ class SlowIODetection:
|
||||
for slow_io_event in slow_io_event_list:
|
||||
metric_name: MetricName = slow_io_event[0]
|
||||
result = slow_io_event[1]
|
||||
- alarm_content = (f"disk {metric_name.get_disk_name()} has slow io event. "
|
||||
- f"stage is: {metric_name.get_stage_name()}, "
|
||||
- f"io access type is: {metric_name.get_io_access_type_name()}, "
|
||||
- f"metric is: {metric_name.get_metric_name()}, "
|
||||
- f"current window is: {result[1]}, "
|
||||
- f"threshold is: {result[2]}")
|
||||
- AlarmReport.report_major_alm(alarm_content)
|
||||
+ alarm_content = {
|
||||
+ "driver_name": f"{metric_name.get_disk_name()}",
|
||||
+ "reason": "disk_slow",
|
||||
+ "block_stack": f"{metric_name.get_stage_name()}",
|
||||
+ "io_type": f"{metric_name.get_io_access_type_name()}",
|
||||
+ "alarm_source": "ai_block_io",
|
||||
+ "alarm_type": "latency",
|
||||
+ "details": f"current window is: {result[1]}, threshold is: {result[2]}.",
|
||||
+ }
|
||||
+ Xalarm.major(alarm_content)
|
||||
logging.warning(alarm_content)
|
||||
|
||||
# Step4:等待检测时间
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/alarm_report.py b/src/python/sentryPlugins/ai_block_io/alarm_report.py
|
||||
index 230c8cd..92bd6e3 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/alarm_report.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/alarm_report.py
|
||||
@@ -9,41 +9,72 @@
|
||||
# PURPOSE.
|
||||
# See the Mulan PSL v2 for more details.
|
||||
|
||||
-from syssentry.result import ResultLevel, report_result
|
||||
import logging
|
||||
import json
|
||||
|
||||
+from xalarm.sentry_notify import (
|
||||
+ xalarm_report,
|
||||
+ MINOR_ALM,
|
||||
+ MAJOR_ALM,
|
||||
+ CRITICAL_ALM,
|
||||
+ ALARM_TYPE_OCCUR,
|
||||
+ ALARM_TYPE_RECOVER,
|
||||
+)
|
||||
+
|
||||
+from syssentry.result import ResultLevel, report_result
|
||||
+
|
||||
|
||||
-class AlarmReport:
|
||||
+class Report:
|
||||
TASK_NAME = "ai_block_io"
|
||||
|
||||
@staticmethod
|
||||
def report_pass(info: str):
|
||||
- report_result(AlarmReport.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info}))
|
||||
- logging.info(f'Report {AlarmReport.TASK_NAME} PASS: {info}')
|
||||
+ report_result(Report.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info}))
|
||||
+ logging.info(f'Report {Report.TASK_NAME} PASS: {info}')
|
||||
|
||||
@staticmethod
|
||||
def report_fail(info: str):
|
||||
- report_result(AlarmReport.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info}))
|
||||
- logging.info(f'Report {AlarmReport.TASK_NAME} FAIL: {info}')
|
||||
+ report_result(Report.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info}))
|
||||
+ logging.info(f'Report {Report.TASK_NAME} FAIL: {info}')
|
||||
|
||||
@staticmethod
|
||||
def report_skip(info: str):
|
||||
- report_result(AlarmReport.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info}))
|
||||
- logging.info(f'Report {AlarmReport.TASK_NAME} SKIP: {info}')
|
||||
+ report_result(Report.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info}))
|
||||
+ logging.info(f'Report {Report.TASK_NAME} SKIP: {info}')
|
||||
+
|
||||
+
|
||||
+class Xalarm:
|
||||
+ ALARM_ID = 1002
|
||||
|
||||
@staticmethod
|
||||
- def report_minor_alm(info: str):
|
||||
- report_result(AlarmReport.TASK_NAME, ResultLevel.MINOR_ALM, json.dumps({"msg": info}))
|
||||
- logging.info(f'Report {AlarmReport.TASK_NAME} MINOR_ALM: {info}')
|
||||
+ def minor(info: dict):
|
||||
+ info_str = json.dumps(info)
|
||||
+ xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_OCCUR, info_str)
|
||||
+ logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM: {info_str}")
|
||||
|
||||
@staticmethod
|
||||
- def report_major_alm(info: str):
|
||||
- report_result(AlarmReport.TASK_NAME, ResultLevel.MAJOR_ALM, json.dumps({"msg": info}))
|
||||
- logging.info(f'Report {AlarmReport.TASK_NAME} MAJOR_ALM: {info}')
|
||||
+ def major(info: dict):
|
||||
+ info_str = json.dumps(info)
|
||||
+ xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_OCCUR, info_str)
|
||||
+ logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM: {info_str}")
|
||||
|
||||
@staticmethod
|
||||
- def report_critical_alm(info: str):
|
||||
- report_result(AlarmReport.TASK_NAME, ResultLevel.CRITICAL_ALM, json.dumps({"msg": info}))
|
||||
- logging.info(f'Report {AlarmReport.TASK_NAME} CRITICAL_ALM: {info}')
|
||||
+ def critical(info: dict):
|
||||
+ info_str = json.dumps(info)
|
||||
+ xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_OCCUR, info_str)
|
||||
+ logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM: {info_str}")
|
||||
+
|
||||
+ def minor_recover(info: dict):
|
||||
+ info_str = json.dumps(info)
|
||||
+ xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_RECOVER, info_str)
|
||||
+ logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM Recover: {info_str}")
|
||||
+
|
||||
+ def major_recover(info: dict):
|
||||
+ info_str = json.dumps(info)
|
||||
+ xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_RECOVER, info_str)
|
||||
+ logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM Recover: {info_str}")
|
||||
|
||||
+ def critical_recover(info: dict):
|
||||
+ info_str = json.dumps(info)
|
||||
+ xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_RECOVER, info_str)
|
||||
+ logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM Recover: {info_str}")
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py
|
||||
index 01c5315..c7679cd 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/data_access.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/data_access.py
|
||||
@@ -42,10 +42,11 @@ def check_collect_valid(period):
|
||||
data = json.loads(data_raw["message"])
|
||||
except Exception as e:
|
||||
logging.warning(f"get io data failed, {e}")
|
||||
- return []
|
||||
+ return None
|
||||
return [k for k in data.keys()]
|
||||
else:
|
||||
- return []
|
||||
+ logging.warning(f"get io data failed, return {data_raw}")
|
||||
+ return None
|
||||
|
||||
|
||||
def _get_raw_data(period, disk_list):
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
index a48144f..0ed282b 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
@@ -35,7 +35,7 @@ class Detector:
|
||||
self._count += 1
|
||||
if self._count % 15 == 0:
|
||||
self._count = 0
|
||||
- logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.")
|
||||
+ logging.debug(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.")
|
||||
logging.debug(f'enter Detector: {self}')
|
||||
metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name)
|
||||
if metric_value is None:
|
||||
--
|
||||
2.23.0
|
||||
|
||||
@ -1,235 +0,0 @@
|
||||
From 1e13bc31ae3aa94f36aa124eefdfc8773221eacd Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
|
||||
Date: Mon, 14 Oct 2024 23:16:46 +0800
|
||||
Subject: [PATCH] ai_block_io fix some bugs
|
||||
|
||||
---
|
||||
.../sentryPlugins/ai_block_io/ai_block_io.py | 1 +
|
||||
.../ai_block_io/config_parser.py | 20 ++++++++++---------
|
||||
.../sentryPlugins/ai_block_io/detector.py | 18 ++++++++++++-----
|
||||
.../sentryPlugins/ai_block_io/io_data.py | 2 +-
|
||||
.../sentryPlugins/ai_block_io/threshold.py | 17 +++++++++-------
|
||||
5 files changed, 36 insertions(+), 22 deletions(-)
|
||||
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
index dd661a1..4eecd43 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
@@ -55,6 +55,7 @@ class SlowIODetection:
|
||||
Report.report_pass(
|
||||
"get available disk error, please check if the collector plug is enable. exiting..."
|
||||
)
|
||||
+ logging.critical("get available disk error, please check if the collector plug is enable. exiting...")
|
||||
exit(1)
|
||||
|
||||
logging.info(f"ai_block_io plug has found disks: {self._disk_list}")
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
index 3388cd4..7b0cd29 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
@@ -190,7 +190,7 @@ class ConfigParser:
|
||||
self._conf["common"]["disk"] = disk_list
|
||||
|
||||
def _read_train_data_duration(self, items_algorithm: dict):
|
||||
- self._conf["common"]["train_data_duration"] = self._get_config_value(
|
||||
+ self._conf["algorithm"]["train_data_duration"] = self._get_config_value(
|
||||
items_algorithm,
|
||||
"train_data_duration",
|
||||
float,
|
||||
@@ -203,17 +203,17 @@ class ConfigParser:
|
||||
default_train_update_duration = self.DEFAULT_CONF["algorithm"][
|
||||
"train_update_duration"
|
||||
]
|
||||
- if default_train_update_duration > self._conf["common"]["train_data_duration"]:
|
||||
+ if default_train_update_duration > self._conf["algorithm"]["train_data_duration"]:
|
||||
default_train_update_duration = (
|
||||
- self._conf["common"]["train_data_duration"] / 2
|
||||
+ self._conf["algorithm"]["train_data_duration"] / 2
|
||||
)
|
||||
- self._conf["common"]["train_update_duration"] = self._get_config_value(
|
||||
+ self._conf["algorithm"]["train_update_duration"] = self._get_config_value(
|
||||
items_algorithm,
|
||||
"train_update_duration",
|
||||
float,
|
||||
default_train_update_duration,
|
||||
gt=0,
|
||||
- le=self._conf["common"]["train_data_duration"],
|
||||
+ le=self._conf["algorithm"]["train_data_duration"],
|
||||
)
|
||||
|
||||
def _read_algorithm_type_and_parameter(self, items_algorithm: dict):
|
||||
@@ -401,6 +401,8 @@ class ConfigParser:
|
||||
self._read_stage(items_common)
|
||||
self._read_iotype(items_common)
|
||||
else:
|
||||
+ self._conf["common"]["stage"] = ALL_STAGE_LIST
|
||||
+ self._conf["common"]["iotype"] = ALL_IOTPYE_LIST
|
||||
logging.warning(
|
||||
"common section parameter not found, it will be set to default value."
|
||||
)
|
||||
@@ -511,8 +513,8 @@ class ConfigParser:
|
||||
|
||||
def get_train_data_duration_and_train_update_duration(self):
|
||||
return (
|
||||
- self._conf["common"]["train_data_duration"],
|
||||
- self._conf["common"]["train_update_duration"],
|
||||
+ self._conf["algorithm"]["train_data_duration"],
|
||||
+ self._conf["algorithm"]["train_update_duration"],
|
||||
)
|
||||
|
||||
def get_window_size_and_window_minimum_threshold(self):
|
||||
@@ -535,11 +537,11 @@ class ConfigParser:
|
||||
|
||||
@property
|
||||
def train_data_duration(self):
|
||||
- return self._conf["common"]["train_data_duration"]
|
||||
+ return self._conf["algorithm"]["train_data_duration"]
|
||||
|
||||
@property
|
||||
def train_update_duration(self):
|
||||
- return self._conf["common"]["train_update_duration"]
|
||||
+ return self._conf["algorithm"]["train_update_duration"]
|
||||
|
||||
@property
|
||||
def window_size(self):
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
index 87bd1dd..5b21714 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
@@ -9,6 +9,7 @@
|
||||
# PURPOSE.
|
||||
# See the Mulan PSL v2 for more details.
|
||||
import logging
|
||||
+from datetime import datetime
|
||||
|
||||
from .io_data import MetricName
|
||||
from .threshold import Threshold
|
||||
@@ -21,18 +22,25 @@ class Detector:
|
||||
def __init__(self, metric_name: MetricName, threshold: Threshold, sliding_window: SlidingWindow):
|
||||
self._metric_name = metric_name
|
||||
self._threshold = threshold
|
||||
+ # for when threshold update, it can print latest threshold with metric name
|
||||
+ self._threshold.set_metric_name(self._metric_name)
|
||||
self._slidingWindow = sliding_window
|
||||
self._threshold.attach_observer(self._slidingWindow)
|
||||
- self._count = 0
|
||||
+ self._count = None
|
||||
|
||||
def get_metric_name(self):
|
||||
return self._metric_name
|
||||
|
||||
def is_slow_io_event(self, io_data_dict_with_disk_name: dict):
|
||||
- self._count += 1
|
||||
- if self._count % 15 == 0:
|
||||
- self._count = 0
|
||||
- logging.debug(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.")
|
||||
+ if self._count is None:
|
||||
+ self._count = datetime.now()
|
||||
+ else:
|
||||
+ now_time = datetime.now()
|
||||
+ time_diff = (now_time - self._count).total_seconds()
|
||||
+ if time_diff >= 60:
|
||||
+ logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.")
|
||||
+ self._count = None
|
||||
+
|
||||
logging.debug(f'enter Detector: {self}')
|
||||
metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name)
|
||||
if metric_value is None:
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/io_data.py b/src/python/sentryPlugins/ai_block_io/io_data.py
|
||||
index d341b55..6042911 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/io_data.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/io_data.py
|
||||
@@ -48,7 +48,7 @@ class IOData:
|
||||
@dataclass(frozen=True)
|
||||
class MetricName:
|
||||
disk_name: str
|
||||
- disk_type: str
|
||||
+ disk_type: int
|
||||
stage_name: str
|
||||
io_access_type_name: str
|
||||
metric_name: str
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/threshold.py b/src/python/sentryPlugins/ai_block_io/threshold.py
|
||||
index 3b7a5a8..600d041 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/threshold.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/threshold.py
|
||||
@@ -23,11 +23,6 @@ class ThresholdState(Enum):
|
||||
|
||||
|
||||
class Threshold:
|
||||
- threshold = None
|
||||
- data_queue: queue.Queue = None
|
||||
- data_queue_update_size: int = None
|
||||
- new_data_size: int = None
|
||||
- threshold_state: ThresholdState = None
|
||||
|
||||
def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000):
|
||||
self._observer = None
|
||||
@@ -36,12 +31,16 @@ class Threshold:
|
||||
self.new_data_size = 0
|
||||
self.threshold_state = ThresholdState.INIT
|
||||
self.threshold = math.inf
|
||||
+ self.metric_name = None
|
||||
|
||||
def set_threshold(self, threshold):
|
||||
self.threshold = threshold
|
||||
self.threshold_state = ThresholdState.START
|
||||
self.notify_observer()
|
||||
|
||||
+ def set_metric_name(self, metric_name):
|
||||
+ self.metric_name = metric_name
|
||||
+
|
||||
def get_threshold(self):
|
||||
if self.threshold_state == ThresholdState.INIT:
|
||||
return None
|
||||
@@ -84,6 +83,7 @@ class BoxplotThreshold(Threshold):
|
||||
self.parameter = boxplot_parameter
|
||||
|
||||
def _update_threshold(self):
|
||||
+ old_threshold = self.threshold
|
||||
data = list(self.data_queue.queue)
|
||||
q1 = np.percentile(data, 25)
|
||||
q3 = np.percentile(data, 75)
|
||||
@@ -91,6 +91,7 @@ class BoxplotThreshold(Threshold):
|
||||
self.threshold = q3 + self.parameter * iqr
|
||||
if self.threshold_state == ThresholdState.INIT:
|
||||
self.threshold_state = ThresholdState.START
|
||||
+ logging.info(f"MetricName: [{self.metric_name}]'s threshold update, old is: {old_threshold} -> new is: {self.threshold}")
|
||||
self.notify_observer()
|
||||
|
||||
def push_latest_data_to_queue(self, data):
|
||||
@@ -109,7 +110,7 @@ class BoxplotThreshold(Threshold):
|
||||
self.new_data_size = 0
|
||||
|
||||
def __repr__(self):
|
||||
- return f"[BoxplotThreshold, param is: {self.parameter}]"
|
||||
+ return f"[BoxplotThreshold, param is: {self.parameter}, train_size: {self.data_queue.maxsize}, update_size: {self.data_queue_update_size}]"
|
||||
|
||||
|
||||
class NSigmaThreshold(Threshold):
|
||||
@@ -118,12 +119,14 @@ class NSigmaThreshold(Threshold):
|
||||
self.parameter = n_sigma_parameter
|
||||
|
||||
def _update_threshold(self):
|
||||
+ old_threshold = self.threshold
|
||||
data = list(self.data_queue.queue)
|
||||
mean = np.mean(data)
|
||||
std = np.std(data)
|
||||
self.threshold = mean + self.parameter * std
|
||||
if self.threshold_state == ThresholdState.INIT:
|
||||
self.threshold_state = ThresholdState.START
|
||||
+ logging.info(f"MetricName: [{self.metric_name}]'s threshold update, old is: {old_threshold} -> new is: {self.threshold}")
|
||||
self.notify_observer()
|
||||
|
||||
def push_latest_data_to_queue(self, data):
|
||||
@@ -142,7 +145,7 @@ class NSigmaThreshold(Threshold):
|
||||
self.new_data_size = 0
|
||||
|
||||
def __repr__(self):
|
||||
- return f"[NSigmaThreshold, param is: {self.parameter}]"
|
||||
+ return f"[NSigmaThreshold, param is: {self.parameter}, train_size: {self.data_queue.maxsize}, update_size: {self.data_queue_update_size}]"
|
||||
|
||||
|
||||
class ThresholdType(Enum):
|
||||
--
|
||||
2.23.0
|
||||
|
||||
@ -1,626 +0,0 @@
|
||||
From f3a0738061e852c8125513f6222b4a5d6ea73270 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
|
||||
Date: Fri, 25 Oct 2024 15:34:25 +0800
|
||||
Subject: [PATCH] ai_block_io fix some config parameters parse bug
|
||||
|
||||
---
|
||||
.../sentryPlugins/ai_block_io/ai_block_io.py | 70 +++++----
|
||||
.../ai_block_io/config_parser.py | 135 ++++++++++++++----
|
||||
.../sentryPlugins/ai_block_io/data_access.py | 14 ++
|
||||
.../sentryPlugins/ai_block_io/detector.py | 16 ++-
|
||||
.../ai_block_io/sliding_window.py | 2 +-
|
||||
.../sentryPlugins/ai_block_io/threshold.py | 14 +-
|
||||
src/python/sentryPlugins/ai_block_io/utils.py | 2 -
|
||||
7 files changed, 180 insertions(+), 73 deletions(-)
|
||||
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
index 74f246a..14f740d 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
@@ -23,6 +23,7 @@ from .data_access import (
|
||||
get_io_data_from_collect_plug,
|
||||
check_collect_valid,
|
||||
get_disk_type,
|
||||
+ check_disk_is_available
|
||||
)
|
||||
from .io_data import MetricName
|
||||
from .alarm_report import Xalarm, Report
|
||||
@@ -31,14 +32,14 @@ CONFIG_FILE = "/etc/sysSentry/plugins/ai_block_io.ini"
|
||||
|
||||
|
||||
def sig_handler(signum, frame):
|
||||
- logging.info("receive signal: %d", signum)
|
||||
Report.report_pass(f"receive signal: {signum}, exiting...")
|
||||
+ logging.info("Finished ai_block_io plugin running.")
|
||||
exit(signum)
|
||||
|
||||
|
||||
class SlowIODetection:
|
||||
_config_parser = None
|
||||
- _disk_list = None
|
||||
+ _disk_list = []
|
||||
_detector_name_list = defaultdict(list)
|
||||
_disk_detectors = {}
|
||||
|
||||
@@ -48,32 +49,30 @@ class SlowIODetection:
|
||||
self.__init_detector()
|
||||
|
||||
def __init_detector_name_list(self):
|
||||
- self._disk_list = check_collect_valid(
|
||||
- self._config_parser.period_time
|
||||
- )
|
||||
- if self._disk_list is None:
|
||||
- Report.report_pass(
|
||||
- "get available disk error, please check if the collector plug is enable. exiting..."
|
||||
- )
|
||||
- logging.critical("get available disk error, please check if the collector plug is enable. exiting...")
|
||||
- exit(1)
|
||||
-
|
||||
- logging.info(f"ai_block_io plug has found disks: {self._disk_list}")
|
||||
disks: list = self._config_parser.disks_to_detection
|
||||
stages: list = self._config_parser.stage
|
||||
iotypes: list = self._config_parser.iotype
|
||||
- # 情况1:None,则启用所有磁盘检测
|
||||
- # 情况2:is not None and len = 0,则不启动任何磁盘检测
|
||||
- # 情况3:len != 0,则取交集
|
||||
+
|
||||
if disks is None:
|
||||
- logging.warning(
|
||||
- "you not specify any disk or use default, so ai_block_io will enable all available disk."
|
||||
- )
|
||||
- for disk in self._disk_list:
|
||||
- if disks is not None:
|
||||
- if disk not in disks:
|
||||
- continue
|
||||
- disks.remove(disk)
|
||||
+ logging.warning("you not specify any disk or use default, so ai_block_io will enable all available disk.")
|
||||
+ all_available_disk_list = check_collect_valid(self._config_parser.period_time)
|
||||
+ if all_available_disk_list is None:
|
||||
+ Report.report_pass("get available disk error, please check if the collector plug is enable. exiting...")
|
||||
+ logging.critical("get available disk error, please check if the collector plug is enable. exiting...")
|
||||
+ exit(1)
|
||||
+ if len(all_available_disk_list) == 0:
|
||||
+ Report.report_pass("not found available disk. exiting...")
|
||||
+ logging.critical("not found available disk. exiting...")
|
||||
+ exit(1)
|
||||
+ disks = all_available_disk_list
|
||||
+ logging.info(f"available disk list is follow: {disks}.")
|
||||
+
|
||||
+ for disk in disks:
|
||||
+ tmp_disk = [disk]
|
||||
+ ret = check_disk_is_available(self._config_parser.period_time, tmp_disk)
|
||||
+ if not ret:
|
||||
+ logging.warning(f"disk: {disk} is not available, it will be ignored.")
|
||||
+ continue
|
||||
|
||||
disk_type_result = get_disk_type(disk)
|
||||
if disk_type_result["ret"] == 0 and disk_type_result["message"] in (
|
||||
@@ -89,20 +88,15 @@ class SlowIODetection:
|
||||
disk_type_result,
|
||||
)
|
||||
continue
|
||||
+ self._disk_list.append(disk)
|
||||
for stage in stages:
|
||||
for iotype in iotypes:
|
||||
self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "latency"))
|
||||
self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "io_dump"))
|
||||
- if disks:
|
||||
- logging.warning(
|
||||
- "disks: %s not in available disk list, so they will be ignored.",
|
||||
- disks,
|
||||
- )
|
||||
+
|
||||
if not self._detector_name_list:
|
||||
+ Report.report_pass("the disks to detection is empty, ai_block_io will exit.")
|
||||
logging.critical("the disks to detection is empty, ai_block_io will exit.")
|
||||
- Report.report_pass(
|
||||
- "the disks to detection is empty, ai_block_io will exit."
|
||||
- )
|
||||
exit(1)
|
||||
|
||||
def __init_detector(self):
|
||||
@@ -202,16 +196,20 @@ class SlowIODetection:
|
||||
logging.debug("step3. Report slow io event to sysSentry.")
|
||||
for slow_io_event in slow_io_event_list:
|
||||
alarm_content = {
|
||||
+ "alarm_source": "ai_block_io",
|
||||
"driver_name": slow_io_event[1],
|
||||
+ "io_type": slow_io_event[4],
|
||||
"reason": slow_io_event[2],
|
||||
"block_stack": slow_io_event[3],
|
||||
- "io_type": slow_io_event[4],
|
||||
- "alarm_source": "ai_block_io",
|
||||
"alarm_type": slow_io_event[5],
|
||||
- "details": slow_io_event[6],
|
||||
+ "details": slow_io_event[6]
|
||||
}
|
||||
Xalarm.major(alarm_content)
|
||||
- logging.warning("[SLOW IO] " + str(alarm_content))
|
||||
+ tmp_alarm_content = alarm_content.copy()
|
||||
+ del tmp_alarm_content["details"]
|
||||
+ logging.warning("[SLOW IO] " + str(tmp_alarm_content))
|
||||
+ logging.warning(f"latency: " + str(alarm_content.get("details").get("latency")))
|
||||
+ logging.warning(f"iodump: " + str(alarm_content.get("details").get("iodump")))
|
||||
|
||||
# Step4:等待检测时间
|
||||
logging.debug("step4. Wait to start next slow io event detection loop.")
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
index 91ec5c6..3049db2 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
@@ -105,21 +105,26 @@ class ConfigParser:
|
||||
ge=None,
|
||||
lt=None,
|
||||
le=None,
|
||||
+ section=None
|
||||
):
|
||||
+ if section is not None:
|
||||
+ print_key = section + "." + key
|
||||
+ else:
|
||||
+ print_key = key
|
||||
value = config_items.get(key)
|
||||
if value is None:
|
||||
logging.warning(
|
||||
"config of %s not found, the default value %s will be used.",
|
||||
- key,
|
||||
+ print_key,
|
||||
default_value,
|
||||
)
|
||||
value = default_value
|
||||
if not value:
|
||||
logging.critical(
|
||||
- "the value of %s is empty, ai_block_io plug will exit.", key
|
||||
+ "the value of %s is empty, ai_block_io plug will exit.", print_key
|
||||
)
|
||||
Report.report_pass(
|
||||
- f"the value of {key} is empty, ai_block_io plug will exit."
|
||||
+ f"the value of {print_key} is empty, ai_block_io plug will exit."
|
||||
)
|
||||
exit(1)
|
||||
try:
|
||||
@@ -127,51 +132,51 @@ class ConfigParser:
|
||||
except ValueError:
|
||||
logging.critical(
|
||||
"the value of %s is not a valid %s, ai_block_io plug will exit.",
|
||||
- key,
|
||||
+ print_key,
|
||||
value_type,
|
||||
)
|
||||
Report.report_pass(
|
||||
- f"the value of {key} is not a valid {value_type}, ai_block_io plug will exit."
|
||||
+ f"the value of {print_key} is not a valid {value_type}, ai_block_io plug will exit."
|
||||
)
|
||||
exit(1)
|
||||
if gt is not None and value <= gt:
|
||||
logging.critical(
|
||||
"the value of %s is not greater than %s, ai_block_io plug will exit.",
|
||||
- key,
|
||||
+ print_key,
|
||||
gt,
|
||||
)
|
||||
Report.report_pass(
|
||||
- f"the value of {key} is not greater than {gt}, ai_block_io plug will exit."
|
||||
+ f"the value of {print_key} is not greater than {gt}, ai_block_io plug will exit."
|
||||
)
|
||||
exit(1)
|
||||
if ge is not None and value < ge:
|
||||
logging.critical(
|
||||
"the value of %s is not greater than or equal to %s, ai_block_io plug will exit.",
|
||||
- key,
|
||||
+ print_key,
|
||||
ge,
|
||||
)
|
||||
Report.report_pass(
|
||||
- f"the value of {key} is not greater than or equal to {ge}, ai_block_io plug will exit."
|
||||
+ f"the value of {print_key} is not greater than or equal to {ge}, ai_block_io plug will exit."
|
||||
)
|
||||
exit(1)
|
||||
if lt is not None and value >= lt:
|
||||
logging.critical(
|
||||
"the value of %s is not less than %s, ai_block_io plug will exit.",
|
||||
- key,
|
||||
+ print_key,
|
||||
lt,
|
||||
)
|
||||
Report.report_pass(
|
||||
- f"the value of {key} is not less than {lt}, ai_block_io plug will exit."
|
||||
+ f"the value of {print_key} is not less than {lt}, ai_block_io plug will exit."
|
||||
)
|
||||
exit(1)
|
||||
if le is not None and value > le:
|
||||
logging.critical(
|
||||
"the value of %s is not less than or equal to %s, ai_block_io plug will exit.",
|
||||
- key,
|
||||
+ print_key,
|
||||
le,
|
||||
)
|
||||
Report.report_pass(
|
||||
- f"the value of {key} is not less than or equal to {le}, ai_block_io plug will exit."
|
||||
+ f"the value of {print_key} is not less than or equal to {le}, ai_block_io plug will exit."
|
||||
)
|
||||
exit(1)
|
||||
|
||||
@@ -188,7 +193,7 @@ class ConfigParser:
|
||||
frequency = self._conf["common"]["period_time"]
|
||||
ret = check_detect_frequency_is_valid(frequency)
|
||||
if ret is None:
|
||||
- log = f"period_time: {frequency} is valid, "\
|
||||
+ log = f"period_time: {frequency} is invalid, "\
|
||||
f"Check whether the value range is too large or is not an "\
|
||||
f"integer multiple of period_time.. exiting..."
|
||||
Report.report_pass(log)
|
||||
@@ -202,6 +207,7 @@ class ConfigParser:
|
||||
self._conf["common"]["disk"] = None
|
||||
return
|
||||
disks_to_detection = disks_to_detection.strip()
|
||||
+ disks_to_detection = disks_to_detection.lower()
|
||||
if not disks_to_detection:
|
||||
logging.critical("the value of disk is empty, ai_block_io plug will exit.")
|
||||
Report.report_pass(
|
||||
@@ -213,7 +219,18 @@ class ConfigParser:
|
||||
if len(disk_list) == 1 and disk_list[0] == "default":
|
||||
self._conf["common"]["disk"] = None
|
||||
return
|
||||
- self._conf["common"]["disk"] = disk_list
|
||||
+ if len(disk_list) > 10:
|
||||
+ ten_disk_list = disk_list[0:10]
|
||||
+ other_disk_list = disk_list[10:]
|
||||
+ logging.warning(f"disk only support maximum is 10, disks: {ten_disk_list} will be retained, other: {other_disk_list} will be ignored.")
|
||||
+ else:
|
||||
+ ten_disk_list = disk_list
|
||||
+ set_ten_disk_list = set(ten_disk_list)
|
||||
+ if len(ten_disk_list) > len(set_ten_disk_list):
|
||||
+ tmp = ten_disk_list
|
||||
+ ten_disk_list = list(set_ten_disk_list)
|
||||
+ logging.warning(f"disk exist duplicate, it will be deduplicate, before: {tmp}, after: {ten_disk_list}")
|
||||
+ self._conf["common"]["disk"] = ten_disk_list
|
||||
|
||||
def _read_train_data_duration(self, items_algorithm: dict):
|
||||
self._conf["algorithm"]["train_data_duration"] = self._get_config_value(
|
||||
@@ -244,10 +261,12 @@ class ConfigParser:
|
||||
|
||||
def _read_algorithm_type_and_parameter(self, items_algorithm: dict):
|
||||
algorithm_type = items_algorithm.get("algorithm_type")
|
||||
- if algorithm_type is not None:
|
||||
- self._conf["algorithm"]["algorithm_type"] = get_threshold_type_enum(
|
||||
- algorithm_type
|
||||
- )
|
||||
+ if algorithm_type is None:
|
||||
+ default_algorithm_type = self._conf["algorithm"]["algorithm_type"]
|
||||
+ logging.warning(f"algorithm_type not found, it will be set default: {default_algorithm_type}")
|
||||
+ else:
|
||||
+ self._conf["algorithm"]["algorithm_type"] = get_threshold_type_enum(algorithm_type)
|
||||
+
|
||||
if self._conf["algorithm"]["algorithm_type"] is None:
|
||||
logging.critical(
|
||||
"the algorithm_type: %s you set is invalid. ai_block_io plug will exit.",
|
||||
@@ -257,6 +276,7 @@ class ConfigParser:
|
||||
f"the algorithm_type: {algorithm_type} you set is invalid. ai_block_io plug will exit."
|
||||
)
|
||||
exit(1)
|
||||
+
|
||||
elif self._conf["algorithm"]["algorithm_type"] == ThresholdType.NSigmaThreshold:
|
||||
self._conf["algorithm"]["n_sigma_parameter"] = self._get_config_value(
|
||||
items_algorithm,
|
||||
@@ -279,9 +299,14 @@ class ConfigParser:
|
||||
)
|
||||
|
||||
def _read_stage(self, items_algorithm: dict):
|
||||
- stage_str = items_algorithm.get(
|
||||
- "stage", self.DEFAULT_CONF["common"]["stage"]
|
||||
- ).strip()
|
||||
+ stage_str = items_algorithm.get("stage")
|
||||
+ if stage_str is None:
|
||||
+ stage_str = self.DEFAULT_CONF["common"]["stage"]
|
||||
+ logging.warning(f"stage not found, it will be set default: {stage_str}")
|
||||
+ else:
|
||||
+ stage_str = stage_str.strip()
|
||||
+
|
||||
+ stage_str = stage_str.lower()
|
||||
stage_list = stage_str.split(",")
|
||||
stage_list = [stage.strip() for stage in stage_list]
|
||||
if len(stage_list) == 1 and stage_list[0] == "":
|
||||
@@ -307,9 +332,14 @@ class ConfigParser:
|
||||
self._conf["common"]["stage"] = dup_stage_list
|
||||
|
||||
def _read_iotype(self, items_algorithm: dict):
|
||||
- iotype_str = items_algorithm.get(
|
||||
- "iotype", self.DEFAULT_CONF["common"]["iotype"]
|
||||
- ).strip()
|
||||
+ iotype_str = items_algorithm.get("iotype")
|
||||
+ if iotype_str is None:
|
||||
+ iotype_str = self.DEFAULT_CONF["common"]["iotype"]
|
||||
+ logging.warning(f"iotype not found, it will be set default: {iotype_str}")
|
||||
+ else:
|
||||
+ iotype_str = iotype_str.strip()
|
||||
+
|
||||
+ iotype_str = iotype_str.lower()
|
||||
iotype_list = iotype_str.split(",")
|
||||
iotype_list = [iotype.strip() for iotype in iotype_list]
|
||||
if len(iotype_list) == 1 and iotype_list[0] == "":
|
||||
@@ -333,6 +363,13 @@ class ConfigParser:
|
||||
|
||||
def _read_sliding_window_type(self, items_sliding_window: dict):
|
||||
sliding_window_type = items_sliding_window.get("win_type")
|
||||
+
|
||||
+ if sliding_window_type is None:
|
||||
+ default_sliding_window_type = self._conf["algorithm"]["win_type"]
|
||||
+ logging.warning(f"win_type not found, it will be set default: {default_sliding_window_type}")
|
||||
+ return
|
||||
+
|
||||
+ sliding_window_type = sliding_window_type.strip()
|
||||
if sliding_window_type is not None:
|
||||
self._conf["algorithm"]["win_type"] = (
|
||||
get_sliding_window_type_enum(sliding_window_type)
|
||||
@@ -439,6 +476,7 @@ class ConfigParser:
|
||||
int,
|
||||
self.DEFAULT_CONF["latency_sata_ssd"]["read_tot_lim"],
|
||||
gt=0,
|
||||
+ section="latency_sata_ssd"
|
||||
)
|
||||
self._conf["latency_sata_ssd"]["write_tot_lim"] = self._get_config_value(
|
||||
items_latency_sata_ssd,
|
||||
@@ -446,21 +484,32 @@ class ConfigParser:
|
||||
int,
|
||||
self.DEFAULT_CONF["latency_sata_ssd"]["write_tot_lim"],
|
||||
gt=0,
|
||||
+ section="latency_sata_ssd"
|
||||
)
|
||||
self._conf["latency_sata_ssd"]["read_avg_lim"] = self._get_config_value(
|
||||
items_latency_sata_ssd,
|
||||
"read_avg_lim",
|
||||
int,
|
||||
self.DEFAULT_CONF["latency_sata_ssd"]["read_avg_lim"],
|
||||
- gt=0
|
||||
+ gt=0,
|
||||
+ section="latency_sata_ssd"
|
||||
)
|
||||
self._conf["latency_sata_ssd"]["write_avg_lim"] = self._get_config_value(
|
||||
items_latency_sata_ssd,
|
||||
"write_avg_lim",
|
||||
int,
|
||||
self.DEFAULT_CONF["latency_sata_ssd"]["write_avg_lim"],
|
||||
- gt=0
|
||||
+ gt=0,
|
||||
+ section="latency_sata_ssd"
|
||||
)
|
||||
+ if self._conf["latency_sata_ssd"]["read_avg_lim"] >= self._conf["latency_sata_ssd"]["read_tot_lim"]:
|
||||
+ Report.report_pass("latency_sata_ssd.read_avg_lim must < latency_sata_ssd.read_tot_lim . exiting...")
|
||||
+ logging.critical("latency_sata_ssd.read_avg_lim must < latency_sata_ssd.read_tot_lim . exiting...")
|
||||
+ exit(1)
|
||||
+ if self._conf["latency_sata_ssd"]["write_avg_lim"] >= self._conf["latency_sata_ssd"]["write_tot_lim"]:
|
||||
+ Report.report_pass("latency_sata_ssd.write_avg_lim must < latency_sata_ssd.write_tot_lim . exiting...")
|
||||
+ logging.critical("latency_sata_ssd.read_avg_lim must < latency_sata_ssd.read_tot_lim . exiting...")
|
||||
+ exit(1)
|
||||
else:
|
||||
Report.report_pass("not found latency_sata_ssd section. exiting...")
|
||||
logging.critical("not found latency_sata_ssd section. exiting...")
|
||||
@@ -474,6 +523,7 @@ class ConfigParser:
|
||||
int,
|
||||
self.DEFAULT_CONF["latency_nvme_ssd"]["read_tot_lim"],
|
||||
gt=0,
|
||||
+ section="latency_nvme_ssd"
|
||||
)
|
||||
self._conf["latency_nvme_ssd"]["write_tot_lim"] = self._get_config_value(
|
||||
items_latency_nvme_ssd,
|
||||
@@ -481,21 +531,32 @@ class ConfigParser:
|
||||
int,
|
||||
self.DEFAULT_CONF["latency_nvme_ssd"]["write_tot_lim"],
|
||||
gt=0,
|
||||
+ section="latency_nvme_ssd"
|
||||
)
|
||||
self._conf["latency_nvme_ssd"]["read_avg_lim"] = self._get_config_value(
|
||||
items_latency_nvme_ssd,
|
||||
"read_avg_lim",
|
||||
int,
|
||||
self.DEFAULT_CONF["latency_nvme_ssd"]["read_avg_lim"],
|
||||
- gt=0
|
||||
+ gt=0,
|
||||
+ section="latency_nvme_ssd"
|
||||
)
|
||||
self._conf["latency_nvme_ssd"]["write_avg_lim"] = self._get_config_value(
|
||||
items_latency_nvme_ssd,
|
||||
"write_avg_lim",
|
||||
int,
|
||||
self.DEFAULT_CONF["latency_nvme_ssd"]["write_avg_lim"],
|
||||
- gt=0
|
||||
+ gt=0,
|
||||
+ section="latency_nvme_ssd"
|
||||
)
|
||||
+ if self._conf["latency_nvme_ssd"]["read_avg_lim"] >= self._conf["latency_nvme_ssd"]["read_tot_lim"]:
|
||||
+ Report.report_pass("latency_nvme_ssd.read_avg_lim must < latency_nvme_ssd.read_tot_lim . exiting...")
|
||||
+ logging.critical("latency_nvme_ssd.read_avg_lim must < latency_nvme_ssd.read_tot_lim . exiting...")
|
||||
+ exit(1)
|
||||
+ if self._conf["latency_nvme_ssd"]["write_avg_lim"] >= self._conf["latency_nvme_ssd"]["write_tot_lim"]:
|
||||
+ Report.report_pass("latency_nvme_ssd.write_avg_lim must < latency_nvme_ssd.write_tot_lim . exiting...")
|
||||
+ logging.critical("latency_nvme_ssd.write_avg_lim must < latency_nvme_ssd.write_tot_lim . exiting...")
|
||||
+ exit(1)
|
||||
else:
|
||||
Report.report_pass("not found latency_nvme_ssd section. exiting...")
|
||||
logging.critical("not found latency_nvme_ssd section. exiting...")
|
||||
@@ -509,6 +570,7 @@ class ConfigParser:
|
||||
int,
|
||||
self.DEFAULT_CONF["latency_sata_hdd"]["read_tot_lim"],
|
||||
gt=0,
|
||||
+ section="latency_sata_hdd"
|
||||
)
|
||||
self._conf["latency_sata_hdd"]["write_tot_lim"] = self._get_config_value(
|
||||
items_latency_sata_hdd,
|
||||
@@ -516,21 +578,32 @@ class ConfigParser:
|
||||
int,
|
||||
self.DEFAULT_CONF["latency_sata_hdd"]["write_tot_lim"],
|
||||
gt=0,
|
||||
+ section="latency_sata_hdd"
|
||||
)
|
||||
self._conf["latency_sata_hdd"]["read_avg_lim"] = self._get_config_value(
|
||||
items_latency_sata_hdd,
|
||||
"read_avg_lim",
|
||||
int,
|
||||
self.DEFAULT_CONF["latency_sata_hdd"]["read_avg_lim"],
|
||||
- gt=0
|
||||
+ gt=0,
|
||||
+ section="latency_sata_hdd"
|
||||
)
|
||||
self._conf["latency_sata_hdd"]["write_avg_lim"] = self._get_config_value(
|
||||
items_latency_sata_hdd,
|
||||
"write_avg_lim",
|
||||
int,
|
||||
self.DEFAULT_CONF["latency_sata_hdd"]["write_avg_lim"],
|
||||
- gt=0
|
||||
+ gt=0,
|
||||
+ section="latency_sata_hdd"
|
||||
)
|
||||
+ if self._conf["latency_sata_hdd"]["read_avg_lim"] >= self._conf["latency_sata_hdd"]["read_tot_lim"]:
|
||||
+ Report.report_pass("latency_sata_hdd.read_avg_lim must < latency_sata_hdd.read_tot_lim . exiting...")
|
||||
+ logging.critical("latency_sata_hdd.read_avg_lim must < latency_sata_hdd.read_tot_lim . exiting...")
|
||||
+ exit(1)
|
||||
+ if self._conf["latency_sata_hdd"]["write_avg_lim"] >= self._conf["latency_sata_hdd"]["write_tot_lim"]:
|
||||
+ Report.report_pass("latency_sata_hdd.write_avg_lim must < latency_sata_hdd.write_tot_lim . exiting...")
|
||||
+ logging.critical("latency_sata_hdd.write_avg_lim must < latency_sata_hdd.write_tot_lim . exiting...")
|
||||
+ exit(1)
|
||||
else:
|
||||
Report.report_pass("not found latency_sata_hdd section. exiting...")
|
||||
logging.critical("not found latency_sata_hdd section. exiting...")
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py
|
||||
index e4869d5..2f2d607 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/data_access.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/data_access.py
|
||||
@@ -67,6 +67,20 @@ def check_detect_frequency_is_valid(period):
|
||||
return None
|
||||
|
||||
|
||||
+def check_disk_is_available(period_time, disk):
|
||||
+ data_raw = is_iocollect_valid(period_time, disk)
|
||||
+ if data_raw["ret"] == 0:
|
||||
+ try:
|
||||
+ data = json.loads(data_raw["message"])
|
||||
+ except Exception as e:
|
||||
+ return False
|
||||
+ if not data:
|
||||
+ return False
|
||||
+ return True
|
||||
+ else:
|
||||
+ return False
|
||||
+
|
||||
+
|
||||
def _get_raw_data(period, disk_list):
|
||||
return get_io_data(
|
||||
period,
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
index e3a0952..496e032 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
@@ -75,6 +75,18 @@ class Detector:
|
||||
f' sliding_window_type: {self._slidingWindow}')
|
||||
|
||||
|
||||
+def set_to_str(parameter: set):
|
||||
+ ret = ""
|
||||
+ parameter = list(parameter)
|
||||
+ length = len(parameter)
|
||||
+ for i in range(length):
|
||||
+ if i == 0:
|
||||
+ ret += parameter[i]
|
||||
+ else:
|
||||
+ ret += "," + parameter[i]
|
||||
+ return ret
|
||||
+
|
||||
+
|
||||
class DiskDetector:
|
||||
|
||||
def __init__(self, disk_name: str):
|
||||
@@ -124,7 +136,7 @@ class DiskDetector:
|
||||
alarm_type.add(metric_name.metric_name)
|
||||
|
||||
latency_wins, iodump_wins = self.get_detector_list_window()
|
||||
- details = f"latency: {latency_wins}, iodump: {iodump_wins}"
|
||||
+ details = {"latency": latency_wins, "iodump": iodump_wins}
|
||||
|
||||
io_press = {"throtl", "wbt", "iocost", "bfq"}
|
||||
driver_slow = {"rq_driver"}
|
||||
@@ -137,7 +149,7 @@ class DiskDetector:
|
||||
elif not kernel_slow.isdisjoint(block_stack):
|
||||
reason = "kernel_slow"
|
||||
|
||||
- return True, driver_name, reason, str(block_stack), str(io_type), str(alarm_type), details
|
||||
+ return True, driver_name, reason, set_to_str(block_stack), set_to_str(io_type), set_to_str(alarm_type), details
|
||||
|
||||
def __repr__(self):
|
||||
msg = f'disk: {self._disk_name}, '
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py
|
||||
index 4083c43..ff3fa3b 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/sliding_window.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py
|
||||
@@ -107,7 +107,7 @@ class MedianSlidingWindow(SlidingWindow):
|
||||
if len(self._io_data_queue) < self._queue_length or (self._ai_threshold is None and self._abs_threshold is None):
|
||||
is_slow_io_event = False
|
||||
median = np.median(self._io_data_queue)
|
||||
- if median >= self._ai_threshold:
|
||||
+ if (self._ai_threshold is not None and median > self._ai_threshold) or (self._abs_threshold is not None and median > self._abs_threshold):
|
||||
is_slow_io_event = True
|
||||
return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim
|
||||
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/threshold.py b/src/python/sentryPlugins/ai_block_io/threshold.py
|
||||
index 600d041..e202bb8 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/threshold.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/threshold.py
|
||||
@@ -65,9 +65,12 @@ class Threshold:
|
||||
def __repr__(self):
|
||||
return "Threshold"
|
||||
|
||||
+ def __str__(self):
|
||||
+ return "Threshold"
|
||||
+
|
||||
|
||||
class AbsoluteThreshold(Threshold):
|
||||
- def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000):
|
||||
+ def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs):
|
||||
super().__init__(data_queue_size, data_queue_update_size)
|
||||
|
||||
def push_latest_data_to_queue(self, data):
|
||||
@@ -76,6 +79,9 @@ class AbsoluteThreshold(Threshold):
|
||||
def __repr__(self):
|
||||
return "[AbsoluteThreshold]"
|
||||
|
||||
+ def __str__(self):
|
||||
+ return "absolute"
|
||||
+
|
||||
|
||||
class BoxplotThreshold(Threshold):
|
||||
def __init__(self, boxplot_parameter: float = 1.5, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs):
|
||||
@@ -112,6 +118,9 @@ class BoxplotThreshold(Threshold):
|
||||
def __repr__(self):
|
||||
return f"[BoxplotThreshold, param is: {self.parameter}, train_size: {self.data_queue.maxsize}, update_size: {self.data_queue_update_size}]"
|
||||
|
||||
+ def __str__(self):
|
||||
+ return "boxplot"
|
||||
+
|
||||
|
||||
class NSigmaThreshold(Threshold):
|
||||
def __init__(self, n_sigma_parameter: float = 3.0, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs):
|
||||
@@ -147,6 +156,9 @@ class NSigmaThreshold(Threshold):
|
||||
def __repr__(self):
|
||||
return f"[NSigmaThreshold, param is: {self.parameter}, train_size: {self.data_queue.maxsize}, update_size: {self.data_queue_update_size}]"
|
||||
|
||||
+ def __str__(self):
|
||||
+ return "n_sigma"
|
||||
+
|
||||
|
||||
class ThresholdType(Enum):
|
||||
AbsoluteThreshold = 0
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/utils.py b/src/python/sentryPlugins/ai_block_io/utils.py
|
||||
index d6f4067..7d2390b 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/utils.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/utils.py
|
||||
@@ -19,8 +19,6 @@ from .io_data import MetricName, IOData
|
||||
|
||||
|
||||
def get_threshold_type_enum(algorithm_type: str):
|
||||
- if algorithm_type.lower() == "absolute":
|
||||
- return ThresholdType.AbsoluteThreshold
|
||||
if algorithm_type.lower() == "boxplot":
|
||||
return ThresholdType.BoxplotThreshold
|
||||
if algorithm_type.lower() == "n_sigma":
|
||||
--
|
||||
2.23.0
|
||||
|
||||
@ -1,98 +0,0 @@
|
||||
From 8e4f39897dc8dc51cfa0bbf24667be1688876c15 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
|
||||
Date: Mon, 21 Oct 2024 14:18:20 +0800
|
||||
Subject: [PATCH] ai_block_io lack section exit
|
||||
|
||||
---
|
||||
.../ai_block_io/config_parser.py | 40 +++++++++----------
|
||||
1 file changed, 20 insertions(+), 20 deletions(-)
|
||||
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
index 7b0cd29..447eccd 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
@@ -401,11 +401,9 @@ class ConfigParser:
|
||||
self._read_stage(items_common)
|
||||
self._read_iotype(items_common)
|
||||
else:
|
||||
- self._conf["common"]["stage"] = ALL_STAGE_LIST
|
||||
- self._conf["common"]["iotype"] = ALL_IOTPYE_LIST
|
||||
- logging.warning(
|
||||
- "common section parameter not found, it will be set to default value."
|
||||
- )
|
||||
+ Report.report_pass("not found common section. exiting...")
|
||||
+ logging.critical("not found common section. exiting...")
|
||||
+ exit(1)
|
||||
|
||||
if con.has_section("algorithm"):
|
||||
items_algorithm = dict(con.items("algorithm"))
|
||||
@@ -413,9 +411,9 @@ class ConfigParser:
|
||||
self._read_train_update_duration(items_algorithm)
|
||||
self._read_algorithm_type_and_parameter(items_algorithm)
|
||||
else:
|
||||
- logging.warning(
|
||||
- "algorithm section parameter not found, it will be set to default value."
|
||||
- )
|
||||
+ Report.report_pass("not found algorithm section. exiting...")
|
||||
+ logging.critical("not found algorithm section. exiting...")
|
||||
+ exit(1)
|
||||
|
||||
if con.has_section("sliding_window"):
|
||||
items_sliding_window = dict(con.items("sliding_window"))
|
||||
@@ -423,9 +421,9 @@ class ConfigParser:
|
||||
self._read_window_size(items_sliding_window)
|
||||
self._read_window_minimum_threshold(items_sliding_window)
|
||||
else:
|
||||
- logging.warning(
|
||||
- "sliding_window section parameter not found, it will be set to default value."
|
||||
- )
|
||||
+ Report.report_pass("not found sliding_window section. exiting...")
|
||||
+ logging.critical("not found sliding_window section. exiting...")
|
||||
+ exit(1)
|
||||
|
||||
if con.has_section("latency_sata_ssd"):
|
||||
items_latency_sata_ssd = dict(con.items("latency_sata_ssd"))
|
||||
@@ -444,9 +442,10 @@ class ConfigParser:
|
||||
gt=0,
|
||||
)
|
||||
else:
|
||||
- logging.warning(
|
||||
- "latency_sata_ssd section parameter not found, it will be set to default value."
|
||||
- )
|
||||
+ Report.report_pass("not found latency_sata_ssd section. exiting...")
|
||||
+ logging.critical("not found latency_sata_ssd section. exiting...")
|
||||
+ exit(1)
|
||||
+
|
||||
if con.has_section("latency_nvme_ssd"):
|
||||
items_latency_nvme_ssd = dict(con.items("latency_nvme_ssd"))
|
||||
self._conf["latency_nvme_ssd"]["read_tot_lim"] = self._get_config_value(
|
||||
@@ -464,9 +463,10 @@ class ConfigParser:
|
||||
gt=0,
|
||||
)
|
||||
else:
|
||||
- logging.warning(
|
||||
- "latency_nvme_ssd section parameter not found, it will be set to default value."
|
||||
- )
|
||||
+ Report.report_pass("not found latency_nvme_ssd section. exiting...")
|
||||
+ logging.critical("not found latency_nvme_ssd section. exiting...")
|
||||
+ exit(1)
|
||||
+
|
||||
if con.has_section("latency_sata_hdd"):
|
||||
items_latency_sata_hdd = dict(con.items("latency_sata_hdd"))
|
||||
self._conf["latency_sata_hdd"]["read_tot_lim"] = self._get_config_value(
|
||||
@@ -484,9 +484,9 @@ class ConfigParser:
|
||||
gt=0,
|
||||
)
|
||||
else:
|
||||
- logging.warning(
|
||||
- "latency_sata_hdd section parameter not found, it will be set to default value."
|
||||
- )
|
||||
+ Report.report_pass("not found latency_sata_hdd section. exiting...")
|
||||
+ logging.critical("not found latency_sata_hdd section. exiting...")
|
||||
+ exit(1)
|
||||
|
||||
self.__print_all_config_value()
|
||||
|
||||
--
|
||||
2.23.0
|
||||
|
||||
@ -1,728 +0,0 @@
|
||||
From cedd862d4e4a97a6c4fa13cbff2af452910ea5b4 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
|
||||
Date: Thu, 24 Oct 2024 09:39:16 +0800
|
||||
Subject: [PATCH] ai_block_io support absolute threshold lower limit
|
||||
|
||||
---
|
||||
config/plugins/ai_block_io.ini | 19 +-
|
||||
.../sentryPlugins/ai_block_io/ai_block_io.py | 36 ++--
|
||||
.../sentryPlugins/ai_block_io/alarm_report.py | 18 +-
|
||||
.../ai_block_io/config_parser.py | 168 ++++++++++++------
|
||||
.../sentryPlugins/ai_block_io/detector.py | 92 ++++++----
|
||||
.../ai_block_io/sliding_window.py | 21 ++-
|
||||
6 files changed, 222 insertions(+), 132 deletions(-)
|
||||
|
||||
diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini
|
||||
index 040237d..d0b1e74 100644
|
||||
--- a/config/plugins/ai_block_io.ini
|
||||
+++ b/config/plugins/ai_block_io.ini
|
||||
@@ -2,9 +2,9 @@
|
||||
level=info
|
||||
|
||||
[common]
|
||||
-slow_io_detect_frequency=1
|
||||
+period_time=1
|
||||
disk=default
|
||||
-stage=bio
|
||||
+stage=default
|
||||
iotype=read,write
|
||||
|
||||
[algorithm]
|
||||
@@ -12,22 +12,25 @@ train_data_duration=24
|
||||
train_update_duration=2
|
||||
algorithm_type=boxplot
|
||||
boxplot_parameter=1.5
|
||||
-n_sigma_parameter=3
|
||||
-
|
||||
-[sliding_window]
|
||||
-sliding_window_type=not_continuous
|
||||
-window_size=30
|
||||
-window_minimum_threshold=6
|
||||
+win_type=not_continuous
|
||||
+win_size=30
|
||||
+win_threshold=6
|
||||
|
||||
[latency_sata_ssd]
|
||||
+read_avg_lim=10000
|
||||
+write_avg_lim=10000
|
||||
read_tot_lim=50000
|
||||
write_tot_lim=50000
|
||||
|
||||
[latency_nvme_ssd]
|
||||
+read_avg_lim=300
|
||||
+write_avg_lim=300
|
||||
read_tot_lim=500
|
||||
write_tot_lim=500
|
||||
|
||||
[latency_sata_hdd]
|
||||
+read_avg_lim=15000
|
||||
+write_avg_lim=15000
|
||||
read_tot_lim=50000
|
||||
write_tot_lim=50000
|
||||
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
index f25e6d5..74f246a 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
@@ -49,7 +49,7 @@ class SlowIODetection:
|
||||
|
||||
def __init_detector_name_list(self):
|
||||
self._disk_list = check_collect_valid(
|
||||
- self._config_parser.slow_io_detect_frequency
|
||||
+ self._config_parser.period_time
|
||||
)
|
||||
if self._disk_list is None:
|
||||
Report.report_pass(
|
||||
@@ -109,7 +109,7 @@ class SlowIODetection:
|
||||
train_data_duration, train_update_duration = (
|
||||
self._config_parser.get_train_data_duration_and_train_update_duration()
|
||||
)
|
||||
- slow_io_detection_frequency = self._config_parser.slow_io_detect_frequency
|
||||
+ slow_io_detection_frequency = self._config_parser.period_time
|
||||
threshold_type = self._config_parser.algorithm_type
|
||||
data_queue_size, update_size = get_data_queue_size_and_update_size(
|
||||
train_data_duration, train_update_duration, slow_io_detection_frequency
|
||||
@@ -131,10 +131,13 @@ class SlowIODetection:
|
||||
data_queue_size=data_queue_size,
|
||||
data_queue_update_size=update_size,
|
||||
)
|
||||
- abs_threshold = self._config_parser.get_tot_lim(
|
||||
+ tot_lim = self._config_parser.get_tot_lim(
|
||||
metric_name.disk_type, metric_name.io_access_type_name
|
||||
)
|
||||
- if abs_threshold is None:
|
||||
+ avg_lim = self._config_parser.get_avg_lim(
|
||||
+ metric_name.disk_type, metric_name.io_access_type_name
|
||||
+ )
|
||||
+ if tot_lim is None:
|
||||
logging.warning(
|
||||
"disk %s, disk type %s, io type %s, get tot lim error, so it will be ignored.",
|
||||
disk,
|
||||
@@ -145,7 +148,8 @@ class SlowIODetection:
|
||||
sliding_window_type,
|
||||
queue_length=window_size,
|
||||
threshold=window_threshold,
|
||||
- abs_threshold=abs_threshold,
|
||||
+ abs_threshold=tot_lim,
|
||||
+ avg_lim=avg_lim
|
||||
)
|
||||
detector = Detector(metric_name, threshold, sliding_window)
|
||||
disk_detector.add_detector(detector)
|
||||
@@ -176,7 +180,7 @@ class SlowIODetection:
|
||||
|
||||
# Step1:获取IO数据
|
||||
io_data_dict_with_disk_name = get_io_data_from_collect_plug(
|
||||
- self._config_parser.slow_io_detect_frequency, self._disk_list
|
||||
+ self._config_parser.period_time, self._disk_list
|
||||
)
|
||||
logging.debug(f"step1. Get io data: {str(io_data_dict_with_disk_name)}")
|
||||
if io_data_dict_with_disk_name is None:
|
||||
@@ -197,25 +201,21 @@ class SlowIODetection:
|
||||
# Step3:慢IO事件上报
|
||||
logging.debug("step3. Report slow io event to sysSentry.")
|
||||
for slow_io_event in slow_io_event_list:
|
||||
- metric_name: MetricName = slow_io_event[1]
|
||||
- window_info = slow_io_event[2]
|
||||
- root_cause = slow_io_event[3]
|
||||
alarm_content = {
|
||||
- "driver_name": f"{metric_name.disk_name}",
|
||||
- "reason": root_cause,
|
||||
- "block_stack": f"{metric_name.stage_name}",
|
||||
- "io_type": f"{metric_name.io_access_type_name}",
|
||||
+ "driver_name": slow_io_event[1],
|
||||
+ "reason": slow_io_event[2],
|
||||
+ "block_stack": slow_io_event[3],
|
||||
+ "io_type": slow_io_event[4],
|
||||
"alarm_source": "ai_block_io",
|
||||
- "alarm_type": "latency",
|
||||
- "details": f"disk type: {metric_name.disk_type}, current window: {window_info[1]}, "
|
||||
- f"ai threshold: {window_info[2]}, abs threshold: {window_info[3]}.",
|
||||
+ "alarm_type": slow_io_event[5],
|
||||
+ "details": slow_io_event[6],
|
||||
}
|
||||
Xalarm.major(alarm_content)
|
||||
- logging.warning(alarm_content)
|
||||
+ logging.warning("[SLOW IO] " + str(alarm_content))
|
||||
|
||||
# Step4:等待检测时间
|
||||
logging.debug("step4. Wait to start next slow io event detection loop.")
|
||||
- time.sleep(self._config_parser.slow_io_detect_frequency)
|
||||
+ time.sleep(self._config_parser.period_time)
|
||||
|
||||
|
||||
def main():
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/alarm_report.py b/src/python/sentryPlugins/ai_block_io/alarm_report.py
|
||||
index 92bd6e3..61bb145 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/alarm_report.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/alarm_report.py
|
||||
@@ -30,17 +30,17 @@ class Report:
|
||||
@staticmethod
|
||||
def report_pass(info: str):
|
||||
report_result(Report.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info}))
|
||||
- logging.info(f'Report {Report.TASK_NAME} PASS: {info}')
|
||||
+ logging.debug(f'Report {Report.TASK_NAME} PASS: {info}')
|
||||
|
||||
@staticmethod
|
||||
def report_fail(info: str):
|
||||
report_result(Report.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info}))
|
||||
- logging.info(f'Report {Report.TASK_NAME} FAIL: {info}')
|
||||
+ logging.debug(f'Report {Report.TASK_NAME} FAIL: {info}')
|
||||
|
||||
@staticmethod
|
||||
def report_skip(info: str):
|
||||
report_result(Report.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info}))
|
||||
- logging.info(f'Report {Report.TASK_NAME} SKIP: {info}')
|
||||
+ logging.debug(f'Report {Report.TASK_NAME} SKIP: {info}')
|
||||
|
||||
|
||||
class Xalarm:
|
||||
@@ -50,31 +50,31 @@ class Xalarm:
|
||||
def minor(info: dict):
|
||||
info_str = json.dumps(info)
|
||||
xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_OCCUR, info_str)
|
||||
- logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM: {info_str}")
|
||||
+ logging.debug(f"Report {Xalarm.ALARM_ID} MINOR_ALM: {info_str}")
|
||||
|
||||
@staticmethod
|
||||
def major(info: dict):
|
||||
info_str = json.dumps(info)
|
||||
xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_OCCUR, info_str)
|
||||
- logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM: {info_str}")
|
||||
+ logging.debug(f"Report {Xalarm.ALARM_ID} MAJOR_ALM: {info_str}")
|
||||
|
||||
@staticmethod
|
||||
def critical(info: dict):
|
||||
info_str = json.dumps(info)
|
||||
xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_OCCUR, info_str)
|
||||
- logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM: {info_str}")
|
||||
+ logging.debug(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM: {info_str}")
|
||||
|
||||
def minor_recover(info: dict):
|
||||
info_str = json.dumps(info)
|
||||
xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_RECOVER, info_str)
|
||||
- logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM Recover: {info_str}")
|
||||
+ logging.debug(f"Report {Xalarm.ALARM_ID} MINOR_ALM Recover: {info_str}")
|
||||
|
||||
def major_recover(info: dict):
|
||||
info_str = json.dumps(info)
|
||||
xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_RECOVER, info_str)
|
||||
- logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM Recover: {info_str}")
|
||||
+ logging.debug(f"Report {Xalarm.ALARM_ID} MAJOR_ALM Recover: {info_str}")
|
||||
|
||||
def critical_recover(info: dict):
|
||||
info_str = json.dumps(info)
|
||||
xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_RECOVER, info_str)
|
||||
- logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM Recover: {info_str}")
|
||||
+ logging.debug(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM Recover: {info_str}")
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
index 1117939..91ec5c6 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
@@ -52,7 +52,7 @@ class ConfigParser:
|
||||
DEFAULT_CONF = {
|
||||
"log": {"level": "info"},
|
||||
"common": {
|
||||
- "slow_io_detect_frequency": 1,
|
||||
+ "period_time": 1,
|
||||
"disk": None,
|
||||
"stage": "throtl,wbt,gettag,plug,deadline,hctx,requeue,rq_driver,bio",
|
||||
"iotype": "read,write",
|
||||
@@ -63,16 +63,32 @@ class ConfigParser:
|
||||
"algorithm_type": get_threshold_type_enum("boxplot"),
|
||||
"boxplot_parameter": 1.5,
|
||||
"n_sigma_parameter": 3.0,
|
||||
+ "win_type": get_sliding_window_type_enum("not_continuous"),
|
||||
+ "win_size": 30,
|
||||
+ "win_threshold": 6,
|
||||
},
|
||||
- "sliding_window": {
|
||||
- "sliding_window_type": get_sliding_window_type_enum("not_continuous"),
|
||||
- "window_size": 30,
|
||||
- "window_minimum_threshold": 6,
|
||||
+ "latency_sata_ssd": {
|
||||
+ "read_avg_lim": 10000,
|
||||
+ "write_avg_lim": 10000,
|
||||
+ "read_tot_lim": 50000,
|
||||
+ "write_tot_lim": 50000
|
||||
},
|
||||
- "latency_sata_ssd": {"read_tot_lim": 50000, "write_tot_lim": 50000},
|
||||
- "latency_nvme_ssd": {"read_tot_lim": 500, "write_tot_lim": 500},
|
||||
- "latency_sata_hdd": {"read_tot_lim": 50000, "write_tot_lim": 50000},
|
||||
- "iodump": {"read_iodump_lim": 0, "write_iodump_lim": 0}
|
||||
+ "latency_nvme_ssd": {
|
||||
+ "read_avg_lim": 300,
|
||||
+ "write_avg_lim": 300,
|
||||
+ "read_tot_lim": 500,
|
||||
+ "write_tot_lim": 500
|
||||
+ },
|
||||
+ "latency_sata_hdd": {
|
||||
+ "read_avg_lim": 15000,
|
||||
+ "write_avg_lim": 15000,
|
||||
+ "read_tot_lim": 50000,
|
||||
+ "write_tot_lim": 50000
|
||||
+ },
|
||||
+ "iodump": {
|
||||
+ "read_iodump_lim": 0,
|
||||
+ "write_iodump_lim": 0
|
||||
+ }
|
||||
}
|
||||
|
||||
def __init__(self, config_file_name):
|
||||
@@ -161,18 +177,18 @@ class ConfigParser:
|
||||
|
||||
return value
|
||||
|
||||
- def _read_slow_io_detect_frequency(self, items_common: dict):
|
||||
- self._conf["common"]["slow_io_detect_frequency"] = self._get_config_value(
|
||||
+ def _read_period_time(self, items_common: dict):
|
||||
+ self._conf["common"]["period_time"] = self._get_config_value(
|
||||
items_common,
|
||||
- "slow_io_detect_frequency",
|
||||
+ "period_time",
|
||||
int,
|
||||
- self.DEFAULT_CONF["common"]["slow_io_detect_frequency"],
|
||||
+ self.DEFAULT_CONF["common"]["period_time"],
|
||||
gt=0
|
||||
)
|
||||
- frequency = self._conf["common"]["slow_io_detect_frequency"]
|
||||
+ frequency = self._conf["common"]["period_time"]
|
||||
ret = check_detect_frequency_is_valid(frequency)
|
||||
if ret is None:
|
||||
- log = f"slow io detect frequency: {frequency} is valid, "\
|
||||
+ log = f"period_time: {frequency} is valid, "\
|
||||
f"Check whether the value range is too large or is not an "\
|
||||
f"integer multiple of period_time.. exiting..."
|
||||
Report.report_pass(log)
|
||||
@@ -316,50 +332,41 @@ class ConfigParser:
|
||||
self._conf["common"]["iotype"] = dup_iotype_list
|
||||
|
||||
def _read_sliding_window_type(self, items_sliding_window: dict):
|
||||
- sliding_window_type = items_sliding_window.get("sliding_window_type")
|
||||
+ sliding_window_type = items_sliding_window.get("win_type")
|
||||
if sliding_window_type is not None:
|
||||
- self._conf["sliding_window"]["sliding_window_type"] = (
|
||||
+ self._conf["algorithm"]["win_type"] = (
|
||||
get_sliding_window_type_enum(sliding_window_type)
|
||||
)
|
||||
- if self._conf["sliding_window"]["sliding_window_type"] is None:
|
||||
+ if self._conf["algorithm"]["win_type"] is None:
|
||||
logging.critical(
|
||||
- "the sliding_window_type: %s you set is invalid. ai_block_io plug will exit.",
|
||||
+ "the win_type: %s you set is invalid. ai_block_io plug will exit.",
|
||||
sliding_window_type,
|
||||
)
|
||||
Report.report_pass(
|
||||
- f"the sliding_window_type: {sliding_window_type} you set is invalid. ai_block_io plug will exit."
|
||||
+ f"the win_type: {sliding_window_type} you set is invalid. ai_block_io plug will exit."
|
||||
)
|
||||
exit(1)
|
||||
|
||||
def _read_window_size(self, items_sliding_window: dict):
|
||||
- self._conf["sliding_window"]["window_size"] = self._get_config_value(
|
||||
+ self._conf["algorithm"]["win_size"] = self._get_config_value(
|
||||
items_sliding_window,
|
||||
- "window_size",
|
||||
+ "win_size",
|
||||
int,
|
||||
- self.DEFAULT_CONF["sliding_window"]["window_size"],
|
||||
+ self.DEFAULT_CONF["algorithm"]["win_size"],
|
||||
gt=0,
|
||||
- le=3600,
|
||||
+ le=300,
|
||||
)
|
||||
|
||||
def _read_window_minimum_threshold(self, items_sliding_window: dict):
|
||||
- default_window_minimum_threshold = self.DEFAULT_CONF["sliding_window"][
|
||||
- "window_minimum_threshold"
|
||||
- ]
|
||||
- if (
|
||||
- default_window_minimum_threshold
|
||||
- > self._conf["sliding_window"]["window_size"]
|
||||
- ):
|
||||
- default_window_minimum_threshold = (
|
||||
- self._conf["sliding_window"]["window_size"] / 2
|
||||
- )
|
||||
- self._conf["sliding_window"]["window_minimum_threshold"] = (
|
||||
+ default_window_minimum_threshold = self.DEFAULT_CONF["algorithm"]["win_threshold"]
|
||||
+ self._conf["algorithm"]["win_threshold"] = (
|
||||
self._get_config_value(
|
||||
items_sliding_window,
|
||||
- "window_minimum_threshold",
|
||||
+ "win_threshold",
|
||||
int,
|
||||
default_window_minimum_threshold,
|
||||
gt=0,
|
||||
- le=self._conf["sliding_window"]["window_size"],
|
||||
+ le=self._conf["algorithm"]["win_size"],
|
||||
)
|
||||
)
|
||||
|
||||
@@ -406,7 +413,7 @@ class ConfigParser:
|
||||
if con.has_section("common"):
|
||||
items_common = dict(con.items("common"))
|
||||
|
||||
- self._read_slow_io_detect_frequency(items_common)
|
||||
+ self._read_period_time(items_common)
|
||||
self._read_disks_to_detect(items_common)
|
||||
self._read_stage(items_common)
|
||||
self._read_iotype(items_common)
|
||||
@@ -420,20 +427,9 @@ class ConfigParser:
|
||||
self._read_train_data_duration(items_algorithm)
|
||||
self._read_train_update_duration(items_algorithm)
|
||||
self._read_algorithm_type_and_parameter(items_algorithm)
|
||||
- else:
|
||||
- Report.report_pass("not found algorithm section. exiting...")
|
||||
- logging.critical("not found algorithm section. exiting...")
|
||||
- exit(1)
|
||||
-
|
||||
- if con.has_section("sliding_window"):
|
||||
- items_sliding_window = dict(con.items("sliding_window"))
|
||||
-
|
||||
- self._read_window_size(items_sliding_window)
|
||||
- self._read_window_minimum_threshold(items_sliding_window)
|
||||
- else:
|
||||
- Report.report_pass("not found sliding_window section. exiting...")
|
||||
- logging.critical("not found sliding_window section. exiting...")
|
||||
- exit(1)
|
||||
+ self._read_sliding_window_type(items_algorithm)
|
||||
+ self._read_window_size(items_algorithm)
|
||||
+ self._read_window_minimum_threshold(items_algorithm)
|
||||
|
||||
if con.has_section("latency_sata_ssd"):
|
||||
items_latency_sata_ssd = dict(con.items("latency_sata_ssd"))
|
||||
@@ -451,6 +447,20 @@ class ConfigParser:
|
||||
self.DEFAULT_CONF["latency_sata_ssd"]["write_tot_lim"],
|
||||
gt=0,
|
||||
)
|
||||
+ self._conf["latency_sata_ssd"]["read_avg_lim"] = self._get_config_value(
|
||||
+ items_latency_sata_ssd,
|
||||
+ "read_avg_lim",
|
||||
+ int,
|
||||
+ self.DEFAULT_CONF["latency_sata_ssd"]["read_avg_lim"],
|
||||
+ gt=0
|
||||
+ )
|
||||
+ self._conf["latency_sata_ssd"]["write_avg_lim"] = self._get_config_value(
|
||||
+ items_latency_sata_ssd,
|
||||
+ "write_avg_lim",
|
||||
+ int,
|
||||
+ self.DEFAULT_CONF["latency_sata_ssd"]["write_avg_lim"],
|
||||
+ gt=0
|
||||
+ )
|
||||
else:
|
||||
Report.report_pass("not found latency_sata_ssd section. exiting...")
|
||||
logging.critical("not found latency_sata_ssd section. exiting...")
|
||||
@@ -472,6 +482,20 @@ class ConfigParser:
|
||||
self.DEFAULT_CONF["latency_nvme_ssd"]["write_tot_lim"],
|
||||
gt=0,
|
||||
)
|
||||
+ self._conf["latency_nvme_ssd"]["read_avg_lim"] = self._get_config_value(
|
||||
+ items_latency_nvme_ssd,
|
||||
+ "read_avg_lim",
|
||||
+ int,
|
||||
+ self.DEFAULT_CONF["latency_nvme_ssd"]["read_avg_lim"],
|
||||
+ gt=0
|
||||
+ )
|
||||
+ self._conf["latency_nvme_ssd"]["write_avg_lim"] = self._get_config_value(
|
||||
+ items_latency_nvme_ssd,
|
||||
+ "write_avg_lim",
|
||||
+ int,
|
||||
+ self.DEFAULT_CONF["latency_nvme_ssd"]["write_avg_lim"],
|
||||
+ gt=0
|
||||
+ )
|
||||
else:
|
||||
Report.report_pass("not found latency_nvme_ssd section. exiting...")
|
||||
logging.critical("not found latency_nvme_ssd section. exiting...")
|
||||
@@ -493,6 +517,20 @@ class ConfigParser:
|
||||
self.DEFAULT_CONF["latency_sata_hdd"]["write_tot_lim"],
|
||||
gt=0,
|
||||
)
|
||||
+ self._conf["latency_sata_hdd"]["read_avg_lim"] = self._get_config_value(
|
||||
+ items_latency_sata_hdd,
|
||||
+ "read_avg_lim",
|
||||
+ int,
|
||||
+ self.DEFAULT_CONF["latency_sata_hdd"]["read_avg_lim"],
|
||||
+ gt=0
|
||||
+ )
|
||||
+ self._conf["latency_sata_hdd"]["write_avg_lim"] = self._get_config_value(
|
||||
+ items_latency_sata_hdd,
|
||||
+ "write_avg_lim",
|
||||
+ int,
|
||||
+ self.DEFAULT_CONF["latency_sata_hdd"]["write_avg_lim"],
|
||||
+ gt=0
|
||||
+ )
|
||||
else:
|
||||
Report.report_pass("not found latency_sata_hdd section. exiting...")
|
||||
logging.critical("not found latency_sata_hdd section. exiting...")
|
||||
@@ -542,6 +580,18 @@ class ConfigParser:
|
||||
else:
|
||||
return None
|
||||
|
||||
+ def get_avg_lim(self, disk_type, io_type):
|
||||
+ if io_type == "read":
|
||||
+ return self._conf.get(
|
||||
+ f"latency_{DISK_TYPE_MAP.get(disk_type, '')}", {}
|
||||
+ ).get("read_avg_lim", None)
|
||||
+ elif io_type == "write":
|
||||
+ return self._conf.get(
|
||||
+ f"latency_{DISK_TYPE_MAP.get(disk_type, '')}", {}
|
||||
+ ).get("write_avg_lim", None)
|
||||
+ else:
|
||||
+ return None
|
||||
+
|
||||
def get_train_data_duration_and_train_update_duration(self):
|
||||
return (
|
||||
self._conf["algorithm"]["train_data_duration"],
|
||||
@@ -550,13 +600,13 @@ class ConfigParser:
|
||||
|
||||
def get_window_size_and_window_minimum_threshold(self):
|
||||
return (
|
||||
- self._conf["sliding_window"]["window_size"],
|
||||
- self._conf["sliding_window"]["window_minimum_threshold"],
|
||||
+ self._conf["algorithm"]["win_size"],
|
||||
+ self._conf["algorithm"]["win_threshold"],
|
||||
)
|
||||
|
||||
@property
|
||||
- def slow_io_detect_frequency(self):
|
||||
- return self._conf["common"]["slow_io_detect_frequency"]
|
||||
+ def period_time(self):
|
||||
+ return self._conf["common"]["period_time"]
|
||||
|
||||
@property
|
||||
def algorithm_type(self):
|
||||
@@ -564,7 +614,7 @@ class ConfigParser:
|
||||
|
||||
@property
|
||||
def sliding_window_type(self):
|
||||
- return self._conf["sliding_window"]["sliding_window_type"]
|
||||
+ return self._conf["algorithm"]["win_type"]
|
||||
|
||||
@property
|
||||
def train_data_duration(self):
|
||||
@@ -576,11 +626,11 @@ class ConfigParser:
|
||||
|
||||
@property
|
||||
def window_size(self):
|
||||
- return self._conf["sliding_window"]["window_size"]
|
||||
+ return self._conf["algorithm"]["win_size"]
|
||||
|
||||
@property
|
||||
def window_minimum_threshold(self):
|
||||
- return self._conf["sliding_window"]["window_minimum_threshold"]
|
||||
+ return self._conf["algorithm"]["win_threshold"]
|
||||
|
||||
@property
|
||||
def absolute_threshold(self):
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
index 8536f7a..e3a0952 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
@@ -28,9 +28,13 @@ class Detector:
|
||||
self._threshold.attach_observer(self._slidingWindow)
|
||||
self._count = None
|
||||
|
||||
- def get_metric_name(self):
|
||||
+ @property
|
||||
+ def metric_name(self):
|
||||
return self._metric_name
|
||||
|
||||
+ def get_sliding_window_data(self):
|
||||
+ return self._slidingWindow.get_data()
|
||||
+
|
||||
def is_slow_io_event(self, io_data_dict_with_disk_name: dict):
|
||||
if self._count is None:
|
||||
self._count = datetime.now()
|
||||
@@ -38,22 +42,27 @@ class Detector:
|
||||
now_time = datetime.now()
|
||||
time_diff = (now_time - self._count).total_seconds()
|
||||
if time_diff >= 60:
|
||||
- logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.")
|
||||
+ logging.info(f"({self._metric_name}) 's latest ai threshold is: {self._threshold.get_threshold()}.")
|
||||
self._count = None
|
||||
|
||||
logging.debug(f'enter Detector: {self}')
|
||||
metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name)
|
||||
if metric_value is None:
|
||||
logging.debug('not found metric value, so return None.')
|
||||
- return (False, False), None, None, None
|
||||
+ return (False, False), None, None, None, None
|
||||
logging.debug(f'input metric value: {str(metric_value)}')
|
||||
self._threshold.push_latest_data_to_queue(metric_value)
|
||||
detection_result = self._slidingWindow.is_slow_io_event(metric_value)
|
||||
# 检测到慢周期,由Detector负责打印info级别日志
|
||||
if detection_result[0][1]:
|
||||
- logging.info(f'[abnormal period happen]: disk info: {self._metric_name}, window: {detection_result[1]}, '
|
||||
- f'current value: {metric_value}, ai threshold: {detection_result[2]}, '
|
||||
- f'absolute threshold: {detection_result[3]}')
|
||||
+ logging.info(f'[abnormal_period]: disk: {self._metric_name.disk_name}, '
|
||||
+ f'stage: {self._metric_name.stage_name}, '
|
||||
+ f'iotype: {self._metric_name.io_access_type_name}, '
|
||||
+ f'metric: {self._metric_name.metric_name}, '
|
||||
+ f'current value: {metric_value}, '
|
||||
+ f'ai threshold: {detection_result[2]}, '
|
||||
+ f'absolute threshold upper limit: {detection_result[3]}, '
|
||||
+ f'lower limit: {detection_result[4]}')
|
||||
else:
|
||||
logging.debug(f'Detection result: {str(detection_result)}')
|
||||
logging.debug(f'exit Detector: {self}')
|
||||
@@ -75,41 +84,60 @@ class DiskDetector:
|
||||
def add_detector(self, detector: Detector):
|
||||
self._detector_list.append(detector)
|
||||
|
||||
+ def get_detector_list_window(self):
|
||||
+ latency_wins = {"read": {}, "write": {}}
|
||||
+ iodump_wins = {"read": {}, "write": {}}
|
||||
+ for detector in self._detector_list:
|
||||
+ if detector.metric_name.metric_name == 'latency':
|
||||
+ latency_wins[detector.metric_name.io_access_type_name][detector.metric_name.stage_name] = detector.get_sliding_window_data()
|
||||
+ elif detector.metric_name.metric_name == 'io_dump':
|
||||
+ iodump_wins[detector.metric_name.io_access_type_name][detector.metric_name.stage_name] = detector.get_sliding_window_data()
|
||||
+ return latency_wins, iodump_wins
|
||||
+
|
||||
def is_slow_io_event(self, io_data_dict_with_disk_name: dict):
|
||||
- """
|
||||
- 根因诊断逻辑:只有bio阶段发生异常,才认为发生了慢IO事件,即bio阶段异常是慢IO事件的必要条件
|
||||
- 情况一:bio异常,rq_driver也异常,则慢盘
|
||||
- 情况二:bio异常,rq_driver无异常,且有内核IO栈任意阶段异常,则IO栈异常
|
||||
- 情况三:bio异常,rq_driver无异常,且无内核IO栈任意阶段异常,则IO压力大
|
||||
- 情况四:bio异常,则UNKNOWN
|
||||
- """
|
||||
- diagnosis_info = {"bio": [], "rq_driver": [], "io_stage": []}
|
||||
+ diagnosis_info = {"bio": [], "rq_driver": [], "kernel_stack": []}
|
||||
for detector in self._detector_list:
|
||||
# result返回内容:(是否检测到慢IO,是否检测到慢周期)、窗口、ai阈值、绝对阈值
|
||||
# 示例: (False, False), self._io_data_queue, self._ai_threshold, self._abs_threshold
|
||||
result = detector.is_slow_io_event(io_data_dict_with_disk_name)
|
||||
if result[0][0]:
|
||||
- if detector.get_metric_name().stage_name == "bio":
|
||||
- diagnosis_info["bio"].append((detector.get_metric_name(), result))
|
||||
- elif detector.get_metric_name().stage_name == "rq_driver":
|
||||
- diagnosis_info["rq_driver"].append((detector.get_metric_name(), result))
|
||||
+ if detector.metric_name.stage_name == "bio":
|
||||
+ diagnosis_info["bio"].append(detector.metric_name)
|
||||
+ elif detector.metric_name.stage_name == "rq_driver":
|
||||
+ diagnosis_info["rq_driver"].append(detector.metric_name)
|
||||
else:
|
||||
- diagnosis_info["io_stage"].append((detector.get_metric_name(), result))
|
||||
+ diagnosis_info["kernel_stack"].append(detector.metric_name)
|
||||
|
||||
- # 返回内容:(1)是否检测到慢IO事件、(2)MetricName、(3)滑动窗口及阈值、(4)慢IO事件根因
|
||||
- root_cause = None
|
||||
if len(diagnosis_info["bio"]) == 0:
|
||||
- return False, None, None, None
|
||||
- elif len(diagnosis_info["rq_driver"]) != 0:
|
||||
- root_cause = "[Root Cause: disk slow]"
|
||||
- elif len(diagnosis_info["io_stage"]) != 0:
|
||||
- stage_list = []
|
||||
- for io_stage in diagnosis_info["io_stage"]:
|
||||
- stage_list.append(io_stage[0].stage_name)
|
||||
- root_cause = f"[Root Cause: io stage slow, stage: {stage_list}]"
|
||||
- if root_cause is None:
|
||||
- root_cause = "[Root Cause: high io pressure]"
|
||||
- return True, diagnosis_info["bio"][0][0], diagnosis_info["bio"][0][1], root_cause
|
||||
+ return False, None, None, None, None, None, None
|
||||
+
|
||||
+ driver_name = self._disk_name
|
||||
+ reason = "unknown"
|
||||
+ block_stack = set()
|
||||
+ io_type = set()
|
||||
+ alarm_type = set()
|
||||
+
|
||||
+ for key, value in diagnosis_info.items():
|
||||
+ for metric_name in value:
|
||||
+ block_stack.add(metric_name.stage_name)
|
||||
+ io_type.add(metric_name.io_access_type_name)
|
||||
+ alarm_type.add(metric_name.metric_name)
|
||||
+
|
||||
+ latency_wins, iodump_wins = self.get_detector_list_window()
|
||||
+ details = f"latency: {latency_wins}, iodump: {iodump_wins}"
|
||||
+
|
||||
+ io_press = {"throtl", "wbt", "iocost", "bfq"}
|
||||
+ driver_slow = {"rq_driver"}
|
||||
+ kernel_slow = {"gettag", "plug", "deadline", "hctx", "requeue"}
|
||||
+
|
||||
+ if not io_press.isdisjoint(block_stack):
|
||||
+ reason = "io_press"
|
||||
+ elif not driver_slow.isdisjoint(block_stack):
|
||||
+ reason = "driver_slow"
|
||||
+ elif not kernel_slow.isdisjoint(block_stack):
|
||||
+ reason = "kernel_slow"
|
||||
+
|
||||
+ return True, driver_name, reason, str(block_stack), str(io_type), str(alarm_type), details
|
||||
|
||||
def __repr__(self):
|
||||
msg = f'disk: {self._disk_name}, '
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py
|
||||
index cebe41f..4083c43 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/sliding_window.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py
|
||||
@@ -21,11 +21,12 @@ class SlidingWindowType(Enum):
|
||||
|
||||
|
||||
class SlidingWindow:
|
||||
- def __init__(self, queue_length: int, threshold: int, abs_threshold: int = None):
|
||||
+ def __init__(self, queue_length: int, threshold: int, abs_threshold: int = None, avg_lim: int = None):
|
||||
self._queue_length = queue_length
|
||||
self._queue_threshold = threshold
|
||||
self._ai_threshold = None
|
||||
self._abs_threshold = abs_threshold
|
||||
+ self._avg_lim = avg_lim
|
||||
self._io_data_queue = []
|
||||
self._io_data_queue_abnormal_tag = []
|
||||
|
||||
@@ -35,8 +36,13 @@ class SlidingWindow:
|
||||
self._io_data_queue_abnormal_tag.pop(0)
|
||||
self._io_data_queue.append(data)
|
||||
tag = False
|
||||
- if ((self._ai_threshold is not None and data > self._ai_threshold) or
|
||||
- (self._abs_threshold is not None and data > self._abs_threshold)):
|
||||
+ if self._avg_lim is not None and data < self._avg_lim:
|
||||
+ tag = False
|
||||
+ self._io_data_queue_abnormal_tag.append(tag)
|
||||
+ return tag
|
||||
+ if self._ai_threshold is not None and data > self._ai_threshold:
|
||||
+ tag = True
|
||||
+ if self._abs_threshold is not None and data > self._abs_threshold:
|
||||
tag = True
|
||||
self._io_data_queue_abnormal_tag.append(tag)
|
||||
return tag
|
||||
@@ -52,6 +58,9 @@ class SlidingWindow:
|
||||
def is_slow_io_event(self, data):
|
||||
return False, None, None, None
|
||||
|
||||
+ def get_data(self):
|
||||
+ return self._io_data_queue
|
||||
+
|
||||
def __repr__(self):
|
||||
return "[SlidingWindow]"
|
||||
|
||||
@@ -64,7 +73,7 @@ class NotContinuousSlidingWindow(SlidingWindow):
|
||||
is_slow_io_event = False
|
||||
if self._io_data_queue_abnormal_tag.count(True) >= self._queue_threshold:
|
||||
is_slow_io_event = True
|
||||
- return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold
|
||||
+ return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim
|
||||
|
||||
def __repr__(self):
|
||||
return f"[NotContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]"
|
||||
@@ -85,7 +94,7 @@ class ContinuousSlidingWindow(SlidingWindow):
|
||||
break
|
||||
else:
|
||||
consecutive_count = 0
|
||||
- return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold
|
||||
+ return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim
|
||||
|
||||
def __repr__(self):
|
||||
return f"[ContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]"
|
||||
@@ -100,7 +109,7 @@ class MedianSlidingWindow(SlidingWindow):
|
||||
median = np.median(self._io_data_queue)
|
||||
if median >= self._ai_threshold:
|
||||
is_slow_io_event = True
|
||||
- return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold
|
||||
+ return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim
|
||||
|
||||
def __repr__(self):
|
||||
return f"[MedianSlidingWindow, window size: {self._queue_length}]"
|
||||
--
|
||||
2.23.0
|
||||
|
||||
@ -1,200 +0,0 @@
|
||||
From db97139c411e86d6dc07fe0e91ae38c1bef17a8d Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
|
||||
Date: Tue, 22 Oct 2024 16:37:52 +0800
|
||||
Subject: [PATCH] ai_block_io support iodump
|
||||
|
||||
---
|
||||
config/plugins/ai_block_io.ini | 6 +-
|
||||
.../sentryPlugins/ai_block_io/ai_block_io.py | 75 ++++++++++++-------
|
||||
.../ai_block_io/config_parser.py | 30 ++++++++
|
||||
.../ai_block_io/sliding_window.py | 4 +-
|
||||
4 files changed, 84 insertions(+), 31 deletions(-)
|
||||
|
||||
diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini
|
||||
index 422cfa3..040237d 100644
|
||||
--- a/config/plugins/ai_block_io.ini
|
||||
+++ b/config/plugins/ai_block_io.ini
|
||||
@@ -29,4 +29,8 @@ write_tot_lim=500
|
||||
|
||||
[latency_sata_hdd]
|
||||
read_tot_lim=50000
|
||||
-write_tot_lim=50000
|
||||
\ No newline at end of file
|
||||
+write_tot_lim=50000
|
||||
+
|
||||
+[iodump]
|
||||
+read_iodump_lim=0
|
||||
+write_iodump_lim=0
|
||||
\ No newline at end of file
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
index 4eecd43..f25e6d5 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
@@ -15,7 +15,7 @@ import logging
|
||||
from collections import defaultdict
|
||||
|
||||
from .detector import Detector, DiskDetector
|
||||
-from .threshold import ThresholdFactory
|
||||
+from .threshold import ThresholdFactory, ThresholdType
|
||||
from .sliding_window import SlidingWindowFactory
|
||||
from .utils import get_data_queue_size_and_update_size
|
||||
from .config_parser import ConfigParser
|
||||
@@ -91,9 +91,8 @@ class SlowIODetection:
|
||||
continue
|
||||
for stage in stages:
|
||||
for iotype in iotypes:
|
||||
- self._detector_name_list[disk].append(
|
||||
- MetricName(disk, disk_type, stage, iotype, "latency")
|
||||
- )
|
||||
+ self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "latency"))
|
||||
+ self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "io_dump"))
|
||||
if disks:
|
||||
logging.warning(
|
||||
"disks: %s not in available disk list, so they will be ignored.",
|
||||
@@ -123,31 +122,51 @@ class SlowIODetection:
|
||||
for disk, metric_name_list in self._detector_name_list.items():
|
||||
disk_detector = DiskDetector(disk)
|
||||
for metric_name in metric_name_list:
|
||||
- threshold = ThresholdFactory().get_threshold(
|
||||
- threshold_type,
|
||||
- boxplot_parameter=self._config_parser.boxplot_parameter,
|
||||
- n_sigma_paramter=self._config_parser.n_sigma_parameter,
|
||||
- data_queue_size=data_queue_size,
|
||||
- data_queue_update_size=update_size,
|
||||
- )
|
||||
- abs_threshold = self._config_parser.get_tot_lim(
|
||||
- metric_name.disk_type, metric_name.io_access_type_name
|
||||
- )
|
||||
- if abs_threshold is None:
|
||||
- logging.warning(
|
||||
- "disk %s, disk type %s, io type %s, get tot lim error, so it will be ignored.",
|
||||
- disk,
|
||||
- metric_name.disk_type,
|
||||
- metric_name.io_access_type_name,
|
||||
+
|
||||
+ if metric_name.metric_name == 'latency':
|
||||
+ threshold = ThresholdFactory().get_threshold(
|
||||
+ threshold_type,
|
||||
+ boxplot_parameter=self._config_parser.boxplot_parameter,
|
||||
+ n_sigma_paramter=self._config_parser.n_sigma_parameter,
|
||||
+ data_queue_size=data_queue_size,
|
||||
+ data_queue_update_size=update_size,
|
||||
)
|
||||
- sliding_window = SlidingWindowFactory().get_sliding_window(
|
||||
- sliding_window_type,
|
||||
- queue_length=window_size,
|
||||
- threshold=window_threshold,
|
||||
- abs_threshold=abs_threshold,
|
||||
- )
|
||||
- detector = Detector(metric_name, threshold, sliding_window)
|
||||
- disk_detector.add_detector(detector)
|
||||
+ abs_threshold = self._config_parser.get_tot_lim(
|
||||
+ metric_name.disk_type, metric_name.io_access_type_name
|
||||
+ )
|
||||
+ if abs_threshold is None:
|
||||
+ logging.warning(
|
||||
+ "disk %s, disk type %s, io type %s, get tot lim error, so it will be ignored.",
|
||||
+ disk,
|
||||
+ metric_name.disk_type,
|
||||
+ metric_name.io_access_type_name,
|
||||
+ )
|
||||
+ sliding_window = SlidingWindowFactory().get_sliding_window(
|
||||
+ sliding_window_type,
|
||||
+ queue_length=window_size,
|
||||
+ threshold=window_threshold,
|
||||
+ abs_threshold=abs_threshold,
|
||||
+ )
|
||||
+ detector = Detector(metric_name, threshold, sliding_window)
|
||||
+ disk_detector.add_detector(detector)
|
||||
+ continue
|
||||
+
|
||||
+ elif metric_name.metric_name == 'io_dump':
|
||||
+ threshold = ThresholdFactory().get_threshold(ThresholdType.AbsoluteThreshold)
|
||||
+ abs_threshold = None
|
||||
+ if metric_name.io_access_type_name == 'read':
|
||||
+ abs_threshold = self._config_parser.read_iodump_lim
|
||||
+ elif metric_name.io_access_type_name == 'write':
|
||||
+ abs_threshold = self._config_parser.write_iodump_lim
|
||||
+ sliding_window = SlidingWindowFactory().get_sliding_window(
|
||||
+ sliding_window_type,
|
||||
+ queue_length=window_size,
|
||||
+ threshold=window_threshold
|
||||
+ )
|
||||
+ detector = Detector(metric_name, threshold, sliding_window)
|
||||
+ threshold.set_threshold(abs_threshold)
|
||||
+ disk_detector.add_detector(detector)
|
||||
+
|
||||
logging.info(f"disk: [{disk}] add detector:\n [{disk_detector}]")
|
||||
self._disk_detectors[disk] = disk_detector
|
||||
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
index 274a31e..1117939 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
@@ -72,6 +72,7 @@ class ConfigParser:
|
||||
"latency_sata_ssd": {"read_tot_lim": 50000, "write_tot_lim": 50000},
|
||||
"latency_nvme_ssd": {"read_tot_lim": 500, "write_tot_lim": 500},
|
||||
"latency_sata_hdd": {"read_tot_lim": 50000, "write_tot_lim": 50000},
|
||||
+ "iodump": {"read_iodump_lim": 0, "write_iodump_lim": 0}
|
||||
}
|
||||
|
||||
def __init__(self, config_file_name):
|
||||
@@ -497,6 +498,27 @@ class ConfigParser:
|
||||
logging.critical("not found latency_sata_hdd section. exiting...")
|
||||
exit(1)
|
||||
|
||||
+ if con.has_section("iodump"):
|
||||
+ items_iodump = dict(con.items("iodump"))
|
||||
+ self._conf["iodump"]["read_iodump_lim"] = self._get_config_value(
|
||||
+ items_iodump,
|
||||
+ "read_iodump_lim",
|
||||
+ int,
|
||||
+ self.DEFAULT_CONF["iodump"]["read_iodump_lim"],
|
||||
+ ge=0
|
||||
+ )
|
||||
+ self._conf["iodump"]["write_iodump_lim"] = self._get_config_value(
|
||||
+ items_iodump,
|
||||
+ "write_iodump_lim",
|
||||
+ int,
|
||||
+ self.DEFAULT_CONF["iodump"]["write_iodump_lim"],
|
||||
+ ge=0
|
||||
+ )
|
||||
+ else:
|
||||
+ Report.report_pass("not found iodump section. exiting...")
|
||||
+ logging.critical("not found iodump section. exiting...")
|
||||
+ exit(1)
|
||||
+
|
||||
self.__print_all_config_value()
|
||||
|
||||
def __repr__(self) -> str:
|
||||
@@ -587,3 +609,11 @@ class ConfigParser:
|
||||
@property
|
||||
def n_sigma_parameter(self):
|
||||
return self._conf["algorithm"]["n_sigma_parameter"]
|
||||
+
|
||||
+ @property
|
||||
+ def read_iodump_lim(self):
|
||||
+ return self._conf["iodump"]["read_iodump_lim"]
|
||||
+
|
||||
+ @property
|
||||
+ def write_iodump_lim(self):
|
||||
+ return self._conf["iodump"]["write_iodump_lim"]
|
||||
\ No newline at end of file
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py
|
||||
index d7c402a..cebe41f 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/sliding_window.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py
|
||||
@@ -35,8 +35,8 @@ class SlidingWindow:
|
||||
self._io_data_queue_abnormal_tag.pop(0)
|
||||
self._io_data_queue.append(data)
|
||||
tag = False
|
||||
- if ((self._ai_threshold is not None and data >= self._ai_threshold) or
|
||||
- (self._abs_threshold is not None and data >= self._abs_threshold)):
|
||||
+ if ((self._ai_threshold is not None and data > self._ai_threshold) or
|
||||
+ (self._abs_threshold is not None and data > self._abs_threshold)):
|
||||
tag = True
|
||||
self._io_data_queue_abnormal_tag.append(tag)
|
||||
return tag
|
||||
--
|
||||
2.23.0
|
||||
|
||||
@ -1,906 +0,0 @@
|
||||
From 13dc3712b4530a312aa43610f7696a4a62f30e96 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
|
||||
Date: Fri, 11 Oct 2024 21:50:32 +0800
|
||||
Subject: [PATCH] ai_block_io support stage and iotype
|
||||
|
||||
---
|
||||
config/plugins/ai_block_io.ini | 7 +-
|
||||
.../sentryPlugins/ai_block_io/ai_block_io.py | 126 +++--
|
||||
.../ai_block_io/config_parser.py | 471 +++++++++++++-----
|
||||
.../sentryPlugins/ai_block_io/data_access.py | 11 +-
|
||||
.../sentryPlugins/ai_block_io/detector.py | 25 +
|
||||
src/python/sentryPlugins/ai_block_io/utils.py | 3 +-
|
||||
6 files changed, 453 insertions(+), 190 deletions(-)
|
||||
|
||||
diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini
|
||||
index 01ce266..a814d52 100644
|
||||
--- a/config/plugins/ai_block_io.ini
|
||||
+++ b/config/plugins/ai_block_io.ini
|
||||
@@ -1,7 +1,12 @@
|
||||
+[log]
|
||||
+level=info
|
||||
+
|
||||
[common]
|
||||
absolute_threshold=40
|
||||
slow_io_detect_frequency=1
|
||||
-log_level=info
|
||||
+disk=default
|
||||
+stage=bio
|
||||
+iotype=read,write
|
||||
|
||||
[algorithm]
|
||||
train_data_duration=24
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
index 77104a9..e1052ec 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
@@ -13,7 +13,7 @@ import time
|
||||
import signal
|
||||
import logging
|
||||
|
||||
-from .detector import Detector
|
||||
+from .detector import Detector, DiskDetector
|
||||
from .threshold import ThresholdFactory, AbsoluteThreshold
|
||||
from .sliding_window import SlidingWindowFactory
|
||||
from .utils import get_data_queue_size_and_update_size
|
||||
@@ -34,8 +34,8 @@ def sig_handler(signum, frame):
|
||||
class SlowIODetection:
|
||||
_config_parser = None
|
||||
_disk_list = None
|
||||
- _detector_name_list = []
|
||||
- _detectors = {}
|
||||
+ _detector_name_list = {}
|
||||
+ _disk_detectors = {}
|
||||
|
||||
def __init__(self, config_parser: ConfigParser):
|
||||
self._config_parser = config_parser
|
||||
@@ -43,85 +43,101 @@ class SlowIODetection:
|
||||
self.__init_detector()
|
||||
|
||||
def __init_detector_name_list(self):
|
||||
- self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency())
|
||||
+ self._disk_list = check_collect_valid(self._config_parser.slow_io_detect_frequency)
|
||||
if self._disk_list is None:
|
||||
Report.report_pass("get available disk error, please check if the collector plug is enable. exiting...")
|
||||
exit(1)
|
||||
|
||||
logging.info(f"ai_block_io plug has found disks: {self._disk_list}")
|
||||
- disks_to_detection: list = self._config_parser.get_disks_to_detection()
|
||||
+ disks: list = self._config_parser.disks_to_detection
|
||||
+ stages: list = self._config_parser.stage
|
||||
+ iotypes: list = self._config_parser.iotype
|
||||
# 情况1:None,则启用所有磁盘检测
|
||||
# 情况2:is not None and len = 0,则不启动任何磁盘检测
|
||||
# 情况3:len != 0,则取交集
|
||||
- if disks_to_detection is None:
|
||||
+ if disks is None:
|
||||
logging.warning("you not specify any disk or use default, so ai_block_io will enable all available disk.")
|
||||
for disk in self._disk_list:
|
||||
- self._detector_name_list.append(MetricName(disk, "bio", "read", "latency"))
|
||||
- self._detector_name_list.append(MetricName(disk, "bio", "write", "latency"))
|
||||
- elif len(disks_to_detection) == 0:
|
||||
- logging.warning('please attention: conf file not specify any disk to detection, so it will not start ai block io.')
|
||||
+ for stage in stages:
|
||||
+ for iotype in iotypes:
|
||||
+ if disk not in self._detector_name_list:
|
||||
+ self._detector_name_list[disk] = []
|
||||
+ self._detector_name_list[disk].append(MetricName(disk, stage, iotype, "latency"))
|
||||
else:
|
||||
- for disk_to_detection in disks_to_detection:
|
||||
- if disk_to_detection in self._disk_list:
|
||||
- self._detector_name_list.append(MetricName(disk_to_detection, "bio", "read", "latency"))
|
||||
- self._detector_name_list.append(MetricName(disk_to_detection, "bio", "write", "latency"))
|
||||
+ for disk in disks:
|
||||
+ if disk in self._disk_list:
|
||||
+ for stage in stages:
|
||||
+ for iotype in iotypes:
|
||||
+ if disk not in self._detector_name_list:
|
||||
+ self._detector_name_list[disk] = []
|
||||
+ self._detector_name_list[disk].append(MetricName(disk, stage, iotype, "latency"))
|
||||
else:
|
||||
- logging.warning(f"disk:[{disk_to_detection}] not in available disk list, so it will be ignored.")
|
||||
- logging.info(f'start to detection follow disk and it\'s metric: {self._detector_name_list}')
|
||||
+ logging.warning("disk: [%s] not in available disk list, so it will be ignored.", disk)
|
||||
+ if len(self._detector_name_list) == 0:
|
||||
+ logging.critical("the disks to detection is empty, ai_block_io will exit.")
|
||||
+ Report.report_pass("the disks to detection is empty, ai_block_io will exit.")
|
||||
+ exit(1)
|
||||
|
||||
def __init_detector(self):
|
||||
- train_data_duration, train_update_duration = (self._config_parser.
|
||||
- get_train_data_duration_and_train_update_duration())
|
||||
- slow_io_detection_frequency = self._config_parser.get_slow_io_detect_frequency()
|
||||
- threshold_type = self._config_parser.get_algorithm_type()
|
||||
- data_queue_size, update_size = get_data_queue_size_and_update_size(train_data_duration,
|
||||
- train_update_duration,
|
||||
- slow_io_detection_frequency)
|
||||
- sliding_window_type = self._config_parser.get_sliding_window_type()
|
||||
- window_size, window_threshold = self._config_parser.get_window_size_and_window_minimum_threshold()
|
||||
-
|
||||
- for detector_name in self._detector_name_list:
|
||||
- threshold = ThresholdFactory().get_threshold(threshold_type,
|
||||
- boxplot_parameter=self._config_parser.get_boxplot_parameter(),
|
||||
- n_sigma_paramter=self._config_parser.get_n_sigma_parameter(),
|
||||
- data_queue_size=data_queue_size,
|
||||
- data_queue_update_size=update_size)
|
||||
- sliding_window = SlidingWindowFactory().get_sliding_window(sliding_window_type, queue_length=window_size,
|
||||
- threshold=window_threshold)
|
||||
- detector = Detector(detector_name, threshold, sliding_window)
|
||||
- # 绝对阈值的阈值初始化
|
||||
- if isinstance(threshold, AbsoluteThreshold):
|
||||
- threshold.set_threshold(self._config_parser.get_absolute_threshold())
|
||||
- self._detectors[detector_name] = detector
|
||||
- logging.info(f"add detector: {detector}")
|
||||
+ train_data_duration, train_update_duration = (
|
||||
+ self._config_parser.get_train_data_duration_and_train_update_duration()
|
||||
+ )
|
||||
+ slow_io_detection_frequency = self._config_parser.slow_io_detect_frequency
|
||||
+ threshold_type = self._config_parser.algorithm_type
|
||||
+ data_queue_size, update_size = get_data_queue_size_and_update_size(
|
||||
+ train_data_duration, train_update_duration, slow_io_detection_frequency
|
||||
+ )
|
||||
+ sliding_window_type = self._config_parser.sliding_window_type
|
||||
+ window_size, window_threshold = (self._config_parser.get_window_size_and_window_minimum_threshold())
|
||||
+
|
||||
+ for disk, metric_name_list in self._detector_name_list.items():
|
||||
+ threshold = ThresholdFactory().get_threshold(
|
||||
+ threshold_type,
|
||||
+ boxplot_parameter=self._config_parser.boxplot_parameter,
|
||||
+ n_sigma_paramter=self._config_parser.n_sigma_parameter,
|
||||
+ data_queue_size=data_queue_size,
|
||||
+ data_queue_update_size=update_size,
|
||||
+ )
|
||||
+ sliding_window = SlidingWindowFactory().get_sliding_window(
|
||||
+ sliding_window_type,
|
||||
+ queue_length=window_size,
|
||||
+ threshold=window_threshold,
|
||||
+ )
|
||||
+ disk_detector = DiskDetector(disk)
|
||||
+ for metric_name in metric_name_list:
|
||||
+ detector = Detector(metric_name, threshold, sliding_window)
|
||||
+ disk_detector.add_detector(detector)
|
||||
+ logging.info(f'disk: [{disk}] add detector:\n [{disk_detector}]')
|
||||
+ self._disk_detectors[disk] = disk_detector
|
||||
|
||||
def launch(self):
|
||||
while True:
|
||||
- logging.debug('step0. AI threshold slow io event detection is looping.')
|
||||
+ logging.debug("step0. AI threshold slow io event detection is looping.")
|
||||
|
||||
# Step1:获取IO数据
|
||||
io_data_dict_with_disk_name = get_io_data_from_collect_plug(
|
||||
- self._config_parser.get_slow_io_detect_frequency(), self._disk_list
|
||||
+ self._config_parser.slow_io_detect_frequency, self._disk_list
|
||||
)
|
||||
- logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}')
|
||||
+ logging.debug(f"step1. Get io data: {str(io_data_dict_with_disk_name)}")
|
||||
if io_data_dict_with_disk_name is None:
|
||||
- Report.report_pass("get io data error, please check if the collector plug is enable. exitting...")
|
||||
+ Report.report_pass(
|
||||
+ "get io data error, please check if the collector plug is enable. exitting..."
|
||||
+ )
|
||||
exit(1)
|
||||
|
||||
# Step2:慢IO检测
|
||||
- logging.debug('step2. Start to detection slow io event.')
|
||||
+ logging.debug("step2. Start to detection slow io event.")
|
||||
slow_io_event_list = []
|
||||
- for metric_name, detector in self._detectors.items():
|
||||
- result = detector.is_slow_io_event(io_data_dict_with_disk_name)
|
||||
+ for disk, disk_detector in self._disk_detectors.items():
|
||||
+ result = disk_detector.is_slow_io_event(io_data_dict_with_disk_name)
|
||||
if result[0]:
|
||||
- slow_io_event_list.append((detector.get_metric_name(), result))
|
||||
- logging.debug('step2. End to detection slow io event.')
|
||||
+ slow_io_event_list.append(result)
|
||||
+ logging.debug("step2. End to detection slow io event.")
|
||||
|
||||
# Step3:慢IO事件上报
|
||||
- logging.debug('step3. Report slow io event to sysSentry.')
|
||||
+ logging.debug("step3. Report slow io event to sysSentry.")
|
||||
for slow_io_event in slow_io_event_list:
|
||||
- metric_name: MetricName = slow_io_event[0]
|
||||
- result = slow_io_event[1]
|
||||
+ metric_name: MetricName = slow_io_event[1]
|
||||
alarm_content = {
|
||||
"driver_name": f"{metric_name.get_disk_name()}",
|
||||
"reason": "disk_slow",
|
||||
@@ -129,14 +145,14 @@ class SlowIODetection:
|
||||
"io_type": f"{metric_name.get_io_access_type_name()}",
|
||||
"alarm_source": "ai_block_io",
|
||||
"alarm_type": "latency",
|
||||
- "details": f"current window is: {result[1]}, threshold is: {result[2]}.",
|
||||
+ "details": f"current window is: {slow_io_event[2]}, threshold is: {slow_io_event[3]}.",
|
||||
}
|
||||
Xalarm.major(alarm_content)
|
||||
logging.warning(alarm_content)
|
||||
|
||||
# Step4:等待检测时间
|
||||
- logging.debug('step4. Wait to start next slow io event detection loop.')
|
||||
- time.sleep(self._config_parser.get_slow_io_detect_frequency())
|
||||
+ logging.debug("step4. Wait to start next slow io event detection loop.")
|
||||
+ time.sleep(self._config_parser.slow_io_detect_frequency)
|
||||
|
||||
|
||||
def main():
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
index 354c122..a357766 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
@@ -9,44 +9,60 @@
|
||||
# PURPOSE.
|
||||
# See the Mulan PSL v2 for more details.
|
||||
|
||||
+import os
|
||||
import configparser
|
||||
import logging
|
||||
|
||||
+from .alarm_report import Report
|
||||
from .threshold import ThresholdType
|
||||
from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_log_level
|
||||
|
||||
|
||||
LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
|
||||
|
||||
+ALL_STAGE_LIST = ['throtl', 'wbt', 'gettag', 'plug', 'deadline', 'hctx', 'requeue', 'rq_driver', 'bio']
|
||||
+ALL_IOTPYE_LIST = ['read', 'write']
|
||||
+
|
||||
|
||||
def init_log_format(log_level: str):
|
||||
logging.basicConfig(level=get_log_level(log_level.lower()), format=LOG_FORMAT)
|
||||
- if log_level.lower() not in ('info', 'warning', 'error', 'debug'):
|
||||
- logging.warning(f'the log_level: {log_level} you set is invalid, use default value: info.')
|
||||
+ if log_level.lower() not in ("info", "warning", "error", "debug"):
|
||||
+ logging.warning(
|
||||
+ f"the log_level: {log_level} you set is invalid, use default value: info."
|
||||
+ )
|
||||
|
||||
|
||||
class ConfigParser:
|
||||
DEFAULT_ABSOLUTE_THRESHOLD = 40
|
||||
DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1
|
||||
- DEFAULT_LOG_LEVEL = 'info'
|
||||
+ DEFAULT_LOG_LEVEL = "info"
|
||||
+
|
||||
+ DEFAULT_STAGE = 'throtl,wbt,gettag,plug,deadline,hctx,requeue,rq_driver,bio'
|
||||
+ DEFAULT_IOTYPE = 'read,write'
|
||||
|
||||
- DEFAULT_ALGORITHM_TYPE = 'boxplot'
|
||||
+ DEFAULT_ALGORITHM_TYPE = "boxplot"
|
||||
DEFAULT_TRAIN_DATA_DURATION = 24
|
||||
DEFAULT_TRAIN_UPDATE_DURATION = 2
|
||||
DEFAULT_BOXPLOT_PARAMETER = 1.5
|
||||
DEFAULT_N_SIGMA_PARAMETER = 3
|
||||
|
||||
- DEFAULT_SLIDING_WINDOW_TYPE = 'not_continuous'
|
||||
+ DEFAULT_SLIDING_WINDOW_TYPE = "not_continuous"
|
||||
DEFAULT_WINDOW_SIZE = 30
|
||||
DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6
|
||||
|
||||
def __init__(self, config_file_name):
|
||||
self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD
|
||||
- self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY
|
||||
+ self.__slow_io_detect_frequency = (
|
||||
+ ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY
|
||||
+ )
|
||||
self.__log_level = ConfigParser.DEFAULT_LOG_LEVEL
|
||||
self.__disks_to_detection = None
|
||||
+ self.__stage = ConfigParser.DEFAULT_STAGE
|
||||
+ self.__iotype = ConfigParser.DEFAULT_IOTYPE
|
||||
|
||||
- self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE
|
||||
+ self.__algorithm_type = get_threshold_type_enum(
|
||||
+ ConfigParser.DEFAULT_ALGORITHM_TYPE
|
||||
+ )
|
||||
self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION
|
||||
self.__train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION
|
||||
self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER
|
||||
@@ -58,199 +74,398 @@ class ConfigParser:
|
||||
|
||||
self.__config_file_name = config_file_name
|
||||
|
||||
- def __read_absolute_threshold(self, items_common: dict):
|
||||
+ def _get_config_value(
|
||||
+ self,
|
||||
+ config_items: dict,
|
||||
+ key: str,
|
||||
+ value_type,
|
||||
+ default_value=None,
|
||||
+ gt=None,
|
||||
+ ge=None,
|
||||
+ lt=None,
|
||||
+ le=None,
|
||||
+ ):
|
||||
+ value = config_items.get(key)
|
||||
+ if value is None:
|
||||
+ logging.warning(
|
||||
+ "config of %s not found, the default value %s will be used.",
|
||||
+ key,
|
||||
+ default_value,
|
||||
+ )
|
||||
+ value = default_value
|
||||
+ if not value:
|
||||
+ logging.critical(
|
||||
+ "the value of %s is empty, ai_block_io plug will exit.", key
|
||||
+ )
|
||||
+ Report.report_pass(
|
||||
+ f"the value of {key} is empty, ai_block_io plug will exit."
|
||||
+ )
|
||||
+ exit(1)
|
||||
try:
|
||||
- self.__absolute_threshold = float(items_common.get('absolute_threshold',
|
||||
- ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD))
|
||||
- if self.__absolute_threshold <= 0:
|
||||
- logging.warning(
|
||||
- f'the_absolute_threshold: {self.__absolute_threshold} you set is invalid, use default value: {ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD}.')
|
||||
- self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD
|
||||
+ value = value_type(value)
|
||||
except ValueError:
|
||||
- self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD
|
||||
- logging.warning(
|
||||
- f'the_absolute_threshold type conversion has error, use default value: {self.__absolute_threshold}.')
|
||||
+ logging.critical(
|
||||
+ "the value of %s is not a valid %s, ai_block_io plug will exit.",
|
||||
+ key,
|
||||
+ value_type,
|
||||
+ )
|
||||
+ Report.report_pass(
|
||||
+ f"the value of {key} is not a valid {value_type}, ai_block_io plug will exit."
|
||||
+ )
|
||||
+ exit(1)
|
||||
+ if gt is not None and value <= gt:
|
||||
+ logging.critical(
|
||||
+ "the value of %s is not greater than %s, ai_block_io plug will exit.",
|
||||
+ key,
|
||||
+ gt,
|
||||
+ )
|
||||
+ Report.report_pass(
|
||||
+ f"the value of {key} is not greater than {gt}, ai_block_io plug will exit."
|
||||
+ )
|
||||
+ exit(1)
|
||||
+ if ge is not None and value < ge:
|
||||
+ logging.critical(
|
||||
+ "the value of %s is not greater than or equal to %s, ai_block_io plug will exit.",
|
||||
+ key,
|
||||
+ ge,
|
||||
+ )
|
||||
+ Report.report_pass(
|
||||
+ f"the value of {key} is not greater than or equal to {ge}, ai_block_io plug will exit."
|
||||
+ )
|
||||
+ exit(1)
|
||||
+ if lt is not None and value >= lt:
|
||||
+ logging.critical(
|
||||
+ "the value of %s is not less than %s, ai_block_io plug will exit.",
|
||||
+ key,
|
||||
+ lt,
|
||||
+ )
|
||||
+ Report.report_pass(
|
||||
+ f"the value of {key} is not less than {lt}, ai_block_io plug will exit."
|
||||
+ )
|
||||
+ exit(1)
|
||||
+ if le is not None and value > le:
|
||||
+ logging.critical(
|
||||
+ "the value of %s is not less than or equal to %s, ai_block_io plug will exit.",
|
||||
+ key,
|
||||
+ le,
|
||||
+ )
|
||||
+ Report.report_pass(
|
||||
+ f"the value of {key} is not less than or equal to {le}, ai_block_io plug will exit."
|
||||
+ )
|
||||
+ exit(1)
|
||||
+
|
||||
+ return value
|
||||
+
|
||||
+ def __read_absolute_threshold(self, items_common: dict):
|
||||
+ self.__absolute_threshold = self._get_config_value(
|
||||
+ items_common,
|
||||
+ "absolute_threshold",
|
||||
+ float,
|
||||
+ ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD,
|
||||
+ gt=0,
|
||||
+ )
|
||||
|
||||
def __read__slow_io_detect_frequency(self, items_common: dict):
|
||||
- try:
|
||||
- self.__slow_io_detect_frequency = int(items_common.get('slow_io_detect_frequency',
|
||||
- ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY))
|
||||
- if self.__slow_io_detect_frequency < 1 or self.__slow_io_detect_frequency > 10:
|
||||
- logging.warning(
|
||||
- f'the slow_io_detect_frequency: {self.__slow_io_detect_frequency} you set is invalid, use default value: {ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY}.')
|
||||
- self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY
|
||||
- except ValueError:
|
||||
- self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY
|
||||
- logging.warning(f'slow_io_detect_frequency type conversion has error, use default value: {self.__slow_io_detect_frequency}.')
|
||||
+ self.__slow_io_detect_frequency = self._get_config_value(
|
||||
+ items_common,
|
||||
+ "slow_io_detect_frequency",
|
||||
+ int,
|
||||
+ ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY,
|
||||
+ gt=0,
|
||||
+ le=300,
|
||||
+ )
|
||||
|
||||
def __read__disks_to_detect(self, items_common: dict):
|
||||
- disks_to_detection = items_common.get('disk')
|
||||
+ disks_to_detection = items_common.get("disk")
|
||||
if disks_to_detection is None:
|
||||
- logging.warning(f'config of disk not found, the default value will be used.')
|
||||
+ logging.warning("config of disk not found, the default value will be used.")
|
||||
self.__disks_to_detection = None
|
||||
return
|
||||
- disk_list = disks_to_detection.split(',')
|
||||
- if len(disk_list) == 0 or (len(disk_list) == 1 and disk_list[0] == ''):
|
||||
- logging.warning("you don't specify any disk.")
|
||||
- self.__disks_to_detection = []
|
||||
- return
|
||||
- if len(disk_list) == 1 and disk_list[0] == 'default':
|
||||
+ disks_to_detection = disks_to_detection.strip()
|
||||
+ if not disks_to_detection:
|
||||
+ logging.critical("the value of disk is empty, ai_block_io plug will exit.")
|
||||
+ Report.report_pass(
|
||||
+ "the value of disk is empty, ai_block_io plug will exit."
|
||||
+ )
|
||||
+ exit(1)
|
||||
+ disk_list = disks_to_detection.split(",")
|
||||
+ if len(disk_list) == 1 and disk_list[0] == "default":
|
||||
self.__disks_to_detection = None
|
||||
return
|
||||
self.__disks_to_detection = disk_list
|
||||
|
||||
def __read__train_data_duration(self, items_algorithm: dict):
|
||||
- try:
|
||||
- self.__train_data_duration = float(items_algorithm.get('train_data_duration',
|
||||
- ConfigParser.DEFAULT_TRAIN_DATA_DURATION))
|
||||
- if self.__train_data_duration <= 0 or self.__train_data_duration > 720:
|
||||
- logging.warning(
|
||||
- f'the train_data_duration: {self.__train_data_duration} you set is invalid, use default value: {ConfigParser.DEFAULT_TRAIN_DATA_DURATION}.')
|
||||
- self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION
|
||||
- except ValueError:
|
||||
- self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION
|
||||
- logging.warning(f'the train_data_duration type conversion has error, use default value: {self.__train_data_duration}.')
|
||||
+ self.__train_data_duration = self._get_config_value(
|
||||
+ items_algorithm,
|
||||
+ "train_data_duration",
|
||||
+ float,
|
||||
+ ConfigParser.DEFAULT_TRAIN_DATA_DURATION,
|
||||
+ gt=0,
|
||||
+ le=720,
|
||||
+ )
|
||||
|
||||
def __read__train_update_duration(self, items_algorithm: dict):
|
||||
default_train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION
|
||||
if default_train_update_duration > self.__train_data_duration:
|
||||
default_train_update_duration = self.__train_data_duration / 2
|
||||
-
|
||||
- try:
|
||||
- self.__train_update_duration = float(items_algorithm.get('train_update_duration',
|
||||
- ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION))
|
||||
- if self.__train_update_duration <= 0 or self.__train_update_duration > self.__train_data_duration:
|
||||
- logging.warning(
|
||||
- f'the train_update_duration: {self.__train_update_duration} you set is invalid, use default value: {default_train_update_duration}.')
|
||||
- self.__train_update_duration = default_train_update_duration
|
||||
- except ValueError:
|
||||
- self.__train_update_duration = default_train_update_duration
|
||||
- logging.warning(f'the train_update_duration type conversion has error, use default value: {self.__train_update_duration}.')
|
||||
+ self.__train_update_duration = self._get_config_value(
|
||||
+ items_algorithm,
|
||||
+ "train_update_duration",
|
||||
+ float,
|
||||
+ default_train_update_duration,
|
||||
+ gt=0,
|
||||
+ le=self.__train_data_duration,
|
||||
+ )
|
||||
|
||||
def __read__algorithm_type_and_parameter(self, items_algorithm: dict):
|
||||
- algorithm_type = items_algorithm.get('algorithm_type', ConfigParser.DEFAULT_ALGORITHM_TYPE)
|
||||
+ algorithm_type = items_algorithm.get(
|
||||
+ "algorithm_type", ConfigParser.DEFAULT_ALGORITHM_TYPE
|
||||
+ )
|
||||
self.__algorithm_type = get_threshold_type_enum(algorithm_type)
|
||||
+ if self.__algorithm_type is None:
|
||||
+ logging.critical(
|
||||
+ "the algorithm_type: %s you set is invalid. ai_block_io plug will exit.",
|
||||
+ algorithm_type,
|
||||
+ )
|
||||
+ Report.report_pass(
|
||||
+ f"the algorithm_type: {algorithm_type} you set is invalid. ai_block_io plug will exit."
|
||||
+ )
|
||||
+ exit(1)
|
||||
|
||||
if self.__algorithm_type == ThresholdType.NSigmaThreshold:
|
||||
- try:
|
||||
- self.__n_sigma_parameter = float(items_algorithm.get('n_sigma_parameter',
|
||||
- ConfigParser.DEFAULT_N_SIGMA_PARAMETER))
|
||||
- if self.__n_sigma_parameter <= 0 or self.__n_sigma_parameter > 10:
|
||||
- logging.warning(
|
||||
- f'the n_sigma_parameter: {self.__n_sigma_parameter} you set is invalid, use default value: {ConfigParser.DEFAULT_N_SIGMA_PARAMETER}.')
|
||||
- self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER
|
||||
- except ValueError:
|
||||
- self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER
|
||||
- logging.warning(f'the n_sigma_parameter type conversion has error, use default value: {self.__n_sigma_parameter}.')
|
||||
+ self.__n_sigma_parameter = self._get_config_value(
|
||||
+ items_algorithm,
|
||||
+ "n_sigma_parameter",
|
||||
+ float,
|
||||
+ ConfigParser.DEFAULT_N_SIGMA_PARAMETER,
|
||||
+ gt=0,
|
||||
+ le=10,
|
||||
+ )
|
||||
elif self.__algorithm_type == ThresholdType.BoxplotThreshold:
|
||||
- try:
|
||||
- self.__boxplot_parameter = float(items_algorithm.get('boxplot_parameter',
|
||||
- ConfigParser.DEFAULT_BOXPLOT_PARAMETER))
|
||||
- if self.__boxplot_parameter <= 0 or self.__boxplot_parameter > 10:
|
||||
- logging.warning(
|
||||
- f'the boxplot_parameter: {self.__boxplot_parameter} you set is invalid, use default value: {ConfigParser.DEFAULT_BOXPLOT_PARAMETER}.')
|
||||
- self.__n_sigma_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER
|
||||
- except ValueError:
|
||||
- self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER
|
||||
- logging.warning(f'the boxplot_parameter type conversion has error, use default value: {self.__boxplot_parameter}.')
|
||||
+ self.__boxplot_parameter = self._get_config_value(
|
||||
+ items_algorithm,
|
||||
+ "boxplot_parameter",
|
||||
+ float,
|
||||
+ ConfigParser.DEFAULT_BOXPLOT_PARAMETER,
|
||||
+ gt=0,
|
||||
+ le=10,
|
||||
+ )
|
||||
+
|
||||
+ def __read__stage(self, items_algorithm: dict):
|
||||
+ stage_str = items_algorithm.get('stage', ConfigParser.DEFAULT_STAGE)
|
||||
+ stage_list = stage_str.split(',')
|
||||
+ if len(stage_list) == 1 and stage_list[0] == '':
|
||||
+ logging.critical('stage value not allow is empty, exiting...')
|
||||
+ exit(1)
|
||||
+ if len(stage_list) == 1 and stage_list[0] == 'default':
|
||||
+ logging.warning(f'stage will enable default value: {ConfigParser.DEFAULT_STAGE}')
|
||||
+ self.__stage = ALL_STAGE_LIST
|
||||
+ return
|
||||
+ for stage in stage_list:
|
||||
+ if stage not in ALL_STAGE_LIST:
|
||||
+ logging.critical(f'stage: {stage} is not valid stage, ai_block_io will exit...')
|
||||
+ exit(1)
|
||||
+ dup_stage_list = set(stage_list)
|
||||
+ if 'bio' not in dup_stage_list:
|
||||
+ logging.critical('stage must contains bio stage, exiting...')
|
||||
+ exit(1)
|
||||
+ self.__stage = dup_stage_list
|
||||
+
|
||||
+ def __read__iotype(self, items_algorithm: dict):
|
||||
+ iotype_str = items_algorithm.get('iotype', ConfigParser.DEFAULT_IOTYPE)
|
||||
+ iotype_list = iotype_str.split(',')
|
||||
+ if len(iotype_list) == 1 and iotype_list[0] == '':
|
||||
+ logging.critical('iotype value not allow is empty, exiting...')
|
||||
+ exit(1)
|
||||
+ if len(iotype_list) == 1 and iotype_list[0] == 'default':
|
||||
+ logging.warning(f'iotype will enable default value: {ConfigParser.DEFAULT_IOTYPE}')
|
||||
+ self.__iotype = ALL_IOTPYE_LIST
|
||||
+ return
|
||||
+ for iotype in iotype_list:
|
||||
+ if iotype not in ALL_IOTPYE_LIST:
|
||||
+ logging.critical(f'iotype: {iotype} is not valid iotype, ai_block_io will exit...')
|
||||
+ exit(1)
|
||||
+ dup_iotype_list = set(iotype_list)
|
||||
+ self.__iotype = dup_iotype_list
|
||||
|
||||
def __read__window_size(self, items_sliding_window: dict):
|
||||
- try:
|
||||
- self.__window_size = int(items_sliding_window.get('window_size',
|
||||
- ConfigParser.DEFAULT_WINDOW_SIZE))
|
||||
- if self.__window_size < 1 or self.__window_size > 3600:
|
||||
- logging.warning(
|
||||
- f'the window_size: {self.__window_size} you set is invalid, use default value: {ConfigParser.DEFAULT_WINDOW_SIZE}.')
|
||||
- self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE
|
||||
- except ValueError:
|
||||
- self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE
|
||||
- logging.warning(f'window_size type conversion has error, use default value: {self.__window_size}.')
|
||||
+ self.__window_size = self._get_config_value(
|
||||
+ items_sliding_window,
|
||||
+ "window_size",
|
||||
+ int,
|
||||
+ ConfigParser.DEFAULT_WINDOW_SIZE,
|
||||
+ gt=0,
|
||||
+ le=3600,
|
||||
+ )
|
||||
|
||||
def __read__window_minimum_threshold(self, items_sliding_window: dict):
|
||||
default_window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD
|
||||
if default_window_minimum_threshold > self.__window_size:
|
||||
default_window_minimum_threshold = self.__window_size / 2
|
||||
- try:
|
||||
- self.__window_minimum_threshold = (
|
||||
- int(items_sliding_window.get('window_minimum_threshold',
|
||||
- ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD)))
|
||||
- if self.__window_minimum_threshold < 1 or self.__window_minimum_threshold > self.__window_size:
|
||||
- logging.warning(
|
||||
- f'the window_minimum_threshold: {self.__window_minimum_threshold} you set is invalid, use default value: {default_window_minimum_threshold}.')
|
||||
- self.__window_minimum_threshold = default_window_minimum_threshold
|
||||
- except ValueError:
|
||||
- self.__window_minimum_threshold = default_window_minimum_threshold
|
||||
- logging.warning(f'window_minimum_threshold type conversion has error, use default value: {self.__window_minimum_threshold}.')
|
||||
+ self.__window_minimum_threshold = self._get_config_value(
|
||||
+ items_sliding_window,
|
||||
+ "window_minimum_threshold",
|
||||
+ int,
|
||||
+ default_window_minimum_threshold,
|
||||
+ gt=0,
|
||||
+ le=self.__window_size,
|
||||
+ )
|
||||
|
||||
def read_config_from_file(self):
|
||||
+ if not os.path.exists(self.__config_file_name):
|
||||
+ init_log_format(self.__log_level)
|
||||
+ logging.critical(
|
||||
+ "config file %s not found, ai_block_io plug will exit.",
|
||||
+ self.__config_file_name,
|
||||
+ )
|
||||
+ Report.report_pass(
|
||||
+ f"config file {self.__config_file_name} not found, ai_block_io plug will exit."
|
||||
+ )
|
||||
+ exit(1)
|
||||
+
|
||||
con = configparser.ConfigParser()
|
||||
try:
|
||||
- con.read(self.__config_file_name, encoding='utf-8')
|
||||
+ con.read(self.__config_file_name, encoding="utf-8")
|
||||
except configparser.Error as e:
|
||||
init_log_format(self.__log_level)
|
||||
- logging.critical(f'config file read error: {e}, ai_block_io plug will exit.')
|
||||
+ logging.critical(
|
||||
+ f"config file read error: %s, ai_block_io plug will exit.", e
|
||||
+ )
|
||||
+ Report.report_pass(
|
||||
+ f"config file read error: {e}, ai_block_io plug will exit."
|
||||
+ )
|
||||
exit(1)
|
||||
|
||||
- if con.has_section('common'):
|
||||
- items_common = dict(con.items('common'))
|
||||
- self.__log_level = items_common.get('log_level', ConfigParser.DEFAULT_LOG_LEVEL)
|
||||
+ if con.has_section('log'):
|
||||
+ items_log = dict(con.items('log'))
|
||||
+ # 情况一:没有log,则使用默认值
|
||||
+ # 情况二:有log,值为空或异常,使用默认值
|
||||
+ # 情况三:有log,值正常,则使用该值
|
||||
+ self.__log_level = items_log.get('level', ConfigParser.DEFAULT_LOG_LEVEL)
|
||||
init_log_format(self.__log_level)
|
||||
+ else:
|
||||
+ init_log_format(self.__log_level)
|
||||
+ logging.warning(f"log section parameter not found, it will be set to default value.")
|
||||
+
|
||||
+ if con.has_section("common"):
|
||||
+ items_common = dict(con.items("common"))
|
||||
self.__read_absolute_threshold(items_common)
|
||||
self.__read__slow_io_detect_frequency(items_common)
|
||||
self.__read__disks_to_detect(items_common)
|
||||
+ self.__read__stage(items_common)
|
||||
+ self.__read__iotype(items_common)
|
||||
else:
|
||||
- init_log_format(self.__log_level)
|
||||
- logging.warning("common section parameter not found, it will be set to default value.")
|
||||
+ logging.warning(
|
||||
+ "common section parameter not found, it will be set to default value."
|
||||
+ )
|
||||
|
||||
- if con.has_section('algorithm'):
|
||||
- items_algorithm = dict(con.items('algorithm'))
|
||||
+ if con.has_section("algorithm"):
|
||||
+ items_algorithm = dict(con.items("algorithm"))
|
||||
self.__read__train_data_duration(items_algorithm)
|
||||
self.__read__train_update_duration(items_algorithm)
|
||||
self.__read__algorithm_type_and_parameter(items_algorithm)
|
||||
else:
|
||||
- logging.warning("algorithm section parameter not found, it will be set to default value.")
|
||||
-
|
||||
- if con.has_section('sliding_window'):
|
||||
- items_sliding_window = dict(con.items('sliding_window'))
|
||||
- sliding_window_type = items_sliding_window.get('sliding_window_type',
|
||||
- ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE)
|
||||
- self.__sliding_window_type = get_sliding_window_type_enum(sliding_window_type)
|
||||
+ logging.warning(
|
||||
+ "algorithm section parameter not found, it will be set to default value."
|
||||
+ )
|
||||
+
|
||||
+ if con.has_section("sliding_window"):
|
||||
+ items_sliding_window = dict(con.items("sliding_window"))
|
||||
+ sliding_window_type = items_sliding_window.get(
|
||||
+ "sliding_window_type", ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE
|
||||
+ )
|
||||
+ self.__sliding_window_type = get_sliding_window_type_enum(
|
||||
+ sliding_window_type
|
||||
+ )
|
||||
self.__read__window_size(items_sliding_window)
|
||||
self.__read__window_minimum_threshold(items_sliding_window)
|
||||
else:
|
||||
- logging.warning("sliding_window section parameter not found, it will be set to default value.")
|
||||
+ logging.warning(
|
||||
+ "sliding_window section parameter not found, it will be set to default value."
|
||||
+ )
|
||||
|
||||
self.__print_all_config_value()
|
||||
|
||||
+ def __repr__(self):
|
||||
+ config_str = {
|
||||
+ 'log.level': self.__log_level,
|
||||
+ 'common.absolute_threshold': self.__absolute_threshold,
|
||||
+ 'common.slow_io_detect_frequency': self.__slow_io_detect_frequency,
|
||||
+ 'common.disk': self.__disks_to_detection,
|
||||
+ 'common.stage': self.__stage,
|
||||
+ 'common.iotype': self.__iotype,
|
||||
+ 'algorithm.train_data_duration': self.__train_data_duration,
|
||||
+ 'algorithm.train_update_duration': self.__train_update_duration,
|
||||
+ 'algorithm.algorithm_type': self.__algorithm_type,
|
||||
+ 'algorithm.boxplot_parameter': self.__boxplot_parameter,
|
||||
+ 'algorithm.n_sigma_parameter': self.__n_sigma_parameter,
|
||||
+ 'sliding_window.sliding_window_type': self.__sliding_window_type,
|
||||
+ 'sliding_window.window_size': self.__window_size,
|
||||
+ 'sliding_window.window_minimum_threshold': self.__window_minimum_threshold
|
||||
+ }
|
||||
+ return str(config_str)
|
||||
+
|
||||
def __print_all_config_value(self):
|
||||
- pass
|
||||
+ logging.info(f"all config is follow:\n {self}")
|
||||
+
|
||||
+ def get_train_data_duration_and_train_update_duration(self):
|
||||
+ return self.__train_data_duration, self.__train_update_duration
|
||||
|
||||
- def get_slow_io_detect_frequency(self):
|
||||
+ def get_window_size_and_window_minimum_threshold(self):
|
||||
+ return self.__window_size, self.__window_minimum_threshold
|
||||
+
|
||||
+ @property
|
||||
+ def slow_io_detect_frequency(self):
|
||||
return self.__slow_io_detect_frequency
|
||||
|
||||
- def get_algorithm_type(self):
|
||||
+ @property
|
||||
+ def algorithm_type(self):
|
||||
return self.__algorithm_type
|
||||
|
||||
- def get_sliding_window_type(self):
|
||||
+ @property
|
||||
+ def sliding_window_type(self):
|
||||
return self.__sliding_window_type
|
||||
|
||||
- def get_train_data_duration_and_train_update_duration(self):
|
||||
- return self.__train_data_duration, self.__train_update_duration
|
||||
+ @property
|
||||
+ def train_data_duration(self):
|
||||
+ return self.__train_data_duration
|
||||
|
||||
- def get_window_size_and_window_minimum_threshold(self):
|
||||
- return self.__window_size, self.__window_minimum_threshold
|
||||
+ @property
|
||||
+ def train_update_duration(self):
|
||||
+ return self.__train_update_duration
|
||||
+
|
||||
+ @property
|
||||
+ def window_size(self):
|
||||
+ return self.__window_size
|
||||
|
||||
- def get_absolute_threshold(self):
|
||||
+ @property
|
||||
+ def window_minimum_threshold(self):
|
||||
+ return self.__window_minimum_threshold
|
||||
+
|
||||
+ @property
|
||||
+ def absolute_threshold(self):
|
||||
return self.__absolute_threshold
|
||||
|
||||
- def get_log_level(self):
|
||||
+ @property
|
||||
+ def log_level(self):
|
||||
return self.__log_level
|
||||
|
||||
- def get_disks_to_detection(self):
|
||||
+ @property
|
||||
+ def disks_to_detection(self):
|
||||
return self.__disks_to_detection
|
||||
|
||||
- def get_boxplot_parameter(self):
|
||||
+ @property
|
||||
+ def stage(self):
|
||||
+ return self.__stage
|
||||
+
|
||||
+ @property
|
||||
+ def iotype(self):
|
||||
+ return self.__iotype
|
||||
+
|
||||
+ @property
|
||||
+ def boxplot_parameter(self):
|
||||
return self.__boxplot_parameter
|
||||
|
||||
- def get_n_sigma_parameter(self):
|
||||
+ @property
|
||||
+ def n_sigma_parameter(self):
|
||||
return self.__n_sigma_parameter
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py
|
||||
index c7679cd..ed997e6 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/data_access.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/data_access.py
|
||||
@@ -41,11 +41,14 @@ def check_collect_valid(period):
|
||||
try:
|
||||
data = json.loads(data_raw["message"])
|
||||
except Exception as e:
|
||||
- logging.warning(f"get io data failed, {e}")
|
||||
+ logging.warning(f"get valid devices failed, occur exception: {e}")
|
||||
+ return None
|
||||
+ if not data:
|
||||
+ logging.warning(f"get valid devices failed, return {data_raw}")
|
||||
return None
|
||||
return [k for k in data.keys()]
|
||||
else:
|
||||
- logging.warning(f"get io data failed, return {data_raw}")
|
||||
+ logging.warning(f"get valid devices failed, return {data_raw}")
|
||||
return None
|
||||
|
||||
|
||||
@@ -60,7 +63,7 @@ def _get_raw_data(period, disk_list):
|
||||
|
||||
def _get_io_stage_data(data):
|
||||
io_stage_data = IOStageData()
|
||||
- for data_type in ('read', 'write', 'flush', 'discard'):
|
||||
+ for data_type in ("read", "write", "flush", "discard"):
|
||||
if data_type in data:
|
||||
getattr(io_stage_data, data_type).latency = data[data_type][0]
|
||||
getattr(io_stage_data, data_type).io_dump = data[data_type][1]
|
||||
@@ -87,7 +90,7 @@ def get_io_data_from_collect_plug(period, disk_list):
|
||||
getattr(disk_ret, k)
|
||||
setattr(disk_ret, k, _get_io_stage_data(v))
|
||||
except AttributeError:
|
||||
- logging.debug(f'no attr {k}')
|
||||
+ logging.debug(f"no attr {k}")
|
||||
continue
|
||||
ret[disk] = disk_ret
|
||||
return ret
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
index 0ed282b..e710ddd 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
@@ -53,3 +53,28 @@ class Detector:
|
||||
f' io_type_name: {self._metric_name.get_io_access_type_name()},'
|
||||
f' metric_name: {self._metric_name.get_metric_name()}, threshold_type: {self._threshold},'
|
||||
f' sliding_window_type: {self._slidingWindow}')
|
||||
+
|
||||
+
|
||||
+class DiskDetector:
|
||||
+
|
||||
+ def __init__(self, disk_name: str):
|
||||
+ self._disk_name = disk_name
|
||||
+ self._detector_list = []
|
||||
+
|
||||
+ def add_detector(self, detector: Detector):
|
||||
+ self._detector_list.append(detector)
|
||||
+
|
||||
+ def is_slow_io_event(self, io_data_dict_with_disk_name: dict):
|
||||
+ # 只有bio阶段发生异常,就认为发生了慢IO事件
|
||||
+ # todo:根因诊断
|
||||
+ for detector in self._detector_list:
|
||||
+ result = detector.is_slow_io_event(io_data_dict_with_disk_name)
|
||||
+ if result[0] and detector.get_metric_name().get_stage_name() == 'bio':
|
||||
+ return result[0], detector.get_metric_name(), result[1], result[2]
|
||||
+ return False, None, None, None
|
||||
+
|
||||
+ def __repr__(self):
|
||||
+ msg = f'disk: {self._disk_name}, '
|
||||
+ for detector in self._detector_list:
|
||||
+ msg += f'\n detector: [{detector}]'
|
||||
+ return msg
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/utils.py b/src/python/sentryPlugins/ai_block_io/utils.py
|
||||
index 8dbba06..0ed37b9 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/utils.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/utils.py
|
||||
@@ -25,8 +25,7 @@ def get_threshold_type_enum(algorithm_type: str):
|
||||
return ThresholdType.BoxplotThreshold
|
||||
if algorithm_type.lower() == 'n_sigma':
|
||||
return ThresholdType.NSigmaThreshold
|
||||
- logging.warning(f"the algorithm type: {algorithm_type} you set is invalid, use default value: boxplot")
|
||||
- return ThresholdType.BoxplotThreshold
|
||||
+ return None
|
||||
|
||||
|
||||
def get_sliding_window_type_enum(sliding_window_type: str):
|
||||
--
|
||||
2.23.0
|
||||
|
||||
@ -1,73 +0,0 @@
|
||||
From 7d5ad8f2dd87432b8f46ea5002400ee46cb6756a Mon Sep 17 00:00:00 2001
|
||||
From: gaoruoshu <gaoruoshu@huawei.com>
|
||||
Date: Wed, 9 Oct 2024 14:22:38 +0800
|
||||
Subject: [PATCH] avg_block_io send alarm to xalarmd
|
||||
|
||||
---
|
||||
config/tasks/avg_block_io.mod | 2 ++
|
||||
.../sentryPlugins/avg_block_io/module_conn.py | 23 +++++++++++++++----
|
||||
2 files changed, 21 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/config/tasks/avg_block_io.mod b/config/tasks/avg_block_io.mod
|
||||
index b9b6f34..bcd063b 100644
|
||||
--- a/config/tasks/avg_block_io.mod
|
||||
+++ b/config/tasks/avg_block_io.mod
|
||||
@@ -3,3 +3,5 @@ enabled=yes
|
||||
task_start=/usr/bin/python3 /usr/bin/avg_block_io
|
||||
task_stop=pkill -f /usr/bin/avg_block_io
|
||||
type=oneshot
|
||||
+alarm_id=1002
|
||||
+alarm_clear_time=5
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||
index 0da4208..2fc5a83 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||
@@ -16,6 +16,7 @@ import time
|
||||
from .utils import is_abnormal
|
||||
from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages
|
||||
from syssentry.result import ResultLevel, report_result
|
||||
+from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR
|
||||
|
||||
|
||||
TASK_NAME = "avg_block_io"
|
||||
@@ -68,19 +69,33 @@ def process_report_data(disk_name, rw, io_data):
|
||||
if not is_abnormal((disk_name, 'bio', rw), io_data):
|
||||
return
|
||||
|
||||
+ msg = {"alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw}
|
||||
+
|
||||
ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq']
|
||||
for stage_name in ctrl_stage:
|
||||
if is_abnormal((disk_name, stage_name, rw), io_data):
|
||||
- logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw))
|
||||
+ msg["reason"] = "IO press slow"
|
||||
+ msg["block_stack"] = f"bio,{stage_name}"
|
||||
+ logging.warning("{} - {} report IO press slow".format(disk_name, rw))
|
||||
+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
|
||||
return
|
||||
|
||||
if is_abnormal((disk_name, 'rq_driver', rw), io_data):
|
||||
- logging.warning("{} - {} - {} report driver".format(time.ctime(), disk_name, rw))
|
||||
+ msg["reason"] = "driver slow"
|
||||
+ msg["block_stack"] = "bio,rq_driver"
|
||||
+ logging.warning("{} - {} report driver slow".format(disk_name, rw))
|
||||
+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
|
||||
return
|
||||
|
||||
kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue']
|
||||
for stage_name in kernel_stage:
|
||||
if is_abnormal((disk_name, stage_name, rw), io_data):
|
||||
- logging.warning("{} - {} - {} report kernel".format(time.ctime(), disk_name, rw))
|
||||
+ msg["reason"] = "kernel slow"
|
||||
+ msg["block_stack"] = f"bio,{stage_name}"
|
||||
+ logging.warning("{} - {} report kernel slow".format(disk_name, rw))
|
||||
+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
|
||||
return
|
||||
- logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw))
|
||||
+ msg["reason"] = "unknown"
|
||||
+ msg["block_stack"] = "bio"
|
||||
+ logging.warning("{} - {} report UNKNOWN slow".format(disk_name, rw))
|
||||
+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -1,34 +0,0 @@
|
||||
From 7d5ad8f2dd87432b8f46ea5002400ee46cb6756a Mon Sep 17 00:00:00 2001
|
||||
From: gaoruoshu <gaoruoshu@huawei.com>
|
||||
Date: Wed, 9 Oct 2024 14:22:38 +0800
|
||||
Subject: [PATCH] bugfix typo
|
||||
|
||||
---
|
||||
src/python/sentryPlugins/avg_block_io/avg_block_io.py | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
index b6b3b28..26a60c5 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
@@ -114,7 +114,7 @@ def read_config_lat_iodump(io_dic, config):
|
||||
common_param = {}
|
||||
lat_sec = None
|
||||
if not config.has_section("latency"):
|
||||
- logging.warning("Cannot find algorithm section in config file")
|
||||
+ logging.warning("Cannot find latency section in config file")
|
||||
else:
|
||||
lat_sec = config["latency"]
|
||||
|
||||
@@ -122,7 +122,7 @@ def read_config_lat_iodump(io_dic, config):
|
||||
if not config.has_section("iodump"):
|
||||
logging.warning("Cannot find iodump section in config file")
|
||||
else:
|
||||
- lat_sec = config["iodump"]
|
||||
+ iodump_sec = config["iodump"]
|
||||
|
||||
if not lat_sec and not iodump_sec:
|
||||
return common_param
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,56 +0,0 @@
|
||||
From 67439c0040b1fb0614ac009bf53062e9ec2880aa Mon Sep 17 00:00:00 2001
|
||||
From: jinsaihang <jinsaihang@h-partners.com>
|
||||
Date: Wed, 9 Oct 2024 11:55:35 +0800
|
||||
Subject: [PATCH 1/2] change alarm length
|
||||
|
||||
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
|
||||
---
|
||||
src/python/syssentry/sentryctl | 3 +++
|
||||
src/python/syssentry/syssentry.py | 3 +++
|
||||
2 files changed, 6 insertions(+)
|
||||
|
||||
diff --git a/src/python/syssentry/sentryctl b/src/python/syssentry/sentryctl
|
||||
index 675c17a..3de93d0 100644
|
||||
--- a/src/python/syssentry/sentryctl
|
||||
+++ b/src/python/syssentry/sentryctl
|
||||
@@ -25,6 +25,7 @@ MAX_PARAM_LENGTH = 256
|
||||
|
||||
RESULT_MSG_DATA_LEN = 4
|
||||
CTL_MSG_LEN_LEN = 3
|
||||
+ALARM_MSG_DATA_LEN = 6
|
||||
DEFAULT_ALARM_TIME_RANGE = 10
|
||||
|
||||
def status_output_format(res_data):
|
||||
@@ -173,6 +174,8 @@ if __name__ == '__main__':
|
||||
request_message = json.dumps(req_msg_struct)
|
||||
if client_args.cmd_type == 'get_result':
|
||||
result_message = client_send_and_recv(request_message, RESULT_MSG_DATA_LEN)
|
||||
+ elif client_args.cmd_type == 'get_alarm':
|
||||
+ result_message = client_send_and_recv(request_message, ALARM_MSG_DATA_LEN)
|
||||
else:
|
||||
result_message = client_send_and_recv(request_message, CTL_MSG_LEN_LEN)
|
||||
if not result_message:
|
||||
diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py
|
||||
index c2dee85..ea09095 100644
|
||||
--- a/src/python/syssentry/syssentry.py
|
||||
+++ b/src/python/syssentry/syssentry.py
|
||||
@@ -56,6 +56,7 @@ CTL_MSG_MAGIC_LEN = 3
|
||||
CTL_MSG_LEN_LEN = 3
|
||||
CTL_MAGIC = "CTL"
|
||||
RES_MAGIC = "RES"
|
||||
+ALARM_MSG_DATA_LEN = 6
|
||||
|
||||
CTL_LISTEN_QUEUE_LEN = 5
|
||||
SERVER_EPOLL_TIMEOUT = 0.3
|
||||
@@ -256,6 +257,8 @@ def server_recv(server_socket: socket.socket):
|
||||
res_head = RES_MAGIC
|
||||
if cmd_type == "get_result":
|
||||
res_data_len = str(len(res_data)).zfill(RESULT_MSG_HEAD_LEN - RESULT_MSG_MAGIC_LEN)
|
||||
+ elif cmd_type == "get_alarm":
|
||||
+ res_data_len = str(len(res_data)).zfill(ALARM_MSG_DATA_LEN)
|
||||
else:
|
||||
res_data_len = str(len(res_data)).zfill(CTL_MSG_MAGIC_LEN)
|
||||
res_head += res_data_len
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,55 +0,0 @@
|
||||
From aaff413d6954003a3c21af21003c3bc134f940e2 Mon Sep 17 00:00:00 2001
|
||||
From: gaoruoshu <gaoruoshu@huawei.com>
|
||||
Date: Tue, 5 Nov 2024 10:31:10 +0800
|
||||
Subject: [PATCH] change avg_block_io config
|
||||
|
||||
---
|
||||
config/plugins/avg_block_io.ini | 8 ++++----
|
||||
.../src/python/sentryPlugins/avg_block_io/config.py | 8 ++++----
|
||||
2 files changed, 8 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini
|
||||
index 5c4b9b0..3b4ee33 100644
|
||||
--- a/config/plugins/avg_block_io.ini
|
||||
+++ b/config/plugins/avg_block_io.ini
|
||||
@@ -12,12 +12,12 @@ win_size=30
|
||||
win_threshold=6
|
||||
|
||||
[latency_nvme_ssd]
|
||||
-read_avg_lim=300
|
||||
-write_avg_lim=300
|
||||
+read_avg_lim=10000
|
||||
+write_avg_lim=10000
|
||||
read_avg_time=3
|
||||
write_avg_time=3
|
||||
-read_tot_lim=500
|
||||
-write_tot_lim=500
|
||||
+read_tot_lim=50000
|
||||
+write_tot_lim=50000
|
||||
|
||||
[latency_sata_ssd]
|
||||
read_avg_lim=10000
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/config.py b/src/python/sentryPlugins/avg_block_io/config.py
|
||||
index c8f45ce..c1e8ab1 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/config.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/config.py
|
||||
@@ -42,12 +42,12 @@ DEFAULT_PARAM = {
|
||||
CONF_ALGO_SIZE: 30,
|
||||
CONF_ALGO_THRE: 6
|
||||
}, 'latency_nvme_ssd': {
|
||||
- 'read_avg_lim': 300,
|
||||
- 'write_avg_lim': 300,
|
||||
+ 'read_avg_lim': 10000,
|
||||
+ 'write_avg_lim': 10000,
|
||||
'read_avg_time': 3,
|
||||
'write_avg_time': 3,
|
||||
- 'read_tot_lim': 500,
|
||||
- 'write_tot_lim': 500,
|
||||
+ 'read_tot_lim': 50000,
|
||||
+ 'write_tot_lim': 50000,
|
||||
}, 'latency_sata_ssd' : {
|
||||
'read_avg_lim': 10000,
|
||||
'write_avg_lim': 10000,
|
||||
--
|
||||
2.39.5 (Apple Git-154)
|
||||
|
||||
@ -1,36 +0,0 @@
|
||||
From 8cc13a422ed29e48b0c5b86b2da2a5dc8ad4aa59 Mon Sep 17 00:00:00 2001
|
||||
From: zhuofeng <zhuofeng6@huawei.com>
|
||||
Date: Fri, 13 Dec 2024 11:20:55 +0800
|
||||
Subject: [PATCH] change status of period task and sort mod file
|
||||
|
||||
---
|
||||
src/python/syssentry/cron_process.py | 1 +
|
||||
src/python/syssentry/load_mods.py | 1 +
|
||||
2 files changed, 2 insertions(+)
|
||||
|
||||
diff --git a/src/python/syssentry/cron_process.py b/src/python/syssentry/cron_process.py
|
||||
index 50780b3..5543d67 100644
|
||||
--- a/src/python/syssentry/cron_process.py
|
||||
+++ b/src/python/syssentry/cron_process.py
|
||||
@@ -144,6 +144,7 @@ def period_tasks_handle():
|
||||
|
||||
if not task.onstart:
|
||||
logging.debug("period onstart not enabled, task: %s", task.name)
|
||||
+ task.runtime_status = EXITED_STATUS
|
||||
continue
|
||||
|
||||
if task.runtime_status == WAITING_STATUS and \
|
||||
diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py
|
||||
index 48d7e66..5be5540 100644
|
||||
--- a/src/python/syssentry/load_mods.py
|
||||
+++ b/src/python/syssentry/load_mods.py
|
||||
@@ -224,6 +224,7 @@ def load_tasks():
|
||||
return "failed", ""
|
||||
|
||||
mod_files = os.listdir(TASKS_STORAGE_PATH)
|
||||
+ mod_files.sort()
|
||||
for mod_file in mod_files:
|
||||
logging.debug("find mod, path is %s", mod_file)
|
||||
if not mod_file.endswith(MOD_FILE_SUFFIX):
|
||||
--
|
||||
2.33.0
|
||||
@ -1,41 +0,0 @@
|
||||
From 6e98b2e5008ffabfda8d1c10778717f972b54398 Mon Sep 17 00:00:00 2001
|
||||
From: jwolf <523083921@qq.com>
|
||||
Date: Mon, 22 Jul 2024 14:58:27 +0800
|
||||
Subject: [PATCH] cpu_utility and cpu_patrol musht be an integer
|
||||
|
||||
---
|
||||
src/c/catcli/catlib/cli_param_checker.c | 5 +++--
|
||||
1 file changed, 3 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/c/catcli/catlib/cli_param_checker.c b/src/c/catcli/catlib/cli_param_checker.c
|
||||
index a1aa636..e400428 100644
|
||||
--- a/src/c/catcli/catlib/cli_param_checker.c
|
||||
+++ b/src/c/catcli/catlib/cli_param_checker.c
|
||||
@@ -2,6 +2,7 @@
|
||||
#include <sys/un.h>
|
||||
#include <regex.h>
|
||||
#include <stdbool.h>
|
||||
+#include <string.h>
|
||||
#include <limits.h>
|
||||
#include <unistd.h>
|
||||
#include "cli_common.h"
|
||||
@@ -13,7 +14,7 @@
|
||||
void checkset_cpu_usage_percentage(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs)
|
||||
{
|
||||
long cpu_utility = strtol(getopt_optarg, NULL, DECIMAL);
|
||||
- if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX) {
|
||||
+ if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX || strchr(getopt_optarg, '.') != NULL) {
|
||||
strncpy(errs->patrol_module_err,
|
||||
"\"cpu_utility \" must be an integer greater in the range (0,100],correct \"-u, --cpu_utility\"\n", MAX_ERR_LEN);
|
||||
}
|
||||
@@ -68,7 +69,7 @@ void checkset_cpulist(char *getopt_optarg, catcli_request_body *p_request_body,
|
||||
void checkset_patrol_time(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs)
|
||||
{
|
||||
long second = strtol(getopt_optarg, NULL, DECIMAL);
|
||||
- if (second <= 0 || second > INT_MAX) {
|
||||
+ if (second <= 0 || second > INT_MAX || strchr(getopt_optarg, '.') != NULL) {
|
||||
strncpy(errs->patrol_time_err,
|
||||
"\"patrol_second\" must be a number in the range of (0,INT_MAX] ,correct \"-t, --patrol_second\"\n",
|
||||
MAX_ERR_LEN);
|
||||
--
|
||||
Gitee
|
||||
@ -1,430 +0,0 @@
|
||||
From e7c1b0095e16369fb09ae62ffa3158be5e8893a1 Mon Sep 17 00:00:00 2001
|
||||
From: gaoruoshu <gaoruoshu@huawei.com>
|
||||
Date: Fri, 11 Oct 2024 10:48:35 +0800
|
||||
Subject: [PATCH] diff disk type use diff config
|
||||
|
||||
---
|
||||
config/plugins/avg_block_io.ini | 26 +++-
|
||||
src/python/sentryCollector/collect_plugin.py | 6 +
|
||||
.../avg_block_io/avg_block_io.py | 144 ++++++++----------
|
||||
.../sentryPlugins/avg_block_io/module_conn.py | 19 ++-
|
||||
.../sentryPlugins/avg_block_io/utils.py | 43 ++++++
|
||||
5 files changed, 146 insertions(+), 92 deletions(-)
|
||||
|
||||
diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini
|
||||
index 858db18..5c4b9b0 100644
|
||||
--- a/config/plugins/avg_block_io.ini
|
||||
+++ b/config/plugins/avg_block_io.ini
|
||||
@@ -11,13 +11,29 @@ period_time=1
|
||||
win_size=30
|
||||
win_threshold=6
|
||||
|
||||
-[latency]
|
||||
-read_avg_lim=10
|
||||
-write_avg_lim=10
|
||||
+[latency_nvme_ssd]
|
||||
+read_avg_lim=300
|
||||
+write_avg_lim=300
|
||||
read_avg_time=3
|
||||
write_avg_time=3
|
||||
-read_tot_lim=50
|
||||
-write_tot_lim=50
|
||||
+read_tot_lim=500
|
||||
+write_tot_lim=500
|
||||
+
|
||||
+[latency_sata_ssd]
|
||||
+read_avg_lim=10000
|
||||
+write_avg_lim=10000
|
||||
+read_avg_time=3
|
||||
+write_avg_time=3
|
||||
+read_tot_lim=50000
|
||||
+write_tot_lim=50000
|
||||
+
|
||||
+[latency_sata_hdd]
|
||||
+read_avg_lim=15000
|
||||
+write_avg_lim=15000
|
||||
+read_avg_time=3
|
||||
+write_avg_time=3
|
||||
+read_tot_lim=50000
|
||||
+write_tot_lim=50000
|
||||
|
||||
[iodump]
|
||||
read_iodump_lim=0
|
||||
diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py
|
||||
index 31bf11b..bec405a 100644
|
||||
--- a/src/python/sentryCollector/collect_plugin.py
|
||||
+++ b/src/python/sentryCollector/collect_plugin.py
|
||||
@@ -79,6 +79,12 @@ class DiskType():
|
||||
TYPE_SATA_SSD = 1
|
||||
TYPE_SATA_HDD = 2
|
||||
|
||||
+Disk_Type = {
|
||||
+ DiskType.TYPE_NVME_SSD: "nvme_ssd",
|
||||
+ DiskType.TYPE_SATA_SSD: "sata_ssd",
|
||||
+ DiskType.TYPE_SATA_HDD: "sata_hdd"
|
||||
+}
|
||||
+
|
||||
def client_send_and_recv(request_data, data_str_len, protocol):
|
||||
"""client socket send and recv message"""
|
||||
try:
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
index cf2ded3..fdad995 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
@@ -14,8 +14,9 @@ import configparser
|
||||
import time
|
||||
|
||||
from .stage_window import IoWindow, IoDumpWindow
|
||||
-from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler
|
||||
-from .utils import update_avg_and_check_abnormal, get_log_level
|
||||
+from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name
|
||||
+from .utils import update_avg_and_check_abnormal, get_log_level, get_section_value
|
||||
+from sentryCollector.collect_plugin import Disk_Type
|
||||
|
||||
CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini"
|
||||
|
||||
@@ -37,44 +38,40 @@ def read_config_common(config):
|
||||
disk = [] if disk_name == "default" else disk_name.split(",")
|
||||
except configparser.NoOptionError:
|
||||
disk = []
|
||||
- logging.warning("Unset disk, set to default")
|
||||
+ logging.warning("Unset common.disk, set to default")
|
||||
|
||||
try:
|
||||
stage_name = config.get("common", "stage")
|
||||
stage = [] if stage_name == "default" else stage_name.split(",")
|
||||
except configparser.NoOptionError:
|
||||
stage = []
|
||||
- logging.warning("Unset stage, set to read,write")
|
||||
+ logging.warning("Unset common.stage, set to default")
|
||||
|
||||
if len(disk) > 10:
|
||||
- logging.warning("Too many disks, record only max 10 disks")
|
||||
+ logging.warning("Too many common.disks, record only max 10 disks")
|
||||
disk = disk[:10]
|
||||
|
||||
try:
|
||||
iotype_name = config.get("common", "iotype").split(",")
|
||||
- iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write', 'flush', 'discard']]
|
||||
- err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write', 'flush', 'discard']]
|
||||
+ iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write']]
|
||||
+ err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write']]
|
||||
|
||||
- if iotype_list in [None, []]:
|
||||
- iotype_list = ["read", "write"]
|
||||
- except configparser.NoOptionError:
|
||||
- iotype = ["read", "write"]
|
||||
- logging.warning("Unset iotype, set to default")
|
||||
+ if err_iotype:
|
||||
+ report_alarm_fail("Invalid common.iotype config")
|
||||
|
||||
- if err_iotype:
|
||||
- logging.warning("{} in common.iotype are not valid, set iotype={}".format(err_iotype, iotype_list))
|
||||
-
|
||||
+ except configparser.NoOptionError:
|
||||
+ iotype_list = ["read", "write"]
|
||||
+ logging.warning("Unset common.iotype, set to read,write")
|
||||
|
||||
try:
|
||||
period_time = int(config.get("common", "period_time"))
|
||||
if not (1 <= period_time <= 300):
|
||||
raise ValueError("Invalid period_time")
|
||||
except ValueError:
|
||||
- period_time = 1
|
||||
- logging.warning("Invalid period_time, set to 1s")
|
||||
+ report_alarm_fail("Invalid common.period_time")
|
||||
except configparser.NoOptionError:
|
||||
period_time = 1
|
||||
- logging.warning("Unset period_time, use 1s as default")
|
||||
+ logging.warning("Unset common.period_time, use 1s as default")
|
||||
|
||||
return period_time, disk, stage, iotype_list
|
||||
|
||||
@@ -87,76 +84,56 @@ def read_config_algorithm(config):
|
||||
try:
|
||||
win_size = int(config.get("algorithm", "win_size"))
|
||||
if not (1 <= win_size <= 300):
|
||||
- raise ValueError("Invalid win_size")
|
||||
+ raise ValueError("Invalid algorithm.win_size")
|
||||
except ValueError:
|
||||
- win_size = 30
|
||||
- logging.warning("Invalid win_size, set to 30")
|
||||
+ report_alarm_fail("Invalid algorithm.win_size config")
|
||||
except configparser.NoOptionError:
|
||||
win_size = 30
|
||||
- logging.warning("Unset win_size, use 30 as default")
|
||||
+ logging.warning("Unset algorithm.win_size, use 30 as default")
|
||||
|
||||
try:
|
||||
win_threshold = int(config.get("algorithm", "win_threshold"))
|
||||
if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size:
|
||||
- raise ValueError("Invalid win_threshold")
|
||||
+ raise ValueError("Invalid algorithm.win_threshold")
|
||||
except ValueError:
|
||||
- win_threshold = 6
|
||||
- logging.warning("Invalid win_threshold, set to 6")
|
||||
+ report_alarm_fail("Invalid algorithm.win_threshold config")
|
||||
except configparser.NoOptionError:
|
||||
win_threshold = 6
|
||||
- logging.warning("Unset win_threshold, use 6 as default")
|
||||
+ logging.warning("Unset algorithm.win_threshold, use 6 as default")
|
||||
|
||||
return win_size, win_threshold
|
||||
|
||||
|
||||
-def read_config_lat_iodump(io_dic, config):
|
||||
- """read config file, get [latency] [iodump] section value"""
|
||||
+def read_config_latency(config):
|
||||
+ """read config file, get [latency_xxx] section value"""
|
||||
common_param = {}
|
||||
- lat_sec = None
|
||||
- if not config.has_section("latency"):
|
||||
- logging.warning("Cannot find latency section in config file")
|
||||
- else:
|
||||
- lat_sec = config["latency"]
|
||||
-
|
||||
- iodump_sec = None
|
||||
- if not config.has_section("iodump"):
|
||||
- logging.warning("Cannot find iodump section in config file")
|
||||
- else:
|
||||
- iodump_sec = config["iodump"]
|
||||
-
|
||||
- if not lat_sec and not iodump_sec:
|
||||
- return common_param
|
||||
-
|
||||
- for io_type in io_dic["iotype_list"]:
|
||||
- common_param[io_type] = {}
|
||||
-
|
||||
- latency_keys = {
|
||||
- "avg_lim": "{}_avg_lim".format(io_type),
|
||||
- "avg_time": "{}_avg_time".format(io_type),
|
||||
- "tot_lim": "{}_tot_lim".format(io_type),
|
||||
- }
|
||||
- iodump_key = "{}_iodump_lim".format(io_type)
|
||||
+ for type_name in Disk_Type:
|
||||
+ section_name = f"latency_{Disk_Type[type_name]}"
|
||||
+ if not config.has_section(section_name):
|
||||
+ report_alarm_fail(f"Cannot find {section_name} section in config file")
|
||||
|
||||
- if iodump_sec and iodump_key in iodump_sec and iodump_sec[iodump_key].isdecimal():
|
||||
- common_param[io_type][iodump_key] = int(iodump_sec[iodump_key])
|
||||
+ common_param[Disk_Type[type_name]] = get_section_value(section_name, config)
|
||||
+ return common_param
|
||||
|
||||
- if not lat_sec:
|
||||
- continue
|
||||
|
||||
- for key_suffix, key_template in latency_keys.items():
|
||||
- if key_template in lat_sec and lat_sec[key_template].isdecimal():
|
||||
- common_param[io_type][key_template] = int(lat_sec[key_template])
|
||||
+def read_config_iodump(config):
|
||||
+ """read config file, get [iodump] section value"""
|
||||
+ common_param = {}
|
||||
+ section_name = "iodump"
|
||||
+ if not config.has_section(section_name):
|
||||
+ report_alarm_fail(f"Cannot find {section_name} section in config file")
|
||||
|
||||
- return common_param
|
||||
+ return get_section_value(section_name, config)
|
||||
|
||||
|
||||
-def read_config_stage(config, stage, iotype_list):
|
||||
- """read config file, get [STAGE_NAME] section value"""
|
||||
+def read_config_stage(config, stage, iotype_list, curr_disk_type):
|
||||
+ """read config file, get [STAGE_NAME_diskType] section value"""
|
||||
res = {}
|
||||
- if not stage in config:
|
||||
+ section_name = f"{stage}_{curr_disk_type}"
|
||||
+ if not config.has_section(section_name):
|
||||
return res
|
||||
|
||||
- for key in config[stage]:
|
||||
+ for key in config[section_name]:
|
||||
if config[stage][key].isdecimal():
|
||||
res[key] = int(config[stage][key])
|
||||
|
||||
@@ -171,11 +148,12 @@ def init_io_win(io_dic, config, common_param):
|
||||
for disk_name in io_dic["disk_list"]:
|
||||
io_data[disk_name] = {}
|
||||
io_avg_value[disk_name] = {}
|
||||
+ curr_disk_type = get_disk_type_by_name(disk_name)
|
||||
for stage_name in io_dic["stage_list"]:
|
||||
io_data[disk_name][stage_name] = {}
|
||||
io_avg_value[disk_name][stage_name] = {}
|
||||
- # step3. 解析stage配置
|
||||
- curr_stage_param = read_config_stage(config, stage_name, iotype_list)
|
||||
+ # 解析stage配置
|
||||
+ curr_stage_param = read_config_stage(config, stage_name, iotype_list, curr_disk_type)
|
||||
for rw in iotype_list:
|
||||
io_data[disk_name][stage_name][rw] = {}
|
||||
io_avg_value[disk_name][stage_name][rw] = [0, 0]
|
||||
@@ -187,10 +165,10 @@ def init_io_win(io_dic, config, common_param):
|
||||
iodump_lim_key = "{}_iodump_lim".format(rw)
|
||||
|
||||
# 获取值,优先从 curr_stage_param 获取,如果不存在,则从 common_param 获取
|
||||
- avg_lim_value = curr_stage_param.get(avg_lim_key, common_param.get(rw, {}).get(avg_lim_key))
|
||||
- avg_time_value = curr_stage_param.get(avg_time_key, common_param.get(rw, {}).get(avg_time_key))
|
||||
- tot_lim_value = curr_stage_param.get(tot_lim_key, common_param.get(rw, {}).get(tot_lim_key))
|
||||
- iodump_lim_value = curr_stage_param.get(iodump_lim_key, common_param.get(rw, {}).get(iodump_lim_key))
|
||||
+ avg_lim_value = curr_stage_param.get(avg_lim_key, common_param.get(curr_disk_type, {}).get(avg_lim_key))
|
||||
+ avg_time_value = curr_stage_param.get(avg_time_key, common_param.get(curr_disk_type, {}).get(avg_time_key))
|
||||
+ tot_lim_value = curr_stage_param.get(tot_lim_key, common_param.get(curr_disk_type, {}).get(tot_lim_key))
|
||||
+ iodump_lim_value = curr_stage_param.get(iodump_lim_key, common_param.get("iodump", {}).get(iodump_lim_key))
|
||||
|
||||
if avg_lim_value and avg_time_value and tot_lim_value:
|
||||
io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value)
|
||||
@@ -217,28 +195,21 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage):
|
||||
stage_list = [key for key in all_stage_set if key in config_stage]
|
||||
not_in_stage_list = [key for key in config_stage if key not in all_stage_set]
|
||||
|
||||
- if not config_disk:
|
||||
+ if not_in_stage_list:
|
||||
+ report_alarm_fail(f"Invalid common.stage_list config, cannot set {not_in_stage_list}")
|
||||
+
|
||||
+ if not config_disk and not not_in_disk_list:
|
||||
disk_list = [key for key in all_disk_set]
|
||||
|
||||
- if not config_stage:
|
||||
+ if not config_stage and not not_in_stage_list:
|
||||
stage_list = [key for key in all_stage_set]
|
||||
|
||||
disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list
|
||||
- stage_list = stage_list[:15] if len(stage_list) > 15 else stage_list
|
||||
-
|
||||
- if config_disk and not disk_list:
|
||||
- logging.warning("Cannot get valid disk by disk={}, set to default".format(config_disk))
|
||||
- disk_list, stage_list = get_valid_disk_stage_list(io_dic, [], config_stage)
|
||||
-
|
||||
- if config_stage and not stage_list:
|
||||
- logging.warning("Cannot get valid stage by stage={}, set to default".format(config_stage))
|
||||
- disk_list, stage_list = get_valid_disk_stage_list(io_dic, config_disk, [])
|
||||
|
||||
if not stage_list or not disk_list:
|
||||
report_alarm_fail("Cannot get valid disk name or stage name.")
|
||||
|
||||
log_invalid_keys(not_in_disk_list, 'disk', config_disk, disk_list)
|
||||
- log_invalid_keys(not_in_stage_list, 'stage', config_stage, stage_list)
|
||||
|
||||
return disk_list, stage_list
|
||||
|
||||
@@ -310,8 +281,13 @@ def main():
|
||||
# step1. 解析公共配置 --- algorithm
|
||||
io_dic["win_size"], io_dic["win_threshold"] = read_config_algorithm(config)
|
||||
|
||||
- # step2. 循环创建窗口
|
||||
- common_param = read_config_lat_iodump(io_dic, config)
|
||||
+ # step2. 解析公共配置 --- latency_xxx
|
||||
+ common_param = read_config_latency(config)
|
||||
+
|
||||
+ # step3. 解析公共配置 --- iodump
|
||||
+ common_param['iodump'] = read_config_iodump(config)
|
||||
+
|
||||
+ # step4. 循环创建窗口
|
||||
io_data, io_avg_value = init_io_win(io_dic, config, common_param)
|
||||
|
||||
main_loop(io_dic, io_data, io_avg_value)
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||
index 40b3fcc..8d6f429 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||
@@ -14,7 +14,7 @@ import sys
|
||||
import time
|
||||
|
||||
from .utils import is_abnormal, get_win_data, log_slow_win
|
||||
-from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages
|
||||
+from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages, get_disk_type, Disk_Type
|
||||
from syssentry.result import ResultLevel, report_result
|
||||
from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR
|
||||
|
||||
@@ -51,7 +51,7 @@ def check_result_validation(res, reason):
|
||||
try:
|
||||
json_data = json.loads(res['message'])
|
||||
except json.JSONDecodeError:
|
||||
- err_msg = "Failed to {}: invalid return message".format(reason)
|
||||
+ err_msg = f"Failed to {reason}: invalid return message"
|
||||
report_alarm_fail(err_msg)
|
||||
|
||||
return json_data
|
||||
@@ -60,7 +60,7 @@ def check_result_validation(res, reason):
|
||||
def report_alarm_fail(alarm_info):
|
||||
"""report result to xalarmd"""
|
||||
report_result(TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": alarm_info}))
|
||||
- logging.error(alarm_info)
|
||||
+ logging.critical(alarm_info)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@@ -114,3 +114,16 @@ def process_report_data(disk_name, rw, io_data):
|
||||
|
||||
log_slow_win(msg, "unknown")
|
||||
xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
|
||||
+
|
||||
+
|
||||
+def get_disk_type_by_name(disk_name):
|
||||
+ res = get_disk_type(disk_name)
|
||||
+ disk_type_str = check_result_validation(get_disk_type(disk_name), f'Invalid disk type {disk_name}')
|
||||
+ try:
|
||||
+ curr_disk_type = int(disk_type_str)
|
||||
+ if curr_disk_type not in Disk_Type:
|
||||
+ raise ValueError
|
||||
+ except ValueError:
|
||||
+ report_alarm_fail(f"Failed to get disk type for {disk_name}")
|
||||
+
|
||||
+ return Disk_Type[curr_disk_type]
|
||||
\ No newline at end of file
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py
|
||||
index 3b7f027..cef1edd 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/utils.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/utils.py
|
||||
@@ -26,6 +26,49 @@ LogLevel = {
|
||||
}
|
||||
|
||||
|
||||
+DEFAULT_PARAM = {
|
||||
+ 'latency_nvme_ssd': {
|
||||
+ 'read_avg_lim': 300,
|
||||
+ 'write_avg_lim': 300,
|
||||
+ 'read_avg_time': 3,
|
||||
+ 'write_avg_time': 3,
|
||||
+ 'read_tot_lim': 500,
|
||||
+ 'write_tot_lim': 500,
|
||||
+ }, 'latency_sata_ssd' : {
|
||||
+ 'read_avg_lim': 10000,
|
||||
+ 'write_avg_lim': 10000,
|
||||
+ 'read_avg_time': 3,
|
||||
+ 'write_avg_time': 3,
|
||||
+ 'read_tot_lim': 50000,
|
||||
+ 'write_tot_lim': 50000,
|
||||
+ }, 'latency_sata_hdd' : {
|
||||
+ 'read_avg_lim': 15000,
|
||||
+ 'write_avg_lim': 15000,
|
||||
+ 'read_avg_time': 3,
|
||||
+ 'write_avg_time': 3,
|
||||
+ 'read_tot_lim': 50000,
|
||||
+ 'write_tot_lim': 50000
|
||||
+ }, 'iodump': {
|
||||
+ 'read_iodump_lim': 0,
|
||||
+ 'write_iodump_lim': 0
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+
|
||||
+def get_section_value(section_name, config):
|
||||
+ common_param = {}
|
||||
+ config_sec = config[section_name]
|
||||
+ for config_key in DEFAULT_PARAM[section_name]:
|
||||
+ if config_key in config_sec:
|
||||
+ if not config_sec[config_key].isdecimal():
|
||||
+ report_alarm_fail(f"Invalid {section_name}.{config_key} config.")
|
||||
+ common_param[config_key] = int(config_sec[config_key])
|
||||
+ else:
|
||||
+ logging.warning(f"Unset {section_name}.{config_key} in config file, use {DEFAULT_PARAM[section_name][config_key]} as default")
|
||||
+ common_param[config_key] = DEFAULT_PARAM[section_name][config_key]
|
||||
+ return common_param
|
||||
+
|
||||
+
|
||||
def get_log_level(filename):
|
||||
if not os.path.exists(filename):
|
||||
return logging.INFO
|
||||
--
|
||||
2.27.0
|
||||
@ -1,29 +0,0 @@
|
||||
From 41bf507ca6cbbdf5e646a405de6b8d5b9be4bd28 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
|
||||
Date: Wed, 16 Oct 2024 17:20:01 +0800
|
||||
Subject: [PATCH] enrich alert info about kernel stack
|
||||
|
||||
---
|
||||
src/python/sentryPlugins/ai_block_io/detector.py | 6 ++++--
|
||||
1 file changed, 4 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
index ed8b64a..8536f7a 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
@@ -103,8 +103,10 @@ class DiskDetector:
|
||||
elif len(diagnosis_info["rq_driver"]) != 0:
|
||||
root_cause = "[Root Cause: disk slow]"
|
||||
elif len(diagnosis_info["io_stage"]) != 0:
|
||||
- stage = diagnosis_info["io_stage"][0][1].stage_name
|
||||
- root_cause = f"[Root Cause: io stage slow, stage: {stage}]"
|
||||
+ stage_list = []
|
||||
+ for io_stage in diagnosis_info["io_stage"]:
|
||||
+ stage_list.append(io_stage[0].stage_name)
|
||||
+ root_cause = f"[Root Cause: io stage slow, stage: {stage_list}]"
|
||||
if root_cause is None:
|
||||
root_cause = "[Root Cause: high io pressure]"
|
||||
return True, diagnosis_info["bio"][0][0], diagnosis_info["bio"][0][1], root_cause
|
||||
--
|
||||
2.23.0
|
||||
|
||||
@ -1,572 +0,0 @@
|
||||
From acb77d6a69aa9269b0f691613bef53efd0c01e53 Mon Sep 17 00:00:00 2001
|
||||
From: gaoruoshu <gaoruoshu@huawei.com>
|
||||
Date: Thu, 12 Sep 2024 11:31:34 +0800
|
||||
Subject: [PATCH 2/2] add avg_block_io plugin
|
||||
|
||||
---
|
||||
config/plugins/avg_block_io.ini | 21 ++
|
||||
config/tasks/avg_block_io.mod | 5 +
|
||||
src/python/sentryPlugins/__init__.py | 0
|
||||
.../sentryPlugins/avg_block_io/__init__.py | 0
|
||||
.../avg_block_io/avg_block_io.py | 257 ++++++++++++++++++
|
||||
.../sentryPlugins/avg_block_io/module_conn.py | 86 ++++++
|
||||
.../avg_block_io/stage_window.py | 47 ++++
|
||||
.../sentryPlugins/avg_block_io/utils.py | 86 ++++++
|
||||
8 files changed, 502 insertions(+)
|
||||
create mode 100644 config/plugins/avg_block_io.ini
|
||||
create mode 100644 config/tasks/avg_block_io.mod
|
||||
create mode 100644 src/python/sentryPlugins/__init__.py
|
||||
create mode 100644 src/python/sentryPlugins/avg_block_io/__init__.py
|
||||
create mode 100644 src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
create mode 100644 src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||
create mode 100644 src/python/sentryPlugins/avg_block_io/stage_window.py
|
||||
create mode 100644 src/python/sentryPlugins/avg_block_io/utils.py
|
||||
|
||||
diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini
|
||||
new file mode 100644
|
||||
index 0000000..bc33dde
|
||||
--- /dev/null
|
||||
+++ b/config/plugins/avg_block_io.ini
|
||||
@@ -0,0 +1,21 @@
|
||||
+[common]
|
||||
+disk=default
|
||||
+stage=default
|
||||
+iotype=read,write
|
||||
+period_time=1
|
||||
+
|
||||
+[algorithm]
|
||||
+win_size=30
|
||||
+win_threshold=6
|
||||
+
|
||||
+[latency]
|
||||
+read_avg_lim=10
|
||||
+write_avg_lim=10
|
||||
+read_avg_time=3
|
||||
+write_avg_time=3
|
||||
+read_tot_lim=50
|
||||
+write_tot_lim=50
|
||||
+
|
||||
+[iodump]
|
||||
+read_iodump_lim=0
|
||||
+write_iodump_lim=0
|
||||
diff --git a/config/tasks/avg_block_io.mod b/config/tasks/avg_block_io.mod
|
||||
new file mode 100644
|
||||
index 0000000..814c483
|
||||
--- /dev/null
|
||||
+++ b/config/tasks/avg_block_io.mod
|
||||
@@ -0,0 +1,5 @@
|
||||
+[common]
|
||||
+enabled=yes
|
||||
+task_start=/usr/bin/python3 /usr/bin/avg_block_io
|
||||
+task_stop=pkill avg_block_io
|
||||
+type=oneshot
|
||||
\ No newline at end of file
|
||||
diff --git a/src/python/sentryPlugins/__init__.py b/src/python/sentryPlugins/__init__.py
|
||||
new file mode 100644
|
||||
index 0000000..e69de29
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/__init__.py b/src/python/sentryPlugins/avg_block_io/__init__.py
|
||||
new file mode 100644
|
||||
index 0000000..e69de29
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
new file mode 100644
|
||||
index 0000000..ff2071d
|
||||
--- /dev/null
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
@@ -0,0 +1,257 @@
|
||||
+# coding: utf-8
|
||||
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
|
||||
+# sysSentry is licensed under the Mulan PSL v2.
|
||||
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
||||
+# You may obtain a copy of Mulan PSL v2 at:
|
||||
+# http://license.coscl.org.cn/MulanPSL2
|
||||
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
|
||||
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
|
||||
+# PURPOSE.
|
||||
+# See the Mulan PSL v2 for more details.
|
||||
+import logging
|
||||
+import signal
|
||||
+import configparser
|
||||
+import time
|
||||
+
|
||||
+from .stage_window import IoWindow, IoDumpWindow
|
||||
+from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler
|
||||
+from .utils import update_avg_and_check_abnormal
|
||||
+
|
||||
+CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini"
|
||||
+
|
||||
+def log_invalid_keys(not_in_list, keys_name, config_list, default_list):
|
||||
+ """print invalid log"""
|
||||
+ if config_list and default_list:
|
||||
+ logging.warning("{} in common.{} are not valid, set {}={}".format(not_in_list, keys_name, keys_name, default_list))
|
||||
+ elif config_list == ["default"]:
|
||||
+ logging.warning("Default {} use {}".format(keys_name, default_list))
|
||||
+
|
||||
+
|
||||
+def read_config_common(config):
|
||||
+ """read config file, get [common] section value"""
|
||||
+ try:
|
||||
+ common_sec = config['common']
|
||||
+ except configparser.NoSectionError:
|
||||
+ report_alarm_fail("Cannot find common section in config file")
|
||||
+
|
||||
+ try:
|
||||
+ period_time = int(common_sec.get("period_time", 1))
|
||||
+ if not (1 <= period_time <= 300):
|
||||
+ raise ValueError("Invalid period_time")
|
||||
+ except ValueError:
|
||||
+ period_time = 1
|
||||
+ logging.warning("Invalid period_time, set to 1s")
|
||||
+
|
||||
+ disk = common_sec.get('disk').split(",") if common_sec.get('disk') not in [None, 'default'] else []
|
||||
+ stage = common_sec.get('stage').split(",") if common_sec.get('stage') not in [None, 'default'] else []
|
||||
+
|
||||
+ if len(disk) > 10:
|
||||
+ logging.warning("Too many disks, record only max 10 disks")
|
||||
+ disk = disk[:10]
|
||||
+
|
||||
+ iotype = common_sec.get('iotype', 'read,write').split(",")
|
||||
+ iotype_list = [rw.lower() for rw in iotype if rw.lower() in ['read', 'write', 'flush', 'discard']]
|
||||
+ err_iotype = [rw for rw in iotype if rw.lower() not in ['read', 'write', 'flush', 'discard']]
|
||||
+
|
||||
+ if err_iotype:
|
||||
+ logging.warning("{} in common.iotype are not valid, set iotype={}".format(err_iotype, iotype_list))
|
||||
+
|
||||
+ return period_time, disk, stage, iotype_list
|
||||
+
|
||||
+
|
||||
+def read_config_algorithm(config):
|
||||
+ """read config file, get [algorithm] section value"""
|
||||
+ if not config.has_section("algorithm"):
|
||||
+ report_alarm_fail("Cannot find algorithm section in config file")
|
||||
+
|
||||
+ try:
|
||||
+ win_size = int(config.get("algorithm", "win_size"))
|
||||
+ if not (1 <= win_size <= 300):
|
||||
+ raise ValueError("Invalid win_size")
|
||||
+ win_threshold = int(config.get("algorithm", "win_threshold"))
|
||||
+ if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size:
|
||||
+ raise ValueError("Invalid win_threshold")
|
||||
+ except ValueError:
|
||||
+ report_alarm_fail("Invalid win_threshold or win_size")
|
||||
+
|
||||
+ return win_size, win_threshold
|
||||
+
|
||||
+
|
||||
+def read_config_lat_iodump(io_dic, config):
|
||||
+ """read config file, get [latency] [iodump] section value"""
|
||||
+ common_param = {}
|
||||
+ for io_type in io_dic["iotype_list"]:
|
||||
+ common_param[io_type] = {}
|
||||
+
|
||||
+ latency_keys = {
|
||||
+ "avg_lim": "{}_avg_lim".format(io_type),
|
||||
+ "avg_time": "{}_avg_time".format(io_type),
|
||||
+ "tot_lim": "{}_tot_lim".format(io_type),
|
||||
+ }
|
||||
+ iodump_key = "{}_iodump_lim".format(io_type)
|
||||
+
|
||||
+ for key_suffix, key_template in latency_keys.items():
|
||||
+ if key_template in config["latency"] and config["latency"][key_template].isdecimal():
|
||||
+ common_param[io_type][key_template] = int(config["latency"][key_template])
|
||||
+
|
||||
+ if iodump_key in config["iodump"] and config["iodump"][iodump_key].isdecimal():
|
||||
+ common_param[io_type][iodump_key] = int(config["iodump"][iodump_key])
|
||||
+
|
||||
+ return common_param
|
||||
+
|
||||
+
|
||||
+def read_config_stage(config, stage, iotype_list):
|
||||
+ """read config file, get [STAGE_NAME] section value"""
|
||||
+ res = {}
|
||||
+ if not stage in config:
|
||||
+ return res
|
||||
+
|
||||
+ for key in config[stage]:
|
||||
+ if config[stage][key].isdecimal():
|
||||
+ res[key] = int(config[stage][key])
|
||||
+
|
||||
+ return res
|
||||
+
|
||||
+
|
||||
+def init_io_win(io_dic, config, common_param):
|
||||
+ """initialize windows of latency, iodump, and dict of avg_value"""
|
||||
+ iotype_list = io_dic["iotype_list"]
|
||||
+ io_data = {}
|
||||
+ io_avg_value = {}
|
||||
+ for disk_name in io_dic["disk_list"]:
|
||||
+ io_data[disk_name] = {}
|
||||
+ io_avg_value[disk_name] = {}
|
||||
+ for stage_name in io_dic["stage_list"]:
|
||||
+ io_data[disk_name][stage_name] = {}
|
||||
+ io_avg_value[disk_name][stage_name] = {}
|
||||
+ # step3. 解析stage配置
|
||||
+ curr_stage_param = read_config_stage(config, stage_name, iotype_list)
|
||||
+ for rw in iotype_list:
|
||||
+ io_data[disk_name][stage_name][rw] = {}
|
||||
+ io_avg_value[disk_name][stage_name][rw] = [0, 0]
|
||||
+
|
||||
+ # 对每个rw创建latency和iodump窗口
|
||||
+ avg_lim_key = "{}_avg_lim".format(rw)
|
||||
+ avg_time_key = "{}_avg_time".format(rw)
|
||||
+ tot_lim_key = "{}_tot_lim".format(rw)
|
||||
+ iodump_lim_key = "{}_iodump_lim".format(rw)
|
||||
+
|
||||
+ # 获取值,优先从 curr_stage_param 获取,如果不存在,则从 common_param 获取
|
||||
+ avg_lim_value = curr_stage_param.get(avg_lim_key, common_param.get(rw, {}).get(avg_lim_key))
|
||||
+ avg_time_value = curr_stage_param.get(avg_time_key, common_param.get(rw, {}).get(avg_time_key))
|
||||
+ tot_lim_value = curr_stage_param.get(tot_lim_key, common_param.get(rw, {}).get(tot_lim_key))
|
||||
+ iodump_lim_value = curr_stage_param.get(iodump_lim_key, common_param.get(rw, {}).get(iodump_lim_key))
|
||||
+
|
||||
+ if avg_lim_value and avg_time_value and tot_lim_value:
|
||||
+ io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value)
|
||||
+
|
||||
+ if iodump_lim_value is not None:
|
||||
+ io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value)
|
||||
+ return io_data, io_avg_value
|
||||
+
|
||||
+
|
||||
+def get_valid_disk_stage_list(io_dic, config_disk, config_stage):
|
||||
+ """get disk_list and stage_list by sentryCollector"""
|
||||
+ json_data = avg_is_iocollect_valid(io_dic, config_disk, config_stage)
|
||||
+
|
||||
+ all_disk_set = json_data.keys()
|
||||
+ all_stage_set = set()
|
||||
+ for disk_stage_list in json_data.values():
|
||||
+ all_stage_set.update(disk_stage_list)
|
||||
+
|
||||
+ disk_list = [key for key in config_disk if key in all_disk_set]
|
||||
+ not_in_disk_list = [key for key in config_disk if key not in all_disk_set]
|
||||
+
|
||||
+ stage_list = [key for key in config_stage if key in all_stage_set]
|
||||
+ not_in_stage_list = [key for key in config_stage if key not in all_stage_set]
|
||||
+
|
||||
+ if not config_disk:
|
||||
+ disk_list = [key for key in all_disk_set]
|
||||
+
|
||||
+ if not config_stage:
|
||||
+ stage_list = [key for key in all_stage_set]
|
||||
+
|
||||
+ if config_disk and not disk_list:
|
||||
+ logging.warning("Cannot get valid disk by disk={}, set to default".format(config_disk))
|
||||
+ disk_list, stage_list = get_valid_disk_stage_list(io_dic, [], config_stage)
|
||||
+
|
||||
+ if config_stage and not stage_list:
|
||||
+ logging.warning("Cannot get valid stage by stage={}, set to default".format(config_stage))
|
||||
+ disk_list, stage_list = get_valid_disk_stage_list(io_dic, config_disk, [])
|
||||
+
|
||||
+ if not stage_list or not disk_list:
|
||||
+ report_alarm_fail("Cannot get valid disk name or stage name.")
|
||||
+
|
||||
+ log_invalid_keys(not_in_disk_list, 'disk', config_disk, disk_list)
|
||||
+ log_invalid_keys(not_in_stage_list, 'stage', config_stage, stage_list)
|
||||
+
|
||||
+ return disk_list, stage_list
|
||||
+
|
||||
+
|
||||
+def main_loop(io_dic, io_data, io_avg_value):
|
||||
+ """main loop of avg_block_io"""
|
||||
+ period_time = io_dic["period_time"]
|
||||
+ disk_list = io_dic["disk_list"]
|
||||
+ stage_list = io_dic["stage_list"]
|
||||
+ iotype_list = io_dic["iotype_list"]
|
||||
+ win_size = io_dic["win_size"]
|
||||
+ # 开始循环
|
||||
+ while True:
|
||||
+ # 等待x秒
|
||||
+ time.sleep(period_time)
|
||||
+
|
||||
+ # 采集模块对接,获取周期数据
|
||||
+ curr_period_data = avg_get_io_data(io_dic)
|
||||
+
|
||||
+ # 处理周期数据
|
||||
+ reach_size = False
|
||||
+ for disk_name in disk_list:
|
||||
+ for stage_name in stage_list:
|
||||
+ for rw in iotype_list:
|
||||
+ if disk_name in curr_period_data and stage_name in curr_period_data[disk_name] and rw in curr_period_data[disk_name][stage_name]:
|
||||
+ io_key = (disk_name, stage_name, rw)
|
||||
+ reach_size = update_avg_and_check_abnormal(curr_period_data, io_key, win_size, io_avg_value, io_data)
|
||||
+
|
||||
+ # win_size不满时不进行告警判断
|
||||
+ if not reach_size:
|
||||
+ continue
|
||||
+
|
||||
+ # 判断异常窗口、异常场景
|
||||
+ for disk_name in disk_list:
|
||||
+ for rw in iotype_list:
|
||||
+ process_report_data(disk_name, rw, io_data)
|
||||
+
|
||||
+
|
||||
+def main():
|
||||
+ """main func"""
|
||||
+ # 注册停止信号-2/-15
|
||||
+ signal.signal(signal.SIGINT, sig_handler)
|
||||
+ signal.signal(signal.SIGTERM, sig_handler)
|
||||
+
|
||||
+ # 初始化配置读取
|
||||
+ config = configparser.ConfigParser(comment_prefixes=('#', ';'))
|
||||
+ try:
|
||||
+ config.read(CONFIG_FILE)
|
||||
+ except configparser.Error:
|
||||
+ report_alarm_fail("Failed to read config file")
|
||||
+
|
||||
+ io_dic = {}
|
||||
+
|
||||
+ # 读取配置文件 -- common段
|
||||
+ io_dic["period_time"], disk, stage, io_dic["iotype_list"] = read_config_common(config)
|
||||
+
|
||||
+ # 采集模块对接,is_iocollect_valid()
|
||||
+ io_dic["disk_list"], io_dic["stage_list"] = get_valid_disk_stage_list(io_dic, disk, stage)
|
||||
+
|
||||
+ if "bio" not in io_dic["stage_list"]:
|
||||
+ report_alarm_fail("Cannot run avg_block_io without bio stage")
|
||||
+
|
||||
+ # 初始化窗口 -- config读取,对应is_iocollect_valid返回的结果
|
||||
+ # step1. 解析公共配置 --- algorithm
|
||||
+ io_dic["win_size"], io_dic["win_threshold"] = read_config_algorithm(config)
|
||||
+
|
||||
+ # step2. 循环创建窗口
|
||||
+ common_param = read_config_lat_iodump(io_dic, config)
|
||||
+ io_data, io_avg_value = init_io_win(io_dic, config, common_param)
|
||||
+
|
||||
+ main_loop(io_dic, io_data, io_avg_value)
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||
new file mode 100644
|
||||
index 0000000..caa0191
|
||||
--- /dev/null
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||
@@ -0,0 +1,86 @@
|
||||
+# coding: utf-8
|
||||
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
|
||||
+# sysSentry is licensed under the Mulan PSL v2.
|
||||
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
||||
+# You may obtain a copy of Mulan PSL v2 at:
|
||||
+# http://license.coscl.org.cn/MulanPSL2
|
||||
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
|
||||
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
|
||||
+# PURPOSE.
|
||||
+# See the Mulan PSL v2 for more details.
|
||||
+import json
|
||||
+import logging
|
||||
+import sys
|
||||
+import time
|
||||
+
|
||||
+from .utils import is_abnormal
|
||||
+from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages
|
||||
+from syssentry.result import ResultLevel, report_result
|
||||
+
|
||||
+
|
||||
+TASK_NAME = "avg_block_io"
|
||||
+
|
||||
+def sig_handler(signum, _f):
|
||||
+ """stop avg_block_io"""
|
||||
+ report_result(TASK_NAME, ResultLevel.PASS, json.dumps({}))
|
||||
+ logging.info("Finished avg_block_io plugin running.")
|
||||
+ sys.exit(0)
|
||||
+
|
||||
+def avg_get_io_data(io_dic):
|
||||
+ """get_io_data from sentryCollector"""
|
||||
+ res = get_io_data(io_dic["period_time"], io_dic["disk_list"], io_dic["stage_list"], io_dic["iotype_list"])
|
||||
+ return check_result_validation(res, 'get io data')
|
||||
+
|
||||
+
|
||||
+def avg_is_iocollect_valid(io_dic, config_disk, config_stage):
|
||||
+ """is_iocollect_valid from sentryCollector"""
|
||||
+ res = is_iocollect_valid(io_dic["period_time"], config_disk, config_stage)
|
||||
+ return check_result_validation(res, 'check config validation')
|
||||
+
|
||||
+
|
||||
+def check_result_validation(res, reason):
|
||||
+ """check validation of result from sentryCollector"""
|
||||
+ if not 'ret' in res or not 'message' in res:
|
||||
+ err_msg = "Failed to {}: Cannot connect to sentryCollector.".format(reason)
|
||||
+ report_alarm_fail(err_msg)
|
||||
+ if res['ret'] != 0:
|
||||
+ err_msg = "Failed to {}: {}".format(reason, Result_Messages[res['ret']])
|
||||
+ report_alarm_fail(err_msg)
|
||||
+
|
||||
+ try:
|
||||
+ json_data = json.loads(res['message'])
|
||||
+ except json.JSONDecodeError:
|
||||
+ err_msg = "Failed to {}: invalid return message".format(reason)
|
||||
+ report_alarm_fail(err_msg)
|
||||
+
|
||||
+ return json_data
|
||||
+
|
||||
+
|
||||
+def report_alarm_fail(alarm_info):
|
||||
+ """report result to xalarmd"""
|
||||
+ report_result(TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": alarm_info}))
|
||||
+ logging.error(alarm_info)
|
||||
+ sys.exit(1)
|
||||
+
|
||||
+
|
||||
+def process_report_data(disk_name, rw, io_data):
|
||||
+ """check abnormal window and report to xalarm"""
|
||||
+ if not is_abnormal((disk_name, 'bio', rw), io_data):
|
||||
+ return
|
||||
+
|
||||
+ ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq']
|
||||
+ for stage_name in ctrl_stage:
|
||||
+ if is_abnormal((disk_name, stage_name, rw), io_data):
|
||||
+ logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw))
|
||||
+ return
|
||||
+
|
||||
+ if is_abnormal((disk_name, 'rq_driver', rw), io_data):
|
||||
+ logging.warning("{} - {} - {} report driver".format(time.ctime(), disk_name, rw))
|
||||
+ return
|
||||
+
|
||||
+ kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue']
|
||||
+ for stage_name in kernel_stage:
|
||||
+ if is_abnormal((disk_name, stage_name, rw), io_data):
|
||||
+ logging.warning("{} - {} - {} report kernel".format(time.ctime(), disk_name, rw))
|
||||
+ return
|
||||
+ logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw))
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py
|
||||
new file mode 100644
|
||||
index 0000000..9b0ce79
|
||||
--- /dev/null
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/stage_window.py
|
||||
@@ -0,0 +1,47 @@
|
||||
+# coding: utf-8
|
||||
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
|
||||
+# sysSentry is licensed under the Mulan PSL v2.
|
||||
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
||||
+# You may obtain a copy of Mulan PSL v2 at:
|
||||
+# http://license.coscl.org.cn/MulanPSL2
|
||||
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
|
||||
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
|
||||
+# PURPOSE.
|
||||
+# See the Mulan PSL v2 for more details.
|
||||
+
|
||||
+class AbnormalWindowBase:
|
||||
+ def __init__(self, window_size=10, window_threshold=7):
|
||||
+ self.window_size = window_size
|
||||
+ self.window_threshold = window_threshold
|
||||
+ self.abnormal_window = [False] * window_size
|
||||
+
|
||||
+ def append_new_period(self, ab_res, avg_val=0):
|
||||
+ self.abnormal_window.pop(0)
|
||||
+ if self.is_abnormal_period(ab_res, avg_val):
|
||||
+ self.abnormal_window.append(True)
|
||||
+ else:
|
||||
+ self.abnormal_window.append(False)
|
||||
+
|
||||
+ def is_abnormal_window(self):
|
||||
+ return sum(self.abnormal_window) > self.window_threshold
|
||||
+
|
||||
+
|
||||
+class IoWindow(AbnormalWindowBase):
|
||||
+ def __init__(self, window_size=10, window_threshold=7, abnormal_multiple=5, abnormal_multiple_lim=30, abnormal_time=40):
|
||||
+ super().__init__(window_size, window_threshold)
|
||||
+ self.abnormal_multiple = abnormal_multiple
|
||||
+ self.abnormal_multiple_lim = abnormal_multiple_lim
|
||||
+ self.abnormal_time = abnormal_time
|
||||
+
|
||||
+ def is_abnormal_period(self, value, avg_val):
|
||||
+ return (value > avg_val * self.abnormal_multiple and value > self.abnormal_multiple_lim) or \
|
||||
+ (value > self.abnormal_time)
|
||||
+
|
||||
+
|
||||
+class IoDumpWindow(AbnormalWindowBase):
|
||||
+ def __init__(self, window_size=10, window_threshold=7, abnormal_time=40):
|
||||
+ super().__init__(window_size, window_threshold)
|
||||
+ self.abnormal_time = abnormal_time
|
||||
+
|
||||
+ def is_abnormal_period(self, value, avg_val=0):
|
||||
+ return value > self.abnormal_time
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py
|
||||
new file mode 100644
|
||||
index 0000000..54ed080
|
||||
--- /dev/null
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/utils.py
|
||||
@@ -0,0 +1,86 @@
|
||||
+# coding: utf-8
|
||||
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
|
||||
+# sysSentry is licensed under the Mulan PSL v2.
|
||||
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
||||
+# You may obtain a copy of Mulan PSL v2 at:
|
||||
+# http://license.coscl.org.cn/MulanPSL2
|
||||
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
|
||||
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
|
||||
+# PURPOSE.
|
||||
+# See the Mulan PSL v2 for more details.
|
||||
+AVG_VALUE = 0
|
||||
+AVG_COUNT = 1
|
||||
+
|
||||
+
|
||||
+def get_nested_value(data, keys):
|
||||
+ """get data from nested dict"""
|
||||
+ for key in keys:
|
||||
+ if key in data:
|
||||
+ data = data[key]
|
||||
+ else:
|
||||
+ return None
|
||||
+ return data
|
||||
+
|
||||
+
|
||||
+def set_nested_value(data, keys, value):
|
||||
+ """set data to nested dict"""
|
||||
+ for key in keys[:-1]:
|
||||
+ if key in data:
|
||||
+ data = data[key]
|
||||
+ else:
|
||||
+ return False
|
||||
+ data[keys[-1]] = value
|
||||
+ return True
|
||||
+
|
||||
+
|
||||
+def is_abnormal(io_key, io_data):
|
||||
+ """check if latency and iodump win abnormal"""
|
||||
+ for key in ['latency', 'iodump']:
|
||||
+ all_keys = get_nested_value(io_data, io_key)
|
||||
+ if all_keys and key in all_keys:
|
||||
+ win = get_nested_value(io_data, io_key + (key,))
|
||||
+ if win and win.is_abnormal_window():
|
||||
+ return True
|
||||
+ return False
|
||||
+
|
||||
+
|
||||
+def update_io_avg(old_avg, period_value, win_size):
|
||||
+ """update average of latency window"""
|
||||
+ if old_avg[AVG_COUNT] < win_size:
|
||||
+ new_avg_count = old_avg[AVG_COUNT] + 1
|
||||
+ new_avg_value = (old_avg[AVG_VALUE] * old_avg[AVG_COUNT] + period_value[0]) / new_avg_count
|
||||
+ else:
|
||||
+ new_avg_count = old_avg[AVG_COUNT]
|
||||
+ new_avg_value = (old_avg[AVG_VALUE] * (old_avg[AVG_COUNT] - 1) + period_value[0]) / new_avg_count
|
||||
+ return [new_avg_value, new_avg_count]
|
||||
+
|
||||
+
|
||||
+def update_io_data(old_avg, period_value, win_size, io_data, io_key):
|
||||
+ """update data of latency and iodump window"""
|
||||
+ all_wins = get_nested_value(io_data, io_key)
|
||||
+ if all_wins and "latency" in all_wins:
|
||||
+ io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_period(period_value[0], old_avg[AVG_VALUE])
|
||||
+ if all_wins and "iodump" in all_wins:
|
||||
+ io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_period(period_value[1])
|
||||
+
|
||||
+
|
||||
+def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data):
|
||||
+ """update avg and check abonrmal, return true if win_size full"""
|
||||
+ period_value = get_nested_value(data, io_key)
|
||||
+ old_avg = get_nested_value(io_avg_value, io_key)
|
||||
+
|
||||
+ # 更新avg数据
|
||||
+ if old_avg[AVG_COUNT] < win_size:
|
||||
+ set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size))
|
||||
+ return False
|
||||
+
|
||||
+ # 更新win数据 -- 判断异常周期
|
||||
+ update_io_data(old_avg, period_value, win_size, io_data, io_key)
|
||||
+ all_wins = get_nested_value(io_data, io_key)
|
||||
+ if all_wins and 'latency' not in all_wins:
|
||||
+ return True
|
||||
+ period = get_nested_value(io_data, io_key + ("latency",))
|
||||
+ if period and period.is_abnormal_period(period_value[0], old_avg[AVG_VALUE]):
|
||||
+ return True
|
||||
+ set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size))
|
||||
+ return True
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -1,33 +0,0 @@
|
||||
From ac9ce326dee20edde2451946e34ea9a13bd8c338 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
|
||||
Date: Wed, 16 Oct 2024 11:50:46 +0800
|
||||
Subject: [PATCH] fix ai_block_io root cause bug
|
||||
|
||||
---
|
||||
src/python/sentryPlugins/ai_block_io/detector.py | 8 ++++----
|
||||
1 file changed, 4 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
index 5b21714..ed8b64a 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
@@ -101,12 +101,12 @@ class DiskDetector:
|
||||
if len(diagnosis_info["bio"]) == 0:
|
||||
return False, None, None, None
|
||||
elif len(diagnosis_info["rq_driver"]) != 0:
|
||||
- root_cause = "[Root Cause:disk slow]"
|
||||
+ root_cause = "[Root Cause: disk slow]"
|
||||
elif len(diagnosis_info["io_stage"]) != 0:
|
||||
- stage = diagnosis_info["io_stage"][0][1].get_stage_name()
|
||||
- root_cause = f"[Root Cause:io stage slow, stage: {stage}]"
|
||||
+ stage = diagnosis_info["io_stage"][0][1].stage_name
|
||||
+ root_cause = f"[Root Cause: io stage slow, stage: {stage}]"
|
||||
if root_cause is None:
|
||||
- root_cause = "[Root Cause:high io pressure]"
|
||||
+ root_cause = "[Root Cause: high io pressure]"
|
||||
return True, diagnosis_info["bio"][0][0], diagnosis_info["bio"][0][1], root_cause
|
||||
|
||||
def __repr__(self):
|
||||
--
|
||||
2.23.0
|
||||
|
||||
@ -1,832 +0,0 @@
|
||||
From 35ba8fe8e241c5e3508c5dadc82a777065a5cc4d Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
|
||||
Date: Mon, 30 Sep 2024 00:15:29 +0800
|
||||
Subject: [PATCH] fix ai_block_io some issues
|
||||
|
||||
---
|
||||
..._slow_io_detection.ini => ai_block_io.ini} | 6 +-
|
||||
config/tasks/ai_block_io.mod | 5 +
|
||||
.../tasks/ai_threshold_slow_io_detection.mod | 5 -
|
||||
...ow_io_detection.py => test_ai_block_io.py} | 0
|
||||
.../README.md | 0
|
||||
.../__init__.py | 0
|
||||
.../ai_block_io.py} | 57 ++--
|
||||
.../alarm_report.py | 2 +-
|
||||
.../ai_block_io/config_parser.py | 256 ++++++++++++++++++
|
||||
.../data_access.py | 3 +
|
||||
.../detector.py | 17 +-
|
||||
.../io_data.py | 0
|
||||
.../sliding_window.py | 0
|
||||
.../threshold.py | 13 +-
|
||||
.../utils.py | 15 +-
|
||||
.../config_parser.py | 141 ----------
|
||||
src/python/setup.py | 2 +-
|
||||
17 files changed, 336 insertions(+), 186 deletions(-)
|
||||
rename config/plugins/{ai_threshold_slow_io_detection.ini => ai_block_io.ini} (66%)
|
||||
create mode 100644 config/tasks/ai_block_io.mod
|
||||
delete mode 100644 config/tasks/ai_threshold_slow_io_detection.mod
|
||||
rename selftest/test/{test_ai_threshold_slow_io_detection.py => test_ai_block_io.py} (100%)
|
||||
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/README.md (100%)
|
||||
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/__init__.py (100%)
|
||||
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection/slow_io_detection.py => ai_block_io/ai_block_io.py} (66%)
|
||||
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/alarm_report.py (98%)
|
||||
create mode 100644 src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/data_access.py (99%)
|
||||
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/detector.py (77%)
|
||||
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/io_data.py (100%)
|
||||
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/sliding_window.py (100%)
|
||||
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/threshold.py (92%)
|
||||
rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/utils.py (86%)
|
||||
delete mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py
|
||||
|
||||
diff --git a/config/plugins/ai_threshold_slow_io_detection.ini b/config/plugins/ai_block_io.ini
|
||||
similarity index 66%
|
||||
rename from config/plugins/ai_threshold_slow_io_detection.ini
|
||||
rename to config/plugins/ai_block_io.ini
|
||||
index 44eb928..01ce266 100644
|
||||
--- a/config/plugins/ai_threshold_slow_io_detection.ini
|
||||
+++ b/config/plugins/ai_block_io.ini
|
||||
@@ -4,9 +4,9 @@ slow_io_detect_frequency=1
|
||||
log_level=info
|
||||
|
||||
[algorithm]
|
||||
-train_data_duration=0.1
|
||||
-train_update_duration=0.02
|
||||
-algorithm_type=n_sigma
|
||||
+train_data_duration=24
|
||||
+train_update_duration=2
|
||||
+algorithm_type=boxplot
|
||||
boxplot_parameter=1.5
|
||||
n_sigma_parameter=3
|
||||
|
||||
diff --git a/config/tasks/ai_block_io.mod b/config/tasks/ai_block_io.mod
|
||||
new file mode 100644
|
||||
index 0000000..1971d7d
|
||||
--- /dev/null
|
||||
+++ b/config/tasks/ai_block_io.mod
|
||||
@@ -0,0 +1,5 @@
|
||||
+[common]
|
||||
+enabled=yes
|
||||
+task_start=/usr/bin/python3 /usr/bin/ai_block_io
|
||||
+task_stop=pkill -f /usr/bin/ai_block_io
|
||||
+type=oneshot
|
||||
\ No newline at end of file
|
||||
diff --git a/config/tasks/ai_threshold_slow_io_detection.mod b/config/tasks/ai_threshold_slow_io_detection.mod
|
||||
deleted file mode 100644
|
||||
index 2729f72..0000000
|
||||
--- a/config/tasks/ai_threshold_slow_io_detection.mod
|
||||
+++ /dev/null
|
||||
@@ -1,5 +0,0 @@
|
||||
-[common]
|
||||
-enabled=yes
|
||||
-task_start=/usr/bin/python3 /usr/bin/ai_threshold_slow_io_detection
|
||||
-task_stop=pkill -f /usr/bin/ai_threshold_slow_io_detection
|
||||
-type=oneshot
|
||||
\ No newline at end of file
|
||||
diff --git a/selftest/test/test_ai_threshold_slow_io_detection.py b/selftest/test/test_ai_block_io.py
|
||||
similarity index 100%
|
||||
rename from selftest/test/test_ai_threshold_slow_io_detection.py
|
||||
rename to selftest/test/test_ai_block_io.py
|
||||
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md b/src/python/sentryPlugins/ai_block_io/README.md
|
||||
similarity index 100%
|
||||
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md
|
||||
rename to src/python/sentryPlugins/ai_block_io/README.md
|
||||
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py b/src/python/sentryPlugins/ai_block_io/__init__.py
|
||||
similarity index 100%
|
||||
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py
|
||||
rename to src/python/sentryPlugins/ai_block_io/__init__.py
|
||||
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
similarity index 66%
|
||||
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py
|
||||
rename to src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
index 43cf770..31b8a97 100644
|
||||
--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
@@ -23,7 +23,7 @@ from .data_access import get_io_data_from_collect_plug, check_collect_valid
|
||||
from .io_data import MetricName
|
||||
from .alarm_report import AlarmReport
|
||||
|
||||
-CONFIG_FILE = "/etc/sysSentry/plugins/ai_threshold_slow_io_detection.ini"
|
||||
+CONFIG_FILE = "/etc/sysSentry/plugins/ai_block_io.ini"
|
||||
|
||||
|
||||
def sig_handler(signum, frame):
|
||||
@@ -40,34 +40,48 @@ class SlowIODetection:
|
||||
|
||||
def __init__(self, config_parser: ConfigParser):
|
||||
self._config_parser = config_parser
|
||||
- self.__set_log_format()
|
||||
self.__init_detector_name_list()
|
||||
self.__init_detector()
|
||||
|
||||
- def __set_log_format(self):
|
||||
- log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
|
||||
- log_level = get_log_level(self._config_parser.get_log_level())
|
||||
- logging.basicConfig(level=log_level, format=log_format)
|
||||
-
|
||||
def __init_detector_name_list(self):
|
||||
self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency())
|
||||
- for disk in self._disk_list:
|
||||
- self._detector_name_list.append(MetricName(disk, "bio", "read", "latency"))
|
||||
- self._detector_name_list.append(MetricName(disk, "bio", "write", "latency"))
|
||||
+ disks_to_detection: list = self._config_parser.get_disks_to_detection()
|
||||
+ # 情况1:None,则启用所有磁盘检测
|
||||
+ # 情况2:is not None and len = 0,则不启动任何磁盘检测
|
||||
+ # 情况3:len != 0,则取交集
|
||||
+ if disks_to_detection is None:
|
||||
+ for disk in self._disk_list:
|
||||
+ self._detector_name_list.append(MetricName(disk, "bio", "read", "latency"))
|
||||
+ self._detector_name_list.append(MetricName(disk, "bio", "write", "latency"))
|
||||
+ elif len(disks_to_detection) == 0:
|
||||
+ logging.warning('please attention: conf file not specify any disk to detection, '
|
||||
+ 'so it will not start ai block io.')
|
||||
+ else:
|
||||
+ disks_name_to_detection = []
|
||||
+ for disk_name_to_detection in disks_to_detection:
|
||||
+ disks_name_to_detection.append(disk_name_to_detection.get_disk_name())
|
||||
+ disk_intersection = [disk for disk in self._disk_list if disk in disks_name_to_detection]
|
||||
+ for disk in disk_intersection:
|
||||
+ self._detector_name_list.append(MetricName(disk, "bio", "read", "latency"))
|
||||
+ self._detector_name_list.append(MetricName(disk, "bio", "write", "latency"))
|
||||
+ logging.info(f'start to detection follow disk and it\'s metric: {self._detector_name_list}')
|
||||
|
||||
def __init_detector(self):
|
||||
train_data_duration, train_update_duration = (self._config_parser.
|
||||
get_train_data_duration_and_train_update_duration())
|
||||
slow_io_detection_frequency = self._config_parser.get_slow_io_detect_frequency()
|
||||
- threshold_type = get_threshold_type_enum(self._config_parser.get_algorithm_type())
|
||||
+ threshold_type = self._config_parser.get_algorithm_type()
|
||||
data_queue_size, update_size = get_data_queue_size_and_update_size(train_data_duration,
|
||||
train_update_duration,
|
||||
slow_io_detection_frequency)
|
||||
- sliding_window_type = get_sliding_window_type_enum(self._config_parser.get_sliding_window_type())
|
||||
+ sliding_window_type = self._config_parser.get_sliding_window_type()
|
||||
window_size, window_threshold = self._config_parser.get_window_size_and_window_minimum_threshold()
|
||||
|
||||
for detector_name in self._detector_name_list:
|
||||
- threshold = ThresholdFactory().get_threshold(threshold_type, data_queue_size=data_queue_size,
|
||||
+ threshold = ThresholdFactory().get_threshold(threshold_type,
|
||||
+ boxplot_parameter=self._config_parser.get_boxplot_parameter(),
|
||||
+ n_sigma_paramter=self._config_parser.get_n_sigma_parameter(),
|
||||
+ data_queue_size=data_queue_size,
|
||||
data_queue_update_size=update_size)
|
||||
sliding_window = SlidingWindowFactory().get_sliding_window(sliding_window_type, queue_length=window_size,
|
||||
threshold=window_threshold)
|
||||
@@ -89,6 +103,7 @@ class SlowIODetection:
|
||||
logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}')
|
||||
if io_data_dict_with_disk_name is None:
|
||||
continue
|
||||
+
|
||||
# Step2:慢IO检测
|
||||
logging.debug('step2. Start to detection slow io event.')
|
||||
slow_io_event_list = []
|
||||
@@ -103,13 +118,14 @@ class SlowIODetection:
|
||||
for slow_io_event in slow_io_event_list:
|
||||
metric_name: MetricName = slow_io_event[0]
|
||||
result = slow_io_event[1]
|
||||
- AlarmReport.report_major_alm(f"disk {metric_name.get_disk_name()} has slow io event."
|
||||
- f"stage: {metric_name.get_metric_name()},"
|
||||
- f"type: {metric_name.get_io_access_type_name()},"
|
||||
- f"metric: {metric_name.get_metric_name()},"
|
||||
- f"current window: {result[1]},"
|
||||
- f"threshold: {result[2]}")
|
||||
- logging.error(f"slow io event happen: {str(slow_io_event)}")
|
||||
+ alarm_content = (f"disk {metric_name.get_disk_name()} has slow io event. "
|
||||
+ f"stage is: {metric_name.get_stage_name()}, "
|
||||
+ f"io access type is: {metric_name.get_io_access_type_name()}, "
|
||||
+ f"metric is: {metric_name.get_metric_name()}, "
|
||||
+ f"current window is: {result[1]}, "
|
||||
+ f"threshold is: {result[2]}")
|
||||
+ AlarmReport.report_major_alm(alarm_content)
|
||||
+ logging.warning(alarm_content)
|
||||
|
||||
# Step4:等待检测时间
|
||||
logging.debug('step4. Wait to start next slow io event detection loop.')
|
||||
@@ -120,6 +136,7 @@ def main():
|
||||
# Step1:注册消息处理函数
|
||||
signal.signal(signal.SIGINT, sig_handler)
|
||||
signal.signal(signal.SIGTERM, sig_handler)
|
||||
+
|
||||
# Step2:断点恢复
|
||||
# todo:
|
||||
|
||||
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py b/src/python/sentryPlugins/ai_block_io/alarm_report.py
|
||||
similarity index 98%
|
||||
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py
|
||||
rename to src/python/sentryPlugins/ai_block_io/alarm_report.py
|
||||
index 3f4f34e..230c8cd 100644
|
||||
--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/alarm_report.py
|
||||
@@ -15,7 +15,7 @@ import json
|
||||
|
||||
|
||||
class AlarmReport:
|
||||
- TASK_NAME = "SLOW_IO_DETECTION"
|
||||
+ TASK_NAME = "ai_block_io"
|
||||
|
||||
@staticmethod
|
||||
def report_pass(info: str):
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
new file mode 100644
|
||||
index 0000000..632391d
|
||||
--- /dev/null
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
@@ -0,0 +1,256 @@
|
||||
+# coding: utf-8
|
||||
+# Copyright (c) 2024 Huawei Technologies Co., Ltd.
|
||||
+# sysSentry is licensed under the Mulan PSL v2.
|
||||
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
||||
+# You may obtain a copy of Mulan PSL v2 at:
|
||||
+# http://license.coscl.org.cn/MulanPSL2
|
||||
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
|
||||
+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
|
||||
+# PURPOSE.
|
||||
+# See the Mulan PSL v2 for more details.
|
||||
+
|
||||
+import configparser
|
||||
+import json
|
||||
+import logging
|
||||
+
|
||||
+from .io_data import MetricName
|
||||
+from .threshold import ThresholdType
|
||||
+from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_log_level
|
||||
+
|
||||
+LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
|
||||
+
|
||||
+
|
||||
+def init_log_format(log_level: str):
|
||||
+ logging.basicConfig(level=get_log_level(log_level), format=LOG_FORMAT)
|
||||
+
|
||||
+
|
||||
+class ConfigParser:
|
||||
+ DEFAULT_ABSOLUTE_THRESHOLD = 40
|
||||
+ DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1
|
||||
+ DEFAULT_LOG_LEVEL = 'info'
|
||||
+
|
||||
+ DEFAULT_ALGORITHM_TYPE = 'boxplot'
|
||||
+ DEFAULT_TRAIN_DATA_DURATION = 24
|
||||
+ DEFAULT_TRAIN_UPDATE_DURATION = 2
|
||||
+ DEFAULT_BOXPLOT_PARAMETER = 1.5
|
||||
+ DEFAULT_N_SIGMA_PARAMETER = 3
|
||||
+
|
||||
+ DEFAULT_SLIDING_WINDOW_TYPE = 'not_continuous'
|
||||
+ DEFAULT_WINDOW_SIZE = 30
|
||||
+ DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6
|
||||
+
|
||||
+ def __init__(self, config_file_name):
|
||||
+ self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD
|
||||
+ self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY
|
||||
+ self.__log_level = ConfigParser.DEFAULT_LOG_LEVEL
|
||||
+ self.__disks_to_detection: list = []
|
||||
+
|
||||
+ self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE
|
||||
+ self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION
|
||||
+ self.__train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION
|
||||
+ self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER
|
||||
+ self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER
|
||||
+
|
||||
+ self.__sliding_window_type = ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE
|
||||
+ self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE
|
||||
+ self.__window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD
|
||||
+
|
||||
+ self.__config_file_name = config_file_name
|
||||
+
|
||||
+ def __read_absolute_threshold(self, items_common: dict):
|
||||
+ try:
|
||||
+ self.__absolute_threshold = float(items_common.get('absolute_threshold',
|
||||
+ ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD))
|
||||
+ if self.__absolute_threshold <= 0:
|
||||
+ logging.warning(
|
||||
+ f'the_absolute_threshold: {self.__absolute_threshold} you set is invalid, use default value: {ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD}.')
|
||||
+ self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD
|
||||
+ except ValueError:
|
||||
+ self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD
|
||||
+ logging.warning(
|
||||
+ f'the_absolute_threshold type conversion has error, use default value: {self.__absolute_threshold}.')
|
||||
+
|
||||
+ def __read__slow_io_detect_frequency(self, items_common: dict):
|
||||
+ try:
|
||||
+ self.__slow_io_detect_frequency = int(items_common.get('slow_io_detect_frequency',
|
||||
+ ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY))
|
||||
+ if self.__slow_io_detect_frequency < 1 or self.__slow_io_detect_frequency > 10:
|
||||
+ logging.warning(
|
||||
+ f'the slow_io_detect_frequency: {self.__slow_io_detect_frequency} you set is invalid, use default value: {ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY}.')
|
||||
+ self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY
|
||||
+ except ValueError:
|
||||
+ self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY
|
||||
+ logging.warning(f'slow_io_detect_frequency type conversion has error, use default value: {self.__slow_io_detect_frequency}.')
|
||||
+
|
||||
+ def __read__disks_to_detect(self, items_common: dict):
|
||||
+ disks_to_detection = items_common.get('disks_to_detect')
|
||||
+ if disks_to_detection is None:
|
||||
+ logging.warning(f'config of disks_to_detect not found, the default value be used.')
|
||||
+ self.__disks_to_detection = None
|
||||
+ return
|
||||
+ try:
|
||||
+ disks_to_detection_list = json.loads(disks_to_detection)
|
||||
+ for disk_to_detection in disks_to_detection_list:
|
||||
+ disk_name = disk_to_detection.get('disk_name', None)
|
||||
+ stage_name = disk_to_detection.get('stage_name', None)
|
||||
+ io_access_type_name = disk_to_detection.get('io_access_type_name', None)
|
||||
+ metric_name = disk_to_detection.get('metric_name', None)
|
||||
+ if not (disk_name is None or stage_name is None or io_access_type_name is None or metric_name is None):
|
||||
+ metric_name_object = MetricName(disk_name, stage_name, io_access_type_name, metric_name)
|
||||
+ self.__disks_to_detection.append(metric_name_object)
|
||||
+ else:
|
||||
+ logging.warning(f'config of disks_to_detect\'s some part has some error: {disk_to_detection}, it will be ignored.')
|
||||
+ except json.decoder.JSONDecodeError as e:
|
||||
+ logging.warning(f'config of disks_to_detect is error: {e}, it will be ignored and default value be used.')
|
||||
+ self.__disks_to_detection = None
|
||||
+
|
||||
+ def __read__train_data_duration(self, items_algorithm: dict):
|
||||
+ try:
|
||||
+ self.__train_data_duration = float(items_algorithm.get('train_data_duration',
|
||||
+ ConfigParser.DEFAULT_TRAIN_DATA_DURATION))
|
||||
+ if self.__train_data_duration <= 0 or self.__train_data_duration > 720:
|
||||
+ logging.warning(
|
||||
+ f'the train_data_duration: {self.__train_data_duration} you set is invalid, use default value: {ConfigParser.DEFAULT_TRAIN_DATA_DURATION}.')
|
||||
+ self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION
|
||||
+ except ValueError:
|
||||
+ self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION
|
||||
+ logging.warning(f'the train_data_duration type conversion has error, use default value: {self.__train_data_duration}.')
|
||||
+
|
||||
+ def __read__train_update_duration(self, items_algorithm: dict):
|
||||
+ default_train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION
|
||||
+ if default_train_update_duration > self.__train_data_duration:
|
||||
+ default_train_update_duration = self.__train_data_duration / 2
|
||||
+
|
||||
+ try:
|
||||
+ self.__train_update_duration = float(items_algorithm.get('train_update_duration',
|
||||
+ ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION))
|
||||
+ if self.__train_update_duration <= 0 or self.__train_update_duration > self.__train_data_duration:
|
||||
+ logging.warning(
|
||||
+ f'the train_update_duration: {self.__train_update_duration} you set is invalid, use default value: {default_train_update_duration}.')
|
||||
+ self.__train_update_duration = default_train_update_duration
|
||||
+ except ValueError:
|
||||
+ self.__train_update_duration = default_train_update_duration
|
||||
+ logging.warning(f'the train_update_duration type conversion has error, use default value: {self.__train_update_duration}.')
|
||||
+
|
||||
+ def __read__algorithm_type_and_parameter(self, items_algorithm: dict):
|
||||
+ algorithm_type = items_algorithm.get('algorithm_type', ConfigParser.DEFAULT_ALGORITHM_TYPE)
|
||||
+ self.__algorithm_type = get_threshold_type_enum(algorithm_type)
|
||||
+
|
||||
+ if self.__algorithm_type == ThresholdType.NSigmaThreshold:
|
||||
+ try:
|
||||
+ self.__n_sigma_parameter = float(items_algorithm.get('n_sigma_parameter',
|
||||
+ ConfigParser.DEFAULT_N_SIGMA_PARAMETER))
|
||||
+ if self.__n_sigma_parameter <= 0 or self.__n_sigma_parameter > 10:
|
||||
+ logging.warning(
|
||||
+ f'the n_sigma_parameter: {self.__n_sigma_parameter} you set is invalid, use default value: {ConfigParser.DEFAULT_N_SIGMA_PARAMETER}.')
|
||||
+ self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER
|
||||
+ except ValueError:
|
||||
+ self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER
|
||||
+ logging.warning(f'the n_sigma_parameter type conversion has error, use default value: {self.__n_sigma_parameter}.')
|
||||
+ elif self.__algorithm_type == ThresholdType.BoxplotThreshold:
|
||||
+ try:
|
||||
+ self.__boxplot_parameter = float(items_algorithm.get('boxplot_parameter',
|
||||
+ ConfigParser.DEFAULT_BOXPLOT_PARAMETER))
|
||||
+ if self.__boxplot_parameter <= 0 or self.__boxplot_parameter > 10:
|
||||
+ logging.warning(
|
||||
+ f'the boxplot_parameter: {self.__boxplot_parameter} you set is invalid, use default value: {ConfigParser.DEFAULT_BOXPLOT_PARAMETER}.')
|
||||
+ self.__n_sigma_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER
|
||||
+ except ValueError:
|
||||
+ self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER
|
||||
+ logging.warning(f'the boxplot_parameter type conversion has error, use default value: {self.__boxplot_parameter}.')
|
||||
+
|
||||
+ def __read__window_size(self, items_sliding_window: dict):
|
||||
+ try:
|
||||
+ self.__window_size = int(items_sliding_window.get('window_size',
|
||||
+ ConfigParser.DEFAULT_WINDOW_SIZE))
|
||||
+ if self.__window_size < 1 or self.__window_size > 3600:
|
||||
+ logging.warning(
|
||||
+ f'the window_size: {self.__window_size} you set is invalid, use default value: {ConfigParser.DEFAULT_WINDOW_SIZE}.')
|
||||
+ self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE
|
||||
+ except ValueError:
|
||||
+ self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE
|
||||
+ logging.warning(f'window_size type conversion has error, use default value: {self.__window_size}.')
|
||||
+
|
||||
+ def __read__window_minimum_threshold(self, items_sliding_window: dict):
|
||||
+ default_window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD
|
||||
+ if default_window_minimum_threshold > self.__window_size:
|
||||
+ default_window_minimum_threshold = self.__window_size / 2
|
||||
+ try:
|
||||
+ self.__window_minimum_threshold = (
|
||||
+ int(items_sliding_window.get('window_minimum_threshold',
|
||||
+ ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD)))
|
||||
+ if self.__window_minimum_threshold < 1 or self.__window_minimum_threshold > self.__window_size:
|
||||
+ logging.warning(
|
||||
+ f'the window_minimum_threshold: {self.__window_minimum_threshold} you set is invalid, use default value: {default_window_minimum_threshold}.')
|
||||
+ self.__window_minimum_threshold = default_window_minimum_threshold
|
||||
+ except ValueError:
|
||||
+ self.__window_minimum_threshold = default_window_minimum_threshold
|
||||
+ logging.warning(f'window_minimum_threshold type conversion has error, use default value: {self.__window_minimum_threshold}.')
|
||||
+
|
||||
+ def read_config_from_file(self):
|
||||
+ con = configparser.ConfigParser()
|
||||
+ con.read(self.__config_file_name, encoding='utf-8')
|
||||
+
|
||||
+ if con.has_section('common'):
|
||||
+ items_common = dict(con.items('common'))
|
||||
+ self.__log_level = items_common.get('log_level', ConfigParser.DEFAULT_LOG_LEVEL)
|
||||
+ init_log_format(self.__log_level)
|
||||
+ self.__read_absolute_threshold(items_common)
|
||||
+ self.__read__slow_io_detect_frequency(items_common)
|
||||
+ self.__read__disks_to_detect(items_common)
|
||||
+ else:
|
||||
+ init_log_format(self.__log_level)
|
||||
+ logging.warning("common section parameter not found, it will be set to default value.")
|
||||
+
|
||||
+ if con.has_section('algorithm'):
|
||||
+ items_algorithm = dict(con.items('algorithm'))
|
||||
+ self.__read__train_data_duration(items_algorithm)
|
||||
+ self.__read__train_update_duration(items_algorithm)
|
||||
+ self.__read__algorithm_type_and_parameter(items_algorithm)
|
||||
+ else:
|
||||
+ logging.warning("algorithm section parameter not found, it will be set to default value.")
|
||||
+
|
||||
+ if con.has_section('sliding_window'):
|
||||
+ items_sliding_window = dict(con.items('sliding_window'))
|
||||
+ sliding_window_type = items_sliding_window.get('sliding_window_type',
|
||||
+ ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE)
|
||||
+ self.__sliding_window_type = get_sliding_window_type_enum(sliding_window_type)
|
||||
+ self.__read__window_size(items_sliding_window)
|
||||
+ self.__read__window_minimum_threshold(items_sliding_window)
|
||||
+ else:
|
||||
+ logging.warning("sliding_window section parameter not found, it will be set to default value.")
|
||||
+
|
||||
+ self.__print_all_config_value()
|
||||
+
|
||||
+ def __print_all_config_value(self):
|
||||
+ pass
|
||||
+
|
||||
+ def get_slow_io_detect_frequency(self):
|
||||
+ return self.__slow_io_detect_frequency
|
||||
+
|
||||
+ def get_algorithm_type(self):
|
||||
+ return self.__algorithm_type
|
||||
+
|
||||
+ def get_sliding_window_type(self):
|
||||
+ return self.__sliding_window_type
|
||||
+
|
||||
+ def get_train_data_duration_and_train_update_duration(self):
|
||||
+ return self.__train_data_duration, self.__train_update_duration
|
||||
+
|
||||
+ def get_window_size_and_window_minimum_threshold(self):
|
||||
+ return self.__window_size, self.__window_minimum_threshold
|
||||
+
|
||||
+ def get_absolute_threshold(self):
|
||||
+ return self.__absolute_threshold
|
||||
+
|
||||
+ def get_log_level(self):
|
||||
+ return self.__log_level
|
||||
+
|
||||
+ def get_disks_to_detection(self):
|
||||
+ return self.__disks_to_detection
|
||||
+
|
||||
+ def get_boxplot_parameter(self):
|
||||
+ return self.__boxplot_parameter
|
||||
+
|
||||
+ def get_n_sigma_parameter(self):
|
||||
+ return self.__n_sigma_parameter
|
||||
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py
|
||||
similarity index 99%
|
||||
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py
|
||||
rename to src/python/sentryPlugins/ai_block_io/data_access.py
|
||||
index d9f3460..01c5315 100644
|
||||
--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/data_access.py
|
||||
@@ -17,6 +17,8 @@ from sentryCollector.collect_plugin import (
|
||||
get_io_data,
|
||||
is_iocollect_valid,
|
||||
)
|
||||
+
|
||||
+
|
||||
from .io_data import IOStageData, IOData
|
||||
|
||||
COLLECT_STAGES = [
|
||||
@@ -32,6 +34,7 @@ COLLECT_STAGES = [
|
||||
"iocost",
|
||||
]
|
||||
|
||||
+
|
||||
def check_collect_valid(period):
|
||||
data_raw = is_iocollect_valid(period)
|
||||
if data_raw["ret"] == 0:
|
||||
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
similarity index 77%
|
||||
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py
|
||||
rename to src/python/sentryPlugins/ai_block_io/detector.py
|
||||
index eda9825..bcf62cb 100644
|
||||
--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
@@ -26,19 +26,26 @@ class Detector:
|
||||
self._threshold = threshold
|
||||
self._slidingWindow = sliding_window
|
||||
self._threshold.attach_observer(self._slidingWindow)
|
||||
+ self._count = 0
|
||||
|
||||
def get_metric_name(self):
|
||||
return self._metric_name
|
||||
|
||||
def is_slow_io_event(self, io_data_dict_with_disk_name: dict):
|
||||
- logging.debug(f'Enter Detector: {self}')
|
||||
+ self._count += 1
|
||||
+ if self._count % 15 == 0:
|
||||
+ self._count = 0
|
||||
+ logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.")
|
||||
+ logging.debug(f'enter Detector: {self}')
|
||||
metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name)
|
||||
- if metric_value > 1e-6:
|
||||
- logging.debug(f'Input metric value: {str(metric_value)}')
|
||||
- self._threshold.push_latest_data_to_queue(metric_value)
|
||||
+ if metric_value is None:
|
||||
+ logging.debug('not found metric value, so return None.')
|
||||
+ return False, None, None
|
||||
+ logging.debug(f'input metric value: {str(metric_value)}')
|
||||
+ self._threshold.push_latest_data_to_queue(metric_value)
|
||||
detection_result = self._slidingWindow.is_slow_io_event(metric_value)
|
||||
logging.debug(f'Detection result: {str(detection_result)}')
|
||||
- logging.debug(f'Exit Detector: {self}')
|
||||
+ logging.debug(f'exit Detector: {self}')
|
||||
return detection_result
|
||||
|
||||
def __repr__(self):
|
||||
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py b/src/python/sentryPlugins/ai_block_io/io_data.py
|
||||
similarity index 100%
|
||||
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py
|
||||
rename to src/python/sentryPlugins/ai_block_io/io_data.py
|
||||
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py
|
||||
similarity index 100%
|
||||
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py
|
||||
rename to src/python/sentryPlugins/ai_block_io/sliding_window.py
|
||||
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py b/src/python/sentryPlugins/ai_block_io/threshold.py
|
||||
similarity index 92%
|
||||
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py
|
||||
rename to src/python/sentryPlugins/ai_block_io/threshold.py
|
||||
index 9e1ca7b..ff85d85 100644
|
||||
--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/threshold.py
|
||||
@@ -79,9 +79,9 @@ class AbsoluteThreshold(Threshold):
|
||||
|
||||
|
||||
class BoxplotThreshold(Threshold):
|
||||
- def __init__(self, parameter: float = 1.5, data_queue_size: int = 10000, data_queue_update_size: int = 1000):
|
||||
+ def __init__(self, boxplot_parameter: float = 1.5, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs):
|
||||
super().__init__(data_queue_size, data_queue_update_size)
|
||||
- self.parameter = parameter
|
||||
+ self.parameter = boxplot_parameter
|
||||
|
||||
def _update_threshold(self):
|
||||
data = list(self.data_queue.queue)
|
||||
@@ -94,6 +94,8 @@ class BoxplotThreshold(Threshold):
|
||||
self.notify_observer()
|
||||
|
||||
def push_latest_data_to_queue(self, data):
|
||||
+ if data < 1e-6:
|
||||
+ return
|
||||
try:
|
||||
self.data_queue.put(data, block=False)
|
||||
except queue.Full:
|
||||
@@ -111,9 +113,9 @@ class BoxplotThreshold(Threshold):
|
||||
|
||||
|
||||
class NSigmaThreshold(Threshold):
|
||||
- def __init__(self, parameter: float = 2.0, data_queue_size: int = 10000, data_queue_update_size: int = 1000):
|
||||
+ def __init__(self, n_sigma_parameter: float = 3.0, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs):
|
||||
super().__init__(data_queue_size, data_queue_update_size)
|
||||
- self.parameter = parameter
|
||||
+ self.parameter = n_sigma_parameter
|
||||
|
||||
def _update_threshold(self):
|
||||
data = list(self.data_queue.queue)
|
||||
@@ -125,6 +127,8 @@ class NSigmaThreshold(Threshold):
|
||||
self.notify_observer()
|
||||
|
||||
def push_latest_data_to_queue(self, data):
|
||||
+ if data < 1e-6:
|
||||
+ return
|
||||
try:
|
||||
self.data_queue.put(data, block=False)
|
||||
except queue.Full:
|
||||
@@ -157,4 +161,3 @@ class ThresholdFactory:
|
||||
return NSigmaThreshold(*args, **kwargs)
|
||||
else:
|
||||
raise ValueError(f"Invalid threshold type: {threshold_type}")
|
||||
-
|
||||
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py b/src/python/sentryPlugins/ai_block_io/utils.py
|
||||
similarity index 86%
|
||||
rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py
|
||||
rename to src/python/sentryPlugins/ai_block_io/utils.py
|
||||
index f66e5ed..8dbba06 100644
|
||||
--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/utils.py
|
||||
@@ -8,13 +8,16 @@
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
|
||||
# PURPOSE.
|
||||
# See the Mulan PSL v2 for more details.
|
||||
+
|
||||
import logging
|
||||
from dataclasses import asdict
|
||||
|
||||
+
|
||||
from .threshold import ThresholdType
|
||||
from .sliding_window import SlidingWindowType
|
||||
from .io_data import MetricName, IOData
|
||||
|
||||
+
|
||||
def get_threshold_type_enum(algorithm_type: str):
|
||||
if algorithm_type.lower() == 'absolute':
|
||||
return ThresholdType.AbsoluteThreshold
|
||||
@@ -22,7 +25,7 @@ def get_threshold_type_enum(algorithm_type: str):
|
||||
return ThresholdType.BoxplotThreshold
|
||||
if algorithm_type.lower() == 'n_sigma':
|
||||
return ThresholdType.NSigmaThreshold
|
||||
- logging.info('not found correct algorithm type, use default: boxplot.')
|
||||
+ logging.warning(f"the algorithm type: {algorithm_type} you set is invalid, use default value: boxplot")
|
||||
return ThresholdType.BoxplotThreshold
|
||||
|
||||
|
||||
@@ -33,7 +36,7 @@ def get_sliding_window_type_enum(sliding_window_type: str):
|
||||
return SlidingWindowType.ContinuousSlidingWindow
|
||||
if sliding_window_type.lower() == 'median':
|
||||
return SlidingWindowType.MedianSlidingWindow
|
||||
- logging.info('not found correct sliding window type, use default: not_continuous.')
|
||||
+ logging.warning(f"the sliding window type: {sliding_window_type} you set is invalid, use default value: not_continuous")
|
||||
return SlidingWindowType.NotContinuousSlidingWindow
|
||||
|
||||
|
||||
@@ -62,6 +65,8 @@ def get_log_level(log_level: str):
|
||||
return logging.INFO
|
||||
elif log_level.lower() == 'warning':
|
||||
return logging.WARNING
|
||||
- elif log_level.lower() == 'fatal':
|
||||
- return logging.FATAL
|
||||
- return None
|
||||
+ elif log_level.lower() == 'error':
|
||||
+ return logging.ERROR
|
||||
+ elif log_level.lower() == 'critical':
|
||||
+ return logging.CRITICAL
|
||||
+ return logging.INFO
|
||||
diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py
|
||||
deleted file mode 100644
|
||||
index cd4e6f1..0000000
|
||||
--- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py
|
||||
+++ /dev/null
|
||||
@@ -1,141 +0,0 @@
|
||||
-# coding: utf-8
|
||||
-# Copyright (c) 2024 Huawei Technologies Co., Ltd.
|
||||
-# sysSentry is licensed under the Mulan PSL v2.
|
||||
-# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
||||
-# You may obtain a copy of Mulan PSL v2 at:
|
||||
-# http://license.coscl.org.cn/MulanPSL2
|
||||
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
|
||||
-# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
|
||||
-# PURPOSE.
|
||||
-# See the Mulan PSL v2 for more details.
|
||||
-
|
||||
-import configparser
|
||||
-import logging
|
||||
-
|
||||
-
|
||||
-class ConfigParser:
|
||||
-
|
||||
- DEFAULT_ABSOLUTE_THRESHOLD = 40
|
||||
- DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1
|
||||
- DEFAULT_LOG_LEVEL = 'info'
|
||||
- DEFAULT_TRAIN_DATA_DURATION = 24
|
||||
- DEFAULT_TRAIN_UPDATE_DURATION = 2
|
||||
- DEFAULT_ALGORITHM_TYPE = 'boxplot'
|
||||
- DEFAULT_N_SIGMA_PARAMETER = 3
|
||||
- DEFAULT_BOXPLOT_PARAMETER = 1.5
|
||||
- DEFAULT_SLIDING_WINDOW_TYPE = 'not_continuous'
|
||||
- DEFAULT_WINDOW_SIZE = 30
|
||||
- DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6
|
||||
-
|
||||
- def __init__(self, config_file_name):
|
||||
- self.__boxplot_parameter = None
|
||||
- self.__window_minimum_threshold = None
|
||||
- self.__window_size = None
|
||||
- self.__sliding_window_type = None
|
||||
- self.__n_sigma_parameter = None
|
||||
- self.__algorithm_type = None
|
||||
- self.__train_update_duration = None
|
||||
- self.__log_level = None
|
||||
- self.__slow_io_detect_frequency = None
|
||||
- self.__absolute_threshold = None
|
||||
- self.__train_data_duration = None
|
||||
- self.__config_file_name = config_file_name
|
||||
-
|
||||
- def read_config_from_file(self):
|
||||
-
|
||||
- con = configparser.ConfigParser()
|
||||
- con.read(self.__config_file_name, encoding='utf-8')
|
||||
-
|
||||
- items_common = dict(con.items('common'))
|
||||
- items_algorithm = dict(con.items('algorithm'))
|
||||
- items_sliding_window = dict(con.items('sliding_window'))
|
||||
-
|
||||
- try:
|
||||
- self.__absolute_threshold = int(items_common.get('absolute_threshold',
|
||||
- ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD))
|
||||
- except ValueError:
|
||||
- self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD
|
||||
- logging.warning('absolute threshold type conversion has error, use default value.')
|
||||
-
|
||||
- try:
|
||||
- self.__slow_io_detect_frequency = int(items_common.get('slow_io_detect_frequency',
|
||||
- ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY))
|
||||
- except ValueError:
|
||||
- self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY
|
||||
- logging.warning('slow_io_detect_frequency type conversion has error, use default value.')
|
||||
-
|
||||
- self.__log_level = items_common.get('log_level', ConfigParser.DEFAULT_LOG_LEVEL)
|
||||
-
|
||||
- try:
|
||||
- self.__train_data_duration = float(items_algorithm.get('train_data_duration',
|
||||
- ConfigParser.DEFAULT_TRAIN_DATA_DURATION))
|
||||
- except ValueError:
|
||||
- self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION
|
||||
- logging.warning('train_data_duration type conversion has error, use default value.')
|
||||
-
|
||||
- try:
|
||||
- self.__train_update_duration = float(items_algorithm.get('train_update_duration',
|
||||
- ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION))
|
||||
- except ValueError:
|
||||
- self.__train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION
|
||||
- logging.warning('train_update_duration type conversion has error, use default value.')
|
||||
-
|
||||
- try:
|
||||
- self.__algorithm_type = items_algorithm.get('algorithm_type', ConfigParser.DEFAULT_ALGORITHM_TYPE)
|
||||
- except ValueError:
|
||||
- self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE
|
||||
- logging.warning('algorithmType type conversion has error, use default value.')
|
||||
-
|
||||
- if self.__algorithm_type == 'n_sigma':
|
||||
- try:
|
||||
- self.__n_sigma_parameter = float(items_algorithm.get('n_sigma_parameter',
|
||||
- ConfigParser.DEFAULT_N_SIGMA_PARAMETER))
|
||||
- except ValueError:
|
||||
- self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER
|
||||
- logging.warning('n_sigma_parameter type conversion has error, use default value.')
|
||||
- elif self.__algorithm_type == 'boxplot':
|
||||
- try:
|
||||
- self.__boxplot_parameter = float(items_algorithm.get('boxplot_parameter',
|
||||
- ConfigParser.DEFAULT_BOXPLOT_PARAMETER))
|
||||
- except ValueError:
|
||||
- self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER
|
||||
- logging.warning('boxplot_parameter type conversion has error, use default value.')
|
||||
-
|
||||
- self.__sliding_window_type = items_sliding_window.get('sliding_window_type',
|
||||
- ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE)
|
||||
-
|
||||
- try:
|
||||
- self.__window_size = int(items_sliding_window.get('window_size',
|
||||
- ConfigParser.DEFAULT_WINDOW_SIZE))
|
||||
- except ValueError:
|
||||
- self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE
|
||||
- logging.warning('window_size type conversion has error, use default value.')
|
||||
-
|
||||
- try:
|
||||
- self.__window_minimum_threshold = (
|
||||
- int(items_sliding_window.get('window_minimum_threshold',
|
||||
- ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD)))
|
||||
- except ValueError:
|
||||
- self.__window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD
|
||||
- logging.warning('window_minimum_threshold type conversion has error, use default value.')
|
||||
-
|
||||
- def get_slow_io_detect_frequency(self):
|
||||
- return self.__slow_io_detect_frequency
|
||||
-
|
||||
- def get_algorithm_type(self):
|
||||
- return self.__algorithm_type
|
||||
-
|
||||
- def get_sliding_window_type(self):
|
||||
- return self.__sliding_window_type
|
||||
-
|
||||
- def get_train_data_duration_and_train_update_duration(self):
|
||||
- return self.__train_data_duration, self.__train_update_duration
|
||||
-
|
||||
- def get_window_size_and_window_minimum_threshold(self):
|
||||
- return self.__window_size, self.__window_minimum_threshold
|
||||
-
|
||||
- def get_absolute_threshold(self):
|
||||
- return self.__absolute_threshold
|
||||
-
|
||||
- def get_log_level(self):
|
||||
- return self.__log_level
|
||||
diff --git a/src/python/setup.py b/src/python/setup.py
|
||||
index dac6481..9e26a10 100644
|
||||
--- a/src/python/setup.py
|
||||
+++ b/src/python/setup.py
|
||||
@@ -34,7 +34,7 @@ setup(
|
||||
'xalarmd=xalarm.xalarm_daemon:alarm_process_create',
|
||||
'sentryCollector=sentryCollector.collectd:main',
|
||||
'avg_block_io=sentryPlugins.avg_block_io.avg_block_io:main',
|
||||
- 'ai_threshold_slow_io_detection=sentryPlugins.ai_threshold_slow_io_detection.slow_io_detection:main'
|
||||
+ 'ai_block_io=sentryPlugins.ai_block_io.ai_block_io:main'
|
||||
]
|
||||
},
|
||||
)
|
||||
--
|
||||
2.23.0
|
||||
|
||||
@ -1,48 +0,0 @@
|
||||
From fe1bb401c1f77860616e74c1dbf5fe6aa862b17d Mon Sep 17 00:00:00 2001
|
||||
From: jinsaihang <jinsaihang@h-partners.com>
|
||||
Date: Sat, 26 Oct 2024 07:18:16 +0000
|
||||
Subject: [PATCH] fix alarm_info newline break error
|
||||
|
||||
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
|
||||
---
|
||||
sysSentry-1.0.2/src/python/syssentry/alarm.py | 23 +++++++++++++++++++
|
||||
1 file changed, 23 insertions(+)
|
||||
|
||||
diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
|
||||
index 2575307..b35a126 100644
|
||||
--- a/src/python/syssentry/alarm.py
|
||||
+++ b/src/python/syssentry/alarm.py
|
||||
@@ -180,7 +180,30 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di
|
||||
if 'details' in alarm_info:
|
||||
alarm_info.pop('details', None)
|
||||
alarm.pop('msg1', None)
|
||||
+
|
||||
+ # dump each {key,value} of details in one line
|
||||
+ if 'details' in alarm_info and isinstance(alarm_info['details'], dict):
|
||||
+ for key in alarm_info['details']:
|
||||
+ alarm_info['details'][key] = json.dumps(alarm_info['details'][key], indent=None)
|
||||
+
|
||||
alarm['alarm_info'] = alarm_info
|
||||
+ alarm_list = [alarm for alarm in alarm_list if 'alarm_source' in alarm['alarm_info'] and alarm['alarm_info']['alarm_source'] == task_name]
|
||||
+
|
||||
+ alarm_level_mapping = {
|
||||
+ 1: 'MINOR_ALM',
|
||||
+ 2: 'MAJOR_ALM',
|
||||
+ 3: 'CRITICAL_ALM'
|
||||
+ }
|
||||
+
|
||||
+ alarm_type_mapping = {
|
||||
+ 1: 'ALARM_TYPE_OCCUR',
|
||||
+ 2: 'ALARM_TYPE_RECOVER'
|
||||
+ }
|
||||
+
|
||||
+ for alarm in alarm_list:
|
||||
+ alarm['alarm_level'] = alarm_level_mapping.get(alarm['alarm_level'], 'UNKNOWN_LEVEL')
|
||||
+ alarm['alarm_type'] = alarm_type_mapping.get(alarm['alarm_type'], 'UNKNOWN_TYPE')
|
||||
return alarm_list
|
||||
+
|
||||
finally:
|
||||
alarm_list_lock.release()
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,323 +0,0 @@
|
||||
From e6eb39799b3ca15fb385c572863417ea26bdfa66 Mon Sep 17 00:00:00 2001
|
||||
From: zhuofeng <zhuofeng2@huawei.com>
|
||||
Date: Wed, 25 Sep 2024 11:03:29 +0800
|
||||
Subject: [PATCH] fix-bug-step-2-about-collect-module-and-avg-block-io
|
||||
|
||||
---
|
||||
src/python/sentryCollector/collect_config.py | 11 ++-
|
||||
src/python/sentryCollector/collect_io.py | 25 ++---
|
||||
src/python/sentryCollector/collect_plugin.py | 6 +-
|
||||
src/python/sentryCollector/collect_server.py | 1 -
|
||||
src/python/sentryCollector/collectd.py | 4 +-
|
||||
.../avg_block_io/avg_block_io.py | 92 ++++++++++++++-----
|
||||
6 files changed, 96 insertions(+), 43 deletions(-)
|
||||
|
||||
diff --git a/src/python/sentryCollector/collect_config.py b/src/python/sentryCollector/collect_config.py
|
||||
index b6cc75c..0fdd9f0 100644
|
||||
--- a/src/python/sentryCollector/collect_config.py
|
||||
+++ b/src/python/sentryCollector/collect_config.py
|
||||
@@ -49,14 +49,14 @@ class CollectConfig:
|
||||
self.config = configparser.ConfigParser()
|
||||
self.config.read(self.filename)
|
||||
except configparser.Error:
|
||||
- logging.error("collectd configure file read failed")
|
||||
+ logging.error("collect configure file read failed")
|
||||
return
|
||||
|
||||
try:
|
||||
common_config = self.config[CONF_COMMON]
|
||||
- modules_str = common_config[CONF_MODULES]
|
||||
+ modules_str = common_config[CONF_MODULES].lower()
|
||||
# remove space
|
||||
- modules_list = modules_str.replace(" ", "").split(',')
|
||||
+ modules_list = set(modules_str.replace(" ", "").split(','))
|
||||
except KeyError as e:
|
||||
logging.error("read config data failed, %s", e)
|
||||
return
|
||||
@@ -98,7 +98,7 @@ class CollectConfig:
|
||||
CONF_IO, CONF_IO_MAX_SAVE, CONF_IO_MAX_SAVE_DEFAULT)
|
||||
result_io_config[CONF_IO_MAX_SAVE] = CONF_IO_MAX_SAVE_DEFAULT
|
||||
# disk
|
||||
- disk = io_map_value.get(CONF_IO_DISK)
|
||||
+ disk = io_map_value.get(CONF_IO_DISK).lower()
|
||||
if disk:
|
||||
disk_str = disk.replace(" ", "")
|
||||
pattern = r'^[a-zA-Z0-9-_,]+$'
|
||||
@@ -106,12 +106,13 @@ class CollectConfig:
|
||||
logging.warning("module_name = %s section, field = %s is incorrect, use default %s",
|
||||
CONF_IO, CONF_IO_DISK, CONF_IO_DISK_DEFAULT)
|
||||
disk_str = CONF_IO_DISK_DEFAULT
|
||||
+ disk_str = ",".join(set(disk_str.split(',')))
|
||||
result_io_config[CONF_IO_DISK] = disk_str
|
||||
else:
|
||||
logging.warning("module_name = %s section, field = %s is incorrect, use default %s",
|
||||
CONF_IO, CONF_IO_DISK, CONF_IO_DISK_DEFAULT)
|
||||
result_io_config[CONF_IO_DISK] = CONF_IO_DISK_DEFAULT
|
||||
- logging.info("config get_io_config: %s", result_io_config)
|
||||
+ logging.debug("config get_io_config: %s", result_io_config)
|
||||
return result_io_config
|
||||
|
||||
def get_common_config(self):
|
||||
diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py
|
||||
index 104b734..9c8dae7 100644
|
||||
--- a/src/python/sentryCollector/collect_io.py
|
||||
+++ b/src/python/sentryCollector/collect_io.py
|
||||
@@ -177,10 +177,8 @@ class CollectIo():
|
||||
|
||||
def is_kernel_avaliable(self):
|
||||
base_path = '/sys/kernel/debug/block'
|
||||
+ all_disk = []
|
||||
for disk_name in os.listdir(base_path):
|
||||
- if not self.loop_all and disk_name not in self.disk_list:
|
||||
- continue
|
||||
-
|
||||
disk_path = os.path.join(base_path, disk_name)
|
||||
blk_io_hierarchy_path = os.path.join(disk_path, 'blk_io_hierarchy')
|
||||
|
||||
@@ -190,12 +188,18 @@ class CollectIo():
|
||||
|
||||
for file_name in os.listdir(blk_io_hierarchy_path):
|
||||
file_path = os.path.join(blk_io_hierarchy_path, file_name)
|
||||
-
|
||||
if file_name == 'stats':
|
||||
- stage_list = self.extract_first_column(file_path)
|
||||
- self.disk_map_stage[disk_name] = stage_list
|
||||
- self.window_value[disk_name] = {}
|
||||
- IO_GLOBAL_DATA[disk_name] = {}
|
||||
+ all_disk.append(disk_name)
|
||||
+
|
||||
+ for disk_name in self.disk_list:
|
||||
+ if not self.loop_all and disk_name not in all_disk:
|
||||
+ logging.warning("the %s disk not exist!", disk_name)
|
||||
+ continue
|
||||
+ stats_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/stats'.format(disk_name)
|
||||
+ stage_list = self.extract_first_column(stats_file)
|
||||
+ self.disk_map_stage[disk_name] = stage_list
|
||||
+ self.window_value[disk_name] = {}
|
||||
+ IO_GLOBAL_DATA[disk_name] = {}
|
||||
|
||||
return len(IO_GLOBAL_DATA) != 0
|
||||
|
||||
@@ -203,7 +207,7 @@ class CollectIo():
|
||||
logging.info("collect io thread start")
|
||||
|
||||
if not self.is_kernel_avaliable() or len(self.disk_map_stage) == 0:
|
||||
- logging.warning("no disks meet the requirements. collect io thread exits")
|
||||
+ logging.warning("no disks meet the requirements. collect io thread exit")
|
||||
return
|
||||
|
||||
for disk_name, stage_list in self.disk_map_stage.items():
|
||||
@@ -239,5 +243,4 @@ class CollectIo():
|
||||
|
||||
# set stop event, notify thread exit
|
||||
def stop_thread(self):
|
||||
- logging.debug("collect io thread is preparing to exit")
|
||||
- self.stop_event.set()
|
||||
+ self.stop_event.set()
|
||||
diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py
|
||||
index 9132473..1faa5e3 100644
|
||||
--- a/src/python/sentryCollector/collect_plugin.py
|
||||
+++ b/src/python/sentryCollector/collect_plugin.py
|
||||
@@ -10,7 +10,7 @@
|
||||
# See the Mulan PSL v2 for more details.
|
||||
|
||||
"""
|
||||
-collcet plugin
|
||||
+collect plugin
|
||||
"""
|
||||
import json
|
||||
import socket
|
||||
@@ -75,7 +75,7 @@ def client_send_and_recv(request_data, data_str_len, protocol):
|
||||
try:
|
||||
client_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
||||
except socket.error:
|
||||
- print("collect_plugin: client creat socket error")
|
||||
+ print("collect_plugin: client create socket error")
|
||||
return None
|
||||
|
||||
try:
|
||||
@@ -128,7 +128,7 @@ def client_send_and_recv(request_data, data_str_len, protocol):
|
||||
def validate_parameters(param, len_limit, char_limit):
|
||||
ret = ResultMessage.RESULT_SUCCEED
|
||||
if not param:
|
||||
- print("parm is invalid")
|
||||
+ print("param is invalid")
|
||||
ret = ResultMessage.RESULT_NOT_PARAM
|
||||
return [False, ret]
|
||||
|
||||
diff --git a/src/python/sentryCollector/collect_server.py b/src/python/sentryCollector/collect_server.py
|
||||
index bab4e56..11d1af0 100644
|
||||
--- a/src/python/sentryCollector/collect_server.py
|
||||
+++ b/src/python/sentryCollector/collect_server.py
|
||||
@@ -281,5 +281,4 @@ class CollectServer():
|
||||
pass
|
||||
|
||||
def stop_thread(self):
|
||||
- logging.debug("collect listen thread is preparing to exit")
|
||||
self.stop_event.set()
|
||||
diff --git a/src/python/sentryCollector/collectd.py b/src/python/sentryCollector/collectd.py
|
||||
index 3a836df..d9d8862 100644
|
||||
--- a/src/python/sentryCollector/collectd.py
|
||||
+++ b/src/python/sentryCollector/collectd.py
|
||||
@@ -79,7 +79,7 @@ def main():
|
||||
for info in module_list:
|
||||
class_name = Module_Map_Class.get(info)
|
||||
if not class_name:
|
||||
- logging.info("%s correspond to class is not exists", info)
|
||||
+ logging.info("%s correspond to class is not exist", info)
|
||||
continue
|
||||
cn = class_name(module_config)
|
||||
collect_thread = threading.Thread(target=cn.main_loop)
|
||||
@@ -94,4 +94,4 @@ def main():
|
||||
finally:
|
||||
pass
|
||||
|
||||
- logging.info("All threads have finished. Main thread is exiting.")
|
||||
\ No newline at end of file
|
||||
+ logging.info("all threads have finished. main thread exit.")
|
||||
\ No newline at end of file
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
index 73f0b22..ac35be2 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
@@ -28,33 +28,53 @@ def log_invalid_keys(not_in_list, keys_name, config_list, default_list):
|
||||
|
||||
|
||||
def read_config_common(config):
|
||||
- """read config file, get [common] section value"""
|
||||
- try:
|
||||
- common_sec = config['common']
|
||||
- except configparser.NoSectionError:
|
||||
+ """read config file, get [common] section value"""
|
||||
+ if not config.has_section("common"):
|
||||
report_alarm_fail("Cannot find common section in config file")
|
||||
|
||||
try:
|
||||
- period_time = int(common_sec.get("period_time", 1))
|
||||
- if not (1 <= period_time <= 300):
|
||||
- raise ValueError("Invalid period_time")
|
||||
- except ValueError:
|
||||
- period_time = 1
|
||||
- logging.warning("Invalid period_time, set to 1s")
|
||||
+ disk_name = config.get("common", "disk")
|
||||
+ disk = [] if disk_name == "default" else disk_name.split(",")
|
||||
+ except configparser.NoOptionError:
|
||||
+ disk = []
|
||||
+ logging.warning("Unset disk, set to default")
|
||||
|
||||
- disk = common_sec.get('disk').split(",") if common_sec.get('disk') not in [None, 'default'] else []
|
||||
- stage = common_sec.get('stage').split(",") if common_sec.get('stage') not in [None, 'default'] else []
|
||||
+ try:
|
||||
+ stage_name = config.get("common", "stage")
|
||||
+ stage = [] if stage_name == "default" else stage_name.split(",")
|
||||
+ except configparser.NoOptionError:
|
||||
+ stage = []
|
||||
+ logging.warning("Unset stage, set to read,write")
|
||||
|
||||
if len(disk) > 10:
|
||||
logging.warning("Too many disks, record only max 10 disks")
|
||||
disk = disk[:10]
|
||||
|
||||
- iotype = common_sec.get('iotype', 'read,write').split(",")
|
||||
- iotype_list = [rw.lower() for rw in iotype if rw.lower() in ['read', 'write', 'flush', 'discard']]
|
||||
- err_iotype = [rw for rw in iotype if rw.lower() not in ['read', 'write', 'flush', 'discard']]
|
||||
+ try:
|
||||
+ iotype_name = config.get("common", "iotype").split(",")
|
||||
+ iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write', 'flush', 'discard']]
|
||||
+ err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write', 'flush', 'discard']]
|
||||
+
|
||||
+ if iotype_list in [None, []]:
|
||||
+ iotype_list = ["read", "write"]
|
||||
+ except configparser.NoOptionError:
|
||||
+ iotype = ["read", "write"]
|
||||
+ logging.warning("Unset iotype, set to default")
|
||||
|
||||
if err_iotype:
|
||||
logging.warning("{} in common.iotype are not valid, set iotype={}".format(err_iotype, iotype_list))
|
||||
+
|
||||
+
|
||||
+ try:
|
||||
+ period_time = int(config.get("common", "period_time"))
|
||||
+ if not (1 <= period_time <= 300):
|
||||
+ raise ValueError("Invalid period_time")
|
||||
+ except ValueError:
|
||||
+ period_time = 1
|
||||
+ logging.warning("Invalid period_time, set to 1s")
|
||||
+ except configparser.NoOptionError:
|
||||
+ period_time = 1
|
||||
+ logging.warning("Unset period_time, use 1s as default")
|
||||
|
||||
return period_time, disk, stage, iotype_list
|
||||
|
||||
@@ -68,11 +88,23 @@ def read_config_algorithm(config):
|
||||
win_size = int(config.get("algorithm", "win_size"))
|
||||
if not (1 <= win_size <= 300):
|
||||
raise ValueError("Invalid win_size")
|
||||
+ except ValueError:
|
||||
+ win_size = 30
|
||||
+ logging.warning("Invalid win_size, set to 30")
|
||||
+ except configparser.NoOptionError:
|
||||
+ win_size = 30
|
||||
+ logging.warning("Unset win_size, use 30 as default")
|
||||
+
|
||||
+ try:
|
||||
win_threshold = int(config.get("algorithm", "win_threshold"))
|
||||
if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size:
|
||||
raise ValueError("Invalid win_threshold")
|
||||
except ValueError:
|
||||
- report_alarm_fail("Invalid win_threshold or win_size")
|
||||
+ win_threshold = 6
|
||||
+ logging.warning("Invalid win_threshold, set to 6")
|
||||
+ except configparser.NoOptionError:
|
||||
+ win_threshold = 6
|
||||
+ logging.warning("Unset win_threshold, use 6 as default")
|
||||
|
||||
return win_size, win_threshold
|
||||
|
||||
@@ -80,6 +112,21 @@ def read_config_algorithm(config):
|
||||
def read_config_lat_iodump(io_dic, config):
|
||||
"""read config file, get [latency] [iodump] section value"""
|
||||
common_param = {}
|
||||
+ lat_sec = None
|
||||
+ if not config.has_section("latency"):
|
||||
+ logging.warning("Cannot find algorithm section in config file")
|
||||
+ else:
|
||||
+ lat_sec = config["latency"]
|
||||
+
|
||||
+ iodump_sec = None
|
||||
+ if not config.has_section("iodump"):
|
||||
+ logging.warning("Cannot find iodump section in config file")
|
||||
+ else:
|
||||
+ lat_sec = config["iodump"]
|
||||
+
|
||||
+ if not lat_sec and not iodump_sec:
|
||||
+ return common_param
|
||||
+
|
||||
for io_type in io_dic["iotype_list"]:
|
||||
common_param[io_type] = {}
|
||||
|
||||
@@ -90,13 +137,16 @@ def read_config_lat_iodump(io_dic, config):
|
||||
}
|
||||
iodump_key = "{}_iodump_lim".format(io_type)
|
||||
|
||||
+ if iodump_sec and iodump_key in iodump_sec and iodump_sec[iodump_key].isdecimal():
|
||||
+ common_param[io_type][iodump_key] = int(iodump_sec[iodump_key])
|
||||
+
|
||||
+ if not lat_sec:
|
||||
+ continue
|
||||
+
|
||||
for key_suffix, key_template in latency_keys.items():
|
||||
- if key_template in config["latency"] and config["latency"][key_template].isdecimal():
|
||||
- common_param[io_type][key_template] = int(config["latency"][key_template])
|
||||
+ if key_template in lat_sec and lat_sec[key_template].isdecimal():
|
||||
+ common_param[io_type][key_template] = int(lat_sec[key_template])
|
||||
|
||||
- if iodump_key in config["iodump"] and config["iodump"][iodump_key].isdecimal():
|
||||
- common_param[io_type][iodump_key] = int(config["iodump"][iodump_key])
|
||||
-
|
||||
return common_param
|
||||
|
||||
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -1,243 +0,0 @@
|
||||
From c9f62e01f09a56743ccc3e470f273875ab22ac5f Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
|
||||
Date: Wed, 9 Oct 2024 16:19:52 +0800
|
||||
Subject: [PATCH] fix config relative some issues
|
||||
|
||||
---
|
||||
.../sentryPlugins/ai_block_io/README.md | 1 -
|
||||
.../sentryPlugins/ai_block_io/ai_block_io.py | 21 +++++-----
|
||||
.../ai_block_io/config_parser.py | 42 +++++++++----------
|
||||
.../sentryPlugins/ai_block_io/detector.py | 2 +-
|
||||
.../ai_block_io/sliding_window.py | 8 ++--
|
||||
.../sentryPlugins/ai_block_io/threshold.py | 6 +--
|
||||
6 files changed, 39 insertions(+), 41 deletions(-)
|
||||
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/README.md b/src/python/sentryPlugins/ai_block_io/README.md
|
||||
index f9b8388..95c1111 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/README.md
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/README.md
|
||||
@@ -1,2 +1 @@
|
||||
# slow_io_detection
|
||||
-
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
index 31b8a97..3b00ef3 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
@@ -16,8 +16,7 @@ import logging
|
||||
from .detector import Detector
|
||||
from .threshold import ThresholdFactory, AbsoluteThreshold
|
||||
from .sliding_window import SlidingWindowFactory
|
||||
-from .utils import (get_threshold_type_enum, get_sliding_window_type_enum, get_data_queue_size_and_update_size,
|
||||
- get_log_level)
|
||||
+from .utils import get_data_queue_size_and_update_size
|
||||
from .config_parser import ConfigParser
|
||||
from .data_access import get_io_data_from_collect_plug, check_collect_valid
|
||||
from .io_data import MetricName
|
||||
@@ -45,25 +44,25 @@ class SlowIODetection:
|
||||
|
||||
def __init_detector_name_list(self):
|
||||
self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency())
|
||||
+ logging.info(f"ai_block_io plug has found disks: {self._disk_list}")
|
||||
disks_to_detection: list = self._config_parser.get_disks_to_detection()
|
||||
# 情况1:None,则启用所有磁盘检测
|
||||
# 情况2:is not None and len = 0,则不启动任何磁盘检测
|
||||
# 情况3:len != 0,则取交集
|
||||
if disks_to_detection is None:
|
||||
+ logging.warning("you not specify any disk or use default, so ai_block_io will enable all available disk.")
|
||||
for disk in self._disk_list:
|
||||
self._detector_name_list.append(MetricName(disk, "bio", "read", "latency"))
|
||||
self._detector_name_list.append(MetricName(disk, "bio", "write", "latency"))
|
||||
elif len(disks_to_detection) == 0:
|
||||
- logging.warning('please attention: conf file not specify any disk to detection, '
|
||||
- 'so it will not start ai block io.')
|
||||
+ logging.warning('please attention: conf file not specify any disk to detection, so it will not start ai block io.')
|
||||
else:
|
||||
- disks_name_to_detection = []
|
||||
- for disk_name_to_detection in disks_to_detection:
|
||||
- disks_name_to_detection.append(disk_name_to_detection.get_disk_name())
|
||||
- disk_intersection = [disk for disk in self._disk_list if disk in disks_name_to_detection]
|
||||
- for disk in disk_intersection:
|
||||
- self._detector_name_list.append(MetricName(disk, "bio", "read", "latency"))
|
||||
- self._detector_name_list.append(MetricName(disk, "bio", "write", "latency"))
|
||||
+ for disk_to_detection in disks_to_detection:
|
||||
+ if disk_to_detection in self._disk_list:
|
||||
+ self._detector_name_list.append(MetricName(disk_to_detection, "bio", "read", "latency"))
|
||||
+ self._detector_name_list.append(MetricName(disk_to_detection, "bio", "write", "latency"))
|
||||
+ else:
|
||||
+ logging.warning(f"disk:[{disk_to_detection}] not in available disk list, so it will be ignored.")
|
||||
logging.info(f'start to detection follow disk and it\'s metric: {self._detector_name_list}')
|
||||
|
||||
def __init_detector(self):
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
index 632391d..354c122 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
@@ -10,18 +10,19 @@
|
||||
# See the Mulan PSL v2 for more details.
|
||||
|
||||
import configparser
|
||||
-import json
|
||||
import logging
|
||||
|
||||
-from .io_data import MetricName
|
||||
from .threshold import ThresholdType
|
||||
from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_log_level
|
||||
|
||||
+
|
||||
LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
|
||||
|
||||
|
||||
def init_log_format(log_level: str):
|
||||
- logging.basicConfig(level=get_log_level(log_level), format=LOG_FORMAT)
|
||||
+ logging.basicConfig(level=get_log_level(log_level.lower()), format=LOG_FORMAT)
|
||||
+ if log_level.lower() not in ('info', 'warning', 'error', 'debug'):
|
||||
+ logging.warning(f'the log_level: {log_level} you set is invalid, use default value: info.')
|
||||
|
||||
|
||||
class ConfigParser:
|
||||
@@ -43,7 +44,7 @@ class ConfigParser:
|
||||
self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD
|
||||
self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY
|
||||
self.__log_level = ConfigParser.DEFAULT_LOG_LEVEL
|
||||
- self.__disks_to_detection: list = []
|
||||
+ self.__disks_to_detection = None
|
||||
|
||||
self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE
|
||||
self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION
|
||||
@@ -83,26 +84,20 @@ class ConfigParser:
|
||||
logging.warning(f'slow_io_detect_frequency type conversion has error, use default value: {self.__slow_io_detect_frequency}.')
|
||||
|
||||
def __read__disks_to_detect(self, items_common: dict):
|
||||
- disks_to_detection = items_common.get('disks_to_detect')
|
||||
+ disks_to_detection = items_common.get('disk')
|
||||
if disks_to_detection is None:
|
||||
- logging.warning(f'config of disks_to_detect not found, the default value be used.')
|
||||
+ logging.warning(f'config of disk not found, the default value will be used.')
|
||||
self.__disks_to_detection = None
|
||||
return
|
||||
- try:
|
||||
- disks_to_detection_list = json.loads(disks_to_detection)
|
||||
- for disk_to_detection in disks_to_detection_list:
|
||||
- disk_name = disk_to_detection.get('disk_name', None)
|
||||
- stage_name = disk_to_detection.get('stage_name', None)
|
||||
- io_access_type_name = disk_to_detection.get('io_access_type_name', None)
|
||||
- metric_name = disk_to_detection.get('metric_name', None)
|
||||
- if not (disk_name is None or stage_name is None or io_access_type_name is None or metric_name is None):
|
||||
- metric_name_object = MetricName(disk_name, stage_name, io_access_type_name, metric_name)
|
||||
- self.__disks_to_detection.append(metric_name_object)
|
||||
- else:
|
||||
- logging.warning(f'config of disks_to_detect\'s some part has some error: {disk_to_detection}, it will be ignored.')
|
||||
- except json.decoder.JSONDecodeError as e:
|
||||
- logging.warning(f'config of disks_to_detect is error: {e}, it will be ignored and default value be used.')
|
||||
+ disk_list = disks_to_detection.split(',')
|
||||
+ if len(disk_list) == 0 or (len(disk_list) == 1 and disk_list[0] == ''):
|
||||
+ logging.warning("you don't specify any disk.")
|
||||
+ self.__disks_to_detection = []
|
||||
+ return
|
||||
+ if len(disk_list) == 1 and disk_list[0] == 'default':
|
||||
self.__disks_to_detection = None
|
||||
+ return
|
||||
+ self.__disks_to_detection = disk_list
|
||||
|
||||
def __read__train_data_duration(self, items_algorithm: dict):
|
||||
try:
|
||||
@@ -189,7 +184,12 @@ class ConfigParser:
|
||||
|
||||
def read_config_from_file(self):
|
||||
con = configparser.ConfigParser()
|
||||
- con.read(self.__config_file_name, encoding='utf-8')
|
||||
+ try:
|
||||
+ con.read(self.__config_file_name, encoding='utf-8')
|
||||
+ except configparser.Error as e:
|
||||
+ init_log_format(self.__log_level)
|
||||
+ logging.critical(f'config file read error: {e}, ai_block_io plug will exit.')
|
||||
+ exit(1)
|
||||
|
||||
if con.has_section('common'):
|
||||
items_common = dict(con.items('common'))
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
index bcf62cb..a48144f 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
@@ -50,6 +50,6 @@ class Detector:
|
||||
|
||||
def __repr__(self):
|
||||
return (f'disk_name: {self._metric_name.get_disk_name()}, stage_name: {self._metric_name.get_stage_name()},'
|
||||
- f' access_type_name: {self._metric_name.get_io_access_type_name()},'
|
||||
+ f' io_type_name: {self._metric_name.get_io_access_type_name()},'
|
||||
f' metric_name: {self._metric_name.get_metric_name()}, threshold_type: {self._threshold},'
|
||||
f' sliding_window_type: {self._slidingWindow}')
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py
|
||||
index d395d48..89191e5 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/sliding_window.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py
|
||||
@@ -52,7 +52,7 @@ class SlidingWindow:
|
||||
return False, None, None
|
||||
|
||||
def __repr__(self):
|
||||
- return "SlidingWindow"
|
||||
+ return "[SlidingWindow]"
|
||||
|
||||
|
||||
class NotContinuousSlidingWindow(SlidingWindow):
|
||||
@@ -65,7 +65,7 @@ class NotContinuousSlidingWindow(SlidingWindow):
|
||||
return False, self._io_data_queue, self._ai_threshold
|
||||
|
||||
def __repr__(self):
|
||||
- return "NotContinuousSlidingWindow"
|
||||
+ return f"[NotContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]"
|
||||
|
||||
|
||||
class ContinuousSlidingWindow(SlidingWindow):
|
||||
@@ -84,7 +84,7 @@ class ContinuousSlidingWindow(SlidingWindow):
|
||||
return False, self._io_data_queue, self._ai_threshold
|
||||
|
||||
def __repr__(self):
|
||||
- return "ContinuousSlidingWindow"
|
||||
+ return f"[ContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]"
|
||||
|
||||
|
||||
class MedianSlidingWindow(SlidingWindow):
|
||||
@@ -98,7 +98,7 @@ class MedianSlidingWindow(SlidingWindow):
|
||||
return False, self._io_data_queue, self._ai_threshold
|
||||
|
||||
def __repr__(self):
|
||||
- return "MedianSlidingWindow"
|
||||
+ return f"[MedianSlidingWindow, window size: {self._queue_length}]"
|
||||
|
||||
|
||||
class SlidingWindowFactory:
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/threshold.py b/src/python/sentryPlugins/ai_block_io/threshold.py
|
||||
index ff85d85..3b7a5a8 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/threshold.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/threshold.py
|
||||
@@ -75,7 +75,7 @@ class AbsoluteThreshold(Threshold):
|
||||
pass
|
||||
|
||||
def __repr__(self):
|
||||
- return "AbsoluteThreshold"
|
||||
+ return "[AbsoluteThreshold]"
|
||||
|
||||
|
||||
class BoxplotThreshold(Threshold):
|
||||
@@ -109,7 +109,7 @@ class BoxplotThreshold(Threshold):
|
||||
self.new_data_size = 0
|
||||
|
||||
def __repr__(self):
|
||||
- return "BoxplotThreshold"
|
||||
+ return f"[BoxplotThreshold, param is: {self.parameter}]"
|
||||
|
||||
|
||||
class NSigmaThreshold(Threshold):
|
||||
@@ -142,7 +142,7 @@ class NSigmaThreshold(Threshold):
|
||||
self.new_data_size = 0
|
||||
|
||||
def __repr__(self):
|
||||
- return "NSigmaThreshold"
|
||||
+ return f"[NSigmaThreshold, param is: {self.parameter}]"
|
||||
|
||||
|
||||
class ThresholdType(Enum):
|
||||
--
|
||||
2.23.0
|
||||
|
||||
@ -1,37 +0,0 @@
|
||||
From 65ceade489c4018c3f315104d70be0550a28d9d9 Mon Sep 17 00:00:00 2001
|
||||
From: shixuantong <shixuantong1@huawei.com>
|
||||
Date: Wed, 11 Sep 2024 10:23:41 +0800
|
||||
Subject: [PATCH] fix configparser.InterpolationSyntaxError
|
||||
|
||||
---
|
||||
src/python/syssentry/sentry_config.py | 8 ++++++--
|
||||
1 file changed, 6 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/python/syssentry/sentry_config.py b/src/python/syssentry/sentry_config.py
|
||||
index 01f3df8..a0e7b79 100644
|
||||
--- a/src/python/syssentry/sentry_config.py
|
||||
+++ b/src/python/syssentry/sentry_config.py
|
||||
@@ -103,14 +103,18 @@ class CpuPluginsParamsConfig:
|
||||
"""read config file"""
|
||||
config_param_section_args = {}
|
||||
if os.path.exists(self.config_file):
|
||||
- self.config.read(self.config_file)
|
||||
try:
|
||||
+ self.config.read(self.config_file)
|
||||
config_param_section_args = dict(self.config[self.param_section_name])
|
||||
- except (ValueError, KeyError):
|
||||
+ except (ValueError, KeyError, configparser.InterpolationSyntaxError):
|
||||
config_param_section_args = {}
|
||||
+ logging.error("Failed to parse cpu_sentry.ini!")
|
||||
return config_param_section_args
|
||||
|
||||
def join_cpu_start_cmd(self, cpu_param_dict: dict) -> str:
|
||||
+ if not cpu_param_dict:
|
||||
+ return ""
|
||||
+
|
||||
cpu_list = cpu_param_dict.get("cpu_list", "default")
|
||||
if cpu_list == "default":
|
||||
cpu_list = CpuPluginsParamsConfig.get_cpu_info()
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,25 +0,0 @@
|
||||
From 370b22b032dce9290eebca1cf8d48bd155164b6a Mon Sep 17 00:00:00 2001
|
||||
From: shixuantong <shixuantong1@huawei.com>
|
||||
Date: Wed, 24 Jul 2024 17:53:58 +0800
|
||||
Subject: [PATCH] fix error handling
|
||||
|
||||
---
|
||||
src/python/syssentry/cpu_sentry.py | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py
|
||||
index 3c4d58d..d0bafa8 100644
|
||||
--- a/src/python/syssentry/cpu_sentry.py
|
||||
+++ b/src/python/syssentry/cpu_sentry.py
|
||||
@@ -87,7 +87,7 @@ class CpuSentry:
|
||||
}
|
||||
|
||||
def handle_cpu_output(self, stdout: str):
|
||||
- if "<ERROR>" in stdout:
|
||||
+ if "ERROR" in stdout:
|
||||
self.send_result["result"] = ResultLevel.FAIL
|
||||
self.send_result["details"]["code"] = 1004
|
||||
self.send_result["details"]["msg"] = stdout.split("\n")[0]
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,41 +0,0 @@
|
||||
From 815537382fc0d5164fe57b0d984ca4a1ed8254ea Mon Sep 17 00:00:00 2001
|
||||
From: jinsaihang <jinsaihang@h-partners.com>
|
||||
Date: Thu, 31 Oct 2024 16:00:50 +0800
|
||||
Subject: [PATCH] excessive CPU usage
|
||||
|
||||
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
|
||||
---
|
||||
sysSentry-1.0.2/src/python/xalarm/xalarm_transfer.py | 3 ---
|
||||
1 file changed, 3 deletions(-)
|
||||
|
||||
diff --git a/src/python/xalarm/xalarm_transfer.py b/src/python/xalarm/xalarm_transfer.py
|
||||
index b072007..4bebe5d 100644
|
||||
--- a/src/python/xalarm/xalarm_transfer.py
|
||||
+++ b/src/python/xalarm/xalarm_transfer.py
|
||||
@@ -62,7 +62,6 @@ def cleanup_closed_connections(server_sock, epoll, fd_to_socket):
|
||||
to_remove.append(fileno)
|
||||
|
||||
for fileno in to_remove:
|
||||
- epoll.unregister(fileno)
|
||||
fd_to_socket[fileno].close()
|
||||
del fd_to_socket[fileno]
|
||||
logging.info(f"cleaned up connection {fileno} for client lost connection.")
|
||||
@@ -97,7 +96,6 @@ def wait_for_connection(server_sock, epoll, fd_to_socket, thread_should_stop):
|
||||
logging.info(f"connection reach max num of {MAX_CONNECTION_NUM}, closed current connection!")
|
||||
connection.close()
|
||||
continue
|
||||
- epoll.register(connection.fileno(), select.EPOLLOUT)
|
||||
fd_to_socket[connection.fileno()] = connection
|
||||
except socket.error as e:
|
||||
logging.debug(f"socket error, reason is {e}")
|
||||
@@ -122,7 +120,6 @@ def transmit_alarm(server_sock, epoll, fd_to_socket, bin_data):
|
||||
except (BrokenPipeError, ConnectionResetError):
|
||||
to_remove.append(fileno)
|
||||
for fileno in to_remove:
|
||||
- epoll.unregister(fileno)
|
||||
fd_to_socket[fileno].close()
|
||||
del fd_to_socket[fileno]
|
||||
logging.info(f"cleaned up connection {fileno} for client lost connection.")
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,70 +0,0 @@
|
||||
From a06ad0c944b093a71f49cc9fccd5097c1493ca5e Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
|
||||
Date: Mon, 21 Oct 2024 17:31:32 +0800
|
||||
Subject: [PATCH] fix frequency param check bug
|
||||
|
||||
---
|
||||
.../sentryPlugins/ai_block_io/config_parser.py | 13 +++++++++++--
|
||||
.../sentryPlugins/ai_block_io/data_access.py | 14 ++++++++++++++
|
||||
2 files changed, 25 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
index 447eccd..274a31e 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
@@ -16,6 +16,7 @@ import logging
|
||||
from .alarm_report import Report
|
||||
from .threshold import ThresholdType
|
||||
from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_log_level
|
||||
+from .data_access import check_detect_frequency_is_valid
|
||||
|
||||
|
||||
LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
|
||||
@@ -165,9 +166,17 @@ class ConfigParser:
|
||||
"slow_io_detect_frequency",
|
||||
int,
|
||||
self.DEFAULT_CONF["common"]["slow_io_detect_frequency"],
|
||||
- gt=0,
|
||||
- le=300,
|
||||
+ gt=0
|
||||
)
|
||||
+ frequency = self._conf["common"]["slow_io_detect_frequency"]
|
||||
+ ret = check_detect_frequency_is_valid(frequency)
|
||||
+ if ret is None:
|
||||
+ log = f"slow io detect frequency: {frequency} is valid, "\
|
||||
+ f"Check whether the value range is too large or is not an "\
|
||||
+ f"integer multiple of period_time.. exiting..."
|
||||
+ Report.report_pass(log)
|
||||
+ logging.critical(log)
|
||||
+ exit(1)
|
||||
|
||||
def _read_disks_to_detect(self, items_common: dict):
|
||||
disks_to_detection = items_common.get("disk")
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py
|
||||
index 1bc5ed8..e4869d5 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/data_access.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/data_access.py
|
||||
@@ -53,6 +53,20 @@ def check_collect_valid(period):
|
||||
return None
|
||||
|
||||
|
||||
+def check_detect_frequency_is_valid(period):
|
||||
+ data_raw = is_iocollect_valid(period)
|
||||
+ if data_raw["ret"] == 0:
|
||||
+ try:
|
||||
+ data = json.loads(data_raw["message"])
|
||||
+ except Exception as e:
|
||||
+ return None
|
||||
+ if not data:
|
||||
+ return None
|
||||
+ return [k for k in data.keys()]
|
||||
+ else:
|
||||
+ return None
|
||||
+
|
||||
+
|
||||
def _get_raw_data(period, disk_list):
|
||||
return get_io_data(
|
||||
period,
|
||||
--
|
||||
2.23.0
|
||||
|
||||
@ -1,36 +0,0 @@
|
||||
From 8f28a40ffd7dc7aa969a7bfc0a170ed0c8f03bce Mon Sep 17 00:00:00 2001
|
||||
From: jinsaihang <jinsaihang@h-partners.com>
|
||||
Date: Tue, 22 Oct 2024 20:28:59 +0800
|
||||
Subject: [PATCH] fix get_alarm error
|
||||
|
||||
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
|
||||
---
|
||||
sysSentry-1.0.2/src/python/syssentry/alarm.py | 5 +++--
|
||||
1 file changed, 3 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
|
||||
index c3f2ee1..2575307 100644
|
||||
--- a/src/python/syssentry/alarm.py
|
||||
+++ b/src/python/syssentry/alarm.py
|
||||
@@ -139,8 +139,6 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di
|
||||
return []
|
||||
alarm_id = task_alarm_id_dict[task_name]
|
||||
clear_time = alarm_id_clear_time_dict[alarm_id]
|
||||
- if clear_time < int(time_range):
|
||||
- return []
|
||||
if alarm_id not in alarm_list_dict:
|
||||
logging.debug("alarm_id does not exist")
|
||||
return []
|
||||
@@ -154,6 +152,9 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di
|
||||
if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > time_range:
|
||||
stop_index = i
|
||||
break
|
||||
+ if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > clear_time:
|
||||
+ stop_index = i
|
||||
+ break
|
||||
if stop_index >= 0:
|
||||
alarm_list = alarm_list[:stop_index]
|
||||
logging.debug(f"get_alarm_result: final alarm_list of {alarm_id} has {len(alarm_list)} elements")
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,508 +0,0 @@
|
||||
From 85d6dae9d7c6148f2699ef7da7d2d784043a2ee1 Mon Sep 17 00:00:00 2001
|
||||
From: luckky <guodashun1@huawei.com>
|
||||
Date: Wed, 30 Oct 2024 10:41:11 +0800
|
||||
Subject: [PATCH] fix hbm online repair notice and efi create
|
||||
|
||||
---
|
||||
src/c/hbm_online_repair/hbm_online_repair.c | 5 +-
|
||||
.../non-standard-hbm-repair.c | 194 +++++++++---------
|
||||
.../non-standard-hbm-repair.h | 2 +-
|
||||
src/c/hbm_online_repair/ras-events.c | 1 -
|
||||
.../ras-non-standard-handler.c | 33 +--
|
||||
.../ras-non-standard-handler.h | 1 +
|
||||
6 files changed, 116 insertions(+), 120 deletions(-)
|
||||
|
||||
diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c
|
||||
index 3ace206..b3b2742 100644
|
||||
--- a/src/c/hbm_online_repair/hbm_online_repair.c
|
||||
+++ b/src/c/hbm_online_repair/hbm_online_repair.c
|
||||
@@ -127,10 +127,7 @@ int main(int argc, char *argv[])
|
||||
return -1;
|
||||
}
|
||||
|
||||
- ret = init_all_flash();
|
||||
- if (ret < 0) {
|
||||
- log(LOG_ERROR, "flash writer init failed\n");
|
||||
- }
|
||||
+ get_flash_total_size();
|
||||
|
||||
handle_ras_events(ras);
|
||||
|
||||
diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c
|
||||
index b175e14..f26d8ae 100644
|
||||
--- a/src/c/hbm_online_repair/non-standard-hbm-repair.c
|
||||
+++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c
|
||||
@@ -15,7 +15,7 @@
|
||||
#include "non-standard-hbm-repair.h"
|
||||
|
||||
extern int page_isolation_threshold;
|
||||
-size_t total_size = 0;
|
||||
+size_t flash_total_size = 0;
|
||||
struct hisi_common_error_section {
|
||||
uint32_t val_bits;
|
||||
uint8_t version;
|
||||
@@ -122,28 +122,58 @@ static void parse_fault_addr_info(struct fault_addr_info* info_struct, unsigned
|
||||
info_struct->crc8 = (uint32_t)fault_addr;
|
||||
}
|
||||
|
||||
-static bool variable_existed(char *name, char *guid)
|
||||
+static bool is_variable_existing(char *name, char *guid)
|
||||
{
|
||||
+ char filename[PATH_MAX];
|
||||
+ snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid);
|
||||
+
|
||||
+ return access(filename, F_OK | R_OK) == 0;
|
||||
+}
|
||||
+
|
||||
+static size_t get_var_size(char *name, char *guid) {
|
||||
char filename[PATH_MAX];
|
||||
int fd;
|
||||
+ struct stat stat;
|
||||
|
||||
snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid);
|
||||
|
||||
// open var file
|
||||
fd = open(filename, O_RDONLY);
|
||||
if (fd < 0) {
|
||||
- log(LOG_WARNING, "open file %s failed\n", filename);
|
||||
- return false;
|
||||
+ log(LOG_WARNING, "open %s failed\n", filename);
|
||||
+ goto err;
|
||||
+ }
|
||||
+ // read stat
|
||||
+ if (fstat(fd, &stat) != 0) {
|
||||
+ log(LOG_WARNING, "fstat %s failed\n", filename);
|
||||
+ goto err;
|
||||
}
|
||||
close(fd);
|
||||
- return true;
|
||||
+ return stat.st_size;
|
||||
+err:
|
||||
+ if (fd >= 0)
|
||||
+ close(fd);
|
||||
+ return (size_t)-1;
|
||||
}
|
||||
|
||||
-static uint32_t read_variable_attribute(char *name, char *guid) {
|
||||
+void get_flash_total_size() {
|
||||
+ for (int i = 0; i < FLASH_ENTRY_NUM; i++) {
|
||||
+ if (is_variable_existing(flash_names[i], flash_guids[i])) {
|
||||
+ flash_total_size += get_var_size(flash_names[i], flash_guids[i]);
|
||||
+ }
|
||||
+ }
|
||||
+ // check total entry size
|
||||
+ log(LOG_DEBUG, "current fault info total size: %luKB, flash max threshold: %uKB\n",
|
||||
+ flash_total_size / KB_SIZE, MAX_VAR_SIZE / KB_SIZE);
|
||||
+ if (flash_total_size > MAX_VAR_SIZE) {
|
||||
+ log(LOG_WARNING, "fault info storage %zu reach threshold, cannot save new record\n", flash_total_size);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static int read_variable_attribute(char *name, char *guid, uint32_t *attribute) {
|
||||
char filename[PATH_MAX];
|
||||
int fd;
|
||||
size_t readsize;
|
||||
- uint32_t attribute = (uint32_t)-1;
|
||||
|
||||
snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid);
|
||||
|
||||
@@ -151,17 +181,18 @@ static uint32_t read_variable_attribute(char *name, char *guid) {
|
||||
fd = open(filename, O_RDONLY);
|
||||
if (fd < 0) {
|
||||
log(LOG_ERROR, "open %s failed\n", filename);
|
||||
- return attribute;
|
||||
+ return -1;
|
||||
}
|
||||
|
||||
// read attributes from first 4 bytes
|
||||
- readsize = read(fd, &attribute, sizeof(uint32_t));
|
||||
+ readsize = read(fd, attribute, sizeof(uint32_t));
|
||||
if (readsize != sizeof(uint32_t)) {
|
||||
log(LOG_ERROR, "read attribute of %s failed\n", filename);
|
||||
+ return -1;
|
||||
}
|
||||
|
||||
close(fd);
|
||||
- return attribute;
|
||||
+ return 0;
|
||||
}
|
||||
|
||||
static int efivarfs_set_mutable(char *name, char *guid, bool mutable)
|
||||
@@ -205,8 +236,8 @@ err:
|
||||
return -1;
|
||||
}
|
||||
|
||||
-static int write_variable(char *name, char *guid, void *value, unsigned long size, uint32_t attribute) {
|
||||
- int fd, mode;
|
||||
+static int write_variable(char *name, char *guid, void *value, unsigned long size, uint32_t attribute, bool is_existing) {
|
||||
+ int fd = -1, mode;
|
||||
size_t writesize;
|
||||
void *buffer;
|
||||
unsigned long total;
|
||||
@@ -225,16 +256,13 @@ static int write_variable(char *name, char *guid, void *value, unsigned long siz
|
||||
memcpy(buffer + sizeof(uint32_t), value, size);
|
||||
|
||||
// change attr
|
||||
- if (efivarfs_set_mutable(name, guid, 1) != 0) {
|
||||
+ if (is_existing && efivarfs_set_mutable(name, guid, 1) != 0) {
|
||||
log(LOG_ERROR, "set mutable for %s failed\n", filename);
|
||||
goto err;
|
||||
}
|
||||
|
||||
mode = O_WRONLY;
|
||||
- if (attribute & EFI_VARIABLE_APPEND_WRITE)
|
||||
- mode |= O_APPEND;
|
||||
- else
|
||||
- mode |= O_CREAT;
|
||||
+ mode |= is_existing ? O_APPEND : O_CREAT;
|
||||
|
||||
// open var file
|
||||
fd = open(filename, mode, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
|
||||
@@ -252,7 +280,7 @@ static int write_variable(char *name, char *guid, void *value, unsigned long siz
|
||||
|
||||
close(fd);
|
||||
free(buffer);
|
||||
- if (efivarfs_set_mutable(name, guid, 0) != 0) {
|
||||
+ if (is_existing && efivarfs_set_mutable(name, guid, 0) != 0) {
|
||||
log(LOG_ERROR, "set immutable for %s failed\n", filename);
|
||||
}
|
||||
return 0;
|
||||
@@ -261,86 +289,21 @@ err:
|
||||
close(fd);
|
||||
if (buffer)
|
||||
free(buffer);
|
||||
- if (efivarfs_set_mutable(name, guid, 0) != 0) {
|
||||
+ if (is_existing && efivarfs_set_mutable(name, guid, 0) != 0) {
|
||||
log(LOG_ERROR, "set immutable for %s failed\n", filename);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
-static int append_variable(char *name, char *guid, void *data, unsigned long size) {
|
||||
- // prepare append attribute
|
||||
- uint32_t attribute = read_variable_attribute(name, guid);
|
||||
- if (attribute == (uint32_t)-1) {
|
||||
- log(LOG_ERROR, "read %s-%s attribute failed\n", name, guid);
|
||||
- return -1;
|
||||
- }
|
||||
- attribute |= EFI_VARIABLE_APPEND_WRITE;
|
||||
-
|
||||
- return write_variable(name, guid, data, size, attribute);
|
||||
-}
|
||||
-
|
||||
-static size_t get_var_size(char *name, char *guid) {
|
||||
- char filename[PATH_MAX];
|
||||
- int fd;
|
||||
- struct stat stat;
|
||||
-
|
||||
- snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid);
|
||||
-
|
||||
- // open var file
|
||||
- fd = open(filename, O_RDONLY);
|
||||
- if (fd < 0) {
|
||||
- log(LOG_WARNING, "open %s failed\n", filename);
|
||||
- goto err;
|
||||
- }
|
||||
- // read stat
|
||||
- if (fstat(fd, &stat) != 0) {
|
||||
- log(LOG_WARNING, "fstat %s failed\n", filename);
|
||||
- goto err;
|
||||
- }
|
||||
- close(fd);
|
||||
- return stat.st_size;
|
||||
-err:
|
||||
- if (fd >= 0)
|
||||
- close(fd);
|
||||
- return (size_t)-1;
|
||||
-}
|
||||
-
|
||||
-int init_all_flash() {
|
||||
- for (int i = 0; i < FLASH_ENTRY_NUM; i++) {
|
||||
- // check existed entry
|
||||
- if (variable_existed(flash_names[i], flash_guids[i])) {
|
||||
- total_size += get_var_size(flash_names[i], flash_guids[i]);
|
||||
- continue;
|
||||
- }
|
||||
- // create new entry
|
||||
- uint32_t attribute = EFI_VARIABLE_NON_VOLATILE |
|
||||
- EFI_VARIABLE_BOOTSERVICE_ACCESS |
|
||||
- EFI_VARIABLE_RUNTIME_ACCESS;
|
||||
- char *data = "";
|
||||
- unsigned long size = 1;
|
||||
- int ret = write_variable(flash_names[i], flash_guids[i], data, size, attribute);
|
||||
- if (ret) {
|
||||
- log(LOG_ERROR, "init %s-%s failed, fault info storage funtion not enabled\n", flash_names[i], flash_guids[i]);
|
||||
- return -1;
|
||||
- }
|
||||
- total_size += sizeof(uint32_t) + 1;
|
||||
- }
|
||||
- // check total entry size
|
||||
- log(LOG_DEBUG, "current fault info total size: %luKB, flash max threshold: %uKB\n",
|
||||
- total_size / KB_SIZE, MAX_VAR_SIZE / KB_SIZE);
|
||||
- if (total_size > MAX_VAR_SIZE) {
|
||||
- log(LOG_ERROR, "fault info storage reach threshold, cannot save new record\n");
|
||||
- }
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
static int write_fault_info_to_flash(const struct hisi_common_error_section *err) {
|
||||
int ret, guid_index;
|
||||
uint32_t reg_size;
|
||||
uint64_t fault_addr;
|
||||
+ bool is_existing;
|
||||
+ uint32_t attribute = -1;
|
||||
|
||||
// check flash usage threshold
|
||||
- if (total_size + sizeof(uint64_t) > MAX_VAR_SIZE) {
|
||||
+ if (flash_total_size + sizeof(uint64_t) > MAX_VAR_SIZE) {
|
||||
log(LOG_WARNING, "fault info storage reach threshold, cannot save new record into flash\n");
|
||||
return -1;
|
||||
}
|
||||
@@ -359,14 +322,29 @@ static int write_fault_info_to_flash(const struct hisi_common_error_section *err
|
||||
log(LOG_ERROR, "invalid fault info\n");
|
||||
return -1;
|
||||
}
|
||||
+
|
||||
+ // judge if the efivar is existing to set the attribute
|
||||
+ is_existing = is_variable_existing(flash_names[guid_index], flash_guids[guid_index]);
|
||||
+ attribute = EFI_VARIABLE_NON_VOLATILE |
|
||||
+ EFI_VARIABLE_BOOTSERVICE_ACCESS |
|
||||
+ EFI_VARIABLE_RUNTIME_ACCESS;
|
||||
+ if (is_existing) {
|
||||
+ ret = read_variable_attribute(flash_names[guid_index], flash_guids[guid_index], &attribute);
|
||||
+ if (ret < 0) {
|
||||
+ log(LOG_ERROR, "read variable %s-%s attribute failed, stop writing\n", flash_names[guid_index], flash_guids[guid_index]);
|
||||
+ return -1;
|
||||
+ }
|
||||
+ attribute |= EFI_VARIABLE_APPEND_WRITE;
|
||||
+ }
|
||||
+
|
||||
// record physical addr in flash
|
||||
- ret = append_variable(flash_names[guid_index], flash_guids[guid_index], &fault_addr, sizeof(uint64_t));
|
||||
+ ret = write_variable(flash_names[guid_index], flash_guids[guid_index], &fault_addr, sizeof(uint64_t), attribute, is_existing);
|
||||
if (ret < 0) {
|
||||
- log(LOG_ERROR, "append to %s-%s failed\n", flash_names[guid_index], flash_guids[guid_index]);
|
||||
+ log(LOG_ERROR, "write to %s-%s failed\n", flash_names[guid_index], flash_guids[guid_index]);
|
||||
return -1;
|
||||
}
|
||||
- total_size += sizeof(uint64_t);
|
||||
- log(LOG_INFO, "write hbm fault info to flash success\n");
|
||||
+ flash_total_size += sizeof(uint64_t);
|
||||
+ log(LOG_INFO, "write hbm fault info to flash %s-%s success\n", flash_names[guid_index], flash_guids[guid_index]);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -421,7 +399,7 @@ static int get_hardware_corrupted_size()
|
||||
return hardware_corrupted_size;
|
||||
}
|
||||
|
||||
-static uint8_t get_repair_result_code(int ret)
|
||||
+static uint8_t get_repair_failed_result_code(int ret)
|
||||
{
|
||||
if (ret == -ENOSPC) {
|
||||
return REPAIR_FAILED_NO_RESOURCE;
|
||||
@@ -582,11 +560,11 @@ static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err)
|
||||
static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr)
|
||||
{
|
||||
int ret;
|
||||
- if (repair_ret < 0) {
|
||||
+ if (repair_ret <= 0) {
|
||||
log(LOG_WARNING, "HBM %s: Keep page (0x%llx) offline\n", is_acls ? "ACLS" : "SPPR", paddr);
|
||||
/* not much we can do about errors here */
|
||||
(void)write_file("/sys/kernel/page_eject", "remove_page", paddr);
|
||||
- return get_repair_result_code(repair_ret);
|
||||
+ return get_repair_failed_result_code(repair_ret);
|
||||
}
|
||||
|
||||
ret = write_file("/sys/kernel/page_eject", "online_page", paddr);
|
||||
@@ -615,9 +593,13 @@ static uint8_t hbmc_hbm_repair(const struct hisi_common_error_section *err, char
|
||||
err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_PSUE_ACLS;
|
||||
|
||||
ret = write_file(path, is_acls ? "acls_query" : "sppr_query", paddr);
|
||||
- if (ret < 0) {
|
||||
- notice_BMC(err, get_repair_result_code(ret));
|
||||
- log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR");
|
||||
+
|
||||
+ /* Only positive num means the error is supported to repair */
|
||||
+ if (ret <= 0) {
|
||||
+ if (ret != -ENXIO) {
|
||||
+ notice_BMC(err, get_repair_failed_result_code(ret));
|
||||
+ log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR");
|
||||
+ }
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -642,8 +624,9 @@ static uint8_t hbmc_hbm_repair(const struct hisi_common_error_section *err, char
|
||||
all_online_success = false;
|
||||
}
|
||||
}
|
||||
- if (ret < 0) {
|
||||
- notice_BMC(err, get_repair_result_code(ret));
|
||||
+ /* The ret is from the acls/sppr repair, and only positive num means the error is repaired successfully */
|
||||
+ if (ret <= 0) {
|
||||
+ notice_BMC(err, get_repair_failed_result_code(ret));
|
||||
return ret;
|
||||
} else if (all_online_success) {
|
||||
notice_BMC(err, ISOLATE_REPAIR_ONLINE_SUCCESS);
|
||||
@@ -698,7 +681,7 @@ static void hbm_repair_handler(const struct hisi_common_error_section *err)
|
||||
struct dirent *dent;
|
||||
DIR *dir;
|
||||
int ret;
|
||||
- bool find_device = false, find_hbm_mem = false;
|
||||
+ bool find_device = false, find_hbm_mem = false, addr_in_hbm_device = false;
|
||||
|
||||
ret = hbmc_hbm_page_isolate(err);
|
||||
if (ret < 0) {
|
||||
@@ -723,10 +706,13 @@ static void hbm_repair_handler(const struct hisi_common_error_section *err)
|
||||
if (hbmc_get_memory_type(path) == HBM_HBM_MEMORY) {
|
||||
find_hbm_mem = true;
|
||||
ret = hbmc_hbm_repair(err, path);
|
||||
- if (ret != -ENXIO)
|
||||
+ if (ret != -ENXIO) {
|
||||
+ addr_in_hbm_device = true;
|
||||
break;
|
||||
+ }
|
||||
}
|
||||
}
|
||||
+
|
||||
if (!find_device) {
|
||||
log(LOG_ERROR, "Repair driver is not loaded, skip error, error_type is %u\n",
|
||||
err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK);
|
||||
@@ -735,6 +721,10 @@ static void hbm_repair_handler(const struct hisi_common_error_section *err)
|
||||
log(LOG_ERROR, "No HBM device memory type found, skip error, error_type is %u\n",
|
||||
err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK);
|
||||
notice_BMC(err, REPAIR_FAILED_OTHER_REASON);
|
||||
+ } else if (!addr_in_hbm_device) {
|
||||
+ log(LOG_ERROR, "Err addr is not in device, skip error, error_type is %u\n",
|
||||
+ err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK);
|
||||
+ notice_BMC(err, REPAIR_FAILED_INVALID_PARAM);
|
||||
}
|
||||
|
||||
closedir(dir);
|
||||
@@ -769,7 +759,7 @@ static bool hbm_repair_validate(const struct hisi_common_error_section *err)
|
||||
(err->reg_array_size == HBM_CACHE_ARRAY_SIZE);
|
||||
|
||||
if (!(is_acls_valid || is_sppr_valid || is_cache_mode)) {
|
||||
- log(LOG_DEBUG, "err type (%u) is unknown or address array length (%u) is invalid\n",
|
||||
+ log(LOG_WARNING, "err type (%u) is unknown or address array length (%u) is invalid\n",
|
||||
hbm_repair_reg_type, err->reg_array_size);
|
||||
return false;
|
||||
}
|
||||
diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.h b/src/c/hbm_online_repair/non-standard-hbm-repair.h
|
||||
index 7e8e448..ecb04fe 100644
|
||||
--- a/src/c/hbm_online_repair/non-standard-hbm-repair.h
|
||||
+++ b/src/c/hbm_online_repair/non-standard-hbm-repair.h
|
||||
@@ -84,6 +84,6 @@
|
||||
#define FLASH_ENTRY_NUM 8
|
||||
#define KB_SIZE 1024
|
||||
|
||||
-extern int init_all_flash();
|
||||
+extern void get_flash_total_size();
|
||||
|
||||
#endif
|
||||
diff --git a/src/c/hbm_online_repair/ras-events.c b/src/c/hbm_online_repair/ras-events.c
|
||||
index 0b12329..4d281ad 100644
|
||||
--- a/src/c/hbm_online_repair/ras-events.c
|
||||
+++ b/src/c/hbm_online_repair/ras-events.c
|
||||
@@ -348,7 +348,6 @@ static int read_ras_event_all_cpus(struct pcpu_data *pdata,
|
||||
"Error on CPU %i\n", i);
|
||||
warnonce[i]++;
|
||||
}
|
||||
- continue;
|
||||
}
|
||||
if (!(fds[i].revents & POLLIN)) {
|
||||
count_nready++;
|
||||
diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.c b/src/c/hbm_online_repair/ras-non-standard-handler.c
|
||||
index 1d1fd04..48ffa70 100644
|
||||
--- a/src/c/hbm_online_repair/ras-non-standard-handler.c
|
||||
+++ b/src/c/hbm_online_repair/ras-non-standard-handler.c
|
||||
@@ -7,17 +7,21 @@
|
||||
#include "ras-non-standard-handler.h"
|
||||
#include "logger.h"
|
||||
|
||||
-static char *uuid_le(const char *uu)
|
||||
+static int uuid_le(const char *uu, char* uuid)
|
||||
{
|
||||
- static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")];
|
||||
if (!uu) {
|
||||
log(LOG_ERROR, "uuid_le failed: uu is empty");
|
||||
- return uuid;
|
||||
+ return -1;
|
||||
}
|
||||
size_t uu_len = strlen(uu);
|
||||
- if (uu_len < SECTION_TYPE_UUID_LEN) {
|
||||
- log(LOG_ERROR, "uuid_le failed: uu is too short");
|
||||
- return uuid;
|
||||
+ if (uu_len != SECTION_TYPE_UUID_LEN) {
|
||||
+ log(LOG_ERROR, "uuid_le failed: uu len is incorrect");
|
||||
+ return -1;
|
||||
+ }
|
||||
+ size_t uuid_len = strlen(uuid);
|
||||
+ if (uuid_len != strlen(UUID_STR_TYPE)) {
|
||||
+ log(LOG_ERROR, "uuid_le failed: uuid len is incorrect");
|
||||
+ return -1;
|
||||
}
|
||||
|
||||
char *p = uuid;
|
||||
@@ -38,7 +42,7 @@ static char *uuid_le(const char *uu)
|
||||
|
||||
*p = 0;
|
||||
|
||||
- return uuid;
|
||||
+ return 0;
|
||||
}
|
||||
|
||||
int ras_non_standard_event_handler(struct trace_seq *s,
|
||||
@@ -52,15 +56,20 @@ int ras_non_standard_event_handler(struct trace_seq *s,
|
||||
ev.sec_type = tep_get_field_raw(s, event, "sec_type",
|
||||
record, &len, 1);
|
||||
if(!ev.sec_type) {
|
||||
- log(LOG_WARNING, "get event section type failed");
|
||||
+ log(LOG_WARNING, "get event section type failed\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
trace_seq_printf(s, "\n");
|
||||
- trace_seq_printf(s, "sec_type: %s\n", uuid_le(ev.sec_type));
|
||||
+ char uuid[sizeof(UUID_STR_TYPE)] = UUID_STR_TYPE;
|
||||
+ if (uuid_le(ev.sec_type, uuid) < 0) {
|
||||
+ log(LOG_WARNING, "get uuid failed\n");
|
||||
+ return -1;
|
||||
+ }
|
||||
+ trace_seq_printf(s, "sec_type: %s\n", uuid);
|
||||
|
||||
if (tep_get_field_val(s, event, "len", record, &val, 1) < 0) {
|
||||
- log(LOG_WARNING, "tep get field val failed");
|
||||
+ log(LOG_WARNING, "tep get field val failed\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -69,11 +78,11 @@ int ras_non_standard_event_handler(struct trace_seq *s,
|
||||
|
||||
ev.error = tep_get_field_raw(s, event, "buf", record, &len, 1);
|
||||
if(!ev.error || ev.length != len) {
|
||||
- log(LOG_WARNING, "get event error failed");
|
||||
+ log(LOG_WARNING, "get event error failed\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
- if (strcmp(uuid_le(ev.sec_type), HISI_COMMON_SECTION_TYPE_UUID) == 0) {
|
||||
+ if (strcmp(uuid, HISI_COMMON_SECTION_TYPE_UUID) == 0) {
|
||||
decode_hisi_common_section(&ev);
|
||||
}
|
||||
|
||||
diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.h b/src/c/hbm_online_repair/ras-non-standard-handler.h
|
||||
index 0272dc1..15a37ee 100644
|
||||
--- a/src/c/hbm_online_repair/ras-non-standard-handler.h
|
||||
+++ b/src/c/hbm_online_repair/ras-non-standard-handler.h
|
||||
@@ -7,6 +7,7 @@
|
||||
#define BIT(nr) (1UL << (nr))
|
||||
|
||||
#define SECTION_TYPE_UUID_LEN 16
|
||||
+#define UUID_STR_TYPE "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
|
||||
#define HISI_COMMON_SECTION_TYPE_UUID "c8b328a8-9917-4af6-9a13-2e08ab2e7586"
|
||||
|
||||
struct ras_non_standard_event {
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@ -1,25 +0,0 @@
|
||||
From 6307a1ff4068a541658e3312ca938c6fdd9a5c1a Mon Sep 17 00:00:00 2001
|
||||
From: zhuofeng <zhuofeng2@huawei.com>
|
||||
Date: Sat, 12 Oct 2024 14:51:51 +0800
|
||||
Subject: [PATCH] fix io_dump for collect module
|
||||
|
||||
---
|
||||
src/python/sentryCollector/collect_io.py | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py
|
||||
index d734734..11c9d9a 100644
|
||||
--- a/src/python/sentryCollector/collect_io.py
|
||||
+++ b/src/python/sentryCollector/collect_io.py
|
||||
@@ -154,7 +154,7 @@ class CollectIo():
|
||||
try:
|
||||
with open(io_dump_file, 'r') as file:
|
||||
for line in file:
|
||||
- count += line.count('.op=' + Io_Category[category])
|
||||
+ count += line.count('.op=' + Io_Category[category].upper())
|
||||
if count > 0:
|
||||
logging.info(f"io_dump info : {disk_name}, {stage}, {category}, {count}")
|
||||
except FileNotFoundError:
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -1,53 +0,0 @@
|
||||
From 878bcf61467bfd9d015a8089a8367f4333ba76f6 Mon Sep 17 00:00:00 2001
|
||||
From: PshySimon <caixiaomeng2@huawei.com>
|
||||
Date: Wed, 9 Oct 2024 10:20:34 +0800
|
||||
Subject: [PATCH] fix python 3.7 not support list[bool] type
|
||||
|
||||
---
|
||||
src/python/xalarm/register_xalarm.py | 8 ++++----
|
||||
1 file changed, 4 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py
|
||||
index e58343d..6756b1b 100644
|
||||
--- a/src/python/xalarm/register_xalarm.py
|
||||
+++ b/src/python/xalarm/register_xalarm.py
|
||||
@@ -26,7 +26,7 @@ ALARM_REGISTER_INFO = None
|
||||
|
||||
|
||||
class AlarmRegister:
|
||||
- def __init__(self, id_filter: list[bool], callback: callable):
|
||||
+ def __init__(self, id_filter: list, callback: callable):
|
||||
self.id_filter = id_filter
|
||||
self.callback = callback
|
||||
self.socket = self.create_unix_socket()
|
||||
@@ -49,7 +49,7 @@ class AlarmRegister:
|
||||
return False
|
||||
return True
|
||||
|
||||
- def set_id_filter(self, id_filter: list[bool]) -> bool:
|
||||
+ def set_id_filter(self, id_filter: list) -> bool:
|
||||
if (len(id_filter) > MAX_NUM_OF_ALARM_ID):
|
||||
sys.stderr.write("set_id_filter: invalid param id_filter\n")
|
||||
return False
|
||||
@@ -118,7 +118,7 @@ class AlarmRegister:
|
||||
self.socket.close()
|
||||
|
||||
|
||||
-def xalarm_register(callback: callable, id_filter: list[bool]) -> int:
|
||||
+def xalarm_register(callback: callable, id_filter: list) -> int:
|
||||
global ALARM_REGISTER_INFO
|
||||
|
||||
if ALARM_REGISTER_INFO is not None:
|
||||
@@ -148,7 +148,7 @@ def xalarm_unregister(clientId: int) -> None:
|
||||
ALARM_REGISTER_INFO = None
|
||||
|
||||
|
||||
-def xalarm_upgrade(clientId: int, id_filter: list[bool]) -> None:
|
||||
+def xalarm_upgrade(clientId: int, id_filter: list) -> None:
|
||||
global ALARM_REGISTER_INFO
|
||||
if clientId < 0:
|
||||
sys.stderr.write("xalarm_unregister: invalid client\n")
|
||||
--
|
||||
2.27.0
|
||||
|
||||
|
||||
@ -1,36 +0,0 @@
|
||||
From e8e4fa5fd9e78508567782e17b7b1cb6ace3ef0d Mon Sep 17 00:00:00 2001
|
||||
From: shixuantong <shixuantong1@huawei.com>
|
||||
Date: Fri, 26 Jul 2024 15:59:42 +0800
|
||||
Subject: [PATCH] fix result when process output is None
|
||||
|
||||
---
|
||||
src/python/syssentry/cpu_sentry.py | 8 ++++++++
|
||||
1 file changed, 8 insertions(+)
|
||||
|
||||
diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py
|
||||
index d0bafa8..9287e2f 100644
|
||||
--- a/src/python/syssentry/cpu_sentry.py
|
||||
+++ b/src/python/syssentry/cpu_sentry.py
|
||||
@@ -87,11 +87,19 @@ class CpuSentry:
|
||||
}
|
||||
|
||||
def handle_cpu_output(self, stdout: str):
|
||||
+ if not stdout:
|
||||
+ logging.error("%s process output is None, it may be killed!", LOW_LEVEL_INSPECT_CMD)
|
||||
+ self.send_result["result"] = ResultLevel.FAIL
|
||||
+ self.send_result["details"]["code"] = 1005
|
||||
+ self.send_result["details"]["msg"] = "cpu_sentry task is killed!"
|
||||
+ return
|
||||
+
|
||||
if "ERROR" in stdout:
|
||||
self.send_result["result"] = ResultLevel.FAIL
|
||||
self.send_result["details"]["code"] = 1004
|
||||
self.send_result["details"]["msg"] = stdout.split("\n")[0]
|
||||
return
|
||||
+
|
||||
out_split = stdout.split("\n")
|
||||
isolated_cores_number = 0
|
||||
found_fault_cores_list = []
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,226 +0,0 @@
|
||||
From dea58a559f3dbad3dbce3b681639ee89c20b1cee Mon Sep 17 00:00:00 2001
|
||||
From: zhuofeng <zhuofeng2@huawei.com>
|
||||
Date: Fri, 20 Sep 2024 14:35:39 +0800
|
||||
Subject: [PATCH] fix some about collect module and avg block io
|
||||
|
||||
---
|
||||
config/tasks/avg_block_io.mod | 4 ++--
|
||||
src/python/sentryCollector/collect_io.py | 18 +++++++++++-------
|
||||
src/python/sentryCollector/collect_plugin.py | 17 ++++++++---------
|
||||
src/python/sentryCollector/collect_server.py | 6 +++---
|
||||
src/python/sentryCollector/collectd.py | 2 --
|
||||
.../sentryPlugins/avg_block_io/avg_block_io.py | 13 ++++++++++---
|
||||
6 files changed, 34 insertions(+), 26 deletions(-)
|
||||
|
||||
diff --git a/config/tasks/avg_block_io.mod b/config/tasks/avg_block_io.mod
|
||||
index 814c483..b9b6f34 100644
|
||||
--- a/config/tasks/avg_block_io.mod
|
||||
+++ b/config/tasks/avg_block_io.mod
|
||||
@@ -1,5 +1,5 @@
|
||||
[common]
|
||||
enabled=yes
|
||||
task_start=/usr/bin/python3 /usr/bin/avg_block_io
|
||||
-task_stop=pkill avg_block_io
|
||||
-type=oneshot
|
||||
\ No newline at end of file
|
||||
+task_stop=pkill -f /usr/bin/avg_block_io
|
||||
+type=oneshot
|
||||
diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py
|
||||
index b826dc4..104b734 100644
|
||||
--- a/src/python/sentryCollector/collect_io.py
|
||||
+++ b/src/python/sentryCollector/collect_io.py
|
||||
@@ -175,8 +175,7 @@ class CollectIo():
|
||||
|
||||
threading.Timer(self.period_time, self.task_loop).start()
|
||||
|
||||
- def main_loop(self):
|
||||
- logging.info("collect io thread start")
|
||||
+ def is_kernel_avaliable(self):
|
||||
base_path = '/sys/kernel/debug/block'
|
||||
for disk_name in os.listdir(base_path):
|
||||
if not self.loop_all and disk_name not in self.disk_list:
|
||||
@@ -198,8 +197,13 @@ class CollectIo():
|
||||
self.window_value[disk_name] = {}
|
||||
IO_GLOBAL_DATA[disk_name] = {}
|
||||
|
||||
- if len(self.disk_map_stage) == 0:
|
||||
- logging.warning("no disks meet the requirements. the thread exits")
|
||||
+ return len(IO_GLOBAL_DATA) != 0
|
||||
+
|
||||
+ def main_loop(self):
|
||||
+ logging.info("collect io thread start")
|
||||
+
|
||||
+ if not self.is_kernel_avaliable() or len(self.disk_map_stage) == 0:
|
||||
+ logging.warning("no disks meet the requirements. collect io thread exits")
|
||||
return
|
||||
|
||||
for disk_name, stage_list in self.disk_map_stage.items():
|
||||
@@ -213,7 +217,7 @@ class CollectIo():
|
||||
start_time = time.time()
|
||||
|
||||
if self.stop_event.is_set():
|
||||
- logging.info("collect io thread exit")
|
||||
+ logging.debug("collect io thread exit")
|
||||
return
|
||||
|
||||
for disk_name, stage_list in self.disk_map_stage.items():
|
||||
@@ -227,7 +231,7 @@ class CollectIo():
|
||||
continue
|
||||
while sleep_time > 1:
|
||||
if self.stop_event.is_set():
|
||||
- logging.info("collect io thread exit")
|
||||
+ logging.debug("collect io thread exit")
|
||||
return
|
||||
time.sleep(1)
|
||||
sleep_time -= 1
|
||||
@@ -235,5 +239,5 @@ class CollectIo():
|
||||
|
||||
# set stop event, notify thread exit
|
||||
def stop_thread(self):
|
||||
- logging.info("collect io thread is preparing to exit")
|
||||
+ logging.debug("collect io thread is preparing to exit")
|
||||
self.stop_event.set()
|
||||
diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py
|
||||
index 49ce0a8..9132473 100644
|
||||
--- a/src/python/sentryCollector/collect_plugin.py
|
||||
+++ b/src/python/sentryCollector/collect_plugin.py
|
||||
@@ -142,22 +142,21 @@ def validate_parameters(param, len_limit, char_limit):
|
||||
ret = ResultMessage.RESULT_INVALID_LENGTH
|
||||
return [False, ret]
|
||||
|
||||
- if len(param) > len_limit:
|
||||
- print(f"{param} length more than {len_limit}")
|
||||
- ret = ResultMessage.RESULT_EXCEED_LIMIT
|
||||
- return [False, ret]
|
||||
-
|
||||
pattern = r'^[a-zA-Z0-9_-]+$'
|
||||
for info in param:
|
||||
- if len(info) > char_limit:
|
||||
- print(f"{info} length more than {char_limit}")
|
||||
- ret = ResultMessage.RESULT_EXCEED_LIMIT
|
||||
- return [False, ret]
|
||||
if not re.match(pattern, info):
|
||||
print(f"{info} is invalid char")
|
||||
ret = ResultMessage.RESULT_INVALID_CHAR
|
||||
return [False, ret]
|
||||
|
||||
+ # length of len_limit is exceeded, keep len_limit
|
||||
+ if len(param) > len_limit:
|
||||
+ print(f"{param} length more than {len_limit}, keep the first {len_limit}")
|
||||
+ param[:] = param[0:len_limit]
|
||||
+
|
||||
+ # only keep elements under the char_limit length
|
||||
+ param[:] = [elem for elem in param if len(elem) <= char_limit]
|
||||
+
|
||||
return [True, ret]
|
||||
|
||||
def is_iocollect_valid(period, disk_list=None, stage=None):
|
||||
diff --git a/src/python/sentryCollector/collect_server.py b/src/python/sentryCollector/collect_server.py
|
||||
index fa49781..bab4e56 100644
|
||||
--- a/src/python/sentryCollector/collect_server.py
|
||||
+++ b/src/python/sentryCollector/collect_server.py
|
||||
@@ -256,7 +256,7 @@ class CollectServer():
|
||||
|
||||
def server_loop(self):
|
||||
"""main loop"""
|
||||
- logging.info("collect server thread start")
|
||||
+ logging.info("collect listen thread start")
|
||||
server_fd = self.server_fd_create()
|
||||
if not server_fd:
|
||||
return
|
||||
@@ -267,7 +267,7 @@ class CollectServer():
|
||||
logging.debug("start server_loop loop")
|
||||
while True:
|
||||
if self.stop_event.is_set():
|
||||
- logging.info("collect server thread exit")
|
||||
+ logging.debug("collect listen thread exit")
|
||||
server_fd = None
|
||||
return
|
||||
try:
|
||||
@@ -281,5 +281,5 @@ class CollectServer():
|
||||
pass
|
||||
|
||||
def stop_thread(self):
|
||||
- logging.info("collect server thread is preparing to exit")
|
||||
+ logging.debug("collect listen thread is preparing to exit")
|
||||
self.stop_event.set()
|
||||
diff --git a/src/python/sentryCollector/collectd.py b/src/python/sentryCollector/collectd.py
|
||||
index b77c642..3a836df 100644
|
||||
--- a/src/python/sentryCollector/collectd.py
|
||||
+++ b/src/python/sentryCollector/collectd.py
|
||||
@@ -49,7 +49,6 @@ def sig_handler(signum, _f):
|
||||
Thread_List[i][0].stop_thread()
|
||||
|
||||
remove_sock_file()
|
||||
- sys.exit(0)
|
||||
|
||||
def main():
|
||||
"""main
|
||||
@@ -64,7 +63,6 @@ def main():
|
||||
try:
|
||||
signal.signal(signal.SIGINT, sig_handler)
|
||||
signal.signal(signal.SIGTERM, sig_handler)
|
||||
- signal.signal(signal.SIGHUP, sig_handler)
|
||||
|
||||
logging.info("finish main parse_args")
|
||||
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
index ff2071d..73f0b22 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
@@ -21,7 +21,7 @@ CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini"
|
||||
|
||||
def log_invalid_keys(not_in_list, keys_name, config_list, default_list):
|
||||
"""print invalid log"""
|
||||
- if config_list and default_list:
|
||||
+ if config_list and not_in_list:
|
||||
logging.warning("{} in common.{} are not valid, set {}={}".format(not_in_list, keys_name, keys_name, default_list))
|
||||
elif config_list == ["default"]:
|
||||
logging.warning("Default {} use {}".format(keys_name, default_list))
|
||||
@@ -144,9 +144,11 @@ def init_io_win(io_dic, config, common_param):
|
||||
|
||||
if avg_lim_value and avg_time_value and tot_lim_value:
|
||||
io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value)
|
||||
+ logging.debug("Successfully create {}-{}-{} latency window".format(disk_name, stage_name, rw))
|
||||
|
||||
if iodump_lim_value is not None:
|
||||
io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value)
|
||||
+ logging.debug("Successfully create {}-{}-{} iodump window".format(disk_name, stage_name, rw))
|
||||
return io_data, io_avg_value
|
||||
|
||||
|
||||
@@ -159,10 +161,10 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage):
|
||||
for disk_stage_list in json_data.values():
|
||||
all_stage_set.update(disk_stage_list)
|
||||
|
||||
- disk_list = [key for key in config_disk if key in all_disk_set]
|
||||
+ disk_list = [key for key in all_disk_set if key in config_disk]
|
||||
not_in_disk_list = [key for key in config_disk if key not in all_disk_set]
|
||||
|
||||
- stage_list = [key for key in config_stage if key in all_stage_set]
|
||||
+ stage_list = [key for key in all_stage_set if key in config_stage]
|
||||
not_in_stage_list = [key for key in config_stage if key not in all_stage_set]
|
||||
|
||||
if not config_disk:
|
||||
@@ -171,6 +173,9 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage):
|
||||
if not config_stage:
|
||||
stage_list = [key for key in all_stage_set]
|
||||
|
||||
+ disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list
|
||||
+ stage_list = stage_list[:15] if len(stage_list) > 15 else stage_list
|
||||
+
|
||||
if config_disk and not disk_list:
|
||||
logging.warning("Cannot get valid disk by disk={}, set to default".format(config_disk))
|
||||
disk_list, stage_list = get_valid_disk_stage_list(io_dic, [], config_stage)
|
||||
@@ -228,6 +233,8 @@ def main():
|
||||
signal.signal(signal.SIGINT, sig_handler)
|
||||
signal.signal(signal.SIGTERM, sig_handler)
|
||||
|
||||
+ logging.basicConfig(level=logging.INFO)
|
||||
+
|
||||
# 初始化配置读取
|
||||
config = configparser.ConfigParser(comment_prefixes=('#', ';'))
|
||||
try:
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -1,56 +0,0 @@
|
||||
From 497b3124f017ce4ae99b34261c4fd5dd2a358f5b Mon Sep 17 00:00:00 2001
|
||||
From: zhuofeng <zhuofeng2@huawei.com>
|
||||
Date: Sat, 14 Sep 2024 09:28:00 +0800
|
||||
Subject: [PATCH] fix syssentry fails to be started when cpu_sentry is not
|
||||
installed
|
||||
|
||||
---
|
||||
src/python/syssentry/syssentry.py | 11 ++++++-----
|
||||
1 file changed, 6 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py
|
||||
index f93956e..776971f 100644
|
||||
--- a/src/python/syssentry/syssentry.py
|
||||
+++ b/src/python/syssentry/syssentry.py
|
||||
@@ -43,7 +43,6 @@ try:
|
||||
from .cpu_alarm import cpu_alarm_recv
|
||||
except ImportError:
|
||||
CPU_EXIST = False
|
||||
- logging.debug("Cannot find cpu sentry mod")
|
||||
|
||||
|
||||
INSPECTOR = None
|
||||
@@ -563,20 +562,21 @@ def main():
|
||||
if not os.path.exists(SENTRY_RUN_DIR):
|
||||
os.mkdir(SENTRY_RUN_DIR)
|
||||
os.chmod(SENTRY_RUN_DIR, mode=SENTRY_RUN_DIR_PERM)
|
||||
- if not chk_and_set_pidfile():
|
||||
- logging.error("get pid file lock failed, exist")
|
||||
- sys.exit(17)
|
||||
|
||||
logging.basicConfig(filename=SYSSENTRY_LOG_FILE, level=logging.INFO)
|
||||
os.chmod(SYSSENTRY_LOG_FILE, 0o600)
|
||||
|
||||
+ if not chk_and_set_pidfile():
|
||||
+ logging.error("get pid file lock failed, exist")
|
||||
+ sys.exit(17)
|
||||
+
|
||||
try:
|
||||
signal.signal(signal.SIGINT, sig_handler)
|
||||
signal.signal(signal.SIGTERM, sig_handler)
|
||||
signal.signal(signal.SIGHUP, sig_handler)
|
||||
signal.signal(signal.SIGCHLD, sigchld_handler)
|
||||
|
||||
- logging.debug("finish main parse_args")
|
||||
+ logging.info("finish main parse_args")
|
||||
|
||||
_ = SentryConfig.init_param()
|
||||
TasksMap.init_task_map()
|
||||
@@ -587,3 +587,4 @@ def main():
|
||||
logging.error('%s', traceback.format_exc())
|
||||
finally:
|
||||
release_pidfile()
|
||||
+
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -1,91 +0,0 @@
|
||||
From 874daf9627c74aa31f1063c250b5477b2eb322e8 Mon Sep 17 00:00:00 2001
|
||||
From: shixuantong <shixuantong1@huawei.com>
|
||||
Date: Sat, 28 Dec 2024 11:31:23 +0800
|
||||
Subject: [PATCH] fix test_ai_block_io fail
|
||||
|
||||
---
|
||||
selftest/test/test_ai_block_io.py | 26 +++++++++++++-------------
|
||||
1 file changed, 13 insertions(+), 13 deletions(-)
|
||||
|
||||
diff --git a/selftest/test/test_ai_block_io.py b/selftest/test/test_ai_block_io.py
|
||||
index c36fef5..58ab096 100644
|
||||
--- a/selftest/test/test_ai_block_io.py
|
||||
+++ b/selftest/test/test_ai_block_io.py
|
||||
@@ -12,9 +12,9 @@
|
||||
import unittest
|
||||
import numpy as np
|
||||
|
||||
-from sentryPlugins.ai_threshold_slow_io_detection.threshold import AbsoluteThreshold, BoxplotThreshold, NSigmaThreshold
|
||||
-from sentryPlugins.ai_threshold_slow_io_detection.sliding_window import (NotContinuousSlidingWindow,
|
||||
- ContinuousSlidingWindow, MedianSlidingWindow)
|
||||
+from sentryPlugins.ai_block_io.threshold import AbsoluteThreshold, BoxplotThreshold, NSigmaThreshold
|
||||
+from sentryPlugins.ai_block_io.sliding_window import (NotContinuousSlidingWindow,
|
||||
+ ContinuousSlidingWindow, MedianSlidingWindow)
|
||||
|
||||
|
||||
def _get_boxplot_threshold(data_list: list, parameter):
|
||||
@@ -98,11 +98,11 @@ class Test(unittest.TestCase):
|
||||
for data in data_list1:
|
||||
boxplot_threshold.push_latest_data_to_queue(data)
|
||||
result = not_continuous.is_slow_io_event(data)
|
||||
- self.assertFalse(result[0])
|
||||
+ self.assertFalse(result[0][0])
|
||||
self.assertEqual(23.75, boxplot_threshold.get_threshold())
|
||||
boxplot_threshold.push_latest_data_to_queue(24)
|
||||
result = not_continuous.is_slow_io_event(24)
|
||||
- self.assertFalse(result[0])
|
||||
+ self.assertFalse(result[0][0])
|
||||
boxplot_threshold.push_latest_data_to_queue(25)
|
||||
result = not_continuous.is_slow_io_event(25)
|
||||
self.assertTrue(result[0])
|
||||
@@ -110,7 +110,7 @@ class Test(unittest.TestCase):
|
||||
for data in data_list2:
|
||||
boxplot_threshold.push_latest_data_to_queue(data)
|
||||
result = not_continuous.is_slow_io_event(data)
|
||||
- self.assertFalse(result[0])
|
||||
+ self.assertFalse(result[0][0])
|
||||
self.assertEqual(25.625, boxplot_threshold.get_threshold())
|
||||
|
||||
def test_continuous_sliding_window(self):
|
||||
@@ -121,14 +121,14 @@ class Test(unittest.TestCase):
|
||||
for data in data_list:
|
||||
boxplot_threshold.push_latest_data_to_queue(data)
|
||||
result = continuous.is_slow_io_event(data)
|
||||
- self.assertFalse(result[0])
|
||||
+ self.assertFalse(result[0][0])
|
||||
self.assertEqual(23.75, boxplot_threshold.get_threshold())
|
||||
# 没有三个异常点
|
||||
- self.assertFalse(continuous.is_slow_io_event(25)[0])
|
||||
+ self.assertFalse(continuous.is_slow_io_event(25)[0][0])
|
||||
# 不连续的三个异常点
|
||||
- self.assertFalse(continuous.is_slow_io_event(25)[0])
|
||||
+ self.assertFalse(continuous.is_slow_io_event(25)[0][0])
|
||||
# 连续的三个异常点
|
||||
- self.assertTrue(continuous.is_slow_io_event(25)[0])
|
||||
+ self.assertTrue(continuous.is_slow_io_event(25)[0][0])
|
||||
|
||||
def test_median_sliding_window(self):
|
||||
median = MedianSlidingWindow(5, 3)
|
||||
@@ -137,7 +137,7 @@ class Test(unittest.TestCase):
|
||||
absolute_threshold.set_threshold(24.5)
|
||||
data_list = [24, 24, 24, 25, 25]
|
||||
for data in data_list:
|
||||
- self.assertFalse(median.is_slow_io_event(data)[0])
|
||||
+ self.assertFalse(median.is_slow_io_event(data)[0][0])
|
||||
self.assertTrue(median.is_slow_io_event(25)[0])
|
||||
|
||||
def test_parse_collect_data(self):
|
||||
@@ -147,8 +147,8 @@ class Test(unittest.TestCase):
|
||||
"flush": [9.0, 10.0, 11.0, 12.0],
|
||||
"discard": [13.0, 14.0, 15.0, 16.0],
|
||||
}
|
||||
- from io_data import BaseData
|
||||
- from data_access import _get_io_stage_data
|
||||
+ from sentryPlugins.ai_block_io.io_data import BaseData
|
||||
+ from sentryPlugins.ai_block_io.data_access import _get_io_stage_data
|
||||
|
||||
io_data = _get_io_stage_data(collect)
|
||||
self.assertEqual(
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,61 +0,0 @@
|
||||
From 00ea35472d50faea89c881eb45b6d9d11f6b6632 Mon Sep 17 00:00:00 2001
|
||||
From: luckky <guodashun1@huawei.com>
|
||||
Date: Fri, 1 Nov 2024 15:09:57 +0800
|
||||
Subject: [PATCH] fix uint8 bug and change isolation default value
|
||||
|
||||
---
|
||||
src/c/hbm_online_repair/hbm_online_repair.env | 2 +-
|
||||
src/c/hbm_online_repair/non-standard-hbm-repair.c | 8 ++++----
|
||||
2 files changed, 5 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/src/c/hbm_online_repair/hbm_online_repair.env b/src/c/hbm_online_repair/hbm_online_repair.env
|
||||
index de56079..7166c8d 100644
|
||||
--- a/src/c/hbm_online_repair/hbm_online_repair.env
|
||||
+++ b/src/c/hbm_online_repair/hbm_online_repair.env
|
||||
@@ -1,2 +1,2 @@
|
||||
HBM_ONLINE_REPAIR_LOG_LEVEL=1
|
||||
-PAGE_ISOLATION_THRESHOLD=128
|
||||
+PAGE_ISOLATION_THRESHOLD=3355443
|
||||
diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c
|
||||
index f26d8ae..b8dde7a 100644
|
||||
--- a/src/c/hbm_online_repair/non-standard-hbm-repair.c
|
||||
+++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c
|
||||
@@ -359,7 +359,7 @@ static int write_file(char *path, const char *name, unsigned long long value)
|
||||
|
||||
fd = open(fname, O_WRONLY);
|
||||
if (fd < 0) {
|
||||
- log(LOG_WARNING, "HBM ACLS: Cannot to open '%s': %s\n",
|
||||
+ log(LOG_WARNING, "HBM: Cannot to open '%s': %s\n",
|
||||
fname, strerror(errno));
|
||||
return -errno;
|
||||
}
|
||||
@@ -367,7 +367,7 @@ static int write_file(char *path, const char *name, unsigned long long value)
|
||||
snprintf(buf, sizeof(buf), "0x%llx\n", value);
|
||||
ret = write(fd, buf, strlen(buf));
|
||||
if (ret <= 0)
|
||||
- log(LOG_WARNING, "HBM ACLS: Failed to set %s (0x%llx): %s\n",
|
||||
+ log(LOG_WARNING, "HBM: Failed to set %s (0x%llx): %s\n",
|
||||
fname, value, strerror(errno));
|
||||
|
||||
close(fd);
|
||||
@@ -557,7 +557,7 @@ static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err)
|
||||
return ret < 0 ? ret : 0;
|
||||
}
|
||||
|
||||
-static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr)
|
||||
+static uint8_t hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr)
|
||||
{
|
||||
int ret;
|
||||
if (repair_ret <= 0) {
|
||||
@@ -577,7 +577,7 @@ static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsig
|
||||
}
|
||||
}
|
||||
|
||||
-static uint8_t hbmc_hbm_repair(const struct hisi_common_error_section *err, char *path)
|
||||
+static int hbmc_hbm_repair(const struct hisi_common_error_section *err, char *path)
|
||||
{
|
||||
unsigned long long paddr;
|
||||
int ret;
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@ -1,25 +0,0 @@
|
||||
From 7baf2815f515c54bc33f41f495ec7c26988b5c44 Mon Sep 17 00:00:00 2001
|
||||
From: shixuantong <shixuantong1@huawei.com>
|
||||
Date: Tue, 11 Jun 2024 16:47:46 +0800
|
||||
Subject: [PATCH] fix version in setup.py
|
||||
|
||||
---
|
||||
src/python/setup.py | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/python/setup.py b/src/python/setup.py
|
||||
index 21dbe9f..f96a96e 100644
|
||||
--- a/src/python/setup.py
|
||||
+++ b/src/python/setup.py
|
||||
@@ -17,7 +17,7 @@ from setuptools import setup, find_packages
|
||||
|
||||
setup(
|
||||
name="syssentry",
|
||||
- version="1.0.1",
|
||||
+ version="1.0.2",
|
||||
description="System inspection framework tool set",
|
||||
packages=find_packages(),
|
||||
include_package_data=True,
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,53 +0,0 @@
|
||||
From 5be0d121c6fde185d323dc4bcf3026e2c3ee8757 Mon Sep 17 00:00:00 2001
|
||||
From: jinsaihang <jinsaihang@h-partners.com>
|
||||
Date: Mon, 14 Oct 2024 11:30:58 +0800
|
||||
Subject: [PATCH] fix word error
|
||||
|
||||
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
|
||||
---
|
||||
sysSentry-1.0.2/src/python/syssentry/alarm.py | 10 +++++-----
|
||||
1 file changed, 5 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
|
||||
index d012901..bff527c 100644
|
||||
--- a/src/python/syssentry/alarm.py
|
||||
+++ b/src/python/syssentry/alarm.py
|
||||
@@ -49,7 +49,7 @@ MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1)
|
||||
def update_alarm_list(alarm_info: Xalarm):
|
||||
alarm_id = xalarm_getid(alarm_info)
|
||||
if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID:
|
||||
- logging.warnning(f"Invalid alarm_id {alarm_id}")
|
||||
+ logging.warning(f"Invalid alarm_id {alarm_id}")
|
||||
return
|
||||
timestamp = xalarm_gettime(alarm_info)
|
||||
if not timestamp:
|
||||
@@ -97,14 +97,14 @@ def alarm_register():
|
||||
task = TasksMap.tasks_dict[task_type][task_name]
|
||||
alarm_id = task.alarm_id
|
||||
if not check_alarm_id_if_number(alarm_id):
|
||||
- logging.warnning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm")
|
||||
+ logging.warning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm")
|
||||
continue
|
||||
if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID:
|
||||
- logging.warnning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm")
|
||||
+ logging.warning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm")
|
||||
continue
|
||||
alarm_clear_time = task.alarm_clear_time
|
||||
if not check_alarm_clear_time_if_positive_integer(alarm_clear_time):
|
||||
- logging.warnning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm")
|
||||
+ logging.warning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm")
|
||||
continue
|
||||
try:
|
||||
alarm_clear_time = int(alarm_clear_time)
|
||||
@@ -113,7 +113,7 @@ def alarm_register():
|
||||
if alarm_clear_time > sys.maxsize:
|
||||
raise ValueError("Exceeds maximum value for int")
|
||||
except (ValueError, OverflowError, TypeError) as e:
|
||||
- logging.warnning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm")
|
||||
+ logging.warning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm")
|
||||
continue
|
||||
alarm_list_dict[alarm_id] = []
|
||||
task_alarm_id_dict[task_name] = alarm_id
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,69 +0,0 @@
|
||||
From cea094acea79b88e6458cfa264a03c51f08c72fc Mon Sep 17 00:00:00 2001
|
||||
From: luckky <guodashun1@huawei.com>
|
||||
Date: Mon, 4 Nov 2024 20:18:05 +0800
|
||||
Subject: [PATCH] fix write file return code bug
|
||||
Set the return code 0 to -EINVAL to unify the processing of return code.
|
||||
|
||||
---
|
||||
.../hbm_online_repair/non-standard-hbm-repair.c | 17 ++++++++++-------
|
||||
1 file changed, 10 insertions(+), 7 deletions(-)
|
||||
|
||||
diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c
|
||||
index b8dde7a..97cb9a7 100644
|
||||
--- a/src/c/hbm_online_repair/non-standard-hbm-repair.c
|
||||
+++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c
|
||||
@@ -112,7 +112,7 @@ static void parse_fault_addr_info(struct fault_addr_info* info_struct, unsigned
|
||||
info_struct->row_id = fault_addr & FAULT_ADDR_ROW_ID_MASK;
|
||||
fault_addr >>= FAULT_ADDR_ROW_ID_LEN;
|
||||
info_struct->column_id = fault_addr & FAULT_ADDR_COLUMN_ID_MASK;
|
||||
- fault_addr >>= FAULT_ADDR_CHANNEL_ID_LEN;
|
||||
+ fault_addr >>= FAULT_ADDR_COLUMN_ID_LEN;
|
||||
info_struct->error_type = fault_addr & FAULT_ADDR_ERROR_TYPE_MASK;
|
||||
fault_addr >>= FAULT_ADDR_ERROR_TYPE_LEN;
|
||||
info_struct->repair_type = fault_addr & FAULT_ADDR_REPAIR_TYPE_MASK;
|
||||
@@ -371,7 +371,12 @@ static int write_file(char *path, const char *name, unsigned long long value)
|
||||
fname, value, strerror(errno));
|
||||
|
||||
close(fd);
|
||||
- return ret > 0 ? 0 : -errno;
|
||||
+ if (ret == 0) {
|
||||
+ ret = -EINVAL;
|
||||
+ } else if (ret < 0) {
|
||||
+ ret = -errno;
|
||||
+ }
|
||||
+ return ret;
|
||||
}
|
||||
|
||||
static int get_hardware_corrupted_size()
|
||||
@@ -560,7 +565,7 @@ static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err)
|
||||
static uint8_t hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr)
|
||||
{
|
||||
int ret;
|
||||
- if (repair_ret <= 0) {
|
||||
+ if (repair_ret < 0) {
|
||||
log(LOG_WARNING, "HBM %s: Keep page (0x%llx) offline\n", is_acls ? "ACLS" : "SPPR", paddr);
|
||||
/* not much we can do about errors here */
|
||||
(void)write_file("/sys/kernel/page_eject", "remove_page", paddr);
|
||||
@@ -594,8 +599,7 @@ static int hbmc_hbm_repair(const struct hisi_common_error_section *err, char *pa
|
||||
|
||||
ret = write_file(path, is_acls ? "acls_query" : "sppr_query", paddr);
|
||||
|
||||
- /* Only positive num means the error is supported to repair */
|
||||
- if (ret <= 0) {
|
||||
+ if (ret < 0) {
|
||||
if (ret != -ENXIO) {
|
||||
notice_BMC(err, get_repair_failed_result_code(ret));
|
||||
log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR");
|
||||
@@ -624,8 +628,7 @@ static int hbmc_hbm_repair(const struct hisi_common_error_section *err, char *pa
|
||||
all_online_success = false;
|
||||
}
|
||||
}
|
||||
- /* The ret is from the acls/sppr repair, and only positive num means the error is repaired successfully */
|
||||
- if (ret <= 0) {
|
||||
+ if (ret < 0) {
|
||||
notice_BMC(err, get_repair_failed_result_code(ret));
|
||||
return ret;
|
||||
} else if (all_online_success) {
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@ -1,60 +0,0 @@
|
||||
From 3eba5dcde10e05e7badc99852f76488e667d56e6 Mon Sep 17 00:00:00 2001
|
||||
From: caixiaomeng <caixiaomeng2@.com>
|
||||
Date: Mon, 21 Oct 2024 11:57:37 +0800
|
||||
Subject: [PATCH] fix xalarm non-uniform log formatting
|
||||
|
||||
---
|
||||
src/python/xalarm/sentry_notify.py | 11 ++++++-----
|
||||
1 file changed, 6 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/src/python/xalarm/sentry_notify.py b/src/python/xalarm/sentry_notify.py
|
||||
index 5838473..ffe4147 100644
|
||||
--- a/src/python/xalarm/sentry_notify.py
|
||||
+++ b/src/python/xalarm/sentry_notify.py
|
||||
@@ -2,6 +2,7 @@ import os
|
||||
import sys
|
||||
import time
|
||||
import socket
|
||||
+import logging
|
||||
from struct import error as StructParseError
|
||||
|
||||
from .xalarm_api import alarm_stu2bin, Xalarm
|
||||
@@ -27,21 +28,21 @@ ALARM_SOCKET_PERMISSION = 0o700
|
||||
|
||||
def check_params(alarm_id, alarm_level, alarm_type, puc_paras) -> bool:
|
||||
if not os.path.exists(DIR_XALARM):
|
||||
- sys.stderr.write(f"check_params: {DIR_XALARM} not exist, failed\n")
|
||||
+ logging.error(f"check_params: {DIR_XALARM} not exist, failed")
|
||||
return False
|
||||
|
||||
if not os.path.exists(PATH_REPORT_ALARM):
|
||||
- sys.stderr.write(f"check_params: {PATH_REPORT_ALARM} not exist, failed\n")
|
||||
+ logging.error(f"check_params: {PATH_REPORT_ALARM} not exist, failed")
|
||||
return False
|
||||
|
||||
if (alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID or
|
||||
alarm_level < MINOR_ALM or alarm_level > CRITICAL_ALM or
|
||||
alarm_type < ALARM_TYPE_OCCUR or alarm_type > ALARM_TYPE_RECOVER):
|
||||
- sys.stderr.write("check_params: alarm info invalid\n")
|
||||
+ logging.error("check_params: alarm info invalid")
|
||||
return False
|
||||
|
||||
if len(puc_paras) >= MAX_PUC_PARAS_LEN:
|
||||
- sys.stderr.write(f"check_params: alarm msg should be less than {MAX_PUC_PARAS_LEN}\n")
|
||||
+ logging.error(f"check_params: alarm msg should be less than {MAX_PUC_PARAS_LEN}")
|
||||
return False
|
||||
|
||||
return True
|
||||
@@ -61,7 +62,7 @@ def xalarm_report(alarm_id, alarm_level, alarm_type, puc_paras) -> bool:
|
||||
|
||||
sock.sendto(alarm_stu2bin(alarm_info), PATH_REPORT_ALARM)
|
||||
except (FileNotFoundError, StructParseError, socket.error, OSError, UnicodeError) as e:
|
||||
- sys.stderr.write(f"check_params: error occurs when sending msg.{e}\n")
|
||||
+ logging.error(f"error occurs when sending msg.")
|
||||
return False
|
||||
finally:
|
||||
sock.close()
|
||||
--
|
||||
2.27.0
|
||||
|
||||
|
||||
@ -1,76 +0,0 @@
|
||||
From f6a26ea0759f36ebcaebe05d4d24c7234a110c63 Mon Sep 17 00:00:00 2001
|
||||
From: caixiaomeng <caixiaomeng2@.com>
|
||||
Date: Fri, 11 Oct 2024 12:12:53 +0800
|
||||
Subject: [PATCH] fix xalarm_Report function not refuse alarm msg exceeds
|
||||
maximum
|
||||
|
||||
---
|
||||
src/libso/xalarm/register_xalarm.c | 5 +++++
|
||||
src/python/xalarm/register_xalarm.py | 6 +++---
|
||||
src/python/xalarm/sentry_notify.py | 4 ++--
|
||||
3 files changed, 10 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/src/libso/xalarm/register_xalarm.c b/src/libso/xalarm/register_xalarm.c
|
||||
index 5aff2bc..952a28b 100644
|
||||
--- a/src/libso/xalarm/register_xalarm.c
|
||||
+++ b/src/libso/xalarm/register_xalarm.c
|
||||
@@ -339,6 +339,11 @@ int xalarm_Report(unsigned short usAlarmId, unsigned char ucAlarmLevel,
|
||||
return -1;
|
||||
}
|
||||
|
||||
+ if (pucParas == NULL || (int)strlen(pucParas) > MAX_PARAS_LEN) {
|
||||
+ fprintf(stderr, "%s: alarm info invalid\n", __func__);
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
if (memset(&info, 0, sizeof(struct alarm_info)) == NULL) {
|
||||
fprintf(stderr, "%s: memset info failed, ret: %d\n", __func__, ret);
|
||||
return -1;
|
||||
diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py
|
||||
index edd9994..39623bd 100644
|
||||
--- a/src/python/xalarm/register_xalarm.py
|
||||
+++ b/src/python/xalarm/register_xalarm.py
|
||||
@@ -45,7 +45,7 @@ class AlarmRegister:
|
||||
return False
|
||||
|
||||
if self.socket is None:
|
||||
- sys.stderr.write("check_params: scoket create failed\n")
|
||||
+ sys.stderr.write("check_params: socket create failed\n")
|
||||
return False
|
||||
return True
|
||||
|
||||
@@ -151,10 +151,10 @@ def xalarm_unregister(clientId: int) -> None:
|
||||
def xalarm_upgrade(clientId: int, id_filter: list) -> None:
|
||||
global ALARM_REGISTER_INFO
|
||||
if clientId < 0:
|
||||
- sys.stderr.write("xalarm_unregister: invalid client\n")
|
||||
+ sys.stderr.write("xalarm_upgrade: invalid client\n")
|
||||
return
|
||||
if ALARM_REGISTER_INFO is None:
|
||||
- sys.stderr.write("xalarm_unregister: alarm has not registered\n")
|
||||
+ sys.stderr.write("xalarm_upgrade: alarm has not registered\n")
|
||||
return
|
||||
ALARM_REGISTER_INFO.id_filter = id_filter
|
||||
|
||||
diff --git a/src/python/xalarm/sentry_notify.py b/src/python/xalarm/sentry_notify.py
|
||||
index c763a24..5838473 100644
|
||||
--- a/src/python/xalarm/sentry_notify.py
|
||||
+++ b/src/python/xalarm/sentry_notify.py
|
||||
@@ -27,11 +27,11 @@ ALARM_SOCKET_PERMISSION = 0o700
|
||||
|
||||
def check_params(alarm_id, alarm_level, alarm_type, puc_paras) -> bool:
|
||||
if not os.path.exists(DIR_XALARM):
|
||||
- sys.stderr.write(f"check_params: {DIR_XALARM} not exist, failed")
|
||||
+ sys.stderr.write(f"check_params: {DIR_XALARM} not exist, failed\n")
|
||||
return False
|
||||
|
||||
if not os.path.exists(PATH_REPORT_ALARM):
|
||||
- sys.stderr.write(f"check_params: {PATH_REPORT_ALARM} not exist, failed")
|
||||
+ sys.stderr.write(f"check_params: {PATH_REPORT_ALARM} not exist, failed\n")
|
||||
return False
|
||||
|
||||
if (alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID or
|
||||
--
|
||||
2.27.0
|
||||
|
||||
|
||||
@ -1,71 +0,0 @@
|
||||
From 624efd60495403743fc251b7d689d920841e44c8 Mon Sep 17 00:00:00 2001
|
||||
From: caixiaomeng <caixiaomeng2@.com>
|
||||
Date: Fri, 11 Oct 2024 17:54:04 +0800
|
||||
Subject: [PATCH] fix xalarm_upgrade not return val and fail when thread
|
||||
stopped
|
||||
|
||||
---
|
||||
src/libso/xalarm/register_xalarm.c | 11 ++++++++++-
|
||||
src/python/xalarm/register_xalarm.py | 10 +++++++---
|
||||
2 files changed, 17 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/src/libso/xalarm/register_xalarm.c b/src/libso/xalarm/register_xalarm.c
|
||||
index 952a28b..6768242 100644
|
||||
--- a/src/libso/xalarm/register_xalarm.c
|
||||
+++ b/src/libso/xalarm/register_xalarm.c
|
||||
@@ -156,7 +156,11 @@ static void *alarm_recv(void *arg)
|
||||
continue;
|
||||
}
|
||||
printf("recv error len:%d errno:%d\n", recvlen, errno);
|
||||
- }
|
||||
+ } else if (recvlen == 0) {
|
||||
+ printf("connection closed by xalarmd, maybe connections reach max num or service stopped.\n");
|
||||
+ g_register_info.thread_should_stop = 1;
|
||||
+ break;
|
||||
+ }
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
@@ -211,6 +215,11 @@ bool xalarm_Upgrade(struct alarm_subscription_info id_filter, int client_id)
|
||||
printf("%s: invalid args\n", __func__);
|
||||
return false;
|
||||
}
|
||||
+
|
||||
+ if (g_register_info.thread_should_stop) {
|
||||
+ printf("%s: upgrade failed, alarm thread has stopped\n", __func__);
|
||||
+ return false;
|
||||
+ }
|
||||
set_alarm_id(id_filter);
|
||||
|
||||
return true;
|
||||
diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py
|
||||
index 39623bd..2a6dabf 100644
|
||||
--- a/src/python/xalarm/register_xalarm.py
|
||||
+++ b/src/python/xalarm/register_xalarm.py
|
||||
@@ -148,15 +148,19 @@ def xalarm_unregister(clientId: int) -> None:
|
||||
ALARM_REGISTER_INFO = None
|
||||
|
||||
|
||||
-def xalarm_upgrade(clientId: int, id_filter: list) -> None:
|
||||
+def xalarm_upgrade(id_filter: list, clientId: int) -> bool:
|
||||
global ALARM_REGISTER_INFO
|
||||
if clientId < 0:
|
||||
sys.stderr.write("xalarm_upgrade: invalid client\n")
|
||||
- return
|
||||
+ return False
|
||||
if ALARM_REGISTER_INFO is None:
|
||||
sys.stderr.write("xalarm_upgrade: alarm has not registered\n")
|
||||
- return
|
||||
+ return False
|
||||
+ if ALARM_REGISTER_INFO.thread_should_stop:
|
||||
+ sys.stderr.write("xalarm_upgrade: upgrade failed, alarm thread has stopped\n")
|
||||
+ return False
|
||||
ALARM_REGISTER_INFO.id_filter = id_filter
|
||||
+ return True
|
||||
|
||||
|
||||
def xalarm_getid(alarm_info: Xalarm) -> int:
|
||||
--
|
||||
2.27.0
|
||||
|
||||
|
||||
@ -1,26 +0,0 @@
|
||||
From 132334913c4afebefd6afa835f790fa8a5fbf123 Mon Sep 17 00:00:00 2001
|
||||
From: jinsaihang <jinsaihang@h-partners.com>
|
||||
Date: Mon, 28 Oct 2024 09:22:53 +0800
|
||||
Subject: [PATCH] get_alarm -d abnomal display
|
||||
|
||||
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
|
||||
---
|
||||
sysSentry-1.0.2/src/python/syssentry/alarm.py | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
|
||||
index b35a126..e5cc313 100644
|
||||
--- a/src/python/syssentry/alarm.py
|
||||
+++ b/src/python/syssentry/alarm.py
|
||||
@@ -184,7 +184,7 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di
|
||||
# dump each {key,value} of details in one line
|
||||
if 'details' in alarm_info and isinstance(alarm_info['details'], dict):
|
||||
for key in alarm_info['details']:
|
||||
- alarm_info['details'][key] = json.dumps(alarm_info['details'][key], indent=None)
|
||||
+ alarm_info['details'][key] = str(alarm_info['details'][key])
|
||||
|
||||
alarm['alarm_info'] = alarm_info
|
||||
alarm_list = [alarm for alarm in alarm_list if 'alarm_source' in alarm['alarm_info'] and alarm['alarm_info']['alarm_source'] == task_name]
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,168 +0,0 @@
|
||||
From b21607fcec4b290bc78c9f6c4a26db1a2df32a66 Mon Sep 17 00:00:00 2001
|
||||
From: gaoruoshu <gaoruoshu@huawei.com>
|
||||
Date: Tue, 15 Oct 2024 21:21:10 +0800
|
||||
Subject: [PATCH] get_io_data failed wont stop avg_block_io and del disk not
|
||||
support
|
||||
|
||||
---
|
||||
src/python/sentryCollector/collect_plugin.py | 14 ++++-----
|
||||
.../avg_block_io/avg_block_io.py | 9 ++++--
|
||||
.../sentryPlugins/avg_block_io/module_conn.py | 31 +++++++++++++------
|
||||
3 files changed, 35 insertions(+), 19 deletions(-)
|
||||
|
||||
diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py
|
||||
index bec405a..53dddec 100644
|
||||
--- a/src/python/sentryCollector/collect_plugin.py
|
||||
+++ b/src/python/sentryCollector/collect_plugin.py
|
||||
@@ -90,14 +90,14 @@ def client_send_and_recv(request_data, data_str_len, protocol):
|
||||
try:
|
||||
client_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
||||
except socket.error:
|
||||
- logging.error("collect_plugin: client create socket error")
|
||||
+ logging.debug("collect_plugin: client create socket error")
|
||||
return None
|
||||
|
||||
try:
|
||||
client_socket.connect(COLLECT_SOCKET_PATH)
|
||||
except OSError:
|
||||
client_socket.close()
|
||||
- logging.error("collect_plugin: client connect error")
|
||||
+ logging.debug("collect_plugin: client connect error")
|
||||
return None
|
||||
|
||||
req_data_len = len(request_data)
|
||||
@@ -109,23 +109,23 @@ def client_send_and_recv(request_data, data_str_len, protocol):
|
||||
res_data = res_data.decode()
|
||||
except (OSError, UnicodeError):
|
||||
client_socket.close()
|
||||
- logging.error("collect_plugin: client communicate error")
|
||||
+ logging.debug("collect_plugin: client communicate error")
|
||||
return None
|
||||
|
||||
res_magic = res_data[:CLT_MSG_MAGIC_LEN]
|
||||
if res_magic != "RES":
|
||||
- logging.error("res msg format error")
|
||||
+ logging.debug("res msg format error")
|
||||
return None
|
||||
|
||||
protocol_str = res_data[CLT_MSG_MAGIC_LEN:CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN]
|
||||
try:
|
||||
protocol_id = int(protocol_str)
|
||||
except ValueError:
|
||||
- logging.error("recv msg protocol id is invalid %s", protocol_str)
|
||||
+ logging.debug("recv msg protocol id is invalid %s", protocol_str)
|
||||
return None
|
||||
|
||||
if protocol_id >= ClientProtocol.PRO_END:
|
||||
- logging.error("protocol id is invalid")
|
||||
+ logging.debug("protocol id is invalid")
|
||||
return None
|
||||
|
||||
try:
|
||||
@@ -134,7 +134,7 @@ def client_send_and_recv(request_data, data_str_len, protocol):
|
||||
res_msg_data = res_msg_data.decode()
|
||||
return res_msg_data
|
||||
except (OSError, ValueError, UnicodeError):
|
||||
- logging.error("collect_plugin: client recv res msg error")
|
||||
+ logging.debug("collect_plugin: client recv res msg error")
|
||||
finally:
|
||||
client_socket.close()
|
||||
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
index cd47919..899d517 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
@@ -15,7 +15,7 @@ import time
|
||||
|
||||
from .config import read_config_log, read_config_common, read_config_algorithm, read_config_latency, read_config_iodump, read_config_stage
|
||||
from .stage_window import IoWindow, IoDumpWindow
|
||||
-from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name
|
||||
+from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name, check_disk_list_validation
|
||||
from .utils import update_avg_and_check_abnormal
|
||||
|
||||
CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini"
|
||||
@@ -79,6 +79,8 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage):
|
||||
if not disk_list:
|
||||
report_alarm_fail("Cannot get valid disk name")
|
||||
|
||||
+ disk_list = check_disk_list_validation(disk_list)
|
||||
+
|
||||
disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list
|
||||
|
||||
if not config_disk:
|
||||
@@ -117,7 +119,10 @@ def main_loop(io_dic, io_data, io_avg_value):
|
||||
time.sleep(period_time)
|
||||
|
||||
# 采集模块对接,获取周期数据
|
||||
- curr_period_data = avg_get_io_data(io_dic)
|
||||
+ is_success, curr_period_data = avg_get_io_data(io_dic)
|
||||
+ if not is_success:
|
||||
+ logging.error(f"{curr_period_data['msg']}")
|
||||
+ continue
|
||||
|
||||
# 处理周期数据
|
||||
reach_size = False
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||
index cbdaad4..a67ef45 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||
@@ -40,25 +40,25 @@ def avg_is_iocollect_valid(io_dic, config_disk, config_stage):
|
||||
logging.debug(f"send to sentryCollector is_iocollect_valid: period={io_dic['period_time']}, "
|
||||
f"disk={config_disk}, stage={config_stage}")
|
||||
res = is_iocollect_valid(io_dic["period_time"], config_disk, config_stage)
|
||||
- return check_result_validation(res, 'check config validation')
|
||||
+ is_success, data = check_result_validation(res, 'check config validation')
|
||||
+ if not is_success:
|
||||
+ report_alarm_fail(f"{data['msg']}")
|
||||
+ return data
|
||||
|
||||
|
||||
def check_result_validation(res, reason):
|
||||
"""check validation of result from sentryCollector"""
|
||||
if not 'ret' in res or not 'message' in res:
|
||||
- err_msg = "Failed to {}: Cannot connect to sentryCollector.".format(reason)
|
||||
- report_alarm_fail(err_msg)
|
||||
+ return False, {'msg': f"Failed to {reason}: Cannot connect to sentryCollector"}
|
||||
if res['ret'] != 0:
|
||||
- err_msg = "Failed to {}: {}".format(reason, Result_Messages[res['ret']])
|
||||
- report_alarm_fail(err_msg)
|
||||
+ return False, {'msg': f"Failed to {reason}: {Result_Messages[res['ret']]}"}
|
||||
|
||||
try:
|
||||
json_data = json.loads(res['message'])
|
||||
except json.JSONDecodeError:
|
||||
- err_msg = f"Failed to {reason}: invalid return message"
|
||||
- report_alarm_fail(err_msg)
|
||||
+ return False, {'msg': f"Failed to {reason}: invalid return message"}
|
||||
|
||||
- return json_data
|
||||
+ return True, json_data
|
||||
|
||||
|
||||
def report_alarm_fail(alarm_info):
|
||||
@@ -120,10 +120,21 @@ def process_report_data(disk_name, rw, io_data):
|
||||
xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg))
|
||||
|
||||
|
||||
+def check_disk_list_validation(disk_list):
|
||||
+ valid_disk_list = []
|
||||
+ for disk_name in disk_list:
|
||||
+ is_success, _ = check_result_validation(get_disk_type(disk_name), "")
|
||||
+ if not is_success:
|
||||
+ continue
|
||||
+ valid_disk_list.append(disk_name)
|
||||
+ return valid_disk_list
|
||||
+
|
||||
+
|
||||
def get_disk_type_by_name(disk_name):
|
||||
logging.debug(f"send to sentryCollector get_disk_type: disk_name={disk_name}")
|
||||
- res = get_disk_type(disk_name)
|
||||
- disk_type_str = check_result_validation(get_disk_type(disk_name), f'Invalid disk type {disk_name}')
|
||||
+ is_success, disk_type_str = check_result_validation(get_disk_type(disk_name), f'Invalid disk type {disk_name}')
|
||||
+ if not is_success:
|
||||
+ report_alarm_fail(f"{disk_type_str['msg']}")
|
||||
try:
|
||||
curr_disk_type = int(disk_type_str)
|
||||
if curr_disk_type not in Disk_Type:
|
||||
--
|
||||
2.27.0
|
||||
@ -1,107 +0,0 @@
|
||||
From 74f18b0e1fd4f99fa7d1d95e08894b408dcafe51 Mon Sep 17 00:00:00 2001
|
||||
From: luckky <guodashun1@huawei.com>
|
||||
Date: Wed, 18 Dec 2024 14:31:04 +0800
|
||||
Subject: [PATCH] hbm_online_repair add unload driver
|
||||
|
||||
---
|
||||
src/c/hbm_online_repair/hbm_online_repair.c | 47 +++++++++++++--------
|
||||
1 file changed, 29 insertions(+), 18 deletions(-)
|
||||
|
||||
diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c
|
||||
index 00c9c0b..6783485 100644
|
||||
--- a/src/c/hbm_online_repair/hbm_online_repair.c
|
||||
+++ b/src/c/hbm_online_repair/hbm_online_repair.c
|
||||
@@ -11,6 +11,8 @@
|
||||
#define DEFAULT_LOG_LEVEL LOG_INFO
|
||||
#define DEFAULT_PAGE_ISOLATION_THRESHOLD 3355443
|
||||
|
||||
+#define DRIVER_COMMAND_LEN 32
|
||||
+
|
||||
int global_level_setting;
|
||||
int page_isolation_threshold;
|
||||
|
||||
@@ -57,25 +59,31 @@ int execute_command(const char *command)
|
||||
return -1;
|
||||
}
|
||||
|
||||
- ret = WEXITSTATUS(ret);
|
||||
+ ret = -WEXITSTATUS(ret);
|
||||
log(LOG_DEBUG, "command %s exited with status: %d\n", command, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
-int load_required_driver(void)
|
||||
+int handle_driver(char* driver_name, bool load)
|
||||
{
|
||||
int ret;
|
||||
- ret = execute_command("modprobe hisi_mem_ras 2>&1");
|
||||
- if (ret < 0) {
|
||||
- log(LOG_ERROR, "load repair driver failed\n");
|
||||
- return ret;
|
||||
- }
|
||||
- ret = execute_command("modprobe page_eject 2>&1");
|
||||
- if (ret < 0) {
|
||||
- log(LOG_ERROR, "load page driver failed\n");
|
||||
+ char command[DRIVER_COMMAND_LEN];
|
||||
+
|
||||
+ snprintf(command, DRIVER_COMMAND_LEN, "%s %s 2>&1", load ? "modprobe" : "rmmod", driver_name);
|
||||
+ ret = execute_command(command);
|
||||
+ log(ret < 0 ? LOG_ERROR : LOG_DEBUG, "%s %s %s\n", load ? "load" : "unload", driver_name, ret < 0 ? "failed" : "success");
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+int handle_all_drivers(bool load)
|
||||
+{
|
||||
+ int ret;
|
||||
+
|
||||
+ ret = handle_driver("hisi_mem_ras", load);
|
||||
+ if (ret < 0)
|
||||
return ret;
|
||||
- }
|
||||
- log(LOG_INFO, "load required driver success\n");
|
||||
+
|
||||
+ ret = handle_driver("page_eject", load);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -116,21 +124,21 @@ int main(int argc, char *argv[])
|
||||
|
||||
hbm_param_init();
|
||||
|
||||
- ret = load_required_driver();
|
||||
+ ret = handle_all_drivers(true);
|
||||
if (ret < 0) {
|
||||
- log(LOG_DEBUG, "load required driver failed\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct ras_events *ras = init_trace_instance();
|
||||
- if (!ras)
|
||||
- return -1;
|
||||
+ if (!ras) {
|
||||
+ ret = -1;
|
||||
+ goto err_unload;
|
||||
+ }
|
||||
|
||||
ret = toggle_ras_event(ras->tracing, "ras", "non_standard_event", 1);
|
||||
if (ret < 0) {
|
||||
log(LOG_WARNING, "unable to enable ras non_standard_event.\n");
|
||||
- free(ras);
|
||||
- return -1;
|
||||
+ goto err_free;
|
||||
}
|
||||
|
||||
get_flash_total_size();
|
||||
@@ -142,6 +150,9 @@ int main(int argc, char *argv[])
|
||||
log(LOG_WARNING, "unable to disable ras non_standard_event.\n");
|
||||
}
|
||||
|
||||
+err_free:
|
||||
free(ras);
|
||||
+err_unload:
|
||||
+ handle_all_drivers(false);
|
||||
return ret;
|
||||
}
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@ -1,104 +0,0 @@
|
||||
From 2135b4e41666d99922eda79e9ee04bbc2b557fea Mon Sep 17 00:00:00 2001
|
||||
From: zhuofeng <zhuofeng2@huawei.com>
|
||||
Date: Wed, 16 Oct 2024 12:13:21 +0800
|
||||
Subject: [PATCH] listen thread of collect module exits occasionally
|
||||
|
||||
---
|
||||
src/python/sentryCollector/collect_io.py | 4 +---
|
||||
src/python/sentryCollector/collect_server.py | 18 ++++++++----------
|
||||
2 files changed, 9 insertions(+), 13 deletions(-)
|
||||
|
||||
diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py
|
||||
index 5fe1efc..de308b3 100644
|
||||
--- a/src/python/sentryCollector/collect_io.py
|
||||
+++ b/src/python/sentryCollector/collect_io.py
|
||||
@@ -231,9 +231,7 @@ class CollectIo():
|
||||
if self.get_blk_io_hierarchy(disk_name, stage_list) < 0:
|
||||
continue
|
||||
self.append_period_lat(disk_name, stage_list)
|
||||
-
|
||||
- logging.debug(f"no-lock collect data : {IO_GLOBAL_DATA}")
|
||||
-
|
||||
+
|
||||
elapsed_time = time.time() - start_time
|
||||
sleep_time = self.period_time - elapsed_time
|
||||
if sleep_time < 0:
|
||||
diff --git a/src/python/sentryCollector/collect_server.py b/src/python/sentryCollector/collect_server.py
|
||||
index 11d1af0..ad3ac0e 100644
|
||||
--- a/src/python/sentryCollector/collect_server.py
|
||||
+++ b/src/python/sentryCollector/collect_server.py
|
||||
@@ -64,7 +64,7 @@ class CollectServer():
|
||||
self.io_global_data = IO_GLOBAL_DATA
|
||||
|
||||
if len(IO_CONFIG_DATA) == 0:
|
||||
- logging.error("the collect thread is not started, the data is invalid. ")
|
||||
+ logging.error("the collect thread is not started, the data is invalid.")
|
||||
return json.dumps(result_rev)
|
||||
|
||||
period_time = IO_CONFIG_DATA[0]
|
||||
@@ -75,7 +75,7 @@ class CollectServer():
|
||||
stage_list = json.loads(data_struct['stage'])
|
||||
|
||||
if (period < period_time) or (period > period_time * max_save) or (period % period_time):
|
||||
- logging.error("is_iocollect_valid: period time: %d is invalid", period)
|
||||
+ logging.error("is_iocollect_valid: period time is invalid, user period: %d, config period_time: %d", period, period_time)
|
||||
return json.dumps(result_rev)
|
||||
|
||||
for disk_name, stage_info in self.io_global_data.items():
|
||||
@@ -96,7 +96,7 @@ class CollectServer():
|
||||
self.io_global_data = IO_GLOBAL_DATA
|
||||
|
||||
if len(IO_CONFIG_DATA) == 0:
|
||||
- logging.error("the collect thread is not started, the data is invalid. ")
|
||||
+ logging.error("the collect thread is not started, the data is invalid.")
|
||||
return json.dumps(result_rev)
|
||||
period_time = IO_CONFIG_DATA[0]
|
||||
max_save = IO_CONFIG_DATA[1]
|
||||
@@ -107,11 +107,11 @@ class CollectServer():
|
||||
iotype_list = json.loads(data_struct['iotype'])
|
||||
|
||||
if (period < period_time) or (period > period_time * max_save) or (period % period_time):
|
||||
- logging.error("get_io_data: period time: %d is invalid", period)
|
||||
+ logging.error("get_io_data: period time is invalid, user period: %d, config period_time: %d", period, period_time)
|
||||
return json.dumps(result_rev)
|
||||
|
||||
collect_index = period // period_time - 1
|
||||
- logging.debug("period: %d, collect_index: %d", period, collect_index)
|
||||
+ logging.debug("user period: %d, config period_time: %d, collect_index: %d", period, period_time, collect_index)
|
||||
|
||||
for disk_name, stage_info in self.io_global_data.items():
|
||||
if disk_name not in disk_list:
|
||||
@@ -124,7 +124,7 @@ class CollectServer():
|
||||
for iotype_name, iotype_info in iotype_info.items():
|
||||
if iotype_name not in iotype_list:
|
||||
continue
|
||||
- if len(iotype_info) < collect_index:
|
||||
+ if len(iotype_info) - 1 < collect_index:
|
||||
continue
|
||||
result_rev[disk_name][stage_name][iotype_name] = iotype_info[collect_index]
|
||||
|
||||
@@ -250,10 +250,8 @@ class CollectServer():
|
||||
except socket.error:
|
||||
logging.error("server fd create failed")
|
||||
server_fd = None
|
||||
-
|
||||
return server_fd
|
||||
|
||||
-
|
||||
def server_loop(self):
|
||||
"""main loop"""
|
||||
logging.info("collect listen thread start")
|
||||
@@ -277,8 +275,8 @@ class CollectServer():
|
||||
self.server_recv(server_fd)
|
||||
else:
|
||||
continue
|
||||
- except socket.error:
|
||||
- pass
|
||||
+ except Exception:
|
||||
+ logging.error('collect listen exception : %s', traceback.format_exc())
|
||||
|
||||
def stop_thread(self):
|
||||
self.stop_event.set()
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -1,69 +0,0 @@
|
||||
From edbe32637a939d0788bcbde9211a61cfded436bf Mon Sep 17 00:00:00 2001
|
||||
From: luckky <guodashun1@huawei.com>
|
||||
Date: Tue, 5 Nov 2024 17:22:27 +0800
|
||||
Subject: [PATCH] make debug msg clear
|
||||
1. Change the page_isolation_threshold default value for 128(kb) to 3355443(kb)
|
||||
to synchronize the modification of the .mod file.
|
||||
2. Add specific command info in debug message to make debug message clear.
|
||||
3. Update the commit of the log level and format of syssentry.
|
||||
4. Change the interval 180 to 10 to short the restart time.
|
||||
|
||||
---
|
||||
config/tasks/hbm_online_repair.mod | 2 +-
|
||||
.../src/c/hbm_online_repair/hbm_online_repair.c | 8 ++++----
|
||||
2 files changed, 5 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/config/tasks/hbm_online_repair.mod b/config/tasks/hbm_online_repair.mod
|
||||
index 77dd73e..4dcef43 100644
|
||||
--- a/config/tasks/hbm_online_repair.mod
|
||||
+++ b/config/tasks/hbm_online_repair.mod
|
||||
@@ -3,7 +3,7 @@ enabled=yes
|
||||
task_start=/usr/bin/hbm_online_repair
|
||||
task_stop=kill $pid
|
||||
type=period
|
||||
-interval=180
|
||||
+interval=10
|
||||
onstart=yes
|
||||
env_file=/etc/sysconfig/hbm_online_repair.env
|
||||
conflict=up
|
||||
\ No newline at end of file
|
||||
diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c
|
||||
index b3b2742..943f201 100644
|
||||
--- a/src/c/hbm_online_repair/hbm_online_repair.c
|
||||
+++ b/src/c/hbm_online_repair/hbm_online_repair.c
|
||||
@@ -9,7 +9,7 @@
|
||||
#include "non-standard-hbm-repair.h"
|
||||
|
||||
#define DEFAULT_LOG_LEVEL LOG_INFO
|
||||
-#define DEFAULT_PAGE_ISOLATION_THRESHOLD 128
|
||||
+#define DEFAULT_PAGE_ISOLATION_THRESHOLD 3355443
|
||||
|
||||
int global_level_setting;
|
||||
int page_isolation_threshold;
|
||||
@@ -44,7 +44,7 @@ int execute_command(const char *command)
|
||||
}
|
||||
|
||||
fgets(buffer, sizeof(buffer), fp);
|
||||
- log(LOG_DEBUG, "output of command is: %s\n", buffer);
|
||||
+ log(LOG_DEBUG, "output of command %s is: %s\n", command, buffer);
|
||||
|
||||
ret = pclose(fp);
|
||||
if (ret < 0) {
|
||||
@@ -53,12 +53,12 @@ int execute_command(const char *command)
|
||||
}
|
||||
|
||||
if (!WIFEXITED(ret)) {
|
||||
- log(LOG_ERROR, "command did not terminate normally\n");
|
||||
+ log(LOG_ERROR, "command %s did not terminate normally\n", command);
|
||||
return -1;
|
||||
}
|
||||
|
||||
ret = WEXITSTATUS(ret);
|
||||
- log(LOG_DEBUG, "command exited with status: %d\n", ret);
|
||||
+ log(LOG_DEBUG, "command %s exited with status: %d\n", command, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
--
|
||||
2.43.0
|
||||
|
||||
@ -1,28 +0,0 @@
|
||||
From b5794ef43f768d7ea9bbbac450deaabbdcff4997 Mon Sep 17 00:00:00 2001
|
||||
From: zhuofeng <zhuofeng2@huawei.com>
|
||||
Date: Sat, 12 Oct 2024 17:57:01 +0800
|
||||
Subject: [PATCH] modify abnormal stack when the disk field is not configured
|
||||
|
||||
---
|
||||
src/python/sentryCollector/collect_config.py | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/python/sentryCollector/collect_config.py b/src/python/sentryCollector/collect_config.py
|
||||
index 5aa38ec..7ca9898 100644
|
||||
--- a/src/python/sentryCollector/collect_config.py
|
||||
+++ b/src/python/sentryCollector/collect_config.py
|
||||
@@ -127,9 +127,9 @@ class CollectConfig:
|
||||
CONF_IO, CONF_IO_MAX_SAVE, CONF_IO_MAX_SAVE_DEFAULT)
|
||||
result_io_config[CONF_IO_MAX_SAVE] = CONF_IO_MAX_SAVE_DEFAULT
|
||||
# disk
|
||||
- disk = io_map_value.get(CONF_IO_DISK).lower()
|
||||
+ disk = io_map_value.get(CONF_IO_DISK)
|
||||
if disk:
|
||||
- disk_str = disk.replace(" ", "")
|
||||
+ disk_str = disk.lower().replace(" ", "")
|
||||
pattern = r'^[a-zA-Z0-9-_,]+$'
|
||||
if not re.match(pattern, disk_str):
|
||||
logging.warning("module_name = %s section, field = %s is incorrect, use default %s",
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -1,27 +0,0 @@
|
||||
From 0d3323d13797f3f9d3124e3938787d2573bf249d Mon Sep 17 00:00:00 2001
|
||||
From: zhangnan <zhangnan134@huawei.com>
|
||||
Date: Mon, 28 Oct 2024 17:32:49 +0800
|
||||
Subject: [PATCH] modify logrotate rule
|
||||
|
||||
---
|
||||
config/logrotate | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/config/logrotate b/config/logrotate
|
||||
index f54e7b3..e855118 100644
|
||||
--- a/config/logrotate
|
||||
+++ b/config/logrotate
|
||||
@@ -1,8 +1,9 @@
|
||||
/var/log/sysSentry/*.log {
|
||||
- nocompress
|
||||
+ compress
|
||||
missingok
|
||||
notifempty
|
||||
copytruncate
|
||||
rotate 2
|
||||
size +4096k
|
||||
+ hourly
|
||||
}
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -1,125 +0,0 @@
|
||||
From 91c37cec1639c79b2b5ddcd6b173b4d7aa0ce9db Mon Sep 17 00:00:00 2001
|
||||
From: jinsaihang <jinsaihang@h-partners.com>
|
||||
Date: Wed, 16 Oct 2024 14:51:24 +0800
|
||||
Subject: [PATCH] optimize log printing
|
||||
|
||||
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
|
||||
---
|
||||
src/python/syssentry/alarm.py | 53 ++++++++++++++++---------------
|
||||
src/python/syssentry/load_mods.py | 15 +++++----
|
||||
2 files changed, 35 insertions(+), 33 deletions(-)
|
||||
|
||||
diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
|
||||
index bff527c..c3f2ee1 100644
|
||||
--- a/src/python/syssentry/alarm.py
|
||||
+++ b/src/python/syssentry/alarm.py
|
||||
@@ -76,16 +76,26 @@ def update_alarm_list(alarm_info: Xalarm):
|
||||
finally:
|
||||
alarm_list_lock.release()
|
||||
|
||||
-def check_alarm_id_if_number(alarm_id):
|
||||
- if isinstance(alarm_id, int):
|
||||
- return True
|
||||
- else:
|
||||
+def validate_alarm_id(alarm_id):
|
||||
+ if alarm_id is None:
|
||||
+ return False
|
||||
+ try:
|
||||
+ alarm_id = int(alarm_id)
|
||||
+ if MIN_ALARM_ID <= alarm_id <= MAX_ALARM_ID:
|
||||
+ return True
|
||||
+ else:
|
||||
+ return False
|
||||
+ except ValueError:
|
||||
return False
|
||||
|
||||
-def check_alarm_clear_time_if_positive_integer(alarm_clear_time):
|
||||
- if isinstance(alarm_clear_time, int) and alarm_clear_time > 0:
|
||||
- return True
|
||||
- else:
|
||||
+def validate_alarm_clear_time(alarm_clear_time):
|
||||
+ try:
|
||||
+ alarm_clear_time = int(alarm_clear_time)
|
||||
+ if alarm_clear_time > 0 and alarm_clear_time <= sys.maxsize:
|
||||
+ return True
|
||||
+ else:
|
||||
+ return False
|
||||
+ except ValueError:
|
||||
return False
|
||||
|
||||
def alarm_register():
|
||||
@@ -93,34 +103,25 @@ def alarm_register():
|
||||
# 初始化告警ID映射字典、告警老化时间字典
|
||||
for task_type in TasksMap.tasks_dict:
|
||||
for task_name in TasksMap.tasks_dict[task_type]:
|
||||
- logging.info(f"alarm_register: {task_name} is registered")
|
||||
task = TasksMap.tasks_dict[task_type][task_name]
|
||||
- alarm_id = task.alarm_id
|
||||
- if not check_alarm_id_if_number(alarm_id):
|
||||
- logging.warning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm")
|
||||
+ if not validate_alarm_id(task.alarm_id):
|
||||
+ logging.warning(f"Invalid alarm_id {task.alarm_id}: ignore {task_name} alarm")
|
||||
continue
|
||||
- if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID:
|
||||
- logging.warning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm")
|
||||
+ if not validate_alarm_clear_time(task.alarm_clear_time):
|
||||
+ logging.warning(f"Invalid alarm_clear_time {task.alarm_clear_time}: ignore {task_name} alarm")
|
||||
continue
|
||||
+ task.alarm_id = int(task.alarm_id)
|
||||
+ task.alarm_clear_time = int(task.alarm_clear_time)
|
||||
+ alarm_id = task.alarm_id
|
||||
alarm_clear_time = task.alarm_clear_time
|
||||
- if not check_alarm_clear_time_if_positive_integer(alarm_clear_time):
|
||||
- logging.warning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm")
|
||||
- continue
|
||||
- try:
|
||||
- alarm_clear_time = int(alarm_clear_time)
|
||||
- if alarm_clear_time <= 0:
|
||||
- raise ValueError("Not a positive integer")
|
||||
- if alarm_clear_time > sys.maxsize:
|
||||
- raise ValueError("Exceeds maximum value for int")
|
||||
- except (ValueError, OverflowError, TypeError) as e:
|
||||
- logging.warning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm")
|
||||
- continue
|
||||
+
|
||||
alarm_list_dict[alarm_id] = []
|
||||
task_alarm_id_dict[task_name] = alarm_id
|
||||
if alarm_id not in alarm_id_clear_time_dict:
|
||||
alarm_id_clear_time_dict[alarm_id] = alarm_clear_time
|
||||
else:
|
||||
alarm_id_clear_time_dict[alarm_id] = max(alarm_clear_time, alarm_id_clear_time_dict[alarm_id])
|
||||
+ logging.info(f"alarm_register: {task_name} is registered")
|
||||
# 注册告警回调
|
||||
id_filter = [True] * 128
|
||||
clientId = xalarm_register(update_alarm_list, id_filter)
|
||||
diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py
|
||||
index f74f165..78db446 100644
|
||||
--- a/src/python/syssentry/load_mods.py
|
||||
+++ b/src/python/syssentry/load_mods.py
|
||||
@@ -198,15 +198,16 @@ def parse_mod_conf(mod_name, mod_conf):
|
||||
task.load_enabled = is_enabled
|
||||
|
||||
try:
|
||||
- task.alarm_id = int(mod_conf.get(CONF_TASK, CONF_ALARM_ID))
|
||||
- task.alarm_clear_time = int(mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME))
|
||||
- if not (MIN_ALARM_ID <= task.alarm_id <= MAX_ALARM_ID):
|
||||
- raise ValueError("Invalid alarm_id")
|
||||
- except ValueError:
|
||||
task.alarm_id = mod_conf.get(CONF_TASK, CONF_ALARM_ID)
|
||||
- task.alarm_clear_time = mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME)
|
||||
except configparser.NoOptionError:
|
||||
- logging.warning("Unset alarm_clear_time, use 15s as default")
|
||||
+ task.alarm_id = None
|
||||
+ logging.warning(f"{mod_name} alarm_id not set, alarm_id is None")
|
||||
+
|
||||
+ if task.alarm_id is not None:
|
||||
+ try:
|
||||
+ task.alarm_clear_time = mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME)
|
||||
+ except configparser.NoOptionError:
|
||||
+ logging.warning(f"{mod_name} not set alarm_clear_time, use 15s as default")
|
||||
|
||||
if CONF_ONSTART in mod_conf.options(CONF_TASK):
|
||||
is_onstart = (mod_conf.get(CONF_TASK, CONF_ONSTART) == 'yes')
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,77 +0,0 @@
|
||||
From cb3d0ea18eed3d48f2753f878d9726f58fe616b1 Mon Sep 17 00:00:00 2001
|
||||
From: shixuantong <shixuantong1@huawei.com>
|
||||
Date: Sat, 21 Sep 2024 09:53:42 +0800
|
||||
Subject: [PATCH] optimize the handing of cat-cli error msg in cpu_sentry
|
||||
|
||||
---
|
||||
src/python/syssentry/cpu_sentry.py | 36 +++++++++++++++++-------------
|
||||
1 file changed, 21 insertions(+), 15 deletions(-)
|
||||
|
||||
diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py
|
||||
index 99af127..582d4b3 100644
|
||||
--- a/src/python/syssentry/cpu_sentry.py
|
||||
+++ b/src/python/syssentry/cpu_sentry.py
|
||||
@@ -26,6 +26,8 @@ CPU_SENTRY_PARAM_CONFIG = "/etc/sysSentry/plugins/cpu_sentry.ini"
|
||||
# Inspection commands running at the bottom layer
|
||||
LOW_LEVEL_INSPECT_CMD = "cat-cli"
|
||||
|
||||
+# max length of msg in details
|
||||
+DETAILS_LOG_MSG_MAX_LEN = 255
|
||||
|
||||
class CpuSentry:
|
||||
"""
|
||||
@@ -94,22 +96,10 @@ class CpuSentry:
|
||||
self.send_result["details"]["msg"] = "cpu_sentry task is killed!"
|
||||
return
|
||||
|
||||
- if "ERROR" in stdout:
|
||||
- self.send_result["result"] = ResultLevel.FAIL
|
||||
- self.send_result["details"]["code"] = 1004
|
||||
-
|
||||
- # Remove ANSI escape sequences
|
||||
- error_info = stdout.split("\n")[0]
|
||||
- if error_info.startswith("\u001b"):
|
||||
- ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])'
|
||||
- error_info = re.sub(ansi_escape, '', error_info)
|
||||
-
|
||||
- self.send_result["details"]["msg"] = error_info
|
||||
- return
|
||||
-
|
||||
out_split = stdout.split("\n")
|
||||
- isolated_cores_number = 0
|
||||
+ isolated_cores_number = -1
|
||||
found_fault_cores_list = []
|
||||
+ error_msg_list = []
|
||||
for out_line_i in out_split:
|
||||
if "handle_patrol_result: Found fault cores" in out_line_i:
|
||||
cores_number_tmp = out_line_i.split("Found fault cores:")[1]
|
||||
@@ -121,9 +111,25 @@ class CpuSentry:
|
||||
elif out_line_i.startswith('<ISOLATED-CORE-LIST>'):
|
||||
self.send_result["details"]["isolated_cpu_list"] = out_line_i.split(':')[1]
|
||||
break
|
||||
+ elif "ERROR" in out_line_i:
|
||||
+ logging.error("[cat-cli error] - %s\n", out_line_i)
|
||||
+ error_msg_list.append(out_line_i)
|
||||
|
||||
found_fault_cores_number = len(set(found_fault_cores_list))
|
||||
- if found_fault_cores_number == 0:
|
||||
+ if isolated_cores_number == -1:
|
||||
+ self.send_result["result"] = ResultLevel.FAIL
|
||||
+ self.send_result["details"]["code"] = 1004
|
||||
+
|
||||
+ send_error_msg = ""
|
||||
+ # Remove ANSI escape sequences
|
||||
+ for error_info in error_msg_list:
|
||||
+ if error_info.startswith("\u001b"):
|
||||
+ ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])'
|
||||
+ error_info = re.sub(ansi_escape, '', error_info)
|
||||
+ if len(send_error_msg) + len(error_info) < DETAILS_LOG_MSG_MAX_LEN:
|
||||
+ send_error_msg += error_info
|
||||
+ self.send_result["details"]["msg"] = send_error_msg
|
||||
+ elif found_fault_cores_number == 0:
|
||||
self.send_result["details"]["code"] = 0
|
||||
self.send_result["result"] = ResultLevel.PASS
|
||||
elif 0 in found_fault_cores_list:
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,25 +0,0 @@
|
||||
From 3dda5f68db38b63b1e45a28558a9fcd341c1f945 Mon Sep 17 00:00:00 2001
|
||||
From: jwolf <523083921@qq.com>
|
||||
Date: Fri, 20 Sep 2024 15:59:40 +0800
|
||||
Subject: [PATCH] should be warn-level log
|
||||
|
||||
---
|
||||
src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c
|
||||
index 9f8d80c..f4f3172 100644
|
||||
--- a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c
|
||||
+++ b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c
|
||||
@@ -23,7 +23,7 @@ static cat_return_t insert_core_to_list(core_list_st *core_list, int coreid)
|
||||
return CAT_OK;
|
||||
}
|
||||
if ((core_list->current_nums == MAX_ISOLATE_CORES_PER_PATROL) || (coreid < 0)) {
|
||||
- CAT_LOG_E("Insert error, core id(%d)", coreid);
|
||||
+ CAT_LOG_W("Too many cores need to isolate,do not isolate core(%d)", coreid);
|
||||
return CAT_ERR;
|
||||
}
|
||||
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,23 +0,0 @@
|
||||
From 34febf57060060d1f8262941af49e3beeb1f7f5d Mon Sep 17 00:00:00 2001
|
||||
From: jwolf <523083921@qq.com>
|
||||
Date: Fri, 30 Aug 2024 16:59:56 +0800
|
||||
Subject: [PATCH] param must be integer
|
||||
|
||||
---
|
||||
src/c/catcli/catlib/cli_param_checker.c | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
diff --git a/src/c/catcli/catlib/cli_param_checker.c b/src/c/catcli/catlib/cli_param_checker.c
|
||||
index 5b38402..71edf17 100644
|
||||
--- a/src/c/catcli/catlib/cli_param_checker.c
|
||||
+++ b/src/c/catcli/catlib/cli_param_checker.c
|
||||
@@ -17,6 +17,7 @@ void checkset_cpu_usage_percentage(char *getopt_optarg, catcli_request_body *p_r
|
||||
if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX || strchr(getopt_optarg, '.') != NULL) {
|
||||
strncpy(errs->patrol_module_err,
|
||||
"\"cpu_utility \" must be an integer greater in the range (0,100],correct \"-u, --cpu_utility\"\n", MAX_ERR_LEN);
|
||||
+ p_request_body->cpu_utility = 0;
|
||||
} else {
|
||||
p_request_body->cpu_utility = (int)cpu_utility;
|
||||
}
|
||||
--
|
||||
Gitee
|
||||
@ -1,91 +0,0 @@
|
||||
From 7fa9e80531bb3d4fa587e5fb7a99e3af59feda7e Mon Sep 17 00:00:00 2001
|
||||
From: jinsaihang <jinsaihang@h-partners.com>
|
||||
Date: Sat, 12 Oct 2024 16:51:37 +0800
|
||||
Subject: [PATCH] precise alarm query time
|
||||
|
||||
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
|
||||
---
|
||||
sysSentry-1.0.2/src/python/syssentry/alarm.py | 25 +++++++++++++++++--
|
||||
.../src/python/syssentry/load_mods.py | 3 ++-
|
||||
2 files changed, 25 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py
|
||||
index 43c1065..d012901 100644
|
||||
--- a/src/python/syssentry/alarm.py
|
||||
+++ b/src/python/syssentry/alarm.py
|
||||
@@ -76,6 +76,18 @@ def update_alarm_list(alarm_info: Xalarm):
|
||||
finally:
|
||||
alarm_list_lock.release()
|
||||
|
||||
+def check_alarm_id_if_number(alarm_id):
|
||||
+ if isinstance(alarm_id, int):
|
||||
+ return True
|
||||
+ else:
|
||||
+ return False
|
||||
+
|
||||
+def check_alarm_clear_time_if_positive_integer(alarm_clear_time):
|
||||
+ if isinstance(alarm_clear_time, int) and alarm_clear_time > 0:
|
||||
+ return True
|
||||
+ else:
|
||||
+ return False
|
||||
+
|
||||
def alarm_register():
|
||||
logging.debug(f"alarm_register: enter")
|
||||
# 初始化告警ID映射字典、告警老化时间字典
|
||||
@@ -84,10 +96,16 @@ def alarm_register():
|
||||
logging.info(f"alarm_register: {task_name} is registered")
|
||||
task = TasksMap.tasks_dict[task_type][task_name]
|
||||
alarm_id = task.alarm_id
|
||||
+ if not check_alarm_id_if_number(alarm_id):
|
||||
+ logging.warnning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm")
|
||||
+ continue
|
||||
if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID:
|
||||
logging.warnning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm")
|
||||
continue
|
||||
alarm_clear_time = task.alarm_clear_time
|
||||
+ if not check_alarm_clear_time_if_positive_integer(alarm_clear_time):
|
||||
+ logging.warnning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm")
|
||||
+ continue
|
||||
try:
|
||||
alarm_clear_time = int(alarm_clear_time)
|
||||
if alarm_clear_time <= 0:
|
||||
@@ -119,6 +137,9 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di
|
||||
logging.debug("task_name does not exist")
|
||||
return []
|
||||
alarm_id = task_alarm_id_dict[task_name]
|
||||
+ clear_time = alarm_id_clear_time_dict[alarm_id]
|
||||
+ if clear_time < int(time_range):
|
||||
+ return []
|
||||
if alarm_id not in alarm_list_dict:
|
||||
logging.debug("alarm_id does not exist")
|
||||
return []
|
||||
@@ -126,10 +147,10 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di
|
||||
logging.debug(f"get_alarm_result: alarm_list of {alarm_id} has {len(alarm_list)} elements")
|
||||
# clear alarm_info older than clear time threshold
|
||||
stop_index = -1
|
||||
- timestamp = int(datetime.now().timestamp())
|
||||
+ timestamp = datetime.now().timestamp()
|
||||
for i in range(len(alarm_list)):
|
||||
logging.debug(f"timestamp, alarm_list[{i}].timestamp: {timestamp}, {xalarm_gettime(alarm_list[i])}")
|
||||
- if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > int(time_range):
|
||||
+ if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > time_range:
|
||||
stop_index = i
|
||||
break
|
||||
if stop_index >= 0:
|
||||
diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py
|
||||
index 7daf17d..f74f165 100644
|
||||
--- a/src/python/syssentry/load_mods.py
|
||||
+++ b/src/python/syssentry/load_mods.py
|
||||
@@ -203,7 +203,8 @@ def parse_mod_conf(mod_name, mod_conf):
|
||||
if not (MIN_ALARM_ID <= task.alarm_id <= MAX_ALARM_ID):
|
||||
raise ValueError("Invalid alarm_id")
|
||||
except ValueError:
|
||||
- logging.warning("Invalid alarm_id")
|
||||
+ task.alarm_id = mod_conf.get(CONF_TASK, CONF_ALARM_ID)
|
||||
+ task.alarm_clear_time = mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME)
|
||||
except configparser.NoOptionError:
|
||||
logging.warning("Unset alarm_clear_time, use 15s as default")
|
||||
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,566 +0,0 @@
|
||||
From d5cb115a97e27c8270e8fb385fb3914af9ba3c34 Mon Sep 17 00:00:00 2001
|
||||
From: gaoruoshu <gaoruoshu@huawei.com>
|
||||
Date: Tue, 15 Oct 2024 10:00:07 +0000
|
||||
Subject: [PATCH] refactor config.py and bugfix uncorrect slow io report
|
||||
|
||||
Signed-off-by: gaoruoshu <gaoruoshu@huawei.com>
|
||||
---
|
||||
.../avg_block_io/avg_block_io.py | 155 ++-----------
|
||||
.../sentryPlugins/avg_block_io/config.py | 208 ++++++++++++++++++
|
||||
.../sentryPlugins/avg_block_io/module_conn.py | 9 +-
|
||||
.../sentryPlugins/avg_block_io/utils.py | 72 ------
|
||||
4 files changed, 238 insertions(+), 206 deletions(-)
|
||||
create mode 100644 src/python/sentryPlugins/avg_block_io/config.py
|
||||
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
index f3ade09..cd47919 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py
|
||||
@@ -13,132 +13,13 @@ import signal
|
||||
import configparser
|
||||
import time
|
||||
|
||||
+from .config import read_config_log, read_config_common, read_config_algorithm, read_config_latency, read_config_iodump, read_config_stage
|
||||
from .stage_window import IoWindow, IoDumpWindow
|
||||
from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name
|
||||
-from .utils import update_avg_and_check_abnormal, get_log_level, get_section_value
|
||||
-from sentryCollector.collect_plugin import Disk_Type
|
||||
+from .utils import update_avg_and_check_abnormal
|
||||
|
||||
CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini"
|
||||
|
||||
-def log_invalid_keys(not_in_list, keys_name, config_list, default_list):
|
||||
- """print invalid log"""
|
||||
- if config_list and not_in_list:
|
||||
- logging.warning("{} in common.{} are not valid, set {}={}".format(not_in_list, keys_name, keys_name, default_list))
|
||||
- elif config_list == ["default"]:
|
||||
- logging.warning("Default {} use {}".format(keys_name, default_list))
|
||||
-
|
||||
-
|
||||
-def read_config_common(config):
|
||||
- """read config file, get [common] section value"""
|
||||
- if not config.has_section("common"):
|
||||
- report_alarm_fail("Cannot find common section in config file")
|
||||
-
|
||||
- try:
|
||||
- disk_name = config.get("common", "disk")
|
||||
- disk = [] if disk_name == "default" else disk_name.split(",")
|
||||
- except configparser.NoOptionError:
|
||||
- disk = []
|
||||
- logging.warning("Unset common.disk, set to default")
|
||||
-
|
||||
- try:
|
||||
- stage_name = config.get("common", "stage")
|
||||
- stage = [] if stage_name == "default" else stage_name.split(",")
|
||||
- except configparser.NoOptionError:
|
||||
- stage = []
|
||||
- logging.warning("Unset common.stage, set to default")
|
||||
-
|
||||
- if len(disk) > 10:
|
||||
- logging.warning("Too many common.disks, record only max 10 disks")
|
||||
- disk = disk[:10]
|
||||
-
|
||||
- try:
|
||||
- iotype_name = config.get("common", "iotype").split(",")
|
||||
- iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write']]
|
||||
- err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write']]
|
||||
-
|
||||
- if err_iotype:
|
||||
- report_alarm_fail("Invalid common.iotype config")
|
||||
-
|
||||
- except configparser.NoOptionError:
|
||||
- iotype_list = ["read", "write"]
|
||||
- logging.warning("Unset common.iotype, set to read,write")
|
||||
-
|
||||
- try:
|
||||
- period_time = int(config.get("common", "period_time"))
|
||||
- if not (1 <= period_time <= 300):
|
||||
- raise ValueError("Invalid period_time")
|
||||
- except ValueError:
|
||||
- report_alarm_fail("Invalid common.period_time")
|
||||
- except configparser.NoOptionError:
|
||||
- period_time = 1
|
||||
- logging.warning("Unset common.period_time, use 1s as default")
|
||||
-
|
||||
- return period_time, disk, stage, iotype_list
|
||||
-
|
||||
-
|
||||
-def read_config_algorithm(config):
|
||||
- """read config file, get [algorithm] section value"""
|
||||
- if not config.has_section("algorithm"):
|
||||
- report_alarm_fail("Cannot find algorithm section in config file")
|
||||
-
|
||||
- try:
|
||||
- win_size = int(config.get("algorithm", "win_size"))
|
||||
- if not (1 <= win_size <= 300):
|
||||
- raise ValueError("Invalid algorithm.win_size")
|
||||
- except ValueError:
|
||||
- report_alarm_fail("Invalid algorithm.win_size config")
|
||||
- except configparser.NoOptionError:
|
||||
- win_size = 30
|
||||
- logging.warning("Unset algorithm.win_size, use 30 as default")
|
||||
-
|
||||
- try:
|
||||
- win_threshold = int(config.get("algorithm", "win_threshold"))
|
||||
- if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size:
|
||||
- raise ValueError("Invalid algorithm.win_threshold")
|
||||
- except ValueError:
|
||||
- report_alarm_fail("Invalid algorithm.win_threshold config")
|
||||
- except configparser.NoOptionError:
|
||||
- win_threshold = 6
|
||||
- logging.warning("Unset algorithm.win_threshold, use 6 as default")
|
||||
-
|
||||
- return win_size, win_threshold
|
||||
-
|
||||
-
|
||||
-def read_config_latency(config):
|
||||
- """read config file, get [latency_xxx] section value"""
|
||||
- common_param = {}
|
||||
- for type_name in Disk_Type:
|
||||
- section_name = f"latency_{Disk_Type[type_name]}"
|
||||
- if not config.has_section(section_name):
|
||||
- report_alarm_fail(f"Cannot find {section_name} section in config file")
|
||||
-
|
||||
- common_param[Disk_Type[type_name]] = get_section_value(section_name, config)
|
||||
- return common_param
|
||||
-
|
||||
-
|
||||
-def read_config_iodump(config):
|
||||
- """read config file, get [iodump] section value"""
|
||||
- common_param = {}
|
||||
- section_name = "iodump"
|
||||
- if not config.has_section(section_name):
|
||||
- report_alarm_fail(f"Cannot find {section_name} section in config file")
|
||||
-
|
||||
- return get_section_value(section_name, config)
|
||||
-
|
||||
-
|
||||
-def read_config_stage(config, stage, iotype_list, curr_disk_type):
|
||||
- """read config file, get [STAGE_NAME_diskType] section value"""
|
||||
- res = {}
|
||||
- section_name = f"{stage}_{curr_disk_type}"
|
||||
- if not config.has_section(section_name):
|
||||
- return res
|
||||
-
|
||||
- for key in config[section_name]:
|
||||
- if config[stage][key].isdecimal():
|
||||
- res[key] = int(config[stage][key])
|
||||
-
|
||||
- return res
|
||||
-
|
||||
|
||||
def init_io_win(io_dic, config, common_param):
|
||||
"""initialize windows of latency, iodump, and dict of avg_value"""
|
||||
@@ -192,24 +73,33 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage):
|
||||
disk_list = [key for key in all_disk_set if key in config_disk]
|
||||
not_in_disk_list = [key for key in config_disk if key not in all_disk_set]
|
||||
|
||||
+ if not config_disk and not not_in_disk_list:
|
||||
+ disk_list = [key for key in all_disk_set]
|
||||
+
|
||||
+ if not disk_list:
|
||||
+ report_alarm_fail("Cannot get valid disk name")
|
||||
+
|
||||
+ disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list
|
||||
+
|
||||
+ if not config_disk:
|
||||
+ logging.info(f"Default common.disk using disk={disk_list}")
|
||||
+ elif sorted(disk_list) != sorted(config_disk):
|
||||
+ logging.warning(f"Set common.disk to {disk_list}")
|
||||
+
|
||||
stage_list = [key for key in all_stage_set if key in config_stage]
|
||||
not_in_stage_list = [key for key in config_stage if key not in all_stage_set]
|
||||
|
||||
if not_in_stage_list:
|
||||
report_alarm_fail(f"Invalid common.stage_list config, cannot set {not_in_stage_list}")
|
||||
|
||||
- if not config_disk and not not_in_disk_list:
|
||||
- disk_list = [key for key in all_disk_set]
|
||||
-
|
||||
- if not config_stage and not not_in_stage_list:
|
||||
+ if not config_stage:
|
||||
stage_list = [key for key in all_stage_set]
|
||||
|
||||
- disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list
|
||||
-
|
||||
- if not stage_list or not disk_list:
|
||||
- report_alarm_fail("Cannot get valid disk name or stage name.")
|
||||
+ if not stage_list:
|
||||
+ report_alarm_fail("Cannot get valid stage name.")
|
||||
|
||||
- log_invalid_keys(not_in_disk_list, 'disk', config_disk, disk_list)
|
||||
+ if not config_stage:
|
||||
+ logging.info(f"Default common.stage using stage={stage_list}")
|
||||
|
||||
return disk_list, stage_list
|
||||
|
||||
@@ -254,9 +144,8 @@ def main():
|
||||
signal.signal(signal.SIGINT, sig_handler)
|
||||
signal.signal(signal.SIGTERM, sig_handler)
|
||||
|
||||
- log_level = get_log_level(CONFIG_FILE)
|
||||
+ log_level = read_config_log(CONFIG_FILE)
|
||||
log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
|
||||
-
|
||||
logging.basicConfig(level=log_level, format=log_format)
|
||||
|
||||
# 初始化配置读取
|
||||
@@ -274,6 +163,8 @@ def main():
|
||||
# 采集模块对接,is_iocollect_valid()
|
||||
io_dic["disk_list"], io_dic["stage_list"] = get_valid_disk_stage_list(io_dic, disk, stage)
|
||||
|
||||
+ logging.debug(f"disk={io_dic['disk_list']}, stage={io_dic['stage_list']}")
|
||||
+
|
||||
if "bio" not in io_dic["stage_list"]:
|
||||
report_alarm_fail("Cannot run avg_block_io without bio stage")
|
||||
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/config.py b/src/python/sentryPlugins/avg_block_io/config.py
|
||||
new file mode 100644
|
||||
index 0000000..c8f45ce
|
||||
--- /dev/null
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/config.py
|
||||
@@ -0,0 +1,208 @@
|
||||
+import configparser
|
||||
+import logging
|
||||
+import os
|
||||
+
|
||||
+from .module_conn import report_alarm_fail
|
||||
+from sentryCollector.collect_plugin import Disk_Type
|
||||
+
|
||||
+
|
||||
+CONF_LOG = 'log'
|
||||
+CONF_LOG_LEVEL = 'level'
|
||||
+LogLevel = {
|
||||
+ "debug": logging.DEBUG,
|
||||
+ "info": logging.INFO,
|
||||
+ "warning": logging.WARNING,
|
||||
+ "error": logging.ERROR,
|
||||
+ "critical": logging.CRITICAL
|
||||
+}
|
||||
+
|
||||
+CONF_COMMON = 'common'
|
||||
+CONF_COMMON_DISK = 'disk'
|
||||
+CONF_COMMON_STAGE = 'stage'
|
||||
+CONF_COMMON_IOTYPE = 'iotype'
|
||||
+CONF_COMMON_PER_TIME = 'period_time'
|
||||
+
|
||||
+CONF_ALGO = 'algorithm'
|
||||
+CONF_ALGO_SIZE = 'win_size'
|
||||
+CONF_ALGO_THRE = 'win_threshold'
|
||||
+
|
||||
+CONF_LATENCY = 'latency_{}'
|
||||
+CONF_IODUMP = 'iodump'
|
||||
+
|
||||
+
|
||||
+DEFAULT_PARAM = {
|
||||
+ CONF_LOG: {
|
||||
+ CONF_LOG_LEVEL: 'info'
|
||||
+ }, CONF_COMMON: {
|
||||
+ CONF_COMMON_DISK: 'default',
|
||||
+ CONF_COMMON_STAGE: 'default',
|
||||
+ CONF_COMMON_IOTYPE: 'read,write',
|
||||
+ CONF_COMMON_PER_TIME: 1
|
||||
+ }, CONF_ALGO: {
|
||||
+ CONF_ALGO_SIZE: 30,
|
||||
+ CONF_ALGO_THRE: 6
|
||||
+ }, 'latency_nvme_ssd': {
|
||||
+ 'read_avg_lim': 300,
|
||||
+ 'write_avg_lim': 300,
|
||||
+ 'read_avg_time': 3,
|
||||
+ 'write_avg_time': 3,
|
||||
+ 'read_tot_lim': 500,
|
||||
+ 'write_tot_lim': 500,
|
||||
+ }, 'latency_sata_ssd' : {
|
||||
+ 'read_avg_lim': 10000,
|
||||
+ 'write_avg_lim': 10000,
|
||||
+ 'read_avg_time': 3,
|
||||
+ 'write_avg_time': 3,
|
||||
+ 'read_tot_lim': 50000,
|
||||
+ 'write_tot_lim': 50000,
|
||||
+ }, 'latency_sata_hdd' : {
|
||||
+ 'read_avg_lim': 15000,
|
||||
+ 'write_avg_lim': 15000,
|
||||
+ 'read_avg_time': 3,
|
||||
+ 'write_avg_time': 3,
|
||||
+ 'read_tot_lim': 50000,
|
||||
+ 'write_tot_lim': 50000
|
||||
+ }, CONF_IODUMP: {
|
||||
+ 'read_iodump_lim': 0,
|
||||
+ 'write_iodump_lim': 0
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+
|
||||
+def get_section_value(section_name, config):
|
||||
+ common_param = {}
|
||||
+ config_sec = config[section_name]
|
||||
+ for config_key in DEFAULT_PARAM[section_name]:
|
||||
+ if config_key in config_sec:
|
||||
+ if not config_sec[config_key].isdecimal():
|
||||
+ report_alarm_fail(f"Invalid {section_name}.{config_key} config.")
|
||||
+ common_param[config_key] = int(config_sec[config_key])
|
||||
+ else:
|
||||
+ common_param[config_key] = DEFAULT_PARAM[section_name][config_key]
|
||||
+ logging.warning(f"Unset {section_name}.{config_key} in config file, use {common_param[config_key]} as default")
|
||||
+ return common_param
|
||||
+
|
||||
+
|
||||
+def read_config_log(filename):
|
||||
+ """read config file, get [log] section value"""
|
||||
+ default_log_level = DEFAULT_PARAM[CONF_LOG][CONF_LOG_LEVEL]
|
||||
+ if not os.path.exists(filename):
|
||||
+ return LogLevel.get(default_log_level)
|
||||
+
|
||||
+ config = configparser.ConfigParser()
|
||||
+ config.read(filename)
|
||||
+
|
||||
+ log_level = config.get(CONF_LOG, CONF_LOG_LEVEL, fallback=default_log_level)
|
||||
+ if log_level.lower() in LogLevel:
|
||||
+ return LogLevel.get(log_level.lower())
|
||||
+ return LogLevel.get(default_log_level)
|
||||
+
|
||||
+
|
||||
+def read_config_common(config):
|
||||
+ """read config file, get [common] section value"""
|
||||
+ if not config.has_section(CONF_COMMON):
|
||||
+ report_alarm_fail(f"Cannot find {CONF_COMMON} section in config file")
|
||||
+
|
||||
+ try:
|
||||
+ disk_name = config.get(CONF_COMMON, CONF_COMMON_DISK).lower()
|
||||
+ disk = [] if disk_name == "default" else disk_name.split(",")
|
||||
+ except configparser.NoOptionError:
|
||||
+ disk = []
|
||||
+ logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_DISK}, set to default")
|
||||
+
|
||||
+ try:
|
||||
+ stage_name = config.get(CONF_COMMON, CONF_COMMON_STAGE).lower()
|
||||
+ stage = [] if stage_name == "default" else stage_name.split(",")
|
||||
+ except configparser.NoOptionError:
|
||||
+ stage = []
|
||||
+ logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_STAGE}, set to default")
|
||||
+
|
||||
+ if len(disk) > 10:
|
||||
+ logging.warning(f"Too many {CONF_COMMON}.disks, record only max 10 disks")
|
||||
+ disk = disk[:10]
|
||||
+
|
||||
+ try:
|
||||
+ iotype_name = config.get(CONF_COMMON, CONF_COMMON_IOTYPE).lower().split(",")
|
||||
+ iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write']]
|
||||
+ err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write']]
|
||||
+
|
||||
+ if err_iotype:
|
||||
+ report_alarm_fail(f"Invalid {CONF_COMMON}.{CONF_COMMON_IOTYPE} config")
|
||||
+
|
||||
+ except configparser.NoOptionError:
|
||||
+ iotype_list = DEFAULT_PARAM[CONF_COMMON][CONF_COMMON_IOTYPE]
|
||||
+ logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_IOTYPE}, use {iotype_list} as default")
|
||||
+
|
||||
+ try:
|
||||
+ period_time = int(config.get(CONF_COMMON, CONF_COMMON_PER_TIME))
|
||||
+ if not (1 <= period_time <= 300):
|
||||
+ raise ValueError("Invalid period_time")
|
||||
+ except ValueError:
|
||||
+ report_alarm_fail(f"Invalid {CONF_COMMON}.{CONF_COMMON_PER_TIME}")
|
||||
+ except configparser.NoOptionError:
|
||||
+ period_time = DEFAULT_PARAM[CONF_COMMON][CONF_COMMON_PER_TIME]
|
||||
+ logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_PER_TIME}, use {period_time} as default")
|
||||
+
|
||||
+ return period_time, disk, stage, iotype_list
|
||||
+
|
||||
+
|
||||
+def read_config_algorithm(config):
|
||||
+ """read config file, get [algorithm] section value"""
|
||||
+ if not config.has_section(CONF_ALGO):
|
||||
+ report_alarm_fail(f"Cannot find {CONF_ALGO} section in config file")
|
||||
+
|
||||
+ try:
|
||||
+ win_size = int(config.get(CONF_ALGO, CONF_ALGO_SIZE))
|
||||
+ if not (1 <= win_size <= 300):
|
||||
+ raise ValueError(f"Invalid {CONF_ALGO}.{CONF_ALGO_SIZE}")
|
||||
+ except ValueError:
|
||||
+ report_alarm_fail(f"Invalid {CONF_ALGO}.{CONF_ALGO_SIZE} config")
|
||||
+ except configparser.NoOptionError:
|
||||
+ win_size = DEFAULT_PARAM[CONF_ALGO][CONF_ALGO_SIZE]
|
||||
+ logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_SIZE}, use {win_size} as default")
|
||||
+
|
||||
+ try:
|
||||
+ win_threshold = int(config.get(CONF_ALGO, CONF_ALGO_THRE))
|
||||
+ if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size:
|
||||
+ raise ValueError(f"Invalid {CONF_ALGO}.{CONF_ALGO_THRE}")
|
||||
+ except ValueError:
|
||||
+ report_alarm_fail(f"Invalid {CONF_ALGO}.{CONF_ALGO_THRE} config")
|
||||
+ except configparser.NoOptionError:
|
||||
+ win_threshold = DEFAULT_PARAM[CONF_ALGO]['win_threshold']
|
||||
+ logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_THRE}, use {win_threshold} as default")
|
||||
+
|
||||
+ return win_size, win_threshold
|
||||
+
|
||||
+
|
||||
+def read_config_latency(config):
|
||||
+ """read config file, get [latency_xxx] section value"""
|
||||
+ common_param = {}
|
||||
+ for type_name in Disk_Type:
|
||||
+ section_name = CONF_LATENCY.format(Disk_Type[type_name])
|
||||
+ if not config.has_section(section_name):
|
||||
+ report_alarm_fail(f"Cannot find {section_name} section in config file")
|
||||
+
|
||||
+ common_param[Disk_Type[type_name]] = get_section_value(section_name, config)
|
||||
+ return common_param
|
||||
+
|
||||
+
|
||||
+def read_config_iodump(config):
|
||||
+ """read config file, get [iodump] section value"""
|
||||
+ if not config.has_section(CONF_IODUMP):
|
||||
+ report_alarm_fail(f"Cannot find {CONF_IODUMP} section in config file")
|
||||
+
|
||||
+ return get_section_value(CONF_IODUMP, config)
|
||||
+
|
||||
+
|
||||
+def read_config_stage(config, stage, iotype_list, curr_disk_type):
|
||||
+ """read config file, get [STAGE_NAME_diskType] section value"""
|
||||
+ res = {}
|
||||
+ section_name = f"{stage}_{curr_disk_type}"
|
||||
+ if not config.has_section(section_name):
|
||||
+ return res
|
||||
+
|
||||
+ for key in config[section_name]:
|
||||
+ if config[stage][key].isdecimal():
|
||||
+ res[key] = int(config[stage][key])
|
||||
+
|
||||
+ return res
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||
index 8d6f429..cbdaad4 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py
|
||||
@@ -29,12 +29,16 @@ def sig_handler(signum, _f):
|
||||
|
||||
def avg_get_io_data(io_dic):
|
||||
"""get_io_data from sentryCollector"""
|
||||
+ logging.debug(f"send to sentryCollector get_io_data: period={io_dic['period_time']}, "
|
||||
+ f"disk={io_dic['disk_list']}, stage={io_dic['stage_list']}, iotype={io_dic['iotype_list']}")
|
||||
res = get_io_data(io_dic["period_time"], io_dic["disk_list"], io_dic["stage_list"], io_dic["iotype_list"])
|
||||
return check_result_validation(res, 'get io data')
|
||||
|
||||
|
||||
def avg_is_iocollect_valid(io_dic, config_disk, config_stage):
|
||||
"""is_iocollect_valid from sentryCollector"""
|
||||
+ logging.debug(f"send to sentryCollector is_iocollect_valid: period={io_dic['period_time']}, "
|
||||
+ f"disk={config_disk}, stage={config_stage}")
|
||||
res = is_iocollect_valid(io_dic["period_time"], config_disk, config_stage)
|
||||
return check_result_validation(res, 'check config validation')
|
||||
|
||||
@@ -79,7 +83,7 @@ def process_report_data(disk_name, rw, io_data):
|
||||
# io press
|
||||
ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq']
|
||||
for stage_name in ctrl_stage:
|
||||
- abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data)
|
||||
+ abnormal, abnormal_list = is_abnormal((disk_name, stage_name, rw), io_data)
|
||||
if not abnormal:
|
||||
continue
|
||||
msg["reason"] = "IO press"
|
||||
@@ -117,6 +121,7 @@ def process_report_data(disk_name, rw, io_data):
|
||||
|
||||
|
||||
def get_disk_type_by_name(disk_name):
|
||||
+ logging.debug(f"send to sentryCollector get_disk_type: disk_name={disk_name}")
|
||||
res = get_disk_type(disk_name)
|
||||
disk_type_str = check_result_validation(get_disk_type(disk_name), f'Invalid disk type {disk_name}')
|
||||
try:
|
||||
@@ -126,4 +131,4 @@ def get_disk_type_by_name(disk_name):
|
||||
except ValueError:
|
||||
report_alarm_fail(f"Failed to get disk type for {disk_name}")
|
||||
|
||||
- return Disk_Type[curr_disk_type]
|
||||
\ No newline at end of file
|
||||
+ return Disk_Type[curr_disk_type]
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py
|
||||
index c381c07..1bfd4e8 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/utils.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/utils.py
|
||||
@@ -8,84 +8,12 @@
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
|
||||
# PURPOSE.
|
||||
# See the Mulan PSL v2 for more details.
|
||||
-import configparser
|
||||
import logging
|
||||
import os
|
||||
|
||||
AVG_VALUE = 0
|
||||
AVG_COUNT = 1
|
||||
|
||||
-CONF_LOG = 'log'
|
||||
-CONF_LOG_LEVEL = 'level'
|
||||
-LogLevel = {
|
||||
- "debug": logging.DEBUG,
|
||||
- "info": logging.INFO,
|
||||
- "warning": logging.WARNING,
|
||||
- "error": logging.ERROR,
|
||||
- "critical": logging.CRITICAL
|
||||
-}
|
||||
-
|
||||
-
|
||||
-DEFAULT_PARAM = {
|
||||
- 'latency_nvme_ssd': {
|
||||
- 'read_avg_lim': 300,
|
||||
- 'write_avg_lim': 300,
|
||||
- 'read_avg_time': 3,
|
||||
- 'write_avg_time': 3,
|
||||
- 'read_tot_lim': 500,
|
||||
- 'write_tot_lim': 500,
|
||||
- }, 'latency_sata_ssd' : {
|
||||
- 'read_avg_lim': 10000,
|
||||
- 'write_avg_lim': 10000,
|
||||
- 'read_avg_time': 3,
|
||||
- 'write_avg_time': 3,
|
||||
- 'read_tot_lim': 50000,
|
||||
- 'write_tot_lim': 50000,
|
||||
- }, 'latency_sata_hdd' : {
|
||||
- 'read_avg_lim': 15000,
|
||||
- 'write_avg_lim': 15000,
|
||||
- 'read_avg_time': 3,
|
||||
- 'write_avg_time': 3,
|
||||
- 'read_tot_lim': 50000,
|
||||
- 'write_tot_lim': 50000
|
||||
- }, 'iodump': {
|
||||
- 'read_iodump_lim': 0,
|
||||
- 'write_iodump_lim': 0
|
||||
- }
|
||||
-}
|
||||
-
|
||||
-
|
||||
-def get_section_value(section_name, config):
|
||||
- common_param = {}
|
||||
- config_sec = config[section_name]
|
||||
- for config_key in DEFAULT_PARAM[section_name]:
|
||||
- if config_key in config_sec:
|
||||
- if not config_sec[config_key].isdecimal():
|
||||
- report_alarm_fail(f"Invalid {section_name}.{config_key} config.")
|
||||
- common_param[config_key] = int(config_sec[config_key])
|
||||
- else:
|
||||
- logging.warning(f"Unset {section_name}.{config_key} in config file, use {DEFAULT_PARAM[section_name][config_key]} as default")
|
||||
- common_param[config_key] = DEFAULT_PARAM[section_name][config_key]
|
||||
- return common_param
|
||||
-
|
||||
-
|
||||
-def get_log_level(filename):
|
||||
- if not os.path.exists(filename):
|
||||
- return logging.INFO
|
||||
-
|
||||
- try:
|
||||
- config = configparser.ConfigParser()
|
||||
- config.read(filename)
|
||||
- if not config.has_option(CONF_LOG, CONF_LOG_LEVEL):
|
||||
- return logging.INFO
|
||||
- log_level = config.get(CONF_LOG, CONF_LOG_LEVEL)
|
||||
-
|
||||
- if log_level.lower() in LogLevel:
|
||||
- return LogLevel.get(log_level.lower())
|
||||
- return logging.INFO
|
||||
- except configparser.Error:
|
||||
- return logging.INFO
|
||||
-
|
||||
|
||||
def get_nested_value(data, keys):
|
||||
"""get data from nested dict"""
|
||||
--
|
||||
2.27.0
|
||||
@ -1,92 +0,0 @@
|
||||
From d74076f4b772822de4f5bee1c8a778dd6b1771d2 Mon Sep 17 00:00:00 2001
|
||||
From: shixuantong <shixuantong1@huawei.com>
|
||||
Date: Wed, 11 Dec 2024 15:25:33 +0800
|
||||
Subject: [PATCH] set logrotate
|
||||
|
||||
---
|
||||
config/logrotate | 9 ---------
|
||||
config/logrotate-sysSentry.conf | 35 +++++++++++++++++++++++++++++++++
|
||||
src/sh/logrotate-sysSentry.cron | 13 ++++++++++++
|
||||
3 files changed, 48 insertions(+), 9 deletions(-)
|
||||
delete mode 100644 config/logrotate
|
||||
create mode 100644 config/logrotate-sysSentry.conf
|
||||
create mode 100644 src/sh/logrotate-sysSentry.cron
|
||||
|
||||
diff --git a/config/logrotate b/config/logrotate
|
||||
deleted file mode 100644
|
||||
index 3dc77f5..0000000
|
||||
--- a/config/logrotate
|
||||
+++ /dev/null
|
||||
@@ -1,9 +0,0 @@
|
||||
-/var/log/sysSentry/*.log {
|
||||
- compress
|
||||
- missingok
|
||||
- notifempty
|
||||
- copytruncate
|
||||
- rotate 2
|
||||
- size +4096k
|
||||
- hourly
|
||||
-}
|
||||
diff --git a/config/logrotate-sysSentry.conf b/config/logrotate-sysSentry.conf
|
||||
new file mode 100644
|
||||
index 0000000..cf5f994
|
||||
--- /dev/null
|
||||
+++ b/config/logrotate-sysSentry.conf
|
||||
@@ -0,0 +1,35 @@
|
||||
+# keep 4 hours worth of backlogs
|
||||
+rotate 4
|
||||
+
|
||||
+# create new (empty) log files after rotating old ones
|
||||
+create
|
||||
+
|
||||
+# compress log files
|
||||
+compress
|
||||
+
|
||||
+# if a log file does not exist, go on to the next one without an error msg
|
||||
+missingok
|
||||
+
|
||||
+# do not rotate the log if it is empty
|
||||
+notifempty
|
||||
+
|
||||
+copytruncate
|
||||
+
|
||||
+# ignore any following matches of a log file.
|
||||
+# Note that order is significant, it will not overwrite and take the first match.
|
||||
+# require logrotate >= 3.21.0
|
||||
+ignoreduplicates
|
||||
+
|
||||
+/var/log/sysSentry/sysSentry.log {
|
||||
+ rotate 8
|
||||
+ size +4096k
|
||||
+}
|
||||
+
|
||||
+/var/log/sysSentry/cpu_sentry.log {
|
||||
+ rotate 2
|
||||
+ size +2048k
|
||||
+}
|
||||
+
|
||||
+/var/log/sysSentry/*.log {
|
||||
+ size +4096k
|
||||
+}
|
||||
diff --git a/src/sh/logrotate-sysSentry.cron b/src/sh/logrotate-sysSentry.cron
|
||||
new file mode 100644
|
||||
index 0000000..64d02f9
|
||||
--- /dev/null
|
||||
+++ b/src/sh/logrotate-sysSentry.cron
|
||||
@@ -0,0 +1,13 @@
|
||||
+#!/bin/sh
|
||||
+
|
||||
+TMPF=`mktemp /tmp/logrotate-sysSentry.XXXXXXXXX`
|
||||
+
|
||||
+/usr/sbin/logrotate /etc/logrotate-sysSentry.conf -v --log=$TMPF -s /var/lib/logrotate-syssentry/logrotate.status
|
||||
+EXITVALUE=$?
|
||||
+if [ $EXITVALUE != 0 ]; then
|
||||
+ /bin/logger -t logrotate "ALERT exited abnormally with [$EXITVALUE], for details, see /var/log/sysSentry/logrotate.log"
|
||||
+ /bin/logger -t logrotate -f $TMPF
|
||||
+fi
|
||||
+rm -rf $TMPF
|
||||
+rm -rf /var/lib/logrotate-syssentry/logrotate.status
|
||||
+exit $EXITVALUE
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,38 +0,0 @@
|
||||
From 4abad77067557234d938de3914094c80181030c1 Mon Sep 17 00:00:00 2001
|
||||
From: jwolf <523083921@qq.com>
|
||||
Date: Fri, 30 Aug 2024 14:30:46 +0800
|
||||
Subject: [PATCH] must be integer
|
||||
|
||||
---
|
||||
c/catcli/catlib/cli_param_checker.c | 6 ++++--
|
||||
1 file changed, 4 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/c/catcli/catlib/cli_param_checker.c b/src/c/catcli/catlib/cli_param_checker.c
|
||||
index e400428..5b38402 100644
|
||||
--- a/src/c/catcli/catlib/cli_param_checker.c
|
||||
+++ b/src/c/catcli/catlib/cli_param_checker.c
|
||||
@@ -17,8 +17,9 @@ void checkset_cpu_usage_percentage(char *getopt_optarg, catcli_request_body *p_r
|
||||
if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX || strchr(getopt_optarg, '.') != NULL) {
|
||||
strncpy(errs->patrol_module_err,
|
||||
"\"cpu_utility \" must be an integer greater in the range (0,100],correct \"-u, --cpu_utility\"\n", MAX_ERR_LEN);
|
||||
+ } else {
|
||||
+ p_request_body->cpu_utility = (int)cpu_utility;
|
||||
}
|
||||
- p_request_body->cpu_utility = (int)cpu_utility;
|
||||
}
|
||||
|
||||
void checkset_cpulist(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs)
|
||||
@@ -73,8 +74,9 @@ void checkset_patrol_time(char *getopt_optarg, catcli_request_body *p_request_bo
|
||||
strncpy(errs->patrol_time_err,
|
||||
"\"patrol_second\" must be a number in the range of (0,INT_MAX] ,correct \"-t, --patrol_second\"\n",
|
||||
MAX_ERR_LEN);
|
||||
+ } else {
|
||||
+ p_request_body->patrol_second = (int)second;
|
||||
}
|
||||
- p_request_body->patrol_second = (int)second;
|
||||
}
|
||||
|
||||
void checkset_patrol_type(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs)
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,155 +0,0 @@
|
||||
From 3f6e4d12618597b5aab6b0633f1bda800526ea54 Mon Sep 17 00:00:00 2001
|
||||
From: gaoruoshu <gaoruoshu@huawei.com>
|
||||
Date: Wed, 14 Aug 2024 21:10:20 +0800
|
||||
Subject: [PATCH] split cpu_sentry and syssentry
|
||||
|
||||
---
|
||||
src/python/syssentry/cpu_alarm.py | 42 +++++++++++++++++++++++++
|
||||
src/python/syssentry/syssentry.py | 52 ++++++-------------------------
|
||||
2 files changed, 52 insertions(+), 42 deletions(-)
|
||||
|
||||
diff --git a/src/python/syssentry/cpu_alarm.py b/src/python/syssentry/cpu_alarm.py
|
||||
index d972c42..0b1642b 100644
|
||||
--- a/src/python/syssentry/cpu_alarm.py
|
||||
+++ b/src/python/syssentry/cpu_alarm.py
|
||||
@@ -1,6 +1,7 @@
|
||||
import re
|
||||
import math
|
||||
import logging
|
||||
+import socket
|
||||
from enum import Enum
|
||||
|
||||
from .utils import execute_command
|
||||
@@ -15,6 +16,12 @@ BINARY = 2
|
||||
MIN_DATA_LEN = 0
|
||||
MAX_DATA_LEN = 999
|
||||
|
||||
+PARAM_REP_LEN = 3
|
||||
+PARAM_TYPE_LEN = 1
|
||||
+PARAM_MODULE_LEN = 1
|
||||
+PARAM_TRANS_TO_LEN = 2
|
||||
+PARAM_DATA_LEN = 3
|
||||
+
|
||||
|
||||
class Type(Enum):
|
||||
CE = 0x00
|
||||
@@ -207,3 +214,38 @@ def check_fixed_param(data, expect):
|
||||
raise ValueError("expected str param is not valid")
|
||||
return data
|
||||
raise NotImplementedError("unexpected param type")
|
||||
+
|
||||
+
|
||||
+def cpu_alarm_recv(server_socket: socket.socket):
|
||||
+ try:
|
||||
+ client_socket, _ = server_socket.accept()
|
||||
+ logging.debug("cpu alarm fd listen ok")
|
||||
+
|
||||
+ data = client_socket.recv(PARAM_REP_LEN)
|
||||
+ check_fixed_param(data, "REP")
|
||||
+
|
||||
+ data = client_socket.recv(PARAM_TYPE_LEN)
|
||||
+ _type = check_fixed_param(data, Type)
|
||||
+
|
||||
+ data = client_socket.recv(PARAM_MODULE_LEN)
|
||||
+ module = check_fixed_param(data, Module)
|
||||
+
|
||||
+ data = client_socket.recv(PARAM_TRANS_TO_LEN)
|
||||
+ trans_to = check_fixed_param(data, TransTo)
|
||||
+
|
||||
+ data = client_socket.recv(PARAM_DATA_LEN)
|
||||
+ data_len = check_fixed_param(data, (MIN_DATA_LEN, MAX_DATA_LEN))
|
||||
+
|
||||
+ data = client_socket.recv(data_len)
|
||||
+
|
||||
+ command, event_type, socket_id, core_id = parser_cpu_alarm_info(data)
|
||||
+ except socket.error:
|
||||
+ logging.error("socket error")
|
||||
+ return
|
||||
+ except (ValueError, OSError, UnicodeError, TypeError, NotImplementedError):
|
||||
+ logging.error("server recv cpu alarm msg failed!")
|
||||
+ client_socket.close()
|
||||
+ return
|
||||
+
|
||||
+ upload_bmc(_type, module, command, event_type, socket_id, core_id)
|
||||
+
|
||||
diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py
|
||||
index 3d5cb8d..f93956e 100644
|
||||
--- a/src/python/syssentry/syssentry.py
|
||||
+++ b/src/python/syssentry/syssentry.py
|
||||
@@ -36,8 +36,15 @@ from .heartbeat import (heartbeat_timeout_chk, heartbeat_fd_create,
|
||||
from .result import RESULT_MSG_HEAD_LEN, RESULT_MSG_MAGIC_LEN, RESULT_MAGIC
|
||||
from .result import RESULT_LEVEL_ERR_MSG_DICT, ResultLevel
|
||||
from .utils import get_current_time_string
|
||||
-from .cpu_alarm import (upload_bmc, check_fixed_param, parser_cpu_alarm_info,
|
||||
- Type, Module, TransTo, MIN_DATA_LEN, MAX_DATA_LEN)
|
||||
+
|
||||
+
|
||||
+CPU_EXIST = True
|
||||
+try:
|
||||
+ from .cpu_alarm import cpu_alarm_recv
|
||||
+except ImportError:
|
||||
+ CPU_EXIST = False
|
||||
+ logging.debug("Cannot find cpu sentry mod")
|
||||
+
|
||||
|
||||
INSPECTOR = None
|
||||
|
||||
@@ -76,45 +83,6 @@ PID_FILE_FLOCK = None
|
||||
RESULT_SOCKET_PATH = "/var/run/sysSentry/result.sock"
|
||||
|
||||
CPU_ALARM_SOCKET_PATH = "/var/run/sysSentry/report.sock"
|
||||
-PARAM_REP_LEN = 3
|
||||
-PARAM_TYPE_LEN = 1
|
||||
-PARAM_MODULE_LEN = 1
|
||||
-PARAM_TRANS_TO_LEN = 2
|
||||
-PARAM_DATA_LEN = 3
|
||||
-
|
||||
-
|
||||
-def cpu_alarm_recv(server_socket: socket.socket):
|
||||
- try:
|
||||
- client_socket, _ = server_socket.accept()
|
||||
- logging.debug("cpu alarm fd listen ok")
|
||||
-
|
||||
- data = client_socket.recv(PARAM_REP_LEN)
|
||||
- check_fixed_param(data, "REP")
|
||||
-
|
||||
- data = client_socket.recv(PARAM_TYPE_LEN)
|
||||
- _type = check_fixed_param(data, Type)
|
||||
-
|
||||
- data = client_socket.recv(PARAM_MODULE_LEN)
|
||||
- module = check_fixed_param(data, Module)
|
||||
-
|
||||
- data = client_socket.recv(PARAM_TRANS_TO_LEN)
|
||||
- trans_to = check_fixed_param(data, TransTo)
|
||||
-
|
||||
- data = client_socket.recv(PARAM_DATA_LEN)
|
||||
- data_len = check_fixed_param(data, (MIN_DATA_LEN, MAX_DATA_LEN))
|
||||
-
|
||||
- data = client_socket.recv(data_len)
|
||||
-
|
||||
- command, event_type, socket_id, core_id = parser_cpu_alarm_info(data)
|
||||
- except socket.error:
|
||||
- logging.error("socket error")
|
||||
- return
|
||||
- except (ValueError, OSError, UnicodeError, TypeError, NotImplementedError):
|
||||
- logging.error("server recv cpu alarm msg failed!")
|
||||
- client_socket.close()
|
||||
- return
|
||||
-
|
||||
- upload_bmc(_type, module, command, event_type, socket_id, core_id)
|
||||
|
||||
|
||||
def msg_data_process(msg_data):
|
||||
@@ -480,7 +448,7 @@ def main_loop():
|
||||
server_result_recv(server_result_fd)
|
||||
elif event_fd == heartbeat_fd.fileno():
|
||||
heartbeat_recv(heartbeat_fd)
|
||||
- elif event_fd == cpu_alarm_fd.fileno():
|
||||
+ elif CPU_EXIST and event_fd == cpu_alarm_fd.fileno():
|
||||
cpu_alarm_recv(cpu_alarm_fd)
|
||||
else:
|
||||
continue
|
||||
--
|
||||
2.33.0
|
||||
|
||||
|
||||
Binary file not shown.
BIN
sysSentry-1.0.3.tar.gz
Normal file
BIN
sysSentry-1.0.3.tar.gz
Normal file
Binary file not shown.
659
sysSentry.spec
659
sysSentry.spec
@ -3,101 +3,24 @@
|
||||
|
||||
Summary: System Inspection Framework
|
||||
Name: sysSentry
|
||||
Version: 1.0.2
|
||||
Release: 67
|
||||
Version: 1.0.3
|
||||
Release: 1
|
||||
License: Mulan PSL v2
|
||||
Group: System Environment/Daemons
|
||||
Source0: https://gitee.com/openeuler/sysSentry/releases/download/v%{version}/%{name}-%{version}.tar.gz
|
||||
BuildRoot: %{_builddir}/%{name}-root
|
||||
|
||||
Patch1: fix-version-in-setup.py.patch
|
||||
Patch2: Fix-the-problem-that-function-cpu_report_result-is-c.patch
|
||||
Patch3: fix-error-handling.patch
|
||||
Patch4: fix-result-when-process-output-is-None.patch
|
||||
Patch5: cpu_utility-and-cpu_patrol-must-be-an-integer.patch
|
||||
Patch6: setting-parameters-must-be-integer.patch
|
||||
Patch7: param-must-be-integer.patch
|
||||
Patch8: add-deleted-code-to-plugin-rasdaemon.patch
|
||||
Patch9: Remove-ANSI-escape-sequences.patch
|
||||
Patch10: split-cpu_sentry-and-syssentry.patch
|
||||
Patch11: fix-configparser.InterpolationSyntaxError.patch
|
||||
Patch12: fix-syssentry-fails-to-be-started-when-cpu_sentry-is.patch
|
||||
Patch13: add-collect-module-to-sysSentry.patch
|
||||
Patch14: feature-add-avg_block_io-plugin.patch
|
||||
Patch15: fix-some-about-collect-module-and-avg-block-io.patch
|
||||
Patch16: add-ai-threshold-slow-io-detection-plugin.patch
|
||||
Patch17: optimize-the-handing-of-cat-cli-error-msg-in-cpu_sentry.patch
|
||||
Patch18: over-threshold-should-be-warn-level-log-in-cat-cli.patch
|
||||
Patch19: fix-bug-step-2-about-collect-module-and-avg-block-io.patch
|
||||
Patch20: add-log-level-and-change-log-format.patch
|
||||
Patch21: fix-ai_block_io-some-issues.patch
|
||||
Patch22: add-pyxalarm-and-pySentryNotify-add-multi-users-supp.patch
|
||||
Patch23: add-sentryctl-get_alarm-module_name-s-time_range-d.patch
|
||||
Patch24: fix-python-3.7-not-support-list-bool-type.patch
|
||||
Patch25: avg_block_io-send-alarm-to-xalarmd.patch
|
||||
Patch26: bugfix-typo.patch
|
||||
Patch27: fix-config-relative-some-issues.patch
|
||||
Patch28: update-log-when-it-is-not-lock-collect.patch
|
||||
Patch29: change-alarm-length.patch
|
||||
Patch30: add-detail-time.patch
|
||||
Patch31: xalarm-add-alarm-msg-length-to-8192.patch
|
||||
Patch32: ai_block_io-adapt-alarm-module.patch
|
||||
Patch33: add-log-for-improving-maintainability.patch
|
||||
Patch34: add-get_disk_type-and-fix-some-bugs.patch
|
||||
Patch35: diff-disk-type-use-diff-config.patch
|
||||
Patch36: add-parameter-time_range-alarm_id-and-alarm_clear_ti.patch
|
||||
Patch37: fix-xalarm_Report-function-not-refuse-alarm-msg-exce.patch
|
||||
Patch38: fix-xalarm_upgrade-not-return-val-and-fail-when-thre.patch
|
||||
Patch39: add-log-for-xalarm-when-sending-msg-and-clean-invali.patch
|
||||
Patch40: add-xalarm-cleanup-invalid-server-socket-peroidly.patch
|
||||
Patch41: ai_block_io-support-stage-and-iotype.patch
|
||||
Patch42: fix-io_dump-for-collect-module.patch
|
||||
Patch43: add-root-cause-analysis.patch
|
||||
Patch44: update-collect-log.patch
|
||||
Patch45: modify-abnormal-stack-when-the-disk-field-is-not-con.patch
|
||||
Patch46: ai_block_io-fix-some-bugs.patch
|
||||
Patch47: refactor-config.py-and-bugfix-uncorrect-slow-io-repo.patch
|
||||
Patch48: get_io_data-failed-wont-stop-avg_block_io-and-del-di.patch
|
||||
Patch49: fix-ai_block_io-root-cause-bug.patch
|
||||
Patch50: listen-thread-of-collect-module-exits-occasionally.patch
|
||||
Patch51: precise-alarm-query-time.patch
|
||||
Patch52: fix-word-error.patch
|
||||
Patch53: optimize-log-printing.patch
|
||||
Patch54: enrich-alert-info-about-kernel-stack.patch
|
||||
Patch55: ai_block_io-lack-section-exit.patch
|
||||
Patch56: fix-xalarm-non-uniform-log-formatting.patch
|
||||
Patch57: update-collect-plugin-period-max.patch
|
||||
Patch58: fix-frequency-param-check-bug.patch
|
||||
Patch59: ai_block_io-support-iodump.patch
|
||||
Patch60: fix-get_alarm-error.patch
|
||||
Patch61: fix-alarm_info-newline-break-error.patch
|
||||
Patch62: add-hbm-online-repair.patch
|
||||
Patch63: fix-hbm-online-repair-notice-and-efi-create.patch
|
||||
Patch64: get_alarm-d-abnomal-display.patch
|
||||
Patch65: modify-logrotate-rule.patch
|
||||
Patch66: fix-excessive-CPU-usage.patch
|
||||
Patch67: fix-uint8-bug-and-change-isolation-default-value.patch
|
||||
Patch68: fix-write-file-return-code-bug.patch
|
||||
Patch69: change-avg_block_io-config.patch
|
||||
Patch70: ai_block_io-support-absolute-threshold-lower-limit.patch
|
||||
Patch71: ai_block_io-fix-some-config-parameters-parse-bug.patch
|
||||
Patch72: update-nvme-config.patch
|
||||
Patch73: make-debug-msg-clear.patch
|
||||
Patch74: add-boundary-check-for-settings.patch
|
||||
Patch75: change-status-of-period-task-and-sort-mod-file.patch
|
||||
Patch76: uniform-avg_block_io-log-and-ai_block_io-log.patch
|
||||
Patch77: set-logrotate.patch
|
||||
Patch78: hbm_online_repair-add-unload-driver.patch
|
||||
Patch79: fix-test_ai_block_io-fail.patch
|
||||
|
||||
BuildRequires: cmake gcc-c++
|
||||
BuildRequires: python3 python3-setuptools
|
||||
BuildRequires: json-c-devel
|
||||
BuildRequires: chrpath
|
||||
# for test
|
||||
BuildRequires: elfutils-devel clang libbpf-devel bpftool
|
||||
BuildRequires: python3-numpy python3-pytest
|
||||
|
||||
Requires: pyxalarm = %{version}
|
||||
Requires: libbpf
|
||||
|
||||
%define PYTHON_VERSION %{python3_version}
|
||||
%define PKGVER syssentry-%{version}-py%{PYTHON_VERSION}.egg-info
|
||||
|
||||
%description
|
||||
sysSentry provides framework tools for system inspection.
|
||||
@ -119,15 +42,6 @@ Provides: libxalarm-devel = %{version}
|
||||
%description -n libxalarm-devel
|
||||
This package provides developer tools for the libxalarm.
|
||||
|
||||
%package -n cpu_sentry
|
||||
Summary: CPU fault inspection program
|
||||
Requires: procps-ng
|
||||
Recommends: sysSentry = %{version}-%{release}
|
||||
Recommends: ipmitool
|
||||
|
||||
%description -n cpu_sentry
|
||||
This package provides CPU fault detection
|
||||
|
||||
%package -n avg_block_io
|
||||
Summary: Supports slow I/O detection
|
||||
Requires: sysSentry = %{version}-%{release}
|
||||
@ -182,92 +96,10 @@ This package provides hbm_online_repair for the sysSentry.
|
||||
%autosetup -n %{name}-%{version} -p1
|
||||
|
||||
%build
|
||||
# xalarm
|
||||
sh build/build.sh -b %{buildroot}%{_libdir}
|
||||
|
||||
# sysSentry
|
||||
pushd src/python
|
||||
python3 setup.py build
|
||||
popd
|
||||
|
||||
pushd src/c/catcli/catlib
|
||||
cmake -B ./build/ -S . -D CMAKE_INSTALL_PREFIX=/usr/local -D CMAKE_BUILD_TYPE=Release
|
||||
pushd build
|
||||
make
|
||||
popd
|
||||
popd
|
||||
|
||||
# hbm_online_repair
|
||||
pushd src/c/hbm_online_repair
|
||||
make
|
||||
popd
|
||||
%make_build
|
||||
|
||||
%install
|
||||
# sysSentry
|
||||
mkdir -p %{buildroot}%{_bindir}
|
||||
mkdir -p %{buildroot}%{_unitdir}
|
||||
mkdir -p %{buildroot}%{_var}/log/sysSentry
|
||||
install src/python/syssentry/sentryctl %{buildroot}%{_bindir}
|
||||
install -d -m 700 %{buildroot}/etc/sysSentry/
|
||||
install -d -m 700 %{buildroot}/etc/sysSentry/tasks/
|
||||
install -d -m 700 %{buildroot}/etc/sysSentry/plugins/
|
||||
install -m 600 config/inspect.conf %{buildroot}%{_sysconfdir}/sysSentry
|
||||
install -m 600 service/sysSentry.service %{buildroot}%{_unitdir}
|
||||
|
||||
# rasdaemon
|
||||
install config/tasks/rasdaemon.mod %{buildroot}/etc/sysSentry/tasks/
|
||||
|
||||
# xalarm
|
||||
sh build/build.sh -i %{buildroot}%{_libdir}
|
||||
install -m 600 config/xalarm.conf %{buildroot}%{_sysconfdir}/sysSentry
|
||||
install -d %{buildroot}%{_libdir}
|
||||
install -d %{buildroot}%{_includedir}/xalarm
|
||||
install -m 600 service/xalarmd.service %{buildroot}%{_unitdir}
|
||||
install -m 644 src/libso/xalarm/register_xalarm.h %{buildroot}%{_includedir}/xalarm/register_xalarm.h
|
||||
|
||||
# sentryCollector
|
||||
install -m 600 config/collector.conf %{buildroot}%{_sysconfdir}/sysSentry
|
||||
install -m 600 service/sentryCollector.service %{buildroot}%{_unitdir}
|
||||
|
||||
# cpu sentry
|
||||
install config/tasks/cpu_sentry.mod %{buildroot}/etc/sysSentry/tasks/
|
||||
install config/plugins/cpu_sentry.ini %{buildroot}/etc/sysSentry/plugins/cpu_sentry.ini
|
||||
install src/c/catcli/catlib/build/cat-cli %{buildroot}%{_bindir}/cat-cli
|
||||
install src/c/catcli/catlib/build/plugin/cpu_patrol/libcpu_patrol.so %{buildroot}%{_libdir}
|
||||
|
||||
chrpath -d %{buildroot}%{_bindir}/cat-cli
|
||||
chrpath -d %{buildroot}%{_libdir}/libcpu_patrol.so
|
||||
|
||||
# avg_block_io
|
||||
install config/tasks/avg_block_io.mod %{buildroot}/etc/sysSentry/tasks/
|
||||
install config/plugins/avg_block_io.ini %{buildroot}/etc/sysSentry/plugins/avg_block_io.ini
|
||||
|
||||
# ai_block_io
|
||||
install config/tasks/ai_block_io.mod %{buildroot}/etc/sysSentry/tasks/
|
||||
install config/plugins/ai_block_io.ini %{buildroot}/etc/sysSentry/plugins/ai_block_io.ini
|
||||
|
||||
# hbm_online_repair
|
||||
mkdir -p %{buildroot}/etc/sysconfig/
|
||||
install config/tasks/hbm_online_repair.mod %{buildroot}/etc/sysSentry/tasks/
|
||||
install src/c/hbm_online_repair/hbm_online_repair %{buildroot}%{_bindir}
|
||||
install src/c/hbm_online_repair/hbm_online_repair.env %{buildroot}/etc/sysconfig/hbm_online_repair.env
|
||||
|
||||
# logrotate
|
||||
mkdir -p %{buildroot}%{_localstatedir}/lib/logrotate-syssentry
|
||||
mkdir -p %{buildroot}%{_sysconfdir}/cron.hourly
|
||||
install -m 0600 config/logrotate-sysSentry.conf %{buildroot}%{_sysconfdir}/logrotate-sysSentry.conf
|
||||
install -m 0500 src/sh/logrotate-sysSentry.cron %{buildroot}%{_sysconfdir}/cron.hourly/logrotate-sysSentry
|
||||
|
||||
pushd src/python
|
||||
python3 setup.py install -O1 --root=$RPM_BUILD_ROOT --record=SENTRY_FILES
|
||||
cat SENTRY_FILES | grep -v register_xalarm.* | grep -v sentry_notify.* > SENTRY_FILES.tmp
|
||||
mv SENTRY_FILES.tmp SENTRY_FILES
|
||||
popd
|
||||
|
||||
%check
|
||||
PYTHONPATH=%{buildroot}%{python3_sitelib} %{__python3} -m pytest selftest/test/
|
||||
|
||||
%pre
|
||||
%make_install
|
||||
|
||||
%post
|
||||
/sbin/ldconfig
|
||||
@ -287,28 +119,36 @@ rm -rf /var/run/sysSentry | :
|
||||
%postun
|
||||
/sbin/ldconfig
|
||||
|
||||
%clean
|
||||
rm -rf %{buildroot}
|
||||
|
||||
%files -f src/python/SENTRY_FILES
|
||||
%files
|
||||
%defattr(0550,root,root)
|
||||
%dir %attr(0550,root,root) %{python3_sitelib}/xalarm
|
||||
%attr(0550,root,root) %{python3_sitelib}/xalarm
|
||||
%attr(0550,root,root) %{python3_sitelib}/syssentry
|
||||
%attr(0550,root,root) %{python3_sitelib}/%{PKGVER}
|
||||
%attr(0550,root,root) %{python3_sitelib}/sentryCollector
|
||||
%attr(0550,root,root) %{python3_sitelib}/sentryPlugins/avg_block_io
|
||||
%attr(0550,root,root) %{python3_sitelib}/sentryPlugins/ai_block_io
|
||||
|
||||
# sysSentry
|
||||
%attr(0500,root,root) %{_bindir}/sentryctl
|
||||
%attr(0550,root,root) %{_bindir}/syssentry
|
||||
%attr(0550,root,root) %{_bindir}/ebpf_collector
|
||||
%attr(0750,root,root) %config(noreplace) %{_var}/log/sysSentry
|
||||
%attr(0750,root,root) %config(noreplace) %{_sysconfdir}/sysSentry
|
||||
%attr(0750,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks
|
||||
%attr(0750,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/plugins
|
||||
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/inspect.conf
|
||||
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks/rasdaemon.mod
|
||||
%attr(0600,root,root) %{_unitdir}/sysSentry.service
|
||||
|
||||
%exclude %{python3_sitelib}/sentryCollector/collect_plugin.py
|
||||
%exclude %{python3_sitelib}/xalarm/register_xalarm.py
|
||||
%exclude %{python3_sitelib}/xalarm/sentry_notify.py
|
||||
|
||||
%exclude %{python3_sitelib}/syssentry/__pycache__
|
||||
%exclude %{python3_sitelib}/sentryCollector/__pycache__
|
||||
%exclude %{python3_sitelib}/xalarm/__pycache__
|
||||
|
||||
%exclude %{_sysconfdir}/sysSentry/tasks/ai_block_io.mod
|
||||
%exclude %{_sysconfdir}/sysSentry/plugins/ai_block_io.ini
|
||||
%exclude %{_sysconfdir}/sysSentry/tasks/avg_block_io.mod
|
||||
%exclude %{_sysconfdir}/sysSentry/plugins/avg_block_io.ini
|
||||
|
||||
# xalarm
|
||||
%attr(0550,root,root) %{_bindir}/xalarmd
|
||||
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/xalarm.conf
|
||||
@ -316,39 +156,14 @@ rm -rf %{buildroot}
|
||||
|
||||
# logrotate
|
||||
%dir %{_localstatedir}/lib/logrotate-syssentry
|
||||
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/logrotate-sysSentry.conf
|
||||
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/logrotate-sysSentry.conf
|
||||
%attr(0500,root,root) %{_sysconfdir}/cron.hourly/logrotate-sysSentry
|
||||
|
||||
# cpu inspection module
|
||||
%exclude %{_sysconfdir}/sysSentry/tasks/cpu_sentry.mod
|
||||
%exclude %{_sysconfdir}/sysSentry/plugins/cpu_sentry.ini
|
||||
%exclude %{_bindir}/cpu_sentry
|
||||
%exclude %{_bindir}/cat-cli
|
||||
%exclude %{python3_sitelib}/syssentry/cpu_*
|
||||
%exclude %{python3_sitelib}/syssentry/*/cpu_*
|
||||
|
||||
# avg block io
|
||||
%exclude %{_sysconfdir}/sysSentry/tasks/avg_block_io.mod
|
||||
%exclude %{_sysconfdir}/sysSentry/plugins/avg_block_io.ini
|
||||
%exclude %{_bindir}/avg_block_io
|
||||
%exclude %{python3_sitelib}/sentryPlugins/*
|
||||
|
||||
# ai_block_io
|
||||
%exclude %{_sysconfdir}/sysSentry/tasks/ai_block_io.mod
|
||||
%exclude %{_sysconfdir}/sysSentry/plugins/ai_block_io.ini
|
||||
%exclude %{_bindir}/ai_block_io
|
||||
%exclude %{python3_sitelib}/sentryPlugins/*
|
||||
|
||||
# sentryCollector
|
||||
%attr(0550,root,root) %{_bindir}/sentryCollector
|
||||
%attr(0600,root,root) %{_sysconfdir}/sysSentry/collector.conf
|
||||
%attr(0600,root,root) %{_unitdir}/sentryCollector.service
|
||||
|
||||
# pysentry_collect
|
||||
%exclude %{python3_sitelib}/sentryCollector/collect_plugin.py
|
||||
%exclude %{python3_sitelib}/sentryCollector/__pycache__/collect_plugin*
|
||||
|
||||
# hbm repair module
|
||||
%exclude %{_sysconfdir}/sysSentry/tasks/hbm_online_repair.mod
|
||||
%exclude %{python3_sitelib}/syssentry/bmc_*
|
||||
%exclude %{python3_sitelib}/syssentry/*/bmc_*
|
||||
@ -357,41 +172,30 @@ rm -rf %{buildroot}
|
||||
%attr(0550,root,root) %{_libdir}/libxalarm.so
|
||||
|
||||
%files -n libxalarm-devel
|
||||
%dir %{_includedir}/xalarm
|
||||
%attr(0550,root,root) %{_includedir}/xalarm
|
||||
%attr(0550,root,root) %{_includedir}/xalarm/register_xalarm.h
|
||||
|
||||
%files -n pyxalarm
|
||||
%attr(0550,root,root) %{python3_sitelib}/xalarm/register_xalarm.py
|
||||
%attr(0550,root,root) %{python3_sitelib}/xalarm/__pycache__/register_xalarm*
|
||||
|
||||
%files -n pysentry_notify
|
||||
%attr(0550,root,root) %{python3_sitelib}/xalarm/sentry_notify.py
|
||||
%attr(0550,root,root) %{python3_sitelib}/xalarm/__pycache__/sentry_notify*
|
||||
|
||||
%files -n cpu_sentry
|
||||
%attr(0500,root,root) %{_bindir}/cat-cli
|
||||
%attr(0500,root,root) %{_bindir}/cpu_sentry
|
||||
%attr(0550,root,root) %{_libdir}/libcpu_patrol.so
|
||||
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks/cpu_sentry.mod
|
||||
%attr(0600,root,root) %{_sysconfdir}/sysSentry/plugins/cpu_sentry.ini
|
||||
%attr(0550,root,root) %{python3_sitelib}/syssentry/cpu_*
|
||||
|
||||
%files -n avg_block_io
|
||||
%attr(0500,root,root) %{_bindir}/avg_block_io
|
||||
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks/avg_block_io.mod
|
||||
%attr(0600,root,root) %{_sysconfdir}/sysSentry/plugins/avg_block_io.ini
|
||||
%attr(0550,root,root) %{python3_sitelib}/sentryPlugins/avg_block_io
|
||||
%exclude %{python3_sitelib}/sentryPlugins/avg_block_io/__pycache__
|
||||
|
||||
%files -n ai_block_io
|
||||
%attr(0500,root,root) %{_bindir}/ai_block_io
|
||||
%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks/ai_block_io.mod
|
||||
%attr(0600,root,root) %{_sysconfdir}/sysSentry/plugins/ai_block_io.ini
|
||||
%attr(0550,root,root) %{python3_sitelib}/sentryPlugins/ai_block_io
|
||||
%exclude %{python3_sitelib}/sentryPlugins/ai_block_io/__pycache__
|
||||
|
||||
%files -n pysentry_collect
|
||||
%attr(0550,root,root) %{python3_sitelib}/sentryCollector/collect_plugin.py
|
||||
%attr(0550,root,root) %{python3_sitelib}/sentryCollector/__pycache__/collect_plugin*
|
||||
|
||||
%files -n hbm_online_repair
|
||||
%attr(0550,root,root) %{_bindir}/hbm_online_repair
|
||||
@ -400,409 +204,8 @@ rm -rf %{buildroot}
|
||||
%attr(0550,root,root) %{python3_sitelib}/syssentry/bmc_alarm.py
|
||||
|
||||
%changelog
|
||||
* Sat Dec 28 2024 shixuantong <shixuantong@huawei.com> - 1.0.2-67
|
||||
* Mon Jan 20 2025 zhuofeng <zhuofeng2@huawei.com> - 1.0.3-1
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:fix test_ai_block_io fail
|
||||
|
||||
* Wed Dec 18 2024 luckky <guodashun1@huawei.com> - 1.0.2-66
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC: add boundary check for settings
|
||||
|
||||
* Wed Dec 18 2024 shixuantong <shixuantong@huawei.com> - 1.0.2-65
|
||||
- Type:enhancement
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:set logrotate
|
||||
|
||||
* Wed Dec 18 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-64
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:uniform plugins log
|
||||
|
||||
* Fri Dec 13 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-63
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC: change status of period task and sort mod file
|
||||
|
||||
* Wed Nov 6 2024 luckky <guodashun1@huawei.com> - 1.0.2-62
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC: add boundary check for settings
|
||||
|
||||
* Tue Nov 5 2024 luckky <guodashun1@huawei.com> - 1.0.2-61
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:make debug msg clear
|
||||
|
||||
* Tue Nov 5 2024 zhangnan <zhangnan134@huawei.com> - 1.0.2-60
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:update nvme config
|
||||
|
||||
* Tue Nov 5 2024 gaoruoshu <gaoruoshu@huawei.com> - 1.0.2-59
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:change avg_block_io config
|
||||
|
||||
* Mon Nov 4 2024 luckky <guodashun1@huawei.com> - 1.0.2-58
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:fix write file return code bug
|
||||
|
||||
* Fri Nov 1 2024 luckky <guodashun1@huawei.com> - 1.0.2-57
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:fix uint8 bug and change page isolation threshold default value
|
||||
|
||||
* Fri Nov 1 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-56
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DES:excessive CPU usage
|
||||
|
||||
* Thu Oct 31 2024 zhangnan <zhangnan134@huawei.com> - 1.0.2-55
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DES:modify logrotate rule
|
||||
|
||||
* Wed Oct 30 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-54
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DES:get_alarm -d abnormal display
|
||||
|
||||
* Wed Oct 30 2024 luckky <guodashun1@huawei.com> - 1.0.2-53
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:fix hbm online repair notice and efi create
|
||||
|
||||
* Sat Oct 26 2024 luckky <guodashun1@huawei.com> - 1.0.2-52
|
||||
- Type:requirement
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:add hbm_online_repair
|
||||
|
||||
* Sat Oct 26 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-51
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DES:fix newline break error
|
||||
|
||||
* Sat Oct 26 2024 zhangnan <zhangnan134@huawei.com> - 1.0.2-50
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DES:remove extra dependency
|
||||
|
||||
* Wed Oct 23 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-49
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DES:fix get_alarm error
|
||||
|
||||
* Tue Oct 22 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-48
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DES:ai_block_io support iodump
|
||||
|
||||
* Tue Oct 22 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-47
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DES:fix frequency param check bug
|
||||
|
||||
* Mon Oct 21 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-46
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DES:update collect plugin period max
|
||||
|
||||
* Mon Oct 21 2024 caixiaomeng <caixiaomeng2@huawei.com> - 1.0.2-45
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:ai_block_io lack section exit
|
||||
|
||||
* Mon Oct 21 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-44
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:ai_block_io lack section exit
|
||||
|
||||
* Wed Oct 16 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-43
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:enrich alert info about kernel stack
|
||||
|
||||
* Wed Oct 16 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-42
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:optimize log printing
|
||||
|
||||
* Wed Oct 16 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-41
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:listen thread of collect module exits occasionally
|
||||
|
||||
* Wed Oct 16 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-40
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:fix ai_block_io root cause bug
|
||||
|
||||
* Tue Oct 15 2024 gaoruoshu <gaoruoshu@huawei.com> - 1.0.2-39
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:refactor config.py and bugfix uncorrect slow io report
|
||||
|
||||
* Mon Oct 14 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-38
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:ai_block_io fix some bugs
|
||||
|
||||
* Sat Oct 12 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-37
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:add pysentry_collect package and update collect log
|
||||
modify abnormal stack when the disk field is not configured
|
||||
|
||||
* Sat Oct 12 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-36
|
||||
- Type:requirement
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:add root cause analysis
|
||||
|
||||
* Sat Oct 12 2024 zhuofeng <zhangnan134@huawei.com> - 1.0.2-35
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:fix io_dump for collect module
|
||||
|
||||
* Fri Oct 11 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-34
|
||||
- Type:requirement
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:ai_block_io support stage and iotype
|
||||
|
||||
* Fri Oct 11 2024 caixiaomeng <caixiaomeng2@huawei.com> - 1.0.2-33
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:fix xalarm upgrade not return val, not refuse to send msg when length exceeds 8192,cleanup invalid socket peroidlly
|
||||
|
||||
* Fri Oct 11 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-32
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:add parameter validation
|
||||
|
||||
* Fri Oct 11 2024 gaoruoshu <gaoruoshu@huawei.com> - 1.0.2-31
|
||||
- Type:requirement
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:avg_block_io adapt different type of disk, use different config
|
||||
|
||||
* Thu Oct 10 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-30
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:add get_disk_type and fix some bugs
|
||||
add log for improving maintainability
|
||||
|
||||
* Thu Oct 10 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-29
|
||||
- Type:requirement
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:ai_block_io adapt alarm module
|
||||
|
||||
* Thu Oct 10 2024 caixiaomeng <caixiaomeng2@huawei.com> - 1.0.2-28
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:xalarm add alarm msg length to 8192
|
||||
|
||||
* Thu Oct 10 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-27
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:add dependency for sysSentry and avg_block_io
|
||||
|
||||
* Thu Oct 10 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-26
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:fix get_alarm length and timestamp
|
||||
|
||||
* Wed Oct 9 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-25
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:update log when it is not lock collect
|
||||
|
||||
* Wed Oct 9 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-24
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:fix ai_block_io config relative some issues
|
||||
|
||||
* Wed Oct 9 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-23
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:avg_block_io send alarm to xalarmd
|
||||
|
||||
* Wed Oct 9 2024 caixiaomeng <caixiaomeng2@huawei.com> - 1.0.2-22
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:fix python 3.7 not support list bool type
|
||||
|
||||
* Tue Oct 8 2024 jinsaihang <jinsaihang@h-partners.com> - 1.0.2-21
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:add alarm event query function
|
||||
|
||||
* Tue Oct 8 2024 caixiaomeng <caixiaomeng2@huawei.com> - 1.0.2-20
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:add pyxalarm and pySentryNotify, add multi users support for xalarmd
|
||||
|
||||
* Mon Sep 30 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-19
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:fix ai_block_io some issues
|
||||
|
||||
* Fri Sep 27 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-18
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:add log level and change log format
|
||||
|
||||
* Wed Sep 25 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-17
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:fix bug step 2 about collect module and avg block io
|
||||
|
||||
* Mon Sep 23 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-16
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:optimize the handing of cat-cli error msg in cpu_sentry
|
||||
over threshold should be warn level log in cat-cli
|
||||
|
||||
* Mon Sep 23 2024 heyouzhi <heyouzhi@huawei.com> - 1.0.2-15
|
||||
- Type:requirement
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:add ai threshold slow io detection plugin
|
||||
|
||||
* Fri Sep 20 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-14
|
||||
- Type:requirement
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:fix some about collect module and avg block io
|
||||
|
||||
* Sat Sep 14 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-13
|
||||
- Type:requirement
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:add collect module and avg_block_io plugin to sysSentry
|
||||
|
||||
* Sat Sep 14 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-12
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:fix syssentry fails to be started when cpu_sentry is not installed
|
||||
|
||||
* Wed Sep 11 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-11
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:fix configparser.InterpolationSyntaxError
|
||||
|
||||
* Mon Sep 09 2024 caixiaomeng <caixiaomeng2@huawei.com> - 1.0.2-10
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:split cpu_sentry and syssentry
|
||||
|
||||
* Mon Sep 02 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-9
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:Remove ANSI escape sequences
|
||||
|
||||
* Sat Aug 31 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-8
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:add ipmitool to Recommends for cpu_sentry
|
||||
|
||||
* Sat Aug 31 2024 zhuofeng <zhuofeng2@huawei.com> - 1.0.2-7
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:add deleted code to plugin rasdaemon
|
||||
|
||||
* Fri Aug 30 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-6
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:setting parameters must be integer
|
||||
|
||||
* Wed Aug 28 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-5
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:cpu_utility and cpu_patrol must be an integer
|
||||
|
||||
* Fri Jul 26 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-4
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:fix result when process output is None
|
||||
|
||||
* Thu Jul 25 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-3
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:Fix the problem that function cpu_report_result() is called more than once
|
||||
fix error handling
|
||||
|
||||
* Tue Jun 18 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-2
|
||||
- Type:bugfix
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:delete rpath setting
|
||||
|
||||
* Tue Jun 11 2024 shixuantong <shixuantong1@huawei.com> - 1.0.2-1
|
||||
- Type:enhancement
|
||||
- CVE:NA
|
||||
- SUG:NA
|
||||
- DESC:Package init
|
||||
|
||||
- DESC:1.0.3 init
|
||||
|
||||
@ -1,63 +0,0 @@
|
||||
From c8f21d1621e96e2c8a239f8028cc9331aa0f8997 Mon Sep 17 00:00:00 2001
|
||||
From: jinsaihang <jinsaihang@h-partners.com>
|
||||
Date: Tue, 17 Dec 2024 11:36:11 +0800
|
||||
Subject: [PATCH] uniform avg_block_io log and ai_block_io log
|
||||
|
||||
Signed-off-by: jinsaihang <jinsaihang@h-partners.com>
|
||||
---
|
||||
src/python/sentryPlugins/ai_block_io/ai_block_io.py | 5 +++++
|
||||
src/python/sentryPlugins/ai_block_io/detector.py | 8 +++-----
|
||||
src/python/sentryPlugins/avg_block_io/stage_window.py | 2 +-
|
||||
3 files changed, 9 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
index 14f740d..8075f5f 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
||||
@@ -208,6 +208,11 @@ class SlowIODetection:
|
||||
tmp_alarm_content = alarm_content.copy()
|
||||
del tmp_alarm_content["details"]
|
||||
logging.warning("[SLOW IO] " + str(tmp_alarm_content))
|
||||
+ logging.warning(f'[SLOW IO] disk: {str(tmp_alarm_content.get("driver_name"))}, '
|
||||
+ f'stage: {str(tmp_alarm_content.get("driver_name"))}, '
|
||||
+ f'iotype: {str(tmp_alarm_content.get("io_type"))}, '
|
||||
+ f'type: {str(tmp_alarm_content.get("alarm_type"))}, '
|
||||
+ f'reason: {str(tmp_alarm_content.get("reason"))}')
|
||||
logging.warning(f"latency: " + str(alarm_content.get("details").get("latency")))
|
||||
logging.warning(f"iodump: " + str(alarm_content.get("details").get("iodump")))
|
||||
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
index 496e032..27fb7f7 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
|
||||
@@ -58,11 +58,9 @@ class Detector:
|
||||
logging.info(f'[abnormal_period]: disk: {self._metric_name.disk_name}, '
|
||||
f'stage: {self._metric_name.stage_name}, '
|
||||
f'iotype: {self._metric_name.io_access_type_name}, '
|
||||
- f'metric: {self._metric_name.metric_name}, '
|
||||
- f'current value: {metric_value}, '
|
||||
- f'ai threshold: {detection_result[2]}, '
|
||||
- f'absolute threshold upper limit: {detection_result[3]}, '
|
||||
- f'lower limit: {detection_result[4]}')
|
||||
+ f'type: {self._metric_name.metric_name}, '
|
||||
+ f'ai_threshold: {round(detection_result[2], 3)}, '
|
||||
+ f'curr_val: {metric_value}')
|
||||
else:
|
||||
logging.debug(f'Detection result: {str(detection_result)}')
|
||||
logging.debug(f'exit Detector: {self}')
|
||||
diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py
|
||||
index 5113782..587bd49 100644
|
||||
--- a/src/python/sentryPlugins/avg_block_io/stage_window.py
|
||||
+++ b/src/python/sentryPlugins/avg_block_io/stage_window.py
|
||||
@@ -28,7 +28,7 @@ class AbnormalWindowBase:
|
||||
self.abnormal_window.append(False)
|
||||
|
||||
def is_abnormal_window(self):
|
||||
- return sum(self.abnormal_window) > self.window_threshold
|
||||
+ return sum(self.abnormal_window) >= self.window_threshold
|
||||
|
||||
def window_data_to_string(self):
|
||||
return ",".join(str(x) for x in self.window_data)
|
||||
--
|
||||
2.27.0
|
||||
|
||||
@ -1,25 +0,0 @@
|
||||
From 73f5028fcab08613833c9f2b432f660c70ac264e Mon Sep 17 00:00:00 2001
|
||||
From: zhuofeng <zhuofeng2@huawei.com>
|
||||
Date: Sat, 12 Oct 2024 16:06:32 +0800
|
||||
Subject: [PATCH] update collect log
|
||||
|
||||
---
|
||||
src/python/sentryCollector/collect_io.py | 6 +++---
|
||||
1 file changed, 3 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py
|
||||
index 2b10cde..f699c3c 100644
|
||||
--- a/src/python/sentryCollector/collect_io.py
|
||||
+++ b/src/python/sentryCollector/collect_io.py
|
||||
@@ -156,7 +156,7 @@ class CollectIo():
|
||||
for line in file:
|
||||
count += line.count('.op=' + Io_Category[category].upper())
|
||||
if count > 0:
|
||||
- logging.info(f"io_dump info : {disk_name}, {stage}, {category}, {count}")
|
||||
+ logging.info(f"io_dump info : {disk_name}, {stage}, {Io_Category[category]}, {count}")
|
||||
except FileNotFoundError:
|
||||
logging.error("The file %s does not exist.", io_dump_file)
|
||||
return count
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -1,44 +0,0 @@
|
||||
From 4550d9cbbb7e921db168f748e8b1d5d7cc0f8b15 Mon Sep 17 00:00:00 2001
|
||||
From: zhuofeng <zhuofeng2@huawei.com>
|
||||
Date: Mon, 21 Oct 2024 17:30:39 +0800
|
||||
Subject: [PATCH] update collect plugin period max
|
||||
|
||||
---
|
||||
src/python/sentryCollector/collect_plugin.py | 7 +++++--
|
||||
1 file changed, 5 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py
|
||||
index 53dddec..9495d8b 100644
|
||||
--- a/src/python/sentryCollector/collect_plugin.py
|
||||
+++ b/src/python/sentryCollector/collect_plugin.py
|
||||
@@ -45,6 +45,9 @@ LIMIT_IOTYPE_LIST_LEN = 4
|
||||
LIMIT_PERIOD_MIN_LEN = 1
|
||||
LIMIT_PERIOD_MAX_LEN = 300
|
||||
|
||||
+# max_save
|
||||
+LIMIT_MAX_SAVE_LEN = 300
|
||||
+
|
||||
# interface protocol
|
||||
class ClientProtocol():
|
||||
IS_IOCOLLECT_VALID = 0
|
||||
@@ -189,7 +192,7 @@ def inter_is_iocollect_valid(period, disk_list=None, stage=None):
|
||||
if not period or not isinstance(period, int):
|
||||
result['ret'] = ResultMessage.RESULT_NOT_PARAM
|
||||
return result
|
||||
- if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN:
|
||||
+ if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN * LIMIT_MAX_SAVE_LEN:
|
||||
result['ret'] = ResultMessage.RESULT_INVALID_LENGTH
|
||||
return result
|
||||
|
||||
@@ -246,7 +249,7 @@ def inter_get_io_data(period, disk_list, stage, iotype):
|
||||
if not isinstance(period, int):
|
||||
result['ret'] = ResultMessage.RESULT_NOT_PARAM
|
||||
return result
|
||||
- if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN:
|
||||
+ if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN * LIMIT_MAX_SAVE_LEN:
|
||||
result['ret'] = ResultMessage.RESULT_INVALID_LENGTH
|
||||
return result
|
||||
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -1,35 +0,0 @@
|
||||
From ac73565fdb0e4bc544e5308ea0251dd6be410ed9 Mon Sep 17 00:00:00 2001
|
||||
From: zhuofeng <zhuofeng2@huawei.com>
|
||||
Date: Wed, 9 Oct 2024 16:37:24 +0800
|
||||
Subject: [PATCH] update log when it is not lock collect
|
||||
|
||||
---
|
||||
src/python/sentryCollector/collect_io.py | 8 ++++++--
|
||||
1 file changed, 6 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py
|
||||
index e45947a..2e75187 100644
|
||||
--- a/src/python/sentryCollector/collect_io.py
|
||||
+++ b/src/python/sentryCollector/collect_io.py
|
||||
@@ -179,13 +179,17 @@ class CollectIo():
|
||||
blk_io_hierarchy_path = os.path.join(disk_path, 'blk_io_hierarchy')
|
||||
|
||||
if not os.path.exists(blk_io_hierarchy_path):
|
||||
- logging.error("no blk_io_hierarchy directory found in %s, skipping.", disk_name)
|
||||
+ logging.warning("no blk_io_hierarchy directory found in %s, skipping.", disk_name)
|
||||
continue
|
||||
|
||||
for file_name in os.listdir(blk_io_hierarchy_path):
|
||||
file_path = os.path.join(blk_io_hierarchy_path, file_name)
|
||||
if file_name == 'stats':
|
||||
all_disk.append(disk_name)
|
||||
+
|
||||
+ if len(all_disk) == 0:
|
||||
+ logging.debug("no blk_io_hierarchy disk, it is not lock-free collection")
|
||||
+ return False
|
||||
|
||||
if self.loop_all:
|
||||
self.disk_list = all_disk
|
||||
--
|
||||
2.33.0
|
||||
|
||||
@ -1,51 +0,0 @@
|
||||
From f50b4e1b7f5fa38b1930349b1a9a905eb5307ab7 Mon Sep 17 00:00:00 2001
|
||||
From: znzjugod <zhangnan134@huawei.com>
|
||||
Date: Tue, 5 Nov 2024 11:47:56 +0800
|
||||
Subject: [PATCH] update nvme config
|
||||
|
||||
---
|
||||
config/plugins/ai_block_io.ini | 8 ++++----
|
||||
src/python/sentryPlugins/ai_block_io/config_parser.py | 8 ++++----
|
||||
2 files changed, 8 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini
|
||||
index d0b1e74..69f44ba 100644
|
||||
--- a/config/plugins/ai_block_io.ini
|
||||
+++ b/config/plugins/ai_block_io.ini
|
||||
@@ -23,10 +23,10 @@ read_tot_lim=50000
|
||||
write_tot_lim=50000
|
||||
|
||||
[latency_nvme_ssd]
|
||||
-read_avg_lim=300
|
||||
-write_avg_lim=300
|
||||
-read_tot_lim=500
|
||||
-write_tot_lim=500
|
||||
+read_avg_lim=10000
|
||||
+write_avg_lim=10000
|
||||
+read_tot_lim=50000
|
||||
+write_tot_lim=50000
|
||||
|
||||
[latency_sata_hdd]
|
||||
read_avg_lim=15000
|
||||
diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
index 3049db2..1bbb609 100644
|
||||
--- a/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py
|
||||
@@ -74,10 +74,10 @@ class ConfigParser:
|
||||
"write_tot_lim": 50000
|
||||
},
|
||||
"latency_nvme_ssd": {
|
||||
- "read_avg_lim": 300,
|
||||
- "write_avg_lim": 300,
|
||||
- "read_tot_lim": 500,
|
||||
- "write_tot_lim": 500
|
||||
+ "read_avg_lim": 10000,
|
||||
+ "write_avg_lim": 10000,
|
||||
+ "read_tot_lim": 50000,
|
||||
+ "write_tot_lim": 50000
|
||||
},
|
||||
"latency_sata_hdd": {
|
||||
"read_avg_lim": 15000,
|
||||
--
|
||||
2.45.2
|
||||
|
||||
@ -1,112 +0,0 @@
|
||||
From c95be14eee48e5afb255700c9d67c1d8ef2532dc Mon Sep 17 00:00:00 2001
|
||||
From: PshySimon <caixiaomeng2@huawei.com>
|
||||
Date: Thu, 10 Oct 2024 16:15:52 +0800
|
||||
Subject: [PATCH] xalarm add alarm msg length to 8192
|
||||
|
||||
---
|
||||
src/libso/xalarm/register_xalarm.c | 2 +-
|
||||
src/libso/xalarm/register_xalarm.h | 2 +-
|
||||
src/python/xalarm/register_xalarm.py | 2 +-
|
||||
src/python/xalarm/sentry_notify.py | 2 +-
|
||||
src/python/xalarm/xalarm_api.py | 8 ++++++--
|
||||
src/python/xalarm/xalarm_server.py | 2 +-
|
||||
6 files changed, 11 insertions(+), 7 deletions(-)
|
||||
|
||||
diff --git a/src/libso/xalarm/register_xalarm.c b/src/libso/xalarm/register_xalarm.c
|
||||
index 21a419f..5aff2bc 100644
|
||||
--- a/src/libso/xalarm/register_xalarm.c
|
||||
+++ b/src/libso/xalarm/register_xalarm.c
|
||||
@@ -35,7 +35,7 @@
|
||||
#define ALARM_SOCKET_PERMISSION 0700
|
||||
#define TIME_UNIT_MILLISECONDS 1000
|
||||
|
||||
-#define MAX_PARAS_LEN 1023
|
||||
+#define MAX_PARAS_LEN 8191
|
||||
#define MIN_ALARM_ID 1001
|
||||
#define MAX_ALARM_ID (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1)
|
||||
|
||||
diff --git a/src/libso/xalarm/register_xalarm.h b/src/libso/xalarm/register_xalarm.h
|
||||
index fef9482..dcf4f03 100644
|
||||
--- a/src/libso/xalarm/register_xalarm.h
|
||||
+++ b/src/libso/xalarm/register_xalarm.h
|
||||
@@ -11,7 +11,7 @@
|
||||
#include <sys/time.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
-#define ALARM_INFO_MAX_PARAS_LEN 1024
|
||||
+#define ALARM_INFO_MAX_PARAS_LEN 8192
|
||||
#define MAX_STRERROR_SIZE 1024
|
||||
#define MAX_ALARM_TYEPS 1024
|
||||
#define MIN_ALARM_ID 1001
|
||||
diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py
|
||||
index 6756b1b..edd9994 100644
|
||||
--- a/src/python/xalarm/register_xalarm.py
|
||||
+++ b/src/python/xalarm/register_xalarm.py
|
||||
@@ -11,7 +11,7 @@ from struct import error as StructParseError
|
||||
from .xalarm_api import Xalarm, alarm_bin2stu
|
||||
|
||||
|
||||
-ALARM_REPORT_LEN = 1048
|
||||
+ALARM_REPORT_LEN = 8216
|
||||
MAX_NUM_OF_ALARM_ID=128
|
||||
MIN_ALARM_ID = 1001
|
||||
MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1)
|
||||
diff --git a/src/python/xalarm/sentry_notify.py b/src/python/xalarm/sentry_notify.py
|
||||
index a19e5b3..c763a24 100644
|
||||
--- a/src/python/xalarm/sentry_notify.py
|
||||
+++ b/src/python/xalarm/sentry_notify.py
|
||||
@@ -17,7 +17,7 @@ CRITICAL_ALM = 3
|
||||
ALARM_TYPE_OCCUR = 1
|
||||
ALARM_TYPE_RECOVER = 2
|
||||
|
||||
-MAX_PUC_PARAS_LEN = 1024
|
||||
+MAX_PUC_PARAS_LEN = 8192
|
||||
|
||||
DIR_XALARM = "/var/run/xalarm"
|
||||
PATH_REPORT_ALARM = "/var/run/xalarm/report"
|
||||
diff --git a/src/python/xalarm/xalarm_api.py b/src/python/xalarm/xalarm_api.py
|
||||
index 99eabf5..863bd02 100644
|
||||
--- a/src/python/xalarm/xalarm_api.py
|
||||
+++ b/src/python/xalarm/xalarm_api.py
|
||||
@@ -23,7 +23,7 @@ ALARM_LEVELS = (1, 2, 3, 4, 5)
|
||||
ALARM_SOCK_PATH = "/var/run/xalarm/report"
|
||||
MIN_ALARM_ID = 1001
|
||||
MAX_ALARM_ID = 1128
|
||||
-MAX_MSG_LEN = 1024
|
||||
+MAX_MSG_LEN = 8192
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
@@ -120,6 +120,10 @@ def alarm_bin2stu(bin_data):
|
||||
|
||||
|
||||
def alarm_stu2bin(alarm_info: Xalarm):
|
||||
+ alarm_msg = alarm_info.msg1
|
||||
+ padding_length = MAX_MSG_LEN - len(alarm_msg)
|
||||
+ if padding_length > 0:
|
||||
+ alarm_msg = alarm_msg + ('\x00' * padding_length)
|
||||
return struct.pack(
|
||||
f'@HBBll{MAX_MSG_LEN}s',
|
||||
alarm_info.alarm_id,
|
||||
@@ -127,4 +131,4 @@ def alarm_stu2bin(alarm_info: Xalarm):
|
||||
alarm_info.alarm_type,
|
||||
alarm_info.timetamp.tv_sec,
|
||||
alarm_info.timetamp.tv_usec,
|
||||
- alarm_info.msg1.encode('utf-8'))
|
||||
+ alarm_msg.encode('utf-8'))
|
||||
diff --git a/src/python/xalarm/xalarm_server.py b/src/python/xalarm/xalarm_server.py
|
||||
index fcaf393..2882609 100644
|
||||
--- a/src/python/xalarm/xalarm_server.py
|
||||
+++ b/src/python/xalarm/xalarm_server.py
|
||||
@@ -28,7 +28,7 @@ from .xalarm_transfer import check_filter, transmit_alarm, wait_for_connection
|
||||
ALARM_DIR = "/var/run/xalarm"
|
||||
USER_RECV_SOCK = "/var/run/xalarm/alarm"
|
||||
SOCK_FILE = "/var/run/xalarm/report"
|
||||
-ALARM_REPORT_LEN = 1048
|
||||
+ALARM_REPORT_LEN = 8216
|
||||
ALARM_DIR_PERMISSION = 0o750
|
||||
ALARM_LISTEN_QUEUE_LEN = 5
|
||||
|
||||
--
|
||||
2.27.0
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user