sysSentry/ai_block_io-adapt-alarm-module.patch
贺有志 5eb6aaf745 ai_block_io adapt alarm module.patch
Signed-off-by: 贺有志 <1037617413@qq.com>

ai_block_io adapt alarm module

Signed-off-by: 贺有志 <1037617413@qq.com>
2024-10-10 21:19:16 +08:00

222 lines
9.7 KiB
Diff
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

From 367f8ab8a5ad26d80caf1bc4529c79d279ef0fb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
Date: Thu, 10 Oct 2024 17:21:48 +0800
Subject: [PATCH] ai_block_io adapt alarm module
---
config/tasks/ai_block_io.mod | 4 +-
.../sentryPlugins/ai_block_io/ai_block_io.py | 28 +++++---
.../sentryPlugins/ai_block_io/alarm_report.py | 65 ++++++++++++++-----
.../sentryPlugins/ai_block_io/data_access.py | 5 +-
.../sentryPlugins/ai_block_io/detector.py | 2 +-
5 files changed, 73 insertions(+), 31 deletions(-)
diff --git a/config/tasks/ai_block_io.mod b/config/tasks/ai_block_io.mod
index 1971d7d..82f4f0b 100644
--- a/config/tasks/ai_block_io.mod
+++ b/config/tasks/ai_block_io.mod
@@ -2,4 +2,6 @@
enabled=yes
task_start=/usr/bin/python3 /usr/bin/ai_block_io
task_stop=pkill -f /usr/bin/ai_block_io
-type=oneshot
\ No newline at end of file
+type=oneshot
+alarm_id=1002
+alarm_clear_time=5
\ No newline at end of file
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
index 3b00ef3..77104a9 100644
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
@@ -20,14 +20,14 @@ from .utils import get_data_queue_size_and_update_size
from .config_parser import ConfigParser
from .data_access import get_io_data_from_collect_plug, check_collect_valid
from .io_data import MetricName
-from .alarm_report import AlarmReport
+from .alarm_report import Xalarm, Report
CONFIG_FILE = "/etc/sysSentry/plugins/ai_block_io.ini"
def sig_handler(signum, frame):
logging.info("receive signal: %d", signum)
- AlarmReport().report_fail(f"receive signal: {signum}")
+ Report.report_pass(f"receive signal: {signum}, exiting...")
exit(signum)
@@ -44,6 +44,10 @@ class SlowIODetection:
def __init_detector_name_list(self):
self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency())
+ if self._disk_list is None:
+ Report.report_pass("get available disk error, please check if the collector plug is enable. exiting...")
+ exit(1)
+
logging.info(f"ai_block_io plug has found disks: {self._disk_list}")
disks_to_detection: list = self._config_parser.get_disks_to_detection()
# 情况1：None，则启用所有磁盘检测
@@ -101,7 +105,8 @@ class SlowIODetection:
)
logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}')
if io_data_dict_with_disk_name is None:
- continue
+ Report.report_pass("get io data error, please check if the collector plug is enable. exitting...")
+ exit(1)
# Step2：慢IO检测
logging.debug('step2. Start to detection slow io event.')
@@ -117,13 +122,16 @@ class SlowIODetection:
for slow_io_event in slow_io_event_list:
metric_name: MetricName = slow_io_event[0]
result = slow_io_event[1]
- alarm_content = (f"disk {metric_name.get_disk_name()} has slow io event. "
- f"stage is: {metric_name.get_stage_name()}, "
- f"io access type is: {metric_name.get_io_access_type_name()}, "
- f"metric is: {metric_name.get_metric_name()}, "
- f"current window is: {result[1]}, "
- f"threshold is: {result[2]}")
- AlarmReport.report_major_alm(alarm_content)
+ alarm_content = {
+ "driver_name": f"{metric_name.get_disk_name()}",
+ "reason": "disk_slow",
+ "block_stack": f"{metric_name.get_stage_name()}",
+ "io_type": f"{metric_name.get_io_access_type_name()}",
+ "alarm_source": "ai_block_io",
+ "alarm_type": "latency",
+ "details": f"current window is: {result[1]}, threshold is: {result[2]}.",
+ }
+ Xalarm.major(alarm_content)
logging.warning(alarm_content)
# Step4：等待检测时间
diff --git a/src/python/sentryPlugins/ai_block_io/alarm_report.py b/src/python/sentryPlugins/ai_block_io/alarm_report.py
index 230c8cd..92bd6e3 100644
--- a/src/python/sentryPlugins/ai_block_io/alarm_report.py
+++ b/src/python/sentryPlugins/ai_block_io/alarm_report.py
@@ -9,41 +9,72 @@
# PURPOSE.
# See the Mulan PSL v2 for more details.
-from syssentry.result import ResultLevel, report_result
import logging
import json
+from xalarm.sentry_notify import (
+ xalarm_report,
+ MINOR_ALM,
+ MAJOR_ALM,
+ CRITICAL_ALM,
+ ALARM_TYPE_OCCUR,
+ ALARM_TYPE_RECOVER,
+)
+
+from syssentry.result import ResultLevel, report_result
+
-class AlarmReport:
+class Report:
TASK_NAME = "ai_block_io"
@staticmethod
def report_pass(info: str):
- report_result(AlarmReport.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info}))
- logging.info(f'Report {AlarmReport.TASK_NAME} PASS: {info}')
+ report_result(Report.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info}))
+ logging.info(f'Report {Report.TASK_NAME} PASS: {info}')
@staticmethod
def report_fail(info: str):
- report_result(AlarmReport.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info}))
- logging.info(f'Report {AlarmReport.TASK_NAME} FAIL: {info}')
+ report_result(Report.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info}))
+ logging.info(f'Report {Report.TASK_NAME} FAIL: {info}')
@staticmethod
def report_skip(info: str):
- report_result(AlarmReport.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info}))
- logging.info(f'Report {AlarmReport.TASK_NAME} SKIP: {info}')
+ report_result(Report.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info}))
+ logging.info(f'Report {Report.TASK_NAME} SKIP: {info}')
+
+
+class Xalarm:
+ ALARM_ID = 1002
@staticmethod
- def report_minor_alm(info: str):
- report_result(AlarmReport.TASK_NAME, ResultLevel.MINOR_ALM, json.dumps({"msg": info}))
- logging.info(f'Report {AlarmReport.TASK_NAME} MINOR_ALM: {info}')
+ def minor(info: dict):
+ info_str = json.dumps(info)
+ xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_OCCUR, info_str)
+ logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM: {info_str}")
@staticmethod
- def report_major_alm(info: str):
- report_result(AlarmReport.TASK_NAME, ResultLevel.MAJOR_ALM, json.dumps({"msg": info}))
- logging.info(f'Report {AlarmReport.TASK_NAME} MAJOR_ALM: {info}')
+ def major(info: dict):
+ info_str = json.dumps(info)
+ xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_OCCUR, info_str)
+ logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM: {info_str}")
@staticmethod
- def report_critical_alm(info: str):
- report_result(AlarmReport.TASK_NAME, ResultLevel.CRITICAL_ALM, json.dumps({"msg": info}))
- logging.info(f'Report {AlarmReport.TASK_NAME} CRITICAL_ALM: {info}')
+ def critical(info: dict):
+ info_str = json.dumps(info)
+ xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_OCCUR, info_str)
+ logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM: {info_str}")
+
+ def minor_recover(info: dict):
+ info_str = json.dumps(info)
+ xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_RECOVER, info_str)
+ logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM Recover: {info_str}")
+
+ def major_recover(info: dict):
+ info_str = json.dumps(info)
+ xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_RECOVER, info_str)
+ logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM Recover: {info_str}")
+ def critical_recover(info: dict):
+ info_str = json.dumps(info)
+ xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_RECOVER, info_str)
+ logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM Recover: {info_str}")
diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py
index 01c5315..c7679cd 100644
--- a/src/python/sentryPlugins/ai_block_io/data_access.py
+++ b/src/python/sentryPlugins/ai_block_io/data_access.py
@@ -42,10 +42,11 @@ def check_collect_valid(period):
data = json.loads(data_raw["message"])
except Exception as e:
logging.warning(f"get io data failed, {e}")
- return []
+ return None
return [k for k in data.keys()]
else:
- return []
+ logging.warning(f"get io data failed, return {data_raw}")
+ return None
def _get_raw_data(period, disk_list):
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
index a48144f..0ed282b 100644
--- a/src/python/sentryPlugins/ai_block_io/detector.py
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
@@ -35,7 +35,7 @@ class Detector:
self._count += 1
if self._count % 15 == 0:
self._count = 0
- logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.")
+ logging.debug(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.")
logging.debug(f'enter Detector: {self}')
metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name)
if metric_value is None:
--
2.23.0