222 lines
9.7 KiB
Diff
222 lines
9.7 KiB
Diff
|
|
From 367f8ab8a5ad26d80caf1bc4529c79d279ef0fb1 Mon Sep 17 00:00:00 2001
|
|||
|
|
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
|
|||
|
|
Date: Thu, 10 Oct 2024 17:21:48 +0800
|
|||
|
|
Subject: [PATCH] ai_block_io adapt alarm module
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
config/tasks/ai_block_io.mod | 4 +-
|
|||
|
|
.../sentryPlugins/ai_block_io/ai_block_io.py | 28 +++++---
|
|||
|
|
.../sentryPlugins/ai_block_io/alarm_report.py | 65 ++++++++++++++-----
|
|||
|
|
.../sentryPlugins/ai_block_io/data_access.py | 5 +-
|
|||
|
|
.../sentryPlugins/ai_block_io/detector.py | 2 +-
|
|||
|
|
5 files changed, 73 insertions(+), 31 deletions(-)
|
|||
|
|
|
|||
|
|
diff --git a/config/tasks/ai_block_io.mod b/config/tasks/ai_block_io.mod
|
|||
|
|
index 1971d7d..82f4f0b 100644
|
|||
|
|
--- a/config/tasks/ai_block_io.mod
|
|||
|
|
+++ b/config/tasks/ai_block_io.mod
|
|||
|
|
@@ -2,4 +2,6 @@
|
|||
|
|
enabled=yes
|
|||
|
|
task_start=/usr/bin/python3 /usr/bin/ai_block_io
|
|||
|
|
task_stop=pkill -f /usr/bin/ai_block_io
|
|||
|
|
-type=oneshot
|
|||
|
|
\ No newline at end of file
|
|||
|
|
+type=oneshot
|
|||
|
|
+alarm_id=1002
|
|||
|
|
+alarm_clear_time=5
|
|||
|
|
\ No newline at end of file
|
|||
|
|
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
|||
|
|
index 3b00ef3..77104a9 100644
|
|||
|
|
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
|||
|
|
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
|
|||
|
|
@@ -20,14 +20,14 @@ from .utils import get_data_queue_size_and_update_size
|
|||
|
|
from .config_parser import ConfigParser
|
|||
|
|
from .data_access import get_io_data_from_collect_plug, check_collect_valid
|
|||
|
|
from .io_data import MetricName
|
|||
|
|
-from .alarm_report import AlarmReport
|
|||
|
|
+from .alarm_report import Xalarm, Report
|
|||
|
|
|
|||
|
|
CONFIG_FILE = "/etc/sysSentry/plugins/ai_block_io.ini"
|
|||
|
|
|
|||
|
|
|
|||
|
|
def sig_handler(signum, frame):
|
|||
|
|
logging.info("receive signal: %d", signum)
|
|||
|
|
- AlarmReport().report_fail(f"receive signal: {signum}")
|
|||
|
|
+ Report.report_pass(f"receive signal: {signum}, exiting...")
|
|||
|
|
exit(signum)
|
|||
|
|
|
|||
|
|
|
|||
|
|
@@ -44,6 +44,10 @@ class SlowIODetection:
|
|||
|
|
|
|||
|
|
def __init_detector_name_list(self):
|
|||
|
|
self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency())
|
|||
|
|
+ if self._disk_list is None:
|
|||
|
|
+ Report.report_pass("get available disk error, please check if the collector plug is enable. exiting...")
|
|||
|
|
+ exit(1)
|
|||
|
|
+
|
|||
|
|
logging.info(f"ai_block_io plug has found disks: {self._disk_list}")
|
|||
|
|
disks_to_detection: list = self._config_parser.get_disks_to_detection()
|
|||
|
|
# 情况1:None,则启用所有磁盘检测
|
|||
|
|
@@ -101,7 +105,8 @@ class SlowIODetection:
|
|||
|
|
)
|
|||
|
|
logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}')
|
|||
|
|
if io_data_dict_with_disk_name is None:
|
|||
|
|
- continue
|
|||
|
|
+ Report.report_pass("get io data error, please check if the collector plug is enable. exiting...")
|
|||
|
|
+ exit(1)
|
|||
|
|
|
|||
|
|
# Step2:慢IO检测
|
|||
|
|
logging.debug('step2. Start to detection slow io event.')
|
|||
|
|
@@ -117,13 +122,16 @@ class SlowIODetection:
|
|||
|
|
for slow_io_event in slow_io_event_list:
|
|||
|
|
metric_name: MetricName = slow_io_event[0]
|
|||
|
|
result = slow_io_event[1]
|
|||
|
|
- alarm_content = (f"disk {metric_name.get_disk_name()} has slow io event. "
|
|||
|
|
- f"stage is: {metric_name.get_stage_name()}, "
|
|||
|
|
- f"io access type is: {metric_name.get_io_access_type_name()}, "
|
|||
|
|
- f"metric is: {metric_name.get_metric_name()}, "
|
|||
|
|
- f"current window is: {result[1]}, "
|
|||
|
|
- f"threshold is: {result[2]}")
|
|||
|
|
- AlarmReport.report_major_alm(alarm_content)
|
|||
|
|
+ alarm_content = {
|
|||
|
|
+ "driver_name": f"{metric_name.get_disk_name()}",
|
|||
|
|
+ "reason": "disk_slow",
|
|||
|
|
+ "block_stack": f"{metric_name.get_stage_name()}",
|
|||
|
|
+ "io_type": f"{metric_name.get_io_access_type_name()}",
|
|||
|
|
+ "alarm_source": "ai_block_io",
|
|||
|
|
+ "alarm_type": "latency",
|
|||
|
|
+ "details": f"current window is: {result[1]}, threshold is: {result[2]}.",
|
|||
|
|
+ }
|
|||
|
|
+ Xalarm.major(alarm_content)
|
|||
|
|
logging.warning(alarm_content)
|
|||
|
|
|
|||
|
|
# Step4:等待检测时间
|
|||
|
|
diff --git a/src/python/sentryPlugins/ai_block_io/alarm_report.py b/src/python/sentryPlugins/ai_block_io/alarm_report.py
|
|||
|
|
index 230c8cd..92bd6e3 100644
|
|||
|
|
--- a/src/python/sentryPlugins/ai_block_io/alarm_report.py
|
|||
|
|
+++ b/src/python/sentryPlugins/ai_block_io/alarm_report.py
|
|||
|
|
@@ -9,41 +9,72 @@
|
|||
|
|
# PURPOSE.
|
|||
|
|
# See the Mulan PSL v2 for more details.
|
|||
|
|
|
|||
|
|
-from syssentry.result import ResultLevel, report_result
|
|||
|
|
import logging
|
|||
|
|
import json
|
|||
|
|
|
|||
|
|
+from xalarm.sentry_notify import (
|
|||
|
|
+ xalarm_report,
|
|||
|
|
+ MINOR_ALM,
|
|||
|
|
+ MAJOR_ALM,
|
|||
|
|
+ CRITICAL_ALM,
|
|||
|
|
+ ALARM_TYPE_OCCUR,
|
|||
|
|
+ ALARM_TYPE_RECOVER,
|
|||
|
|
+)
|
|||
|
|
+
|
|||
|
|
+from syssentry.result import ResultLevel, report_result
|
|||
|
|
+
|
|||
|
|
|
|||
|
|
-class AlarmReport:
|
|||
|
|
+class Report:
|
|||
|
|
TASK_NAME = "ai_block_io"
|
|||
|
|
|
|||
|
|
@staticmethod
|
|||
|
|
def report_pass(info: str):
|
|||
|
|
- report_result(AlarmReport.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info}))
|
|||
|
|
- logging.info(f'Report {AlarmReport.TASK_NAME} PASS: {info}')
|
|||
|
|
+ report_result(Report.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info}))
|
|||
|
|
+ logging.info(f'Report {Report.TASK_NAME} PASS: {info}')
|
|||
|
|
|
|||
|
|
@staticmethod
|
|||
|
|
def report_fail(info: str):
|
|||
|
|
- report_result(AlarmReport.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info}))
|
|||
|
|
- logging.info(f'Report {AlarmReport.TASK_NAME} FAIL: {info}')
|
|||
|
|
+ report_result(Report.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info}))
|
|||
|
|
+ logging.info(f'Report {Report.TASK_NAME} FAIL: {info}')
|
|||
|
|
|
|||
|
|
@staticmethod
|
|||
|
|
def report_skip(info: str):
|
|||
|
|
- report_result(AlarmReport.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info}))
|
|||
|
|
- logging.info(f'Report {AlarmReport.TASK_NAME} SKIP: {info}')
|
|||
|
|
+ report_result(Report.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info}))
|
|||
|
|
+ logging.info(f'Report {Report.TASK_NAME} SKIP: {info}')
|
|||
|
|
+
|
|||
|
|
+
|
|||
|
|
+class Xalarm:
|
|||
|
|
+ ALARM_ID = 1002
|
|||
|
|
|
|||
|
|
@staticmethod
|
|||
|
|
- def report_minor_alm(info: str):
|
|||
|
|
- report_result(AlarmReport.TASK_NAME, ResultLevel.MINOR_ALM, json.dumps({"msg": info}))
|
|||
|
|
- logging.info(f'Report {AlarmReport.TASK_NAME} MINOR_ALM: {info}')
|
|||
|
|
+ def minor(info: dict):
|
|||
|
|
+ info_str = json.dumps(info)
|
|||
|
|
+ xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_OCCUR, info_str)
|
|||
|
|
+ logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM: {info_str}")
|
|||
|
|
|
|||
|
|
@staticmethod
|
|||
|
|
- def report_major_alm(info: str):
|
|||
|
|
- report_result(AlarmReport.TASK_NAME, ResultLevel.MAJOR_ALM, json.dumps({"msg": info}))
|
|||
|
|
- logging.info(f'Report {AlarmReport.TASK_NAME} MAJOR_ALM: {info}')
|
|||
|
|
+ def major(info: dict):
|
|||
|
|
+ info_str = json.dumps(info)
|
|||
|
|
+ xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_OCCUR, info_str)
|
|||
|
|
+ logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM: {info_str}")
|
|||
|
|
|
|||
|
|
@staticmethod
|
|||
|
|
- def report_critical_alm(info: str):
|
|||
|
|
- report_result(AlarmReport.TASK_NAME, ResultLevel.CRITICAL_ALM, json.dumps({"msg": info}))
|
|||
|
|
- logging.info(f'Report {AlarmReport.TASK_NAME} CRITICAL_ALM: {info}')
|
|||
|
|
+ def critical(info: dict):
|
|||
|
|
+ info_str = json.dumps(info)
|
|||
|
|
+ xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_OCCUR, info_str)
|
|||
|
|
+ logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM: {info_str}")
|
|||
|
|
+
|
|||
|
|
+ def minor_recover(info: dict):
|
|||
|
|
+ info_str = json.dumps(info)
|
|||
|
|
+ xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_RECOVER, info_str)
|
|||
|
|
+ logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM Recover: {info_str}")
|
|||
|
|
+
|
|||
|
|
+ def major_recover(info: dict):
|
|||
|
|
+ info_str = json.dumps(info)
|
|||
|
|
+ xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_RECOVER, info_str)
|
|||
|
|
+ logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM Recover: {info_str}")
|
|||
|
|
|
|||
|
|
+ def critical_recover(info: dict):
|
|||
|
|
+ info_str = json.dumps(info)
|
|||
|
|
+ xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_RECOVER, info_str)
|
|||
|
|
+ logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM Recover: {info_str}")
|
|||
|
|
diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py
|
|||
|
|
index 01c5315..c7679cd 100644
|
|||
|
|
--- a/src/python/sentryPlugins/ai_block_io/data_access.py
|
|||
|
|
+++ b/src/python/sentryPlugins/ai_block_io/data_access.py
|
|||
|
|
@@ -42,10 +42,11 @@ def check_collect_valid(period):
|
|||
|
|
data = json.loads(data_raw["message"])
|
|||
|
|
except Exception as e:
|
|||
|
|
logging.warning(f"get io data failed, {e}")
|
|||
|
|
- return []
|
|||
|
|
+ return None
|
|||
|
|
return [k for k in data.keys()]
|
|||
|
|
else:
|
|||
|
|
- return []
|
|||
|
|
+ logging.warning(f"get io data failed, return {data_raw}")
|
|||
|
|
+ return None
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _get_raw_data(period, disk_list):
|
|||
|
|
diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py
|
|||
|
|
index a48144f..0ed282b 100644
|
|||
|
|
--- a/src/python/sentryPlugins/ai_block_io/detector.py
|
|||
|
|
+++ b/src/python/sentryPlugins/ai_block_io/detector.py
|
|||
|
|
@@ -35,7 +35,7 @@ class Detector:
|
|||
|
|
self._count += 1
|
|||
|
|
if self._count % 15 == 0:
|
|||
|
|
self._count = 0
|
|||
|
|
- logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.")
|
|||
|
|
+ logging.debug(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.")
|
|||
|
|
logging.debug(f'enter Detector: {self}')
|
|||
|
|
metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name)
|
|||
|
|
if metric_value is None:
|
|||
|
|
--
|
|||
|
|
2.23.0
|
|||
|
|
|