From 367f8ab8a5ad26d80caf1bc4529c79d279ef0fb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com>
Date: Thu, 10 Oct 2024 17:21:48 +0800
Subject: [PATCH] ai_block_io: adapt to the alarm module

---
 config/tasks/ai_block_io.mod                  |  4 +-
 .../sentryPlugins/ai_block_io/ai_block_io.py  | 28 +++++---
 .../sentryPlugins/ai_block_io/alarm_report.py | 65 ++++++++++++++-----
 .../sentryPlugins/ai_block_io/data_access.py  |  5 +-
 .../sentryPlugins/ai_block_io/detector.py     |  2 +-
 5 files changed, 73 insertions(+), 31 deletions(-)

diff --git a/config/tasks/ai_block_io.mod b/config/tasks/ai_block_io.mod
index 1971d7d..82f4f0b 100644
--- a/config/tasks/ai_block_io.mod
+++ b/config/tasks/ai_block_io.mod
@@ -2,4 +2,6 @@
 enabled=yes
 task_start=/usr/bin/python3 /usr/bin/ai_block_io
 task_stop=pkill -f /usr/bin/ai_block_io
-type=oneshot
\ No newline at end of file
+type=oneshot
+alarm_id=1002
+alarm_clear_time=5
\ No newline at end of file
diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
index 3b00ef3..77104a9 100644
--- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py
+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py
@@ -20,14 +20,14 @@ from .utils import get_data_queue_size_and_update_size
 from .config_parser import ConfigParser
 from .data_access import get_io_data_from_collect_plug, check_collect_valid
 from .io_data import MetricName
-from .alarm_report import AlarmReport
+from .alarm_report import Xalarm, Report
 
 CONFIG_FILE = "/etc/sysSentry/plugins/ai_block_io.ini"
 
 
 def sig_handler(signum, frame):
     logging.info("receive signal: %d", signum)
-    AlarmReport().report_fail(f"receive signal: {signum}")
+    Report.report_pass(f"receive signal: {signum}, exiting...")
     exit(signum)
 
 
@@ -44,6 +44,10 @@ class SlowIODetection:
 
     def __init_detector_name_list(self):
         self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency())
+        if self._disk_list is None:
+            Report.report_pass("get available disk error, please check if the collector plug is enabled. exiting...")
+            exit(1)
+
         logging.info(f"ai_block_io plug has found disks: {self._disk_list}")
         disks_to_detection: list = self._config_parser.get_disks_to_detection()
         # 情况1:None,则启用所有磁盘检测
@@ -101,7 +105,8 @@ class SlowIODetection:
             )
             logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}')
             if io_data_dict_with_disk_name is None:
-                continue
+                Report.report_pass("get io data error, please check if the collector plug is enabled. exiting...")
+                exit(1)
 
             # Step2:慢IO检测
             logging.debug('step2. Start to detection slow io event.')
@@ -117,13 +122,16 @@ class SlowIODetection:
             for slow_io_event in slow_io_event_list:
                 metric_name: MetricName = slow_io_event[0]
                 result = slow_io_event[1]
-                alarm_content = (f"disk {metric_name.get_disk_name()} has slow io event. "
" - f"stage is: {metric_name.get_stage_name()}, " - f"io access type is: {metric_name.get_io_access_type_name()}, " - f"metric is: {metric_name.get_metric_name()}, " - f"current window is: {result[1]}, " - f"threshold is: {result[2]}") - AlarmReport.report_major_alm(alarm_content) + alarm_content = { + "driver_name": f"{metric_name.get_disk_name()}", + "reason": "disk_slow", + "block_stack": f"{metric_name.get_stage_name()}", + "io_type": f"{metric_name.get_io_access_type_name()}", + "alarm_source": "ai_block_io", + "alarm_type": "latency", + "details": f"current window is: {result[1]}, threshold is: {result[2]}.", + } + Xalarm.major(alarm_content) logging.warning(alarm_content) # Step4:等待检测时间 diff --git a/src/python/sentryPlugins/ai_block_io/alarm_report.py b/src/python/sentryPlugins/ai_block_io/alarm_report.py index 230c8cd..92bd6e3 100644 --- a/src/python/sentryPlugins/ai_block_io/alarm_report.py +++ b/src/python/sentryPlugins/ai_block_io/alarm_report.py @@ -9,41 +9,72 @@ # PURPOSE. # See the Mulan PSL v2 for more details. -from syssentry.result import ResultLevel, report_result import logging import json +from xalarm.sentry_notify import ( + xalarm_report, + MINOR_ALM, + MAJOR_ALM, + CRITICAL_ALM, + ALARM_TYPE_OCCUR, + ALARM_TYPE_RECOVER, +) + +from syssentry.result import ResultLevel, report_result + -class AlarmReport: +class Report: TASK_NAME = "ai_block_io" @staticmethod def report_pass(info: str): - report_result(AlarmReport.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info})) - logging.info(f'Report {AlarmReport.TASK_NAME} PASS: {info}') + report_result(Report.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info})) + logging.info(f'Report {Report.TASK_NAME} PASS: {info}') @staticmethod def report_fail(info: str): - report_result(AlarmReport.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info})) - logging.info(f'Report {AlarmReport.TASK_NAME} FAIL: {info}') + report_result(Report.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info})) + logging.info(f'Report {Report.TASK_NAME} FAIL: {info}') @staticmethod def report_skip(info: str): - report_result(AlarmReport.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info})) - logging.info(f'Report {AlarmReport.TASK_NAME} SKIP: {info}') + report_result(Report.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info})) + logging.info(f'Report {Report.TASK_NAME} SKIP: {info}') + + +class Xalarm: + ALARM_ID = 1002 @staticmethod - def report_minor_alm(info: str): - report_result(AlarmReport.TASK_NAME, ResultLevel.MINOR_ALM, json.dumps({"msg": info})) - logging.info(f'Report {AlarmReport.TASK_NAME} MINOR_ALM: {info}') + def minor(info: dict): + info_str = json.dumps(info) + xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_OCCUR, info_str) + logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM: {info_str}") @staticmethod - def report_major_alm(info: str): - report_result(AlarmReport.TASK_NAME, ResultLevel.MAJOR_ALM, json.dumps({"msg": info})) - logging.info(f'Report {AlarmReport.TASK_NAME} MAJOR_ALM: {info}') + def major(info: dict): + info_str = json.dumps(info) + xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_OCCUR, info_str) + logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM: {info_str}") @staticmethod - def report_critical_alm(info: str): - report_result(AlarmReport.TASK_NAME, ResultLevel.CRITICAL_ALM, json.dumps({"msg": info})) - logging.info(f'Report {AlarmReport.TASK_NAME} CRITICAL_ALM: {info}') + def critical(info: dict): + info_str = json.dumps(info) + xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_OCCUR, info_str) + 
logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM: {info_str}") + + def minor_recover(info: dict): + info_str = json.dumps(info) + xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_RECOVER, info_str) + logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM Recover: {info_str}") + + def major_recover(info: dict): + info_str = json.dumps(info) + xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_RECOVER, info_str) + logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM Recover: {info_str}") + def critical_recover(info: dict): + info_str = json.dumps(info) + xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_RECOVER, info_str) + logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM Recover: {info_str}") diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py index 01c5315..c7679cd 100644 --- a/src/python/sentryPlugins/ai_block_io/data_access.py +++ b/src/python/sentryPlugins/ai_block_io/data_access.py @@ -42,10 +42,11 @@ def check_collect_valid(period): data = json.loads(data_raw["message"]) except Exception as e: logging.warning(f"get io data failed, {e}") - return [] + return None return [k for k in data.keys()] else: - return [] + logging.warning(f"get io data failed, return {data_raw}") + return None def _get_raw_data(period, disk_list): diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py index a48144f..0ed282b 100644 --- a/src/python/sentryPlugins/ai_block_io/detector.py +++ b/src/python/sentryPlugins/ai_block_io/detector.py @@ -35,7 +35,7 @@ class Detector: self._count += 1 if self._count % 15 == 0: self._count = 0 - logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.") + logging.debug(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.") logging.debug(f'enter Detector: {self}') metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name) if metric_value is None: -- 2.23.0