gala-anteater/0001-add-new-feature-slow-node-detection.patch
From 2e30b68154f5d0b6a68eab1bf8408363bbf28a76 Mon Sep 17 00:00:00 2001
From: huangbin <huangbin58@huawei.com>
Date: Tue, 5 Nov 2024 15:37:00 +0800
Subject: [PATCH] add new feature slow node detection
---
anteater/model/detector/slow_node_detector.py | 397 ++++++++++++++++++
config/module/slow_node_detection.job.json | 352 ++++++++++++++++
2 files changed, 749 insertions(+)
create mode 100644 anteater/model/detector/slow_node_detector.py
create mode 100644 config/module/slow_node_detection.job.json
diff --git a/anteater/model/detector/slow_node_detector.py b/anteater/model/detector/slow_node_detector.py
new file mode 100644
index 0000000..15a6cee
--- /dev/null
+++ b/anteater/model/detector/slow_node_detector.py
@@ -0,0 +1,397 @@
+#!/usr/bin/python3
+# ******************************************************************************
+# Copyright (c) 2023 Huawei Technologies Co., Ltd.
+# gala-anteater is licensed under Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+# See the Mulan PSL v2 for more details.
+# ******************************************************************************/
+import json
+import os.path
+import pprint
+import traceback
+from typing import List
+
+import numpy as np
+import pandas as pd
+
+from anteater.core.slow_node_response import AIJobDetectResult, ResultCode, NodeData
+from anteater.core.anomaly import Anomaly
+from anteater.core.kpi import KPI, ModelConfig, Feature
+from anteater.utils.datetime import DateTimeManager as dt
+from anteater.utils.timer import timer
+from anteater.utils.log import logger
+from anteater.source.metric_loader import MetricLoader
+from anteater.model.detector.base import Detector
+from anteater.model.process.rank_table_loader import GroupDataLoader
+from anteater.model.algorithms.slow_node_algs import time_node_detectors, space_node_detectors
+
+
+class SlowNodeDetector(Detector):
+ def __init__(self, data_loader: MetricLoader, config: ModelConfig, **kwargs):
+ """The detector base class initializer"""
+ super().__init__(data_loader, **kwargs)
+ self.config = config
+ self.max_num_normal_results = self.config.params.get("max_num_normal_results", 10)
+ self.record_kpi_value = self.config.params.get("record_kpi", False)
+ self.hccl_domain, self.rank_table = self._init_hccl_and_rank_table()
+
+ def _init_hccl_and_rank_table(self):
+ params = self.config.params
+ hccl_domain_path = params.get("hccl_domain_json")
+ rank_table_path = params.get("rank_table_json")
+
+ hccl_domain = {}
+ rank_table = {}
+
+        if hccl_domain_path and os.path.exists(hccl_domain_path):
+            try:
+                with open(hccl_domain_path, 'r', encoding='utf-8') as f_in:
+                    hccl_domain = json.load(f_in)
+            except Exception:
+                logger.error("Failed to read hccl domain info!")
+        if rank_table_path and os.path.exists(rank_table_path):
+            try:
+                with open(rank_table_path, 'r', encoding='utf-8') as f_in:
+                    rank_table = json.load(f_in)
+            except Exception:
+                logger.error("Failed to read rank table info!")
+
+ return hccl_domain, rank_table
+
+ @staticmethod
+ def npu_id2host_id(machines2devices: dict):
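+        """Split instances into hosts and npu devices, then map each npu
+        instance id to the host id that shares its ip prefix.
+
+        Instances whose device list is [""] are treated as hosts.
+        """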
+ npu_id2host_id_dict = {}
+ npu_ids = []
+ hosts_ids = []
+ for machine_ip, devices in machines2devices.items():
+ if devices == [""]:
+ hosts_ids.append(machine_ip)
+ else:
+ npu_ids.append(machine_ip)
+
+ for npu_id in npu_ids:
+ for host_id in hosts_ids:
+ if npu_id.split(":")[0] in host_id:
+ npu_id2host_id_dict[npu_id] = host_id
+ break
+
+ return npu_id2host_id_dict, hosts_ids
+
+ def get_host_ids_by_npu_ids(self, npu_ids: dict, npu_id2host_id_dict: dict, hosts_ids: list) -> list:
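+        """Resolve the host ids behind the given npu ids; if no npu ids are
+        provided, fall back to all known host ids."""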
+ host_ids = []
+ if npu_ids:
+ for npu_id in npu_ids:
+ host_id = npu_id2host_id_dict.get(npu_id, "")
+ if host_id:
+ host_ids.append(host_id)
+ else:
+ host_ids = hosts_ids
+
+ return host_ids
+
+ @timer
+ def _execute(self, kpis: List[KPI], features: List[Feature], **kwargs) \
+ -> List[Anomaly]:
+ # save to kafka response
+ anomalies = []
+
+ logger.info('Execute cdt model: %s.', self.__class__.__name__)
+ start, end = dt.last(minutes=30)
+        # get the machine ids
+ machines_to_devices = self.get_machines_to_devices(start, end, kpis)
+ npu_id2host_id, hosts_ids = self.npu_id2host_id(machines_to_devices)
+
+ group_dataloader = GroupDataLoader(self.hccl_domain, self.rank_table, machines_to_devices)
+ group_ranks: list = group_dataloader.get_group_ranks()
+ all_results = []
+ for kpi in kpis:
+ for ranks in group_ranks:
+ machine_ids: dict = group_dataloader.rank_table_loader.get_group_nodes_by_ranks(ranks)
+ host_ids: list = self.get_host_ids_by_npu_ids(machine_ids, npu_id2host_id, hosts_ids)
+ group_result = self.group_detect_single_kpi(kpi, machine_ids, host_ids)
+ all_results.extend(group_result)
+
+ response, all_anomaly_nodes = self.gen_final_alarm(kpis, all_results)
+
+ if response.result_code == ResultCode.anomaly:
+ all_anomaly_nodes = sorted(list(set(all_anomaly_nodes)))
+ anomaly = Anomaly(
+ machine_id=json.dumps(all_anomaly_nodes),
+ metric="slow_node_metric",
+ labels={"instance": "node_ip"},
+ score=1.0,
+ entity_name="sli",
+ details={"detect_method": "slow_node_detection"},
+ description=response)
+ anomalies.append(anomaly)
+
+ return anomalies
+
+ def gen_final_alarm(self, kpis: List[KPI], detect_results: List):
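+        """Aggregate all group detection results into one alarm response and
+        collect the full list of anomalous nodes."""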
+ response = AIJobDetectResult()
+ all_anomaly_nodes = []
+
+ for index, result in enumerate(detect_results):
+ try:
+                anomaly_devices = result.get("anomaly_devices")
+                all_anomaly_nodes.extend(anomaly_devices)
+ response = self.group_detect_ret_agg(response, result, kpis)
+ except Exception:
+ logger.error(traceback.format_exc())
+ logger.info("accomplishment: %s/%s", index + 1, len(detect_results))
+
+ return response, all_anomaly_nodes
+
+ def group_detect_single_kpi(self, kpi: KPI, machine_ids: dict, host_ids: list) -> list:
+ """Detects kpi based on signal time series anomaly detection model"""
+ # 普罗会一次性抓到所有的数据需要根据machine_id, device_id去对数据作分组
+ # get数据
+ metric_name: str = kpi.metric
+
+ all_machines_ts = []
+ for machine_id in machine_ids:
+ single_machine_ts_list = self.get_kpi_ts_list(metric_name, machine_id, kpi.params)
+ all_machines_ts.extend(single_machine_ts_list)
+ for host_id in host_ids:
+ single_machine_ts_list = self.get_kpi_ts_list(metric_name, host_id, kpi.params)
+ all_machines_ts.extend(single_machine_ts_list)
+
+ anomaly_devices = []
+ anomaly_locations = {}
+ space_anomaly_locations = {}
+
+ detect_data, min_data_len = self.preprocessing_data(metric_name, all_machines_ts)
+ detection_results = {
+ "anomaly_devices": anomaly_devices,
+ "anomaly_locations": anomaly_locations,
+ "detect_result_type": "TIME",
+ "metric_name": metric_name,
+ "group_data": detect_data,
+ }
+ if min_data_len == 0:
+ logger.warning("GROUP data contains EMPTY DATA. GROUP_DATA:%s", pprint.pformat(all_machines_ts))
+ return [detection_results]
+ logger.info("work on %s, %s start.", metric_name, "slow_node_detection")
+
+        # time-dimension detection
+        time_anomaly_locations = self.time_node_compare(kpi, detect_data)
+        logger.info(f"time_node_compare result: {self.output_anomaly_devices(metric_name, time_anomaly_locations)}.")
+
+        # space-dimension comparison
+        # if no space detector is configured for the metric, skip the homogeneity comparison
+        if kpi.params.get("space_detector") is not None:
+            # the homogeneity comparison needs at least four objects
+            if len(all_machines_ts) >= 4:
+                # compare across the space dimension and output the anomalous nodes
+                space_anomaly_locations = self.space_nodes_compare(kpi, detect_data)
+                logger.info(
+                    f"space_nodes_compare result: {self.output_anomaly_devices(metric_name, space_anomaly_locations)}.")
+            else:
+                logger.info(f"Skip space nodes compare, because the number of nodes {len(all_machines_ts)} is smaller than 4.")
+        else:
+            logger.info("Skip space nodes compare.")
+
+        # merge the time and space results
+ anomaly_locations, detect_result_type = self.time_space_agg(time_anomaly_locations, space_anomaly_locations,
+ metric_name)
+
+ anomaly_devices = self.output_anomaly_devices(metric_name, anomaly_locations)
+ detection_results["anomaly_devices"] = anomaly_devices
+ detection_results["anomaly_locations"] = anomaly_locations
+ detection_results["detect_result_type"] = detect_result_type
+
+ logger.info(f'''Time and space aggregated result: {anomaly_devices}.''')
+ logger.info("work on %s, %s end.\n", metric_name, "slow_node_detection")
+
+ return [detection_results]
+
+ @staticmethod
+ def output_anomaly_devices(metric: str, anomaly_location: dict):
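+        """Return the devices whose anomaly labels for the metric contain at
+        least one anomalous point."""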
+ anomaly_devices = []
+        for device_info in anomaly_location:
+            # a metric is considered anomalous when it has at least one anomalous point
+ if np.sum(anomaly_location[device_info][metric][1]) > 0:
+ anomaly_devices.append(device_info)
+
+ return anomaly_devices
+
+ @staticmethod
+ def preprocessing_data(metric_name: str, metric_data: list):
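+        """Convert the raw time series into per-device DataFrames.
+
+        Returns the mapping of device label ("instance*id") to DataFrame plus
+        the minimum series length, so empty series can be detected upstream.
+        """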
+ if len(metric_data) == 0:
+ return {}, 0
+
+        detect_data = {}
+        min_length = None
+        for metric_ts in metric_data:
+            time_stamps = metric_ts.time_stamps
+            min_length = len(time_stamps) if min_length is None else min(min_length, len(time_stamps))
+            values = metric_ts.values
+ labels = metric_ts.labels
+ if labels.get("id"):
+ device_label = f'''{labels.get("instance")}*{labels.get("id")}'''
+ else:
+ device_label = f'''{labels.get("instance")}*-1'''
+ detect_data[device_label] = pd.DataFrame({"timestamp": time_stamps, metric_name: values})
+
+        return detect_data, min_length
+
+ def time_node_compare(self, kpi: KPI, detect_data: dict):
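+        """Run the configured time-dimension detector on each device's series,
+        then suppress non-persistent alarms via the alarm filter window."""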
+ metric_name = kpi.metric
+ cfg = kpi.params.get("time_detector", {})
+ detector_class = time_node_detectors.get(cfg.get("type"))
+
+ time_node_detector = detector_class(metric_name=metric_name, cfg=cfg)
+ time_node_detector.fit(detect_data)
+ locations = time_node_detector.predict(detect_data)
+ expert_alarm_window_size = kpi.params.get("alarm_filter_window_size")
+
+ for device_info, anomaly_locations in locations.items():
+ filter_labels = self.alarm_filter(anomaly_locations[metric_name][1], expert_alarm_window_size)
+ locations[device_info][metric_name][1][:] = filter_labels
+
+ return locations
+
+ def space_nodes_compare(self, kpi: KPI, detect_data: dict):
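+        """Compare the same metric across all devices in the group (homogeneity
+        check) and return per-device anomaly labels aligned with the timestamps."""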
+ metric_name = kpi.metric
+ cfg = kpi.params.get("space_detector", {})
+ detector_class = space_node_detectors.get(cfg.get("type"))
+ space_detector = detector_class(cfg)
+ df = pd.DataFrame()
+ column_list = []
+ for device_label, infer_data in detect_data.items():
+ df[device_label] = infer_data[metric_name]
+ column_list.append(device_label)
+
+ detect_node_data = df[column_list].values
+ labels = space_detector.detect(detect_node_data)
+
+ labels = np.swapaxes(labels, 0, 1)
+ space_detect_locations = {}
+
+        for i, device_label in enumerate(column_list):
+            space_detect_locations[device_label] = {}
+            space_detect_locations[device_label][metric_name] = detect_data[device_label]["timestamp"], labels[i]
+ return space_detect_locations
+
+ def get_kpi_ts_list(self, metric, machine_id: str, kpi_params: dict):
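+        """Load the metric time series for one machine over the configured look-back window.
+
+        Device (npu) metrics are queried per instance; host metrics are
+        aggregated with the configured method (e.g. avg, sum) and grouped by instance.
+        """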
+ look_back = self.config.params.get("look_back", 10)
+ metric_type = kpi_params.get("metric_type", "device")
+ start, end = dt.last(minutes=look_back)
+
+ if metric_type == "device":
+ # npu device
+ ts_list = self.data_loader.get_metric(start, end, metric, instance=machine_id)
+ else:
+ # host
+ op = kpi_params.get("method", "avg")
+ ts_list = self.data_loader.get_metric(start, end, metric, operator=op, keys="instance", instance=machine_id)
+
+ return ts_list
+
+ @staticmethod
+ def alarm_filter(labels, alarm_filter_window_size):
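+        """Suppress alarms that do not persist for a whole filter window.
+
+        A window of labels is copied to the output only when every point in
+        it is anomalous; sporadic alarm points are dropped and recorded.
+        """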
+ copy_labels = np.zeros(len(labels))
+ start_index = alarm_filter_window_size
+ alarm_points = set()
+ for i in range(start_index, len(labels) + 1):
+ is_sequential_alarm = (np.sum(labels[i - alarm_filter_window_size:i]) >= alarm_filter_window_size)
+ if not is_sequential_alarm:
+ if np.sum(labels[i - alarm_filter_window_size:i]) > 0:
+ alarm_points.add(i - alarm_filter_window_size)
+ else:
+ copy_labels[i - alarm_filter_window_size:i] = labels[i - alarm_filter_window_size:i]
+        if alarm_points:
+            logger.info("Alarms removed at point locations: %s", sorted(alarm_points))
+
+ return copy_labels
+
+ @staticmethod
+ def time_space_agg(time_anomaly_locations, space_anomaly_locations, metric_name):
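+        """Merge the time- and space-dimension detection results per node.
+
+        When a space-dimension result is available for a node it replaces the
+        time-dimension result; the returned detect_result_type records which
+        dimension ("TIME" or "SPACE") produced the final labels.
+        """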
+ detect_result_type = {}
+
+ for node_id in time_anomaly_locations.keys():
+ time_ret = np.sum(time_anomaly_locations[node_id][metric_name][1])
+ if space_anomaly_locations:
+ space_ret = np.sum(space_anomaly_locations[node_id][metric_name][1])
+                # if the homogeneity comparison reports no anomaly, the alarm is cleared
+                # if both the space and time detection results are empty, the node is reported as normal
+                # if both the time and space dimensions report anomalies, the space result takes precedence
+ if space_ret == 0 or (space_ret > 0 and time_ret >= 0):
+ time_anomaly_locations[node_id][metric_name] = space_anomaly_locations[node_id][metric_name]
+ detect_result_type.setdefault(node_id, {}).setdefault(metric_name, "SPACE")
+ else:
+ detect_result_type.setdefault(node_id, {}).setdefault(metric_name, "TIME")
+ else:
+ detect_result_type.setdefault(node_id, {}).setdefault(metric_name, "TIME")
+
+ return time_anomaly_locations, detect_result_type
+
+ @staticmethod
+ def _get_kpi_params(kpis: List[KPI], metric_name):
+ for kpi in kpis:
+ if kpi.metric == metric_name:
+ return kpi.params
+
+ return {}
+
+ def group_detect_ret_agg(self, response, detect_result, kpis: List[KPI]):
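+        """Merge one group's detection result into the response.
+
+        Each anomalous device is appended to the abnormal detail; for space
+        detection results, up to max_num_normal_results normal devices are
+        attached as the comparison baseline.
+        """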
+ anomaly_device_labels = detect_result.get("anomaly_devices")
+ anomaly_locations = detect_result.get("anomaly_locations")
+ metric_name = detect_result.get("metric_name")
+ detect_result_type = detect_result.get("detect_result_type")
+ group_data = detect_result.get("group_data")
+        if not anomaly_device_labels:
+            return response
+
+        response.result_code = ResultCode.anomaly
+ kpi_params = self._get_kpi_params(kpis, metric_name)
+ response(kpi_params.get('type', "compute"))
+
+ keep_devices = []
+ omitted_devices = []
+ for device_label in anomaly_device_labels:
+ method_type = detect_result_type.get(device_label, {}).get(metric_name, "TIME")
+ if method_type == "SPACE":
+ normal_devices = sorted(set(group_data.keys()) - set(anomaly_device_labels))
+ keep_devices = normal_devices[:self.max_num_normal_results]
+ omitted_devices = normal_devices[self.max_num_normal_results:]
+ abnormal_node_data = NodeData(metric_name, device_label, method_type, keep_devices, omitted_devices)
+ time_stamp_data, values = anomaly_locations[device_label][metric_name]
+ label_dict = dict(zip(time_stamp_data.tolist(), values.tolist()))
+
+            # recording real kpi values is optional; see user requirements
+ if self.record_kpi_value:
+ # record anomaly kpi value
+ g_ts, g_value = group_data[device_label].values[:, 0], group_data[device_label].values[:, 1]
+ kpi_data = []
+ for key, value in sorted(zip(g_ts.tolist(), g_value.tolist()), key=lambda x: x[0]):
+ kpi_data.append({str(key): str(value), "abnormal": label_dict.get(key, 0)})
+
+ abnormal_node_data.kpi_data = kpi_data
+ response.abnormal_detail.append(abnormal_node_data)
+
+ if keep_devices:
+ for device_label in keep_devices:
+ normal_node_data = NodeData(metric_name, device_label, "SPACE")
+                # recording real kpi values is optional; see user requirements
+ if self.record_kpi_value:
+ # record normal kpi data for space compare
+ g_ts, g_value = group_data[device_label].values[:, 0], group_data[device_label].values[:, 1]
+ kpi_data = [{str(key): str(value)} for key, value in zip(g_ts.tolist(), g_value.tolist())]
+ normal_node_data.kpi_data = kpi_data
+ response.normal_detail.append(normal_node_data)
+ return response
diff --git a/config/module/slow_node_detection.job.json b/config/module/slow_node_detection.job.json
new file mode 100644
index 0000000..91ff621
--- /dev/null
+++ b/config/module/slow_node_detection.job.json
@@ -0,0 +1,352 @@
+{
+ "name": "SlowNodeDetector",
+ "enable": true,
+ "job_type": "anomaly_detection",
+ "keywords": [
+ "app"
+ ],
+ "root_cause_num": 20,
+ "detector": "slow-node-detection",
+ "template": "slow_node",
+ "model_config": {
+ "name": "disruption_model",
+ "params": {
+ "record_kpi": false,
+ "max_num_normal_results": 16,
+ "look_back": 20,
+ "obs_size": 5,
+ "outlier_ratio_th": 0.6,
+ "hccl_domain_json": "./hccl_domain.json",
+ "rank_table_json": "./hccl_domain.json"
+ }
+ },
+ "kpis": [
+ {
+ "metric": "gala_gopher_cpu_total_used_per",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "host",
+ "method": "avg",
+ "priority": 30,
+ "alarm_filter_window_size": 5,
+ "space_detector": null,
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.01,
+ "abs_bias": 0,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 70,
+ "max_expert_upper_bound": 80
+ }
+ },
+ "type": "compute"
+ }
+ },
+ {
+ "metric": "gala_gopher_mem_util",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "host",
+ "method": "sum",
+ "priority": 20,
+ "alarm_filter_window_size": 5,
+ "space_detector": {
+ "first_gap_rate": 0.3,
+ "second_gap_rate": 0.2,
+ "base_threshold": 150,
+ "discrete_rate": 1.5,
+ "nsigma_coefficient": 2,
+ "discrete_point_suppression_ratio": 0.03,
+ "non_major_anomaly_suppression": 0.1,
+ "type": "OuterDataDetector"
+ },
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.1,
+ "abs_bias": 5,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 50,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "compute"
+ }
+ },
+ {
+ "metric": "gala_gopher_disk_wspeed_kB",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "host",
+ "method": "sum",
+ "priority": 5,
+ "alarm_filter_window_size": 30,
+ "space_detector": null,
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 60,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.3,
+ "abs_bias": 0,
+ "nsigma_coefficient": 3,
+ "detect_type": "lower_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": null,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "storage"
+ }
+ },
+ {
+ "metric": "gala_gopher_nic_tx_dropped",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "host",
+ "method": "sum",
+ "priority": 5,
+ "alarm_filter_window_size": 5,
+ "space_detector": null,
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.05,
+ "abs_bias": 0,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 10,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "network"
+ }
+ },
+ {
+ "metric": "gala_gopher_nic_tx_errs",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "host",
+ "method": "sum",
+ "priority": 5,
+ "alarm_filter_window_size": 5,
+ "space_detector": null,
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.05,
+ "abs_bias": 0,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 10,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "network"
+ }
+ },
+ {
+ "metric": "npu_chip_info_temperature",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "device",
+ "method": "max",
+ "priority": 25,
+ "alarm_filter_window_size": 12,
+ "space_detector": null,
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.01,
+ "abs_bias": 0,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 70,
+ "max_expert_upper_bound": 100
+ }
+ },
+ "type": "compute"
+ }
+ },
+ {
+ "metric": "npu_chip_info_hbm_used_memory",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "device",
+ "method": "max",
+ "priority": 30,
+ "alarm_filter_window_size": 5,
+ "space_detector": {
+ "dist_metric": "euclidean",
+ "eps": 0.4,
+ "cv_threshold": 0.03,
+ "min_samples": 2,
+ "window_size": 100,
+ "scaling": false,
+ "type": "SlidingWindowDBSCAN"
+ },
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.02,
+ "abs_bias": 5,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": null,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "compute"
+ }
+ },
+ {
+ "metric": "npu_chip_info_aicore_current_freq",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "device",
+ "method": "max",
+ "priority": 30,
+ "alarm_filter_window_size": 5,
+ "space_detector": {
+ "dist_metric": "euclidean",
+ "eps": 0.4,
+ "cv_threshold": 0.03,
+ "min_samples": 2,
+ "window_size": 100,
+ "scaling": true,
+ "type": "SlidingWindowDBSCAN"
+ },
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.05,
+ "abs_bias": 0,
+ "nsigma_coefficient": 4,
+ "detect_type": "lower_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 10,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "compute"
+ }
+ },
+ {
+ "metric": "npu_chip_roce_tx_err_pkt_num",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "device",
+ "method": "max",
+ "priority": 30,
+ "alarm_filter_window_size": 5,
+ "space_detector": null,
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.05,
+ "abs_bias": 0,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 10,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "network"
+ }
+ }
+ ],
+ "features": [
+ {
+ "metric": "gala_gopher_container_cpu_usage_seconds_total"
+ }
+ ]
+}
\ No newline at end of file
--
2.33.0