gala-anteater/0001-add-new-feature-slow-node-detection.patch

From 2e30b68154f5d0b6a68eab1bf8408363bbf28a76 Mon Sep 17 00:00:00 2001
From: huangbin <huangbin58@huawei.com>
Date: Tue, 5 Nov 2024 15:37:00 +0800
Subject: [PATCH] add new feature slow node detection
---
anteater/model/detector/slow_node_detector.py | 397 ++++++++++++++++++
config/module/slow_node_detection.job.json | 352 ++++++++++++++++
2 files changed, 749 insertions(+)
create mode 100644 anteater/model/detector/slow_node_detector.py
create mode 100644 config/module/slow_node_detection.job.json
diff --git a/anteater/model/detector/slow_node_detector.py b/anteater/model/detector/slow_node_detector.py
new file mode 100644
index 0000000..15a6cee
--- /dev/null
+++ b/anteater/model/detector/slow_node_detector.py
@@ -0,0 +1,397 @@
+#!/usr/bin/python3
+# ******************************************************************************
+# Copyright (c) 2023 Huawei Technologies Co., Ltd.
+# gala-anteater is licensed under Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+# See the Mulan PSL v2 for more details.
+# ******************************************************************************/
+import json
+import os.path
+import pprint
+import traceback
+from typing import List
+
+import numpy as np
+import pandas as pd
+
+from anteater.core.slow_node_response import AIJobDetectResult, ResultCode, NodeData
+from anteater.core.anomaly import Anomaly
+from anteater.core.kpi import KPI, ModelConfig, Feature
+from anteater.utils.datetime import DateTimeManager as dt
+from anteater.utils.timer import timer
+from anteater.utils.log import logger
+from anteater.source.metric_loader import MetricLoader
+from anteater.model.detector.base import Detector
+from anteater.model.process.rank_table_loader import GroupDataLoader
+from anteater.model.algorithms.slow_node_algs import time_node_detectors, space_node_detectors
+
+
+class SlowNodeDetector(Detector):
+ def __init__(self, data_loader: MetricLoader, config: ModelConfig, **kwargs):
+ """The detector base class initializer"""
+ super().__init__(data_loader, **kwargs)
+ self.config = config
+ self.max_num_normal_results = self.config.params.get("max_num_normal_results", 10)
+ self.record_kpi_value = self.config.params.get("record_kpi", False)
+ self.hccl_domain, self.rank_table = self._init_hccl_and_rank_table()
+
+ def _init_hccl_and_rank_table(self):
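+ """Load the hccl domain and rank table dicts from the JSON paths given in params; missing or unreadable files yield empty dicts."""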
+ params = self.config.params
+ hccl_domain_path = params.get("hccl_domain_json")
+ rank_table_path = params.get("rank_table_json")
+
+ hccl_domain = {}
+ rank_table = {}
+
+ if hccl_domain_path and os.path.exists(hccl_domain_path):
+ try:
+ with open(hccl_domain_path, 'r', encoding='utf-8') as f_in:
+ hccl_domain = json.load(f_in)
+ except Exception:
+ logger.error("Failed to read hccl domain info!")
+ if rank_table_path and os.path.exists(rank_table_path):
+ try:
+ with open(rank_table_path, 'r', encoding='utf-8') as f_in:
+ rank_table = json.load(f_in)
+ except Exception:
+ logger.error("Failed to read rank table info!")
+
+ return hccl_domain, rank_table
+
+ @staticmethod
+ def npu_id2host_id(machines2devices: dict):
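+ """Map each NPU instance to its host instance by matching the IP prefix; entries whose device list is [""] are treated as hosts."""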
+ npu_id2host_id_dict = {}
+ npu_ids = []
+ hosts_ids = []
+ for machine_ip, devices in machines2devices.items():
+ if devices == [""]:
+ hosts_ids.append(machine_ip)
+ else:
+ npu_ids.append(machine_ip)
+
+ for npu_id in npu_ids:
+ for host_id in hosts_ids:
+ if npu_id.split(":")[0] in host_id:
+ npu_id2host_id_dict[npu_id] = host_id
+ break
+
+ return npu_id2host_id_dict, hosts_ids
+
+ def get_host_ids_by_npu_ids(self, npu_ids: dict, npu_id2host_id_dict: dict, hosts_ids: list) -> list:
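+ """Resolve the host ids for the given NPU ids; fall back to all known host ids when none are given."""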
+ host_ids = []
+ if npu_ids:
+ for npu_id in npu_ids:
+ host_id = npu_id2host_id_dict.get(npu_id, "")
+ if host_id:
+ host_ids.append(host_id)
+ else:
+ host_ids = hosts_ids
+
+ return host_ids
+
+ @timer
+ def _execute(self, kpis: List[KPI], features: List[Feature], **kwargs) \
+ -> List[Anomaly]:
+ # Anomalies to be saved to the kafka response.
+ anomalies = []
+
+ logger.info('Execute slow node detection model: %s.', self.__class__.__name__)
+ start, end = dt.last(minutes=30)
+ # Fetch the machine ids and their devices.
+ machines_to_devices = self.get_machines_to_devices(start, end, kpis)
+ npu_id2host_id, hosts_ids = self.npu_id2host_id(machines_to_devices)
+
+ group_dataloader = GroupDataLoader(self.hccl_domain, self.rank_table, machines_to_devices)
+ group_ranks: list = group_dataloader.get_group_ranks()
+ all_results = []
+ for kpi in kpis:
+ for ranks in group_ranks:
+ machine_ids: dict = group_dataloader.rank_table_loader.get_group_nodes_by_ranks(ranks)
+ host_ids: list = self.get_host_ids_by_npu_ids(machine_ids, npu_id2host_id, hosts_ids)
+ group_result = self.group_detect_single_kpi(kpi, machine_ids, host_ids)
+ all_results.extend(group_result)
+
+ response, all_anomaly_nodes = self.gen_final_alarm(kpis, all_results)
+
+ if response.result_code == ResultCode.anomaly:
+ all_anomaly_nodes = sorted(list(set(all_anomaly_nodes)))
+ anomaly = Anomaly(
+ machine_id=json.dumps(all_anomaly_nodes),
+ metric="slow_node_metric",
+ labels={"instance": "node_ip"},
+ score=1.0,
+ entity_name="sli",
+ details={"detect_method": "slow_node_detection"},
+ description=response)
+ anomalies.append(anomaly)
+
+ return anomalies
+
+ def gen_final_alarm(self, kpis: List[KPI], detect_results: List):
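+ """Aggregate the per-group detection results into a single response and collect all anomalous nodes."""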
+ response = AIJobDetectResult()
+ all_anomaly_nodes = []
+
+ for index, result in enumerate(detect_results):
+ try:
+ anomaly_devices = result.get("anomaly_devices")
+ all_anomaly_nodes.extend(anomaly_devices)
+ response = self.group_detect_ret_agg(response, result, kpis)
+ except Exception:
+ logger.error(traceback.format_exc())
+ logger.info("accomplishment: %s/%s", index + 1, len(detect_results))
+
+ return response, all_anomaly_nodes
+
+ def group_detect_single_kpi(self, kpi: KPI, machine_ids: dict, host_ids: list) -> list:
+ """Detects kpi based on signal time series anomaly detection model"""
+ # Prometheus scrapes all the data at once, so it must be grouped by machine_id and device_id.
+ metric_name: str = kpi.metric
+
+ all_machines_ts = []
+ for machine_id in machine_ids:
+ single_machine_ts_list = self.get_kpi_ts_list(metric_name, machine_id, kpi.params)
+ all_machines_ts.extend(single_machine_ts_list)
+ for host_id in host_ids:
+ single_machine_ts_list = self.get_kpi_ts_list(metric_name, host_id, kpi.params)
+ all_machines_ts.extend(single_machine_ts_list)
+
+ anomaly_devices = []
+ anomaly_locations = {}
+ space_anomaly_locations = {}
+
+ detect_data, min_data_len = self.preprocessing_data(metric_name, all_machines_ts)
+ detection_results = {
+ "anomaly_devices": anomaly_devices,
+ "anomaly_locations": anomaly_locations,
+ "detect_result_type": "TIME",
+ "metric_name": metric_name,
+ "group_data": detect_data,
+ }
+ if min_data_len == 0:
+ logger.warning("GROUP data contains EMPTY DATA. GROUP_DATA:%s", pprint.pformat(all_machines_ts))
+ return [detection_results]
+ logger.info("work on %s, %s start.", metric_name, "slow_node_detection")
+
+ # Time-dimension detection.
+ time_anomaly_locations = self.time_node_compare(kpi, detect_data)
+ logger.info(f"time_node_compare result: {self.output_anomaly_devices(metric_name, time_anomaly_locations)}.")
+
+ # Spatial comparison across nodes.
+ # If no space detector is configured for this metric, skip the homogeneity comparison.
+ if kpi.params.get("space_detector") is not None:
+ # Homogeneity comparison requires at least four objects.
+ if len(all_machines_ts) >= 4:
+ # Compare along the space dimension and output the anomalous nodes.
+ space_anomaly_locations = self.space_nodes_compare(kpi, detect_data)
+ logger.info(
+ f"space_nodes_compare result: {self.output_anomaly_devices(metric_name, space_anomaly_locations)}.")
+ else:
+ logger.info(f"Skip space nodes compare: node count {len(all_machines_ts)} is smaller than 4.")
+ else:
+ logger.info("Skip space nodes compare.")
+
+ # Fuse the time and space results.
+ anomaly_locations, detect_result_type = self.time_space_agg(time_anomaly_locations, space_anomaly_locations,
+ metric_name)
+
+ anomaly_devices = self.output_anomaly_devices(metric_name, anomaly_locations)
+ detection_results["anomaly_devices"] = anomaly_devices
+ detection_results["anomaly_locations"] = anomaly_locations
+ detection_results["detect_result_type"] = detect_result_type
+
+ logger.info(f'''Time and space aggregated result: {anomaly_devices}.''')
+ logger.info("work on %s, %s end.\n", metric_name, "slow_node_detection")
+
+ return [detection_results]
+
+ @staticmethod
+ def output_anomaly_devices(metric: str, anomaly_location: dict):
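+ """Return the devices whose anomaly labels for the metric contain at least one anomalous point."""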
+ anomaly_devices = []
+ for device_info in anomaly_location.keys():
+ # If the number of anomalous points is greater than 0, the metric is considered anomalous.
+ if np.sum(anomaly_location[device_info][metric][1]) > 0:
+ anomaly_devices.append(device_info)
+
+ return anomaly_devices
+
+ @staticmethod
+ def preprocessing_data(metric_name: str, metric_data: list):
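+ """Build per-device DataFrames keyed by "instance*id" from the raw series and return them with the minimum series length."""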
+ if len(metric_data) == 0:
+ return {}, 0
+
+ detect_data = {}
+ min_length = len(metric_data[0].time_stamps)
+ for metric_ts in metric_data:
+ time_stamps = metric_ts.time_stamps
+ min_length = min(min_length, len(time_stamps))
+ values = metric_ts.values
+ labels = metric_ts.labels
+ if labels.get("id"):
+ device_label = f'''{labels.get("instance")}*{labels.get("id")}'''
+ else:
+ device_label = f'''{labels.get("instance")}*-1'''
+ detect_data[device_label] = pd.DataFrame({"timestamp": time_stamps, metric_name: values})
+
+ return detect_data, min_length
+
+ def time_node_compare(self, kpi: KPI, detect_data: dict):
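+ """Run the configured time-dimension detector per device, then filter out alarms not sustained for the expert window."""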
+ metric_name = kpi.metric
+ cfg = kpi.params.get("time_detector", {})
+ detector_class = time_node_detectors.get(cfg.get("type"))
+
+ time_node_detector = detector_class(metric_name=metric_name, cfg=cfg)
+ time_node_detector.fit(detect_data)
+ locations = time_node_detector.predict(detect_data)
+ expert_alarm_window_size = kpi.params.get("alarm_filter_window_size")
+
+ for device_info, anomaly_locations in locations.items():
+ filter_labels = self.alarm_filter(anomaly_locations[metric_name][1], expert_alarm_window_size)
+ locations[device_info][metric_name][1][:] = filter_labels
+
+ return locations
+
+ def space_nodes_compare(self, kpi: KPI, detect_data: dict):
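+ """Run the configured space-dimension detector across all devices at once and return per-device anomaly labels."""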
+ metric_name = kpi.metric
+ cfg = kpi.params.get("space_detector", {})
+ detector_class = space_node_detectors.get(cfg.get("type"))
+ space_detector = detector_class(cfg)
+ df = pd.DataFrame()
+ column_list = []
+ for device_label, infer_data in detect_data.items():
+ df[device_label] = infer_data[metric_name]
+ column_list.append(device_label)
+
+ detect_node_data = df[column_list].values
+ labels = space_detector.detect(detect_node_data)
+
+ labels = np.swapaxes(labels, 0, 1)
+ space_detect_locations = {}
+
+ for i, device_label in enumerate(column_list):
+ space_detect_locations[device_label] = {}
+ space_detect_locations[device_label][metric_name] = detect_data[device_label]["timestamp"], labels[i]
+ return space_detect_locations
+
+ def get_kpi_ts_list(self, metric, machine_id: str, kpi_params: dict):
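+ """Query the metric time series over the look-back window; host metrics are aggregated by the configured operator."""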
+ look_back = self.config.params.get("look_back", 10)
+ metric_type = kpi_params.get("metric_type", "device")
+ start, end = dt.last(minutes=look_back)
+
+ if metric_type == "device":
+ # npu device
+ ts_list = self.data_loader.get_metric(start, end, metric, instance=machine_id)
+ else:
+ # host
+ op = kpi_params.get("method", "avg")
+ ts_list = self.data_loader.get_metric(start, end, metric, operator=op, keys="instance", instance=machine_id)
+
+ return ts_list
+
+ @staticmethod
+ def alarm_filter(labels, alarm_filter_window_size):
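+ """Keep only alarms that persist for a full filter window; shorter alarm bursts are suppressed."""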
+ copy_labels = np.zeros(len(labels))
+ start_index = alarm_filter_window_size
+ alarm_points = set()
+ for i in range(start_index, len(labels) + 1):
+ is_sequential_alarm = (np.sum(labels[i - alarm_filter_window_size:i]) >= alarm_filter_window_size)
+ if not is_sequential_alarm:
+ if np.sum(labels[i - alarm_filter_window_size:i]) > 0:
+ alarm_points.add(i - alarm_filter_window_size)
+ else:
+ copy_labels[i - alarm_filter_window_size:i] = labels[i - alarm_filter_window_size:i]
+
+ return copy_labels
+
+ @staticmethod
+ def time_space_agg(time_anomaly_locations, space_anomaly_locations, metric_name):
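+ """Fuse the per-node time and space results; when a space result exists it takes precedence over the time result."""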
+ detect_result_type = {}
+
+ for node_id in time_anomaly_locations.keys():
+ time_ret = np.sum(time_anomaly_locations[node_id][metric_name][1])
+ if space_anomaly_locations:
+ space_ret = np.sum(space_anomaly_locations[node_id][metric_name][1])
+ # If the homogeneity (space) comparison reports no anomaly, the alarm is cleared.
+ # If both the space and time results are empty, a normal result is returned.
+ # If both dimensions report anomalies, the space result takes precedence.
+ if space_ret == 0 or (space_ret > 0 and time_ret >= 0):
+ time_anomaly_locations[node_id][metric_name] = space_anomaly_locations[node_id][metric_name]
+ detect_result_type.setdefault(node_id, {}).setdefault(metric_name, "SPACE")
+ else:
+ detect_result_type.setdefault(node_id, {}).setdefault(metric_name, "TIME")
+ else:
+ detect_result_type.setdefault(node_id, {}).setdefault(metric_name, "TIME")
+
+ return time_anomaly_locations, detect_result_type
+
+ @staticmethod
+ def _get_kpi_params(kpis: List[KPI], metric_name):
+ for kpi in kpis:
+ if kpi.metric == metric_name:
+ return kpi.params
+
+ return {}
+
+ def group_detect_ret_agg(self, response, detect_result, kpis: List[KPI]):
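+ """Append the abnormal node details (plus reference normal nodes for space results) of one detection result to the response."""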
+ anomaly_device_labels = detect_result.get("anomaly_devices")
+ anomaly_locations = detect_result.get("anomaly_locations")
+ metric_name = detect_result.get("metric_name")
+ detect_result_type = detect_result.get("detect_result_type")
+ group_data = detect_result.get("group_data")
+ if len(anomaly_device_labels) == 0:
+ return response
+
+ response.result_code = ResultCode.anomaly
+ kpi_params = self._get_kpi_params(kpis, metric_name)
+ response(kpi_params.get('type', "compute"))
+
+ keep_devices = []
+ omitted_devices = []
+ for device_label in anomaly_device_labels:
+ method_type = detect_result_type.get(device_label, {}).get(metric_name, "TIME")
+ if method_type == "SPACE":
+ normal_devices = sorted(set(group_data.keys()) - set(anomaly_device_labels))
+ keep_devices = normal_devices[:self.max_num_normal_results]
+ omitted_devices = normal_devices[self.max_num_normal_results:]
+ abnormal_node_data = NodeData(metric_name, device_label, method_type, keep_devices, omitted_devices)
+ time_stamp_data, values = anomaly_locations[device_label][metric_name]
+ label_dict = dict(zip(time_stamp_data.tolist(), values.tolist()))
+
+ # Recording the real kpi values is optional; see user requirements.
+ if self.record_kpi_value:
+ # Record the anomalous kpi values.
+ g_ts, g_value = group_data[device_label].values[:, 0], group_data[device_label].values[:, 1]
+ kpi_data = []
+ for key, value in sorted(zip(g_ts.tolist(), g_value.tolist()), key=lambda x: x[0]):
+ kpi_data.append({str(key): str(value), "abnormal": label_dict.get(key, 0)})
+
+ abnormal_node_data.kpi_data = kpi_data
+ response.abnormal_detail.append(abnormal_node_data)
+
+ if keep_devices:
+ for device_label in keep_devices:
+ normal_node_data = NodeData(metric_name, device_label, "SPACE")
+ # Recording the real kpi values is optional; see user requirements.
+ if self.record_kpi_value:
+ # Record normal kpi data for the space comparison.
+ g_ts, g_value = group_data[device_label].values[:, 0], group_data[device_label].values[:, 1]
+ kpi_data = [{str(key): str(value)} for key, value in zip(g_ts.tolist(), g_value.tolist())]
+ normal_node_data.kpi_data = kpi_data
+ response.normal_detail.append(normal_node_data)
+ return response
diff --git a/config/module/slow_node_detection.job.json b/config/module/slow_node_detection.job.json
new file mode 100644
index 0000000..91ff621
--- /dev/null
+++ b/config/module/slow_node_detection.job.json
@@ -0,0 +1,352 @@
+{
+ "name": "SlowNodeDetector",
+ "enable": true,
+ "job_type": "anomaly_detection",
+ "keywords": [
+ "app"
+ ],
+ "root_cause_num": 20,
+ "detector": "slow-node-detection",
+ "template": "slow_node",
+ "model_config": {
+ "name": "disruption_model",
+ "params": {
+ "record_kpi": false,
+ "max_num_normal_results": 16,
+ "look_back": 20,
+ "obs_size": 5,
+ "outlier_ratio_th": 0.6,
+ "hccl_domain_json": "./hccl_domain.json",
+ "rank_table_json": "./hccl_domain.json"
+ }
+ },
+ "kpis": [
+ {
+ "metric": "gala_gopher_cpu_total_used_per",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "host",
+ "method": "avg",
+ "priority": 30,
+ "alarm_filter_window_size": 5,
+ "space_detector": null,
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.01,
+ "abs_bias": 0,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 70,
+ "max_expert_upper_bound": 80
+ }
+ },
+ "type": "compute"
+ }
+ },
+ {
+ "metric": "gala_gopher_mem_util",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "host",
+ "method": "sum",
+ "priority": 20,
+ "alarm_filter_window_size": 5,
+ "space_detector": {
+ "first_gap_rate": 0.3,
+ "second_gap_rate": 0.2,
+ "base_threshold": 150,
+ "discrete_rate": 1.5,
+ "nsigma_coefficient": 2,
+ "discrete_point_suppression_ratio": 0.03,
+ "non_major_anomaly_suppression": 0.1,
+ "type": "OuterDataDetector"
+ },
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.1,
+ "abs_bias": 5,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 50,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "compute"
+ }
+ },
+ {
+ "metric": "gala_gopher_disk_wspeed_kB",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "host",
+ "method": "sum",
+ "priority": 5,
+ "alarm_filter_window_size": 30,
+ "space_detector": null,
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 60,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.3,
+ "abs_bias": 0,
+ "nsigma_coefficient": 3,
+ "detect_type": "lower_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": null,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "storage"
+ }
+ },
+ {
+ "metric": "gala_gopher_nic_tx_dropped",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "host",
+ "method": "sum",
+ "priority": 5,
+ "alarm_filter_window_size": 5,
+ "space_detector": null,
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.05,
+ "abs_bias": 0,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 10,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "network"
+ }
+ },
+ {
+ "metric": "gala_gopher_nic_tx_errs",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "host",
+ "method": "sum",
+ "priority": 5,
+ "alarm_filter_window_size": 5,
+ "space_detector": null,
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.05,
+ "abs_bias": 0,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 10,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "network"
+ }
+ },
+ {
+ "metric": "npu_chip_info_temperature",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "device",
+ "method": "max",
+ "priority": 25,
+ "alarm_filter_window_size": 12,
+ "space_detector": null,
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.01,
+ "abs_bias": 0,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 70,
+ "max_expert_upper_bound": 100
+ }
+ },
+ "type": "compute"
+ }
+ },
+ {
+ "metric": "npu_chip_info_hbm_used_memory",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "device",
+ "method": "max",
+ "priority": 30,
+ "alarm_filter_window_size": 5,
+ "space_detector": {
+ "dist_metric": "euclidean",
+ "eps": 0.4,
+ "cv_threshold": 0.03,
+ "min_samples": 2,
+ "window_size": 100,
+ "scaling": false,
+ "type": "SlidingWindowDBSCAN"
+ },
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.02,
+ "abs_bias": 5,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": null,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "compute"
+ }
+ },
+ {
+ "metric": "npu_chip_info_aicore_current_freq",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "device",
+ "method": "max",
+ "priority": 30,
+ "alarm_filter_window_size": 5,
+ "space_detector": {
+ "dist_metric": "euclidean",
+ "eps": 0.4,
+ "cv_threshold": 0.03,
+ "min_samples": 2,
+ "window_size": 100,
+ "scaling": true,
+ "type": "SlidingWindowDBSCAN"
+ },
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.05,
+ "abs_bias": 0,
+ "nsigma_coefficient": 4,
+ "detect_type": "lower_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 10,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "compute"
+ }
+ },
+ {
+ "metric": "npu_chip_roce_tx_err_pkt_num",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "device",
+ "method": "max",
+ "priority": 30,
+ "alarm_filter_window_size": 5,
+ "space_detector": null,
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.05,
+ "abs_bias": 0,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 10,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "network"
+ }
+ }
+ ],
+ "features": [
+ {
+ "metric": "gala_gopher_container_cpu_usage_seconds_total"
+ }
+ ]
+}
\ No newline at end of file
--
2.33.0