!37 KPI-based AI node anomaly detection

From: @webankto 
Reviewed-by: @wangfenglai 
Signed-off-by: @wangfenglai
Commit 1c30dd89df, committed by openeuler-ci-bot via Gitee, 2024-11-08 01:23:32 +00:00
11 changed files with 816 additions and 2322 deletions

From 2e30b68154f5d0b6a68eab1bf8408363bbf28a76 Mon Sep 17 00:00:00 2001
From: huangbin <huangbin58@huawei.com>
Date: Tue, 5 Nov 2024 15:37:00 +0800
Subject: [PATCH] add new feature slow node detection
---
anteater/model/detector/slow_node_detector.py | 397 ++++++++++++++++++
config/module/slow_node_detection.job.json | 352 ++++++++++++++++
2 files changed, 749 insertions(+)
create mode 100644 anteater/model/detector/slow_node_detector.py
create mode 100644 config/module/slow_node_detection.job.json
diff --git a/anteater/model/detector/slow_node_detector.py b/anteater/model/detector/slow_node_detector.py
new file mode 100644
index 0000000..15a6cee
--- /dev/null
+++ b/anteater/model/detector/slow_node_detector.py
@@ -0,0 +1,397 @@
+#!/usr/bin/python3
+# ******************************************************************************
+# Copyright (c) 2023 Huawei Technologies Co., Ltd.
+# gala-anteater is licensed under Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+# See the Mulan PSL v2 for more details.
+# ******************************************************************************/
+import time
+import json
+import os.path
+import pprint
+import traceback
+from typing import List
+
+import numpy as np
+import pandas as pd
+
+from anteater.core.slow_node_response import AIJobDetectResult, ResultCode, NodeData
+from anteater.core.anomaly import Anomaly, RootCause
+from anteater.core.kpi import KPI, ModelConfig, Feature
+from anteater.utils.datetime import DateTimeManager as dt
+from anteater.utils.timer import timer
+from anteater.utils.log import logger
+from anteater.source.metric_loader import MetricLoader
+from anteater.model.detector.base import Detector
+from anteater.model.process.rank_table_loader import GroupDataLoader
+from anteater.model.algorithms.slow_node_algs import time_node_detectors, space_node_detectors
+
+
+class SlowNodeDetector(Detector):
+ def __init__(self, data_loader: MetricLoader, config: ModelConfig, **kwargs):
+ """The detector base class initializer"""
+ super().__init__(data_loader, **kwargs)
+ self.config = config
+ self.max_num_normal_results = self.config.params.get("max_num_normal_results", 10)
+ self.record_kpi_value = self.config.params.get("record_kpi", False)
+ self.hccl_domain, self.rank_table = self._init_hccl_and_rank_table()
+
+ def _init_hccl_and_rank_table(self):
+ params = self.config.params
+ hccl_domain_path = params.get("hccl_domain_json")
+ rank_table_path = params.get("rank_table_json")
+
+ hccl_domain = {}
+ rank_table = {}
+
+ if os.path.exists(hccl_domain_path):
+ try:
+ with open(hccl_domain_path, 'r', encoding='utf-8') as f_in:
+ hccl_domain = json.load(f_in)
+ except Exception:
+ logger.error("Failed to read hccl domain info!")
+ if os.path.exists(rank_table_path):
+ try:
+ with open(rank_table_path, 'r', encoding='utf-8') as f_in:
+ rank_table = json.load(f_in)
+ except Exception:
+ logger.error("Failed to read rank table info!")
+
+ return hccl_domain, rank_table
+
+ @staticmethod
+ def npu_id2host_id(machines2devices: dict):
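+ """Maps each NPU instance (ip:port) to its host instance by matching
+ the IP prefix; also returns the full list of host instances."""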
+ npu_id2host_id_dict = {}
+ npu_ids = []
+ hosts_ids = []
+ for machine_ip, devices in machines2devices.items():
+ if devices == [""]:
+ hosts_ids.append(machine_ip)
+ else:
+ npu_ids.append(machine_ip)
+
+ for npu_id in npu_ids:
+ for host_id in hosts_ids:
+ if npu_id.split(":")[0] in host_id:
+ npu_id2host_id_dict[npu_id] = host_id
+ break
+
+ return npu_id2host_id_dict, hosts_ids
+
+ def get_host_ids_by_npu_ids(self, npu_ids: dict, npu_id2host_id_dict: dict, hosts_ids: list) -> list:
+ host_ids = []
+ if npu_ids:
+ for npu_id in npu_ids:
+ host_id = npu_id2host_id_dict.get(npu_id, "")
+ if host_id:
+ host_ids.append(host_id)
+ else:
+ host_ids = hosts_ids
+
+ return host_ids
+
+ @timer
+ def _execute(self, kpis: List[KPI], features: List[Feature], **kwargs) \
+ -> List[Anomaly]:
+ # anomalies collected here are sent back as the Kafka response
+ anomalies = []
+
+ logger.info('Execute slow node detection model: %s.', self.__class__.__name__)
+ start, end = dt.last(minutes=30)
+ # Get the machine_ids (instance -> devices)
+ machines_to_devices = self.get_machines_to_devices(start, end, kpis)
+ npu_id2host_id, hosts_ids = self.npu_id2host_id(machines_to_devices)
+
+ group_dataloader = GroupDataLoader(self.hccl_domain, self.rank_table, machines_to_devices)
+ group_ranks: list = group_dataloader.get_group_ranks()
+ all_results = []
+ for kpi in kpis:
+ for ranks in group_ranks:
+ machine_ids: dict = group_dataloader.rank_table_loader.get_group_nodes_by_ranks(ranks)
+ host_ids: list = self.get_host_ids_by_npu_ids(machine_ids, npu_id2host_id, hosts_ids)
+ group_result = self.group_detect_single_kpi(kpi, machine_ids, host_ids)
+ all_results.extend(group_result)
+
+ response, all_anomaly_nodes = self.gen_final_alarm(kpis, all_results)
+
+ if response.result_code == ResultCode.anomaly:
+ all_anomaly_nodes = sorted(list(set(all_anomaly_nodes)))
+ anomaly = Anomaly(
+ machine_id=json.dumps(all_anomaly_nodes),
+ metric="slow_node_metric",
+ labels={"instance": "node_ip"},
+ score=1.0,
+ entity_name="sli",
+ details={"detect_method": "slow_node_detection"},
+ description=response)
+ anomalies.append(anomaly)
+
+ return anomalies
+
+ def gen_final_alarm(self, kpis: List[KPI], detect_results: List):
+ response = AIJobDetectResult()
+ all_anomaly_nodes = []
+
+ for index, result in enumerate(detect_results):
+ try:
+ anomaly_devices = result.get("anomaly_devices")
+ all_anomaly_nodes.extend(anomaly_devices)
+ response = self.group_detect_ret_agg(response, result, kpis)
+ except Exception:
+ logger.error(traceback.format_exc())
+ logger.info("accomplishment: %s/%s", index + 1, len(detect_results))
+
+ return response, all_anomaly_nodes
+
+ def group_detect_single_kpi(self, kpi: KPI, machine_ids: dict, host_ids: list) -> list:
+ """Detects kpi based on signal time series anomaly detection model"""
+ # Prometheus returns all series at once, so the data must be grouped by machine_id and device_id
+ # Fetch the data
+ metric_name: str = kpi.metric
+
+ all_machines_ts = []
+ for machine_id in machine_ids:
+ single_machine_ts_list = self.get_kpi_ts_list(metric_name, machine_id, kpi.params)
+ all_machines_ts.extend(single_machine_ts_list)
+ for host_id in host_ids:
+ single_machine_ts_list = self.get_kpi_ts_list(metric_name, host_id, kpi.params)
+ all_machines_ts.extend(single_machine_ts_list)
+
+ anomaly_devices = []
+ anomaly_locations = {}
+ space_anomaly_locations = {}
+
+ detect_data, min_data_len = self.preprocessing_data(metric_name, all_machines_ts)
+ detection_results = {
+ "anomaly_devices": anomaly_devices,
+ "anomaly_locations": anomaly_locations,
+ "detect_result_type": "TIME",
+ "metric_name": metric_name,
+ "group_data": detect_data,
+ }
+ if min_data_len == 0:
+ logger.warning("GROUP data contains EMPTY DATA. GROUP_DATA:%s", pprint.pformat(all_machines_ts))
+ return [detection_results]
+ logger.info("work on %s, %s start.", metric_name, "slow_node_detection")
+
+ # Time-dimension detection
+ time_anomaly_locations = self.time_node_compare(kpi, detect_data)
+ logger.info(f"time_node_compare result: {self.output_anomaly_devices(metric_name, time_anomaly_locations)}.")
+
+ # Space-dimension comparison
+ # If no space detector is configured for this metric, skip the homogeneity comparison
+ if kpi.params.get("space_detector") is not None:
+ # Homogeneity comparison needs at least four objects
+ if len(all_machines_ts) >= 4:
+ # Compare nodes across the space dimension and output anomalous ones
+ space_anomaly_locations = self.space_nodes_compare(kpi, detect_data)
+ logger.info(
+ f"space_nodes_compare result: {self.output_anomaly_devices(metric_name, space_anomaly_locations)}.")
+ else:
+ logger.info(f"Skip space nodes compare: node count {len(all_machines_ts)} is smaller than 4.")
+ else:
+ logger.info("Skip space nodes compare.")
+
+ # Fuse the time and space results
+ anomaly_locations, detect_result_type = self.time_space_agg(time_anomaly_locations, space_anomaly_locations,
+ metric_name)
+
+ anomaly_devices = self.output_anomaly_devices(metric_name, anomaly_locations)
+ detection_results["anomaly_devices"] = anomaly_devices
+ detection_results["anomaly_locations"] = anomaly_locations
+ detection_results["detect_result_type"] = detect_result_type
+
+ logger.info(f'''Time and space aggregated result: {anomaly_devices}.''')
+ logger.info("work on %s, %s end.\n", metric_name, "slow_node_detection")
+
+ return [detection_results]
+
+ @staticmethod
+ def output_anomaly_devices(metric: str, anomaly_location: dict):
+ anomaly_devices = []
+ for device_info in anomaly_location.keys():
+ # If at least one point is anomalous, the metric is considered anomalous
+ if np.sum(anomaly_location[device_info][metric][1]) > 0:
+ anomaly_devices.append(device_info)
+
+ return anomaly_devices
+
+ @staticmethod
+ def preprocessing_data(metric_name: str, metric_data: list):
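+ """Builds a {instance*device_id: DataFrame(timestamp, metric)} map;
+ the second return value is the length of the last processed series
+ (0 signals empty data)."""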
+ if len(metric_data) == 0:
+ return {}, 0
+
+ detect_data = {}
+ length = 0
+ for index, metric_ts in enumerate(metric_data):
+ time_stamps = metric_ts.time_stamps
+ length = len(time_stamps)
+ values = metric_ts.values
+ labels = metric_ts.labels
+ if labels.get("id"):
+ device_label = f'''{labels.get("instance")}*{labels.get("id")}'''
+ else:
+ device_label = f'''{labels.get("instance")}*-1'''
+ detect_data[device_label] = pd.DataFrame({"timestamp": time_stamps, metric_name: values})
+
+ return detect_data, length
+
+ def time_node_compare(self, kpi: KPI, detect_data: dict):
+ metric_name = kpi.metric
+ cfg = kpi.params.get("time_detector", {})
+ detector_class = time_node_detectors.get(cfg.get("type"))
+
+ time_node_detector = detector_class(metric_name=metric_name, cfg=cfg)
+ time_node_detector.fit(detect_data)
+ locations = time_node_detector.predict(detect_data)
+ expert_alarm_window_size = kpi.params.get("alarm_filter_window_size")
+
+ for device_info, anomaly_locations in locations.items():
+ filter_labels = self.alarm_filter(anomaly_locations[metric_name][1], expert_alarm_window_size)
+ locations[device_info][metric_name][1][:] = filter_labels
+
+ return locations
+
+ def space_nodes_compare(self, kpi: KPI, detect_data: dict):
+ metric_name = kpi.metric
+ cfg = kpi.params.get("space_detector", {})
+ detector_class = space_node_detectors.get(cfg.get("type"))
+ space_detector = detector_class(cfg)
+ df = pd.DataFrame()
+ column_list = []
+ for device_label, infer_data in detect_data.items():
+ df[device_label] = infer_data[metric_name]
+ column_list.append(device_label)
+
+ detect_node_data = df[column_list].values
+ labels = space_detector.detect(detect_node_data)
+
+ labels = np.swapaxes(labels, 0, 1)
+ space_detect_locations = {}
+
+ for i, device_label in enumerate(column_list):
+ space_detect_locations[device_label] = {}
+ space_detect_locations[device_label][metric_name] = detect_data[device_label]["timestamp"], labels[i]
+ return space_detect_locations
+
+ def get_kpi_ts_list(self, metric, machine_id: str, kpi_params: dict):
+ look_back = self.config.params.get("look_back", 10)
+ metric_type = kpi_params.get("metric_type", "device")
+ start, end = dt.last(minutes=look_back)
+
+ if metric_type == "device":
+ # npu device
+ ts_list = self.data_loader.get_metric(start, end, metric, instance=machine_id)
+ else:
+ # host
+ op = kpi_params.get("method", "avg")
+ ts_list = self.data_loader.get_metric(start, end, metric, operator=op, keys="instance", instance=machine_id)
+
+ return ts_list
+
+ @staticmethod
+ def alarm_filter(labels, alarm_filter_window_size):
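+ """Suppresses isolated alarms: a window of labels survives only when
+ all alarm_filter_window_size points in it are alarmed."""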
+ copy_labels = np.zeros(len(labels))
+ start_index = alarm_filter_window_size
+ alarm_points = set()
+ for i in range(start_index, len(labels) + 1):
+ is_sequential_alarm = (np.sum(labels[i - alarm_filter_window_size:i]) >= alarm_filter_window_size)
+ if not is_sequential_alarm:
+ if np.sum(labels[i - alarm_filter_window_size:i]) > 0:
+ alarm_points.add(i - alarm_filter_window_size)
+ else:
+ copy_labels[i - alarm_filter_window_size:i] = labels[i - alarm_filter_window_size:i]
+
+ return copy_labels
+
+ @staticmethod
+ def time_space_agg(time_anomaly_locations, space_anomaly_locations, metric_name):
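+ """Fuses per-node time and space results; whenever a space result is
+ available it overrides the time result (rules in the comments below)."""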
+ detect_result_type = {}
+
+ for node_id in time_anomaly_locations.keys():
+ time_ret = np.sum(time_anomaly_locations[node_id][metric_name][1])
+ if space_anomaly_locations:
+ space_ret = np.sum(space_anomaly_locations[node_id][metric_name][1])
+ # If the homogeneity (space) check found no anomaly, clear the alarm
+ # If both the space and time detection results are empty, return normal
+ # If both dimensions report anomalies, the space result takes precedence
+ if space_ret == 0 or (space_ret > 0 and time_ret >= 0):
+ time_anomaly_locations[node_id][metric_name] = space_anomaly_locations[node_id][metric_name]
+ detect_result_type.setdefault(node_id, {}).setdefault(metric_name, "SPACE")
+ else:
+ detect_result_type.setdefault(node_id, {}).setdefault(metric_name, "TIME")
+ else:
+ detect_result_type.setdefault(node_id, {}).setdefault(metric_name, "TIME")
+
+ return time_anomaly_locations, detect_result_type
+
+ @staticmethod
+ def _get_kpi_params(kpis: List[KPI], metric_name):
+ for kpi in kpis:
+ if kpi.metric == metric_name:
+ return kpi.params
+
+ return {}
+
+ def group_detect_ret_agg(self, response, detect_result, kpis: List[KPI]):
+ anomaly_device_labels = detect_result.get("anomaly_devices")
+ anomaly_locations = detect_result.get("anomaly_locations")
+ metric_name = detect_result.get("metric_name")
+ detect_result_type = detect_result.get("detect_result_type")
+ group_data = detect_result.get("group_data")
+ if len(anomaly_device_labels) == 0:
+ return response
+ else:
+ response.result_code = ResultCode.anomaly
+ kpi_params = self._get_kpi_params(kpis, metric_name)
+ response(kpi_params.get('type', "compute"))
+
+ keep_devices = []
+ omitted_devices = []
+ for device_label in anomaly_device_labels:
+ method_type = detect_result_type.get(device_label, {}).get(metric_name, "TIME")
+ if method_type == "SPACE":
+ normal_devices = sorted(set(group_data.keys()) - set(anomaly_device_labels))
+ keep_devices = normal_devices[:self.max_num_normal_results]
+ omitted_devices = normal_devices[self.max_num_normal_results:]
+ abnormal_node_data = NodeData(metric_name, device_label, method_type, keep_devices, omitted_devices)
+ time_stamp_data, values = anomaly_locations[device_label][metric_name]
+ label_dict = dict(zip(time_stamp_data.tolist(), values.tolist()))
+
+ # recording real kpi values is controlled by the user config (record_kpi)
+ if self.record_kpi_value:
+ # record anomaly kpi value
+ g_ts, g_value = group_data[device_label].values[:, 0], group_data[device_label].values[:, 1]
+ kpi_data = []
+ for key, value in sorted(zip(g_ts.tolist(), g_value.tolist()), key=lambda x: x[0]):
+ kpi_data.append({str(key): str(value), "abnormal": label_dict.get(key, 0)})
+
+ abnormal_node_data.kpi_data = kpi_data
+ response.abnormal_detail.append(abnormal_node_data)
+
+ if keep_devices:
+ for device_label in keep_devices:
+ normal_node_data = NodeData(metric_name, device_label, "SPACE")
+ # recording real kpi values is controlled by the user config (record_kpi)
+ if self.record_kpi_value:
+ # record normal kpi data for space compare
+ g_ts, g_value = group_data[device_label].values[:, 0], group_data[device_label].values[:, 1]
+ kpi_data = [{str(key): str(value)} for key, value in zip(g_ts.tolist(), g_value.tolist())]
+ normal_node_data.kpi_data = kpi_data
+ response.normal_detail.append(normal_node_data)
+ return response
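
The windowing rule in alarm_filter above is easier to see in isolation. The following is a minimal standalone sketch, not part of the patch, that reproduces the same behavior while dropping the unused alarm_points bookkeeping:

import numpy as np

def alarm_filter(labels, window_size):
    # Copy a window to the output only when every point in it is alarmed;
    # alarms inside partially-alarmed windows are suppressed.
    filtered = np.zeros(len(labels))
    for i in range(window_size, len(labels) + 1):
        if np.sum(labels[i - window_size:i]) >= window_size:
            filtered[i - window_size:i] = labels[i - window_size:i]
    return filtered

raw = np.array([0, 1, 0, 1, 1, 1, 1, 0])
print(alarm_filter(raw, 3))  # [0. 0. 0. 1. 1. 1. 1. 0.]: the lone alarm at index 1 is dropped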
diff --git a/config/module/slow_node_detection.job.json b/config/module/slow_node_detection.job.json
new file mode 100644
index 0000000..91ff621
--- /dev/null
+++ b/config/module/slow_node_detection.job.json
@@ -0,0 +1,352 @@
+{
+ "name": "SlowNodeDetector",
+ "enable": true,
+ "job_type": "anomaly_detection",
+ "keywords": [
+ "app"
+ ],
+ "root_cause_num": 20,
+ "detector": "slow-node-detection",
+ "template": "slow_node",
+ "model_config": {
+ "name": "disruption_model",
+ "params": {
+ "record_kpi": false,
+ "max_num_normal_results": 16,
+ "look_back": 20,
+ "obs_size": 5,
+ "outlier_ratio_th": 0.6,
+ "hccl_domain_json": "./hccl_domain.json",
+ "rank_table_json": "./hccl_domain.json"
+ }
+ },
+ "kpis": [
+ {
+ "metric": "gala_gopher_cpu_total_used_per",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "host",
+ "method": "avg",
+ "priority": 30,
+ "alarm_filter_window_size": 5,
+ "space_detector": null,
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.01,
+ "abs_bias": 0,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 70,
+ "max_expert_upper_bound": 80
+ }
+ },
+ "type": "compute"
+ }
+ },
+ {
+ "metric": "gala_gopher_mem_util",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "host",
+ "method": "sum",
+ "priority": 20,
+ "alarm_filter_window_size": 5,
+ "space_detector": {
+ "first_gap_rate": 0.3,
+ "second_gap_rate": 0.2,
+ "base_threshold": 150,
+ "discrete_rate": 1.5,
+ "nsigma_coefficient": 2,
+ "discrete_point_suppression_ratio": 0.03,
+ "non_major_anomaly_suppression": 0.1,
+ "type": "OuterDataDetector"
+ },
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.1,
+ "abs_bias": 5,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 50,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "compute"
+ }
+ },
+ {
+ "metric": "gala_gopher_disk_wspeed_kB",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "host",
+ "method": "sum",
+ "priority": 5,
+ "alarm_filter_window_size": 30,
+ "space_detector": null,
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 60,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.3,
+ "abs_bias": 0,
+ "nsigma_coefficient": 3,
+ "detect_type": "lower_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": null,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "storage"
+ }
+ },
+ {
+ "metric": "gala_gopher_nic_tx_dropped",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "host",
+ "method": "sum",
+ "priority": 5,
+ "alarm_filter_window_size": 5,
+ "space_detector": null,
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.05,
+ "abs_bias": 0,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 10,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "network"
+ }
+ },
+ {
+ "metric": "gala_gopher_nic_tx_errs",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "host",
+ "method": "sum",
+ "priority": 5,
+ "alarm_filter_window_size": 5,
+ "space_detector": null,
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.05,
+ "abs_bias": 0,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 10,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "network"
+ }
+ },
+ {
+ "metric": "npu_chip_info_temperature",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "device",
+ "method": "max",
+ "priority": 25,
+ "alarm_filter_window_size": 12,
+ "space_detector": null,
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.01,
+ "abs_bias": 0,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 70,
+ "max_expert_upper_bound": 100
+ }
+ },
+ "type": "compute"
+ }
+ },
+ {
+ "metric": "npu_chip_info_hbm_used_memory",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "device",
+ "method": "max",
+ "priority": 30,
+ "alarm_filter_window_size": 5,
+ "space_detector": {
+ "dist_metric": "euclidean",
+ "eps": 0.4,
+ "cv_threshold": 0.03,
+ "min_samples": 2,
+ "window_size": 100,
+ "scaling": false,
+ "type": "SlidingWindowDBSCAN"
+ },
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.02,
+ "abs_bias": 5,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": null,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "compute"
+ }
+ },
+ {
+ "metric": "npu_chip_info_aicore_current_freq",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "device",
+ "method": "max",
+ "priority": 30,
+ "alarm_filter_window_size": 5,
+ "space_detector": {
+ "dist_metric": "euclidean",
+ "eps": 0.4,
+ "cv_threshold": 0.03,
+ "min_samples": 2,
+ "window_size": 100,
+ "scaling": true,
+ "type": "SlidingWindowDBSCAN"
+ },
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.05,
+ "abs_bias": 0,
+ "nsigma_coefficient": 4,
+ "detect_type": "lower_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 10,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "compute"
+ }
+ },
+ {
+ "metric": "npu_chip_roce_tx_err_pkt_num",
+ "entity_name": "sli",
+ "atrend": "rise",
+ "enable": true,
+ "params": {
+ "metric_type": "device",
+ "method": "max",
+ "priority": 30,
+ "alarm_filter_window_size": 5,
+ "space_detector": null,
+ "time_detector": {
+ "preprocess_eps": 0.1,
+ "preprocess_min_samples": 10,
+ "type": "SlidingWindowKSigmaDetector",
+ "n_sigma_method": {
+ "type": "SlidingWindowNSigma",
+ "training_window_size": 40,
+ "min_update_window_size": 10,
+ "min_std_val": 0.0001,
+ "bias": 0.05,
+ "abs_bias": 0,
+ "nsigma_coefficient": 4,
+ "detect_type": "upper_bound",
+ "min_expert_lower_bound": null,
+ "max_expert_lower_bound": null,
+ "min_expert_upper_bound": 10,
+ "max_expert_upper_bound": null
+ }
+ },
+ "type": "network"
+ }
+ }
+ ],
+ "features": [
+ {
+ "metric": "gala_gopher_container_cpu_usage_seconds_total"
+ }
+ ]
+}
\ No newline at end of file
--
2.33.0
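
The SlidingWindowNSigma parameters in the job config above (nsigma_coefficient, bias, abs_bias, and the expert bounds) are consumed by anteater/model/algorithms/slow_node_algs.py, which this patch does not include. A hedged sketch of how an upper-bound threshold could combine them, assuming the expert bounds act as a floor and a ceiling on the learned threshold:

import numpy as np

# Assumption: threshold = mean * (1 + bias) + abs_bias + k * std, clamped
# into [min_expert_upper_bound, max_expert_upper_bound]; the real formula
# lives in slow_node_algs.py and may differ.
def upper_bound(history, nsigma_coefficient=4, bias=0.01, abs_bias=0,
                min_expert_upper_bound=None, max_expert_upper_bound=None):
    mean, std = np.mean(history), np.std(history)
    th = mean * (1 + bias) + abs_bias + nsigma_coefficient * std
    if min_expert_upper_bound is not None:
        th = max(th, min_expert_upper_bound)  # never alarm below the expert floor
    if max_expert_upper_bound is not None:
        th = min(th, max_expert_upper_bound)  # always alarm above the expert ceiling
    return th

history = [52, 55, 53, 54, 56, 54, 53, 55]
# The CPU usage config above uses bounds 70/80: the learned ~59.4 is lifted to 70.
print(upper_bound(history, min_expert_upper_bound=70, max_expert_upper_bound=80))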

From ac1383471f72420e3320eb7c7999021f3658fb7d Mon Sep 17 00:00:00 2001
From: lizhenxing11 <lizhenxing11@huawei.com>
Date: Wed, 7 Dec 2022 16:59:15 +0800
Subject: [PATCH] Add disk throughput detector
add keywords
extract cause metric to the attributes
update template
---
anteater/config.py | 3 -
anteater/core/kpi.py | 1 +
anteater/main.py | 2 +
anteater/model/algorithms/three_sigma.py | 2 +-
anteater/module/base.py | 6 +-
anteater/module/sys/disk_throughput.py | 62 +++++++++++++
anteater/module/sys/proc_io_latency.py | 4 +-
anteater/source/anomaly_report.py | 3 +-
anteater/template/app_anomaly_template.py | 4 +-
anteater/template/sys_anomaly_template.py | 4 +-
anteater/template/template.py | 3 +-
anteater/utils/data_load.py | 2 +
config/module/app_sli_rtt.json | 3 +
config/module/disk_throughput.json | 92 +++++++++++++++++++
config/module/proc_io_latency.json | 3 +
config/module/sys_io_latency.json | 3 +
config/module/sys_tcp_establish.json | 3 +
.../module/sys_tcp_transmission_latency.json | 3 +
.../sys_tcp_transmission_throughput.json | 3 +
19 files changed, 193 insertions(+), 13 deletions(-)
create mode 100644 anteater/module/sys/disk_throughput.py
create mode 100644 config/module/disk_throughput.json
diff --git a/anteater/config.py b/anteater/config.py
index ea02702..e9ab557 100644
--- a/anteater/config.py
+++ b/anteater/config.py
@@ -81,9 +81,6 @@ class AnteaterConf:
"""Loads config from yaml file"""
data_path = os.path.realpath(data_path)
- if not os.path.exists(data_path):
- os.makedirs(data_path)
-
try:
with open(os.path.join(data_path, "config", self.filename), "rb") as f:
result = yaml.safe_load(f)
diff --git a/anteater/core/kpi.py b/anteater/core/kpi.py
index 5a9d8ab..3480139 100644
--- a/anteater/core/kpi.py
+++ b/anteater/core/kpi.py
@@ -48,6 +48,7 @@ class ModelConfig:
class JobConfig:
name: str
job_type: str
+ keywords: List[str]
root_cause_number: int
kpis: List[KPI]
features: List[Feature]
diff --git a/anteater/main.py b/anteater/main.py
index 11e0409..ba7be70 100644
--- a/anteater/main.py
+++ b/anteater/main.py
@@ -21,6 +21,7 @@ from apscheduler.schedulers.blocking import BlockingScheduler
from anteater.anomaly_detection import AnomalyDetection
from anteater.config import AnteaterConf
from anteater.module.app.app_sli_detector import APPSliDetector
+from anteater.module.sys.disk_throughput import DiskThroughputDetector
from anteater.module.sys.proc_io_latency import ProcIOLatencyDetector
from anteater.module.sys.sys_io_latency import SysIOLatencyDetector
from anteater.module.sys.tcp_establish import SysTcpEstablishDetector
@@ -57,6 +58,7 @@ def main():
SysTcpTransmissionLatencyDetector(loader, report),
SysIOLatencyDetector(loader, report),
ProcIOLatencyDetector(loader, report),
+ DiskThroughputDetector(loader, report),
]
else:
detectors = [
diff --git a/anteater/model/algorithms/three_sigma.py b/anteater/model/algorithms/three_sigma.py
index 457b606..49b9952 100644
--- a/anteater/model/algorithms/three_sigma.py
+++ b/anteater/model/algorithms/three_sigma.py
@@ -31,7 +31,7 @@ def three_sigma(values, obs_size, n=3, method="abs"):
elif method == 'min':
outlier = [val for val in obs_val if val < mean - n * std]
elif method == 'max':
- outlier = [val for val in obs_val if val > mean + 3 * std]
+ outlier = [val for val in obs_val if val > mean + n * std]
else:
raise ValueError(f'Unknown method {method}')
diff --git a/anteater/module/base.py b/anteater/module/base.py
index 7b5fc84..63436ac 100644
--- a/anteater/module/base.py
+++ b/anteater/module/base.py
@@ -48,14 +48,14 @@ class E2EDetector:
for detector in self.detectors:
anomalies = detector.execute(self.job_config)
for anomaly in anomalies:
- self.report(anomaly)
+ self.report(anomaly, self.job_config.keywords)
@abstractmethod
def parse_cause_metrics(self, anomaly: Anomaly) -> List[Dict]:
"""Parses the cause metrics into the specific formats"""
pass
- def report(self, anomaly: Anomaly):
+ def report(self, anomaly: Anomaly, keywords):
"""Parses the anomaly into a specific formats
based on the template and reports parsed results
"""
@@ -63,4 +63,4 @@ class E2EDetector:
timestamp = dt.utc_now()
template = self.template(timestamp, anomaly.machine_id,
anomaly.metric, anomaly.entity_name)
- self.reporter.sent_anomaly(anomaly, cause_metrics, template)
+ self.reporter.sent_anomaly(anomaly, cause_metrics, keywords, template)
diff --git a/anteater/module/sys/disk_throughput.py b/anteater/module/sys/disk_throughput.py
new file mode 100644
index 0000000..9a192fb
--- /dev/null
+++ b/anteater/module/sys/disk_throughput.py
@@ -0,0 +1,62 @@
+#!/usr/bin/python3
+# ******************************************************************************
+# Copyright (c) 2022 Huawei Technologies Co., Ltd.
+# gala-anteater is licensed under Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+# See the Mulan PSL v2 for more details.
+# ******************************************************************************/
+
+from typing import List, Dict
+
+from anteater.core.anomaly import Anomaly
+from anteater.module.base import E2EDetector
+from anteater.model.detector.online_vae_detector import OnlineVAEDetector
+from anteater.model.detector.n_sigma_detector import NSigmaDetector
+from anteater.source.anomaly_report import AnomalyReport
+from anteater.source.metric_loader import MetricLoader
+from anteater.template.sys_anomaly_template import SysAnomalyTemplate
+
+
+class DiskThroughputDetector(E2EDetector):
+ """Disk throughput e2e detector which detects the disk read or write
+ await time performance deteriorates
+ """
+
+ config_file = 'disk_throughput.json'
+
+ def __init__(self, data_loader: MetricLoader, reporter: AnomalyReport):
+ """The disk throughput e2e detector initializer"""
+ super().__init__(reporter, SysAnomalyTemplate)
+
+ self.detectors = self.init_detectors(data_loader)
+
+ def init_detectors(self, data_loader):
+ if self.job_config.model_config.enable:
+ detectors = [
+ NSigmaDetector(data_loader, method='max'),
+ OnlineVAEDetector(data_loader, self.job_config.model_config)
+ ]
+ else:
+ detectors = [
+ NSigmaDetector(data_loader, method='max')
+ ]
+
+ return detectors
+
+ def parse_cause_metrics(self, anomaly: Anomaly) -> List[Dict]:
+ """Parses the cause metrics into the specific formats"""
+ cause_metrics = [
+ {
+ 'metric': cause.ts.metric,
+ 'labels': cause.ts.labels,
+ 'score': cause.score,
+ 'description': cause.description.format(
+ cause.ts.labels.get('disk_name', ''))}
+ for cause in anomaly.root_causes]
+
+ return cause_metrics
diff --git a/anteater/module/sys/proc_io_latency.py b/anteater/module/sys/proc_io_latency.py
index 94fd05d..43e069f 100644
--- a/anteater/module/sys/proc_io_latency.py
+++ b/anteater/module/sys/proc_io_latency.py
@@ -38,12 +38,12 @@ class ProcIOLatencyDetector(E2EDetector):
def init_detectors(self, data_loader):
if self.job_config.model_config.enable:
detectors = [
- NSigmaDetector(data_loader, method='min'),
+ NSigmaDetector(data_loader, method='abs'),
OnlineVAEDetector(data_loader, self.job_config.model_config)
]
else:
detectors = [
- NSigmaDetector(data_loader, method='min')
+ NSigmaDetector(data_loader, method='abs')
]
return detectors
diff --git a/anteater/source/anomaly_report.py b/anteater/source/anomaly_report.py
index b226763..3d3bb09 100644
--- a/anteater/source/anomaly_report.py
+++ b/anteater/source/anomaly_report.py
@@ -42,7 +42,7 @@ class AnomalyReport:
return keys
- def sent_anomaly(self, anomaly: Anomaly, cause_metrics: List, template: Template):
+ def sent_anomaly(self, anomaly: Anomaly, cause_metrics: List, keywords: List[str], template: Template):
keys = self.get_keys(template.entity_name)
machine_id = template.machine_id
entity_name = template.entity_name
@@ -54,6 +54,7 @@ class AnomalyReport:
template.keys = keys
template.description = anomaly.description
template.cause_metrics = cause_metrics
+ template.keywords = keywords
msg = template.get_template()
self.provider.send_message(msg)
diff --git a/anteater/template/app_anomaly_template.py b/anteater/template/app_anomaly_template.py
index 5b8caf8..a509c96 100644
--- a/anteater/template/app_anomaly_template.py
+++ b/anteater/template/app_anomaly_template.py
@@ -31,7 +31,9 @@ class AppAnomalyTemplate(Template):
'entity_id': self.entity_id,
'event_id': f'{timestamp}_{self.entity_id}',
'event_type': 'app',
- 'event_source': 'gala-anteater'
+ 'event_source': 'gala-anteater',
+ 'keywords': self.keywords,
+ 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'}
},
'Resource': {
'metric': self.metric,
diff --git a/anteater/template/sys_anomaly_template.py b/anteater/template/sys_anomaly_template.py
index 1083fb3..4ac6abb 100644
--- a/anteater/template/sys_anomaly_template.py
+++ b/anteater/template/sys_anomaly_template.py
@@ -31,7 +31,9 @@ class SysAnomalyTemplate(Template):
'entity_id': self.entity_id,
'event_id': f'{timestamp}_{self.entity_id}',
'event_type': 'sys',
- 'event_source': 'gala-anteater'
+ 'event_source': 'gala-anteater',
+ 'keywords': self.keywords,
+ 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'}
},
'Resource': {
'metric': self.metric,
diff --git a/anteater/template/template.py b/anteater/template/template.py
index 9e4461a..794c121 100644
--- a/anteater/template/template.py
+++ b/anteater/template/template.py
@@ -26,7 +26,8 @@ class Template:
self.labels = {}
self.entity_id = ""
self.description = ""
- self.cause_metrics = {}
+ self.cause_metrics = []
+ self.keywords = []
@abstractmethod
def get_template(self):
diff --git a/anteater/utils/data_load.py b/anteater/utils/data_load.py
index 6ac92c7..b6991c6 100644
--- a/anteater/utils/data_load.py
+++ b/anteater/utils/data_load.py
@@ -45,6 +45,7 @@ def load_job_config(file_name) -> JobConfig:
name = config['name']
job_type = config['job_type']
+ keywords = config['keywords']
root_cause_number = config['root_cause_number']
kpis = [KPI(**_conf) for _conf in config['KPI']]
features = [Feature(**_conf) for _conf in config['Features']]
@@ -74,6 +75,7 @@ def load_job_config(file_name) -> JobConfig:
return JobConfig(
name=name,
job_type=job_type,
+ keywords=keywords,
root_cause_number=root_cause_number,
kpis=kpis,
features=features,
diff --git a/config/module/app_sli_rtt.json b/config/module/app_sli_rtt.json
index 7c05094..db29392 100644
--- a/config/module/app_sli_rtt.json
+++ b/config/module/app_sli_rtt.json
@@ -1,6 +1,9 @@
{
"name": "app_sli_rtt",
"job_type": "app",
+ "keywords": [
+ "app"
+ ],
"root_cause_number": 20,
"KPI": [
{
diff --git a/config/module/disk_throughput.json b/config/module/disk_throughput.json
new file mode 100644
index 0000000..00276c0
--- /dev/null
+++ b/config/module/disk_throughput.json
@@ -0,0 +1,92 @@
+{
+ "name": "disk_throughput",
+ "job_type": "sys",
+ "keywords": [
+ "disk"
+ ],
+ "root_cause_number": 1,
+ "KPI": [
+ {
+ "metric": "gala_gopher_disk_r_await",
+ "kpi_type": "",
+ "entity_name": "disk",
+ "enable": true,
+ "description": "Disk read await time is increasing!",
+ "params": {
+ "look_back": 20,
+ "obs_size": 25,
+ "outlier_ratio_th": 0.3,
+ "smooth_params": {
+ "method": "conv_smooth",
+ "box_pts": 3
+ }
+ }
+ },
+ {
+ "metric": "gala_gopher_disk_w_await",
+ "kpi_type": "",
+ "entity_name": "disk",
+ "enable": true,
+ "description": "Disk write await time is increasing!",
+ "params": {
+ "look_back": 20,
+ "obs_size": 25,
+ "outlier_ratio_th": 0.3,
+ "smooth_params": {
+ "method": "conv_smooth",
+ "box_pts": 3
+ }
+ }
+ }
+ ],
+ "OnlineModel": {
+ "name": "online_vae_model",
+ "enable": false,
+ "params": {
+ "th": 0.5,
+ "max_error_rate": 0.7,
+ "min_retrain_hours": 24,
+ "min_predict_minutes": 20,
+ "norm": {},
+ "vae": {
+ "hidden_sizes": [25, 10, 5],
+ "latent_size": 5,
+ "dropout_rate": 0.25,
+ "batch_size": 1024,
+ "num_epochs": 30,
+ "learning_rate": 0.001,
+ "k": 120,
+ "step_size": 60,
+ "num_eval_samples": 10
+ },
+ "calibrate": {},
+ "threshold": {}
+ }
+ },
+ "Features": [
+ {
+ "metric": "gala_gopher_disk_rspeed_kB",
+ "priority": 0,
+ "description": "The disk I/O await time performance deteriorates due to read throughput rise (read kbytes/second).(Disk = {})",
+ "atrend": "rise"
+ },
+ {
+ "metric": "gala_gopher_disk_wspeed_kB",
+ "priority": 0,
+ "description": "The disk I/O await time performance deteriorates due to write throughput rise (write kbytes/second).(Disk = {})",
+ "atrend": "rise"
+ },
+ {
+ "metric": "gala_gopher_disk_rareq",
+ "priority": 0,
+ "description": "The disk I/O await time performance deteriorates due to read saturation rise.(Disk = {})",
+ "atrend": "rise"
+ },
+ {
+ "metric": "gala_gopher_disk_wareq",
+ "priority": 0,
+ "description": "The disk I/O await time performance deteriorates due to write saturation rise.(Disk = {})",
+ "atrend": "rise"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/config/module/proc_io_latency.json b/config/module/proc_io_latency.json
index c45b7df..c6c03c1 100644
--- a/config/module/proc_io_latency.json
+++ b/config/module/proc_io_latency.json
@@ -1,6 +1,9 @@
{
"name": "proc_io_latency",
"job_type": "sys",
+ "keywords": [
+ "process"
+ ],
"root_cause_number": 3,
"KPI": [
{
diff --git a/config/module/sys_io_latency.json b/config/module/sys_io_latency.json
index e92dd4c..e58990d 100644
--- a/config/module/sys_io_latency.json
+++ b/config/module/sys_io_latency.json
@@ -1,6 +1,9 @@
{
"name": "sys_io_latency",
"job_type": "sys",
+ "keywords": [
+ "block"
+ ],
"root_cause_number": 3,
"KPI": [
{
diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json
index b6f8eb4..61ae72d 100644
--- a/config/module/sys_tcp_establish.json
+++ b/config/module/sys_tcp_establish.json
@@ -1,6 +1,9 @@
{
"name": "sys_tcp_establish",
"job_type": "sys",
+ "keywords": [
+ "tcp"
+ ],
"root_cause_number": 3,
"KPI": [
{
diff --git a/config/module/sys_tcp_transmission_latency.json b/config/module/sys_tcp_transmission_latency.json
index 4927d8e..d9e7f80 100644
--- a/config/module/sys_tcp_transmission_latency.json
+++ b/config/module/sys_tcp_transmission_latency.json
@@ -1,6 +1,9 @@
{
"name": "sys_tcp_transmission_latency",
"job_type": "sys",
+ "keywords": [
+ "tcp"
+ ],
"root_cause_number": 3,
"KPI": [
{
diff --git a/config/module/sys_tcp_transmission_throughput.json b/config/module/sys_tcp_transmission_throughput.json
index 060f640..28ee784 100644
--- a/config/module/sys_tcp_transmission_throughput.json
+++ b/config/module/sys_tcp_transmission_throughput.json
@@ -1,6 +1,9 @@
{
"name": "sys_tcp_transmission_throughput",
"job_type": "sys",
+ "keywords": [
+ "net"
+ ],
"root_cause_number": 3,
"KPI": [
{
--
2.33.0
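
The behavioral change in three_sigma.py above is the 'max' branch using n instead of a hard-coded 3. A self-contained sketch of the three method variants, assuming mean and std are computed from the points before the observation window (that part of the function is outside the diff):

import numpy as np

def n_sigma(values, obs_size, n=3, method="abs"):
    # 'abs' flags deviations in either direction, 'min' only drops,
    # 'max' only rises; the patched branch is 'max'.
    train, obs = values[:-obs_size], values[-obs_size:]
    mean, std = np.mean(train), np.std(train)
    if method == "abs":
        return [v for v in obs if abs(v - mean) > n * std]
    if method == "min":
        return [v for v in obs if v < mean - n * std]
    if method == "max":
        return [v for v in obs if v > mean + n * std]
    raise ValueError(f"Unknown method {method}")

values = [9.8, 10.0, 10.2] * 13 + [10.1, 25.0, 9.9]
print(n_sigma(values, obs_size=3, n=3, method="max"))  # [25.0]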

From dd870b17120f3c7961c4613d454f1653fbd42214 Mon Sep 17 00:00:00 2001
From: lizhenxing11 <lizhenxing11@huawei.com>
Date: Tue, 27 Dec 2022 18:39:32 +0800
Subject: [PATCH] Update TCP Establish Model & Add Nic Loss Detector
change method 'abs' to 'max'
---
anteater/main.py | 2 +
anteater/model/algorithms/three_sigma.py | 4 +-
anteater/model/detector/n_sigma_detector.py | 4 +-
.../tcp_establish_n_sigma_detector.py | 12 +++-
anteater/model/detector/th_base_detector.py | 66 +++++++++++++++++++
anteater/module/sys/nic_loss.py | 59 +++++++++++++++++
anteater/module/sys/proc_io_latency.py | 4 +-
anteater/template/app_anomaly_template.py | 2 +
anteater/template/sys_anomaly_template.py | 1 +
config/module/sys_nic_loss.json | 53 +++++++++++++++
config/module/sys_tcp_establish.json | 3 +-
11 files changed, 200 insertions(+), 10 deletions(-)
create mode 100644 anteater/model/detector/th_base_detector.py
create mode 100644 anteater/module/sys/nic_loss.py
create mode 100644 config/module/sys_nic_loss.json
diff --git a/anteater/main.py b/anteater/main.py
index ba7be70..4de72f9 100644
--- a/anteater/main.py
+++ b/anteater/main.py
@@ -22,6 +22,7 @@ from anteater.anomaly_detection import AnomalyDetection
from anteater.config import AnteaterConf
from anteater.module.app.app_sli_detector import APPSliDetector
from anteater.module.sys.disk_throughput import DiskThroughputDetector
+from anteater.module.sys.nic_loss import NICLossDetector
from anteater.module.sys.proc_io_latency import ProcIOLatencyDetector
from anteater.module.sys.sys_io_latency import SysIOLatencyDetector
from anteater.module.sys.tcp_establish import SysTcpEstablishDetector
@@ -59,6 +60,7 @@ def main():
SysIOLatencyDetector(loader, report),
ProcIOLatencyDetector(loader, report),
DiskThroughputDetector(loader, report),
+ NICLossDetector(loader, report),
]
else:
detectors = [
diff --git a/anteater/model/algorithms/three_sigma.py b/anteater/model/algorithms/three_sigma.py
index 49b9952..0865417 100644
--- a/anteater/model/algorithms/three_sigma.py
+++ b/anteater/model/algorithms/three_sigma.py
@@ -14,8 +14,8 @@
import numpy as np
-def three_sigma(values, obs_size, n=3, method="abs"):
- """The '3-sigma rule' outlier detect function"""
+def n_sigma(values, obs_size, n=3, method="abs"):
+ """The 'N-sigma rule' outlier detect function"""
if obs_size <= 0:
raise ValueError("The obs_size should great than zero!")
if len(values) <= obs_size:
diff --git a/anteater/model/detector/n_sigma_detector.py b/anteater/model/detector/n_sigma_detector.py
index f632326..3a2ab01 100644
--- a/anteater/model/detector/n_sigma_detector.py
+++ b/anteater/model/detector/n_sigma_detector.py
@@ -19,7 +19,7 @@ from anteater.core.kpi import KPI
from anteater.core.time_series import TimeSeriesScore
from anteater.model.detector.base import Detector
from anteater.model.algorithms.smooth import smoothing
-from anteater.model.algorithms.three_sigma import three_sigma
+from anteater.model.algorithms.three_sigma import n_sigma
from anteater.source.metric_loader import MetricLoader
from anteater.utils.common import divide
from anteater.utils.datetime import DateTimeManager as dt
@@ -91,7 +91,7 @@ class NSigmaDetector(Detector):
ratio = 0
else:
smoothed_val = smoothing(_ts.values, **smooth_params)
- outlier, mean, std = three_sigma(
+ outlier, mean, std = n_sigma(
smoothed_val, obs_size=obs_size, n=n, method=self.method)
ratio = divide(len(outlier), obs_size)
diff --git a/anteater/model/detector/tcp_establish_n_sigma_detector.py b/anteater/model/detector/tcp_establish_n_sigma_detector.py
index 8dcf9ae..82d7837 100644
--- a/anteater/model/detector/tcp_establish_n_sigma_detector.py
+++ b/anteater/model/detector/tcp_establish_n_sigma_detector.py
@@ -42,8 +42,13 @@ class TcpEstablishNSigmaDetector(Detector):
start, _ = dt.last(minutes=look_back)
mid, _ = dt.last(minutes=3)
+ filtered_ts_list = []
ts_list = self.data_loader.get_metric(start, mid, kpi.metric)
- establish_time = reduce(lambda x, y: x + y, [list(set(_ts.values)) for _ts in ts_list])
+ for _ts in ts_list:
+ if sum(_ts.values) > 0:
+ filtered_ts_list.append(_ts)
+
+ establish_time = reduce(lambda x, y: x + y, [list(set(_ts.values)) for _ts in filtered_ts_list])
self.mean = np.mean(establish_time)
self.std = np.std(establish_time)
@@ -65,6 +70,7 @@ class TcpEstablishNSigmaDetector(Detector):
"""Detects kpi based on signal time series anomaly detection model"""
outlier_ratio_th = kpi.params.get('outlier_ratio_th')
look_back = kpi.params.get('obs_size')
+ min_rtt = kpi.params.get('min_rtt')
start, end = dt.last(minutes=look_back)
ts_list = self.data_loader.\
@@ -72,9 +78,9 @@ class TcpEstablishNSigmaDetector(Detector):
anomalies = []
for _ts in ts_list:
- outlier = [val for val in _ts.values if abs(val - self.mean) > 3 * self.std]
+ outlier = [val for val in _ts.values if val > self.mean + 5 * self.std]
ratio = divide(len(outlier), len(_ts.values))
- if outlier and ratio > outlier_ratio_th:
+ if outlier and ratio > outlier_ratio_th and np.average(outlier) >= min_rtt:
anomalies.append(
Anomaly(
machine_id=machine_id,
diff --git a/anteater/model/detector/th_base_detector.py b/anteater/model/detector/th_base_detector.py
new file mode 100644
index 0000000..bec9705
--- /dev/null
+++ b/anteater/model/detector/th_base_detector.py
@@ -0,0 +1,66 @@
+#!/usr/bin/python3
+# ******************************************************************************
+# Copyright (c) 2022 Huawei Technologies Co., Ltd.
+# gala-anteater is licensed under Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+# See the Mulan PSL v2 for more details.
+# ******************************************************************************/
+
+from typing import List
+
+from anteater.core.anomaly import Anomaly
+from anteater.core.kpi import KPI
+from anteater.model.detector.base import Detector
+from anteater.source.metric_loader import MetricLoader
+from anteater.utils.datetime import DateTimeManager as dt
+from anteater.utils.log import logger
+
+
+class ThBaseDetector(Detector):
+ """The threshold-based anomaly detector"""
+
+ def __init__(self, data_loader: MetricLoader):
+ """The detector base class initializer"""
+ super().__init__(data_loader)
+
+ def detect_kpis(self, kpis: List[KPI]):
+ """Executes anomaly detection on kpis"""
+ start, end = dt.last(minutes=1)
+ machine_ids = self.get_unique_machine_id(start, end, kpis)
+ anomalies = []
+ for _id in machine_ids:
+ for kpi in kpis:
+ anomalies.extend(self.detect_signal_kpi(kpi, _id))
+
+ return anomalies
+
+ def detect_signal_kpi(self, kpi, machine_id: str) -> List[Anomaly]:
+ """Detects kpi based on threshold based anomaly detection model"""
+ look_back = kpi.params.get('look_back')
+ th = kpi.params.get('th')
+ start, end = dt.last(minutes=look_back)
+ ts_list = self.data_loader.\
+ get_metric(start, end, kpi.metric, label_name='machine_id', label_value=machine_id)
+
+ if not ts_list:
+ logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!')
+ return []
+
+ anomalies = [
+ Anomaly(
+ machine_id=machine_id,
+ metric=_ts.metric,
+ labels=_ts.labels,
+ score=1,
+ entity_name=kpi.entity_name,
+ description=kpi.description)
+ for _ts in ts_list
+ if sum(_ts.values) >= th
+ ]
+
+ return anomalies
diff --git a/anteater/module/sys/nic_loss.py b/anteater/module/sys/nic_loss.py
new file mode 100644
index 0000000..d24e06f
--- /dev/null
+++ b/anteater/module/sys/nic_loss.py
@@ -0,0 +1,59 @@
+#!/usr/bin/python3
+# ******************************************************************************
+# Copyright (c) 2022 Huawei Technologies Co., Ltd.
+# gala-anteater is licensed under Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+# See the Mulan PSL v2 for more details.
+# ******************************************************************************/
+
+from typing import List, Dict
+
+from anteater.core.anomaly import Anomaly
+from anteater.model.detector.th_base_detector import ThBaseDetector
+from anteater.module.base import E2EDetector
+from anteater.source.anomaly_report import AnomalyReport
+from anteater.source.metric_loader import MetricLoader
+from anteater.template.sys_anomaly_template import SysAnomalyTemplate
+
+
+class NICLossDetector(E2EDetector):
+ """SYS nic loss e2e detector which detects the network loss.
+ """
+
+ config_file = 'sys_nic_loss.json'
+
+ def __init__(self, data_loader: MetricLoader, reporter: AnomalyReport):
+ """The system tcp transmission latency e2e detector initializer"""
+ super().__init__(reporter, SysAnomalyTemplate)
+
+ self.detectors = [
+ ThBaseDetector(data_loader)
+ ]
+
+ def parse_cause_metrics(self, anomaly: Anomaly) -> List[Dict]:
+ """Parses the cause metrics into the specific formats"""
+ cause_metrics = []
+ for _cs in anomaly.root_causes:
+ tmp = {
+ 'metric': _cs.ts.metric,
+ 'labels': _cs.ts.labels,
+ 'score': _cs.score,
+ }
+ if 'tcp' in _cs.ts.metric:
+ tmp['description'] = _cs.description.format(
+ _cs.ts.labels.get('tgid', ''),
+ _cs.ts.labels.get('client_port', ''),
+ _cs.ts.labels.get('server_ip', ''),
+ _cs.ts.labels.get('server_port', ''))
+ else:
+ tmp['description'] = _cs.description.format(
+ _cs.ts.labels.get('dev_name', ''))
+
+ cause_metrics.append(tmp)
+
+ return cause_metrics
diff --git a/anteater/module/sys/proc_io_latency.py b/anteater/module/sys/proc_io_latency.py
index 43e069f..a34c48d 100644
--- a/anteater/module/sys/proc_io_latency.py
+++ b/anteater/module/sys/proc_io_latency.py
@@ -38,12 +38,12 @@ class ProcIOLatencyDetector(E2EDetector):
def init_detectors(self, data_loader):
if self.job_config.model_config.enable:
detectors = [
- NSigmaDetector(data_loader, method='abs'),
+ NSigmaDetector(data_loader, method='max'),
OnlineVAEDetector(data_loader, self.job_config.model_config)
]
else:
detectors = [
- NSigmaDetector(data_loader, method='abs')
+ NSigmaDetector(data_loader, method='max')
]
return detectors
diff --git a/anteater/template/app_anomaly_template.py b/anteater/template/app_anomaly_template.py
index a509c96..4df4a35 100644
--- a/anteater/template/app_anomaly_template.py
+++ b/anteater/template/app_anomaly_template.py
@@ -46,6 +46,8 @@ class AppAnomalyTemplate(Template):
'SeverityNumber': 13,
'Body': f'{self.timestamp.strftime("%c")} WARN, APP may be impacting sli performance issues.',
'event_id': f'{timestamp}_{self.entity_id}',
+ "keywords": self.keywords,
+ 'cause_metrics': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'}
}
return result
diff --git a/anteater/template/sys_anomaly_template.py b/anteater/template/sys_anomaly_template.py
index 4ac6abb..aec6ea0 100644
--- a/anteater/template/sys_anomaly_template.py
+++ b/anteater/template/sys_anomaly_template.py
@@ -46,6 +46,7 @@ class SysAnomalyTemplate(Template):
'SeverityNumber': 13,
'Body': f'{self.timestamp.strftime("%c")} WARN, SYS may be impacting performance issues.',
'event_id': f'{timestamp}_{self.entity_id}',
+ "keywords": self.keywords
}
return result
diff --git a/config/module/sys_nic_loss.json b/config/module/sys_nic_loss.json
new file mode 100644
index 0000000..793f82f
--- /dev/null
+++ b/config/module/sys_nic_loss.json
@@ -0,0 +1,53 @@
+{
+ "name": "sys_tcp_transmission_latency",
+ "job_type": "sys",
+ "keywords": [
+ "net"
+ ],
+ "root_cause_number": 3,
+ "KPI": [
+ {
+ "metric": "gala_gopher_nic_tc_sent_drop",
+ "kpi_type": "",
+ "entity_name": "nic",
+ "enable": true,
+ "description": "TC发送丢包数异常",
+ "params": {
+ "look_back": 2,
+ "th": 1
+ }
+ }
+ ],
+ "Features": [
+ {
+ "metric": "gala_gopher_nic_tx_dropped",
+ "priority": 0,
+ "description": "网卡发送丢弃的数据包数异常。(dev_name = {})"
+ },
+ {
+ "metric": "gala_gopher_nic_rx_dropped",
+ "priority": 0,
+ "description": "网卡接收丢弃的数据包数异常。(dev_name = {})"
+ },
+ {
+ "metric": "gala_gopher_tcp_link_sk_drops",
+ "priority": 3,
+ "description": "Packets are lost in the host protocol stack due to unknown causes, and the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
+ },
+ {
+ "metric": "gala_gopher_tcp_link_retran_packets",
+ "priority": 1,
+ "description": "TCP retransmission is triggered due to network faults, resulting in TCP performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
+ },
+ {
+ "metric": "gala_gopher_tcp_link_lost_out",
+ "priority": 3,
+ "description": "The network may be congested, causing abnormal TCP packet loss and performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
+ },
+ {
+ "metric": "gala_gopher_tcp_link_notsent_bytes",
+ "priority": 4,
+ "description": "Due to network delay or peer application performance, too many packets to be sent are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json
index 61ae72d..2c158c0 100644
--- a/config/module/sys_tcp_establish.json
+++ b/config/module/sys_tcp_establish.json
@@ -15,7 +15,8 @@
"params": {
"look_back": 30,
"outlier_ratio_th": 0.5,
- "obs_size": 3
+ "obs_size": 3,
+ "min_rtt": 500000
}
}
],
--
2.33.0
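
With the new min_rtt parameter, the TCP-establish check above fires only when the outliers are both frequent enough and large enough. A quick arithmetic sketch of the gate, with illustrative values and the thresholds from sys_tcp_establish.json (outlier_ratio_th 0.5, min_rtt 500000):

import numpy as np

mean, std = 120000.0, 30000.0  # assumed training-window statistics
values = [110000, 125000, 700000, 650000, 115000]
outlier = [v for v in values if v > mean + 5 * std]  # threshold 270000
ratio = len(outlier) / len(values)  # 0.4
fires = bool(outlier) and ratio > 0.5 and np.average(outlier) >= 500000
print(outlier, ratio, fires)  # [700000, 650000] 0.4 False: the ratio gate blocks the alarm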

From e0e99ac8fc3de9e8781f5d7acd5e9fe1832461b0 Mon Sep 17 00:00:00 2001
From: lizhenxing11 <lizhenxing11@huawei.com>
Date: Tue, 3 Jan 2023 15:27:45 +0800
Subject: [PATCH] add chinese descriptions
update description
fix typo
update th
---
anteater/core/kpi.py | 2 +-
anteater/template/app_anomaly_template.py | 5 ++-
anteater/template/sys_anomaly_template.py | 2 +-
anteater/utils/data_load.py | 14 ++++++--
config/module/app_sli_rtt.json | 2 ++
config/module/disk_throughput.json | 6 ++++
config/module/proc_io_latency.json | 31 +++++++++++-----
config/module/sys_io_latency.json | 25 ++++++++-----
config/module/sys_nic_loss.json | 21 +++++++----
config/module/sys_tcp_establish.json | 4 ++-
.../module/sys_tcp_transmission_latency.json | 36 ++++++++++++-------
11 files changed, 104 insertions(+), 44 deletions(-)
diff --git a/anteater/core/kpi.py b/anteater/core/kpi.py
index 3480139..f83b666 100644
--- a/anteater/core/kpi.py
+++ b/anteater/core/kpi.py
@@ -23,7 +23,7 @@ class KPI:
kpi_type: str
entity_name: str
enable: bool
- description: str = ""
+ description: str
params: dict = field(default=dict)
atrend: AnomalyTrend = AnomalyTrend.DEFAULT
diff --git a/anteater/template/app_anomaly_template.py b/anteater/template/app_anomaly_template.py
index 4df4a35..3770d2e 100644
--- a/anteater/template/app_anomaly_template.py
+++ b/anteater/template/app_anomaly_template.py
@@ -33,7 +33,7 @@ class AppAnomalyTemplate(Template):
'event_type': 'app',
'event_source': 'gala-anteater',
'keywords': self.keywords,
- 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'}
+ 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': self.description}
},
'Resource': {
'metric': self.metric,
@@ -46,8 +46,7 @@ class AppAnomalyTemplate(Template):
'SeverityNumber': 13,
'Body': f'{self.timestamp.strftime("%c")} WARN, APP may be impacting sli performance issues.',
'event_id': f'{timestamp}_{self.entity_id}',
- "keywords": self.keywords,
- 'cause_metrics': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'}
+ "keywords": self.keywords
}
return result
diff --git a/anteater/template/sys_anomaly_template.py b/anteater/template/sys_anomaly_template.py
index aec6ea0..d3c7e82 100644
--- a/anteater/template/sys_anomaly_template.py
+++ b/anteater/template/sys_anomaly_template.py
@@ -33,7 +33,7 @@ class SysAnomalyTemplate(Template):
'event_type': 'sys',
'event_source': 'gala-anteater',
'keywords': self.keywords,
- 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'}
+ 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': self.description}
},
'Resource': {
'metric': self.metric,
diff --git a/anteater/utils/data_load.py b/anteater/utils/data_load.py
index b6991c6..730c9c6 100644
--- a/anteater/utils/data_load.py
+++ b/anteater/utils/data_load.py
@@ -47,8 +47,9 @@ def load_job_config(file_name) -> JobConfig:
job_type = config['job_type']
keywords = config['keywords']
root_cause_number = config['root_cause_number']
- kpis = [KPI(**_conf) for _conf in config['KPI']]
- features = [Feature(**_conf) for _conf in config['Features']]
+
+ kpis = [KPI(**update_description(_conf)) for _conf in config['KPI']]
+ features = [Feature(**update_description(_conf)) for _conf in config['Features']]
model_config = None
if 'OnlineModel' in config:
@@ -81,3 +82,12 @@ def load_job_config(file_name) -> JobConfig:
features=features,
model_config=model_config
)
+
+
+def update_description(conf: dict):
+ """Changes description to zh"""
+ if 'description-zh' in conf:
+ conf['description'] = conf['description-zh']
+ del conf['description-zh']
+
+ return conf
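
A usage sketch of update_description above; the sample strings are taken from disk_throughput.json in this patch:

conf = {"metric": "gala_gopher_disk_r_await",
        "description": "Disk read await time is increasing!",
        "description-zh": "磁盘读响应时间升高,性能发生劣化"}
# update_description moves the Chinese text into "description" and drops
# the "description-zh" key before KPI(**conf) is constructed.
print(update_description(conf)["description"])  # 磁盘读响应时间升高,性能发生劣化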
diff --git a/config/module/app_sli_rtt.json b/config/module/app_sli_rtt.json
index db29392..0146883 100644
--- a/config/module/app_sli_rtt.json
+++ b/config/module/app_sli_rtt.json
@@ -12,6 +12,7 @@
"entity_name": "sli",
"enable": false,
"description": "sli rtt 异常",
+ "description-zh": "应用级请求往返时延RTT异常",
"params": {
"look_back": 10,
"obs_size": 25,
@@ -28,6 +29,7 @@
"entity_name": "sli",
"enable": true,
"description": "sli tps 异常",
+ "description-zh": "应用级请求吞吐量TPS异常",
"params": {
"look_back": 10,
"obs_size": 25,
diff --git a/config/module/disk_throughput.json b/config/module/disk_throughput.json
index 00276c0..f6244f6 100644
--- a/config/module/disk_throughput.json
+++ b/config/module/disk_throughput.json
@@ -12,6 +12,7 @@
"entity_name": "disk",
"enable": true,
"description": "Disk read await time is increasing!",
+ "description-zh": "磁盘读响应时间升高,性能发生劣化",
"params": {
"look_back": 20,
"obs_size": 25,
@@ -28,6 +29,7 @@
"entity_name": "disk",
"enable": true,
"description": "Disk write await time is increasing!",
+ "description-zh": "磁盘写响应时间升高,性能发生劣化",
"params": {
"look_back": 20,
"obs_size": 25,
@@ -68,24 +70,28 @@
"metric": "gala_gopher_disk_rspeed_kB",
"priority": 0,
"description": "The disk I/O await time performance deteriorates due to read throughput rise (read kbytes/second).(Disk = {})",
+ "description-zh": "磁盘读吞吐量异常升高导致I/O等待时间性能劣化(Disk = {})",
"atrend": "rise"
},
{
"metric": "gala_gopher_disk_wspeed_kB",
"priority": 0,
"description": "The disk I/O await time performance deteriorates due to write throughput rise (write kbytes/second).(Disk = {})",
+ "description-zh": "磁盘写吞吐量异常升高导致I/O等待时间性能劣化(Disk = {})",
"atrend": "rise"
},
{
"metric": "gala_gopher_disk_rareq",
"priority": 0,
"description": "The disk I/O await time performance deteriorates due to read saturation rise.(Disk = {})",
+ "description-zh": "磁盘读饱和度量异常升高导致I/O等待时间性能劣化(Disk = {})",
"atrend": "rise"
},
{
"metric": "gala_gopher_disk_wareq",
"priority": 0,
"description": "The disk I/O await time performance deteriorates due to write saturation rise.(Disk = {})",
+ "description-zh": "磁盘读写饱和度量异常升高导致I/O等待时间性能劣化(Disk = {})",
"atrend": "rise"
}
]
diff --git a/config/module/proc_io_latency.json b/config/module/proc_io_latency.json
index c6c03c1..f086b87 100644
--- a/config/module/proc_io_latency.json
+++ b/config/module/proc_io_latency.json
@@ -12,6 +12,7 @@
"entity_name": "proc",
"enable": true,
"description": "I/O operation delay at the BIO layer (unit: us)",
+ "description-zh": "BIO层I/O操作延时高(单位us)",
"params": {
"look_back": 20,
"obs_size": 37,
@@ -28,6 +29,7 @@
"entity_name": "proc",
"enable": true,
"description": "Number of small I/O (less than 4 KB) read operations at the BIO layer.",
+ "description-zh": "BIO层小数据I/O读操作数量异常小于4KB",
"params": {
"look_back": 20,
"obs_size": 25,
@@ -44,6 +46,7 @@
"entity_name": "proc",
"enable": true,
"description": "Number of small I/O (less than 4 KB) write operations at the BIO layer.",
+ "description-zh": "BIO层小数据I/O写操作数量异常小于4KB",
"params": {
"look_back": 20,
"obs_size": 25,
@@ -61,6 +64,7 @@
"entity_name": "proc",
"enable": true,
"description": "Number of big I/O (greater than 4 KB) read operations at the BIO layer.",
+ "description-zh": "BIO层大数据I/O读操作数量异常大于4KB",
"params": {
"look_back": 20,
"obs_size": 25,
@@ -76,7 +80,8 @@
"kpi_type": "",
"entity_name": "proc",
"enable": true,
- "description": "Number of big I/O (greater than 4 KB) read operations at the BIO layer.",
+ "description": "Number of big I/O (greater than 4 KB) write operations at the BIO layer.",
+ "description-zh": "BIO层大数据写操作数量异常大于4KB",
"params": {
"look_back": 20,
"obs_size": 25,
@@ -116,42 +121,50 @@
{
"metric": "gala_gopher_block_latency_req_max",
"priority": 4,
- "description": "The system I/O performance deteriorates due to a drive failure.(Disk = {})"
+ "description": "Process I/O performance deteriorates due to system I/O bandwidth insufficient.(Disk = {})",
+ "description-zh": "系统I/O带宽不足引起进程I/O性能劣化Disk={}"
},
{
"metric": "gala_gopher_block_latency_device_max",
"priority": 3,
- "description": "Degraded system I/O performance due to device (disk) failure.(Disk = {})"
+ "description": "Process I/O performance deteriorates due to device I/O bandwidth insufficient.(Disk = {})",
+ "description-zh": "设备I/O带宽不足引起进程I/O性能劣化Disk={}"
},
{
"metric": "gala_gopher_block_read_bytes",
"priority": 2,
- "description": "System performance deteriorates due to frequent read I/O operations.(Disk = {})"
+ "description": "Process I/O performance deteriorates due to frequent read I/O operations.(Disk = {})",
+ "description-zh": "频繁I/O读操作引起进程I/O性能劣化Disk={}"
},
{
"metric": "gala_gopher_block_write_bytes",
"priority": 2,
- "description": "System performance deteriorates due to frequent write I/O operations.(Disk = {})"
+ "description": "Process I/O performance deteriorates due to frequent write I/O operations.(Disk = {})",
+ "description-zh": "频繁写操作引起进程I/O性能劣化Disk={}"
},
{
"metric": "gala_gopher_proc_less_4k_io_read",
"priority": 0,
- "description": "System performance degrades due to frequent small I/O read operations.(Disk = {}, PID = {}, comm = {})"
+ "description": "Process I/O performance degrades due to frequent small I/O read operations.(Disk = {}, PID = {}, comm = {})",
+ "description-zh": "频繁小数据量小于4KB读操作引起进程I/O性能劣化Disk={}PID={}comm={}"
},
{
"metric": "gala_gopher_proc_less_4k_io_write",
"priority": 0,
- "description": "System performance degrades due to frequent small I/O write operations.(Disk = {}, PID = {}, comm = {})"
+ "description": "Process I/O performance degrades due to frequent small I/O write operations.(Disk = {}, PID = {}, comm = {})",
+ "description-zh": "频繁小数据量小于4KB写操作引起进程I/O性能劣化Disk={}PID={}comm={}"
},
{
"metric": "gala_gopher_proc_greater_4k_io_read",
"priority": 1,
- "description": "System performance degrades due to frequent big I/O read operations.(Disk = {}, PID = {}, comm = {})"
+ "description": "Process I/O performance degrades due to frequent big I/O read operations.(Disk = {}, PID = {}, comm = {})",
+ "description-zh": "频繁大数据量大于4KB读操作引起进程I/O性能劣化Disk={}PID={}comm={}"
},
{
"metric": "gala_gopher_proc_greater_4k_io_write",
"priority": 1,
- "description": "System performance degrades due to frequent big I/O write operations.(Disk = {}, PID = {}, comm = {})"
+ "description": "Process I/O performance degrades due to frequent big I/O write operations.(Disk = {}, PID = {}, comm = {})",
+ "description-zh": "频繁大数据量大于4KB写操作引起进程I/O性能劣化Disk={}PID={}comm={}"
}
]
}
\ No newline at end of file
diff --git a/config/module/sys_io_latency.json b/config/module/sys_io_latency.json
index e58990d..bdf17d3 100644
--- a/config/module/sys_io_latency.json
+++ b/config/module/sys_io_latency.json
@@ -12,6 +12,7 @@
"entity_name": "block",
"enable": true,
"description": "Block I/O latency performance is deteriorating!",
+ "description-zh": "Block层I/O操作时延性能劣化",
"params": {
"look_back": 20,
"obs_size": 25,
@@ -51,42 +52,50 @@
{
"metric": "gala_gopher_block_latency_driver_max",
"priority": 4,
- "description": "The system I/O performance deteriorates due to a drive failure.(Disk = {})"
+ "description": "The system I/O performance deteriorates due to a drive failure.(Disk = {})",
+ "description-zh": "驱动异常引起系统I/O性能劣化Disk={}"
},
{
"metric": "gala_gopher_block_latency_device_max",
"priority": 3,
- "description": "Degraded system I/O performance due to device (disk) failure.(Disk = {})"
+ "description": "Degraded system I/O performance due to device (disk) failure.(Disk = {})",
+ "description-zh": "设备磁盘异常引起系统I/O性能劣化Disk={}"
},
{
"metric": "gala_gopher_block_read_bytes",
"priority": 2,
- "description": "System performance deteriorates due to frequent read I/O operations.(Disk = {})"
+ "description": "System performance deteriorates due to frequent read I/O operations.(Disk = {})",
+ "description-zh": "频繁读操作引起系统I/O性能劣化Disk={}"
},
{
"metric": "gala_gopher_block_write_bytes",
"priority": 2,
- "description": "System performance deteriorates due to frequent write I/O operations.(Disk = {})"
+ "description": "System performance deteriorates due to frequent write I/O operations.(Disk = {})",
+ "description-zh": "频繁写操作引起系统I/O性能劣化Disk={}"
},
{
"metric": "gala_gopher_proc_less_4k_io_read",
"priority": 0,
- "description": "System performance degrades due to frequent small I/O read operations.(Disk = {}, PID = {}, comm = {})"
+ "description": "System performance degrades due to frequent small I/O read operations.(Disk = {}, PID = {}, comm = {})",
+ "description-zh": "频繁小数据量小于4KB读操作引起系统I/O性能劣化Disk={}PID={}comm={}"
},
{
"metric": "gala_gopher_proc_less_4k_io_write",
"priority": 0,
- "description": "System performance degrades due to frequent small I/O write operations.(Disk = {}, PID = {}, comm = {})"
+ "description": "System performance degrades due to frequent small I/O write operations.(Disk = {}, PID = {}, comm = {})",
+ "description-zh": "频繁小数据量小于4KB写操作引起系统I/O性能劣化Disk={}PID={}comm={}"
},
{
"metric": "gala_gopher_proc_greater_4k_io_read",
"priority": 1,
- "description": "System performance degrades due to frequent big I/O read operations.(Disk = {}, PID = {}, comm = {})"
+ "description": "System performance degrades due to frequent big I/O read operations.(Disk = {}, PID = {}, comm = {})",
+ "description-zh": "频繁大数据量大于4KB读操作引起系统I/O性能劣化Disk={}PID={}comm={}"
},
{
"metric": "gala_gopher_proc_greater_4k_io_write",
"priority": 1,
- "description": "System performance degrades due to frequent big I/O write operations.(Disk = {}, PID = {}, comm = {})"
+ "description": "System performance degrades due to frequent big I/O write operations.(Disk = {}, PID = {}, comm = {})",
+ "description-zh": "频繁大数据量大于4KB写操作引起系统I/O性能劣化Disk={}PID={}comm={}"
}
]
}
\ No newline at end of file
diff --git a/config/module/sys_nic_loss.json b/config/module/sys_nic_loss.json
index 793f82f..8a1feb8 100644
--- a/config/module/sys_nic_loss.json
+++ b/config/module/sys_nic_loss.json
@@ -11,7 +11,8 @@
"kpi_type": "",
"entity_name": "nic",
"enable": true,
- "description": "TC发送丢包数异常",
+ "description": "TC sent dropped packets",
+ "description-zh": "TC发送丢包数异常",
"params": {
"look_back": 2,
"th": 1
@@ -22,32 +23,38 @@
{
"metric": "gala_gopher_nic_tx_dropped",
"priority": 0,
- "description": "网卡发送丢弃的数据包数异常。(dev_name = {})"
+ "description": "The number of lost packets sent by the nic card are increasing and the NIC performance deteriorates.(dev_name = {})",
+ "description-zh": "网卡发送丢弃的数据包数增加导致网卡性能劣化dev_name={}"
},
{
"metric": "gala_gopher_nic_rx_dropped",
"priority": 0,
- "description": "网卡接收丢弃的数据包数异常。(dev_name = {})"
+ "description": "The number of lost packets received by the nic card are increasing and the NIC performance deteriorates.(dev_name = {})",
+ "description-zh": "网卡接收丢弃的数据包数增加导致网卡性能劣化dev_name={}"
},
{
"metric": "gala_gopher_tcp_link_sk_drops",
"priority": 3,
- "description": "Packets are lost in the host protocol stack due to unknown causes, and the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
+ "description": "Packets are lost in the host protocol stack due to unknown causes, and the NIC performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})",
+ "description-zh": "由于未知原因数据包在主机协议栈中丢失导致网卡性能劣化PID={}client IP={}Server IP={}Port={}"
},
{
"metric": "gala_gopher_tcp_link_retran_packets",
"priority": 1,
- "description": "TCP retransmission is triggered due to network faults, resulting in TCP performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
+ "description": "TCP retransmission is triggered due to network faults, resulting in the NIC performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})",
+ "description-zh": "网络故障触发TCP重传导致网卡性能下降PID={}client IP={}Server IP={}Port={}"
},
{
"metric": "gala_gopher_tcp_link_lost_out",
"priority": 3,
- "description": "The network may be congested, causing abnormal TCP packet loss and performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
+ "description": "The network may be congested, causing abnormal NIC packet loss and performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})",
+ "description-zh": "网络拥塞导致网卡异常丢包性能劣化PID={}client IP={}Server IP={}Port={}"
},
{
"metric": "gala_gopher_tcp_link_notsent_bytes",
"priority": 4,
- "description": "Due to network delay or peer application performance, too many packets to be sent are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})"
+ "description": "Due to network delay or peer application performance, too many packets to be sent are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})",
+ "description-zh": "由于网络延迟或对端应用程序性能滑动窗口中累积了太多要发送的数据包导致网卡性能劣化PID={}client IP={}Server IP={}Port={}"
}
]
}
\ No newline at end of file
diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json
index 2c158c0..7cd2369 100644
--- a/config/module/sys_tcp_establish.json
+++ b/config/module/sys_tcp_establish.json
@@ -12,6 +12,7 @@
"entity_name": "tcp_link",
"enable": true,
"description": "RTT of syn packet(us): the max syn packets rtt is {:.0f} us",
+ "description-zh": "SYN数据包时延异常最大SYN数据包时延为{:.0f}us。",
"params": {
"look_back": 30,
"outlier_ratio_th": 0.5,
@@ -24,7 +25,8 @@
{
"metric": "gala_gopher_endpoint_retran_synacks",
"priority": 0,
- "description": "TCP established performance deteriorates due to loss of SYN/ACK packets.(PID = {}, TCP Listen Port = {})"
+ "description": "TCP established performance deteriorates due to loss of SYN/ACK packets.(PID = {}, TCP Listen Port = {})",
+ "description-zh": "由于SYN/ACK数据包丢失TCP建链性能劣化PID={}TCP Listen Port={}"
}
]
}
\ No newline at end of file
diff --git a/config/module/sys_tcp_transmission_latency.json b/config/module/sys_tcp_transmission_latency.json
index d9e7f80..0527487 100644
--- a/config/module/sys_tcp_transmission_latency.json
+++ b/config/module/sys_tcp_transmission_latency.json
@@ -12,10 +12,11 @@
"entity_name": "tcp_link",
"enable": true,
"description": "Smoothed Round Trip Time(us)",
+ "description-zh": "TCP链接往返时延异常性能劣化",
"params": {
"look_back": 20,
"obs_size": 25,
- "n": 4,
+ "n": 3,
"outlier_ratio_th": 0.4,
"smooth_params": {
"method": "conv_smooth",
@@ -52,57 +53,68 @@
{
"metric": "gala_gopher_tcp_link_notsent_bytes",
"priority": 4,
- "description": "Due to network delay or peer application performance, too many packets to be sent are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})"
+ "description": "Due to network delay or peer application performance, too many packets to be sent are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})",
+ "description-zh": "由于网络延迟或对端应用程序性能滑动窗口中累积了太多要发送的数据包导致TCP传输性能劣化PID={}client IP={}Server IP={}Port={}"
},
{
"metric": "gala_gopher_tcp_link_notack_bytes",
"priority": 4,
- "description": "Due to network delay or peer application performance, too many NO ACK packets are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})"
+ "description": "Due to network delay or peer application performance, too many NO ACK packets are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})",
+ "description-zh": "由于网络延迟或对端应用程序性能滑动窗口中累积了过多的NO ACK数据包导致TCP传输性能劣化PID={}client IP={}Server IP={}Port={}"
},
{
"metric": "gala_gopher_tcp_link_snd_wnd",
"priority": 4,
- "description": "The TCP send window is abnormal due to peer application performance or network congestion. As a result, the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
+ "description": "The TCP send window is abnormal due to peer application performance or network congestion. As a result, the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})",
+ "description-zh": "对端应用性能或网络拥塞导致TCP发送窗口异常导致TCP传输性能劣化PID={}client IP={}Server IP={}Port={}"
},
{
"metric": "gala_gopher_tcp_link_rcv_wnd",
"priority": 4,
- "description": "The TCP receive window becomes abnormal due to the local application performance. As a result, the performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
+ "description": "The TCP receive window becomes abnormal due to the local application performance. As a result, the performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})",
+ "description-zh": "本地应用性能导致TCP接收窗口异常传输性能变差PID={}client IP={}Server IP={}Port={}"
},
{
"metric": "gala_gopher_tcp_link_avl_snd_wnd",
"priority": 4,
- "description": "The available TCP send window may be abnormal due to network congestion and the performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
+ "description": "The available TCP send window may be abnormal due to network congestion and the performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})",
+ "description-zh": "可用的TCP发送窗口可能因网络拥塞而异常传输性能劣化PID={}client IP={}Server IP={}Port={}"
},
{
"metric": "gala_gopher_tcp_link_lost_out",
"priority": 3,
- "description": "The network may be congested, causing abnormal TCP packet loss and performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
+ "description": "The network may be congested, causing abnormal TCP packet loss and performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})",
+ "description-zh": "网络可能拥塞导致TCP异常丢包传输性能劣化PID={}client IP={}Server IP={}Port={}"
},
{
"metric": "gala_gopher_tcp_link_sk_drops",
"priority": 3,
- "description": "Packets are lost in the host protocol stack due to unknown causes, and the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
+ "description": "Packets are lost in the host protocol stack due to unknown causes, and the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})",
+ "description-zh": "主机协议栈不明原因丢包导致TCP传输性能劣化PID={}client IP={}Server IP={}Port={}"
},
{
"metric": "gala_gopher_tcp_link_retran_packets",
"priority": 1,
- "description": "TCP retransmission is triggered due to network faults, resulting in TCP performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
+ "description": "TCP retransmission is triggered due to network faults, resulting in TCP performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})",
+ "description-zh": "网络故障触发TCP重传导致TCP传输性能劣化PID={}client IP={}Server IP={}Port={}"
},
{
"metric": "gala_gopher_tcp_link_backlog_drops",
"priority": 0,
- "description": "TCP backlog overflows due to local application performance. As a result, TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
+ "description": "TCP backlog overflows due to local application performance. As a result, TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})",
+ "description-zh": "由于本地应用程序性能问题TCP积压溢出导致TCP传输性能劣化PID={}client IP={}Server IP={}Port={}"
},
{
"metric": "gala_gopher_tcp_link_sacked_out",
"priority": 2,
- "description": "TCP performance deteriorates due to network out-of-order. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
+ "description": "TCP performance deteriorates due to network out-of-order. (PID ={}, client IP = {}, Server IP = {}, Port = {})",
+ "description-zh": "网络乱序导致TCP传输性能劣化PID={}client IP={}Server IP={}Port={}"
},
{
"metric": "gala_gopher_tcp_link_sk_backlog_size",
"priority": 0,
- "description": "The TCP backlog queue length is abnormal due to the local application performance. As a result, the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
+ "description": "The TCP backlog queue length is abnormal due to the local application performance. As a result, the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})",
+ "description-zh": "本地应用性能导致TCP backlog队列长度异常TCP传输性能劣化PID={}client IP={}Server IP={}Port={}"
}
]
}
\ No newline at end of file
--
2.33.0


@ -1,81 +0,0 @@
From 2ef581e4960dd0ba49bbe371496933841da001fe Mon Sep 17 00:00:00 2001
From: lizhenxing11 <lizhenxing11@huawei.com>
Date: Mon, 9 Jan 2023 15:08:01 +0800
Subject: [PATCH] add systemd service for anteater
add manifest.in
---
MANIFEST.in | 11 +++++++++++
service/gala-anteater.service | 12 ++++++++++++
setup.py | 5 +++--
3 files changed, 26 insertions(+), 2 deletions(-)
create mode 100644 MANIFEST.in
create mode 100644 service/gala-anteater.service
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..7120af9
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,11 @@
+include LICENSE
+include README.en.md
+include README.md
+include requirements.txt
+
+recursive-include service *
+recursive-include tests *
+recursive-include docs *
+
+recursive-exclude * __pycache__
+recursive-exclude * *.py[co]
\ No newline at end of file
diff --git a/service/gala-anteater.service b/service/gala-anteater.service
new file mode 100644
index 0000000..24af354
--- /dev/null
+++ b/service/gala-anteater.service
@@ -0,0 +1,12 @@
+[Unit]
+Description=A-Ops gala-anteater service
+After=network.target
+
+[Service]
+Type=exec
+ExecStart=/usr/bin/gala-anteater
+Restart=on-failure
+RestartSec=1
+
+[Install]
+WantedBy=multi-user.target
\ No newline at end of file
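
With the unit file above installed under /usr/lib/systemd/system/ (see the setup.py data_files change below), the daemon can be managed with the usual systemctl start/stop/enable gala-anteater commands.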
diff --git a/setup.py b/setup.py
index 4471a0f..e075391 100644
--- a/setup.py
+++ b/setup.py
@@ -23,11 +23,12 @@ setup(
description="Times Series Anomaly Detection Platform on Operating System",
url="https://gitee.com/openeuler/A-Ops/tree/master/gala-anteater",
keywords=["Anomaly Detection", "Time Series Analysis", "Operating System"],
- packages=find_packages(where="."),
+ packages=find_packages(where=".", exclude=("tests",)),
data_files=[
('/etc/gala-anteater/config/', glob('config/gala-anteater.yaml')),
('/etc/gala-anteater/config/', glob('config/log.settings.ini')),
('/etc/gala-anteater/config/module/', glob('config/module/*')),
+ ('/usr/lib/systemd/system/', glob('service/*')),
],
install_requires=[
"APScheduler",
@@ -42,7 +43,7 @@ setup(
],
entry_points={
"console_scripts": [
- "gala-anteater = anteater.main:main",
+ "gala-anteater=anteater.main:main",
]
}
)
--
2.33.0


@ -1,737 +0,0 @@
From 27bb7cdd80f76bfc7ebb0f3041544740aa2fa91b Mon Sep 17 00:00:00 2001
From: lizhenxing11 <lizhenxing11@huawei.com>
Date: Tue, 10 Jan 2023 15:31:44 +0800
Subject: [PATCH] fix str2enum bug & data query refactor
---
anteater/core/anomaly.py | 10 ++++
anteater/core/kpi.py | 14 ++++++
anteater/model/algorithms/slope.py | 11 +++--
anteater/model/detector/base.py | 20 ++++----
anteater/model/detector/n_sigma_detector.py | 15 +++---
.../model/detector/online_vae_detector.py | 3 +-
.../tcp_establish_n_sigma_detector.py | 3 +-
.../tcp_trans_latency_n_sigma_detector.py | 48 +++++++++++++++++--
anteater/model/detector/th_base_detector.py | 3 +-
anteater/module/app/app_sli_detector.py | 4 +-
anteater/module/sys/disk_throughput.py | 4 +-
anteater/module/sys/proc_io_latency.py | 4 +-
anteater/module/sys/sys_io_latency.py | 4 +-
.../module/sys/tcp_transmission_latency.py | 4 +-
.../module/sys/tcp_transmission_throughput.py | 4 +-
anteater/source/metric_loader.py | 41 +++++++++++++++-
anteater/utils/data_load.py | 4 +-
config/module/app_sli_rtt.json | 6 ++-
config/module/disk_throughput.json | 6 ++-
config/module/proc_io_latency.json | 15 ++++--
config/module/sys_io_latency.json | 2 +-
config/module/sys_tcp_establish.json | 2 +-
.../module/sys_tcp_transmission_latency.json | 4 +-
23 files changed, 172 insertions(+), 59 deletions(-)
diff --git a/anteater/core/anomaly.py b/anteater/core/anomaly.py
index 45c4fc3..fdee3d1 100644
--- a/anteater/core/anomaly.py
+++ b/anteater/core/anomaly.py
@@ -52,3 +52,13 @@ class AnomalyTrend(Enum):
DEFAULT = 0
RISE = 1
FALL = 2
+
+ @staticmethod
+ def from_str(label: str):
+ """Trans str to Enum type"""
+ if label.upper() == 'RISE':
+ return AnomalyTrend.RISE
+ elif label.upper() == 'FALL':
+ return AnomalyTrend.FALL
+ else:
+ return AnomalyTrend.DEFAULT
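
The mapping in action, matching the "atrend": "rise" entries added to the module JSON files:

assert AnomalyTrend.from_str('rise') is AnomalyTrend.RISE
assert AnomalyTrend.from_str('FALL') is AnomalyTrend.FALL
assert AnomalyTrend.from_str('anything-else') is AnomalyTrend.DEFAULT  # fallback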
diff --git a/anteater/core/kpi.py b/anteater/core/kpi.py
index f83b666..70cc9ee 100644
--- a/anteater/core/kpi.py
+++ b/anteater/core/kpi.py
@@ -27,6 +27,13 @@ class KPI:
params: dict = field(default=dict)
atrend: AnomalyTrend = AnomalyTrend.DEFAULT
+ @classmethod
+ def from_dict(cls, **data):
+ if 'atrend' in data:
+ data['atrend'] = AnomalyTrend.from_str(data.get('atrend'))
+
+ return cls(**data)
+
@dataclass
class Feature:
@@ -35,6 +42,13 @@ class Feature:
priority: int = 0
atrend: AnomalyTrend = AnomalyTrend.DEFAULT
+ @classmethod
+ def from_dict(cls, **data):
+ if 'atrend' in data:
+ data['atrend'] = AnomalyTrend.from_str(data.get('atrend'))
+
+ return cls(**data)
+
@dataclass
class ModelConfig:
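
A sketch of how a module JSON entry now reaches the dataclasses; the field values are taken from disk_throughput.json and the conversion relies on AnomalyTrend.from_str above:

conf = {
    "metric": "gala_gopher_disk_rspeed_kB",
    "priority": 0,
    "description": "The disk I/O await time performance deteriorates due to read throughput rise (read kbytes/second).(Disk = {})",
    "atrend": "rise",
}
feature = Feature.from_dict(**conf)
assert feature.atrend is AnomalyTrend.RISE  # str converted before __init__ runs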
diff --git a/anteater/model/algorithms/slope.py b/anteater/model/algorithms/slope.py
index d324d58..e546183 100644
--- a/anteater/model/algorithms/slope.py
+++ b/anteater/model/algorithms/slope.py
@@ -17,6 +17,7 @@ import numpy as np
from anteater.core.anomaly import AnomalyTrend
from anteater.model.algorithms.smooth import conv_smooth
+from anteater.utils.common import divide
def slope(y, win_len):
@@ -36,13 +37,15 @@ def smooth_slope(time_series, windows_length):
def trend(y, win_len=None):
"""Gets the trend for the y"""
+ y = conv_smooth(y, box_pts=7)
+
if not win_len:
win_len = len(y) // 2
- if np.mean(y[:win_len]) < np.mean(y[-win_len:]):
+ if divide(np.mean(y[:win_len]), np.mean(y[-win_len:])) < 0.9:
return 1
- elif np.mean(y[:win_len]) > np.mean(y[-win_len:]):
+ elif divide(np.mean(y[:win_len]), np.mean(y[-win_len:])) > 1.1:
return -1
else:
@@ -51,10 +54,10 @@ def trend(y, win_len=None):
def check_trend(values: List[float], atrend: AnomalyTrend):
"""Checks the values with an 'atrend' trend"""
- if atrend == AnomalyTrend.RISE and trend(values) < 0:
+ if atrend == AnomalyTrend.RISE and trend(values) != 1:
return False
- if atrend == AnomalyTrend.FALL and trend(values) > 0:
+ if atrend == AnomalyTrend.FALL and trend(values) != -1:
return False
return True
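
A compact restatement of the new trend test, assuming divide(a, b) is a zero-guarded a / b (the series is first smoothed with conv_smooth(y, box_pts=7)):

import numpy as np

def trend_sketch(y, win_len=None):
    if not win_len:
        win_len = len(y) // 2
    head, tail = np.mean(y[:win_len]), np.mean(y[-win_len:])
    ratio = head / tail if tail else 0   # stand-in for divide(head, tail)
    if ratio < 0.9:
        return 1     # tail mean clearly above head mean: rising
    if ratio > 1.1:
        return -1    # tail mean clearly below head mean: falling
    return 0         # within +/-10 percent: treated as flat

Note the tightened filter: check_trend now requires trend(...) == 1 exactly for a RISE KPI (and == -1 for FALL), so a flat series no longer passes.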
diff --git a/anteater/model/detector/base.py b/anteater/model/detector/base.py
index 2b2dafe..a23b6d9 100644
--- a/anteater/model/detector/base.py
+++ b/anteater/model/detector/base.py
@@ -11,6 +11,7 @@
# See the Mulan PSL v2 for more details.
# ******************************************************************************/
+import logging
import math
from abc import abstractmethod
from typing import List
@@ -39,12 +40,6 @@ class Detector:
"""Executes anomaly detection on kpis"""
pass
- def get_unique_machine_id(self, start, end, kpis: List[KPI]) -> List[str]:
- """Gets unique machine ids during past minutes"""
- metrics = [_kpi.metric for _kpi in kpis]
- machine_ids = self.data_loader.get_unique_machines(start, end, metrics)
- return machine_ids
-
def execute(self, job_config: JobConfig) -> List[Anomaly]:
"""The main function of the detector"""
kpis = job_config.kpis
@@ -56,6 +51,12 @@ class Detector:
return self._execute(kpis, features, top_n=n)
+ def get_unique_machine_id(self, start, end, kpis: List[KPI]) -> List[str]:
+ """Gets unique machine ids during past minutes"""
+ metrics = [_kpi.metric for _kpi in kpis]
+ machine_ids = self.data_loader.get_unique_machines(start, end, metrics)
+ return machine_ids
+
def find_root_causes(self, anomalies: List[Anomaly], features: List[Feature], top_n=3)\
-> List[Anomaly]:
"""Finds root causes for each anomaly events"""
@@ -82,6 +83,7 @@ class Detector:
tmp_ts_scores = self.cal_anomaly_score(f.metric, f.description, machine_id=machine_id)
for _ts_score in tmp_ts_scores:
if not check_trend(_ts_score.ts.values, f.atrend):
+ logging.info(f"Trends Filtered: {f.metric}")
_ts_score.score = 0
if same_intersection_key_value(_ts_score.ts.labels, filters):
ts_scores.append(_ts_score)
@@ -101,6 +103,7 @@ class Detector:
for _ts_s in ts_scores:
if same_intersection_key_value(_ts_s.ts.labels, anomaly.labels):
if not check_trend(_ts_s.ts.values, kpi_atrends[anomaly.metric]):
+ logging.info(f"Trends Filtered: {anomaly.metric}")
anomaly.score = 0
else:
anomaly.score = _ts_s.score
@@ -115,12 +118,11 @@ class Detector:
machine_id: str)\
-> List[TimeSeriesScore]:
"""Calculates metric anomaly scores based on sr model"""
- start, end = dt.last(minutes=6)
+ start, end = dt.last(minutes=10)
point_count = self.data_loader.expected_point_length(start, end)
model = SpectralResidual(12, 24, 50)
ts_scores = []
- ts_list = self.data_loader.\
- get_metric(start, end, metric, label_name='machine_id', label_value=machine_id)
+ ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id)
for _ts in ts_list:
if sum(_ts.values) == 0 or \
len(_ts.values) < point_count * 0.9 or\
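
For context, SpectralResidual scores points by a saliency map built in the frequency domain; a minimal sketch of the general idea (the constructor arguments 12, 24, 50 above are model-specific windows and are not reproduced here):

import numpy as np

def sr_saliency(values, q=3):
    fft = np.fft.fft(values)
    log_amp = np.log(np.clip(np.abs(fft), 1e-8, None))
    # Spectral residual: log amplitude spectrum minus its local moving average.
    residual = log_amp - np.convolve(log_amp, np.ones(q) / q, mode='same')
    # Back to the time domain; large values mark salient (anomalous) points.
    return np.abs(np.fft.ifft(np.exp(residual + 1j * np.angle(fft))))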
diff --git a/anteater/model/detector/n_sigma_detector.py b/anteater/model/detector/n_sigma_detector.py
index 3a2ab01..dbf83c6 100644
--- a/anteater/model/detector/n_sigma_detector.py
+++ b/anteater/model/detector/n_sigma_detector.py
@@ -29,10 +29,9 @@ from anteater.utils.log import logger
class NSigmaDetector(Detector):
"""The three sigma anomaly detector"""
- def __init__(self, data_loader: MetricLoader, method: str):
+ def __init__(self, data_loader: MetricLoader):
"""The detector base class initializer"""
super().__init__(data_loader)
- self.method = method
def detect_kpis(self, kpis: List[KPI]):
"""Executes anomaly detection on kpis"""
@@ -48,7 +47,7 @@ class NSigmaDetector(Detector):
def detect_signal_kpi(self, kpi, machine_id: str) -> List[Anomaly]:
"""Detects kpi based on signal time series anomaly detection model"""
outlier_ratio_th = kpi.params['outlier_ratio_th']
- ts_scores = self.calculate_metric_three_sigma_score(
+ ts_scores = self.calculate_n_sigma_score(
kpi.metric, kpi.description, machine_id, **kpi.params)
if not ts_scores:
logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!')
@@ -68,17 +67,17 @@ class NSigmaDetector(Detector):
return anomalies
- def calculate_metric_three_sigma_score(self, metric, description, machine_id: str, **kwargs)\
+ def calculate_n_sigma_score(self, metric, description, machine_id: str, **kwargs)\
-> List[TimeSeriesScore]:
"""Calculate kpi anomaly scores based on three sigma scores"""
+ method = kwargs.get('method', 'abs')
look_back = kwargs.get('look_back')
smooth_params = kwargs.get('smooth_params')
obs_size = kwargs.get('obs_size')
n = kwargs.get('n', 3)
start, end = dt.last(minutes=look_back)
point_count = self.data_loader.expected_point_length(start, end)
- ts_list = self.data_loader.\
- get_metric(start, end, metric, label_name='machine_id', label_value=machine_id)
+ ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id)
ts_scores = []
for _ts in ts_list:
dedup_values = [k for k, g in groupby(_ts.values)]
@@ -87,12 +86,12 @@ class NSigmaDetector(Detector):
len(_ts.values) > point_count * 1.5 or \
all(x == _ts.values[0] for x in _ts.values):
ratio = 0
- elif len(dedup_values) < point_count * 0.3:
+ elif len(dedup_values) < point_count * 0.6:
ratio = 0
else:
smoothed_val = smoothing(_ts.values, **smooth_params)
outlier, mean, std = n_sigma(
- smoothed_val, obs_size=obs_size, n=n, method=self.method)
+ smoothed_val, obs_size=obs_size, n=n, method=method)
ratio = divide(len(outlier), obs_size)
ts_scores.append(TimeSeriesScore(ts=_ts, score=ratio, description=description))
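
A plausible reading of the n_sigma call, assuming the new per-KPI 'method' selects which side of the band counts as an outlier ('max' for rise-only KPIs such as latency, 'min' for drop-only KPIs such as tps, 'abs' for both sides):

import numpy as np

def n_sigma_sketch(values, obs_size, n=3, method='abs'):
    # Assumes the series is longer than obs_size so a baseline exists.
    train = np.array(values[:-obs_size])   # history establishes the baseline
    obs = np.array(values[-obs_size:])     # recent window under test
    mean, std = train.mean(), train.std()
    if method == 'max':
        outlier = obs[obs > mean + n * std]
    elif method == 'min':
        outlier = obs[obs < mean - n * std]
    else:
        outlier = obs[np.abs(obs - mean) > n * std]
    return outlier, mean, std

The caller then takes ratio = divide(len(outlier), obs_size) and reports an anomaly once the ratio exceeds the KPI's outlier_ratio_th.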
diff --git a/anteater/model/detector/online_vae_detector.py b/anteater/model/detector/online_vae_detector.py
index 63a7b09..0f91576 100644
--- a/anteater/model/detector/online_vae_detector.py
+++ b/anteater/model/detector/online_vae_detector.py
@@ -110,8 +110,7 @@ class OnlineVAEDetector(Detector):
metric_dfs = []
for metric in metrics:
_ts_list = self.data_loader.\
- get_metric(start, end, metric, label_name="machine_id",
- label_value=machine_id, operator_name='avg')
+ get_metric(start, end, metric, operator='avg', keys="machine_id", machine_id=machine_id)
if len(_ts_list) > 1:
raise ValueError(f'Got multiple time_series based on machine id: {len(_ts_list)}')
diff --git a/anteater/model/detector/tcp_establish_n_sigma_detector.py b/anteater/model/detector/tcp_establish_n_sigma_detector.py
index 82d7837..3720069 100644
--- a/anteater/model/detector/tcp_establish_n_sigma_detector.py
+++ b/anteater/model/detector/tcp_establish_n_sigma_detector.py
@@ -73,8 +73,7 @@ class TcpEstablishNSigmaDetector(Detector):
min_rtt = kpi.params.get('min_rtt')
start, end = dt.last(minutes=look_back)
- ts_list = self.data_loader.\
- get_metric(start, end, kpi.metric, label_name='machine_id', label_value=machine_id)
+ ts_list = self.data_loader.get_metric(start, end, kpi.metric, machine_id=machine_id)
anomalies = []
for _ts in ts_list:
diff --git a/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py b/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py
index 1eeb95f..6d41775 100644
--- a/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py
+++ b/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py
@@ -11,20 +11,61 @@
# See the Mulan PSL v2 for more details.
# ******************************************************************************/
+from itertools import groupby
from typing import List
+import numpy as np
+
from anteater.core.time_series import TimeSeriesScore
+from anteater.model.algorithms.smooth import smoothing
+from anteater.model.algorithms.three_sigma import n_sigma
from anteater.model.detector.n_sigma_detector import NSigmaDetector
from anteater.source.metric_loader import MetricLoader
+from anteater.utils.common import divide
from anteater.utils.datetime import DateTimeManager as dt
class TcpTransLatencyNSigmaDetector(NSigmaDetector):
"""The three sigma anomaly detector"""
- def __init__(self, data_loader: MetricLoader, method: str):
+ def __init__(self, data_loader: MetricLoader):
"""The detector base class initializer"""
- super().__init__(data_loader, method)
+ super().__init__(data_loader)
+
+ def calculate_n_sigma_score(self, metric, description, machine_id: str, **kwargs)\
+ -> List[TimeSeriesScore]:
+ """Calculates anomaly scores based on n sigma scores"""
+ method = kwargs.get('method', 'abs')
+ look_back = kwargs.get('look_back')
+ smooth_params = kwargs.get('smooth_params')
+ obs_size = kwargs.get('obs_size')
+ min_srtt = kwargs.get("min_srtt")
+ n = kwargs.get('n', 3)
+ start, end = dt.last(minutes=look_back)
+ point_count = self.data_loader.expected_point_length(start, end)
+ ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id)
+ ts_scores = []
+ for _ts in ts_list:
+ dedup_values = [k for k, g in groupby(_ts.values)]
+ if sum(_ts.values) == 0 or \
+ len(_ts.values) < point_count * 0.6 or \
+ len(_ts.values) > point_count * 1.5 or \
+ all(x == _ts.values[0] for x in _ts.values):
+ ratio = 0
+ elif len(dedup_values) < point_count * 0.6:
+ ratio = 0
+ else:
+ smoothed_val = smoothing(_ts.values, **smooth_params)
+ outlier, mean, std = n_sigma(
+ smoothed_val, obs_size=obs_size, n=n, method=method)
+ if outlier and np.average(outlier) <= min_srtt:
+ ratio = 0
+ else:
+ ratio = divide(len(outlier), obs_size)
+
+ ts_scores.append(TimeSeriesScore(ts=_ts, score=ratio, description=description))
+
+ return ts_scores
def cal_anomaly_score(self, metric, description, machine_id: str) \
-> List[TimeSeriesScore]:
@@ -32,8 +73,7 @@ class TcpTransLatencyNSigmaDetector(NSigmaDetector):
start, end = dt.last(minutes=2)
point_count = self.data_loader.expected_point_length(start, end)
ts_scores = []
- ts_list = self.data_loader. \
- get_metric(start, end, metric, label_name='machine_id', label_value=machine_id)
+ ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id)
for _ts in ts_list:
if sum(_ts.values) == 0 or \
len(_ts.values) < point_count * 0.5 or \
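
The min_srtt floor added above suppresses statistically significant but absolutely small latency spikes; sys_tcp_transmission_latency.json later in this patch sets it to 20000 (microseconds, i.e. 20 ms). A small illustration:

import numpy as np

min_srtt = 20000                        # from the job config, in microseconds
outlier = np.array([1500.0, 1800.0])    # n-sigma outliers, tiny in absolute terms
ratio = 0 if outlier.size and np.average(outlier) <= min_srtt else outlier.size / 25
# ratio stays 0 here: a jump to 1.8 ms is still below the 20 ms alarm floor.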
diff --git a/anteater/model/detector/th_base_detector.py b/anteater/model/detector/th_base_detector.py
index bec9705..0af4f22 100644
--- a/anteater/model/detector/th_base_detector.py
+++ b/anteater/model/detector/th_base_detector.py
@@ -44,8 +44,7 @@ class ThBaseDetector(Detector):
look_back = kpi.params.get('look_back')
th = kpi.params.get('th')
start, end = dt.last(minutes=look_back)
- ts_list = self.data_loader.\
- get_metric(start, end, kpi.metric, label_name='machine_id', label_value=machine_id)
+ ts_list = self.data_loader.get_metric(start, end, kpi.metric, machine_id=machine_id)
if not ts_list:
logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!')
diff --git a/anteater/module/app/app_sli_detector.py b/anteater/module/app/app_sli_detector.py
index 102ed11..e506332 100644
--- a/anteater/module/app/app_sli_detector.py
+++ b/anteater/module/app/app_sli_detector.py
@@ -44,12 +44,12 @@ class APPSliDetector(E2EDetector):
def init_detectors(self, data_loader):
if self.job_config.model_config.enable:
detectors = [
- NSigmaDetector(data_loader, method='min'),
+ NSigmaDetector(data_loader),
OnlineVAEDetector(data_loader, self.job_config.model_config)
]
else:
detectors = [
- NSigmaDetector(data_loader, method='min')
+ NSigmaDetector(data_loader)
]
return detectors
diff --git a/anteater/module/sys/disk_throughput.py b/anteater/module/sys/disk_throughput.py
index 9a192fb..7971505 100644
--- a/anteater/module/sys/disk_throughput.py
+++ b/anteater/module/sys/disk_throughput.py
@@ -38,12 +38,12 @@ class DiskThroughputDetector(E2EDetector):
def init_detectors(self, data_loader):
if self.job_config.model_config.enable:
detectors = [
- NSigmaDetector(data_loader, method='max'),
+ NSigmaDetector(data_loader),
OnlineVAEDetector(data_loader, self.job_config.model_config)
]
else:
detectors = [
- NSigmaDetector(data_loader, method='max')
+ NSigmaDetector(data_loader)
]
return detectors
diff --git a/anteater/module/sys/proc_io_latency.py b/anteater/module/sys/proc_io_latency.py
index a34c48d..b76acea 100644
--- a/anteater/module/sys/proc_io_latency.py
+++ b/anteater/module/sys/proc_io_latency.py
@@ -38,12 +38,12 @@ class ProcIOLatencyDetector(E2EDetector):
def init_detectors(self, data_loader):
if self.job_config.model_config.enable:
detectors = [
- NSigmaDetector(data_loader, method='max'),
+ NSigmaDetector(data_loader),
OnlineVAEDetector(data_loader, self.job_config.model_config)
]
else:
detectors = [
- NSigmaDetector(data_loader, method='max')
+ NSigmaDetector(data_loader)
]
return detectors
diff --git a/anteater/module/sys/sys_io_latency.py b/anteater/module/sys/sys_io_latency.py
index a6f01c2..17a34c9 100644
--- a/anteater/module/sys/sys_io_latency.py
+++ b/anteater/module/sys/sys_io_latency.py
@@ -38,12 +38,12 @@ class SysIOLatencyDetector(E2EDetector):
def init_detectors(self, data_loader):
if self.job_config.model_config.enable:
detectors = [
- NSigmaDetector(data_loader, method='abs'),
+ NSigmaDetector(data_loader),
OnlineVAEDetector(data_loader, self.job_config.model_config)
]
else:
detectors = [
- NSigmaDetector(data_loader, method='abs')
+ NSigmaDetector(data_loader)
]
return detectors
diff --git a/anteater/module/sys/tcp_transmission_latency.py b/anteater/module/sys/tcp_transmission_latency.py
index cf0f406..e085ec3 100644
--- a/anteater/module/sys/tcp_transmission_latency.py
+++ b/anteater/module/sys/tcp_transmission_latency.py
@@ -39,12 +39,12 @@ class SysTcpTransmissionLatencyDetector(E2EDetector):
def init_detectors(self, data_loader):
if self.job_config.model_config.enable:
detectors = [
- TcpTransLatencyNSigmaDetector(data_loader, method='max'),
+ TcpTransLatencyNSigmaDetector(data_loader),
OnlineVAEDetector(data_loader, self.job_config.model_config)
]
else:
detectors = [
- TcpTransLatencyNSigmaDetector(data_loader, method='max')
+ TcpTransLatencyNSigmaDetector(data_loader)
]
return detectors
diff --git a/anteater/module/sys/tcp_transmission_throughput.py b/anteater/module/sys/tcp_transmission_throughput.py
index 86ecc9e..2921602 100644
--- a/anteater/module/sys/tcp_transmission_throughput.py
+++ b/anteater/module/sys/tcp_transmission_throughput.py
@@ -38,12 +38,12 @@ class SysTcpTransmissionThroughputDetector(E2EDetector):
def init_detectors(self, data_loader):
if self.job_config.model_config.enable:
detectors = [
- NSigmaDetector(data_loader, method='abs'),
+ NSigmaDetector(data_loader),
OnlineVAEDetector(data_loader, self.job_config.model_config)
]
else:
detectors = [
- NSigmaDetector(data_loader, method='abs')
+ NSigmaDetector(data_loader)
]
return detectors
diff --git a/anteater/source/metric_loader.py b/anteater/source/metric_loader.py
index ef2d012..4745d87 100644
--- a/anteater/source/metric_loader.py
+++ b/anteater/source/metric_loader.py
@@ -65,6 +65,43 @@ def get_query(metric: str,
return query
+def get_query2(
+ metric: str, operator: str = None, value: float = None, keys: Union[str, List] = None, **labels):
+ """Gets aggregated query patterns
+
+ Format: [operator]([value,] metric{[**labels]}) by (keys)
+
+ Such as:
+ - 1. gala_gopher_bind_sends{machine_id="1234"}
+ - 2. sum(gala_gopher_bind_sends) by (machine_id)
+ - 3. sum(gala_gopher_bind_sends{machine_id="1234"}) by (machine_id)
+ - 4. quantile(0.7, gala_gopher_bind_sends{machine_id="1234"}) by (machine_id)
+ """
+ if operator and not keys:
+ raise ValueError("Please provide param 'keys' when 'operator' is specified!")
+
+ rule = ""
+ if labels:
+ pairs = ",".join([f"{n}='{v}'" for n, v in labels.items()])
+ rule = f"{{{pairs}}}"
+
+ group = ""
+ if isinstance(keys, list):
+ group = ",".join([k for k in keys])
+ elif isinstance(keys, str):
+ group = keys
+
+ if operator and value:
+ query = f"{operator}({value}, {metric}{rule}) by ({group})"
+ elif operator:
+ query = f"{operator}({metric}{rule}) by ({group})"
+ else:
+ query = f"{metric}{rule}"
+
+ return query
+
+
class MetricLoader:
"""
The metric loader that consumes raw data from PrometheusAdapter,
@@ -87,7 +124,7 @@ class MetricLoader:
:return List of TimeSeries
"""
- query = get_query(metric, **kwargs)
+ query = get_query2(metric, **kwargs)
time_series = self.provider.range_query(start, end, metric, query)
return time_series
@@ -109,7 +146,7 @@ class MetricLoader:
"""Gets unique labels of all metrics"""
unique_labels = set()
for metric in metrics:
- time_series = self.get_metric(start, end, metric, label_name=label_name)
+ time_series = self.get_metric(start, end, metric)
unique_labels.update([item.labels.get(label_name, "") for item in time_series])
return list([lbl for lbl in unique_labels if lbl])
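
Queries the new helper emits, following the docstring patterns above:

get_query2('gala_gopher_bind_sends', machine_id='1234')
#   -> "gala_gopher_bind_sends{machine_id='1234'}"
get_query2('gala_gopher_bind_sends', operator='sum', keys='machine_id')
#   -> "sum(gala_gopher_bind_sends) by (machine_id)"
get_query2('gala_gopher_bind_sends', operator='quantile', value=0.7,
           keys='machine_id', machine_id='1234')
#   -> "quantile(0.7, gala_gopher_bind_sends{machine_id='1234'}) by (machine_id)"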
diff --git a/anteater/utils/data_load.py b/anteater/utils/data_load.py
index 730c9c6..60c28e5 100644
--- a/anteater/utils/data_load.py
+++ b/anteater/utils/data_load.py
@@ -48,8 +48,8 @@ def load_job_config(file_name) -> JobConfig:
keywords = config['keywords']
root_cause_number = config['root_cause_number']
- kpis = [KPI(**update_description(_conf)) for _conf in config['KPI']]
- features = [Feature(**update_description(_conf)) for _conf in config['Features']]
+ kpis = [KPI.from_dict(**update_description(_conf)) for _conf in config['KPI']]
+ features = [Feature.from_dict(**update_description(_conf)) for _conf in config['Features']]
model_config = None
if 'OnlineModel' in config:
diff --git a/config/module/app_sli_rtt.json b/config/module/app_sli_rtt.json
index 0146883..5027b8d 100644
--- a/config/module/app_sli_rtt.json
+++ b/config/module/app_sli_rtt.json
@@ -10,13 +10,14 @@
"metric": "gala_gopher_sli_rtt_nsec",
"kpi_type": "rtt",
"entity_name": "sli",
- "enable": false,
+ "enable": true,
"description": "sli rtt 异常",
"description-zh": "应用级请求往返时延RTT异常",
"params": {
+ "method": "max",
"look_back": 10,
"obs_size": 25,
- "outlier_ratio_th": 0.3,
+ "outlier_ratio_th": 0.5,
"smooth_params": {
"method": "conv_smooth",
"box_pts": 3
@@ -31,6 +32,7 @@
"description": "sli tps 异常",
"description-zh": "应用级请求吞吐量TPS异常",
"params": {
+ "method": "min",
"look_back": 10,
"obs_size": 25,
"outlier_ratio_th": 0.3,
diff --git a/config/module/disk_throughput.json b/config/module/disk_throughput.json
index f6244f6..e3bcf68 100644
--- a/config/module/disk_throughput.json
+++ b/config/module/disk_throughput.json
@@ -14,9 +14,10 @@
"description": "Disk read await time is increasing!",
"description-zh": "磁盘读响应时间升高,性能发生劣化",
"params": {
+ "method": "max",
"look_back": 20,
"obs_size": 25,
- "outlier_ratio_th": 0.3,
+ "outlier_ratio_th": 0.5,
"smooth_params": {
"method": "conv_smooth",
"box_pts": 3
@@ -31,9 +32,10 @@
"description": "Disk write await time is increasing!",
"description-zh": "磁盘写响应时间升高,性能发生劣化",
"params": {
+ "method": "max",
"look_back": 20,
"obs_size": 25,
- "outlier_ratio_th": 0.3,
+ "outlier_ratio_th": 0.5,
"smooth_params": {
"method": "conv_smooth",
"box_pts": 3
diff --git a/config/module/proc_io_latency.json b/config/module/proc_io_latency.json
index f086b87..171c5f4 100644
--- a/config/module/proc_io_latency.json
+++ b/config/module/proc_io_latency.json
@@ -14,9 +14,10 @@
"description": "I/O operation delay at the BIO layer (unit: us)",
"description-zh": "BIO层I/O操作延时高(单位us)",
"params": {
+ "method":"max",
"look_back": 20,
"obs_size": 37,
- "outlier_ratio_th": 0.4,
+ "outlier_ratio_th": 0.5,
"smooth_params": {
"method": "conv_smooth",
"box_pts": 3
@@ -31,9 +32,10 @@
"description": "Number of small I/O (less than 4 KB) read operations at the BIO layer.",
"description-zh": "BIO层小数据I/O读操作数量异常小于4KB",
"params": {
+ "method":"max",
"look_back": 20,
"obs_size": 25,
- "outlier_ratio_th": 0.3,
+ "outlier_ratio_th": 0.4,
"smooth_params": {
"method": "conv_smooth",
"box_pts": 3
@@ -48,9 +50,10 @@
"description": "Number of small I/O (less than 4 KB) write operations at the BIO layer.",
"description-zh": "BIO层小数据I/O写操作数量异常小于4KB",
"params": {
+ "method":"max",
"look_back": 20,
"obs_size": 25,
- "outlier_ratio_th": 0.3,
+ "outlier_ratio_th": 0.4,
"smooth_params": {
"method": "savgol_smooth",
"window_length": 13,
@@ -66,9 +69,10 @@
"description": "Number of big I/O (greater than 4 KB) read operations at the BIO layer.",
"description-zh": "BIO层大数据I/O读操作数量异常大于4KB",
"params": {
+ "method":"max",
"look_back": 20,
"obs_size": 25,
- "outlier_ratio_th": 0.3,
+ "outlier_ratio_th": 0.4,
"smooth_params": {
"method": "conv_smooth",
"box_pts": 3
@@ -83,9 +87,10 @@
"description": "Number of big I/O (greater than 4 KB) write operations at the BIO layer.",
"description-zh": "BIO层大数据写操作数量异常大于4KB",
"params": {
+ "method":"max",
"look_back": 20,
"obs_size": 25,
- "outlier_ratio_th": 0.3,
+ "outlier_ratio_th": 0.4,
"smooth_params": {
"method": "conv_smooth",
"box_pts": 3
diff --git a/config/module/sys_io_latency.json b/config/module/sys_io_latency.json
index bdf17d3..3fa1266 100644
--- a/config/module/sys_io_latency.json
+++ b/config/module/sys_io_latency.json
@@ -16,7 +16,7 @@
"params": {
"look_back": 20,
"obs_size": 25,
- "outlier_ratio_th": 0.3,
+ "outlier_ratio_th": 0.4,
"smooth_params": {
"method": "conv_smooth",
"box_pts": 3
diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json
index 7cd2369..9bd2a46 100644
--- a/config/module/sys_tcp_establish.json
+++ b/config/module/sys_tcp_establish.json
@@ -17,7 +17,7 @@
"look_back": 30,
"outlier_ratio_th": 0.5,
"obs_size": 3,
- "min_rtt": 500000
+ "min_rtt": 100000
}
}
],
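
Lowering min_rtt from 500000 to 100000 microseconds means SYN handshakes slower than 100 ms (rather than 500 ms) can now be flagged, making the establish detector considerably more sensitive.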
diff --git a/config/module/sys_tcp_transmission_latency.json b/config/module/sys_tcp_transmission_latency.json
index 0527487..3ba8113 100644
--- a/config/module/sys_tcp_transmission_latency.json
+++ b/config/module/sys_tcp_transmission_latency.json
@@ -14,10 +14,12 @@
"description": "Smoothed Round Trip Time(us)",
"description-zh": "TCP链接往返时延异常性能劣化",
"params": {
+ "method": "max",
"look_back": 20,
"obs_size": 25,
"n": 3,
- "outlier_ratio_th": 0.4,
+ "min_srtt": 20000,
+ "outlier_ratio_th": 0.6,
"smooth_params": {
"method": "conv_smooth",
"box_pts": 3
--
2.33.0

Binary file not shown.

gala-anteater-1.2.1.tar.gz (new binary file, not shown)

@ -1,8 +1,8 @@
%define debug_package %{nil}
Name: gala-anteater
-Version: 1.0.1
-Release: 4
+Version: 1.2.1
+Release: 1
Summary: A time-series anomaly detection platform for operating system.
License: MulanPSL2
URL: https://gitee.com/openeuler/gala-anteater
@ -11,21 +11,17 @@ BuildRoot: %{_builddir}/%{name}-%{version}
BuildRequires: procps-ng python3-setuptools
Requires: python3-gala-anteater = %{version}-%{release}
-Patch1: Add-disk-throughput-detector.patch
-Patch2: Update-TCP-Establish-Model-Add-Nic-Loss-Detector.patch
-Patch3: add-chinese-descriptions.patch
-Patch4: remove-sys-level-config-param.patch
-Patch5: add-systemd-service-for-anteater.patch
-Patch6: fix-str2enum-bug-data-query-refactor.patch
+patch0: 0001-add-new-feature-slow-node-detection.patch
%description
Abnormal detection module for A-Ops project
%package -n python3-gala-anteater
Summary: Python3 package of gala-anteater
Requires: python3-APScheduler python3-kafka-python python3-joblib python3-numpy
Requires: python3-pandas python3-requests python3-scikit-learn python3-pytorch
-Requires: python3-pyyaml
+Requires: python3-pyyaml python3-networkx python3-pyArango python3-statsmodels
%description -n python3-gala-anteater
Python3 package of gala-anteater
@ -59,16 +55,21 @@ fi
%doc README.md
%license LICENSE
%{_bindir}/gala-anteater
+%config(noreplace) %{_sysconfdir}/%{name}/config/metricinfo.json
%config(noreplace) %{_sysconfdir}/%{name}/config/gala-anteater.yaml
%config(noreplace) %{_sysconfdir}/%{name}/config/log.settings.ini
-%config(noreplace) %{_sysconfdir}/%{name}/config/module/app_sli_rtt.json
-%config(noreplace) %{_sysconfdir}/%{name}/config/module/proc_io_latency.json
-%config(noreplace) %{_sysconfdir}/%{name}/config/module/sys_io_latency.json
-%config(noreplace) %{_sysconfdir}/%{name}/config/module/sys_tcp_establish.json
-%config(noreplace) %{_sysconfdir}/%{name}/config/module/sys_tcp_transmission_latency.json
-%config(noreplace) %{_sysconfdir}/%{name}/config/module/sys_tcp_transmission_throughput.json
-%config(noreplace) %{_sysconfdir}/%{name}/config/module/disk_throughput.json
-%config(noreplace) %{_sysconfdir}/%{name}/config/module/sys_nic_loss.json
+%config(noreplace) %{_sysconfdir}/%{name}/module/app_sli_rtt.job.json
+%config(noreplace) %{_sysconfdir}/%{name}/module/container_disruption.job.json
+%config(noreplace) %{_sysconfdir}/%{name}/module/disk_throughput.job.json
+%config(noreplace) %{_sysconfdir}/%{name}/module/jvm_oom.job.json
+%config(noreplace) %{_sysconfdir}/%{name}/module/proc_io_latency.job.json
+%config(noreplace) %{_sysconfdir}/%{name}/module/rca.job.json
+%config(noreplace) %{_sysconfdir}/%{name}/module/sys_io_latency.job.json
+%config(noreplace) %{_sysconfdir}/%{name}/module/sys_nic_loss.job.json
+%config(noreplace) %{_sysconfdir}/%{name}/module/sys_tcp_establish.job.json
+%config(noreplace) %{_sysconfdir}/%{name}/module/sys_tcp_transmission_latency.job.json
+%config(noreplace) %{_sysconfdir}/%{name}/module/usad_model.job.json
+%config(noreplace) %{_sysconfdir}/%{name}/module/slow_node_detection.job.json
/usr/lib/systemd/system/gala-anteater.service
%files -n python3-gala-anteater
@ -77,6 +78,21 @@ fi
%changelog
+* Tue Nov 5 2024 huangbin <huangbin58@huawei.com> - 1.2.1-1
+- Add new feature slow node detection.
+* Sat Sep 21 2024 huangbin <huangbin58@huawei.com> - 1.2.0-3
+- Fixbug with rca time range extend.
+* Wed Sep 18 2024 huangbin <huangbin58@huawei.com> - 1.2.0-2
+- Fixbug with large value exceed
+* Sat Aug 31 2024 huangbin <huangbin58@huawei.com> - 1.2.0-1
+- Upgrade anteater version to 1.2.0
+* Thu Aug 31 2023 Li Zhenxing <lizhenxing11@huawei.com> - 1.1.0-1
+- Upgrade anteater version to 1.1.0
* Fri Jan 20 2023 Zhen Chen <chenzhen126@huawei.com> - 1.0.1-4
- eliminate 'Fail to try-restart' warning when downgrading to 1.0.1-1
@ -99,3 +115,4 @@ fi
* Sat Nov 12 2022 Zhen Chen <chenzhen126@huawei.com> - 1.0.0-1
- Package init

gala-anteater.yaml (new file)

@ -0,0 +1,4 @@
seperator: .
src_repo: openeuler/gala
tag_prefix: ^
version_control: gitee.com


@ -1,98 +0,0 @@
From 5c6b03a49a49ddc56574e906f959d5fe34c1debc Mon Sep 17 00:00:00 2001
From: lizhenxing11 <lizhenxing11@huawei.com>
Date: Fri, 6 Jan 2023 10:59:12 +0800
Subject: [PATCH] remove 'sys-level' config param
---
anteater/config.py | 1 -
anteater/main.py | 29 +++++++++++------------------
config/gala-anteater.yaml | 1 -
docs/conf_introduction.md | 1 -
4 files changed, 11 insertions(+), 21 deletions(-)
diff --git a/anteater/config.py b/anteater/config.py
index e9ab557..caeceec 100644
--- a/anteater/config.py
+++ b/anteater/config.py
@@ -27,7 +27,6 @@ import yaml
class GlobalConf:
"""The global config"""
data_source: str
- sys_level: bool
@dataclass
diff --git a/anteater/main.py b/anteater/main.py
index 4de72f9..87aae95 100644
--- a/anteater/main.py
+++ b/anteater/main.py
@@ -26,7 +26,6 @@ from anteater.module.sys.nic_loss import NICLossDetector
from anteater.module.sys.proc_io_latency import ProcIOLatencyDetector
from anteater.module.sys.sys_io_latency import SysIOLatencyDetector
from anteater.module.sys.tcp_establish import SysTcpEstablishDetector
-from anteater.module.sys.tcp_transmission_throughput import SysTcpTransmissionThroughputDetector
from anteater.module.sys.tcp_transmission_latency import SysTcpTransmissionLatencyDetector
from anteater.provider.kafka import KafkaProvider
from anteater.source.anomaly_report import AnomalyReport
@@ -49,24 +48,18 @@ def main():
kafka_provider = KafkaProvider(conf.kafka)
loader = MetricLoader(conf)
report = AnomalyReport(kafka_provider)
- if conf.global_conf.sys_level:
- detectors = [
- # APP sli anomaly detection
- APPSliDetector(loader, report),
+ detectors = [
+ # APP sli anomaly detection
+ APPSliDetector(loader, report),
- # SYS tcp/io detection
- SysTcpEstablishDetector(loader, report),
- SysTcpTransmissionLatencyDetector(loader, report),
- SysIOLatencyDetector(loader, report),
- ProcIOLatencyDetector(loader, report),
- DiskThroughputDetector(loader, report),
- NICLossDetector(loader, report),
- ]
- else:
- detectors = [
- # APP sli anomaly detection
- APPSliDetector(loader, report)
- ]
+ # SYS tcp/io detection
+ SysTcpEstablishDetector(loader, report),
+ SysTcpTransmissionLatencyDetector(loader, report),
+ SysIOLatencyDetector(loader, report),
+ ProcIOLatencyDetector(loader, report),
+ DiskThroughputDetector(loader, report),
+ NICLossDetector(loader, report),
+ ]
anomaly_detect = AnomalyDetection(detectors, conf)
diff --git a/config/gala-anteater.yaml b/config/gala-anteater.yaml
index c4c54a0..72ffc31 100644
--- a/config/gala-anteater.yaml
+++ b/config/gala-anteater.yaml
@@ -1,6 +1,5 @@
Global:
data_source: "prometheus"
- sys_level: false
Kafka:
server: "localhost"
diff --git a/docs/conf_introduction.md b/docs/conf_introduction.md
index 09a7284..869d3e9 100644
--- a/docs/conf_introduction.md
+++ b/docs/conf_introduction.md
@@ -16,7 +16,6 @@ gala-anteater # gala-anteater 主目录
在文件`gala-anteater.yaml`中,配置`gala-anteater`启动时所需的参数。该配置项中,主要包含:
- Global: 配置启动时的全局变量
- data_source: 时序数据的来源,目前支持`"prometheus"`Prometheus和`"aom"`AOM两种数据来源
- - sys_level: 是否支持`系统级`异常检测,可选:`true`、`false`。
- Kafka: 配置中间件Kafka所需的参数
- server: Kafak对应的`server ip`,如:"10.xxx.xxx.xxx"
--
2.33.0