Add configuration for group setting.

2024-11-08 16:52:40 +08:00 · 2024-11-08 16:52:40 +08:00 · 7cfc41df49
commit 7cfc41df49
parent 506b0a92f2
2 changed files with 127 additions and 1 deletions
--- a/0002-configure-group-in-json.patch
+++ b/0002-configure-group-in-json.patch
@ -0,0 +1,121 @@
+From acefcbdbb4891aa0b3f1b7afe500b8fdef440806 Mon Sep 17 00:00:00 2001
+From: huangbin <huangbin58@huawei.com>
+Date: Fri, 8 Nov 2024 16:42:52 +0800
+Subject: [PATCH] configure-group-in-json
+
+---
+ anteater/model/detector/slow_node_detector.py | 28 +++++++++++--------
+ config/module/slow_node_detection.job.json    |  9 ++++--
+ 2 files changed, 24 insertions(+), 13 deletions(-)
+
+diff --git a/anteater/model/detector/slow_node_detector.py b/anteater/model/detector/slow_node_detector.py
+index 15a6cee..d5d10e1 100644
+--- a/anteater/model/detector/slow_node_detector.py
+++ b/anteater/model/detector/slow_node_detector.py
+@@ -55,6 +55,9 @@ class SlowNodeDetector(Detector):
+                     hccl_domain = json.load(f_out)
+             except Exception:
+                 logger.error(f"Read hccl domain info fail!")
+        if not hccl_domain:
+            # 增加手动设置hccl_domain
+            hccl_domain = params.get("hccl_domain", {})
+         if os.path.exists(rank_table_path):
+             try:
+                 with open(rank_table_path, 'r', encoding='utf-8') as f_out:
+@@ -106,15 +109,15 @@ class SlowNodeDetector(Detector):
+         # 获取machine_ids,
+         machines_to_devices = self.get_machines_to_devices(start, end, kpis)
+         npu_id2host_id, hosts_ids = self.npu_id2host_id(machines_to_devices)
+-
+         group_dataloader = GroupDataLoader(self.hccl_domain, self.rank_table, machines_to_devices)
+         group_ranks: list = group_dataloader.get_group_ranks()
+         all_results = []
+         for kpi in kpis:
+-            for ranks in group_ranks:
+            for index, ranks in enumerate(group_ranks):
+                logger.info(f"Groups-{index}, metric: {kpi.metric}, start detection.")
+                 machine_ids: dict = group_dataloader.rank_table_loader.get_group_nodes_by_ranks(ranks)
+                 host_ids: list = self.get_host_ids_by_npu_ids(machine_ids, npu_id2host_id, hosts_ids)
+-                group_result = self.group_detect_single_kpi(kpi, machine_ids, host_ids)
+                group_result = self.group_detect_single_kpi(kpi, machine_ids, host_ids, ranks)
+                 all_results.extend(group_result)
+ 
+         response, all_anomaly_nodes = self.gen_final_alarm(kpis, all_results)
+@@ -148,19 +151,26 @@ class SlowNodeDetector(Detector):
+ 
+         return response, all_anomaly_nodes
+ 
+-    def group_detect_single_kpi(self, kpi: KPI, machine_ids: dict, host_ids: list) -> list:
+    def group_detect_single_kpi(self, kpi: KPI, machine_ids: dict, host_ids: list, ranks) -> list:
+         """Detects kpi based on signal time series anomaly detection model"""
+         # 普罗会一次性抓到所有的数据，需要根据machine_id, device_id去对数据作分组
+-        # get数据
+         metric_name: str = kpi.metric
+ 
+         all_machines_ts = []
+         for machine_id in machine_ids:
+             single_machine_ts_list = self.get_kpi_ts_list(metric_name, machine_id, kpi.params)
+-            all_machines_ts.extend(single_machine_ts_list)
+            if single_machine_ts_list:
+                # 根据ranks匹配组内device的指标
+                local_ranks = [int(rank) % 8 for rank in ranks]
+                for single_machine_ts in single_machine_ts_list:
+                    ts_id = int(single_machine_ts.labels.get("id", -1))
+                    if ts_id in local_ranks:
+                        all_machines_ts.append(single_machine_ts)
+
+         for host_id in host_ids:
+             single_machine_ts_list = self.get_kpi_ts_list(metric_name, host_id, kpi.params)
+             all_machines_ts.extend(single_machine_ts_list)
+        logger.info(f"Metric-{metric_name} single group has data {len(all_machines_ts)}. ranks: {ranks}")
+ 
+         anomaly_devices = []
+         anomaly_locations = {}
+@@ -195,7 +205,7 @@ class SlowNodeDetector(Detector):
+                 logger.info(
+                     f"space_nodes_compare result: {self.output_anomaly_devices(metric_name, space_anomaly_locations)}.")
+             else:
+-                logger.info(f"Skip space nodes compare, due to nodes number{len(all_machines_ts)} is smaller than 4.")
+                logger.info(f"Skip space nodes compare, due to nodes number {len(all_machines_ts)} is smaller than 4.")
+         else:
+             logger.info(f"Skip space nodes compare.")
+ 
+@@ -287,10 +297,6 @@ class SlowNodeDetector(Detector):
+         look_back = self.config.params.get("look_back", 10)
+         metric_type = kpi_params.get("metric_type", "device")
+         start, end = dt.last(minutes=look_back)
+-        # from datetime import timedelta
+-        # # 6:40
+-        # start = start - timedelta(hours=4.5)
+-        # end = end - timedelta(hours=4.5)
+ 
+         if metric_type == "device":
+             # npu device
+diff --git a/config/module/slow_node_detection.job.json b/config/module/slow_node_detection.job.json
+index 91ff621..27a6d53 100644
+--- a/config/module/slow_node_detection.job.json
+++ b/config/module/slow_node_detection.job.json
+@@ -17,7 +17,12 @@
+       "obs_size": 5,
+       "outlier_ratio_th": 0.6,
+       "hccl_domain_json": "./hccl_domain.json",
+-      "rank_table_json": "./hccl_domain.json"
+      "hccl_domain":{
+        "pp": 2,
+        "dp": 1,
+        "tp": 1
+      },
+      "rank_table_json": "./rank_table.json"
+     }
+   },
+   "kpis": [
+@@ -349,4 +354,4 @@
+       "metric": "gala_gopher_container_cpu_usage_seconds_total"
+     }
+   ]
+-}
+\ No newline at end of file
+}
+-- 
+2.33.0
+
--- a/gala-anteater.spec
+++ b/gala-anteater.spec
@ -2,7 +2,7 @@

 Name:            gala-anteater
 Version:         1.2.1
-Release:         1
+Release:         2
 Summary:         A time-series anomaly detection platform for operating system.
 License:         MulanPSL2
 URL:             https://gitee.com/openeuler/gala-anteater
@ -12,6 +12,8 @@ BuildRequires:   procps-ng python3-setuptools
 Requires:        python3-gala-anteater = %{version}-%{release}

 patch0:          0001-add-new-feature-slow-node-detection.patch
+patch1:          0002-configure-group-in-json.patch
+
 %description
 Abnormal detection module for A-Ops project

@ -78,6 +80,9 @@ fi


 %changelog
+* Fri Nov 8 2024 huangbin <huangbin58@huawei.com> - 1.2.1-2
+- Support configuring the hccl_domain group setting in the job JSON.
+
 * Tue Nov 5 2024 huangbin <huangbin58@huawei.com> - 1.2.1-1
 - Add new feature slow node detection.