diff --git a/0002-configure-group-in-json.patch b/0002-configure-group-in-json.patch
new file mode 100644
index 0000000..3ec9cad
--- /dev/null
+++ b/0002-configure-group-in-json.patch
@@ -0,0 +1,121 @@
+From acefcbdbb4891aa0b3f1b7afe500b8fdef440806 Mon Sep 17 00:00:00 2001
+From: huangbin
+Date: Fri, 8 Nov 2024 16:42:52 +0800
+Subject: [PATCH] configure-group-in-json
+
+---
+ anteater/model/detector/slow_node_detector.py | 28 +++++++++++--------
+ config/module/slow_node_detection.job.json    |  9 ++++--
+ 2 files changed, 24 insertions(+), 13 deletions(-)
+
+diff --git a/anteater/model/detector/slow_node_detector.py b/anteater/model/detector/slow_node_detector.py
+index 15a6cee..d5d10e1 100644
+--- a/anteater/model/detector/slow_node_detector.py
++++ b/anteater/model/detector/slow_node_detector.py
+@@ -55,6 +55,9 @@ class SlowNodeDetector(Detector):
+                 hccl_domain = json.load(f_out)
+             except Exception:
+                 logger.error(f"Read hccl domain info fail!")
++        if not hccl_domain:
++            # Allow hccl_domain to be configured manually
++            hccl_domain = params.get("hccl_domain", {})
+         if os.path.exists(rank_table_path):
+             try:
+                 with open(rank_table_path, 'r', encoding='utf-8') as f_out:
+@@ -106,15 +109,15 @@ class SlowNodeDetector(Detector):
+         # 获取machine_ids,
+         machines_to_devices = self.get_machines_to_devices(start, end, kpis)
+         npu_id2host_id, hosts_ids = self.npu_id2host_id(machines_to_devices)
+-
+         group_dataloader = GroupDataLoader(self.hccl_domain, self.rank_table, machines_to_devices)
+         group_ranks: list = group_dataloader.get_group_ranks()
+         all_results = []
+         for kpi in kpis:
+-            for ranks in group_ranks:
++            for index, ranks in enumerate(group_ranks):
++                logger.info(f"Groups-{index}, metric: {kpi.metric}, start detection.")
+                 machine_ids: dict = group_dataloader.rank_table_loader.get_group_nodes_by_ranks(ranks)
+                 host_ids: list = self.get_host_ids_by_npu_ids(machine_ids, npu_id2host_id, hosts_ids)
+-                group_result = self.group_detect_single_kpi(kpi, machine_ids, host_ids)
++                group_result = self.group_detect_single_kpi(kpi, machine_ids, host_ids, ranks)
+                 all_results.extend(group_result)
+
+         response, all_anomaly_nodes = self.gen_final_alarm(kpis, all_results)
+@@ -148,19 +151,26 @@ class SlowNodeDetector(Detector):
+
+         return response, all_anomaly_nodes
+
+-    def group_detect_single_kpi(self, kpi: KPI, machine_ids: dict, host_ids: list) -> list:
++    def group_detect_single_kpi(self, kpi: KPI, machine_ids: dict, host_ids: list, ranks) -> list:
+         """Detects kpi based on signal time series anomaly detection model"""
+         # 普罗会一次性抓到所有的数据,需要根据machine_id, device_id去对数据作分组
+-        # get数据
+         metric_name: str = kpi.metric
+
+         all_machines_ts = []
+         for machine_id in machine_ids:
+             single_machine_ts_list = self.get_kpi_ts_list(metric_name, machine_id, kpi.params)
+-            all_machines_ts.extend(single_machine_ts_list)
++            if single_machine_ts_list:
++                # Match device metrics within the group according to ranks
++                local_ranks = [int(rank) % 8 for rank in ranks]
++                for single_machine_ts in single_machine_ts_list:
++                    ts_id = int(single_machine_ts.labels.get("id", -1))
++                    if ts_id in local_ranks:
++                        all_machines_ts.append(single_machine_ts)
++
+         for host_id in host_ids:
+             single_machine_ts_list = self.get_kpi_ts_list(metric_name, host_id, kpi.params)
+             all_machines_ts.extend(single_machine_ts_list)
++        logger.info(f"Metric-{metric_name} single group has data {len(all_machines_ts)}. ranks: {ranks}")
+
+         anomaly_devices = []
+         anomaly_locations = {}
+@@ -195,7 +205,7 @@ class SlowNodeDetector(Detector):
+                 logger.info(
+                     f"space_nodes_compare result: {self.output_anomaly_devices(metric_name, space_anomaly_locations)}.")
+             else:
+-                logger.info(f"Skip space nodes compare, due to nodes number{len(all_machines_ts)} is smaller than 4.")
++                logger.info(f"Skip space nodes compare, due to nodes number {len(all_machines_ts)} is smaller than 4.")
+         else:
+             logger.info(f"Skip space nodes compare.")
+
+@@ -287,10 +297,6 @@ class SlowNodeDetector(Detector):
+         look_back = self.config.params.get("look_back", 10)
+         metric_type = kpi_params.get("metric_type", "device")
+         start, end = dt.last(minutes=look_back)
+-        # from datetime import timedelta
+-        # # 6:40
+-        # start = start - timedelta(hours=4.5)
+-        # end = end - timedelta(hours=4.5)
+
+         if metric_type == "device":
+             # npu device
+diff --git a/config/module/slow_node_detection.job.json b/config/module/slow_node_detection.job.json
+index 91ff621..27a6d53 100644
+--- a/config/module/slow_node_detection.job.json
++++ b/config/module/slow_node_detection.job.json
+@@ -17,7 +17,12 @@
+             "obs_size": 5,
+             "outlier_ratio_th": 0.6,
+             "hccl_domain_json": "./hccl_domain.json",
+-            "rank_table_json": "./hccl_domain.json"
++            "hccl_domain": {
++                "pp": 2,
++                "dp": 1,
++                "tp": 1
++            },
++            "rank_table_json": "./rank_table.json"
+         }
+     },
+     "kpis": [
+@@ -349,4 +354,4 @@
+             "metric": "gala_gopher_container_cpu_usage_seconds_total"
+         }
+     ]
+-}
+\ No newline at end of file
++}
+-- 
+2.33.0
+
diff --git a/gala-anteater.spec b/gala-anteater.spec
index a0362a3..2918f2c 100644
--- a/gala-anteater.spec
+++ b/gala-anteater.spec
@@ -2,7 +2,7 @@
 Name: gala-anteater
 Version: 1.2.1
-Release: 1
+Release: 2
 Summary: A time-series anomaly detection platform for operating system.
 License: MulanPSL2
 URL: https://gitee.com/openeuler/gala-anteater
@@ -12,6 +12,8 @@
 BuildRequires: procps-ng python3-setuptools
 Requires: python3-gala-anteater = %{version}-%{release}
 patch0: 0001-add-new-feature-slow-node-detection.patch
+patch1: 0002-configure-group-in-json.patch
+
 %description
 Abnormal detection module for A-Ops project
 
@@ -78,6 +80,9 @@
 fi
 
 %changelog
+* Fri Nov 8 2024 huangbin - 1.2.1-2
+- Configure group in json.
+
 * Tue Nov 5 2024 huangbin - 1.2.1-1
 - Add new feature slow node detection.