From acefcbdbb4891aa0b3f1b7afe500b8fdef440806 Mon Sep 17 00:00:00 2001 From: huangbin Date: Fri, 8 Nov 2024 16:42:52 +0800 Subject: [PATCH] configure-group-in-json --- anteater/model/detector/slow_node_detector.py | 28 +++++++++++-------- config/module/slow_node_detection.job.json | 9 ++++-- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/anteater/model/detector/slow_node_detector.py b/anteater/model/detector/slow_node_detector.py index 15a6cee..d5d10e1 100644 --- a/anteater/model/detector/slow_node_detector.py +++ b/anteater/model/detector/slow_node_detector.py @@ -55,6 +55,9 @@ class SlowNodeDetector(Detector): hccl_domain = json.load(f_out) except Exception: logger.error(f"Read hccl domain info fail!") + if not hccl_domain: + # 增加手动设置hccl_domain + hccl_domain = params.get("hccl_domain", {}) if os.path.exists(rank_table_path): try: with open(rank_table_path, 'r', encoding='utf-8') as f_out: @@ -106,15 +109,15 @@ class SlowNodeDetector(Detector): # 获取machine_ids, machines_to_devices = self.get_machines_to_devices(start, end, kpis) npu_id2host_id, hosts_ids = self.npu_id2host_id(machines_to_devices) - group_dataloader = GroupDataLoader(self.hccl_domain, self.rank_table, machines_to_devices) group_ranks: list = group_dataloader.get_group_ranks() all_results = [] for kpi in kpis: - for ranks in group_ranks: + for index, ranks in enumerate(group_ranks): + logger.info(f"Groups-{index}, metric: {kpi.metric}, start detection.") machine_ids: dict = group_dataloader.rank_table_loader.get_group_nodes_by_ranks(ranks) host_ids: list = self.get_host_ids_by_npu_ids(machine_ids, npu_id2host_id, hosts_ids) - group_result = self.group_detect_single_kpi(kpi, machine_ids, host_ids) + group_result = self.group_detect_single_kpi(kpi, machine_ids, host_ids, ranks) all_results.extend(group_result) response, all_anomaly_nodes = self.gen_final_alarm(kpis, all_results) @@ -148,19 +151,26 @@ class SlowNodeDetector(Detector): return response, all_anomaly_nodes - def group_detect_single_kpi(self, kpi: KPI, machine_ids: dict, host_ids: list) -> list: + def group_detect_single_kpi(self, kpi: KPI, machine_ids: dict, host_ids: list, ranks) -> list: """Detects kpi based on signal time series anomaly detection model""" # 普罗会一次性抓到所有的数据,需要根据machine_id, device_id去对数据作分组 - # get数据 metric_name: str = kpi.metric all_machines_ts = [] for machine_id in machine_ids: single_machine_ts_list = self.get_kpi_ts_list(metric_name, machine_id, kpi.params) - all_machines_ts.extend(single_machine_ts_list) + if single_machine_ts_list: + # 根据ranks匹配组内device的指标 + local_ranks = [int(rank) % 8 for rank in ranks] + for single_machine_ts in single_machine_ts_list: + ts_id = int(single_machine_ts.labels.get("id", -1)) + if ts_id in local_ranks: + all_machines_ts.append(single_machine_ts) + for host_id in host_ids: single_machine_ts_list = self.get_kpi_ts_list(metric_name, host_id, kpi.params) all_machines_ts.extend(single_machine_ts_list) + logger.info(f"Metric-{metric_name} single group has data {len(all_machines_ts)}. ranks: {ranks}") anomaly_devices = [] anomaly_locations = {} @@ -195,7 +205,7 @@ class SlowNodeDetector(Detector): logger.info( f"space_nodes_compare result: {self.output_anomaly_devices(metric_name, space_anomaly_locations)}.") else: - logger.info(f"Skip space nodes compare, due to nodes number{len(all_machines_ts)} is smaller than 4.") + logger.info(f"Skip space nodes compare, due to nodes number {len(all_machines_ts)} is smaller than 4.") else: logger.info(f"Skip space nodes compare.") @@ -287,10 +297,6 @@ class SlowNodeDetector(Detector): look_back = self.config.params.get("look_back", 10) metric_type = kpi_params.get("metric_type", "device") start, end = dt.last(minutes=look_back) - # from datetime import timedelta - # # 6:40 - # start = start - timedelta(hours=4.5) - # end = end - timedelta(hours=4.5) if metric_type == "device": # npu device diff --git a/config/module/slow_node_detection.job.json b/config/module/slow_node_detection.job.json index 91ff621..27a6d53 100644 --- a/config/module/slow_node_detection.job.json +++ b/config/module/slow_node_detection.job.json @@ -17,7 +17,12 @@ "obs_size": 5, "outlier_ratio_th": 0.6, "hccl_domain_json": "./hccl_domain.json", - "rank_table_json": "./hccl_domain.json" + "hccl_domain":{ + "pp": 2, + "dp": 1, + "tp": 1 + }, + "rank_table_json": "./rank_table.json" } }, "kpis": [ @@ -349,4 +354,4 @@ "metric": "gala_gopher_container_cpu_usage_seconds_total" } ] -} \ No newline at end of file +} -- 2.33.0