gala-anteater/0002-configure-group-in-json.patch
2024-11-08 16:52:40 +08:00

122 lines
5.4 KiB
Diff
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

From acefcbdbb4891aa0b3f1b7afe500b8fdef440806 Mon Sep 17 00:00:00 2001
From: huangbin <huangbin58@huawei.com>
Date: Fri, 8 Nov 2024 16:42:52 +0800
Subject: [PATCH] configure-group-in-json
---
anteater/model/detector/slow_node_detector.py | 28 +++++++++++--------
config/module/slow_node_detection.job.json | 9 ++++--
2 files changed, 24 insertions(+), 13 deletions(-)
diff --git a/anteater/model/detector/slow_node_detector.py b/anteater/model/detector/slow_node_detector.py
index 15a6cee..d5d10e1 100644
--- a/anteater/model/detector/slow_node_detector.py
+++ b/anteater/model/detector/slow_node_detector.py
@@ -55,6 +55,9 @@ class SlowNodeDetector(Detector):
hccl_domain = json.load(f_out)
except Exception:
logger.error(f"Read hccl domain info fail!")
+ if not hccl_domain:
+ # Fall back to an hccl_domain set manually in params when none was loaded from the file
+ hccl_domain = params.get("hccl_domain", {})
if os.path.exists(rank_table_path):
try:
with open(rank_table_path, 'r', encoding='utf-8') as f_out:
@@ -106,15 +109,15 @@ class SlowNodeDetector(Detector):
# 获取machine_ids,
machines_to_devices = self.get_machines_to_devices(start, end, kpis)
npu_id2host_id, hosts_ids = self.npu_id2host_id(machines_to_devices)
-
group_dataloader = GroupDataLoader(self.hccl_domain, self.rank_table, machines_to_devices)
group_ranks: list = group_dataloader.get_group_ranks()
all_results = []
for kpi in kpis:
- for ranks in group_ranks:
+ for index, ranks in enumerate(group_ranks):
+ logger.info(f"Groups-{index}, metric: {kpi.metric}, start detection.")
machine_ids: dict = group_dataloader.rank_table_loader.get_group_nodes_by_ranks(ranks)
host_ids: list = self.get_host_ids_by_npu_ids(machine_ids, npu_id2host_id, hosts_ids)
- group_result = self.group_detect_single_kpi(kpi, machine_ids, host_ids)
+ group_result = self.group_detect_single_kpi(kpi, machine_ids, host_ids, ranks)
all_results.extend(group_result)
response, all_anomaly_nodes = self.gen_final_alarm(kpis, all_results)
@@ -148,19 +151,26 @@ class SlowNodeDetector(Detector):
return response, all_anomaly_nodes
- def group_detect_single_kpi(self, kpi: KPI, machine_ids: dict, host_ids: list) -> list:
+ def group_detect_single_kpi(self, kpi: KPI, machine_ids: dict, host_ids: list, ranks) -> list:
"""Detects kpi based on signal time series anomaly detection model"""
# 普罗会一次性抓到所有的数据需要根据machine_id, device_id去对数据作分组
- # get数据
metric_name: str = kpi.metric
all_machines_ts = []
for machine_id in machine_ids:
single_machine_ts_list = self.get_kpi_ts_list(metric_name, machine_id, kpi.params)
- all_machines_ts.extend(single_machine_ts_list)
+ if single_machine_ts_list:
+ # Keep only this group's device metrics: match each series' "id" label against rank % 8
+ local_ranks = [int(rank) % 8 for rank in ranks]
+ for single_machine_ts in single_machine_ts_list:
+ ts_id = int(single_machine_ts.labels.get("id", -1))
+ if ts_id in local_ranks:
+ all_machines_ts.append(single_machine_ts)
+
for host_id in host_ids:
single_machine_ts_list = self.get_kpi_ts_list(metric_name, host_id, kpi.params)
all_machines_ts.extend(single_machine_ts_list)
+ logger.info(f"Metric-{metric_name} single group has data {len(all_machines_ts)}. ranks: {ranks}")
anomaly_devices = []
anomaly_locations = {}
@@ -195,7 +205,7 @@ class SlowNodeDetector(Detector):
logger.info(
f"space_nodes_compare result: {self.output_anomaly_devices(metric_name, space_anomaly_locations)}.")
else:
- logger.info(f"Skip space nodes compare, due to nodes number{len(all_machines_ts)} is smaller than 4.")
+ logger.info(f"Skip space nodes compare, due to nodes number {len(all_machines_ts)} is smaller than 4.")
else:
logger.info(f"Skip space nodes compare.")
@@ -287,10 +297,6 @@ class SlowNodeDetector(Detector):
look_back = self.config.params.get("look_back", 10)
metric_type = kpi_params.get("metric_type", "device")
start, end = dt.last(minutes=look_back)
- # from datetime import timedelta
- # # 6:40
- # start = start - timedelta(hours=4.5)
- # end = end - timedelta(hours=4.5)
if metric_type == "device":
# npu device
diff --git a/config/module/slow_node_detection.job.json b/config/module/slow_node_detection.job.json
index 91ff621..27a6d53 100644
--- a/config/module/slow_node_detection.job.json
+++ b/config/module/slow_node_detection.job.json
@@ -17,7 +17,12 @@
"obs_size": 5,
"outlier_ratio_th": 0.6,
"hccl_domain_json": "./hccl_domain.json",
- "rank_table_json": "./hccl_domain.json"
+ "hccl_domain":{
+ "pp": 2,
+ "dp": 1,
+ "tp": 1
+ },
+ "rank_table_json": "./rank_table.json"
}
},
"kpis": [
@@ -349,4 +354,4 @@
"metric": "gala_gopher_container_cpu_usage_seconds_total"
}
]
-}
\ No newline at end of file
+}
--
2.33.0