Add configuration for group setting.
This commit is contained in:
parent
506b0a92f2
commit
7cfc41df49
121
0002-configure-group-in-json.patch
Normal file
121
0002-configure-group-in-json.patch
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
From acefcbdbb4891aa0b3f1b7afe500b8fdef440806 Mon Sep 17 00:00:00 2001
|
||||||
|
From: huangbin <huangbin58@huawei.com>
|
||||||
|
Date: Fri, 8 Nov 2024 16:42:52 +0800
|
||||||
|
Subject: [PATCH] configure-group-in-json
|
||||||
|
|
||||||
|
---
|
||||||
|
anteater/model/detector/slow_node_detector.py | 28 +++++++++++--------
|
||||||
|
config/module/slow_node_detection.job.json | 9 ++++--
|
||||||
|
2 files changed, 24 insertions(+), 13 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/anteater/model/detector/slow_node_detector.py b/anteater/model/detector/slow_node_detector.py
|
||||||
|
index 15a6cee..d5d10e1 100644
|
||||||
|
--- a/anteater/model/detector/slow_node_detector.py
|
||||||
|
+++ b/anteater/model/detector/slow_node_detector.py
|
||||||
|
@@ -55,6 +55,9 @@ class SlowNodeDetector(Detector):
|
||||||
|
hccl_domain = json.load(f_out)
|
||||||
|
except Exception:
|
||||||
|
logger.error(f"Read hccl domain info fail!")
|
||||||
|
+ if not hccl_domain:
|
||||||
|
+ # 增加手动设置hccl_domain
|
||||||
|
+ hccl_domain = params.get("hccl_domain", {})
|
||||||
|
if os.path.exists(rank_table_path):
|
||||||
|
try:
|
||||||
|
with open(rank_table_path, 'r', encoding='utf-8') as f_out:
|
||||||
|
@@ -106,15 +109,15 @@ class SlowNodeDetector(Detector):
|
||||||
|
# 获取machine_ids,
|
||||||
|
machines_to_devices = self.get_machines_to_devices(start, end, kpis)
|
||||||
|
npu_id2host_id, hosts_ids = self.npu_id2host_id(machines_to_devices)
|
||||||
|
-
|
||||||
|
group_dataloader = GroupDataLoader(self.hccl_domain, self.rank_table, machines_to_devices)
|
||||||
|
group_ranks: list = group_dataloader.get_group_ranks()
|
||||||
|
all_results = []
|
||||||
|
for kpi in kpis:
|
||||||
|
- for ranks in group_ranks:
|
||||||
|
+ for index, ranks in enumerate(group_ranks):
|
||||||
|
+ logger.info(f"Groups-{index}, metric: {kpi.metric}, start detection.")
|
||||||
|
machine_ids: dict = group_dataloader.rank_table_loader.get_group_nodes_by_ranks(ranks)
|
||||||
|
host_ids: list = self.get_host_ids_by_npu_ids(machine_ids, npu_id2host_id, hosts_ids)
|
||||||
|
- group_result = self.group_detect_single_kpi(kpi, machine_ids, host_ids)
|
||||||
|
+ group_result = self.group_detect_single_kpi(kpi, machine_ids, host_ids, ranks)
|
||||||
|
all_results.extend(group_result)
|
||||||
|
|
||||||
|
response, all_anomaly_nodes = self.gen_final_alarm(kpis, all_results)
|
||||||
|
@@ -148,19 +151,26 @@ class SlowNodeDetector(Detector):
|
||||||
|
|
||||||
|
return response, all_anomaly_nodes
|
||||||
|
|
||||||
|
- def group_detect_single_kpi(self, kpi: KPI, machine_ids: dict, host_ids: list) -> list:
|
||||||
|
+ def group_detect_single_kpi(self, kpi: KPI, machine_ids: dict, host_ids: list, ranks) -> list:
|
||||||
|
"""Detects kpi based on signal time series anomaly detection model"""
|
||||||
|
# 普罗会一次性抓到所有的数据,需要根据machine_id, device_id去对数据作分组
|
||||||
|
- # get数据
|
||||||
|
metric_name: str = kpi.metric
|
||||||
|
|
||||||
|
all_machines_ts = []
|
||||||
|
for machine_id in machine_ids:
|
||||||
|
single_machine_ts_list = self.get_kpi_ts_list(metric_name, machine_id, kpi.params)
|
||||||
|
- all_machines_ts.extend(single_machine_ts_list)
|
||||||
|
+ if single_machine_ts_list:
|
||||||
|
+ # 根据ranks匹配组内device的指标
|
||||||
|
+ local_ranks = [int(rank) % 8 for rank in ranks]
|
||||||
|
+ for single_machine_ts in single_machine_ts_list:
|
||||||
|
+ ts_id = int(single_machine_ts.labels.get("id", -1))
|
||||||
|
+ if ts_id in local_ranks:
|
||||||
|
+ all_machines_ts.append(single_machine_ts)
|
||||||
|
+
|
||||||
|
for host_id in host_ids:
|
||||||
|
single_machine_ts_list = self.get_kpi_ts_list(metric_name, host_id, kpi.params)
|
||||||
|
all_machines_ts.extend(single_machine_ts_list)
|
||||||
|
+ logger.info(f"Metric-{metric_name} single group has data {len(all_machines_ts)}. ranks: {ranks}")
|
||||||
|
|
||||||
|
anomaly_devices = []
|
||||||
|
anomaly_locations = {}
|
||||||
|
@@ -195,7 +205,7 @@ class SlowNodeDetector(Detector):
|
||||||
|
logger.info(
|
||||||
|
f"space_nodes_compare result: {self.output_anomaly_devices(metric_name, space_anomaly_locations)}.")
|
||||||
|
else:
|
||||||
|
- logger.info(f"Skip space nodes compare, due to nodes number{len(all_machines_ts)} is smaller than 4.")
|
||||||
|
+ logger.info(f"Skip space nodes compare, due to nodes number {len(all_machines_ts)} is smaller than 4.")
|
||||||
|
else:
|
||||||
|
logger.info(f"Skip space nodes compare.")
|
||||||
|
|
||||||
|
@@ -287,10 +297,6 @@ class SlowNodeDetector(Detector):
|
||||||
|
look_back = self.config.params.get("look_back", 10)
|
||||||
|
metric_type = kpi_params.get("metric_type", "device")
|
||||||
|
start, end = dt.last(minutes=look_back)
|
||||||
|
- # from datetime import timedelta
|
||||||
|
- # # 6:40
|
||||||
|
- # start = start - timedelta(hours=4.5)
|
||||||
|
- # end = end - timedelta(hours=4.5)
|
||||||
|
|
||||||
|
if metric_type == "device":
|
||||||
|
# npu device
|
||||||
|
diff --git a/config/module/slow_node_detection.job.json b/config/module/slow_node_detection.job.json
|
||||||
|
index 91ff621..27a6d53 100644
|
||||||
|
--- a/config/module/slow_node_detection.job.json
|
||||||
|
+++ b/config/module/slow_node_detection.job.json
|
||||||
|
@@ -17,7 +17,12 @@
|
||||||
|
"obs_size": 5,
|
||||||
|
"outlier_ratio_th": 0.6,
|
||||||
|
"hccl_domain_json": "./hccl_domain.json",
|
||||||
|
- "rank_table_json": "./hccl_domain.json"
|
||||||
|
+ "hccl_domain":{
|
||||||
|
+ "pp": 2,
|
||||||
|
+ "dp": 1,
|
||||||
|
+ "tp": 1
|
||||||
|
+ },
|
||||||
|
+ "rank_table_json": "./rank_table.json"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"kpis": [
|
||||||
|
@@ -349,4 +354,4 @@
|
||||||
|
"metric": "gala_gopher_container_cpu_usage_seconds_total"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
-}
|
||||||
|
\ No newline at end of file
|
||||||
|
+}
|
||||||
|
--
|
||||||
|
2.33.0
|
||||||
|
|
||||||
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
Name: gala-anteater
|
Name: gala-anteater
|
||||||
Version: 1.2.1
|
Version: 1.2.1
|
||||||
Release: 1
|
Release: 2
|
||||||
Summary: A time-series anomaly detection platform for operating system.
|
Summary: A time-series anomaly detection platform for operating system.
|
||||||
License: MulanPSL2
|
License: MulanPSL2
|
||||||
URL: https://gitee.com/openeuler/gala-anteater
|
URL: https://gitee.com/openeuler/gala-anteater
|
||||||
@ -12,6 +12,8 @@ BuildRequires: procps-ng python3-setuptools
|
|||||||
Requires: python3-gala-anteater = %{version}-%{release}
|
Requires: python3-gala-anteater = %{version}-%{release}
|
||||||
|
|
||||||
patch0: 0001-add-new-feature-slow-node-detection.patch
|
patch0: 0001-add-new-feature-slow-node-detection.patch
|
||||||
|
patch1: 0002-configure-group-in-json.patch
|
||||||
|
|
||||||
%description
|
%description
|
||||||
Abnormal detection module for A-Ops project
|
Abnormal detection module for A-Ops project
|
||||||
|
|
||||||
@ -78,6 +80,9 @@ fi
|
|||||||
|
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Fri Nov 8 2024 huangbin <huangbin58@huawei.com> - 1.2.1-2
|
||||||
|
- Add group (hccl_domain) configuration in json for slow node detection.
|
||||||
|
|
||||||
* Tue Nov 5 2024 huangbin <huangbin58@huawei.com> - 1.2.1-1
|
* Tue Nov 5 2024 huangbin <huangbin58@huawei.com> - 1.2.1-1
|
||||||
- Add new feature slow node detection.
|
- Add new feature slow node detection.
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user