diff --git a/Add-disk-throughput-detector.patch b/Add-disk-throughput-detector.patch deleted file mode 100644 index 7a0cbed..0000000 --- a/Add-disk-throughput-detector.patch +++ /dev/null @@ -1,478 +0,0 @@ -From ac1383471f72420e3320eb7c7999021f3658fb7d Mon Sep 17 00:00:00 2001 -From: lizhenxing11 -Date: Wed, 7 Dec 2022 16:59:15 +0800 -Subject: [PATCH] Add disk throughput detector - -add keywords - -extract cause metric to the attributes - -update template ---- - anteater/config.py | 3 - - anteater/core/kpi.py | 1 + - anteater/main.py | 2 + - anteater/model/algorithms/three_sigma.py | 2 +- - anteater/module/base.py | 6 +- - anteater/module/sys/disk_throughput.py | 62 +++++++++++++ - anteater/module/sys/proc_io_latency.py | 4 +- - anteater/source/anomaly_report.py | 3 +- - anteater/template/app_anomaly_template.py | 4 +- - anteater/template/sys_anomaly_template.py | 4 +- - anteater/template/template.py | 3 +- - anteater/utils/data_load.py | 2 + - config/module/app_sli_rtt.json | 3 + - config/module/disk_throughput.json | 92 +++++++++++++++++++ - config/module/proc_io_latency.json | 3 + - config/module/sys_io_latency.json | 3 + - config/module/sys_tcp_establish.json | 3 + - .../module/sys_tcp_transmission_latency.json | 3 + - .../sys_tcp_transmission_throughput.json | 3 + - 19 files changed, 193 insertions(+), 13 deletions(-) - create mode 100644 anteater/module/sys/disk_throughput.py - create mode 100644 config/module/disk_throughput.json - -diff --git a/anteater/config.py b/anteater/config.py -index ea02702..e9ab557 100644 ---- a/anteater/config.py -+++ b/anteater/config.py -@@ -81,9 +81,6 @@ class AnteaterConf: - """Loads config from yaml file""" - data_path = os.path.realpath(data_path) - -- if not os.path.exists(data_path): -- os.makedirs(data_path) -- - try: - with open(os.path.join(data_path, "config", self.filename), "rb") as f: - result = yaml.safe_load(f) -diff --git a/anteater/core/kpi.py b/anteater/core/kpi.py -index 5a9d8ab..3480139 100644 ---- a/anteater/core/kpi.py -+++ b/anteater/core/kpi.py -@@ -48,6 +48,7 @@ class ModelConfig: - class JobConfig: - name: str - job_type: str -+ keywords: List[str] - root_cause_number: int - kpis: List[KPI] - features: List[Feature] -diff --git a/anteater/main.py b/anteater/main.py -index 11e0409..ba7be70 100644 ---- a/anteater/main.py -+++ b/anteater/main.py -@@ -21,6 +21,7 @@ from apscheduler.schedulers.blocking import BlockingScheduler - from anteater.anomaly_detection import AnomalyDetection - from anteater.config import AnteaterConf - from anteater.module.app.app_sli_detector import APPSliDetector -+from anteater.module.sys.disk_throughput import DiskThroughputDetector - from anteater.module.sys.proc_io_latency import ProcIOLatencyDetector - from anteater.module.sys.sys_io_latency import SysIOLatencyDetector - from anteater.module.sys.tcp_establish import SysTcpEstablishDetector -@@ -57,6 +58,7 @@ def main(): - SysTcpTransmissionLatencyDetector(loader, report), - SysIOLatencyDetector(loader, report), - ProcIOLatencyDetector(loader, report), -+ DiskThroughputDetector(loader, report), - ] - else: - detectors = [ -diff --git a/anteater/model/algorithms/three_sigma.py b/anteater/model/algorithms/three_sigma.py -index 457b606..49b9952 100644 ---- a/anteater/model/algorithms/three_sigma.py -+++ b/anteater/model/algorithms/three_sigma.py -@@ -31,7 +31,7 @@ def three_sigma(values, obs_size, n=3, method="abs"): - elif method == 'min': - outlier = [val for val in obs_val if val < mean - n * std] - elif method == 'max': -- outlier = [val for val in 
obs_val if val > mean + 3 * std] -+ outlier = [val for val in obs_val if val > mean + n * std] - else: - raise ValueError(f'Unknown method {method}') - -diff --git a/anteater/module/base.py b/anteater/module/base.py -index 7b5fc84..63436ac 100644 ---- a/anteater/module/base.py -+++ b/anteater/module/base.py -@@ -48,14 +48,14 @@ class E2EDetector: - for detector in self.detectors: - anomalies = detector.execute(self.job_config) - for anomaly in anomalies: -- self.report(anomaly) -+ self.report(anomaly, self.job_config.keywords) - - @abstractmethod - def parse_cause_metrics(self, anomaly: Anomaly) -> List[Dict]: - """Parses the cause metrics into the specific formats""" - pass - -- def report(self, anomaly: Anomaly): -+ def report(self, anomaly: Anomaly, keywords): - """Parses the anomaly into a specific formats - based on the template and reports parsed results - """ -@@ -63,4 +63,4 @@ class E2EDetector: - timestamp = dt.utc_now() - template = self.template(timestamp, anomaly.machine_id, - anomaly.metric, anomaly.entity_name) -- self.reporter.sent_anomaly(anomaly, cause_metrics, template) -+ self.reporter.sent_anomaly(anomaly, cause_metrics, keywords, template) -diff --git a/anteater/module/sys/disk_throughput.py b/anteater/module/sys/disk_throughput.py -new file mode 100644 -index 0000000..9a192fb ---- /dev/null -+++ b/anteater/module/sys/disk_throughput.py -@@ -0,0 +1,62 @@ -+#!/usr/bin/python3 -+# ****************************************************************************** -+# Copyright (c) 2022 Huawei Technologies Co., Ltd. -+# gala-anteater is licensed under Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, -+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, -+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. -+# See the Mulan PSL v2 for more details. 
-+# ******************************************************************************/ -+ -+from typing import List, Dict -+ -+from anteater.core.anomaly import Anomaly -+from anteater.module.base import E2EDetector -+from anteater.model.detector.online_vae_detector import OnlineVAEDetector -+from anteater.model.detector.n_sigma_detector import NSigmaDetector -+from anteater.source.anomaly_report import AnomalyReport -+from anteater.source.metric_loader import MetricLoader -+from anteater.template.sys_anomaly_template import SysAnomalyTemplate -+ -+ -+class DiskThroughputDetector(E2EDetector): -+ """Disk throughput e2e detector which detects the disk read or write -+ await time performance deteriorates -+ """ -+ -+ config_file = 'disk_throughput.json' -+ -+ def __init__(self, data_loader: MetricLoader, reporter: AnomalyReport): -+ """The disk throughput e2e detector initializer""" -+ super().__init__(reporter, SysAnomalyTemplate) -+ -+ self.detectors = self.init_detectors(data_loader) -+ -+ def init_detectors(self, data_loader): -+ if self.job_config.model_config.enable: -+ detectors = [ -+ NSigmaDetector(data_loader, method='max'), -+ OnlineVAEDetector(data_loader, self.job_config.model_config) -+ ] -+ else: -+ detectors = [ -+ NSigmaDetector(data_loader, method='max') -+ ] -+ -+ return detectors -+ -+ def parse_cause_metrics(self, anomaly: Anomaly) -> List[Dict]: -+ """Parses the cause metrics into the specific formats""" -+ cause_metrics = [ -+ { -+ 'metric': cause.ts.metric, -+ 'labels': cause.ts.labels, -+ 'score': cause.score, -+ 'description': cause.description.format( -+ cause.ts.labels.get('disk_name', ''))} -+ for cause in anomaly.root_causes] -+ -+ return cause_metrics -diff --git a/anteater/module/sys/proc_io_latency.py b/anteater/module/sys/proc_io_latency.py -index 94fd05d..43e069f 100644 ---- a/anteater/module/sys/proc_io_latency.py -+++ b/anteater/module/sys/proc_io_latency.py -@@ -38,12 +38,12 @@ class ProcIOLatencyDetector(E2EDetector): - def init_detectors(self, data_loader): - if self.job_config.model_config.enable: - detectors = [ -- NSigmaDetector(data_loader, method='min'), -+ NSigmaDetector(data_loader, method='abs'), - OnlineVAEDetector(data_loader, self.job_config.model_config) - ] - else: - detectors = [ -- NSigmaDetector(data_loader, method='min') -+ NSigmaDetector(data_loader, method='abs') - ] - - return detectors -diff --git a/anteater/source/anomaly_report.py b/anteater/source/anomaly_report.py -index b226763..3d3bb09 100644 ---- a/anteater/source/anomaly_report.py -+++ b/anteater/source/anomaly_report.py -@@ -42,7 +42,7 @@ class AnomalyReport: - - return keys - -- def sent_anomaly(self, anomaly: Anomaly, cause_metrics: List, template: Template): -+ def sent_anomaly(self, anomaly: Anomaly, cause_metrics: List, keywords: List[str], template: Template): - keys = self.get_keys(template.entity_name) - machine_id = template.machine_id - entity_name = template.entity_name -@@ -54,6 +54,7 @@ class AnomalyReport: - template.keys = keys - template.description = anomaly.description - template.cause_metrics = cause_metrics -+ template.keywords = keywords - - msg = template.get_template() - self.provider.send_message(msg) -diff --git a/anteater/template/app_anomaly_template.py b/anteater/template/app_anomaly_template.py -index 5b8caf8..a509c96 100644 ---- a/anteater/template/app_anomaly_template.py -+++ b/anteater/template/app_anomaly_template.py -@@ -31,7 +31,9 @@ class AppAnomalyTemplate(Template): - 'entity_id': self.entity_id, - 'event_id': 
f'{timestamp}_{self.entity_id}', - 'event_type': 'app', -- 'event_source': 'gala-anteater' -+ 'event_source': 'gala-anteater', -+ 'keywords': self.keywords, -+ 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'} - }, - 'Resource': { - 'metric': self.metric, -diff --git a/anteater/template/sys_anomaly_template.py b/anteater/template/sys_anomaly_template.py -index 1083fb3..4ac6abb 100644 ---- a/anteater/template/sys_anomaly_template.py -+++ b/anteater/template/sys_anomaly_template.py -@@ -31,7 +31,9 @@ class SysAnomalyTemplate(Template): - 'entity_id': self.entity_id, - 'event_id': f'{timestamp}_{self.entity_id}', - 'event_type': 'sys', -- 'event_source': 'gala-anteater' -+ 'event_source': 'gala-anteater', -+ 'keywords': self.keywords, -+ 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'} - }, - 'Resource': { - 'metric': self.metric, -diff --git a/anteater/template/template.py b/anteater/template/template.py -index 9e4461a..794c121 100644 ---- a/anteater/template/template.py -+++ b/anteater/template/template.py -@@ -26,7 +26,8 @@ class Template: - self.labels = {} - self.entity_id = "" - self.description = "" -- self.cause_metrics = {} -+ self.cause_metrics = [] -+ self.keywords = [] - - @abstractmethod - def get_template(self): -diff --git a/anteater/utils/data_load.py b/anteater/utils/data_load.py -index 6ac92c7..b6991c6 100644 ---- a/anteater/utils/data_load.py -+++ b/anteater/utils/data_load.py -@@ -45,6 +45,7 @@ def load_job_config(file_name) -> JobConfig: - - name = config['name'] - job_type = config['job_type'] -+ keywords = config['keywords'] - root_cause_number = config['root_cause_number'] - kpis = [KPI(**_conf) for _conf in config['KPI']] - features = [Feature(**_conf) for _conf in config['Features']] -@@ -74,6 +75,7 @@ def load_job_config(file_name) -> JobConfig: - return JobConfig( - name=name, - job_type=job_type, -+ keywords=keywords, - root_cause_number=root_cause_number, - kpis=kpis, - features=features, -diff --git a/config/module/app_sli_rtt.json b/config/module/app_sli_rtt.json -index 7c05094..db29392 100644 ---- a/config/module/app_sli_rtt.json -+++ b/config/module/app_sli_rtt.json -@@ -1,6 +1,9 @@ - { - "name": "app_sli_rtt", - "job_type": "app", -+ "keywords": [ -+ "app" -+ ], - "root_cause_number": 20, - "KPI": [ - { -diff --git a/config/module/disk_throughput.json b/config/module/disk_throughput.json -new file mode 100644 -index 0000000..00276c0 ---- /dev/null -+++ b/config/module/disk_throughput.json -@@ -0,0 +1,92 @@ -+{ -+ "name": "disk_throughput", -+ "job_type": "sys", -+ "keywords": [ -+ "disk" -+ ], -+ "root_cause_number": 1, -+ "KPI": [ -+ { -+ "metric": "gala_gopher_disk_r_await", -+ "kpi_type": "", -+ "entity_name": "disk", -+ "enable": true, -+ "description": "Disk read await time is increasing!", -+ "params": { -+ "look_back": 20, -+ "obs_size": 25, -+ "outlier_ratio_th": 0.3, -+ "smooth_params": { -+ "method": "conv_smooth", -+ "box_pts": 3 -+ } -+ } -+ }, -+ { -+ "metric": "gala_gopher_disk_w_await", -+ "kpi_type": "", -+ "entity_name": "disk", -+ "enable": true, -+ "description": "Disk write await time is increasing!", -+ "params": { -+ "look_back": 20, -+ "obs_size": 25, -+ "outlier_ratio_th": 0.3, -+ "smooth_params": { -+ "method": "conv_smooth", -+ "box_pts": 3 -+ } -+ } -+ } -+ ], -+ "OnlineModel": { -+ "name": "online_vae_model", -+ "enable": false, -+ "params": { -+ "th": 0.5, -+ "max_error_rate": 0.7, -+ "min_retrain_hours": 24, -+ "min_predict_minutes": 20, -+ 
"norm": {}, -+ "vae": { -+ "hidden_sizes": [25, 10, 5], -+ "latent_size": 5, -+ "dropout_rate": 0.25, -+ "batch_size": 1024, -+ "num_epochs": 30, -+ "learning_rate": 0.001, -+ "k": 120, -+ "step_size": 60, -+ "num_eval_samples": 10 -+ }, -+ "calibrate": {}, -+ "threshold": {} -+ } -+ }, -+ "Features": [ -+ { -+ "metric": "gala_gopher_disk_rspeed_kB", -+ "priority": 0, -+ "description": "The disk I/O await time performance deteriorates due to read throughput rise (read kbytes/second).(Disk = {})", -+ "atrend": "rise" -+ }, -+ { -+ "metric": "gala_gopher_disk_wspeed_kB", -+ "priority": 0, -+ "description": "The disk I/O await time performance deteriorates due to write throughput rise (write kbytes/second).(Disk = {})", -+ "atrend": "rise" -+ }, -+ { -+ "metric": "gala_gopher_disk_rareq", -+ "priority": 0, -+ "description": "The disk I/O await time performance deteriorates due to read saturation rise.(Disk = {})", -+ "atrend": "rise" -+ }, -+ { -+ "metric": "gala_gopher_disk_wareq", -+ "priority": 0, -+ "description": "The disk I/O await time performance deteriorates due to write saturation rise.(Disk = {})", -+ "atrend": "rise" -+ } -+ ] -+} -\ No newline at end of file -diff --git a/config/module/proc_io_latency.json b/config/module/proc_io_latency.json -index c45b7df..c6c03c1 100644 ---- a/config/module/proc_io_latency.json -+++ b/config/module/proc_io_latency.json -@@ -1,6 +1,9 @@ - { - "name": "proc_io_latency", - "job_type": "sys", -+ "keywords": [ -+ "process" -+ ], - "root_cause_number": 3, - "KPI": [ - { -diff --git a/config/module/sys_io_latency.json b/config/module/sys_io_latency.json -index e92dd4c..e58990d 100644 ---- a/config/module/sys_io_latency.json -+++ b/config/module/sys_io_latency.json -@@ -1,6 +1,9 @@ - { - "name": "sys_io_latency", - "job_type": "sys", -+ "keywords": [ -+ "block" -+ ], - "root_cause_number": 3, - "KPI": [ - { -diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json -index b6f8eb4..61ae72d 100644 ---- a/config/module/sys_tcp_establish.json -+++ b/config/module/sys_tcp_establish.json -@@ -1,6 +1,9 @@ - { - "name": "sys_tcp_establish", - "job_type": "sys", -+ "keywords": [ -+ "tcp" -+ ], - "root_cause_number": 3, - "KPI": [ - { -diff --git a/config/module/sys_tcp_transmission_latency.json b/config/module/sys_tcp_transmission_latency.json -index 4927d8e..d9e7f80 100644 ---- a/config/module/sys_tcp_transmission_latency.json -+++ b/config/module/sys_tcp_transmission_latency.json -@@ -1,6 +1,9 @@ - { - "name": "sys_tcp_transmission_latency", - "job_type": "sys", -+ "keywords": [ -+ "tcp" -+ ], - "root_cause_number": 3, - "KPI": [ - { -diff --git a/config/module/sys_tcp_transmission_throughput.json b/config/module/sys_tcp_transmission_throughput.json -index 060f640..28ee784 100644 ---- a/config/module/sys_tcp_transmission_throughput.json -+++ b/config/module/sys_tcp_transmission_throughput.json -@@ -1,6 +1,9 @@ - { - "name": "sys_tcp_transmission_throughput", - "job_type": "sys", -+ "keywords": [ -+ "net" -+ ], - "root_cause_number": 3, - "KPI": [ - { --- -2.33.0 - diff --git a/Update-TCP-Establish-Model-Add-Nic-Loss-Detector.patch b/Update-TCP-Establish-Model-Add-Nic-Loss-Detector.patch deleted file mode 100644 index 782a879..0000000 --- a/Update-TCP-Establish-Model-Add-Nic-Loss-Detector.patch +++ /dev/null @@ -1,377 +0,0 @@ -From dd870b17120f3c7961c4613d454f1653fbd42214 Mon Sep 17 00:00:00 2001 -From: lizhenxing11 -Date: Tue, 27 Dec 2022 18:39:32 +0800 -Subject: [PATCH] Update TCP Establish Model & Add Nic Loss Detector - 
-change method 'abs' to 'max' ---- - anteater/main.py | 2 + - anteater/model/algorithms/three_sigma.py | 4 +- - anteater/model/detector/n_sigma_detector.py | 4 +- - .../tcp_establish_n_sigma_detector.py | 12 +++- - anteater/model/detector/th_base_detector.py | 66 +++++++++++++++++++ - anteater/module/sys/nic_loss.py | 59 +++++++++++++++++ - anteater/module/sys/proc_io_latency.py | 4 +- - anteater/template/app_anomaly_template.py | 2 + - anteater/template/sys_anomaly_template.py | 1 + - config/module/sys_nic_loss.json | 53 +++++++++++++++ - config/module/sys_tcp_establish.json | 3 +- - 11 files changed, 200 insertions(+), 10 deletions(-) - create mode 100644 anteater/model/detector/th_base_detector.py - create mode 100644 anteater/module/sys/nic_loss.py - create mode 100644 config/module/sys_nic_loss.json - -diff --git a/anteater/main.py b/anteater/main.py -index ba7be70..4de72f9 100644 ---- a/anteater/main.py -+++ b/anteater/main.py -@@ -22,6 +22,7 @@ from anteater.anomaly_detection import AnomalyDetection - from anteater.config import AnteaterConf - from anteater.module.app.app_sli_detector import APPSliDetector - from anteater.module.sys.disk_throughput import DiskThroughputDetector -+from anteater.module.sys.nic_loss import NICLossDetector - from anteater.module.sys.proc_io_latency import ProcIOLatencyDetector - from anteater.module.sys.sys_io_latency import SysIOLatencyDetector - from anteater.module.sys.tcp_establish import SysTcpEstablishDetector -@@ -59,6 +60,7 @@ def main(): - SysIOLatencyDetector(loader, report), - ProcIOLatencyDetector(loader, report), - DiskThroughputDetector(loader, report), -+ NICLossDetector(loader, report), - ] - else: - detectors = [ -diff --git a/anteater/model/algorithms/three_sigma.py b/anteater/model/algorithms/three_sigma.py -index 49b9952..0865417 100644 ---- a/anteater/model/algorithms/three_sigma.py -+++ b/anteater/model/algorithms/three_sigma.py -@@ -14,8 +14,8 @@ - import numpy as np - - --def three_sigma(values, obs_size, n=3, method="abs"): -- """The '3-sigma rule' outlier detect function""" -+def n_sigma(values, obs_size, n=3, method="abs"): -+ """The 'N-sigma rule' outlier detect function""" - if obs_size <= 0: - raise ValueError("The obs_size should great than zero!") - if len(values) <= obs_size: -diff --git a/anteater/model/detector/n_sigma_detector.py b/anteater/model/detector/n_sigma_detector.py -index f632326..3a2ab01 100644 ---- a/anteater/model/detector/n_sigma_detector.py -+++ b/anteater/model/detector/n_sigma_detector.py -@@ -19,7 +19,7 @@ from anteater.core.kpi import KPI - from anteater.core.time_series import TimeSeriesScore - from anteater.model.detector.base import Detector - from anteater.model.algorithms.smooth import smoothing --from anteater.model.algorithms.three_sigma import three_sigma -+from anteater.model.algorithms.three_sigma import n_sigma - from anteater.source.metric_loader import MetricLoader - from anteater.utils.common import divide - from anteater.utils.datetime import DateTimeManager as dt -@@ -91,7 +91,7 @@ class NSigmaDetector(Detector): - ratio = 0 - else: - smoothed_val = smoothing(_ts.values, **smooth_params) -- outlier, mean, std = three_sigma( -+ outlier, mean, std = n_sigma( - smoothed_val, obs_size=obs_size, n=n, method=self.method) - ratio = divide(len(outlier), obs_size) - -diff --git a/anteater/model/detector/tcp_establish_n_sigma_detector.py b/anteater/model/detector/tcp_establish_n_sigma_detector.py -index 8dcf9ae..82d7837 100644 ---- a/anteater/model/detector/tcp_establish_n_sigma_detector.py 
-+++ b/anteater/model/detector/tcp_establish_n_sigma_detector.py -@@ -42,8 +42,13 @@ class TcpEstablishNSigmaDetector(Detector): - start, _ = dt.last(minutes=look_back) - mid, _ = dt.last(minutes=3) - -+ filtered_ts_list = [] - ts_list = self.data_loader.get_metric(start, mid, kpi.metric) -- establish_time = reduce(lambda x, y: x + y, [list(set(_ts.values)) for _ts in ts_list]) -+ for _ts in ts_list: -+ if sum(_ts.values) > 0: -+ filtered_ts_list.append(_ts) -+ -+ establish_time = reduce(lambda x, y: x + y, [list(set(_ts.values)) for _ts in filtered_ts_list]) - - self.mean = np.mean(establish_time) - self.std = np.std(establish_time) -@@ -65,6 +70,7 @@ class TcpEstablishNSigmaDetector(Detector): - """Detects kpi based on signal time series anomaly detection model""" - outlier_ratio_th = kpi.params.get('outlier_ratio_th') - look_back = kpi.params.get('obs_size') -+ min_rtt = kpi.params.get('min_rtt') - - start, end = dt.last(minutes=look_back) - ts_list = self.data_loader.\ -@@ -72,9 +78,9 @@ class TcpEstablishNSigmaDetector(Detector): - - anomalies = [] - for _ts in ts_list: -- outlier = [val for val in _ts.values if abs(val - self.mean) > 3 * self.std] -+ outlier = [val for val in _ts.values if val > self.mean + 5 * self.std] - ratio = divide(len(outlier), len(_ts.values)) -- if outlier and ratio > outlier_ratio_th: -+ if outlier and ratio > outlier_ratio_th and np.average(outlier) >= min_rtt: - anomalies.append( - Anomaly( - machine_id=machine_id, -diff --git a/anteater/model/detector/th_base_detector.py b/anteater/model/detector/th_base_detector.py -new file mode 100644 -index 0000000..bec9705 ---- /dev/null -+++ b/anteater/model/detector/th_base_detector.py -@@ -0,0 +1,66 @@ -+#!/usr/bin/python3 -+# ****************************************************************************** -+# Copyright (c) 2022 Huawei Technologies Co., Ltd. -+# gala-anteater is licensed under Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, -+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, -+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. -+# See the Mulan PSL v2 for more details. 
-+# ******************************************************************************/ -+ -+from typing import List -+ -+from anteater.core.anomaly import Anomaly -+from anteater.core.kpi import KPI -+from anteater.model.detector.base import Detector -+from anteater.source.metric_loader import MetricLoader -+from anteater.utils.datetime import DateTimeManager as dt -+from anteater.utils.log import logger -+ -+ -+class ThBaseDetector(Detector): -+ """The threshold-based anomaly detector""" -+ -+ def __init__(self, data_loader: MetricLoader): -+ """The detector base class initializer""" -+ super().__init__(data_loader) -+ -+ def detect_kpis(self, kpis: List[KPI]): -+ """Executes anomaly detection on kpis""" -+ start, end = dt.last(minutes=1) -+ machine_ids = self.get_unique_machine_id(start, end, kpis) -+ anomalies = [] -+ for _id in machine_ids: -+ for kpi in kpis: -+ anomalies.extend(self.detect_signal_kpi(kpi, _id)) -+ -+ return anomalies -+ -+ def detect_signal_kpi(self, kpi, machine_id: str) -> List[Anomaly]: -+ """Detects kpi based on threshold based anomaly detection model""" -+ look_back = kpi.params.get('look_back') -+ th = kpi.params.get('th') -+ start, end = dt.last(minutes=look_back) -+ ts_list = self.data_loader.\ -+ get_metric(start, end, kpi.metric, label_name='machine_id', label_value=machine_id) -+ -+ if not ts_list: -+ logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!') -+ return [] -+ -+ anomalies = [ -+ Anomaly( -+ machine_id=machine_id, -+ metric=_ts.metric, -+ labels=_ts.labels, -+ score=1, -+ entity_name=kpi.entity_name, -+ description=kpi.description) -+ for _ts in ts_list -+ if sum(_ts.values) >= th -+ ] -+ -+ return anomalies -diff --git a/anteater/module/sys/nic_loss.py b/anteater/module/sys/nic_loss.py -new file mode 100644 -index 0000000..d24e06f ---- /dev/null -+++ b/anteater/module/sys/nic_loss.py -@@ -0,0 +1,59 @@ -+#!/usr/bin/python3 -+# ****************************************************************************** -+# Copyright (c) 2022 Huawei Technologies Co., Ltd. -+# gala-anteater is licensed under Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, -+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, -+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. -+# See the Mulan PSL v2 for more details. -+# ******************************************************************************/ -+ -+from typing import List, Dict -+ -+from anteater.core.anomaly import Anomaly -+from anteater.model.detector.th_base_detector import ThBaseDetector -+from anteater.module.base import E2EDetector -+from anteater.source.anomaly_report import AnomalyReport -+from anteater.source.metric_loader import MetricLoader -+from anteater.template.sys_anomaly_template import SysAnomalyTemplate -+ -+ -+class NICLossDetector(E2EDetector): -+ """SYS nic loss e2e detector which detects the network loss. 
-+ """ -+ -+ config_file = 'sys_nic_loss.json' -+ -+ def __init__(self, data_loader: MetricLoader, reporter: AnomalyReport): -+ """The system tcp transmission latency e2e detector initializer""" -+ super().__init__(reporter, SysAnomalyTemplate) -+ -+ self.detectors = [ -+ ThBaseDetector(data_loader) -+ ] -+ -+ def parse_cause_metrics(self, anomaly: Anomaly) -> List[Dict]: -+ """Parses the cause metrics into the specific formats""" -+ cause_metrics = [] -+ for _cs in anomaly.root_causes: -+ tmp = { -+ 'metric': _cs.ts.metric, -+ 'labels': _cs.ts.labels, -+ 'score': _cs.score, -+ } -+ if 'tcp' in _cs.ts.metric: -+ tmp['description'] = _cs.description.format( -+ _cs.ts.labels.get('tgid', ''), -+ _cs.ts.labels.get('client_port', ''), -+ _cs.ts.labels.get('server_ip', ''), -+ _cs.ts.labels.get('server_port', '')) -+ else: -+ tmp['description'] = _cs.description.format( -+ _cs.ts.labels.get('dev_name', '')) -+ -+ cause_metrics.append(tmp) -+ -+ return cause_metrics -diff --git a/anteater/module/sys/proc_io_latency.py b/anteater/module/sys/proc_io_latency.py -index 43e069f..a34c48d 100644 ---- a/anteater/module/sys/proc_io_latency.py -+++ b/anteater/module/sys/proc_io_latency.py -@@ -38,12 +38,12 @@ class ProcIOLatencyDetector(E2EDetector): - def init_detectors(self, data_loader): - if self.job_config.model_config.enable: - detectors = [ -- NSigmaDetector(data_loader, method='abs'), -+ NSigmaDetector(data_loader, method='max'), - OnlineVAEDetector(data_loader, self.job_config.model_config) - ] - else: - detectors = [ -- NSigmaDetector(data_loader, method='abs') -+ NSigmaDetector(data_loader, method='max') - ] - - return detectors -diff --git a/anteater/template/app_anomaly_template.py b/anteater/template/app_anomaly_template.py -index a509c96..4df4a35 100644 ---- a/anteater/template/app_anomaly_template.py -+++ b/anteater/template/app_anomaly_template.py -@@ -46,6 +46,8 @@ class AppAnomalyTemplate(Template): - 'SeverityNumber': 13, - 'Body': f'{self.timestamp.strftime("%c")} WARN, APP may be impacting sli performance issues.', - 'event_id': f'{timestamp}_{self.entity_id}', -+ "keywords": self.keywords, -+ 'cause_metrics': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'} - } - - return result -diff --git a/anteater/template/sys_anomaly_template.py b/anteater/template/sys_anomaly_template.py -index 4ac6abb..aec6ea0 100644 ---- a/anteater/template/sys_anomaly_template.py -+++ b/anteater/template/sys_anomaly_template.py -@@ -46,6 +46,7 @@ class SysAnomalyTemplate(Template): - 'SeverityNumber': 13, - 'Body': f'{self.timestamp.strftime("%c")} WARN, SYS may be impacting performance issues.', - 'event_id': f'{timestamp}_{self.entity_id}', -+ "keywords": self.keywords - } - - return result -diff --git a/config/module/sys_nic_loss.json b/config/module/sys_nic_loss.json -new file mode 100644 -index 0000000..793f82f ---- /dev/null -+++ b/config/module/sys_nic_loss.json -@@ -0,0 +1,53 @@ -+{ -+ "name": "sys_tcp_transmission_latency", -+ "job_type": "sys", -+ "keywords": [ -+ "net" -+ ], -+ "root_cause_number": 3, -+ "KPI": [ -+ { -+ "metric": "gala_gopher_nic_tc_sent_drop", -+ "kpi_type": "", -+ "entity_name": "nic", -+ "enable": true, -+ "description": "TC发送丢包数异常", -+ "params": { -+ "look_back": 2, -+ "th": 1 -+ } -+ } -+ ], -+ "Features": [ -+ { -+ "metric": "gala_gopher_nic_tx_dropped", -+ "priority": 0, -+ "description": "网卡发送丢弃的数据包数异常。(dev_name = {})" -+ }, -+ { -+ "metric": "gala_gopher_nic_rx_dropped", -+ "priority": 0, -+ "description": "网卡接收丢弃的数据包数异常。(dev_name = {})" -+ }, 
-+ { -+ "metric": "gala_gopher_tcp_link_sk_drops", -+ "priority": 3, -+ "description": "Packets are lost in the host protocol stack due to unknown causes, and the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})" -+ }, -+ { -+ "metric": "gala_gopher_tcp_link_retran_packets", -+ "priority": 1, -+ "description": "TCP retransmission is triggered due to network faults, resulting in TCP performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})" -+ }, -+ { -+ "metric": "gala_gopher_tcp_link_lost_out", -+ "priority": 3, -+ "description": "The network may be congested, causing abnormal TCP packet loss and performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})" -+ }, -+ { -+ "metric": "gala_gopher_tcp_link_notsent_bytes", -+ "priority": 4, -+ "description": "Due to network delay or peer application performance, too many packets to be sent are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})" -+ } -+ ] -+} -\ No newline at end of file -diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json -index 61ae72d..2c158c0 100644 ---- a/config/module/sys_tcp_establish.json -+++ b/config/module/sys_tcp_establish.json -@@ -15,7 +15,8 @@ - "params": { - "look_back": 30, - "outlier_ratio_th": 0.5, -- "obs_size": 3 -+ "obs_size": 3, -+ "min_rtt": 500000 - } - } - ], --- -2.33.0 - diff --git a/add-chinese-descriptions.patch b/add-chinese-descriptions.patch deleted file mode 100644 index 43bfbd3..0000000 --- a/add-chinese-descriptions.patch +++ /dev/null @@ -1,533 +0,0 @@ -From e0e99ac8fc3de9e8781f5d7acd5e9fe1832461b0 Mon Sep 17 00:00:00 2001 -From: lizhenxing11 -Date: Tue, 3 Jan 2023 15:27:45 +0800 -Subject: [PATCH] add chinese descriptions - -update description - -fix typo - -update th ---- - anteater/core/kpi.py | 2 +- - anteater/template/app_anomaly_template.py | 5 ++- - anteater/template/sys_anomaly_template.py | 2 +- - anteater/utils/data_load.py | 14 ++++++-- - config/module/app_sli_rtt.json | 2 ++ - config/module/disk_throughput.json | 6 ++++ - config/module/proc_io_latency.json | 31 +++++++++++----- - config/module/sys_io_latency.json | 25 ++++++++----- - config/module/sys_nic_loss.json | 21 +++++++---- - config/module/sys_tcp_establish.json | 4 ++- - .../module/sys_tcp_transmission_latency.json | 36 ++++++++++++------- - 11 files changed, 104 insertions(+), 44 deletions(-) - -diff --git a/anteater/core/kpi.py b/anteater/core/kpi.py -index 3480139..f83b666 100644 ---- a/anteater/core/kpi.py -+++ b/anteater/core/kpi.py -@@ -23,7 +23,7 @@ class KPI: - kpi_type: str - entity_name: str - enable: bool -- description: str = "" -+ description: str - params: dict = field(default=dict) - atrend: AnomalyTrend = AnomalyTrend.DEFAULT - -diff --git a/anteater/template/app_anomaly_template.py b/anteater/template/app_anomaly_template.py -index 4df4a35..3770d2e 100644 ---- a/anteater/template/app_anomaly_template.py -+++ b/anteater/template/app_anomaly_template.py -@@ -33,7 +33,7 @@ class AppAnomalyTemplate(Template): - 'event_type': 'app', - 'event_source': 'gala-anteater', - 'keywords': self.keywords, -- 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'} -+ 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': self.description} - }, - 'Resource': { - 'metric': self.metric, -@@ -46,8 +46,7 @@ class AppAnomalyTemplate(Template): - 'SeverityNumber': 13, - 'Body': 
f'{self.timestamp.strftime("%c")} WARN, APP may be impacting sli performance issues.', - 'event_id': f'{timestamp}_{self.entity_id}', -- "keywords": self.keywords, -- 'cause_metrics': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'} -+ "keywords": self.keywords - } - - return result -diff --git a/anteater/template/sys_anomaly_template.py b/anteater/template/sys_anomaly_template.py -index aec6ea0..d3c7e82 100644 ---- a/anteater/template/sys_anomaly_template.py -+++ b/anteater/template/sys_anomaly_template.py -@@ -33,7 +33,7 @@ class SysAnomalyTemplate(Template): - 'event_type': 'sys', - 'event_source': 'gala-anteater', - 'keywords': self.keywords, -- 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'} -+ 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': self.description} - }, - 'Resource': { - 'metric': self.metric, -diff --git a/anteater/utils/data_load.py b/anteater/utils/data_load.py -index b6991c6..730c9c6 100644 ---- a/anteater/utils/data_load.py -+++ b/anteater/utils/data_load.py -@@ -47,8 +47,9 @@ def load_job_config(file_name) -> JobConfig: - job_type = config['job_type'] - keywords = config['keywords'] - root_cause_number = config['root_cause_number'] -- kpis = [KPI(**_conf) for _conf in config['KPI']] -- features = [Feature(**_conf) for _conf in config['Features']] -+ -+ kpis = [KPI(**update_description(_conf)) for _conf in config['KPI']] -+ features = [Feature(**update_description(_conf)) for _conf in config['Features']] - - model_config = None - if 'OnlineModel' in config: -@@ -81,3 +82,12 @@ def load_job_config(file_name) -> JobConfig: - features=features, - model_config=model_config - ) -+ -+ -+def update_description(conf: dict): -+ """Changes description to zh""" -+ if 'description-zh' in conf: -+ conf['description'] = conf['description-zh'] -+ del conf['description-zh'] -+ -+ return conf -diff --git a/config/module/app_sli_rtt.json b/config/module/app_sli_rtt.json -index db29392..0146883 100644 ---- a/config/module/app_sli_rtt.json -+++ b/config/module/app_sli_rtt.json -@@ -12,6 +12,7 @@ - "entity_name": "sli", - "enable": false, - "description": "sli rtt 异常", -+ "description-zh": "应用级请求往返时延(RTT)异常", - "params": { - "look_back": 10, - "obs_size": 25, -@@ -28,6 +29,7 @@ - "entity_name": "sli", - "enable": true, - "description": "sli tps 异常", -+ "description-zh": "应用级请求吞吐量(TPS)异常", - "params": { - "look_back": 10, - "obs_size": 25, -diff --git a/config/module/disk_throughput.json b/config/module/disk_throughput.json -index 00276c0..f6244f6 100644 ---- a/config/module/disk_throughput.json -+++ b/config/module/disk_throughput.json -@@ -12,6 +12,7 @@ - "entity_name": "disk", - "enable": true, - "description": "Disk read await time is increasing!", -+ "description-zh": "磁盘读响应时间升高,性能发生劣化", - "params": { - "look_back": 20, - "obs_size": 25, -@@ -28,6 +29,7 @@ - "entity_name": "disk", - "enable": true, - "description": "Disk write await time is increasing!", -+ "description-zh": "磁盘写响应时间升高,性能发生劣化", - "params": { - "look_back": 20, - "obs_size": 25, -@@ -68,24 +70,28 @@ - "metric": "gala_gopher_disk_rspeed_kB", - "priority": 0, - "description": "The disk I/O await time performance deteriorates due to read throughput rise (read kbytes/second).(Disk = {})", -+ "description-zh": "磁盘读吞吐量异常升高,导致I/O等待时间性能劣化(Disk = {})", - "atrend": "rise" - }, - { - "metric": "gala_gopher_disk_wspeed_kB", - "priority": 0, - "description": "The disk I/O await time performance deteriorates due to write throughput 
rise (write kbytes/second).(Disk = {})", -+ "description-zh": "磁盘写吞吐量异常升高,导致I/O等待时间性能劣化(Disk = {})", - "atrend": "rise" - }, - { - "metric": "gala_gopher_disk_rareq", - "priority": 0, - "description": "The disk I/O await time performance deteriorates due to read saturation rise.(Disk = {})", -+ "description-zh": "磁盘读饱和度量异常升高,导致I/O等待时间性能劣化(Disk = {})", - "atrend": "rise" - }, - { - "metric": "gala_gopher_disk_wareq", - "priority": 0, - "description": "The disk I/O await time performance deteriorates due to write saturation rise.(Disk = {})", -+ "description-zh": "磁盘读写饱和度量异常升高,导致I/O等待时间性能劣化(Disk = {})", - "atrend": "rise" - } - ] -diff --git a/config/module/proc_io_latency.json b/config/module/proc_io_latency.json -index c6c03c1..f086b87 100644 ---- a/config/module/proc_io_latency.json -+++ b/config/module/proc_io_latency.json -@@ -12,6 +12,7 @@ - "entity_name": "proc", - "enable": true, - "description": "I/O operation delay at the BIO layer (unit: us)", -+ "description-zh": "BIO层I/O操作延时高(单位:us)", - "params": { - "look_back": 20, - "obs_size": 37, -@@ -28,6 +29,7 @@ - "entity_name": "proc", - "enable": true, - "description": "Number of small I/O (less than 4 KB) read operations at the BIO layer.", -+ "description-zh": "BIO层小数据I/O读操作数量异常(小于4KB)", - "params": { - "look_back": 20, - "obs_size": 25, -@@ -44,6 +46,7 @@ - "entity_name": "proc", - "enable": true, - "description": "Number of small I/O (less than 4 KB) write operations at the BIO layer.", -+ "description-zh": "BIO层小数据I/O写操作数量异常(小于4KB)", - "params": { - "look_back": 20, - "obs_size": 25, -@@ -61,6 +64,7 @@ - "entity_name": "proc", - "enable": true, - "description": "Number of big I/O (greater than 4 KB) read operations at the BIO layer.", -+ "description-zh": "BIO层大数据I/O读操作数量异常(大于4KB)", - "params": { - "look_back": 20, - "obs_size": 25, -@@ -76,7 +80,8 @@ - "kpi_type": "", - "entity_name": "proc", - "enable": true, -- "description": "Number of big I/O (greater than 4 KB) read operations at the BIO layer.", -+ "description": "Number of big I/O (greater than 4 KB) write operations at the BIO layer.", -+ "description-zh": "BIO层大数据写操作数量异常(大于4KB)", - "params": { - "look_back": 20, - "obs_size": 25, -@@ -116,42 +121,50 @@ - { - "metric": "gala_gopher_block_latency_req_max", - "priority": 4, -- "description": "The system I/O performance deteriorates due to a drive failure.(Disk = {})" -+ "description": "Process I/O performance deteriorates due to system I/O bandwidth insufficient.(Disk = {})", -+ "description-zh": "系统I/O带宽不足引起进程I/O性能劣化(Disk={})" - }, - { - "metric": "gala_gopher_block_latency_device_max", - "priority": 3, -- "description": "Degraded system I/O performance due to device (disk) failure.(Disk = {})" -+ "description": "Process I/O performance deteriorates due to device I/O bandwidth insufficient.(Disk = {})", -+ "description-zh": "设备I/O带宽不足引起进程I/O性能劣化(Disk={})" - }, - { - "metric": "gala_gopher_block_read_bytes", - "priority": 2, -- "description": "System performance deteriorates due to frequent read I/O operations.(Disk = {})" -+ "description": "Process I/O performance deteriorates due to frequent read I/O operations.(Disk = {})", -+ "description-zh": "频繁I/O读操作引起进程I/O性能劣化(Disk={})" - }, - { - "metric": "gala_gopher_block_write_bytes", - "priority": 2, -- "description": "System performance deteriorates due to frequent write I/O operations.(Disk = {})" -+ "description": "Process I/O performance deteriorates due to frequent write I/O operations.(Disk = {})", -+ "description-zh": "频繁写操作引起进程I/O性能劣化(Disk={})" - }, - { - "metric": 
"gala_gopher_proc_less_4k_io_read", - "priority": 0, -- "description": "System performance degrades due to frequent small I/O read operations.(Disk = {}, PID = {}, comm = {})" -+ "description": "Process I/O performance degrades due to frequent small I/O read operations.(Disk = {}, PID = {}, comm = {})", -+ "description-zh": "频繁小数据量(小于4KB)读操作引起进程I/O性能劣化(Disk={},PID={},comm={})" - }, - { - "metric": "gala_gopher_proc_less_4k_io_write", - "priority": 0, -- "description": "System performance degrades due to frequent small I/O write operations.(Disk = {}, PID = {}, comm = {})" -+ "description": "Process I/O performance degrades due to frequent small I/O write operations.(Disk = {}, PID = {}, comm = {})", -+ "description-zh": "频繁小数据量(小于4KB)写操作引起进程I/O性能劣化(Disk={},PID={},comm={})" - }, - { - "metric": "gala_gopher_proc_greater_4k_io_read", - "priority": 1, -- "description": "System performance degrades due to frequent big I/O read operations.(Disk = {}, PID = {}, comm = {})" -+ "description": "Process I/O performance degrades due to frequent big I/O read operations.(Disk = {}, PID = {}, comm = {})", -+ "description-zh": "频繁大数据量(大于4KB)读操作引起进程I/O性能劣化(Disk={},PID={},comm={})" - }, - { - "metric": "gala_gopher_proc_greater_4k_io_write", - "priority": 1, -- "description": "System performance degrades due to frequent big I/O write operations.(Disk = {}, PID = {}, comm = {})" -+ "description": "Process I/O performance degrades due to frequent big I/O write operations.(Disk = {}, PID = {}, comm = {})", -+ "description-zh": "频繁大数据量(大于4KB)写操作引起进程I/O性能劣化(Disk={},PID={},comm={})" - } - ] - } -\ No newline at end of file -diff --git a/config/module/sys_io_latency.json b/config/module/sys_io_latency.json -index e58990d..bdf17d3 100644 ---- a/config/module/sys_io_latency.json -+++ b/config/module/sys_io_latency.json -@@ -12,6 +12,7 @@ - "entity_name": "block", - "enable": true, - "description": "Block I/O latency performance is deteriorating!", -+ "description-zh": "Block层I/O操作时延性能劣化", - "params": { - "look_back": 20, - "obs_size": 25, -@@ -51,42 +52,50 @@ - { - "metric": "gala_gopher_block_latency_driver_max", - "priority": 4, -- "description": "The system I/O performance deteriorates due to a drive failure.(Disk = {})" -+ "description": "The system I/O performance deteriorates due to a drive failure.(Disk = {})", -+ "description-zh": "驱动异常引起系统I/O性能劣化(Disk={})" - }, - { - "metric": "gala_gopher_block_latency_device_max", - "priority": 3, -- "description": "Degraded system I/O performance due to device (disk) failure.(Disk = {})" -+ "description": "Degraded system I/O performance due to device (disk) failure.(Disk = {})", -+ "description-zh": "设备(磁盘)异常引起系统I/O性能劣化(Disk={})" - }, - { - "metric": "gala_gopher_block_read_bytes", - "priority": 2, -- "description": "System performance deteriorates due to frequent read I/O operations.(Disk = {})" -+ "description": "System performance deteriorates due to frequent read I/O operations.(Disk = {})", -+ "description-zh": "频繁读操作引起系统I/O性能劣化(Disk={})" - }, - { - "metric": "gala_gopher_block_write_bytes", - "priority": 2, -- "description": "System performance deteriorates due to frequent write I/O operations.(Disk = {})" -+ "description": "System performance deteriorates due to frequent write I/O operations.(Disk = {})", -+ "description-zh": "频繁写操作引起系统I/O性能劣化(Disk={})" - }, - { - "metric": "gala_gopher_proc_less_4k_io_read", - "priority": 0, -- "description": "System performance degrades due to frequent small I/O read operations.(Disk = {}, PID = {}, comm = {})" -+ 
"description": "System performance degrades due to frequent small I/O read operations.(Disk = {}, PID = {}, comm = {})", -+ "description-zh": "频繁小数据量(小于4KB)读操作引起系统I/O性能劣化(Disk={},PID={},comm={})" - }, - { - "metric": "gala_gopher_proc_less_4k_io_write", - "priority": 0, -- "description": "System performance degrades due to frequent small I/O write operations.(Disk = {}, PID = {}, comm = {})" -+ "description": "System performance degrades due to frequent small I/O write operations.(Disk = {}, PID = {}, comm = {})", -+ "description-zh": "频繁小数据量(小于4KB)写操作引起系统I/O性能劣化(Disk={},PID={},comm={})" - }, - { - "metric": "gala_gopher_proc_greater_4k_io_read", - "priority": 1, -- "description": "System performance degrades due to frequent big I/O read operations.(Disk = {}, PID = {}, comm = {})" -+ "description": "System performance degrades due to frequent big I/O read operations.(Disk = {}, PID = {}, comm = {})", -+ "description-zh": "频繁大数据量(大于4KB)读操作引起系统I/O性能劣化(Disk={},PID={},comm={})" - }, - { - "metric": "gala_gopher_proc_greater_4k_io_write", - "priority": 1, -- "description": "System performance degrades due to frequent big I/O write operations.(Disk = {}, PID = {}, comm = {})" -+ "description": "System performance degrades due to frequent big I/O write operations.(Disk = {}, PID = {}, comm = {})", -+ "description-zh": "频繁大数据量(大于4KB)写操作引起系统I/O性能劣化(Disk={},PID={},comm={})" - } - ] - } -\ No newline at end of file -diff --git a/config/module/sys_nic_loss.json b/config/module/sys_nic_loss.json -index 793f82f..8a1feb8 100644 ---- a/config/module/sys_nic_loss.json -+++ b/config/module/sys_nic_loss.json -@@ -11,7 +11,8 @@ - "kpi_type": "", - "entity_name": "nic", - "enable": true, -- "description": "TC发送丢包数异常", -+ "description": "TC sent dropped packets", -+ "description-zh": "TC发送丢包数异常", - "params": { - "look_back": 2, - "th": 1 -@@ -22,32 +23,38 @@ - { - "metric": "gala_gopher_nic_tx_dropped", - "priority": 0, -- "description": "网卡发送丢弃的数据包数异常。(dev_name = {})" -+ "description": "The number of lost packets sent by the nic card are increasing and the NIC performance deteriorates.(dev_name = {})", -+ "description-zh": "网卡发送丢弃的数据包数增加,导致网卡性能劣化(dev_name={})" - }, - { - "metric": "gala_gopher_nic_rx_dropped", - "priority": 0, -- "description": "网卡接收丢弃的数据包数异常。(dev_name = {})" -+ "description": "The number of lost packets received by the nic card are increasing and the NIC performance deteriorates.(dev_name = {})", -+ "description-zh": "网卡接收丢弃的数据包数增加,导致网卡性能劣化(dev_name={})" - }, - { - "metric": "gala_gopher_tcp_link_sk_drops", - "priority": 3, -- "description": "Packets are lost in the host protocol stack due to unknown causes, and the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})" -+ "description": "Packets are lost in the host protocol stack due to unknown causes, and the NIC performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})", -+ "description-zh": "由于未知原因,数据包在主机协议栈中丢失,导致网卡性能劣化(PID={},client IP={},Server IP={},Port={})" - }, - { - "metric": "gala_gopher_tcp_link_retran_packets", - "priority": 1, -- "description": "TCP retransmission is triggered due to network faults, resulting in TCP performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})" -+ "description": "TCP retransmission is triggered due to network faults, resulting in the NIC performance deterioration. 
(PID ={}, client IP = {}, Server IP = {}, Port = {})", -+ "description-zh": "网络故障触发TCP重传,导致网卡性能下降(PID={},client IP={},Server IP={},Port={})" - }, - { - "metric": "gala_gopher_tcp_link_lost_out", - "priority": 3, -- "description": "The network may be congested, causing abnormal TCP packet loss and performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})" -+ "description": "The network may be congested, causing abnormal NIC packet loss and performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})", -+ "description-zh": "网络拥塞,导致网卡异常丢包,性能劣化(PID={},client IP={},Server IP={},Port={})" - }, - { - "metric": "gala_gopher_tcp_link_notsent_bytes", - "priority": 4, -- "description": "Due to network delay or peer application performance, too many packets to be sent are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})" -+ "description": "Due to network delay or peer application performance, too many packets to be sent are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})", -+ "description-zh": "由于网络延迟或对端应用程序性能,滑动窗口中累积了太多要发送的数据包,导致网卡性能劣化(PID={},client IP={},Server IP={},Port={})" - } - ] - } -\ No newline at end of file -diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json -index 2c158c0..7cd2369 100644 ---- a/config/module/sys_tcp_establish.json -+++ b/config/module/sys_tcp_establish.json -@@ -12,6 +12,7 @@ - "entity_name": "tcp_link", - "enable": true, - "description": "RTT of syn packet(us): the max syn packets rtt is {:.0f} us", -+ "description-zh": "SYN数据包时延异常:最大SYN数据包时延为:{:.0f}us。", - "params": { - "look_back": 30, - "outlier_ratio_th": 0.5, -@@ -24,7 +25,8 @@ - { - "metric": "gala_gopher_endpoint_retran_synacks", - "priority": 0, -- "description": "TCP established performance deteriorates due to loss of SYN/ACK packets.(PID = {}, TCP Listen Port = {})" -+ "description": "TCP established performance deteriorates due to loss of SYN/ACK packets.(PID = {}, TCP Listen Port = {})", -+ "description-zh": "由于SYN/ACK数据包丢失,TCP建链性能劣化(PID={},TCP Listen Port={})" - } - ] - } -\ No newline at end of file -diff --git a/config/module/sys_tcp_transmission_latency.json b/config/module/sys_tcp_transmission_latency.json -index d9e7f80..0527487 100644 ---- a/config/module/sys_tcp_transmission_latency.json -+++ b/config/module/sys_tcp_transmission_latency.json -@@ -12,10 +12,11 @@ - "entity_name": "tcp_link", - "enable": true, - "description": "Smoothed Round Trip Time(us)", -+ "description-zh": "TCP链接往返时延异常,性能劣化", - "params": { - "look_back": 20, - "obs_size": 25, -- "n": 4, -+ "n": 3, - "outlier_ratio_th": 0.4, - "smooth_params": { - "method": "conv_smooth", -@@ -52,57 +53,68 @@ - { - "metric": "gala_gopher_tcp_link_notsent_bytes", - "priority": 4, -- "description": "Due to network delay or peer application performance, too many packets to be sent are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})" -+ "description": "Due to network delay or peer application performance, too many packets to be sent are accumulated in the sliding window. As a result, TCP performance deteriorates. 
(PID = {}, client IP = {}, Server IP = {}, Port = {})", -+ "description-zh": "由于网络延迟或对端应用程序性能,滑动窗口中累积了太多要发送的数据包,导致TCP传输性能劣化(PID={},client IP={},Server IP={},Port={})" - }, - { - "metric": "gala_gopher_tcp_link_notack_bytes", - "priority": 4, -- "description": "Due to network delay or peer application performance, too many NO ACK packets are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})" -+ "description": "Due to network delay or peer application performance, too many NO ACK packets are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})", -+ "description-zh": "由于网络延迟或对端应用程序性能,滑动窗口中累积了过多的NO ACK数据包,导致TCP传输性能劣化(PID={},client IP={},Server IP={},Port={})" - }, - { - "metric": "gala_gopher_tcp_link_snd_wnd", - "priority": 4, -- "description": "The TCP send window is abnormal due to peer application performance or network congestion. As a result, the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})" -+ "description": "The TCP send window is abnormal due to peer application performance or network congestion. As a result, the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})", -+ "description-zh": "对端应用性能或网络拥塞导致TCP发送窗口异常,导致TCP传输性能劣化(PID={},client IP={},Server IP={},Port={})" - }, - { - "metric": "gala_gopher_tcp_link_rcv_wnd", - "priority": 4, -- "description": "The TCP receive window becomes abnormal due to the local application performance. As a result, the performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})" -+ "description": "The TCP receive window becomes abnormal due to the local application performance. As a result, the performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})", -+ "description-zh": "本地应用性能导致TCP接收窗口异常,传输性能变差(PID={},client IP={},Server IP={},Port={})" - }, - { - "metric": "gala_gopher_tcp_link_avl_snd_wnd", - "priority": 4, -- "description": "The available TCP send window may be abnormal due to network congestion and the performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})" -+ "description": "The available TCP send window may be abnormal due to network congestion and the performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})", -+ "description-zh": "可用的TCP发送窗口可能因网络拥塞而异常,传输性能劣化(PID={},client IP={},Server IP={},Port={})" - }, - { - "metric": "gala_gopher_tcp_link_lost_out", - "priority": 3, -- "description": "The network may be congested, causing abnormal TCP packet loss and performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})" -+ "description": "The network may be congested, causing abnormal TCP packet loss and performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})", -+ "description-zh": "网络可能拥塞,导致TCP异常丢包,传输性能劣化(PID={},client IP={},Server IP={},Port={})" - }, - { - "metric": "gala_gopher_tcp_link_sk_drops", - "priority": 3, -- "description": "Packets are lost in the host protocol stack due to unknown causes, and the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})" -+ "description": "Packets are lost in the host protocol stack due to unknown causes, and the TCP performance deteriorates. 
(PID ={}, client IP = {}, Server IP = {}, Port = {})", -+ "description-zh": "主机协议栈不明原因丢包,导致TCP传输性能劣化(PID={},client IP={},Server IP={},Port={})" - }, - { - "metric": "gala_gopher_tcp_link_retran_packets", - "priority": 1, -- "description": "TCP retransmission is triggered due to network faults, resulting in TCP performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})" -+ "description": "TCP retransmission is triggered due to network faults, resulting in TCP performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})", -+ "description-zh": "网络故障触发TCP重传,导致TCP传输性能劣化(PID={},client IP={},Server IP={},Port={})" - }, - { - "metric": "gala_gopher_tcp_link_backlog_drops", - "priority": 0, -- "description": "TCP backlog overflows due to local application performance. As a result, TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})" -+ "description": "TCP backlog overflows due to local application performance. As a result, TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})", -+ "description-zh": "由于本地应用程序性能问题,TCP积压溢出,导致TCP传输性能劣化(PID={},client IP={},Server IP={},Port={})" - }, - { - "metric": "gala_gopher_tcp_link_sacked_out", - "priority": 2, -- "description": "TCP performance deteriorates due to network out-of-order. (PID ={}, client IP = {}, Server IP = {}, Port = {})" -+ "description": "TCP performance deteriorates due to network out-of-order. (PID ={}, client IP = {}, Server IP = {}, Port = {})", -+ "description-zh": "网络乱序导致TCP传输性能劣化(PID={},client IP={},Server IP={},Port={})" - }, - { - "metric": "gala_gopher_tcp_link_sk_backlog_size", - "priority": 0, -- "description": "The TCP backlog queue length is abnormal due to the local application performance. As a result, the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})" -+ "description": "The TCP backlog queue length is abnormal due to the local application performance. As a result, the TCP performance deteriorates. 
(PID ={}, client IP = {}, Server IP = {}, Port = {})", -+ "description-zh": "本地应用性能导致TCP backlog队列长度异常,TCP传输性能劣化(PID={},client IP={},Server IP={},Port={})" - } - ] - } -\ No newline at end of file --- -2.33.0 - diff --git a/add-systemd-service-for-anteater.patch b/add-systemd-service-for-anteater.patch deleted file mode 100644 index cfee1e6..0000000 --- a/add-systemd-service-for-anteater.patch +++ /dev/null @@ -1,81 +0,0 @@ -From 2ef581e4960dd0ba49bbe371496933841da001fe Mon Sep 17 00:00:00 2001 -From: lizhenxing11 -Date: Mon, 9 Jan 2023 15:08:01 +0800 -Subject: [PATCH] add systemd service for anteater - -add manifest.in ---- - MANIFEST.in | 11 +++++++++++ - service/gala-anteater.service | 12 ++++++++++++ - setup.py | 5 +++-- - 3 files changed, 26 insertions(+), 2 deletions(-) - create mode 100644 MANIFEST.in - create mode 100644 service/gala-anteater.service - -diff --git a/MANIFEST.in b/MANIFEST.in -new file mode 100644 -index 0000000..7120af9 ---- /dev/null -+++ b/MANIFEST.in -@@ -0,0 +1,11 @@ -+include LICENSE -+include README.en.md -+include README.md -+include requirements.txt -+ -+recursive-include service * -+recursive-include tests * -+recursive-include docs * -+ -+recursive-exclude * __pycache__ -+recursive-exclude * *.py[co] -\ No newline at end of file -diff --git a/service/gala-anteater.service b/service/gala-anteater.service -new file mode 100644 -index 0000000..24af354 ---- /dev/null -+++ b/service/gala-anteater.service -@@ -0,0 +1,12 @@ -+[Unit] -+Description=A-Ops gala-anteater service -+After=network.target -+ -+[Service] -+Type=exec -+ExecStart=/usr/bin/gala-anteater -+Restart=on-failure -+RestartSec=1 -+ -+[Install] -+WantedBy=multi-user.target -\ No newline at end of file -diff --git a/setup.py b/setup.py -index 4471a0f..e075391 100644 ---- a/setup.py -+++ b/setup.py -@@ -23,11 +23,12 @@ setup( - description="Times Series Anomaly Detection Platform on Operating System", - url="https://gitee.com/openeuler/A-Ops/tree/master/gala-anteater", - keywords=["Anomaly Detection", "Time Series Analysis", "Operating System"], -- packages=find_packages(where="."), -+ packages=find_packages(where=".", exclude=("tests",)), - data_files=[ - ('/etc/gala-anteater/config/', glob('config/gala-anteater.yaml')), - ('/etc/gala-anteater/config/', glob('config/log.settings.ini')), - ('/etc/gala-anteater/config/module/', glob('config/module/*')), -+ ('/usr/lib/systemd/system/', glob('service/*')), - ], - install_requires=[ - "APScheduler", -@@ -42,7 +43,7 @@ setup( - ], - entry_points={ - "console_scripts": [ -- "gala-anteater = anteater.main:main", -+ "gala-anteater=anteater.main:main", - ] - } - ) --- -2.33.0 - diff --git a/fix-str2enum-bug-data-query-refactor.patch b/fix-str2enum-bug-data-query-refactor.patch deleted file mode 100644 index 2f0bf7f..0000000 --- a/fix-str2enum-bug-data-query-refactor.patch +++ /dev/null @@ -1,737 +0,0 @@ -From 27bb7cdd80f76bfc7ebb0f3041544740aa2fa91b Mon Sep 17 00:00:00 2001 -From: lizhenxing11 -Date: Tue, 10 Jan 2023 15:31:44 +0800 -Subject: [PATCH] fix str2enum bug & data query refactor - ---- - anteater/core/anomaly.py | 10 ++++ - anteater/core/kpi.py | 14 ++++++ - anteater/model/algorithms/slope.py | 11 +++-- - anteater/model/detector/base.py | 20 ++++---- - anteater/model/detector/n_sigma_detector.py | 15 +++--- - .../model/detector/online_vae_detector.py | 3 +- - .../tcp_establish_n_sigma_detector.py | 3 +- - .../tcp_trans_latency_n_sigma_detector.py | 48 +++++++++++++++++-- - anteater/model/detector/th_base_detector.py | 3 +- - 
anteater/module/app/app_sli_detector.py | 4 +- - anteater/module/sys/disk_throughput.py | 4 +- - anteater/module/sys/proc_io_latency.py | 4 +- - anteater/module/sys/sys_io_latency.py | 4 +- - .../module/sys/tcp_transmission_latency.py | 4 +- - .../module/sys/tcp_transmission_throughput.py | 4 +- - anteater/source/metric_loader.py | 41 +++++++++++++++- - anteater/utils/data_load.py | 4 +- - config/module/app_sli_rtt.json | 6 ++- - config/module/disk_throughput.json | 6 ++- - config/module/proc_io_latency.json | 15 ++++-- - config/module/sys_io_latency.json | 2 +- - config/module/sys_tcp_establish.json | 2 +- - .../module/sys_tcp_transmission_latency.json | 4 +- - 23 files changed, 172 insertions(+), 59 deletions(-) - -diff --git a/anteater/core/anomaly.py b/anteater/core/anomaly.py -index 45c4fc3..fdee3d1 100644 ---- a/anteater/core/anomaly.py -+++ b/anteater/core/anomaly.py -@@ -52,3 +52,13 @@ class AnomalyTrend(Enum): - DEFAULT = 0 - RISE = 1 - FALL = 2 -+ -+ @staticmethod -+ def from_str(label: str): -+ """Trans str to Enum type""" -+ if label.upper() == 'RISE': -+ return AnomalyTrend.RISE -+ elif label.upper() == 'FALL': -+ return AnomalyTrend.FALL -+ else: -+ return AnomalyTrend.DEFAULT -diff --git a/anteater/core/kpi.py b/anteater/core/kpi.py -index f83b666..70cc9ee 100644 ---- a/anteater/core/kpi.py -+++ b/anteater/core/kpi.py -@@ -27,6 +27,13 @@ class KPI: - params: dict = field(default=dict) - atrend: AnomalyTrend = AnomalyTrend.DEFAULT - -+ @classmethod -+ def from_dict(cls, **data): -+ if 'atrend' in data: -+ data['atrend'] = AnomalyTrend.from_str(data.get('atrend')) -+ -+ return cls(**data) -+ - - @dataclass - class Feature: -@@ -35,6 +42,13 @@ class Feature: - priority: int = 0 - atrend: AnomalyTrend = AnomalyTrend.DEFAULT - -+ @classmethod -+ def from_dict(cls, **data): -+ if 'atrend' in data: -+ data['atrend'] = AnomalyTrend.from_str(data.get('atrend')) -+ -+ return cls(**data) -+ - - @dataclass - class ModelConfig: -diff --git a/anteater/model/algorithms/slope.py b/anteater/model/algorithms/slope.py -index d324d58..e546183 100644 ---- a/anteater/model/algorithms/slope.py -+++ b/anteater/model/algorithms/slope.py -@@ -17,6 +17,7 @@ import numpy as np - - from anteater.core.anomaly import AnomalyTrend - from anteater.model.algorithms.smooth import conv_smooth -+from anteater.utils.common import divide - - - def slope(y, win_len): -@@ -36,13 +37,15 @@ def smooth_slope(time_series, windows_length): - - def trend(y, win_len=None): - """Gets the trend for the y""" -+ y = conv_smooth(y, box_pts=7) -+ - if not win_len: - win_len = len(y) // 2 - -- if np.mean(y[:win_len]) < np.mean(y[-win_len:]): -+ if divide(np.mean(y[:win_len]), np.mean(y[-win_len:])) < 0.9: - return 1 - -- elif np.mean(y[:win_len]) > np.mean(y[-win_len:]): -+ elif divide(np.mean(y[:win_len]), np.mean(y[-win_len:])) > 1.1: - return -1 - - else: -@@ -51,10 +54,10 @@ def trend(y, win_len=None): - - def check_trend(values: List[float], atrend: AnomalyTrend): - """Checks the values with an 'atrend' trend""" -- if atrend == AnomalyTrend.RISE and trend(values) < 0: -+ if atrend == AnomalyTrend.RISE and trend(values) != 1: - return False - -- if atrend == AnomalyTrend.FALL and trend(values) > 0: -+ if atrend == AnomalyTrend.FALL and trend(values) != -1: - return False - - return True -diff --git a/anteater/model/detector/base.py b/anteater/model/detector/base.py -index 2b2dafe..a23b6d9 100644 ---- a/anteater/model/detector/base.py -+++ b/anteater/model/detector/base.py -@@ -11,6 +11,7 @@ - # See the Mulan PSL v2 for more 
details. - # ******************************************************************************/ - -+import logging - import math - from abc import abstractmethod - from typing import List -@@ -39,12 +40,6 @@ class Detector: - """Executes anomaly detection on kpis""" - pass - -- def get_unique_machine_id(self, start, end, kpis: List[KPI]) -> List[str]: -- """Gets unique machine ids during past minutes""" -- metrics = [_kpi.metric for _kpi in kpis] -- machine_ids = self.data_loader.get_unique_machines(start, end, metrics) -- return machine_ids -- - def execute(self, job_config: JobConfig) -> List[Anomaly]: - """The main function of the detector""" - kpis = job_config.kpis -@@ -56,6 +51,12 @@ class Detector: - - return self._execute(kpis, features, top_n=n) - -+ def get_unique_machine_id(self, start, end, kpis: List[KPI]) -> List[str]: -+ """Gets unique machine ids during past minutes""" -+ metrics = [_kpi.metric for _kpi in kpis] -+ machine_ids = self.data_loader.get_unique_machines(start, end, metrics) -+ return machine_ids -+ - def find_root_causes(self, anomalies: List[Anomaly], features: List[Feature], top_n=3)\ - -> List[Anomaly]: - """Finds root causes for each anomaly events""" -@@ -82,6 +83,7 @@ class Detector: - tmp_ts_scores = self.cal_anomaly_score(f.metric, f.description, machine_id=machine_id) - for _ts_score in tmp_ts_scores: - if not check_trend(_ts_score.ts.values, f.atrend): -+ logging.info(f"Trends Filtered: {f.metric}") - _ts_score.score = 0 - if same_intersection_key_value(_ts_score.ts.labels, filters): - ts_scores.append(_ts_score) -@@ -101,6 +103,7 @@ class Detector: - for _ts_s in ts_scores: - if same_intersection_key_value(_ts_s.ts.labels, anomaly.labels): - if not check_trend(_ts_s.ts.values, kpi_atrends[anomaly.metric]): -+ logging.info(f"Trends Filtered: {anomaly.metric}") - anomaly.score = 0 - else: - anomaly.score = _ts_s.score -@@ -115,12 +118,11 @@ class Detector: - machine_id: str)\ - -> List[TimeSeriesScore]: - """Calculates metric anomaly scores based on sr model""" -- start, end = dt.last(minutes=6) -+ start, end = dt.last(minutes=10) - point_count = self.data_loader.expected_point_length(start, end) - model = SpectralResidual(12, 24, 50) - ts_scores = [] -- ts_list = self.data_loader.\ -- get_metric(start, end, metric, label_name='machine_id', label_value=machine_id) -+ ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id) - for _ts in ts_list: - if sum(_ts.values) == 0 or \ - len(_ts.values) < point_count * 0.9 or\ -diff --git a/anteater/model/detector/n_sigma_detector.py b/anteater/model/detector/n_sigma_detector.py -index 3a2ab01..dbf83c6 100644 ---- a/anteater/model/detector/n_sigma_detector.py -+++ b/anteater/model/detector/n_sigma_detector.py -@@ -29,10 +29,9 @@ from anteater.utils.log import logger - class NSigmaDetector(Detector): - """The three sigma anomaly detector""" - -- def __init__(self, data_loader: MetricLoader, method: str): -+ def __init__(self, data_loader: MetricLoader): - """The detector base class initializer""" - super().__init__(data_loader) -- self.method = method - - def detect_kpis(self, kpis: List[KPI]): - """Executes anomaly detection on kpis""" -@@ -48,7 +47,7 @@ class NSigmaDetector(Detector): - def detect_signal_kpi(self, kpi, machine_id: str) -> List[Anomaly]: - """Detects kpi based on signal time series anomaly detection model""" - outlier_ratio_th = kpi.params['outlier_ratio_th'] -- ts_scores = self.calculate_metric_three_sigma_score( -+ ts_scores = self.calculate_n_sigma_score( - kpi.metric, 
kpi.description, machine_id, **kpi.params) - if not ts_scores: - logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!') -@@ -68,17 +67,17 @@ class NSigmaDetector(Detector): - - return anomalies - -- def calculate_metric_three_sigma_score(self, metric, description, machine_id: str, **kwargs)\ -+ def calculate_n_sigma_score(self, metric, description, machine_id: str, **kwargs)\ - -> List[TimeSeriesScore]: - """Calculate kpi anomaly scores based on three sigma scores""" -+ method = kwargs.get('method', 'abs') - look_back = kwargs.get('look_back') - smooth_params = kwargs.get('smooth_params') - obs_size = kwargs.get('obs_size') - n = kwargs.get('n', 3) - start, end = dt.last(minutes=look_back) - point_count = self.data_loader.expected_point_length(start, end) -- ts_list = self.data_loader.\ -- get_metric(start, end, metric, label_name='machine_id', label_value=machine_id) -+ ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id) - ts_scores = [] - for _ts in ts_list: - dedup_values = [k for k, g in groupby(_ts.values)] -@@ -87,12 +86,12 @@ class NSigmaDetector(Detector): - len(_ts.values) > point_count * 1.5 or \ - all(x == _ts.values[0] for x in _ts.values): - ratio = 0 -- elif len(dedup_values) < point_count * 0.3: -+ elif len(dedup_values) < point_count * 0.6: - ratio = 0 - else: - smoothed_val = smoothing(_ts.values, **smooth_params) - outlier, mean, std = n_sigma( -- smoothed_val, obs_size=obs_size, n=n, method=self.method) -+ smoothed_val, obs_size=obs_size, n=n, method=method) - ratio = divide(len(outlier), obs_size) - - ts_scores.append(TimeSeriesScore(ts=_ts, score=ratio, description=description)) -diff --git a/anteater/model/detector/online_vae_detector.py b/anteater/model/detector/online_vae_detector.py -index 63a7b09..0f91576 100644 ---- a/anteater/model/detector/online_vae_detector.py -+++ b/anteater/model/detector/online_vae_detector.py -@@ -110,8 +110,7 @@ class OnlineVAEDetector(Detector): - metric_dfs = [] - for metric in metrics: - _ts_list = self.data_loader.\ -- get_metric(start, end, metric, label_name="machine_id", -- label_value=machine_id, operator_name='avg') -+ get_metric(start, end, metric, operator='avg', keys="machine_id", machine_id=machine_id) - - if len(_ts_list) > 1: - raise ValueError(f'Got multiple time_series based on machine id: {len(_ts_list)}') -diff --git a/anteater/model/detector/tcp_establish_n_sigma_detector.py b/anteater/model/detector/tcp_establish_n_sigma_detector.py -index 82d7837..3720069 100644 ---- a/anteater/model/detector/tcp_establish_n_sigma_detector.py -+++ b/anteater/model/detector/tcp_establish_n_sigma_detector.py -@@ -73,8 +73,7 @@ class TcpEstablishNSigmaDetector(Detector): - min_rtt = kpi.params.get('min_rtt') - - start, end = dt.last(minutes=look_back) -- ts_list = self.data_loader.\ -- get_metric(start, end, kpi.metric, label_name='machine_id', label_value=machine_id) -+ ts_list = self.data_loader.get_metric(start, end, kpi.metric, machine_id=machine_id) - - anomalies = [] - for _ts in ts_list: -diff --git a/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py b/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py -index 1eeb95f..6d41775 100644 ---- a/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py -+++ b/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py -@@ -11,20 +11,61 @@ - # See the Mulan PSL v2 for more details. 
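The NSigmaDetector refactor above moves the sigma method ("abs", "min" or "max") out of the detector constructor and into each KPI's params block, and scores a series by the share of n-sigma outliers inside the observation window. Below is a minimal, self-contained sketch of that outlier-ratio idea; the helper name, the moving-average smoothing and the way the baseline mean/std are taken from the points before the window are simplifying assumptions standing in for the project's smoothing() and n_sigma() helpers, not the actual implementation.

    import numpy as np

    def n_sigma_outlier_ratio(values, obs_size, n=3, method="max", box_pts=3):
        """Fraction of the last obs_size points lying outside mean +/- n*std."""
        # simple moving-average smoothing (stand-in for the project's smoothing())
        kernel = np.ones(box_pts) / box_pts
        smoothed = np.convolve(values, kernel, mode="same")
        if len(smoothed) <= obs_size:
            return 0.0

        history, observed = smoothed[:-obs_size], smoothed[-obs_size:]
        mean, std = np.mean(history), np.std(history)

        if method == "max":      # only upward deviations count as anomalous
            outliers = observed[observed > mean + n * std]
        elif method == "min":    # only downward deviations count
            outliers = observed[observed < mean - n * std]
        else:                    # "abs": deviations in either direction
            outliers = observed[np.abs(observed - mean) > n * std]

        return len(outliers) / obs_size

The returned ratio plays the role of the score that is compared against each KPI's outlier_ratio_th (for example 0.5 for disk throughput after this change); because method now travels in kpi.params, the same NSigmaDetector serves the rtt ("max"), tps ("min") and throughput ("abs") jobs without per-module constructor arguments.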
- # ******************************************************************************/ - -+from itertools import groupby - from typing import List - -+import numpy as np -+ - from anteater.core.time_series import TimeSeriesScore -+from anteater.model.algorithms.smooth import smoothing -+from anteater.model.algorithms.three_sigma import n_sigma - from anteater.model.detector.n_sigma_detector import NSigmaDetector - from anteater.source.metric_loader import MetricLoader -+from anteater.utils.common import divide - from anteater.utils.datetime import DateTimeManager as dt - - - class TcpTransLatencyNSigmaDetector(NSigmaDetector): - """The three sigma anomaly detector""" - -- def __init__(self, data_loader: MetricLoader, method: str): -+ def __init__(self, data_loader: MetricLoader): - """The detector base class initializer""" -- super().__init__(data_loader, method) -+ super().__init__(data_loader) -+ -+ def calculate_n_sigma_score(self, metric, description, machine_id: str, **kwargs)\ -+ -> List[TimeSeriesScore]: -+ """Calculates anomaly scores based on n sigma scores""" -+ method = kwargs.get('method', 'abs') -+ look_back = kwargs.get('look_back') -+ smooth_params = kwargs.get('smooth_params') -+ obs_size = kwargs.get('obs_size') -+ min_srtt = kwargs.get("min_srtt") -+ n = kwargs.get('n', 3) -+ start, end = dt.last(minutes=look_back) -+ point_count = self.data_loader.expected_point_length(start, end) -+ ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id) -+ ts_scores = [] -+ for _ts in ts_list: -+ dedup_values = [k for k, g in groupby(_ts.values)] -+ if sum(_ts.values) == 0 or \ -+ len(_ts.values) < point_count * 0.6 or \ -+ len(_ts.values) > point_count * 1.5 or \ -+ all(x == _ts.values[0] for x in _ts.values): -+ ratio = 0 -+ elif len(dedup_values) < point_count * 0.6: -+ ratio = 0 -+ else: -+ smoothed_val = smoothing(_ts.values, **smooth_params) -+ outlier, mean, std = n_sigma( -+ smoothed_val, obs_size=obs_size, n=n, method=method) -+ if outlier and np.average(outlier) <= min_srtt: -+ ratio = 0 -+ else: -+ ratio = divide(len(outlier), obs_size) -+ -+ ts_scores.append(TimeSeriesScore(ts=_ts, score=ratio, description=description)) -+ -+ return ts_scores - - def cal_anomaly_score(self, metric, description, machine_id: str) \ - -> List[TimeSeriesScore]: -@@ -32,8 +73,7 @@ class TcpTransLatencyNSigmaDetector(NSigmaDetector): - start, end = dt.last(minutes=2) - point_count = self.data_loader.expected_point_length(start, end) - ts_scores = [] -- ts_list = self.data_loader. 
\ -- get_metric(start, end, metric, label_name='machine_id', label_value=machine_id) -+ ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id) - for _ts in ts_list: - if sum(_ts.values) == 0 or \ - len(_ts.values) < point_count * 0.5 or \ -diff --git a/anteater/model/detector/th_base_detector.py b/anteater/model/detector/th_base_detector.py -index bec9705..0af4f22 100644 ---- a/anteater/model/detector/th_base_detector.py -+++ b/anteater/model/detector/th_base_detector.py -@@ -44,8 +44,7 @@ class ThBaseDetector(Detector): - look_back = kpi.params.get('look_back') - th = kpi.params.get('th') - start, end = dt.last(minutes=look_back) -- ts_list = self.data_loader.\ -- get_metric(start, end, kpi.metric, label_name='machine_id', label_value=machine_id) -+ ts_list = self.data_loader.get_metric(start, end, kpi.metric, machine_id=machine_id) - - if not ts_list: - logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!') -diff --git a/anteater/module/app/app_sli_detector.py b/anteater/module/app/app_sli_detector.py -index 102ed11..e506332 100644 ---- a/anteater/module/app/app_sli_detector.py -+++ b/anteater/module/app/app_sli_detector.py -@@ -44,12 +44,12 @@ class APPSliDetector(E2EDetector): - def init_detectors(self, data_loader): - if self.job_config.model_config.enable: - detectors = [ -- NSigmaDetector(data_loader, method='min'), -+ NSigmaDetector(data_loader), - OnlineVAEDetector(data_loader, self.job_config.model_config) - ] - else: - detectors = [ -- NSigmaDetector(data_loader, method='min') -+ NSigmaDetector(data_loader) - ] - - return detectors -diff --git a/anteater/module/sys/disk_throughput.py b/anteater/module/sys/disk_throughput.py -index 9a192fb..7971505 100644 ---- a/anteater/module/sys/disk_throughput.py -+++ b/anteater/module/sys/disk_throughput.py -@@ -38,12 +38,12 @@ class DiskThroughputDetector(E2EDetector): - def init_detectors(self, data_loader): - if self.job_config.model_config.enable: - detectors = [ -- NSigmaDetector(data_loader, method='max'), -+ NSigmaDetector(data_loader), - OnlineVAEDetector(data_loader, self.job_config.model_config) - ] - else: - detectors = [ -- NSigmaDetector(data_loader, method='max') -+ NSigmaDetector(data_loader) - ] - - return detectors -diff --git a/anteater/module/sys/proc_io_latency.py b/anteater/module/sys/proc_io_latency.py -index a34c48d..b76acea 100644 ---- a/anteater/module/sys/proc_io_latency.py -+++ b/anteater/module/sys/proc_io_latency.py -@@ -38,12 +38,12 @@ class ProcIOLatencyDetector(E2EDetector): - def init_detectors(self, data_loader): - if self.job_config.model_config.enable: - detectors = [ -- NSigmaDetector(data_loader, method='max'), -+ NSigmaDetector(data_loader), - OnlineVAEDetector(data_loader, self.job_config.model_config) - ] - else: - detectors = [ -- NSigmaDetector(data_loader, method='max') -+ NSigmaDetector(data_loader) - ] - - return detectors -diff --git a/anteater/module/sys/sys_io_latency.py b/anteater/module/sys/sys_io_latency.py -index a6f01c2..17a34c9 100644 ---- a/anteater/module/sys/sys_io_latency.py -+++ b/anteater/module/sys/sys_io_latency.py -@@ -38,12 +38,12 @@ class SysIOLatencyDetector(E2EDetector): - def init_detectors(self, data_loader): - if self.job_config.model_config.enable: - detectors = [ -- NSigmaDetector(data_loader, method='abs'), -+ NSigmaDetector(data_loader), - OnlineVAEDetector(data_loader, self.job_config.model_config) - ] - else: - detectors = [ -- NSigmaDetector(data_loader, method='abs') -+ NSigmaDetector(data_loader) - ] - - 
return detectors -diff --git a/anteater/module/sys/tcp_transmission_latency.py b/anteater/module/sys/tcp_transmission_latency.py -index cf0f406..e085ec3 100644 ---- a/anteater/module/sys/tcp_transmission_latency.py -+++ b/anteater/module/sys/tcp_transmission_latency.py -@@ -39,12 +39,12 @@ class SysTcpTransmissionLatencyDetector(E2EDetector): - def init_detectors(self, data_loader): - if self.job_config.model_config.enable: - detectors = [ -- TcpTransLatencyNSigmaDetector(data_loader, method='max'), -+ TcpTransLatencyNSigmaDetector(data_loader), - OnlineVAEDetector(data_loader, self.job_config.model_config) - ] - else: - detectors = [ -- TcpTransLatencyNSigmaDetector(data_loader, method='max') -+ TcpTransLatencyNSigmaDetector(data_loader) - ] - - return detectors -diff --git a/anteater/module/sys/tcp_transmission_throughput.py b/anteater/module/sys/tcp_transmission_throughput.py -index 86ecc9e..2921602 100644 ---- a/anteater/module/sys/tcp_transmission_throughput.py -+++ b/anteater/module/sys/tcp_transmission_throughput.py -@@ -38,12 +38,12 @@ class SysTcpTransmissionThroughputDetector(E2EDetector): - def init_detectors(self, data_loader): - if self.job_config.model_config.enable: - detectors = [ -- NSigmaDetector(data_loader, method='abs'), -+ NSigmaDetector(data_loader), - OnlineVAEDetector(data_loader, self.job_config.model_config) - ] - else: - detectors = [ -- NSigmaDetector(data_loader, method='abs') -+ NSigmaDetector(data_loader) - ] - - return detectors -diff --git a/anteater/source/metric_loader.py b/anteater/source/metric_loader.py -index ef2d012..4745d87 100644 ---- a/anteater/source/metric_loader.py -+++ b/anteater/source/metric_loader.py -@@ -65,6 +65,43 @@ def get_query(metric: str, - return query - - -+def get_query2( -+ metric: str, operator: str = None, value: float = None, keys: Union[str, List] = None, **labels): -+ """Gets aggregated query patterns -+ -+ Format: [operator]([value,] metric{[**labels]}) by (keys) -+ -+ Such as: -+ - 1. gala_gopher_bind_sends{machine_id="1234"} -+ - 2. sum(gala_gopher_bind_sends) by (machine_id) -+ - 2. sum(gala_gopher_bind_sends) by (machine_id) -+ - 3. sum(gala_gopher_bind_sends{machine_id="1234"}) by (machine_id) -+ - 4. 
quantile(0.7, gala_gopher_bind_sends{machine_id="1234"}) by (machine_id) -+ """ -+ if operator and not keys: -+ raise ValueError("Please provide param 'keys' when specified 'operator'!") -+ -+ rule = "" -+ if labels: -+ pairs = ",".join([f"{n}='{v}'" for n, v in labels.items()]) -+ rule = f"{{{pairs}}}" -+ -+ group = "" -+ if isinstance(keys, list): -+ group = ",".join([k for k in keys]) -+ elif isinstance(keys, str): -+ group = keys -+ -+ if operator and value: -+ query = f"{operator}({value}, {metric}{rule}) by ({group})" -+ elif operator: -+ query = f"{operator}({metric}{rule}) by ({group})" -+ else: -+ query = f"{metric}{rule}" -+ -+ return query -+ -+ - class MetricLoader: - """ - The metric loader that consumes raw data from PrometheusAdapter, -@@ -87,7 +124,7 @@ class MetricLoader: - - :return List of TimeSeries - """ -- query = get_query(metric, **kwargs) -+ query = get_query2(metric, **kwargs) - time_series = self.provider.range_query(start, end, metric, query) - - return time_series -@@ -109,7 +146,7 @@ class MetricLoader: - """Gets unique labels of all metrics""" - unique_labels = set() - for metric in metrics: -- time_series = self.get_metric(start, end, metric, label_name=label_name) -+ time_series = self.get_metric(start, end, metric) - unique_labels.update([item.labels.get(label_name, "") for item in time_series]) - - return list([lbl for lbl in unique_labels if lbl]) -diff --git a/anteater/utils/data_load.py b/anteater/utils/data_load.py -index 730c9c6..60c28e5 100644 ---- a/anteater/utils/data_load.py -+++ b/anteater/utils/data_load.py -@@ -48,8 +48,8 @@ def load_job_config(file_name) -> JobConfig: - keywords = config['keywords'] - root_cause_number = config['root_cause_number'] - -- kpis = [KPI(**update_description(_conf)) for _conf in config['KPI']] -- features = [Feature(**update_description(_conf)) for _conf in config['Features']] -+ kpis = [KPI.from_dict(**update_description(_conf)) for _conf in config['KPI']] -+ features = [Feature.from_dict(**update_description(_conf)) for _conf in config['Features']] - - model_config = None - if 'OnlineModel' in config: -diff --git a/config/module/app_sli_rtt.json b/config/module/app_sli_rtt.json -index 0146883..5027b8d 100644 ---- a/config/module/app_sli_rtt.json -+++ b/config/module/app_sli_rtt.json -@@ -10,13 +10,14 @@ - "metric": "gala_gopher_sli_rtt_nsec", - "kpi_type": "rtt", - "entity_name": "sli", -- "enable": false, -+ "enable": true, - "description": "sli rtt 异常", - "description-zh": "应用级请求往返时延(RTT)异常", - "params": { -+ "method": "max", - "look_back": 10, - "obs_size": 25, -- "outlier_ratio_th": 0.3, -+ "outlier_ratio_th": 0.5, - "smooth_params": { - "method": "conv_smooth", - "box_pts": 3 -@@ -31,6 +32,7 @@ - "description": "sli tps 异常", - "description-zh": "应用级请求吞吐量(TPS)异常", - "params": { -+ "method": "min", - "look_back": 10, - "obs_size": 25, - "outlier_ratio_th": 0.3, -diff --git a/config/module/disk_throughput.json b/config/module/disk_throughput.json -index f6244f6..e3bcf68 100644 ---- a/config/module/disk_throughput.json -+++ b/config/module/disk_throughput.json -@@ -14,9 +14,10 @@ - "description": "Disk read await time is increasing!", - "description-zh": "磁盘读响应时间升高,性能发生劣化", - "params": { -+ "method": "max", - "look_back": 20, - "obs_size": 25, -- "outlier_ratio_th": 0.3, -+ "outlier_ratio_th": 0.5, - "smooth_params": { - "method": "conv_smooth", - "box_pts": 3 -@@ -31,9 +32,10 @@ - "description": "Disk write await time is increasing!", - "description-zh": "磁盘写响应时间升高,性能发生劣化", - "params": { -+ "method": "max", - 
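The data-query refactor running through the hunks above replaces the old label_name/label_value arguments with a small builder (get_query2) that assembles a PromQL expression from an optional aggregation operator, grouping keys and free-form label filters, so callers can simply write get_metric(start, end, metric, machine_id=machine_id). A rough re-implementation is sketched below to show the strings such a builder produces; it mirrors the docstring's pattern [operator]([value,] metric{labels}) by (keys) but is not a drop-in copy, and the metric and label values in the usage lines are only examples.

    def build_query(metric, operator=None, value=None, keys=None, **labels):
        """Sketch: [operator]([value,] metric{labels}) by (keys)."""
        if operator and not keys:
            raise ValueError("'keys' is required when 'operator' is given")

        selector = ""
        if labels:
            pairs = ",".join(f"{name}='{val}'" for name, val in labels.items())
            selector = f"{{{pairs}}}"

        group = ",".join(keys) if isinstance(keys, list) else (keys or "")

        if operator and value is not None:
            return f"{operator}({value}, {metric}{selector}) by ({group})"
        if operator:
            return f"{operator}({metric}{selector}) by ({group})"
        return f"{metric}{selector}"

    # gala_gopher_sli_rtt_nsec{machine_id='1234'}
    print(build_query("gala_gopher_sli_rtt_nsec", machine_id="1234"))
    # avg(gala_gopher_sli_rtt_nsec{machine_id='1234'}) by (machine_id)
    print(build_query("gala_gopher_sli_rtt_nsec", operator="avg",
                      keys="machine_id", machine_id="1234"))

Requiring keys whenever an operator is given keeps aggregations from silently collapsing across machines, consistent with how OnlineVAEDetector now passes operator='avg' together with keys='machine_id'.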
"look_back": 20, - "obs_size": 25, -- "outlier_ratio_th": 0.3, -+ "outlier_ratio_th": 0.5, - "smooth_params": { - "method": "conv_smooth", - "box_pts": 3 -diff --git a/config/module/proc_io_latency.json b/config/module/proc_io_latency.json -index f086b87..171c5f4 100644 ---- a/config/module/proc_io_latency.json -+++ b/config/module/proc_io_latency.json -@@ -14,9 +14,10 @@ - "description": "I/O operation delay at the BIO layer (unit: us)", - "description-zh": "BIO层I/O操作延时高(单位:us)", - "params": { -+ "method":"max", - "look_back": 20, - "obs_size": 37, -- "outlier_ratio_th": 0.4, -+ "outlier_ratio_th": 0.5, - "smooth_params": { - "method": "conv_smooth", - "box_pts": 3 -@@ -31,9 +32,10 @@ - "description": "Number of small I/O (less than 4 KB) read operations at the BIO layer.", - "description-zh": "BIO层小数据I/O读操作数量异常(小于4KB)", - "params": { -+ "method":"max", - "look_back": 20, - "obs_size": 25, -- "outlier_ratio_th": 0.3, -+ "outlier_ratio_th": 0.4, - "smooth_params": { - "method": "conv_smooth", - "box_pts": 3 -@@ -48,9 +50,10 @@ - "description": "Number of small I/O (less than 4 KB) write operations at the BIO layer.", - "description-zh": "BIO层小数据I/O写操作数量异常(小于4KB)", - "params": { -+ "method":"max", - "look_back": 20, - "obs_size": 25, -- "outlier_ratio_th": 0.3, -+ "outlier_ratio_th": 0.4, - "smooth_params": { - "method": "savgol_smooth", - "window_length": 13, -@@ -66,9 +69,10 @@ - "description": "Number of big I/O (greater than 4 KB) read operations at the BIO layer.", - "description-zh": "BIO层大数据I/O读操作数量异常(大于4KB)", - "params": { -+ "method":"max", - "look_back": 20, - "obs_size": 25, -- "outlier_ratio_th": 0.3, -+ "outlier_ratio_th": 0.4, - "smooth_params": { - "method": "conv_smooth", - "box_pts": 3 -@@ -83,9 +87,10 @@ - "description": "Number of big I/O (greater than 4 KB) write operations at the BIO layer.", - "description-zh": "BIO层大数据写操作数量异常(大于4KB)", - "params": { -+ "method":"max", - "look_back": 20, - "obs_size": 25, -- "outlier_ratio_th": 0.3, -+ "outlier_ratio_th": 0.4, - "smooth_params": { - "method": "conv_smooth", - "box_pts": 3 -diff --git a/config/module/sys_io_latency.json b/config/module/sys_io_latency.json -index bdf17d3..3fa1266 100644 ---- a/config/module/sys_io_latency.json -+++ b/config/module/sys_io_latency.json -@@ -16,7 +16,7 @@ - "params": { - "look_back": 20, - "obs_size": 25, -- "outlier_ratio_th": 0.3, -+ "outlier_ratio_th": 0.4, - "smooth_params": { - "method": "conv_smooth", - "box_pts": 3 -diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json -index 7cd2369..9bd2a46 100644 ---- a/config/module/sys_tcp_establish.json -+++ b/config/module/sys_tcp_establish.json -@@ -17,7 +17,7 @@ - "look_back": 30, - "outlier_ratio_th": 0.5, - "obs_size": 3, -- "min_rtt": 500000 -+ "min_rtt": 100000 - } - } - ], -diff --git a/config/module/sys_tcp_transmission_latency.json b/config/module/sys_tcp_transmission_latency.json -index 0527487..3ba8113 100644 ---- a/config/module/sys_tcp_transmission_latency.json -+++ b/config/module/sys_tcp_transmission_latency.json -@@ -14,10 +14,12 @@ - "description": "Smoothed Round Trip Time(us)", - "description-zh": "TCP链接往返时延异常,性能劣化", - "params": { -+ "method": "max", - "look_back": 20, - "obs_size": 25, - "n": 3, -- "outlier_ratio_th": 0.4, -+ "min_srtt": 20000, -+ "outlier_ratio_th": 0.6, - "smooth_params": { - "method": "conv_smooth", - "box_pts": 3 --- -2.33.0 - diff --git a/gala-anteater-1.0.1.tar.gz b/gala-anteater-1.0.1.tar.gz deleted file mode 100644 index e19781b..0000000 Binary files 
a/gala-anteater-1.0.1.tar.gz and /dev/null differ diff --git a/gala-anteater-1.1.0.tar.gz b/gala-anteater-1.1.0.tar.gz new file mode 100644 index 0000000..bb9c79e Binary files /dev/null and b/gala-anteater-1.1.0.tar.gz differ diff --git a/gala-anteater.spec b/gala-anteater.spec index 1f8797f..c5c821d 100644 --- a/gala-anteater.spec +++ b/gala-anteater.spec @@ -1,8 +1,8 @@ %define debug_package %{nil} Name: gala-anteater -Version: 1.0.1 -Release: 4 +Version: 1.1.0 +Release: 1 Summary: A time-series anomaly detection platform for operating system. License: MulanPSL2 URL: https://gitee.com/openeuler/gala-anteater @@ -11,13 +11,6 @@ BuildRoot: %{_builddir}/%{name}-%{version} BuildRequires: procps-ng python3-setuptools Requires: python3-gala-anteater = %{version}-%{release} -Patch1: Add-disk-throughput-detector.patch -Patch2: Update-TCP-Establish-Model-Add-Nic-Loss-Detector.patch -Patch3: add-chinese-descriptions.patch -Patch4: remove-sys-level-config-param.patch -Patch5: add-systemd-service-for-anteater.patch -Patch6: fix-str2enum-bug-data-query-refactor.patch - %description Abnormal detection module for A-Ops project @@ -59,16 +52,18 @@ fi %doc README.md %license LICENSE %{_bindir}/gala-anteater +%config(noreplace) %{_sysconfdir}/%{name}/config/metricinfo.json %config(noreplace) %{_sysconfdir}/%{name}/config/gala-anteater.yaml %config(noreplace) %{_sysconfdir}/%{name}/config/log.settings.ini -%config(noreplace) %{_sysconfdir}/%{name}/config/module/app_sli_rtt.json -%config(noreplace) %{_sysconfdir}/%{name}/config/module/proc_io_latency.json -%config(noreplace) %{_sysconfdir}/%{name}/config/module/sys_io_latency.json -%config(noreplace) %{_sysconfdir}/%{name}/config/module/sys_tcp_establish.json -%config(noreplace) %{_sysconfdir}/%{name}/config/module/sys_tcp_transmission_latency.json -%config(noreplace) %{_sysconfdir}/%{name}/config/module/sys_tcp_transmission_throughput.json -%config(noreplace) %{_sysconfdir}/%{name}/config/module/disk_throughput.json -%config(noreplace) %{_sysconfdir}/%{name}/config/module/sys_nic_loss.json +%config(noreplace) %{_sysconfdir}/%{name}/module/app_sli_rtt.job.json +%config(noreplace) %{_sysconfdir}/%{name}/module/disk_throughput.job.json +%config(noreplace) %{_sysconfdir}/%{name}/module/jvm_oom.job.json +%config(noreplace) %{_sysconfdir}/%{name}/module/proc_io_latency.job.json +%config(noreplace) %{_sysconfdir}/%{name}/module/sys_io_latency.job.json +%config(noreplace) %{_sysconfdir}/%{name}/module/sys_nic_loss.job.json +%config(noreplace) %{_sysconfdir}/%{name}/module/sys_tcp_establish.job.json +%config(noreplace) %{_sysconfdir}/%{name}/module/sys_tcp_transmission_latency.job.json +%config(noreplace) %{_sysconfdir}/%{name}/module/usad_model.job.json /usr/lib/systemd/system/gala-anteater.service %files -n python3-gala-anteater @@ -77,6 +72,9 @@ fi %changelog +* Thu Aug 31 2023 Li Zhenxing - 1.1.0-1 +- Upgrade anteater version to 1.1.0 + * Fri Jan 20 2023 Zhen Chen - 1.0.1-4 - eliminate 'Fail to try-restart' warning when downgrading to 1.0.1-1 diff --git a/remove-sys-level-config-param.patch b/remove-sys-level-config-param.patch deleted file mode 100644 index 4d0c7fd..0000000 --- a/remove-sys-level-config-param.patch +++ /dev/null @@ -1,98 +0,0 @@ -From 5c6b03a49a49ddc56574e906f959d5fe34c1debc Mon Sep 17 00:00:00 2001 -From: lizhenxing11 -Date: Fri, 6 Jan 2023 10:59:12 +0800 -Subject: [PATCH] remove 'sys-level' config param - ---- - anteater/config.py | 1 - - anteater/main.py | 29 +++++++++++------------------ - config/gala-anteater.yaml | 1 - - 
docs/conf_introduction.md | 1 - - 4 files changed, 11 insertions(+), 21 deletions(-) - -diff --git a/anteater/config.py b/anteater/config.py -index e9ab557..caeceec 100644 ---- a/anteater/config.py -+++ b/anteater/config.py -@@ -27,7 +27,6 @@ import yaml - class GlobalConf: - """The global config""" - data_source: str -- sys_level: bool - - - @dataclass -diff --git a/anteater/main.py b/anteater/main.py -index 4de72f9..87aae95 100644 ---- a/anteater/main.py -+++ b/anteater/main.py -@@ -26,7 +26,6 @@ from anteater.module.sys.nic_loss import NICLossDetector - from anteater.module.sys.proc_io_latency import ProcIOLatencyDetector - from anteater.module.sys.sys_io_latency import SysIOLatencyDetector - from anteater.module.sys.tcp_establish import SysTcpEstablishDetector --from anteater.module.sys.tcp_transmission_throughput import SysTcpTransmissionThroughputDetector - from anteater.module.sys.tcp_transmission_latency import SysTcpTransmissionLatencyDetector - from anteater.provider.kafka import KafkaProvider - from anteater.source.anomaly_report import AnomalyReport -@@ -49,24 +48,18 @@ def main(): - kafka_provider = KafkaProvider(conf.kafka) - loader = MetricLoader(conf) - report = AnomalyReport(kafka_provider) -- if conf.global_conf.sys_level: -- detectors = [ -- # APP sli anomaly detection -- APPSliDetector(loader, report), -+ detectors = [ -+ # APP sli anomaly detection -+ APPSliDetector(loader, report), - -- # SYS tcp/io detection -- SysTcpEstablishDetector(loader, report), -- SysTcpTransmissionLatencyDetector(loader, report), -- SysIOLatencyDetector(loader, report), -- ProcIOLatencyDetector(loader, report), -- DiskThroughputDetector(loader, report), -- NICLossDetector(loader, report), -- ] -- else: -- detectors = [ -- # APP sli anomaly detection -- APPSliDetector(loader, report) -- ] -+ # SYS tcp/io detection -+ SysTcpEstablishDetector(loader, report), -+ SysTcpTransmissionLatencyDetector(loader, report), -+ SysIOLatencyDetector(loader, report), -+ ProcIOLatencyDetector(loader, report), -+ DiskThroughputDetector(loader, report), -+ NICLossDetector(loader, report), -+ ] - - anomaly_detect = AnomalyDetection(detectors, conf) - -diff --git a/config/gala-anteater.yaml b/config/gala-anteater.yaml -index c4c54a0..72ffc31 100644 ---- a/config/gala-anteater.yaml -+++ b/config/gala-anteater.yaml -@@ -1,6 +1,5 @@ - Global: - data_source: "prometheus" -- sys_level: false - - Kafka: - server: "localhost" -diff --git a/docs/conf_introduction.md b/docs/conf_introduction.md -index 09a7284..869d3e9 100644 ---- a/docs/conf_introduction.md -+++ b/docs/conf_introduction.md -@@ -16,7 +16,6 @@ gala-anteater # gala-anteater 主目录 - 在文件`gala-anteater.yaml`中,配置`gala-anteater`启动时所需的参数。该配置项中,主要包含: - - Global: 配置启动时的全局变量 - - data_source: 时序数据的来源,目前支持`"prometheus"`(Prometheus)和`"aom"`(AOM)两种数据来源; -- - sys_level: 是否支持`系统级`异常检测,可选:`true`、`false`。 - - - Kafka: 配置中间件Kafka所需的参数 - - server: Kafak对应的`server ip`,如:"10.xxx.xxx.xxx"; --- -2.33.0 -
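With the sys_level switch gone, main() always builds the full detector list, and the job configs shipped as *.job.json become the single place where behaviour is tuned; the earlier str2enum fix is what lets those configs carry a plain "atrend" string ("rise"/"fall") that is promoted to the AnomalyTrend enum when a KPI is loaded. The snippet below illustrates that promotion with a cut-down stand-in for the project's KPI dataclass; the field names follow the patch, while the concrete config values are only an example.

    from dataclasses import dataclass, field
    from enum import Enum

    class AnomalyTrend(Enum):
        DEFAULT = 0
        RISE = 1
        FALL = 2

        @staticmethod
        def from_str(label: str):
            # unknown labels fall back to DEFAULT, as in the patch
            if label.upper() == 'RISE':
                return AnomalyTrend.RISE
            if label.upper() == 'FALL':
                return AnomalyTrend.FALL
            return AnomalyTrend.DEFAULT

    @dataclass
    class KPI:  # cut-down stand-in for anteater.core.kpi.KPI
        metric: str
        atrend: AnomalyTrend = AnomalyTrend.DEFAULT
        params: dict = field(default_factory=dict)

        @classmethod
        def from_dict(cls, **data):
            if 'atrend' in data:
                data['atrend'] = AnomalyTrend.from_str(data['atrend'])
            return cls(**data)

    # e.g. one entry from a module *.job.json "KPI" list
    conf = {"metric": "gala_gopher_sli_rtt_nsec", "atrend": "rise",
            "params": {"method": "max", "obs_size": 25, "outlier_ratio_th": 0.5}}
    kpi = KPI.from_dict(**conf)
    assert kpi.atrend is AnomalyTrend.RISE

check_trend() then zeroes the anomaly score whenever the series does not actually move in the configured direction, so an "atrend" of "rise" suppresses alarms on series whose level stays flat or falls over the window.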