From b2ed1a3df17c7c080d156fa2dde11895481d2e97 Mon Sep 17 00:00:00 2001
From: wangfenglai1 <wangfenglai1@huawei.com>
Date: Fri, 17 Jan 2025 17:00:00 +0800
Subject: [PATCH] add local file test case

---
 tests/e2e_tests/test_local_file.py | 209 +++++++++++++++++++++++++++++
 1 file changed, 209 insertions(+)
 create mode 100644 tests/e2e_tests/test_local_file.py

diff --git a/tests/e2e_tests/test_local_file.py b/tests/e2e_tests/test_local_file.py
new file mode 100644
index 0000000..d16b271
--- /dev/null
+++ b/tests/e2e_tests/test_local_file.py
@@ -0,0 +1,209 @@
+#!/usr/bin/python3
+# ******************************************************************************
+# Copyright (c) 2022 Huawei Technologies Co., Ltd.
+# gala-anteater is licensed under Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+#          http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+# See the Mulan PSL v2 for more details.
+# ******************************************************************************/
+"""
+Time: 2025-01-17
+Author: wangfl
+Description: Sliding-window 3-sigma anomaly analysis of local CSV files, with plots and result CSVs.
+"""
+import os
+import argparse
+import logging
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+
+logging.basicConfig(level=logging.ERROR)  # Root logger emits ERROR and above only.
+
+
+def read_table_data_pandas(file_path):
+    """
+    Load tabular data from a CSV file into a pandas DataFrame.
+
+    Args:
+        file_path (str): Path of the CSV file to read.
+
+    Returns:
+        pandas.DataFrame | None: The parsed table, or None when the file is
+        missing or cannot be read (the failure is logged, not raised).
+    """
+    table = None
+    try:
+        table = pd.read_csv(file_path)
+    except FileNotFoundError:
+        logging.error(f"Error: The file {file_path} does not exist.")
+    except Exception as err:
+        logging.error(f"Error: An error occurred while reading the file {file_path}: {err}")
+    return table
+
+
+def calculate_thresholds(window, num_std=3):
+    """
+    Compute the upper and lower anomaly thresholds for one window.
+
+    The band is mean +/- num_std * sample standard deviation of 'value'.
+
+    Args:
+        window (pandas.DataFrame): Window rows; must have a 'value' column.
+        num_std (float): Half-width of the band in standard deviations (default 3).
+
+    Returns:
+        tuple: (upper_threshold, lower_threshold); both NaN for a 1-row window.
+    """
+    center = window['value'].mean()
+    half_width = num_std * window['value'].std()
+    return center + half_width, center - half_width
+
+
+def check_deviation(window, upper_threshold, lower_threshold):
+    """
+    Flag the rows of a window whose value lies outside the threshold band.
+
+    Args:
+        window (pandas.DataFrame): Window rows with a 'value' column.
+        upper_threshold (float): Values strictly above this are flagged.
+        lower_threshold (float): Values strictly below this are flagged.
+
+    Returns:
+        pandas.Series: Boolean series, True where the value is out of band.
+    """
+    return window['value'].lt(lower_threshold) | window['value'].gt(upper_threshold)
+
+
+def check_consecutive_deviation(deviation_status_list, consecutive_count=5):
+    """
+    Keep only the deviations that belong to a long enough unbroken run.
+
+    A position is flagged once the run of consecutive deviating points
+    ending at it has reached consecutive_count; the earlier points of that
+    run, and every non-deviating point, stay unflagged.
+
+    Args:
+        deviation_status_list (list): Per-point deviation flags, in order.
+        consecutive_count (int): Run length needed to flag a point (default 5).
+
+    Returns:
+        list: Booleans, one per input point, True for fault points.
+    """
+    flags = []
+    run_length = 0
+    for is_deviating in deviation_status_list:
+        # A non-deviating point breaks the run and resets the counter.
+        run_length = run_length + 1 if is_deviating else 0
+        # Flag the point only once the current run is long enough.
+        flags.append(run_length >= consecutive_count)
+    return flags
+
+
+def analyze_table_data_pandas(df, window_size=600):
+    """
+    Flag fault points in each non-overlapping window of window_size rows.
+
+    Every full window gets its own thresholds from calculate_thresholds, and
+    check_consecutive_deviation decides which deviating rows count as faults.
+    Trailing rows that do not fill a window keep False / NaN. Rows are
+    addressed by position, so any index works. df is modified in place.
+
+    Args:
+        df (pandas.DataFrame): Table with a 'value' column.
+        window_size (int): Rows per window, default 600; ValueError if not positive.
+
+    Returns:
+        pandas.DataFrame: df with 'deviation_status', 'lower_threshold' and
+        'upper_threshold' columns added.
+    """
+    if window_size <= 0:
+        raise ValueError(f"window_size must be positive, got {window_size}")
+    status = np.zeros(len(df), dtype=bool)
+    lower, upper = np.full(len(df), np.nan), np.full(len(df), np.nan)
+    for start in range(0, len(df) - window_size + 1, window_size):
+        rows = slice(start, start + window_size)
+        window = df.iloc[rows]
+        upper_threshold, lower_threshold = calculate_thresholds(window)
+        deviation = check_deviation(window, upper_threshold, lower_threshold)
+        status[rows] = check_consecutive_deviation(deviation.tolist())
+        lower[rows], upper[rows] = lower_threshold, upper_threshold
+    df['deviation_status'], df['lower_threshold'], df['upper_threshold'] = status, lower, upper
+    return df
+
+
+def plot_table_data(df, output_path, window_size):
+    """
+    Plot the analyzed series with fault markers and threshold lines, then save it.
+
+    Args:
+        df (pandas.DataFrame): Needs 'timestamp', 'value', 'deviation_status',
+            'upper_threshold' and 'lower_threshold' columns.
+        output_path (str): Where the image is written.
+        window_size (int): Window size; kept for interface compatibility, unused here.
+    """
+    fig, ax = plt.subplots(figsize=(10, 6))
+    try:
+        normal_df = df[~df['deviation_status']]
+        ax.plot(normal_df['timestamp'], normal_df['value'], label='Normal', marker='o', linestyle='-', color='blue')
+        fault_df = df[df['deviation_status']]
+        ax.plot(fault_df['timestamp'], fault_df['value'], label='Fault', marker='x', linestyle='None', color='red',
+                markersize=10)
+        ax.plot(df['timestamp'], df["upper_threshold"], label='Upper Threshold', linestyle='--', color='green')
+        ax.plot(df['timestamp'], df["lower_threshold"], label='Lower Threshold', linestyle='--', color='orange')
+        ax.set_xlabel('Timestamp')
+        ax.set_ylabel('Value')
+        ax.set_title('Table Data Analysis')
+        ax.legend()
+        ax.grid(True, which='both', linestyle='--', linewidth=0.5, color='gray')
+        # Save through the figure itself rather than pyplot's "current figure".
+        fig.savefig(output_path)
+    finally:
+        # Always release the figure; otherwise a failure leaks it across files.
+        plt.close(fig)
+
+def save_analysis_result(df, result_path):
+    """
+    Write the analysis result to a CSV file, leaving out the index column.
+
+    Args:
+        df (pandas.DataFrame): Analysis result to persist.
+        result_path (str): Destination path; a write failure is logged, not raised.
+    """
+    try:
+        df.to_csv(result_path, index=False)
+    except Exception as err:
+        logging.error(f"Error: An error occurred while saving the file {result_path}: {err}")
+
+
+def main():
+    """
+    Analyze each CSV file under a directory (default './test') and write
+    '<name>.png' and '<name>_result.csv' next to every '<name>.csv' input.
+    """
+    parser = argparse.ArgumentParser(description='Table Data Analysis')
+    parser.add_argument('directory', type=str, nargs='?', default='./test',
+                        help='Path to the directory containing CSV files')
+    parser.add_argument('--window_size', type=int, default=600, help='Window size for analysis')
+    args = parser.parse_args()
+
+    for root, _dirs, files in os.walk(args.directory):
+        for file in files:
+            # Skip our own '*_result.csv' outputs so a re-run does not analyze them again.
+            if not file.endswith('.csv') or file.endswith('_result.csv'):
+                continue
+            file_name = os.path.splitext(file)[0]
+            df = read_table_data_pandas(os.path.join(root, file))
+            if df is None:
+                continue
+            analysis_result = analyze_table_data_pandas(df, args.window_size)
+            plot_table_data(analysis_result, os.path.join(root, f'{file_name}.png'), args.window_size)
+            save_analysis_result(analysis_result, os.path.join(root, f'{file_name}_result.csv'))
+
+
+if __name__ == "__main__":  # Run the analysis only when executed as a script.
+    main()
\ No newline at end of file
-- 
Gitee