2195 lines
65 KiB
Diff
2195 lines
65 KiB
Diff
|
|
From abdeacfa6ae54b503714cb98f3262a39d883972e Mon Sep 17 00:00:00 2001
|
||
|
|
From: luckky <guodashun1@huawei.com>
|
||
|
|
Date: Fri, 11 Oct 2024 09:49:40 +0000
|
||
|
|
Subject: [PATCH] add hbm online repair
|
||
|
|
|
||
|
|
---
|
||
|
|
config/tasks/hbm_online_repair.mod | 9 +
|
||
|
|
src/c/hbm_online_repair/.gitignore | 6 +
|
||
|
|
src/c/hbm_online_repair/Makefile | 25 +
|
||
|
|
src/c/hbm_online_repair/hbm_online_repair.c | 144 ++++
|
||
|
|
src/c/hbm_online_repair/hbm_online_repair.env | 2 +
|
||
|
|
src/c/hbm_online_repair/logger.h | 31 +
|
||
|
|
.../non-standard-hbm-repair.c | 799 ++++++++++++++++++
|
||
|
|
.../non-standard-hbm-repair.h | 89 ++
|
||
|
|
src/c/hbm_online_repair/ras-events.c | 534 ++++++++++++
|
||
|
|
src/c/hbm_online_repair/ras-events.h | 28 +
|
||
|
|
.../ras-non-standard-handler.c | 81 ++
|
||
|
|
.../ras-non-standard-handler.h | 25 +
|
||
|
|
src/python/.gitignore | 1 +
|
||
|
|
src/python/syssentry/bmc_alarm.py | 159 ++++
|
||
|
|
src/python/syssentry/syssentry.py | 78 +-
|
||
|
|
15 files changed, 2001 insertions(+), 10 deletions(-)
|
||
|
|
create mode 100644 config/tasks/hbm_online_repair.mod
|
||
|
|
create mode 100644 src/c/hbm_online_repair/.gitignore
|
||
|
|
create mode 100644 src/c/hbm_online_repair/Makefile
|
||
|
|
create mode 100644 src/c/hbm_online_repair/hbm_online_repair.c
|
||
|
|
create mode 100644 src/c/hbm_online_repair/hbm_online_repair.env
|
||
|
|
create mode 100644 src/c/hbm_online_repair/logger.h
|
||
|
|
create mode 100644 src/c/hbm_online_repair/non-standard-hbm-repair.c
|
||
|
|
create mode 100644 src/c/hbm_online_repair/non-standard-hbm-repair.h
|
||
|
|
create mode 100644 src/c/hbm_online_repair/ras-events.c
|
||
|
|
create mode 100644 src/c/hbm_online_repair/ras-events.h
|
||
|
|
create mode 100644 src/c/hbm_online_repair/ras-non-standard-handler.c
|
||
|
|
create mode 100644 src/c/hbm_online_repair/ras-non-standard-handler.h
|
||
|
|
create mode 100644 src/python/.gitignore
|
||
|
|
create mode 100644 src/python/syssentry/bmc_alarm.py
|
||
|
|
|
||
|
|
diff --git a/config/tasks/hbm_online_repair.mod b/config/tasks/hbm_online_repair.mod
|
||
|
|
new file mode 100644
|
||
|
|
index 0000000..77dd73e
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/config/tasks/hbm_online_repair.mod
|
||
|
|
@@ -0,0 +1,9 @@
|
||
|
|
+[common]
|
||
|
|
+enabled=yes
|
||
|
|
+task_start=/usr/bin/hbm_online_repair
|
||
|
|
+task_stop=kill $pid
|
||
|
|
+type=period
|
||
|
|
+interval=180
|
||
|
|
+onstart=yes
|
||
|
|
+env_file=/etc/sysconfig/hbm_online_repair.env
|
||
|
|
+conflict=up
|
||
|
|
\ No newline at end of file
|
||
|
|
diff --git a/src/c/hbm_online_repair/.gitignore b/src/c/hbm_online_repair/.gitignore
|
||
|
|
new file mode 100644
|
||
|
|
index 0000000..a577882
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/src/c/hbm_online_repair/.gitignore
|
||
|
|
@@ -0,0 +1,6 @@
|
||
|
|
+*.o
|
||
|
|
+*.c~
|
||
|
|
+*.h~
|
||
|
|
+hbm_online_repair
|
||
|
|
+
|
||
|
|
+.vscode/
|
||
|
|
diff --git a/src/c/hbm_online_repair/Makefile b/src/c/hbm_online_repair/Makefile
|
||
|
|
new file mode 100644
|
||
|
|
index 0000000..16ebcd8
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/src/c/hbm_online_repair/Makefile
|
||
|
|
@@ -0,0 +1,25 @@
|
||
|
|
+# Build rules for the hbm_online_repair tool: every *.c file in this directory
+# is compiled to an object and linked into a single binary.
+CC = gcc
+
+# -O3 (capital O) selects the optimisation level. The lowercase spelling "-o3"
+# is parsed by gcc as "write the output to a file named 3" and leaves the
+# build unoptimised.
+CFLAGS = -Wall -O3
+
+LDFLAGS = -ltraceevent
+
+SRC = $(wildcard *.c)
+HDR = $(wildcard *.h)
+
+OBJ = $(SRC:.c=.o)
+
+TARGET = hbm_online_repair
+
+all: $(TARGET)
+
+$(TARGET): $(OBJ)
+	$(CC) $(OBJ) -o $@ $(LDFLAGS)
+
+# Every object depends on all headers, so touching any header rebuilds everything.
+%.o: %.c $(HDR)
+	$(CC) $(CFLAGS) -c $< -o $@
+
+clean:
+	rm -f $(OBJ) $(TARGET)
+
+.PHONY: all clean
|
||
|
|
diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c
|
||
|
|
new file mode 100644
|
||
|
|
index 0000000..3ace206
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/src/c/hbm_online_repair/hbm_online_repair.c
|
||
|
|
@@ -0,0 +1,144 @@
|
||
|
|
+#include <argp.h>
|
||
|
|
+#include <stdio.h>
|
||
|
|
+#include <stdlib.h>
|
||
|
|
+#include <string.h>
|
||
|
|
+#include <unistd.h>
|
||
|
|
+
|
||
|
|
+#include "logger.h"
|
||
|
|
+#include "ras-events.h"
|
||
|
|
+#include "non-standard-hbm-repair.h"
|
||
|
|
+
|
||
|
|
+#define DEFAULT_LOG_LEVEL LOG_INFO
|
||
|
|
+#define DEFAULT_PAGE_ISOLATION_THRESHOLD 128
|
||
|
|
+
|
||
|
|
+int global_level_setting;
|
||
|
|
+int page_isolation_threshold;
|
||
|
|
+
|
||
|
|
+/*
+ * Parse a base-10 integer from a NUL-terminated string.
+ *
+ * str:   text to parse; NULL is rejected.
+ * value: receives the parsed number; it is written only on success.
+ *
+ * Returns 0 on success. Returns -1 when either pointer is NULL, when the
+ * string holds no digits, when anything follows the number, or when the
+ * number does not fit in an int.
+ */
+int string2int(const char* str, int* value)
+{
+    char *endptr;
+    long val;
+    int narrowed;
+
+    if (!str || !value) {
+        return -1;
+    }
+
+    errno = 0;
+    val = strtol(str, &endptr, 10);
+    /* endptr == str means strtol consumed nothing (empty or non-numeric input) */
+    if (errno != 0 || endptr == str || *endptr != '\0') {
+        return -1;
+    }
+
+    /* reject values that change when narrowed from long to int */
+    narrowed = (int)val;
+    if ((long)narrowed != val) {
+        return -1;
+    }
+
+    *value = narrowed;
+    return 0;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Run a shell command through popen() and report how it ended.
+ *
+ * The first line of the command's output is logged at debug level; the rest
+ * is read and discarded so the child is never killed by SIGPIPE when the
+ * pipe is closed, which would otherwise be reported as an abnormal exit.
+ *
+ * Returns the command's exit status (0-255) when it terminated normally,
+ * or -1 when the command is NULL, popen()/pclose() fails, or the command
+ * did not terminate normally.
+ */
+int execute_command(const char *command)
+{
+    FILE *fp;
+    char buffer[128] = {0};
+    char discard[128];
+    int ret;
+
+    if (!command) {
+        log(LOG_ERROR, "popen failed\n");
+        return -1;
+    }
+
+    fp = popen(command, "r");
+    if (!fp) {
+        log(LOG_ERROR, "popen failed\n");
+        return -1;
+    }
+
+    /* on EOF or read error, make sure an empty string is logged */
+    if (fgets(buffer, sizeof(buffer), fp) == NULL) {
+        buffer[0] = '\0';
+    }
+    log(LOG_DEBUG, "output of command is: %s\n", buffer);
+
+    /* drain whatever else the command printed */
+    while (fgets(discard, sizeof(discard), fp) != NULL) {
+    }
+
+    ret = pclose(fp);
+    if (ret < 0) {
+        log(LOG_ERROR, "pclose failed\n");
+        return -1;
+    }
+
+    if (!WIFEXITED(ret)) {
+        log(LOG_ERROR, "command did not terminate normally\n");
+        return -1;
+    }
+
+    ret = WEXITSTATUS(ret);
+    log(LOG_DEBUG, "command exited with status: %d\n", ret);
+    return ret;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Load the kernel modules the tool relies on (hisi_mem_ras and page_eject)
+ * by running modprobe for each.
+ *
+ * execute_command() returns -1 when the command could not be run and the
+ * command's exit status otherwise, so any non-zero result means the module
+ * was not loaded.
+ *
+ * Returns 0 when both modprobe invocations exit with status 0, -1 otherwise.
+ */
+int load_required_driver(void)
+{
+    int ret;
+
+    ret = execute_command("modprobe hisi_mem_ras 2>&1");
+    if (ret != 0) {
+        log(LOG_ERROR, "load repair driver failed\n");
+        return -1;
+    }
+
+    ret = execute_command("modprobe page_eject 2>&1");
+    if (ret != 0) {
+        log(LOG_ERROR, "load page driver failed\n");
+        return -1;
+    }
+
+    log(LOG_INFO, "load required driver success\n");
+    return 0;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Initialise the runtime settings from the environment.
+ *
+ * HBM_ONLINE_REPAIR_LOG_LEVEL sets global_level_setting and
+ * PAGE_ISOLATION_THRESHOLD sets page_isolation_threshold. A variable that is
+ * unset or cannot be parsed as an int falls back to its DEFAULT_* value, and
+ * so does a negative threshold.
+ */
+void hbm_param_init(void)
+{
+    int ret;
+    char *env;
+
+    env = getenv("HBM_ONLINE_REPAIR_LOG_LEVEL");
+    ret = string2int(env, &global_level_setting);
+    if (ret < 0) {
+        global_level_setting = DEFAULT_LOG_LEVEL;
+        log(LOG_WARNING, "Get log level from config failed, set the default value %d\n", DEFAULT_LOG_LEVEL);
+    } else {
+        log(LOG_INFO, "log level: %d\n", global_level_setting);
+    }
+
+    env = getenv("PAGE_ISOLATION_THRESHOLD");
+    ret = string2int(env, &page_isolation_threshold);
+    /* a negative budget can never be satisfied, so treat it as unparsable */
+    if (ret < 0 || page_isolation_threshold < 0) {
+        page_isolation_threshold = DEFAULT_PAGE_ISOLATION_THRESHOLD;
+        log(LOG_WARNING, "Get page_isolation_threshold from config failed, set the default value %d\n", DEFAULT_PAGE_ISOLATION_THRESHOLD);
+    } else {
+        log(LOG_INFO, "page_isolation_threshold: %d\n", page_isolation_threshold);
+    }
+}
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+/*
+ * Entry point: read settings, load the drivers, enable the "ras"
+ * non_standard_event trace event, process events until handle_ras_events()
+ * returns, then disable the event again.
+ *
+ * A flash-storage initialisation failure is logged but does not stop event
+ * handling. The return value is the driver-load result, -1 when tracing
+ * could not be set up, or the result of disabling the event.
+ */
+int main(int argc, char *argv[])
+{
+    struct ras_events *ras;
+    int rc;
+
+    hbm_param_init();
+
+    rc = load_required_driver();
+    if (rc < 0) {
+        log(LOG_DEBUG, "load required driver failed\n");
+        return rc;
+    }
+
+    ras = init_trace_instance();
+    if (ras == NULL) {
+        return -1;
+    }
+
+    if (toggle_ras_event(ras->tracing, "ras", "non_standard_event", 1) < 0) {
+        log(LOG_WARNING, "unable to enable ras non_standard_event.\n");
+        rc = -1;
+        goto out;
+    }
+
+    if (init_all_flash() < 0) {
+        log(LOG_ERROR, "flash writer init failed\n");
+    }
+
+    handle_ras_events(ras);
+
+    rc = toggle_ras_event(ras->tracing, "ras", "non_standard_event", 0);
+    if (rc < 0) {
+        log(LOG_WARNING, "unable to disable ras non_standard_event.\n");
+    }
+
+out:
+    free(ras);
+    return rc;
+}
|
||
|
|
diff --git a/src/c/hbm_online_repair/hbm_online_repair.env b/src/c/hbm_online_repair/hbm_online_repair.env
|
||
|
|
new file mode 100644
|
||
|
|
index 0000000..de56079
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/src/c/hbm_online_repair/hbm_online_repair.env
|
||
|
|
@@ -0,0 +1,2 @@
|
||
|
|
+HBM_ONLINE_REPAIR_LOG_LEVEL=1
|
||
|
|
+PAGE_ISOLATION_THRESHOLD=128
|
||
|
|
diff --git a/src/c/hbm_online_repair/logger.h b/src/c/hbm_online_repair/logger.h
|
||
|
|
new file mode 100644
|
||
|
|
index 0000000..ddfa932
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/src/c/hbm_online_repair/logger.h
|
||
|
|
@@ -0,0 +1,31 @@
|
||
|
|
+/* Minimal levelled logging for hbm_online_repair. */
+#ifndef LOGGER_H
+#define LOGGER_H
+
+/* the macros below expand to fprintf/fflush/stdout/stderr */
+#include <stdio.h>
+
+#define TOOL_NAME "hbm_online_repair"
+
+/* Severity levels; a message is emitted when its level >= global_level_setting. */
+#define LOG_DEBUG 0
+#define LOG_INFO 1
+#define LOG_WARNING 2
+#define LOG_ERROR 3
+
+/* Current threshold; defined once in a .c file. */
+extern int global_level_setting;
+
+/* Text tag printed in front of a message of the given level. */
+#define log_prefix(level) \
+    ((level) == LOG_DEBUG ? "DEBUG" : \
+     (level) == LOG_INFO ? "INFO" : \
+     (level) == LOG_WARNING ? "WARNING" : \
+     (level) == LOG_ERROR ? "ERROR" : \
+     "UNKNOWN_LEVEL")
+
+/* Errors go to stderr, everything else to stdout. */
+#define log_fd(level) \
+    ((level) == LOG_ERROR ? stderr : stdout)
+
+/*
+ * log(level, fmt, ...): printf-style message prefixed with "[LEVEL] TOOL_NAME: ".
+ * The stream is flushed after every message. `level` is expanded several
+ * times, so pass an expression without side effects.
+ */
+#define log(level, fmt, args...) do {\
+    if ((level) >= global_level_setting) {\
+        fprintf(log_fd(level), "[%s] %s: ", log_prefix(level), TOOL_NAME);\
+        fprintf(log_fd(level), fmt, ##args);\
+        fflush(log_fd(level));\
+    }\
+} while (0)
+
+#endif
|
||
|
|
diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c
|
||
|
|
new file mode 100644
|
||
|
|
index 0000000..b175e14
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c
|
||
|
|
@@ -0,0 +1,799 @@
|
||
|
|
+#include <stdio.h>
|
||
|
|
+#include <stdlib.h>
|
||
|
|
+#include <string.h>
|
||
|
|
+#include <dirent.h>
|
||
|
|
+#include <errno.h>
|
||
|
|
+#include <fcntl.h>
|
||
|
|
+#include <unistd.h>
|
||
|
|
+#include <stdbool.h>
|
||
|
|
+#include <sys/socket.h>
|
||
|
|
+#include <sys/un.h>
|
||
|
|
+#include <linux/fs.h>
|
||
|
|
+#include <sys/stat.h>
|
||
|
|
+
|
||
|
|
+#include "logger.h"
|
||
|
|
+#include "non-standard-hbm-repair.h"
|
||
|
|
+
|
||
|
|
+/* Isolation budget compared against kB figures in hbmc_hbm_page_isolate(); defined in another file. */
+extern int page_isolation_threshold;
+
+/* Running byte total of the fault-record variables; compared with MAX_VAR_SIZE before each append. */
+size_t total_size = 0;
|
||
|
|
+/*
+ * Error record consumed by the HBM repair code in this file.
+ * NOTE(review): presumably the payload of the ras non_standard_event for
+ * HiSilicon hardware; confirm against the event producer.
+ */
+struct hisi_common_error_section {
+    uint32_t   val_bits;
+    uint8_t    version;
+    uint8_t    soc_id;
+    uint8_t    socket_id;
+    uint8_t    totem_id;
+    uint8_t    nimbus_id;
+    uint8_t    subsystem_id;
+    uint8_t    module_id;
+    uint8_t    submodule_id;
+    uint8_t    core_id;
+    uint8_t    port_id;
+    uint16_t   err_type;
+    struct {
+        uint8_t    function;
+        uint8_t    device;
+        uint16_t   segment;
+        uint8_t    bus;
+        uint8_t    reserved[3];
+    } pcie_info;
+    uint8_t    err_severity;
+    uint8_t    reserved[3];
+    /* size of reg_array in bytes; users divide by sizeof(uint32_t) for the count */
+    uint32_t   reg_array_size;
+    /* trailing registers; the last two hold the low and high halves of the fault address */
+    uint32_t   reg_array[];
+};
|
||
|
|
+
|
||
|
|
+/*
+ * Decoded form of the packed 64-bit fault address. Members are listed in the
+ * order parse_fault_addr_info() extracts them, starting at the
+ * least-significant bits.
+ */
+struct fault_addr_info {
+    uint32_t   processer_id;
+    uint32_t   die_id;
+    uint32_t   stack_id;
+    uint32_t   sid;
+    uint32_t   channel_id;
+    uint32_t   bankgroup_id;
+    uint32_t   bank_id;
+    uint32_t   row_id;
+    uint32_t   column_id;
+    uint32_t   error_type;
+    uint32_t   repair_type;
+    uint32_t   reserved;
+    uint32_t   crc8;          /* whatever bits remain above the reserved field */
+};
|
||
|
|
+
|
||
|
|
+/*
+ * Description of one EFI variable: its name, vendor GUID string, payload and
+ * attribute bits.
+ * NOTE(review): not referenced by the code visible in this part of the file.
+ */
+typedef struct {
+    const char   *VariableName;
+    const char   *VendorGuid;
+    uint32_t      DataSize;      /* presumably the length of Data in bytes; confirm */
+    uint8_t      *Data;
+    uint32_t      Attributes;
+} efi_variable_t;
|
||
|
|
+
|
||
|
|
+/*
+ * Names of the EFI variables that hold fault records. Entry i is used together
+ * with flash_guids[i]; get_guid_index() picks the entry as
+ * 2 * socket_id + error_type.
+ */
+char *flash_names[FLASH_ENTRY_NUM] = {
+    "repair0000", "repair0001",
+    "repair0100", "repair0101",
+    "repair0200", "repair0201",
+    "repair0300", "repair0301",
+};
|
||
|
|
+/*
+ * Vendor GUID strings of the fault-record EFI variables, index-aligned with
+ * flash_names: the efivarfs file for entry i is "<name>-<guid>".
+ */
+char *flash_guids[FLASH_ENTRY_NUM] = {
+    "CD2FF4D9-D937-4e1d-B810-A1A568C37C01",
+    "DD92CC91-43E6-4c69-A42A-B08F72FCB157",
+    "4A8E0D1E-4CFA-47b2-9359-DA3A0006878B",
+    "733F9979-4ED4-478d-BD6A-E4D0F0390FDB",
+    "9BFBBA1F-5A93-4d36-AD47-D3C2D714D914",
+    "A0920D6F-78B8-4c09-9F61-7CEC845F116C",
+    "0049CE5E-8C18-414c-BDC1-A87E60CEEFD7",
+    "6AED17B4-50C7-4a40-A5A7-48AF55DD8EAC"
+};
|
||
|
|
+
|
||
|
|
+/*
+ * Map (socket_id, error_type) to an index into flash_names/flash_guids.
+ *
+ * The tables hold two consecutive entries per socket, so error_type must be
+ * 0 or 1 and socket_id must be below FLASH_ENTRY_NUM / 2. Each component is
+ * checked on its own: checking only the sum would let an out-of-range
+ * error_type select another socket's entry, and a very large socket_id
+ * could wrap around in the multiplication.
+ *
+ * Returns the index, or -1 when either component is out of range.
+ */
+static int get_guid_index(uint32_t socket_id, uint32_t error_type) {
+    const uint32_t types_per_socket = 2;
+
+    if (error_type >= types_per_socket)
+        return -1;
+    if (socket_id >= FLASH_ENTRY_NUM / types_per_socket)
+        return -1;
+    return (int)(types_per_socket * socket_id + error_type);
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Split a packed 64-bit fault address into its fields.
+ *
+ * Fields are taken from the least-significant end upward: each one is masked
+ * out with its FAULT_ADDR_*_MASK and the address is then shifted right by the
+ * matching FAULT_ADDR_*_LEN. Whatever remains after the reserved field is
+ * stored as crc8.
+ *
+ * NOTE(review): the shift after column_id uses FAULT_ADDR_COLUMN_ID_LEN,
+ * which is assumed to be defined next to the other FAULT_ADDR_*_LEN macros;
+ * shifting by the channel length there would misalign every later field.
+ */
+static void parse_fault_addr_info(struct fault_addr_info* info_struct, unsigned long long fault_addr)
+{
+    info_struct->processer_id = fault_addr & FAULT_ADDR_PROCESSOR_ID_MASK;
+    fault_addr >>= FAULT_ADDR_PROCESSOR_ID_LEN;
+    info_struct->die_id = fault_addr & FAULT_ADDR_DIE_ID_MASK;
+    fault_addr >>= FAULT_ADDR_DIE_ID_LEN;
+    info_struct->stack_id = fault_addr & FAULT_ADDR_STACK_ID_MASK;
+    fault_addr >>= FAULT_ADDR_STACK_ID_LEN;
+    info_struct->sid = fault_addr & FAULT_ADDR_SID_MASK;
+    fault_addr >>= FAULT_ADDR_SID_LEN;
+    info_struct->channel_id = fault_addr & FAULT_ADDR_CHANNEL_ID_MASK;
+    fault_addr >>= FAULT_ADDR_CHANNEL_ID_LEN;
+    info_struct->bankgroup_id = fault_addr & FAULT_ADDR_BANKGROUP_ID_MASK;
+    fault_addr >>= FAULT_ADDR_BANKGROUP_ID_LEN;
+    info_struct->bank_id = fault_addr & FAULT_ADDR_BANK_ID_MASK;
+    fault_addr >>= FAULT_ADDR_BANK_ID_LEN;
+    info_struct->row_id = fault_addr & FAULT_ADDR_ROW_ID_MASK;
+    fault_addr >>= FAULT_ADDR_ROW_ID_LEN;
+    info_struct->column_id = fault_addr & FAULT_ADDR_COLUMN_ID_MASK;
+    fault_addr >>= FAULT_ADDR_COLUMN_ID_LEN;
+    info_struct->error_type = fault_addr & FAULT_ADDR_ERROR_TYPE_MASK;
+    fault_addr >>= FAULT_ADDR_ERROR_TYPE_LEN;
+    info_struct->repair_type = fault_addr & FAULT_ADDR_REPAIR_TYPE_MASK;
+    fault_addr >>= FAULT_ADDR_REPAIR_TYPE_LEN;
+    info_struct->reserved = fault_addr & FAULT_ADDR_RESERVED_MASK;
+    fault_addr >>= FAULT_ADDR_RESERVED_LEN;
+    info_struct->crc8 = (uint32_t)fault_addr;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Tell whether the efivarfs file "<name>-<guid>" can be opened for reading.
+ * A failed open is logged as a warning and reported as "does not exist".
+ */
+static bool variable_existed(char *name, char *guid)
+{
+    char path[PATH_MAX];
+    int handle;
+    bool found;
+
+    snprintf(path, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid);
+
+    /* only the outcome of open() matters; the descriptor is closed right away */
+    handle = open(path, O_RDONLY);
+    found = (handle >= 0);
+    if (found) {
+        close(handle);
+    } else {
+        log(LOG_WARNING, "open file %s failed\n", path);
+    }
+    return found;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Read the attribute word stored in the first 4 bytes of the efivarfs file
+ * "<name>-<guid>".
+ *
+ * Returns the attributes, or (uint32_t)-1 when the file cannot be opened or
+ * fewer than 4 bytes could be read.
+ */
+static uint32_t read_variable_attribute(char *name, char *guid) {
+    char filename[PATH_MAX];
+    int fd;
+    ssize_t readsize;
+    uint32_t attribute = (uint32_t)-1;
+
+    snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid);
+
+    // open var file
+    fd = open(filename, O_RDONLY);
+    if (fd < 0) {
+        log(LOG_ERROR, "open %s failed\n", filename);
+        return attribute;
+    }
+
+    // read attributes from first 4 bytes
+    readsize = read(fd, &attribute, sizeof(attribute));
+    if (readsize != (ssize_t)sizeof(attribute)) {
+        log(LOG_ERROR, "read attribute of %s failed\n", filename);
+        /* a short read may have filled part of the word: report failure instead */
+        attribute = (uint32_t)-1;
+    }
+
+    close(fd);
+    return attribute;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Clear (mutable == true) or set (mutable == false) the FS_IMMUTABLE_FL inode
+ * flag on the efivarfs file "<name>-<guid>".
+ *
+ * The flag word is an int: FS_IOC_GETFLAGS and FS_IOC_SETFLAGS transfer an
+ * int through the pointer even though their request macros are declared with
+ * long, so a wider variable would be only partially written.
+ *
+ * Returns 0 on success, including when the flag already has the requested
+ * state, and -1 when the file cannot be opened or an ioctl fails.
+ */
+static int efivarfs_set_mutable(char *name, char *guid, bool mutable)
+{
+    int orig_attrs = 0;
+    int new_attrs;
+    char filename[PATH_MAX];
+    int fd;
+
+    snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid);
+
+    fd = open(filename, O_RDONLY);
+    if (fd < 0) {
+        log(LOG_ERROR, "open %s failed\n", filename);
+        goto err;
+    }
+
+    if (ioctl(fd, FS_IOC_GETFLAGS, &orig_attrs) == -1) {
+        log(LOG_ERROR, "ioctl FS_IOC_GETFLAGS failed\n");
+        goto err;
+    }
+
+    if (mutable)
+        new_attrs = orig_attrs & ~FS_IMMUTABLE_FL;
+    else
+        new_attrs = orig_attrs | FS_IMMUTABLE_FL;
+
+    /* nothing to change */
+    if (new_attrs == orig_attrs) {
+        close(fd);
+        return 0;
+    }
+
+    if (ioctl(fd, FS_IOC_SETFLAGS, &new_attrs) == -1) {
+        log(LOG_ERROR, "ioctl FS_IOC_SETFLAGS failed\n");
+        goto err;
+    }
+    close(fd);
+    return 0;
+err:
+    if (fd >= 0)
+        close(fd);
+    return -1;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Write an EFI variable through efivarfs.
+ *
+ * The file content is the 4-byte attribute word followed by `size` bytes from
+ * `value`. With EFI_VARIABLE_APPEND_WRITE set in `attribute` the file is
+ * opened with O_APPEND and must already exist; otherwise it is opened with
+ * O_CREAT so a missing variable is created.
+ *
+ * An existing file has its immutable flag cleared for the duration of the
+ * write. That step is skipped for a file that does not exist yet, because the
+ * flag can only be changed through an open descriptor of the file itself.
+ * The flag is set again on the way out, whether or not the write succeeded.
+ *
+ * Returns 0 on success and -1 on any failure.
+ */
+static int write_variable(char *name, char *guid, void *value, unsigned long size, uint32_t attribute) {
+    int fd = -1;
+    int mode;
+    int rc = -1;
+    ssize_t writesize;
+    unsigned char *buffer;
+    unsigned long total;
+    bool existed;
+    char filename[PATH_MAX];
+
+    snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid);
+
+    // prepare attributes(size 4 bytes) and data
+    total = size + sizeof(uint32_t);
+    buffer = malloc(total);
+    if (buffer == NULL) {
+        log(LOG_ERROR, "malloc data for %s failed\n", filename);
+        return -1;
+    }
+    memcpy(buffer, &attribute, sizeof(uint32_t));
+    memcpy(buffer + sizeof(uint32_t), value, size);
+
+    // change attr (only possible when the file is already there)
+    existed = (access(filename, F_OK) == 0);
+    if (existed && efivarfs_set_mutable(name, guid, 1) != 0) {
+        log(LOG_ERROR, "set mutable for %s failed\n", filename);
+        goto out;
+    }
+
+    mode = O_WRONLY;
+    if (attribute & EFI_VARIABLE_APPEND_WRITE)
+        mode |= O_APPEND;
+    else
+        mode |= O_CREAT;
+
+    // open var file
+    fd = open(filename, mode, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+    if (fd < 0) {
+        log(LOG_ERROR, "open %s failed\n", filename);
+        goto out;
+    }
+
+    // write to var file
+    writesize = write(fd, buffer, total);
+    if (writesize < 0 || (unsigned long)writesize != total) {
+        log(LOG_ERROR, "write %s failed\n", filename);
+        goto out;
+    }
+    rc = 0;
+
+out:
+    if (fd >= 0)
+        close(fd);
+    free(buffer);
+    /* re-protect the file if it was there before or has just been written */
+    if ((existed || rc == 0) && efivarfs_set_mutable(name, guid, 0) != 0) {
+        log(LOG_ERROR, "set immutable for %s failed\n", filename);
+    }
+    return rc;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Append `size` bytes from `data` to the existing variable "<name>-<guid>".
+ * The variable's current attributes are read back and written again with
+ * EFI_VARIABLE_APPEND_WRITE added.
+ *
+ * Returns the result of write_variable(), or -1 when the attributes cannot
+ * be read.
+ */
+static int append_variable(char *name, char *guid, void *data, unsigned long size) {
+    const uint32_t invalid_attr = (uint32_t)-1;
+    uint32_t current = read_variable_attribute(name, guid);
+
+    if (current == invalid_attr) {
+        log(LOG_ERROR, "read %s-%s attribute failed\n", name, guid);
+        return -1;
+    }
+
+    return write_variable(name, guid, data, size, current | EFI_VARIABLE_APPEND_WRITE);
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Return the size fstat() reports for the efivarfs file "<name>-<guid>",
+ * or (size_t)-1 when the file cannot be opened or examined.
+ */
+static size_t get_var_size(char *name, char *guid) {
+    char path[PATH_MAX];
+    struct stat info;
+    int handle;
+
+    snprintf(path, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid);
+
+    handle = open(path, O_RDONLY);
+    if (handle < 0) {
+        log(LOG_WARNING, "open %s failed\n", path);
+        return (size_t)-1;
+    }
+
+    if (fstat(handle, &info) != 0) {
+        log(LOG_WARNING, "fstat %s failed\n", path);
+        close(handle);
+        return (size_t)-1;
+    }
+
+    close(handle);
+    return info.st_size;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Make sure every fault-record variable exists and add up the space they use.
+ *
+ * An existing variable contributes its file size to total_size; a variable
+ * whose size cannot be determined is logged and left out of the total rather
+ * than having the (size_t)-1 failure value added in. A missing variable is
+ * created with a single zero byte of data and non-volatile, boot-service and
+ * runtime access attributes.
+ *
+ * Returns 0 on success, including when the total already exceeds
+ * MAX_VAR_SIZE (only logged), and -1 when a variable cannot be created.
+ */
+int init_all_flash(void) {
+    for (int i = 0; i < FLASH_ENTRY_NUM; i++) {
+        // check existed entry
+        if (variable_existed(flash_names[i], flash_guids[i])) {
+            size_t var_size = get_var_size(flash_names[i], flash_guids[i]);
+            if (var_size == (size_t)-1) {
+                log(LOG_WARNING, "get size of %s-%s failed, not counted in flash usage\n", flash_names[i], flash_guids[i]);
+                continue;
+            }
+            total_size += var_size;
+            continue;
+        }
+        // create new entry
+        uint32_t attribute = EFI_VARIABLE_NON_VOLATILE |
+                             EFI_VARIABLE_BOOTSERVICE_ACCESS |
+                             EFI_VARIABLE_RUNTIME_ACCESS;
+        char *data = "";
+        unsigned long size = 1;
+        int ret = write_variable(flash_names[i], flash_guids[i], data, size, attribute);
+        if (ret) {
+            log(LOG_ERROR, "init %s-%s failed, fault info storage funtion not enabled\n", flash_names[i], flash_guids[i]);
+            return -1;
+        }
+        /* attribute word plus the one data byte just written */
+        total_size += sizeof(uint32_t) + 1;
+    }
+    // check total entry size
+    log(LOG_DEBUG, "current fault info total size: %zuKB, flash max threshold: %uKB\n",
+        total_size / KB_SIZE, MAX_VAR_SIZE / KB_SIZE);
+    if (total_size > MAX_VAR_SIZE) {
+        log(LOG_ERROR, "fault info storage reach threshold, cannot save new record\n");
+    }
+    return 0;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Append the 64-bit fault address of an error record to the EFI variable
+ * selected by the address's processor id and error type.
+ *
+ * The address is assembled from the last two entries of err->reg_array (the
+ * last one is the high half), so records with fewer than two registers are
+ * rejected.
+ *
+ * Returns 0 on success. Returns -1 when the storage budget (MAX_VAR_SIZE)
+ * would be exceeded, the record is too short, no variable matches the
+ * address, or the append fails.
+ */
+static int write_fault_info_to_flash(const struct hisi_common_error_section *err) {
+    int ret, guid_index;
+    uint32_t reg_size;
+    uint64_t fault_addr;
+    struct fault_addr_info info_struct;
+
+    // check flash usage threshold
+    if (total_size + sizeof(uint64_t) > MAX_VAR_SIZE) {
+        log(LOG_WARNING, "fault info storage reach threshold, cannot save new record into flash\n");
+        return -1;
+    }
+
+    // parse physical addr
+    reg_size = err->reg_array_size / sizeof(uint32_t);
+    if (reg_size < 2) {
+        /* reg_size - 1 and reg_size - 2 would wrap around below */
+        log(LOG_ERROR, "invalid fault info\n");
+        return -1;
+    }
+    fault_addr = err->reg_array[reg_size - 1];
+    fault_addr <<= TYPE_UINT32_WIDTH;
+    fault_addr += err->reg_array[reg_size - 2];
+
+    // get guid
+    parse_fault_addr_info(&info_struct, fault_addr);
+    guid_index = get_guid_index(info_struct.processer_id, info_struct.error_type);
+    if (guid_index < 0) {
+        log(LOG_ERROR, "invalid fault info\n");
+        return -1;
+    }
+    // record physical addr in flash
+    ret = append_variable(flash_names[guid_index], flash_guids[guid_index], &fault_addr, sizeof(uint64_t));
+    if (ret < 0) {
+        log(LOG_ERROR, "append to %s-%s failed\n", flash_names[guid_index], flash_guids[guid_index]);
+        return -1;
+    }
+    total_size += sizeof(uint64_t);
+    log(LOG_INFO, "write hbm fault info to flash success\n");
+    return 0;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Write `value` as "0x<hex>\n" to the file <path>/<name>.
+ *
+ * Returns 0 when at least one byte was written, otherwise a negative errno
+ * value. errno is captured right after the failing call because logging and
+ * close() may change it, and callers map the code to a repair result. A write
+ * that transfers zero bytes is reported as -EIO.
+ */
+static int write_file(char *path, const char *name, unsigned long long value)
+{
+    char fname[MAX_PATH];
+    char buf[20];   /* "0x" + up to 16 hex digits + '\n' + NUL */
+    int saved_errno;
+    ssize_t ret;
+    int fd;
+
+    snprintf(fname, MAX_PATH, "%s/%s", path, name);
+
+    fd = open(fname, O_WRONLY);
+    if (fd < 0) {
+        saved_errno = errno;
+        log(LOG_WARNING, "HBM ACLS: Cannot to open '%s': %s\n",
+            fname, strerror(saved_errno));
+        return -saved_errno;
+    }
+
+    snprintf(buf, sizeof(buf), "0x%llx\n", value);
+    ret = write(fd, buf, strlen(buf));
+    if (ret <= 0) {
+        saved_errno = (ret < 0) ? errno : EIO;
+        log(LOG_WARNING, "HBM ACLS: Failed to set %s (0x%llx): %s\n",
+            fname, value, strerror(saved_errno));
+        close(fd);
+        return -saved_errno;
+    }
+
+    close(fd);
+    return 0;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Read the "HardwareCorrupted:" figure (in kB) from /proc/meminfo.
+ *
+ * The number is parsed with strtol() so it is never cut short by a field
+ * width; a value that is negative or does not fit in an int counts as a
+ * failure.
+ *
+ * Returns the value in kB, or -1 when the file cannot be opened, the line is
+ * missing, or the number cannot be parsed.
+ */
+static int get_hardware_corrupted_size(void)
+{
+    FILE *fp;
+    char line[256];
+    int hardware_corrupted_size = -1;
+    const char *key = "HardwareCorrupted:";
+
+    fp = fopen("/proc/meminfo", "r");
+    if (fp == NULL) {
+        log(LOG_ERROR, "Failed to open /proc/meminfo\n");
+        return -1;
+    }
+
+    while (fgets(line, sizeof(line), fp) != NULL) {
+        char *pos = strstr(line, key);
+        char *end;
+        long kb;
+
+        if (pos == NULL) {
+            continue;
+        }
+
+        /* strtol skips the blanks between the key and the number */
+        pos += strlen(key);
+        errno = 0;
+        kb = strtol(pos, &end, 10);
+        if (errno == 0 && end != pos && kb >= 0 && (long)(int)kb == kb) {
+            hardware_corrupted_size = (int)kb;
+        }
+        break;
+    }
+
+    fclose(fp);
+    return hardware_corrupted_size;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Translate a negative errno value from a repair attempt into a result code:
+ * -ENOSPC gives REPAIR_FAILED_NO_RESOURCE, -ENXIO and -EINVAL give
+ * REPAIR_FAILED_INVALID_PARAM, and everything else (including -EIO) gives
+ * REPAIR_FAILED_OTHER_REASON.
+ */
+static uint8_t get_repair_result_code(int ret)
+{
+    switch (ret) {
+    case -ENOSPC:
+        return REPAIR_FAILED_NO_RESOURCE;
+    case -ENXIO:
+    case -EINVAL:
+        return REPAIR_FAILED_INVALID_PARAM;
+    case -EIO:
+    default:
+        return REPAIR_FAILED_OTHER_REASON;
+    }
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Report the outcome of an isolation/repair attempt over the UNIX stream
+ * socket at BMC_SOCKET_PATH.
+ *
+ * The message is BMC_REPORT_FORMAT filled with a repair type code and an
+ * isolation type code (both derived from reg_array[HBM_REPAIR_REQ_TYPE]),
+ * the caller's repair_result_code, and the location fields decoded from the
+ * fault address. The fault address comes from the last two entries of
+ * err->reg_array, so records with fewer than two registers are rejected
+ * before any socket is opened.
+ *
+ * Returns 0 when the message was written, -1 otherwise.
+ */
+static int notice_BMC(const struct hisi_common_error_section *err, uint8_t repair_result_code)
+{
+    int sockfd;
+    int msg_len;
+    struct sockaddr_un addr;
+    char bmc_msg[sizeof(BMC_REPORT_FORMAT)] = {0};
+    uint8_t repair_type_code, isolation_type_code;
+    uint32_t repair_type;
+    unsigned long long fault_addr;
+    struct fault_addr_info info_struct;
+    const uint32_t reg_size = err->reg_array_size / sizeof(uint32_t);
+
+    if (reg_size < 2) {
+        /* reg_size - 1 and reg_size - 2 would wrap around below */
+        log(LOG_ERROR, "Failed to notice BMC: register array too short (%u)\n", reg_size);
+        return -1;
+    }
+
+    sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
+    if (sockfd < 0) {
+        log(LOG_ERROR, "Failed to create BMC notice socket\n");
+        return -1;
+    }
+
+    /* the memset keeps sun_path NUL-terminated after the bounded copy */
+    memset(&addr, 0, sizeof(struct sockaddr_un));
+    addr.sun_family = AF_UNIX;
+    strncpy(addr.sun_path, BMC_SOCKET_PATH, sizeof(addr.sun_path) - 1);
+    if (connect(sockfd, (struct sockaddr *)&addr, sizeof(struct sockaddr_un)) < 0) {
+        log(LOG_ERROR, "Failed to connect BMC notice socket\n");
+        close(sockfd);
+        return -1;
+    }
+
+    /* assemble bmc specific msg */
+    repair_type_code = 0;
+    isolation_type_code = 0;
+    repair_type = err->reg_array[HBM_REPAIR_REQ_TYPE];
+    if (repair_type & HBM_CE_ACLS) {
+        repair_type_code = 0;
+        isolation_type_code = SINGLE_ADDR_FAULT;
+    } else if (repair_type & HBM_PSUE_ACLS) {
+        repair_type_code = 1;
+        isolation_type_code = SINGLE_ADDR_FAULT;
+    } else if (repair_type & HBM_CE_SPPR) {
+        repair_type_code = 2;
+        isolation_type_code = ROW_FAULT;
+    } else if (repair_type & HBM_PSUE_SPPR) {
+        repair_type_code = 3;
+        isolation_type_code = ROW_FAULT;
+    }
+
+    fault_addr = err->reg_array[reg_size - 1];
+    fault_addr <<= TYPE_UINT32_WIDTH;
+    fault_addr += err->reg_array[reg_size - 2];
+
+    log(LOG_DEBUG, "Get the fault addr is %llu\n", fault_addr);
+
+    parse_fault_addr_info(&info_struct, fault_addr);
+
+    log(LOG_DEBUG, "info_struct.processer_id is %u\n", info_struct.processer_id);
+    log(LOG_DEBUG, "info_struct.die_id is %u\n", info_struct.die_id);
+    log(LOG_DEBUG, "info_struct.stack_id is %u\n", info_struct.stack_id);
+    log(LOG_DEBUG, "info_struct.sid is %u\n", info_struct.sid);
+    log(LOG_DEBUG, "info_struct.channel_id is %u\n", info_struct.channel_id);
+    log(LOG_DEBUG, "info_struct.bankgroup_id is %u\n", info_struct.bankgroup_id);
+    log(LOG_DEBUG, "info_struct.bank_id is %u\n", info_struct.bank_id);
+    log(LOG_DEBUG, "info_struct.row_id is %u\n", info_struct.row_id);
+    log(LOG_DEBUG, "info_struct.column_id is %u\n", info_struct.column_id);
+    log(LOG_DEBUG, "info_struct.error_type is %u\n", info_struct.error_type);
+    log(LOG_DEBUG, "info_struct.repair_type is %u\n", info_struct.repair_type);
+    log(LOG_DEBUG, "info_struct.reserved is %u\n", info_struct.reserved);
+    log(LOG_DEBUG, "info_struct.crc8 is %u\n", info_struct.crc8);
+
+    msg_len = snprintf(bmc_msg, sizeof(BMC_REPORT_FORMAT), BMC_REPORT_FORMAT,
+        repair_type_code,
+        repair_result_code,
+        isolation_type_code,
+        info_struct.processer_id,
+        info_struct.die_id,
+        info_struct.stack_id,
+        info_struct.sid,
+        info_struct.channel_id,
+        info_struct.bankgroup_id,
+        info_struct.bank_id,
+        info_struct.row_id,
+        info_struct.column_id
+    );
+    /* the buffer is only as large as the format string itself */
+    if (msg_len < 0 || (size_t)msg_len >= sizeof(bmc_msg)) {
+        log(LOG_WARNING, "BMC notice message was truncated\n");
+    }
+
+    log(LOG_DEBUG, "Send msg to sysSentry, bmc msg is %s\n", bmc_msg);
+
+    if (write(sockfd, bmc_msg, strlen(bmc_msg)) <= 0) {
+        log(LOG_ERROR, "Failed to send data to BMC notice socket\n");
+        close(sockfd);
+        return -1;
+    }
+
+    close(sockfd);
+    return 0;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Take the page(s) named by an error record offline through
+ * /sys/kernel/page_eject/offline_page.
+ *
+ * The request is refused when the current HardwareCorrupted size plus the
+ * size about to be isolated would exceed page_isolation_threshold. An ACLS
+ * request isolates the single address in reg_array[HBM_ADDH]/[HBM_ADDL]; an
+ * SPPR request walks HBM_SPPR_ADDR_NUM consecutive high/low address pairs and
+ * keeps going after a failure. Every failure path reports to notice_BMC().
+ *
+ * Returns 0 when every page was offlined and a negative value otherwise.
+ */
+static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err)
+{
+    unsigned long long paddr;
+    int ret = 0;
+    bool is_acls = err->reg_array[HBM_REPAIR_REQ_TYPE] & (HBM_CE_ACLS | HBM_PSUE_ACLS);
+    int required_isolate_size = (is_acls ? HBM_ACLS_ADDR_NUM : HBM_SPPR_ADDR_NUM) * DEFAULT_PAGE_SIZE_KB;
+    int hardware_corrupted_size = get_hardware_corrupted_size();
+    if (hardware_corrupted_size < 0) {
+        log(LOG_ERROR, "Page isolate failed: Get hardware_corrupted_size failed\n");
+        notice_BMC(err, ISOLATE_FAILED_OTHER_REASON);
+        return -1;
+    }
+    if ((required_isolate_size + hardware_corrupted_size) > page_isolation_threshold) {
+        log(LOG_INFO, "Page isolate failed: the isolation resource is not enough\n");
+        notice_BMC(err, ISOLATE_FAILED_OVER_THRESHOLD);
+        return -1;
+    }
+    if (is_acls) {
+        /* ACLS */
+        paddr = err->reg_array[HBM_ADDH];
+        paddr <<= TYPE_UINT32_WIDTH;
+        paddr += err->reg_array[HBM_ADDL];
+
+        ret = write_file("/sys/kernel/page_eject", "offline_page", paddr);
+        if (ret < 0) {
+            notice_BMC(err, ISOLATE_FAILED_OTHER_REASON);
+            log(LOG_WARNING, "HBM: ACLS offline failed, address is 0x%llx \n", paddr);
+            return ret;
+        }
+    } else {
+        /* SPPR */
+        bool all_success = true;
+        uint32_t i;
+        for (i = 0; i < HBM_SPPR_ADDR_NUM; i++) {
+            paddr = err->reg_array[2 * i + HBM_ADDH];
+            paddr <<= TYPE_UINT32_WIDTH;
+            paddr += err->reg_array[2 * i + HBM_ADDL];
+            ret = write_file("/sys/kernel/page_eject", "offline_page", paddr);
+            if (ret < 0) {
+                all_success = false;
+                log(LOG_WARNING, "HBM: SPPR offline failed, address is 0x%llx \n", paddr);
+                continue;
+            }
+        }
+        /* one failed address fails the whole request, even if later ones worked */
+        if (!all_success) {
+            notice_BMC(err, ISOLATE_FAILED_OTHER_REASON);
+            ret = -1;
+        }
+    }
+    return ret < 0 ? ret : 0;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Post-repair handling of one isolated page.
+ *
+ * When the repair failed (repair_ret < 0), "remove_page" is written to
+ * /sys/kernel/page_eject (its result is ignored) and the code derived from
+ * repair_ret by get_repair_result_code() is returned.  Otherwise the page is
+ * put back with "online_page" and the function returns ONLINE_PAGE_FAILED or
+ * ISOLATE_REPAIR_ONLINE_SUCCESS depending on that write.
+ */
+static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr)
+{
+    const char *mode = is_acls ? "ACLS" : "SPPR";
+
+    if (repair_ret < 0) {
+        log(LOG_WARNING, "HBM %s: Keep page (0x%llx) offline\n", mode, paddr);
+        /* not much we can do about errors here */
+        (void)write_file("/sys/kernel/page_eject", "remove_page", paddr);
+        return get_repair_result_code(repair_ret);
+    }
+
+    if (write_file("/sys/kernel/page_eject", "online_page", paddr) < 0) {
+        log(LOG_WARNING, "HBM %s: Page (0x%llx) online failed\n", mode, paddr);
+        return ONLINE_PAGE_FAILED;
+    }
+
+    log(LOG_INFO, "HBM %s: Page (0x%llx) repair and online success\n", mode, paddr);
+    return ISOLATE_REPAIR_ONLINE_SUCCESS;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Ask the repair device under @path to repair the address carried by @err,
+ * then run the post-repair step (online / remove) for the isolated page(s)
+ * and report the outcome through notice_BMC().
+ *
+ * The device is first queried ("acls_query"/"sppr_query"); a failed query is
+ * reported and returned without attempting the repair.  Only the first
+ * address is repaired, but for SPPR every one of the HBM_SPPR_ADDR_NUM
+ * isolated addresses goes through hbmc_hbm_after_repair().
+ *
+ * Returns the negative write_file() result when the query or the repair
+ * write failed, 0 when an SPPR repair and all onlines succeeded, and the
+ * non-negative repair write result otherwise.
+ *
+ * The return type is int (not uint8_t) so that negative errno values such as
+ * -ENXIO reach the caller untruncated.
+ */
+static int hbmc_hbm_repair(const struct hisi_common_error_section *err, char *path)
+{
+    unsigned long long paddr;
+    int ret;
+    uint8_t repair_result_code;
+    bool is_acls;
+    bool all_online_success = true;
+    uint32_t i;
+
+    /* Both ACLS and SPPR only repair the first address */
+    paddr = err->reg_array[HBM_ADDH];
+    paddr <<= TYPE_UINT32_WIDTH;
+    paddr += err->reg_array[HBM_ADDL];
+
+    is_acls = err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_CE_ACLS ||
+              err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_PSUE_ACLS;
+
+    ret = write_file(path, is_acls ? "acls_query" : "sppr_query", paddr);
+    if (ret < 0) {
+        notice_BMC(err, get_repair_result_code(ret));
+        log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR");
+        return ret;
+    }
+
+    ret = write_file(path, is_acls ? "acls_repair" : "sppr_repair", paddr);
+
+    if (is_acls) {
+        /* ACLS: one page to bring back (or keep out of service) */
+        repair_result_code = hbmc_hbm_after_repair(is_acls, ret, paddr);
+        notice_BMC(err, repair_result_code);
+        return ret;
+    }
+
+    /* SPPR: every isolated address gets the post-repair treatment */
+    for (i = 0; i < HBM_SPPR_ADDR_NUM; i++) {
+        paddr = err->reg_array[2 * i + HBM_ADDH];
+        paddr <<= TYPE_UINT32_WIDTH;
+        paddr += err->reg_array[2 * i + HBM_ADDL];
+
+        repair_result_code = hbmc_hbm_after_repair(is_acls, ret, paddr);
+        if (repair_result_code != ISOLATE_REPAIR_ONLINE_SUCCESS) {
+            all_online_success = false;
+        }
+    }
+
+    if (ret < 0) {
+        notice_BMC(err, get_repair_result_code(ret));
+        return ret;
+    }
+    if (all_online_success) {
+        notice_BMC(err, ISOLATE_REPAIR_ONLINE_SUCCESS);
+        return 0;
+    }
+    notice_BMC(err, ONLINE_PAGE_FAILED);
+    return ret;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Read "<path>/memory_type" and classify its first line.
+ *
+ * Returns HBM_HBM_MEMORY for "HBM", HBM_DDR_MEMORY for "DDR", HBM_UNKNOWN
+ * for anything else or when the file cannot be read, and -errno when the
+ * file cannot be opened.
+ */
+static int hbmc_get_memory_type(char *path)
+{
+    int type = HBM_UNKNOWN;
+    char fname[MAX_PATH];
+    char buf[128];
+    FILE *file;
+
+    snprintf(fname, MAX_PATH, "%s/%s", path, "memory_type");
+    file = fopen(fname, "r");
+    if (!file) {
+        /* capture errno before logging can overwrite it */
+        int saved_errno = errno;
+
+        log(LOG_WARNING, "HBM: Cannot to open '%s': %s\n",
+            fname, strerror(saved_errno));
+        return -saved_errno;
+    }
+
+    if (!fgets(buf, sizeof(buf), file)) {
+        log(LOG_WARNING, "HBM: Failed to read %s\n", fname);
+        goto err;
+    }
+
+    /*
+     * Cut the line at its newline, if there is one.  Unlike indexing with
+     * strlen(buf) - 1 this neither drops a real character when the newline
+     * is missing nor writes before the buffer when the line is empty.
+     */
+    buf[strcspn(buf, "\n")] = '\0';
+
+    if (strcmp(buf, "HBM") == 0)
+        type = HBM_HBM_MEMORY;
+    else if (strcmp(buf, "DDR") == 0)
+        type = HBM_DDR_MEMORY;
+
+err:
+    fclose(file);
+    return type;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Handle a flat-mode ACLS/SPPR repair request: isolate the affected page(s),
+ * then walk /sys/devices/platform for entries whose name contains
+ * HBM_MEM_RAS_NAME and hand the request to those reporting HBM memory.
+ *
+ * The walk stops at the first device whose repair attempt returns anything
+ * other than -ENXIO.  When no matching device, or no HBM-type device, was
+ * found, REPAIR_FAILED_OTHER_REASON is reported to the BMC.
+ */
+static void hbm_repair_handler(const struct hisi_common_error_section *err)
+{
+    log(LOG_DEBUG, "Received ACLS/SPPR flat mode repair request, try to repair\n");
+    char *sys_dev_path = "/sys/devices/platform";
+    bool find_device = false;
+    bool find_hbm_mem = false;
+    char path[MAX_PATH];
+    struct dirent *entry;
+    DIR *platform_dir;
+
+    /* nothing to repair when the page(s) could not be isolated */
+    if (hbmc_hbm_page_isolate(err) < 0)
+        return;
+
+    platform_dir = opendir(sys_dev_path);
+    if (platform_dir == NULL) {
+        log(LOG_WARNING, "Can't read '%s': %s\n",
+            sys_dev_path, strerror(errno));
+        notice_BMC(err, REPAIR_FAILED_OTHER_REASON);
+        return;
+    }
+
+    for (entry = readdir(platform_dir); entry != NULL; entry = readdir(platform_dir)) {
+        if (strstr(entry->d_name, HBM_MEM_RAS_NAME) == NULL)
+            continue;
+        find_device = true;
+
+        snprintf(path, MAX_PATH, "%s/%s", sys_dev_path, entry->d_name);
+
+        if (hbmc_get_memory_type(path) != HBM_HBM_MEMORY)
+            continue;
+        find_hbm_mem = true;
+
+        /* -ENXIO: keep looking at the remaining devices */
+        if (hbmc_hbm_repair(err, path) != -ENXIO)
+            break;
+    }
+
+    if (!find_device) {
+        log(LOG_ERROR, "Repair driver is not loaded, skip error, error_type is %u\n",
+            err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK);
+        notice_BMC(err, REPAIR_FAILED_OTHER_REASON);
+    } else if (!find_hbm_mem) {
+        log(LOG_ERROR, "No HBM device memory type found, skip error, error_type is %u\n",
+            err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK);
+        notice_BMC(err, REPAIR_FAILED_OTHER_REASON);
+    }
+
+    closedir(platform_dir);
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Decide whether @err is an HBM repair request this tool understands.
+ *
+ * The section must flag module id, submodule id and register array size as
+ * valid, come from HBMC_MODULE_ID / HBMC_SUBMOD_HBM_REPAIR, and its request
+ * type (reg_array[HBM_REPAIR_REQ_TYPE] masked with HBM_ERROR_MASK) must be
+ * ACLS, SPPR or cache mode with the matching reg_array_size.
+ */
+static bool hbm_repair_validate(const struct hisi_common_error_section *err)
+{
+    bool has_module_id = (err->val_bits & BIT(COMMON_VALID_MODULE_ID)) != 0;
+    bool has_submodule_id = (err->val_bits & BIT(COMMON_VALID_SUBMODULE_ID)) != 0;
+    bool has_array_size = (err->val_bits & BIT(COMMON_VALID_REG_ARRAY_SIZE)) != 0;
+
+    if (!has_module_id || !has_submodule_id || !has_array_size) {
+        log(LOG_DEBUG, "Err val_bits validate failed, val_bits is %u\n", err->val_bits);
+        return false;
+    }
+
+    log(LOG_DEBUG, "err->module_id: %u\n", err->module_id);
+    log(LOG_DEBUG, "err->submodule_id: %u\n", err->submodule_id);
+    log(LOG_DEBUG, "err->val_bits: 0x%x\n", err->val_bits);
+    log(LOG_DEBUG, "err->reg_array_size: %u\n", err->reg_array_size);
+
+    if (err->module_id != HBMC_MODULE_ID ||
+        err->submodule_id != HBMC_SUBMOD_HBM_REPAIR) {
+        log(LOG_DEBUG, "err module_id or sub_module id doesn't not match\n");
+        return false;
+    }
+
+    uint32_t req_type = err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK;
+
+    /* each request kind is only accepted with its own array length */
+    bool acls_ok = (req_type & (HBM_CE_ACLS | HBM_PSUE_ACLS)) &&
+                   (err->reg_array_size == HBM_ACLS_ARRAY_SIZE);
+    bool sppr_ok = (req_type & (HBM_CE_SPPR | HBM_PSUE_SPPR)) &&
+                   (err->reg_array_size == HBM_SPPR_ARRAY_SIZE);
+    bool cache_ok = (req_type & HBM_CACHE_MODE) &&
+                    (err->reg_array_size == HBM_CACHE_ARRAY_SIZE);
+
+    if (!acls_ok && !sppr_ok && !cache_ok) {
+        log(LOG_DEBUG, "err type (%u) is unknown or address array length (%u) is invalid\n",
+            req_type, err->reg_array_size);
+        return false;
+    }
+
+    log(LOG_INFO, "Received ACLS/SPPR repair request\n");
+    return true;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * True when none of the HBM_CACHE_MODE bits is set in the request type of
+ * @err, i.e. the request is not a cache-mode one.
+ */
+static bool hbm_flat_mode_validate(const struct hisi_common_error_section *err)
+{
+    uint32_t req_type = err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK;
+
+    if (req_type & HBM_CACHE_MODE)
+        return false;
+    return true;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Entry point for a HiSilicon common error section: when the section is a
+ * valid HBM repair request its fault info is written to flash, and flat-mode
+ * requests are additionally passed to hbm_repair_handler().
+ *
+ * Always returns 0.
+ */
+int decode_hisi_common_section(struct ras_non_standard_event *event)
+{
+    const struct hisi_common_error_section *section = (struct hisi_common_error_section *)event->error;
+
+    if (!hbm_repair_validate(section))
+        return 0;
+
+    write_fault_info_to_flash(section);
+
+    if (hbm_flat_mode_validate(section))
+        hbm_repair_handler(section);
+
+    return 0;
+}
|
||
|
|
diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.h b/src/c/hbm_online_repair/non-standard-hbm-repair.h
|
||
|
|
new file mode 100644
|
||
|
|
index 0000000..7e8e448
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/src/c/hbm_online_repair/non-standard-hbm-repair.h
|
||
|
|
@@ -0,0 +1,89 @@
|
||
|
|
+#ifndef __NON_STANDARD_HBM_REPAIR
+#define __NON_STANDARD_HBM_REPAIR
+
+#include "ras-non-standard-handler.h"
+
+#define DEFAULT_PAGE_SIZE_KB 4
+#define HBM_MEM_RAS_NAME "HISI0521"
+/* memory types returned by hbmc_get_memory_type() */
+#define HBM_UNKNOWN 0
+#define HBM_HBM_MEMORY 1
+#define HBM_DDR_MEMORY 2
+
+#define TYPE_UINT32_WIDTH 32
+/* reg_array index of the repair request type, and the request type bits */
+#define HBM_REPAIR_REQ_TYPE 0
+#define HBM_CE_ACLS BIT(0)
+#define HBM_PSUE_ACLS BIT(1)
+#define HBM_CE_SPPR BIT(2)
+#define HBM_PSUE_SPPR BIT(3)
+#define HBM_CACHE_MODE (BIT(4) | BIT(5) | BIT(6) | BIT(7))
+/* hex instead of 0b... literals, which are an extension before C23 */
+#define HBM_ERROR_MASK 0xFF
+/* reg_array indexes of the low / high half of the first address */
+#define HBM_ADDL 1
+#define HBM_ADDH 2
+#define HBM_ERROR_TYPE_SIZE 4
+#define HBM_ADDR_SIZE 8
+#define HBM_ACLS_ADDR_NUM 1
+#define HBM_SPPR_ADDR_NUM 16
+/* reg_array_size expected for each request kind */
+#define HBM_ACLS_ARRAY_SIZE (HBM_ERROR_TYPE_SIZE + HBM_ADDR_SIZE * HBM_ACLS_ADDR_NUM + HBM_ADDR_SIZE)
+#define HBM_SPPR_ARRAY_SIZE (HBM_ERROR_TYPE_SIZE + HBM_ADDR_SIZE * HBM_SPPR_ADDR_NUM + HBM_ADDR_SIZE)
+#define HBM_CACHE_ARRAY_SIZE (HBM_ERROR_TYPE_SIZE + HBM_ADDR_SIZE)
+#define HBMC_MODULE_ID 0x28
+#define HBMC_SUBMOD_HBM_REPAIR 6
+/* bit positions inside val_bits */
+#define COMMON_VALID_MODULE_ID 5
+#define COMMON_VALID_SUBMODULE_ID 6
+#define COMMON_VALID_REG_ARRAY_SIZE 12
+
+#define BMC_SOCKET_PATH "/var/run/sysSentry/bmc.sock"
+#define BMC_REPORT_FORMAT "REP00%02x%02x%02x0000000000000000%02x%02x%02x00%02x00%02x%02x%02x%08x%08x0000000000"
+
+/* result codes passed to notice_BMC(); values unchanged, written in hex */
+#define ISOLATE_FAILED_OVER_THRESHOLD 0x81 /* 1000 0001 */
+#define ISOLATE_FAILED_OTHER_REASON 0x82 /* 1000 0010 */
+#define REPAIR_FAILED_NO_RESOURCE 0x94 /* 1001 0100 */
+#define REPAIR_FAILED_INVALID_PARAM 0x98 /* 1001 1000 */
+#define REPAIR_FAILED_OTHER_REASON 0x9C /* 1001 1100 */
+#define ONLINE_PAGE_FAILED 0xA0 /* 1010 0000 */
+#define ISOLATE_REPAIR_ONLINE_SUCCESS 0x00 /* 0000 0000 */
+
+#define ROW_FAULT 1
+#define SINGLE_ADDR_FAULT 6
+
+#define FAULT_ADDR_PROCESSOR_ID_LEN 2
+#define FAULT_ADDR_DIE_ID_LEN 1
+#define FAULT_ADDR_STACK_ID_LEN 3
+#define FAULT_ADDR_SID_LEN 3
+#define FAULT_ADDR_CHANNEL_ID_LEN 8
+#define FAULT_ADDR_BANKGROUP_ID_LEN 3
+#define FAULT_ADDR_BANK_ID_LEN 3
+#define FAULT_ADDR_ROW_ID_LEN 17
+#define FAULT_ADDR_COLUMN_ID_LEN 10
+#define FAULT_ADDR_ERROR_TYPE_LEN 2
+#define FAULT_ADDR_REPAIR_TYPE_LEN 2
+#define FAULT_ADDR_RESERVED_LEN 2
+#define FAULT_ADDR_CRC8_LEN 8
+
+/* low-bit masks of the widths above */
+#define FAULT_ADDR_PROCESSOR_ID_MASK ((1 << FAULT_ADDR_PROCESSOR_ID_LEN ) - 1)
+#define FAULT_ADDR_DIE_ID_MASK ((1 << FAULT_ADDR_DIE_ID_LEN ) - 1)
+#define FAULT_ADDR_STACK_ID_MASK ((1 << FAULT_ADDR_STACK_ID_LEN ) - 1)
+#define FAULT_ADDR_SID_MASK ((1 << FAULT_ADDR_SID_LEN ) - 1)
+#define FAULT_ADDR_CHANNEL_ID_MASK ((1 << FAULT_ADDR_CHANNEL_ID_LEN ) - 1)
+#define FAULT_ADDR_BANKGROUP_ID_MASK ((1 << FAULT_ADDR_BANKGROUP_ID_LEN ) - 1)
+#define FAULT_ADDR_BANK_ID_MASK ((1 << FAULT_ADDR_BANK_ID_LEN ) - 1)
+#define FAULT_ADDR_ROW_ID_MASK ((1 << FAULT_ADDR_ROW_ID_LEN ) - 1)
+#define FAULT_ADDR_COLUMN_ID_MASK ((1 << FAULT_ADDR_COLUMN_ID_LEN ) - 1)
+#define FAULT_ADDR_ERROR_TYPE_MASK ((1 << FAULT_ADDR_ERROR_TYPE_LEN ) - 1)
+#define FAULT_ADDR_REPAIR_TYPE_MASK ((1 << FAULT_ADDR_REPAIR_TYPE_LEN ) - 1)
+#define FAULT_ADDR_RESERVED_MASK ((1 << FAULT_ADDR_RESERVED_LEN ) - 1)
+#define FAULT_ADDR_CRC8_MASK ((1 << FAULT_ADDR_CRC8_LEN ) - 1)
+
+#define EFI_VARIABLE_NON_VOLATILE 0x1
+#define EFI_VARIABLE_BOOTSERVICE_ACCESS 0x2
+#define EFI_VARIABLE_RUNTIME_ACCESS 0x4
+#define EFI_VARIABLE_APPEND_WRITE 0x40
+
+#define EFIVARFS_PATH "/sys/firmware/efi/efivars"
+#define MAX_VAR_SIZE (128 * 1024)
+#define FLASH_ENTRY_NUM 8
+#define KB_SIZE 1024
+
+/* (void) makes this a real prototype; the call signature is unchanged */
+extern int init_all_flash(void);
+
+#endif
|
||
|
|
diff --git a/src/c/hbm_online_repair/ras-events.c b/src/c/hbm_online_repair/ras-events.c
|
||
|
|
new file mode 100644
|
||
|
|
index 0000000..0b12329
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/src/c/hbm_online_repair/ras-events.c
|
||
|
|
@@ -0,0 +1,534 @@
|
||
|
|
+#include <dirent.h>
|
||
|
|
+#include <errno.h>
|
||
|
|
+#include <fcntl.h>
|
||
|
|
+#include <stdio.h>
|
||
|
|
+#include <stdlib.h>
|
||
|
|
+#include <stdint.h>
|
||
|
|
+#include <stdbool.h>
|
||
|
|
+#include <string.h>
|
||
|
|
+#include <unistd.h>
|
||
|
|
+#include <sys/stat.h>
|
||
|
|
+#include <sys/types.h>
|
||
|
|
+#include <sys/poll.h>
|
||
|
|
+#include <signal.h>
|
||
|
|
+#include <sys/signalfd.h>
|
||
|
|
+
|
||
|
|
+#include <traceevent/kbuffer.h>
|
||
|
|
+#include <traceevent/event-parse.h>
|
||
|
|
+#include "ras-non-standard-handler.h"
|
||
|
|
+#include "logger.h"
|
||
|
|
+
|
||
|
|
+/*
|
||
|
|
+ * Polling time, if read() doesn't block. Currently, trace_pipe_raw never
|
||
|
|
+ * blocks on read(). So, we need to sleep for a while, to avoid spending
|
||
|
|
+ * too much CPU cycles. A fix for it is expected for 3.10.
|
||
|
|
+ */
|
||
|
|
+#define POLLING_TIME 3
|
||
|
|
+
|
||
|
|
+/* Test for a little-endian machine */
|
||
|
|
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
||
|
|
+ #define ENDIAN KBUFFER_ENDIAN_LITTLE
|
||
|
|
+#else
|
||
|
|
+ #define ENDIAN KBUFFER_ENDIAN_BIG
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
+/*
+ * Find where debugfs is mounted by scanning /proc/mounts and copy the mount
+ * point into @debugfs_dir (at most @len bytes, always NUL-terminated).
+ *
+ * Returns 0 on success, EINVAL for an unusable destination buffer, the errno
+ * of the failed fopen(), or ENOENT when no debugfs entry exists.  Error
+ * values are positive.
+ */
+static int get_debugfs_dir(char *debugfs_dir, size_t len)
+{
+    FILE *fp;
+    char line[MAX_PATH + 1 + 256];
+
+    /* len - 1 below would wrap around for an empty buffer */
+    if (!debugfs_dir || len == 0)
+        return EINVAL;
+
+    fp = fopen("/proc/mounts","r");
+    if (!fp) {
+        /* capture errno before logging can overwrite it */
+        int saved_errno = errno;
+
+        log(LOG_INFO, "Can't open /proc/mounts");
+        return saved_errno;
+    }
+
+    while (fgets(line, sizeof(line), fp)) {
+        char *p, *type, *dir;
+
+        /* fields: <device> <mount point> <fs type> ...; skip short lines */
+        p = strtok(line, " \t");
+        if (!p)
+            continue;
+
+        dir = strtok(NULL, " \t");
+        if (!dir)
+            continue;
+
+        type = strtok(NULL, " \t");
+        if (!type)
+            continue;
+
+        if (!strcmp(type, "debugfs")) {
+            fclose(fp);
+            strncpy(debugfs_dir, dir, len - 1);
+            debugfs_dir[len - 1] = '\0';
+            return 0;
+        }
+    }
+
+    fclose(fp);
+    log(LOG_INFO, "Can't find debugfs\n");
+    return ENOENT;
+}
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+/*
+ * Open "<trace_dir>/<name>" with the given open(2) flags.
+ *
+ * Returns the file descriptor, or -1 on failure.  errno describes the
+ * failure on return (ENAMETOOLONG when the joined path does not fit into
+ * MAX_PATH bytes); it is restored after logging so callers can rely on it.
+ */
+static int open_trace(char *trace_dir, char *name, int flags)
+{
+    char fname[MAX_PATH + 1];
+    int saved_errno;
+    int len;
+    int ret;
+
+    /* bounded join: the strcpy/strcat form could overrun fname */
+    len = snprintf(fname, sizeof(fname), "%s/%s", trace_dir, name);
+    if (len < 0 || (size_t)len >= sizeof(fname)) {
+        log(LOG_WARNING, "open_trace() failed, path for %s is too long\n", name);
+        errno = ENAMETOOLONG;
+        return -1;
+    }
+
+    ret = open(fname, flags);
+    if (ret < 0) {
+        saved_errno = errno;
+        log(LOG_WARNING, "open_trace() failed, fname=%s ret=%d errno=%d\n", fname, ret, saved_errno);
+        errno = saved_errno;
+    }
+
+    return ret;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Create (or reuse) the tracing instance directory
+ * "<debugfs>/tracing/instances/<TOOL_NAME>" and copy its path into
+ * @trace_instance_dir.
+ *
+ * @trace_instance_dir must be able to hold MAX_PATH + 1 bytes; the caller in
+ * this file passes ras_events.tracing, which has that size.
+ *
+ * Returns 0 on success (an already existing directory counts as success)
+ * and -1 when debugfs cannot be located, the path is too long, or mkdir()
+ * fails.
+ */
+static int create_trace_instance(char *trace_instance_dir)
+{
+    char debugfs[MAX_PATH + 1];
+    char fname[MAX_PATH + 1];
+    int len;
+    int rc;
+
+    /* without a mount point there is nothing valid to append to */
+    if (get_debugfs_dir(debugfs, sizeof(debugfs)) != 0)
+        return -1;
+
+    len = snprintf(fname, sizeof(fname), "%s/tracing/instances/" TOOL_NAME, debugfs);
+    if (len < 0 || (size_t)len >= sizeof(fname)) {
+        log(LOG_INFO, "Unable to create " TOOL_NAME " instance: path under %s is too long\n", debugfs);
+        return -1;
+    }
+
+    rc = mkdir(fname, S_IRWXU);
+    if (rc < 0 && errno != EEXIST) {
+        log(LOG_INFO, "Unable to create " TOOL_NAME " instance at %s\n", fname);
+        return -1;
+    }
+    strcpy(trace_instance_dir, fname);
+    return 0;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Allocate a zeroed struct ras_events and set up its tracing instance
+ * directory (stored in ->tracing).
+ *
+ * Returns the new object, or NULL when the allocation or the instance
+ * creation fails.  The object comes from calloc().
+ */
+struct ras_events *init_trace_instance(void)
+{
+    struct ras_events *events = calloc(1, sizeof(struct ras_events));
+
+    if (events == NULL) {
+        log(LOG_ERROR, "Can't allocate memory for ras struct\n");
+        return NULL;
+    }
+
+    if (create_trace_instance(events->tracing) < 0) {
+        free(events);
+        return NULL;
+    }
+
+    return events;
+}
|
||
|
|
+
|
||
|
|
+/*
|
||
|
|
+ * Tracing enable/disable code
|
||
|
|
+ */
|
||
|
|
+/*
+ * Enable or disable one trace event by writing "<group>:<event>" (prefixed
+ * with '!' to disable) to "<trace_dir>/set_event".
+ *
+ * Returns 0 on success, a negative errno value when set_event cannot be
+ * opened, and -EIO when the write fails.
+ */
+int toggle_ras_event(char *trace_dir, char *group, char *event, int enable)
+{
+    int fd, rc;
+    ssize_t written;
+    char fname[MAX_PATH + 1];
+
+    snprintf(fname, sizeof(fname), "%s%s:%s\n",
+             enable ? "" : "!",
+             group, event);
+
+    /* Enable RAS events */
+    fd = open_trace(trace_dir, "set_event", O_RDWR | O_APPEND);
+    if (fd < 0) {
+        /*
+         * Capture errno before the log call below can overwrite it, and
+         * never let a failure turn into a 0 return.
+         */
+        rc = errno ? -errno : -EIO;
+        log(LOG_WARNING, "Can't open set_event\n");
+        goto err;
+    }
+
+    written = write(fd, fname, strlen(fname));
+    close(fd);
+    if (written <= 0) {
+        log(LOG_WARNING, "Can't write to set_event\n");
+        rc = -EIO;
+        goto err;
+    }
+
+    log(LOG_INFO, "%s:%s event %s\n",
+        group, event,
+        enable ? "enabled" : "disabled");
+    return 0;
+err:
+    log(LOG_ERROR, "Can't %s %s:%s tracing\n",
+        enable ? "enable" : "disable", group, event);
+    return rc;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Read "events/header_page" from the tracing instance of @ras (at most
+ * DEFAULT_PAGE_SIZE bytes) and feed it to tep_parse_header_page().
+ *
+ * Returns 0 on success and -1 when the file cannot be opened, read or
+ * parsed.
+ */
+static int parse_header_page(struct ras_events *ras, struct tep_handle *pevent)
+{
+    char buf[DEFAULT_PAGE_SIZE];
+    int header_fd;
+    int nread;
+
+    header_fd = open_trace(ras->tracing, "events/header_page", O_RDONLY);
+    if (header_fd < 0) {
+        log(LOG_WARNING, "Open event header page failed\n");
+        return -1;
+    }
+
+    nread = read(header_fd, buf, sizeof(buf));
+    close(header_fd);
+    if (nread <= 0) {
+        log(LOG_WARNING, "Read event header page failed\n");
+        return -1;
+    }
+
+    if (tep_parse_header_page(pevent, buf, nread, sizeof(long)) != 0) {
+        log(LOG_WARNING, "Parse event header page failed\n");
+        return -1;
+    }
+
+    return 0;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Wrap the event at the current position of @kbuf in a tep_record, format it
+ * with tep_print_event() using the tep handle of pdata->ras, then print the
+ * sequence and flush stdout.
+ */
+static void parse_ras_data(struct pcpu_data *pdata, struct kbuffer *kbuf,
+                           void *data, unsigned long long time_stamp)
+{
+    struct tep_record rec;
+    struct trace_seq seq;
+
+    rec.ts = time_stamp;
+    rec.size = kbuffer_event_size(kbuf);
+    rec.data = data;
+    /* note offset is just offset in subbuffer */
+    rec.offset = kbuffer_curr_offset(kbuf);
+    rec.cpu = pdata->cpu;
+    rec.missed_events = kbuffer_missed_events(kbuf);
+    rec.record_size = kbuffer_curr_size(kbuf);
+
+    trace_seq_init(&seq);
+    tep_print_event(pdata->ras->pevent, &seq, &rec, "%s-%s-%d-%s",
+                    TEP_PRINT_NAME, TEP_PRINT_COMM, TEP_PRINT_TIME, TEP_PRINT_INFO);
+    trace_seq_do_printf(&seq);
+    fflush(stdout);
+    trace_seq_destroy(&seq);
+}
|
||
|
|
+
|
||
|
|
+/* Value of sysconf(_SC_NPROCESSORS_ONLN), converted to int. */
+static int get_num_cpus(void)
+{
+    long online = sysconf(_SC_NPROCESSORS_ONLN);
+
+    return online;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Write @percent as decimal text to the "buffer_percent" file of the tracing
+ * instance of @ras.
+ *
+ * Returns 0 on success and -1 when the file cannot be opened or written.
+ */
+static int set_buffer_percent(struct ras_events *ras, int percent)
+{
+    char value[16];
+    int status = 0;
+    int fd;
+
+    fd = open_trace(ras->tracing, "buffer_percent", O_WRONLY);
+    if (fd < 0) {
+        log(LOG_WARNING, "Can't open buffer_percent\n");
+        return -1;
+    }
+
+    snprintf(value, sizeof(value), "%d", percent);
+    if (write(fd, value, strlen(value)) <= 0) {
+        log(LOG_WARNING, "can't write to buffer_percent\n");
+        status = -1;
+    }
+    close(fd);
+
+    return status;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Event loop: open per_cpu/cpuN/trace_pipe_raw for each of the @n_cpus CPUs
+ * plus a signalfd for SIGINT/SIGTERM/SIGHUP/SIGQUIT, poll them all, and pass
+ * every event read from a CPU buffer to parse_ras_data().
+ *
+ * The loop ends on one of the four signals, on a read or poll error, or when
+ * a poll round finds no CPU descriptor readable.  All descriptors and
+ * buffers are released before returning.
+ *
+ * Returns -ENOMEM when the page or kbuffer cannot be allocated, and -1 in
+ * every other case (the loop has no success exit).
+ */
+static int read_ras_event_all_cpus(struct pcpu_data *pdata,
+                                   unsigned n_cpus)
+{
+    ssize_t size;
+    unsigned long long time_stamp;
+    void *data;
+    int ready, i, count_nready;
+    struct kbuffer *kbuf;
+    void *page;
+    struct pollfd fds[n_cpus + 1];
+    struct signalfd_siginfo fdsiginfo;
+    sigset_t mask;
+    bool signals_blocked = false;
+    int warnonce[n_cpus];
+    char pipe_raw[PATH_MAX];
+
+    memset(&warnonce, 0, sizeof(warnonce));
+
+    /*
+     * Build the signal set up front: the cleanup code at "error" reads it,
+     * and the first "goto error" below happens before signals are blocked.
+     */
+    sigemptyset(&mask);
+    sigaddset(&mask, SIGINT);
+    sigaddset(&mask, SIGTERM);
+    sigaddset(&mask, SIGHUP);
+    sigaddset(&mask, SIGQUIT);
+
+    page = malloc(pdata[0].ras->page_size);
+    if (!page) {
+        log(LOG_ERROR, "Can't allocate page\n");
+        return -ENOMEM;
+    }
+
+    kbuf = kbuffer_alloc(KBUFFER_LSIZE_8, ENDIAN);
+    if (!kbuf) {
+        log(LOG_ERROR, "Can't allocate kbuf\n");
+        free(page);
+        return -ENOMEM;
+    }
+
+    /* Fix for poll() on the per_cpu trace_pipe and trace_pipe_raw blocks
+     * indefinitely with the default buffer_percent in the kernel trace system,
+     * which is introduced by the following change in the kernel.
+     * https://lore.kernel.org/all/20221020231427.41be3f26@gandalf.local.home/T/#u.
+     * Set buffer_percent to 0 so that poll() will return immediately
+     * when the trace data is available in the ras per_cpu trace pipe_raw
+     */
+    if (set_buffer_percent(pdata[0].ras, 0))
+        log(LOG_WARNING, "Set buffer_percent failed\n");
+
+    /* fully initialise every slot so no field is ever read uninitialised */
+    for (i = 0; i < (n_cpus + 1); i++) {
+        fds[i].fd = -1;
+        fds[i].events = 0;
+        fds[i].revents = 0;
+    }
+
+    for (i = 0; i < n_cpus; i++) {
+        fds[i].events = POLLIN;
+
+        snprintf(pipe_raw, sizeof(pipe_raw),
+                 "per_cpu/cpu%d/trace_pipe_raw", i);
+
+        fds[i].fd = open_trace(pdata[0].ras->tracing, pipe_raw, O_RDONLY);
+        if (fds[i].fd < 0) {
+            log(LOG_ERROR, "Can't open trace_pipe_raw\n");
+            goto error;
+        }
+    }
+
+    if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1)
+        log(LOG_WARNING, "sigprocmask\n");
+    else
+        signals_blocked = true;
+    fds[n_cpus].events = POLLIN;
+    fds[n_cpus].fd = signalfd(-1, &mask, 0);
+    if (fds[n_cpus].fd < 0) {
+        log(LOG_WARNING, "signalfd\n");
+        goto error;
+    }
+
+    log(LOG_INFO, "Listening to events for cpus 0 to %u\n", n_cpus - 1);
+
+    do {
+        ready = poll(fds, (n_cpus + 1), -1);
+        if (ready < 0) {
+            int saved_errno = errno;
+
+            log(LOG_WARNING, "poll\n");
+            /* revents is not meaningful after a failed poll */
+            if (saved_errno == EINTR)
+                continue;
+            goto error;
+        }
+
+        /* check for the signal */
+        if (fds[n_cpus].revents & POLLIN) {
+            size = read(fds[n_cpus].fd, &fdsiginfo,
+                        sizeof(struct signalfd_siginfo));
+            if (size != sizeof(struct signalfd_siginfo)) {
+                log(LOG_WARNING, "signalfd read\n");
+                continue;
+            }
+
+            if (fdsiginfo.ssi_signo == SIGINT ||
+                fdsiginfo.ssi_signo == SIGTERM ||
+                fdsiginfo.ssi_signo == SIGHUP ||
+                fdsiginfo.ssi_signo == SIGQUIT) {
+                log(LOG_INFO, "Recevied signal=%d\n",
+                    fdsiginfo.ssi_signo);
+                goto error;
+            } else {
+                log(LOG_INFO,
+                    "Received unexpected signal=%d\n",
+                    fdsiginfo.ssi_signo);
+                continue;
+            }
+        }
+
+        count_nready = 0;
+        for (i = 0; i < n_cpus; i++) {
+            if (fds[i].revents & POLLERR) {
+                if (!warnonce[i]) {
+                    log(LOG_INFO,
+                        "Error on CPU %i\n", i);
+                    warnonce[i]++;
+                }
+                continue;
+            }
+            if (!(fds[i].revents & POLLIN)) {
+                count_nready++;
+                continue;
+            }
+            size = read(fds[i].fd, page, pdata[i].ras->page_size);
+            if (size < 0) {
+                log(LOG_WARNING, "read\n");
+                goto error;
+            } else if (size > 0) {
+                log(LOG_DEBUG, "cpu %d receive %ld bytes data\n", i, size);
+                kbuffer_load_subbuffer(kbuf, page);
+
+                while ((data = kbuffer_read_event(kbuf, &time_stamp))) {
+                    if (kbuffer_curr_size(kbuf) < 0) {
+                        log(LOG_ERROR, "invalid kbuf data, discard\n");
+                        break;
+                    }
+
+                    log(LOG_DEBUG, "parse_ras_data\n");
+                    parse_ras_data(&pdata[i],
+                                   kbuf, data, time_stamp);
+
+                    /* increment to read next event */
+                    log(LOG_DEBUG, "kbuffer_next_event\n");
+                    kbuffer_next_event(kbuf, NULL);
+                }
+            } else {
+                count_nready++;
+            }
+        }
+
+        /*
+         * If count_nready == n_cpus, there is no cpu fd in POLLIN state,
+         * so we need to break the cycle
+         */
+        if (count_nready == n_cpus) {
+            log(LOG_ERROR, "no cpu fd in POLLIN state, stop running\n");
+            break;
+        }
+    } while (1);
+
+error:
+    kbuffer_free(kbuf);
+    free(page);
+    /* only undo the blocking this function actually performed */
+    if (signals_blocked)
+        sigprocmask(SIG_UNBLOCK, &mask, NULL);
+
+    for (i = 0; i < (n_cpus + 1); i++) {
+        /* 0 is a valid descriptor; only -1 marks an unused slot */
+        if (fds[i].fd >= 0)
+            close(fds[i].fd);
+    }
+
+    return -1;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Thin wrapper around parse_header_page() that logs its failure.
+ * Returns 0 on success, otherwise the non-zero value it received.
+ */
+static int init_header_page(struct ras_events *ras, struct tep_handle *pevent)
+{
+    int status = parse_header_page(ras, pevent);
+
+    if (status == 0)
+        return 0;
+
+    log(LOG_ERROR, "cannot read trace header_page: %d\n", status);
+    return status;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Read "events/<group>/<event>/format" from the tracing instance of @ras
+ * (one DEFAULT_PAGE_SIZE page) and register the format with @pevent through
+ * tep_parse_event().  Also records the page size in ras->page_size.
+ *
+ * Returns 0 on success and a non-zero value on failure: an errno value when
+ * the file cannot be opened, ENOMEM when the page cannot be allocated, the
+ * negative read() result, or EINVAL when parsing fails.
+ */
+static int init_event_format(struct ras_events *ras, struct tep_handle *pevent,
+                             char *group, char *event)
+{
+    char *page, fname[MAX_PATH + 1];
+    int fd, size, rc, page_size = DEFAULT_PAGE_SIZE;
+
+    // read one page from format
+    snprintf(fname, sizeof(fname), "events/%s/%s/format", group, event);
+    fd = open_trace(ras->tracing, fname, O_RDONLY);
+    if (fd < 0) {
+        /* capture errno before logging; never report success here */
+        rc = errno ? errno : EIO;
+        log(LOG_ERROR,
+            "Can't get %s:%s traces. Perhaps this feature is not supported on your system.\n",
+            group, event);
+        return rc;
+    }
+
+    log(LOG_INFO, "page_size: %d\n", page_size);
+    ras->page_size = page_size;
+    page = malloc(page_size);
+    if (!page) {
+        log(LOG_ERROR, "Can't allocate page to read %s:%s format\n",
+            group, event);
+        close(fd);
+        return ENOMEM;
+    }
+
+    size = read(fd, page, page_size);
+    close(fd);
+    if (size < 0) {
+        log(LOG_ERROR, "Can't read format\n");
+        free(page);
+        return size;
+    }
+
+    // parse event format
+    rc = tep_parse_event(pevent, page, size, group);
+    /* the buffer is only needed while parsing; release it on every path */
+    free(page);
+    if (rc) {
+        log(LOG_ERROR, "Can't parse event %s:%s\n", group, event);
+        return EINVAL;
+    }
+    return 0;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Load the format of <group>:<event> into @pevent and register @func as its
+ * handler, with @ras as the handler context.
+ *
+ * Returns 0 on success, the non-zero result of init_event_format(), or
+ * EINVAL when the registration fails.
+ */
+static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent,
+                             char *group, char *event,
+                             tep_event_handler_func func)
+{
+    int status = init_event_format(ras, pevent, group, event);
+
+    if (status != 0) {
+        log(LOG_ERROR, "init_event_format for %s:%s failed\n", group, event);
+        return status;
+    }
+
+    /* Registers the special event handlers */
+    if (tep_register_event_handler(pevent, -1, group, event, func, ras) < 0) {
+        log(LOG_ERROR, "Can't register event handler for %s:%s\n",
+            group, event);
+        return EINVAL;
+    }
+
+    return 0;
+}
|
||
|
|
+
|
||
|
|
+/*
+ * Set up event parsing for @ras and run the per-CPU read loop.
+ *
+ * Allocates a tep handle (stored in ras->pevent while the loop runs), parses
+ * the trace header page, registers ras_non_standard_event_handler for
+ * ras:non_standard_event, builds one pcpu_data entry per online CPU and
+ * calls read_ras_event_all_cpus().  The tep handle and the per-CPU array are
+ * released before returning and ras->pevent is reset to NULL.
+ *
+ * Returns a non-zero value: the setup error, or the result of the read loop.
+ */
+int handle_ras_events(struct ras_events *ras)
+{
+    int rc, i, cpus;
+    struct tep_handle *pevent = NULL;
+    struct pcpu_data *data = NULL;
+
+    pevent = tep_alloc();
+    if (!pevent) {
+        /* capture errno before logging; never report success here */
+        rc = errno ? errno : ENOMEM;
+        log(LOG_ERROR, "Can't allocate pevent\n");
+        goto err;
+    }
+    ras->pevent = pevent;
+
+    rc = init_header_page(ras, pevent);
+    if (rc) {
+        log(LOG_ERROR, "init_header_page failed\n");
+        goto err;
+    }
+
+    rc = add_event_handler(ras, pevent, "ras", "non_standard_event",
+                           ras_non_standard_event_handler);
+    if (rc) {
+        log(LOG_ERROR, "Can't get traces from %s:%s\n",
+            "ras", "non_standard_event");
+        goto err;
+    }
+    log(LOG_INFO, "add_event_handler done\n");
+
+    /* a failed CPU count must not be turned into a huge unsigned value */
+    cpus = get_num_cpus();
+    if (cpus <= 0) {
+        log(LOG_ERROR, "Can't get the number of online cpus\n");
+        rc = EINVAL;
+        goto err;
+    }
+
+    data = calloc(cpus, sizeof(*data));
+    if (!data) {
+        /* rc still holds 0 from the successful step above */
+        log(LOG_ERROR, "Can't allocate per-cpu data\n");
+        rc = ENOMEM;
+        goto err;
+    }
+
+    for (i = 0; i < cpus; i++) {
+        data[i].ras = ras;
+        data[i].cpu = i;
+    }
+    rc = read_ras_event_all_cpus(data, cpus);
+
+err:
+    free(data);
+    if (pevent) {
+        tep_free(pevent);
+        /* do not leave a pointer to the freed handle behind */
+        ras->pevent = NULL;
+    }
+    return rc;
+}
|
||
|
|
diff --git a/src/c/hbm_online_repair/ras-events.h b/src/c/hbm_online_repair/ras-events.h
|
||
|
|
new file mode 100644
|
||
|
|
index 0000000..4218d93
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/src/c/hbm_online_repair/ras-events.h
|
||
|
|
@@ -0,0 +1,28 @@
|
||
|
|
+#ifndef __RAS_EVENTS_H
|
||
|
|
+#define __RAS_EVENTS_H
|
||
|
|
+
|
||
|
|
+#include <stdint.h>
|
||
|
|
+#include <time.h>
|
||
|
|
+
|
||
|
|
+#define MAX_PATH 1024
|
||
|
|
+
|
||
|
|
+#define DEFAULT_PAGE_SIZE 4096
|
||
|
|
+
|
||
|
|
+struct ras_events { /* state of one tracing instance, shared by the event loop */
|
||
|
|
+ char tracing[MAX_PATH + 1]; /* path of the tracing instance directory, filled by create_trace_instance() */
|
||
|
|
+ struct tep_handle *pevent; /* tep handle assigned in handle_ras_events() */
|
||
|
|
+ int page_size; /* set to DEFAULT_PAGE_SIZE in init_event_format(); sizes the raw pipe reads */
|
||
|
|
+};
|
||
|
|
+
|
||
|
|
+struct pcpu_data { /* per-CPU entry of the array built in handle_ras_events() */
|
||
|
|
+ struct tep_handle *pevent; /* NOTE(review): not assigned anywhere in ras-events.c; readers use ras->pevent */
|
||
|
|
+ struct ras_events *ras; /* owning ras_events instance */
|
||
|
|
+ int cpu; /* CPU index, copied into tep_record.cpu by parse_ras_data() */
|
||
|
|
+};
|
||
|
|
+
|
||
|
|
+/* Function prototypes */
|
||
|
|
+int toggle_ras_event(char *trace_dir, char *group, char *event, int enable);
|
||
|
|
+int handle_ras_events(struct ras_events *ras);
|
||
|
|
+struct ras_events *init_trace_instance(void);
|
||
|
|
+
|
||
|
|
+#endif
|
||
|
|
diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.c b/src/c/hbm_online_repair/ras-non-standard-handler.c
|
||
|
|
new file mode 100644
|
||
|
|
index 0000000..1d1fd04
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/src/c/hbm_online_repair/ras-non-standard-handler.c
|
||
|
|
@@ -0,0 +1,81 @@
|
||
|
|
+#include <stdio.h>
|
||
|
|
+#include <stdlib.h>
|
||
|
|
+#include <stdbool.h>
|
||
|
|
+#include <string.h>
|
||
|
|
+#include <unistd.h>
|
||
|
|
+#include <traceevent/kbuffer.h>
|
||
|
|
+#include "ras-non-standard-handler.h"
|
||
|
|
+#include "logger.h"
|
||
|
|
+
|
||
|
|
+static char *uuid_le(const char *uu)
|
||
|
|
+{
|
||
|
|
+ static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")];
|
||
|
|
+ if (!uu) {
|
||
|
|
+ log(LOG_ERROR, "uuid_le failed: uu is empty");
|
||
|
|
+ return uuid;
|
||
|
|
+ }
|
||
|
|
+ size_t uu_len = strlen(uu);
|
||
|
|
+ if (uu_len < SECTION_TYPE_UUID_LEN) {
|
||
|
|
+ log(LOG_ERROR, "uuid_le failed: uu is too short");
|
||
|
|
+ return uuid;
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
+ char *p = uuid;
|
||
|
|
+ int i;
|
||
|
|
+ static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15};
|
||
|
|
+
|
||
|
|
+ for (i = 0; i < 16; i++) {
|
||
|
|
+ p += sprintf(p, "%.2x", (unsigned char) uu[le[i]]);
|
||
|
|
+ switch (i) {
|
||
|
|
+ case 3:
|
||
|
|
+ case 5:
|
||
|
|
+ case 7:
|
||
|
|
+ case 9:
|
||
|
|
+ *p++ = '-';
|
||
|
|
+ break;
|
||
|
|
+ }
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
+ *p = 0;
|
||
|
|
+
|
||
|
|
+ return uuid;
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
+int ras_non_standard_event_handler(struct trace_seq *s,
|
||
|
|
+ struct tep_record *record,
|
||
|
|
+ struct tep_event *event, void *context)
|
||
|
|
+{
|
||
|
|
+ int len;
|
||
|
|
+ unsigned long long val;
|
||
|
|
+ struct ras_non_standard_event ev;
|
||
|
|
+
|
||
|
|
+ ev.sec_type = tep_get_field_raw(s, event, "sec_type",
|
||
|
|
+ record, &len, 1);
|
||
|
|
+ if(!ev.sec_type) {
|
||
|
|
+ log(LOG_WARNING, "get event section type failed");
|
||
|
|
+ return -1;
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
+ trace_seq_printf(s, "\n");
|
||
|
|
+ trace_seq_printf(s, "sec_type: %s\n", uuid_le(ev.sec_type));
|
||
|
|
+
|
||
|
|
+ if (tep_get_field_val(s, event, "len", record, &val, 1) < 0) {
|
||
|
|
+ log(LOG_WARNING, "tep get field val failed");
|
||
|
|
+ return -1;
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
+ ev.length = val;
|
||
|
|
+ trace_seq_printf(s, "length: %d\n", ev.length);
|
||
|
|
+
|
||
|
|
+ ev.error = tep_get_field_raw(s, event, "buf", record, &len, 1);
|
||
|
|
+ if(!ev.error || ev.length != len) {
|
||
|
|
+ log(LOG_WARNING, "get event error failed");
|
||
|
|
+ return -1;
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
+ if (strcmp(uuid_le(ev.sec_type), HISI_COMMON_SECTION_TYPE_UUID) == 0) {
|
||
|
|
+ decode_hisi_common_section(&ev);
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
+ return 0;
|
||
|
|
+}
|
||
|
|
diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.h b/src/c/hbm_online_repair/ras-non-standard-handler.h
|
||
|
|
new file mode 100644
|
||
|
|
index 0000000..0272dc1
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/src/c/hbm_online_repair/ras-non-standard-handler.h
|
||
|
|
@@ -0,0 +1,25 @@
|
||
|
|
+#ifndef __RAS_NON_STANDARD_HANDLER_H
|
||
|
|
+#define __RAS_NON_STANDARD_HANDLER_H
|
||
|
|
+
|
||
|
|
+#include <traceevent/event-parse.h>
|
||
|
|
+#include "ras-events.h"
|
||
|
|
+
|
||
|
|
+#define BIT(nr) (1UL << (nr))
|
||
|
|
+
|
||
|
|
+#define SECTION_TYPE_UUID_LEN 16
|
||
|
|
+#define HISI_COMMON_SECTION_TYPE_UUID "c8b328a8-9917-4af6-9a13-2e08ab2e7586"
|
||
|
|
+
|
||
|
|
+struct ras_non_standard_event { /* One decoded non-standard RAS trace event, as passed to decode_hisi_common_section(). */
|
||
|
|
+ char timestamp[64]; /* NOTE(review): not written by ras_non_standard_event_handler() - confirm who fills it */
|
||
|
|
+ const char *sec_type; /* raw "sec_type" trace field; points into the trace record, not owned */
|
||
|
|
+ const uint8_t *error; /* raw "buf" trace field; points into the trace record, not owned */
|
||
|
|
+ uint32_t length; /* value of the "len" trace field; the handler requires it to equal the size of "buf" */
|
||
|
|
+};
|
||
|
|
+
|
||
|
|
+int ras_non_standard_event_handler(struct trace_seq *s,
|
||
|
|
+ struct tep_record *record,
|
||
|
|
+ struct tep_event *event, void *context);
|
||
|
|
+
|
||
|
|
+int decode_hisi_common_section(struct ras_non_standard_event *event);
|
||
|
|
+
|
||
|
|
+#endif
|
||
|
|
diff --git a/src/python/.gitignore b/src/python/.gitignore
|
||
|
|
new file mode 100644
|
||
|
|
index 0000000..58200d4
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/src/python/.gitignore
|
||
|
|
@@ -0,0 +1 @@
|
||
|
|
+__pycache__/
|
||
|
|
diff --git a/src/python/syssentry/bmc_alarm.py b/src/python/syssentry/bmc_alarm.py
|
||
|
|
new file mode 100644
|
||
|
|
index 0000000..5956538
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/src/python/syssentry/bmc_alarm.py
|
||
|
|
@@ -0,0 +1,159 @@
|
||
|
|
+import logging
|
||
|
|
+import socket
|
||
|
|
+from enum import Enum
|
||
|
|
+
|
||
|
|
+from .utils import execute_command
|
||
|
|
+
|
||
|
|
+HEX_CHAR_LEN = 2
|
||
|
|
+SOCKET_RECEIVE_LEN = 128
|
||
|
|
+BMC_DATA_HEAD = "REP"
|
||
|
|
+BMC_REPORT_TYPE_BIT = 0
|
||
|
|
+HBMC_REPAIR_TYPE_BIT = 1
|
||
|
|
+HBMC_REPAIR_RESULT_BIT = 2
|
||
|
|
+HBMC_ISOLATION_TYPE_BIT = 3
|
||
|
|
+HBMC_SEND_HEAD_LEN = 4 # "ipmitool", "raw", "0x30", "0x92"
|
||
|
|
+HBMC_SEND_ROW_BIT = 26 + HBMC_SEND_HEAD_LEN
|
||
|
|
+HBMC_SEND_COL_BIT = 30 + HBMC_SEND_HEAD_LEN
|
||
|
|
+HBMC_REPAIR_TYPE_OFFSET = 7
|
||
|
|
+
|
||
|
|
+HBMC_SEND_SUCCESS_CODE = "db 07 00"
|
||
|
|
+
|
||
|
|
+
|
||
|
|
class ReportType(Enum):
    """Report-type codes carried in the first byte of a BMC message.

    The values are the keys of PARSE_REPORT_MSG_FUNC_DICT.
    """

    HBMC_REPAIR_BMC = 0
|
||
|
|
+
|
||
|
|
+
|
||
|
|
class HBMCRepairType(Enum):
    """Accepted repair-type codes.

    parse_hbmc_report() compares against these after adding
    HBMC_REPAIR_TYPE_OFFSET to the byte read from the message.
    """

    CE_ACLS = 0x07
    PS_UCE_ACLS = 0x08
    CE_SPPR = 0x09
    PS_UCE_SPPR = 0x0A
|
||
|
|
+
|
||
|
|
+
|
||
|
|
class HBMCRepairResultType(Enum):
    """Accepted repair-result codes (one byte each).

    Each value is written in hex with its original bit pattern alongside.
    """

    ISOLATE_FAILED_OVER_THRESHOLD = 0x81  # 0b10000001
    ISOLATE_FAILED_OTHER_REASON = 0x82    # 0b10000010
    REPAIR_FAILED_NO_RESOURCE = 0x94      # 0b10010100
    REPAIR_FAILED_INVALID_PARAM = 0x98    # 0b10011000
    REPAIR_FAILED_OTHER_REASON = 0x9C     # 0b10011100
    ONLINE_PAGE_FAILED = 0xA0             # 0b10100000
    ISOLATE_REPAIR_ONLINE_SUCCESS = 0x00  # 0b00000000
|
||
|
|
+
|
||
|
|
+
|
||
|
|
class HBMCIsolationType(Enum):
    """Accepted isolation-type codes checked by parse_hbmc_report()."""

    ROW_FAULT = 0x01
    SINGLE_ADDR_FAULT = 0x06
|
||
|
|
+
|
||
|
|
+
|
||
|
|
def find_value_is_in_enum(value: int, enum: Enum):
    """Return True when ``value`` equals the ``.value`` of any member of ``enum``.

    ``enum`` is iterated, so it is expected to be an Enum class.
    """
    return any(value == member.value for member in enum)
|
||
|
|
+
|
||
|
|
+
|
||
|
|
def convert_hex_char_to_int(data, bit):
    """Decode the ``bit``-th byte of a hex string.

    ``data`` is treated as consecutive groups of HEX_CHAR_LEN hex characters;
    group number ``bit`` (zero-based) is parsed as a base-16 integer.

    Raises:
        ValueError: if ``data`` does not contain the whole group, or the
            group is not valid hexadecimal.
    """
    start = bit * HEX_CHAR_LEN
    end = start + HEX_CHAR_LEN
    if len(data) < end:
        logging.error(f"Data {data} len is too short, current convert bit is {bit}")
        # Stop here: falling through would parse a truncated group (for
        # example a single trailing hex digit) as if it were a full byte.
        raise ValueError("data is too short")

    char = data[start:end]
    try:
        return int(char, 16)
    except ValueError as err:
        logging.error(f"Cannot convert char [{char}] to int")
        raise ValueError("invalid hex byte") from err
|
||
|
|
+
|
||
|
|
+
|
||
|
|
def reverse_byte(data):
    """Return the first four items of ``data`` in reverse order, as a tuple.

    Raises IndexError when ``data`` has fewer than four items.
    """
    return tuple(data[index] for index in (3, 2, 1, 0))
|
||
|
|
+
|
||
|
|
+
|
||
|
|
def parse_hbmc_report(data: str):
    """Validate an HBM repair report and forward it to the BMC via ipmitool.

    ``data`` is the hex-encoded message with the "REP" head already removed.
    The repair type, repair result and isolation type bytes are checked
    against their enums; every remaining byte is forwarded unchanged, except
    that the two 4-byte fields at HBMC_SEND_ROW_BIT and HBMC_SEND_COL_BIT of
    the command list have their byte order reversed.

    Raises:
        ValueError: if a field is unknown, the message is malformed or too
            short, or the ipmitool output lacks HBMC_SEND_SUCCESS_CODE.
    """
    logging.debug(f"bmc receive raw data is {data}")
    repair_type = convert_hex_char_to_int(data, HBMC_REPAIR_TYPE_BIT)
    repair_type += HBMC_REPAIR_TYPE_OFFSET
    if not find_value_is_in_enum(repair_type, HBMCRepairType):
        logging.warning(f"HBMC msg repair type ({repair_type}) is unknown")
        raise ValueError

    repair_result = convert_hex_char_to_int(data, HBMC_REPAIR_RESULT_BIT)
    if not find_value_is_in_enum(repair_result, HBMCRepairResultType):
        logging.warning(f"HBMC msg repair result ({repair_result}) is unknown")
        raise ValueError

    isolation_type = convert_hex_char_to_int(data, HBMC_ISOLATION_TYPE_BIT)
    if not find_value_is_in_enum(isolation_type, HBMCIsolationType):
        logging.warning(f"HBMC msg isolation type ({isolation_type}) is unknown")
        raise ValueError

    cmd_list = [
        "ipmitool",
        "raw",
        "0x30",  # Netfn
        "0x92",  # cmd
        "0xdb",
        "0x07",
        "0x00",
        "0x65",  # sub command
        "0x01",  # SystemId
        "0x00",  # LocalSystemId
        "{:#04X}".format(repair_type),
        "{:#04X}".format(repair_result),
        "{:#04X}".format(isolation_type),
    ]
    # send the remain data directly
    data = data[(HBMC_ISOLATION_TYPE_BIT + 1) * HEX_CHAR_LEN:]
    cmd_list.extend(
        "{:#04X}".format(convert_hex_char_to_int(data, i))
        for i in range(len(data) // HEX_CHAR_LEN)
    )

    # Both 4-byte fields must be fully present. Otherwise reverse_byte()
    # raises IndexError, which bmc_recv() does not treat as a bad message.
    required_len = HBMC_SEND_COL_BIT + 4
    if len(cmd_list) < required_len:
        logging.warning(
            f"HBMC msg is too short, got {len(cmd_list)} fields, need {required_len}"
        )
        raise ValueError

    cmd_list[HBMC_SEND_ROW_BIT:HBMC_SEND_ROW_BIT + 4] = reverse_byte(cmd_list[HBMC_SEND_ROW_BIT:HBMC_SEND_ROW_BIT + 4])
    cmd_list[HBMC_SEND_COL_BIT:HBMC_SEND_COL_BIT + 4] = reverse_byte(cmd_list[HBMC_SEND_COL_BIT:HBMC_SEND_COL_BIT + 4])

    logging.info(f"Send bmc alarm command is {cmd_list}")

    ret = execute_command(cmd_list)
    if HBMC_SEND_SUCCESS_CODE not in ret:
        logging.warning(f"Send bmc alarm failed, error code is {ret}")
        raise ValueError
    logging.debug("Send bmc alarm success")
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+PARSE_REPORT_MSG_FUNC_DICT = {
|
||
|
|
+ ReportType.HBMC_REPAIR_BMC.value: parse_hbmc_report,
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
+
|
||
|
|
def bmc_recv(server_socket: socket.socket):
    """Accept one connection on the BMC socket and process its message.

    Reads up to SOCKET_RECEIVE_LEN bytes, checks the BMC_DATA_HEAD prefix,
    decodes the report type and dispatches the remaining data through
    PARSE_REPORT_MSG_FUNC_DICT. Errors are logged and swallowed so that a bad
    message cannot propagate into the caller; the accepted connection is
    closed on every path.
    """
    logging.debug("Get hbm socket connection request")
    client_socket = None
    try:
        client_socket, _ = server_socket.accept()
        logging.debug("cpu alarm fd listen ok")

        data = client_socket.recv(SOCKET_RECEIVE_LEN)
        data = data.decode()

        data_head = data[0:len(BMC_DATA_HEAD)]
        if data_head != BMC_DATA_HEAD:
            logging.warning(f"The head of the msg is incorrect, head is {data_head}")
            raise ValueError

        # remove the data head
        data = data[len(BMC_DATA_HEAD):]
        logging.info(f"Remove head data is {data}")

        report_type = convert_hex_char_to_int(data, BMC_REPORT_TYPE_BIT)
        if report_type not in PARSE_REPORT_MSG_FUNC_DICT:
            logging.warning(f"The type of the msg ({report_type}) is unknown")
            raise ValueError

        PARSE_REPORT_MSG_FUNC_DICT[report_type](data)

    except socket.error:
        # socket.error is an alias of OSError, so this covers every OSError.
        logging.error("socket error")
    except (ValueError, UnicodeError, TypeError, NotImplementedError, IndexError):
        # IndexError: a truncated message can make the parser index past
        # the end of the data; treat it like any other malformed message.
        logging.error("server recv bmc msg failed!")
    finally:
        # Close the accepted connection on success and on every failure.
        if client_socket is not None:
            client_socket.close()
|
||
|
|
diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py
|
||
|
|
index ea09095..3829849 100644
|
||
|
|
--- a/src/python/syssentry/syssentry.py
|
||
|
|
+++ b/src/python/syssentry/syssentry.py
|
||
|
|
@@ -48,6 +48,12 @@ try:
|
||
|
|
except ImportError:
|
||
|
|
CPU_EXIST = False
|
||
|
|
|
||
|
|
+BMC_EXIST = True
|
||
|
|
+try:
|
||
|
|
+ from .bmc_alarm import bmc_recv
|
||
|
|
+except ImportError:
|
||
|
|
+ BMC_EXIST = False
|
||
|
|
+
|
||
|
|
|
||
|
|
INSPECTOR = None
|
||
|
|
|
||
|
|
@@ -89,6 +95,9 @@ RESULT_SOCKET_PATH = "/var/run/sysSentry/result.sock"
|
||
|
|
|
||
|
|
CPU_ALARM_SOCKET_PATH = "/var/run/sysSentry/report.sock"
|
||
|
|
|
||
|
|
+BMC_SOCKET_PATH = "/var/run/sysSentry/bmc.sock"
|
||
|
|
+
|
||
|
|
+fd_list = []
|
||
|
|
|
||
|
|
def msg_data_process(msg_data):
|
||
|
|
"""message data process"""
|
||
|
|
@@ -334,6 +343,41 @@ def cpu_alarm_fd_create():
|
||
|
|
|
||
|
|
return cpu_alarm_fd
|
||
|
|
|
||
|
|
def bmc_fd_create():
    """Create the non-blocking UNIX stream socket listening on BMC_SOCKET_PATH.

    Removes any existing file at BMC_SOCKET_PATH, binds, restricts the socket
    file to mode 0o600 and starts listening.

    Returns:
        The listening socket, or None when SENTRY_RUN_DIR does not exist or
        any step fails. On failure the socket is closed before returning.
    """
    if not os.path.exists(SENTRY_RUN_DIR):
        logging.debug("%s not exist", SENTRY_RUN_DIR)
        return None

    try:
        bmc_fd = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    except socket.error:
        logging.error("bmc fd create failed")
        return None

    # Track the current step so one handler can report which call failed and
    # still release the socket; os.remove() and os.chmod() can raise OSError
    # just like bind() and listen().
    step = "setup"
    try:
        bmc_fd.setblocking(False)
        if os.path.exists(BMC_SOCKET_PATH):
            os.remove(BMC_SOCKET_PATH)

        step = "bind"
        bmc_fd.bind(BMC_SOCKET_PATH)

        step = "chmod"
        os.chmod(BMC_SOCKET_PATH, 0o600)

        step = "listen"
        bmc_fd.listen(5)
    except OSError:
        logging.error("bmc fd %s failed", step)
        bmc_fd.close()
        return None

    logging.debug("%s bind and listen", BMC_SOCKET_PATH)

    return bmc_fd
|
||
|
|
+
|
||
|
|
|
||
|
|
def server_result_recv(server_socket: socket.socket):
|
||
|
|
"""server result receive"""
|
||
|
|
@@ -407,35 +451,47 @@ def server_result_fd_create():
|
||
|
|
return server_result_fd
|
||
|
|
|
||
|
|
|
||
|
|
def close_all_fd():
    """Close every socket recorded in fd_list, then empty the list.

    A failure to close one socket is logged and does not prevent the
    remaining sockets from being closed. The list is cleared so closed
    sockets are not kept around or closed again by a later call.
    """
    for fd in fd_list:
        try:
            fd.close()
        except OSError:
            logging.error("close fd failed")
    fd_list.clear()
|
||
|
|
+
|
||
|
|
+
|
||
|
|
def main_loop():
|
||
|
|
"""main loop"""
|
||
|
|
+
|
||
|
|
server_fd = server_fd_create()
|
||
|
|
if not server_fd:
|
||
|
|
+ close_all_fd()
|
||
|
|
return
|
||
|
|
+ fd_list.append(server_fd)
|
||
|
|
|
||
|
|
server_result_fd = server_result_fd_create()
|
||
|
|
if not server_result_fd:
|
||
|
|
- server_fd.close()
|
||
|
|
+ close_all_fd()
|
||
|
|
return
|
||
|
|
+ fd_list.append(server_result_fd)
|
||
|
|
|
||
|
|
heartbeat_fd = heartbeat_fd_create()
|
||
|
|
if not heartbeat_fd:
|
||
|
|
- server_fd.close()
|
||
|
|
- server_result_fd.close()
|
||
|
|
+ close_all_fd()
|
||
|
|
return
|
||
|
|
+ fd_list.append(heartbeat_fd)
|
||
|
|
|
||
|
|
cpu_alarm_fd = cpu_alarm_fd_create()
|
||
|
|
if not cpu_alarm_fd:
|
||
|
|
- server_fd.close()
|
||
|
|
- heartbeat_fd.close()
|
||
|
|
- server_result_fd.close()
|
||
|
|
+ close_all_fd()
|
||
|
|
+ return
|
||
|
|
+ fd_list.append(cpu_alarm_fd)
|
||
|
|
+
|
||
|
|
+ bmc_fd = bmc_fd_create()
|
||
|
|
+ if not bmc_fd:
|
||
|
|
+ close_all_fd()
|
||
|
|
return
|
||
|
|
+ fd_list.append(bmc_fd)
|
||
|
|
|
||
|
|
epoll_fd = select.epoll()
|
||
|
|
- epoll_fd.register(server_fd.fileno(), select.EPOLLIN)
|
||
|
|
- epoll_fd.register(server_result_fd.fileno(), select.EPOLLIN)
|
||
|
|
- epoll_fd.register(heartbeat_fd.fileno(), select.EPOLLIN)
|
||
|
|
- epoll_fd.register(cpu_alarm_fd.fileno(), select.EPOLLIN)
|
||
|
|
+ for fd in fd_list:
|
||
|
|
+ epoll_fd.register(fd.fileno(), select.EPOLLIN)
|
||
|
|
|
||
|
|
logging.debug("start main loop")
|
||
|
|
# onstart_tasks_handle()
|
||
|
|
@@ -458,6 +514,8 @@ def main_loop():
|
||
|
|
heartbeat_recv(heartbeat_fd)
|
||
|
|
elif CPU_EXIST and event_fd == cpu_alarm_fd.fileno():
|
||
|
|
cpu_alarm_recv(cpu_alarm_fd)
|
||
|
|
+ elif BMC_EXIST and event_fd == bmc_fd.fileno():
|
||
|
|
+ bmc_recv(bmc_fd)
|
||
|
|
else:
|
||
|
|
continue
|
||
|
|
|
||
|
|
--
|
||
|
|
2.27.0
|
||
|
|
|