From abdeacfa6ae54b503714cb98f3262a39d883972e Mon Sep 17 00:00:00 2001 From: luckky Date: Fri, 11 Oct 2024 09:49:40 +0000 Subject: [PATCH] add hbm online repair --- config/tasks/hbm_online_repair.mod | 9 + src/c/hbm_online_repair/.gitignore | 6 + src/c/hbm_online_repair/Makefile | 25 + src/c/hbm_online_repair/hbm_online_repair.c | 144 ++++ src/c/hbm_online_repair/hbm_online_repair.env | 2 + src/c/hbm_online_repair/logger.h | 31 + .../non-standard-hbm-repair.c | 799 ++++++++++++++++++ .../non-standard-hbm-repair.h | 89 ++ src/c/hbm_online_repair/ras-events.c | 534 ++++++++++++ src/c/hbm_online_repair/ras-events.h | 28 + .../ras-non-standard-handler.c | 81 ++ .../ras-non-standard-handler.h | 25 + src/python/.gitignore | 1 + src/python/syssentry/bmc_alarm.py | 159 ++++ src/python/syssentry/syssentry.py | 78 +- 15 files changed, 2001 insertions(+), 10 deletions(-) create mode 100644 config/tasks/hbm_online_repair.mod create mode 100644 src/c/hbm_online_repair/.gitignore create mode 100644 src/c/hbm_online_repair/Makefile create mode 100644 src/c/hbm_online_repair/hbm_online_repair.c create mode 100644 src/c/hbm_online_repair/hbm_online_repair.env create mode 100644 src/c/hbm_online_repair/logger.h create mode 100644 src/c/hbm_online_repair/non-standard-hbm-repair.c create mode 100644 src/c/hbm_online_repair/non-standard-hbm-repair.h create mode 100644 src/c/hbm_online_repair/ras-events.c create mode 100644 src/c/hbm_online_repair/ras-events.h create mode 100644 src/c/hbm_online_repair/ras-non-standard-handler.c create mode 100644 src/c/hbm_online_repair/ras-non-standard-handler.h create mode 100644 src/python/.gitignore create mode 100644 src/python/syssentry/bmc_alarm.py diff --git a/config/tasks/hbm_online_repair.mod b/config/tasks/hbm_online_repair.mod new file mode 100644 index 0000000..77dd73e --- /dev/null +++ b/config/tasks/hbm_online_repair.mod @@ -0,0 +1,9 @@ +[common] +enabled=yes +task_start=/usr/bin/hbm_online_repair +task_stop=kill $pid +type=period 
+interval=180 +onstart=yes +env_file=/etc/sysconfig/hbm_online_repair.env +conflict=up \ No newline at end of file diff --git a/src/c/hbm_online_repair/.gitignore b/src/c/hbm_online_repair/.gitignore new file mode 100644 index 0000000..a577882 --- /dev/null +++ b/src/c/hbm_online_repair/.gitignore @@ -0,0 +1,6 @@ +*.o +*.c~ +*.h~ +hbm_online_repair + +.vscode/ diff --git a/src/c/hbm_online_repair/Makefile b/src/c/hbm_online_repair/Makefile new file mode 100644 index 0000000..16ebcd8 --- /dev/null +++ b/src/c/hbm_online_repair/Makefile @@ -0,0 +1,25 @@ +CC = gcc + +CFLAGS = -Wall -O3 + +LDFLAGS = -ltraceevent + +SRC = $(wildcard *.c) +HDR = $(wildcard *.h) + +OBJ = $(SRC:.c=.o) + +TARGET = hbm_online_repair + +all: $(TARGET) + +$(TARGET): $(OBJ) + $(CC) $(OBJ) -o $@ $(LDFLAGS) + +%.o: %.c $(HDR) + $(CC) $(CFLAGS) -c $< -o $@ + +clean: + rm -f $(OBJ) $(TARGET) + +.PHONY: all clean diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c new file mode 100644 index 0000000..3ace206 --- /dev/null +++ b/src/c/hbm_online_repair/hbm_online_repair.c @@ -0,0 +1,144 @@ +#include +#include +#include +#include +#include + +#include "logger.h" +#include "ras-events.h" +#include "non-standard-hbm-repair.h" + +#define DEFAULT_LOG_LEVEL LOG_INFO +#define DEFAULT_PAGE_ISOLATION_THRESHOLD 128 + +int global_level_setting; +int page_isolation_threshold; + +int string2int(const char* str, int* value) +{ + if (!str) { + return -1; + } + char *endptr; + errno = 0; + long val = strtol(str, &endptr, 10); + if (errno != 0 || *endptr != '\0') { + return -1; + } + *value = (int)val; + if (val != (long)*value) { + return -1; + } + return 0; +} + +int execute_command(const char *command) +{ + FILE *fp; + char buffer[128] = {0}; + int ret; + fp = popen(command, "r"); + if (!fp) { + log(LOG_ERROR, "popen failed\n"); + return -1; + } + + fgets(buffer, sizeof(buffer), fp); + log(LOG_DEBUG, "output of command is: %s\n", buffer); + + ret = pclose(fp); + if 
(ret < 0) { + log(LOG_ERROR, "pclose failed\n"); + return -1; + } + + if (!WIFEXITED(ret)) { + log(LOG_ERROR, "command did not terminate normally\n"); + return -1; + } + + ret = WEXITSTATUS(ret); + log(LOG_DEBUG, "command exited with status: %d\n", ret); + return ret; +} + +int load_required_driver(void) +{ + int ret; + ret = execute_command("modprobe hisi_mem_ras 2>&1"); + if (ret < 0) { + log(LOG_ERROR, "load repair driver failed\n"); + return ret; + } + ret = execute_command("modprobe page_eject 2>&1"); + if (ret < 0) { + log(LOG_ERROR, "load page driver failed\n"); + return ret; + } + log(LOG_INFO, "load required driver success\n"); + return ret; +} + +void hbm_param_init(void) +{ + int ret; + char *env; + + env = getenv("HBM_ONLINE_REPAIR_LOG_LEVEL"); + ret = string2int(env, &global_level_setting); + if (ret < 0) { + global_level_setting = DEFAULT_LOG_LEVEL; + log(LOG_WARNING, "Get log level from config failed, set the default value %d\n", DEFAULT_LOG_LEVEL); + } else { + log(LOG_INFO, "log level: %d\n", global_level_setting); + } + + env = getenv("PAGE_ISOLATION_THRESHOLD"); + ret = string2int(env, &page_isolation_threshold); + if (ret < 0) { + page_isolation_threshold = DEFAULT_PAGE_ISOLATION_THRESHOLD; + log(LOG_WARNING, "Get page_isolation_threshold from config failed, set the default value %d\n", DEFAULT_PAGE_ISOLATION_THRESHOLD); + } else { + log(LOG_INFO, "page_isolation_threshold: %d\n", page_isolation_threshold); + } +} + + +int main(int argc, char *argv[]) +{ + int ret; + + hbm_param_init(); + + ret = load_required_driver(); + if (ret < 0) { + log(LOG_DEBUG, "load required driver failed\n"); + return ret; + } + + struct ras_events *ras = init_trace_instance(); + if (!ras) + return -1; + + ret = toggle_ras_event(ras->tracing, "ras", "non_standard_event", 1); + if (ret < 0) { + log(LOG_WARNING, "unable to enable ras non_standard_event.\n"); + free(ras); + return -1; + } + + ret = init_all_flash(); + if (ret < 0) { + log(LOG_ERROR, "flash writer init 
failed\n"); + } + + handle_ras_events(ras); + + ret = toggle_ras_event(ras->tracing, "ras", "non_standard_event", 0); + if (ret < 0) { + log(LOG_WARNING, "unable to disable ras non_standard_event.\n"); + } + + free(ras); + return ret; +} diff --git a/src/c/hbm_online_repair/hbm_online_repair.env b/src/c/hbm_online_repair/hbm_online_repair.env new file mode 100644 index 0000000..de56079 --- /dev/null +++ b/src/c/hbm_online_repair/hbm_online_repair.env @@ -0,0 +1,2 @@ +HBM_ONLINE_REPAIR_LOG_LEVEL=1 +PAGE_ISOLATION_THRESHOLD=128 diff --git a/src/c/hbm_online_repair/logger.h b/src/c/hbm_online_repair/logger.h new file mode 100644 index 0000000..ddfa932 --- /dev/null +++ b/src/c/hbm_online_repair/logger.h @@ -0,0 +1,31 @@ +#ifndef __LOGGER_H +#define __LOGGER_H + +#define TOOL_NAME "hbm_online_repair" + +#define LOG_DEBUG 0 +#define LOG_INFO 1 +#define LOG_WARNING 2 +#define LOG_ERROR 3 + +extern int global_level_setting; + +#define log_prefix(level) \ + (level == LOG_DEBUG ? "DEBUG" : \ + level == LOG_INFO ? "INFO" : \ + level == LOG_WARNING ? "WARNING" : \ + level == LOG_ERROR ? "ERROR" : \ + "UNKNOWN_LEVEL") + +#define log_fd(level) \ + (level == LOG_ERROR ? stderr : stdout) + +#define log(level, fmt, args...) 
do {\ + if (level >= global_level_setting) {\ + fprintf(log_fd(level), "[%s] %s: ", log_prefix(level), TOOL_NAME);\ + fprintf(log_fd(level), fmt, ##args);\ + fflush(log_fd(level));\ + }\ +} while (0) + +#endif diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c new file mode 100644 index 0000000..b175e14 --- /dev/null +++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c @@ -0,0 +1,799 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "logger.h" +#include "non-standard-hbm-repair.h" + +extern int page_isolation_threshold; +size_t total_size = 0; +struct hisi_common_error_section { + uint32_t val_bits; + uint8_t version; + uint8_t soc_id; + uint8_t socket_id; + uint8_t totem_id; + uint8_t nimbus_id; + uint8_t subsystem_id; + uint8_t module_id; + uint8_t submodule_id; + uint8_t core_id; + uint8_t port_id; + uint16_t err_type; + struct { + uint8_t function; + uint8_t device; + uint16_t segment; + uint8_t bus; + uint8_t reserved[3]; + } pcie_info; + uint8_t err_severity; + uint8_t reserved[3]; + uint32_t reg_array_size; + uint32_t reg_array[]; +}; + +struct fault_addr_info { + uint32_t processer_id; + uint32_t die_id; + uint32_t stack_id; + uint32_t sid; + uint32_t channel_id; + uint32_t bankgroup_id; + uint32_t bank_id; + uint32_t row_id; + uint32_t column_id; + uint32_t error_type; + uint32_t repair_type; + uint32_t reserved; + uint32_t crc8; +}; + +typedef struct { + const char *VariableName; + const char *VendorGuid; + uint32_t DataSize; + uint8_t *Data; + uint32_t Attributes; +} efi_variable_t; + +char* flash_names[FLASH_ENTRY_NUM] = { + "repair0000", + "repair0001", + "repair0100", + "repair0101", + "repair0200", + "repair0201", + "repair0300", + "repair0301", +}; +char *flash_guids[FLASH_ENTRY_NUM] = { + "CD2FF4D9-D937-4e1d-B810-A1A568C37C01", + "DD92CC91-43E6-4c69-A42A-B08F72FCB157", + 
+ "4A8E0D1E-4CFA-47b2-9359-DA3A0006878B", + "733F9979-4ED4-478d-BD6A-E4D0F0390FDB", + "9BFBBA1F-5A93-4d36-AD47-D3C2D714D914", + "A0920D6F-78B8-4c09-9F61-7CEC845F116C", + "0049CE5E-8C18-414c-BDC1-A87E60CEEFD7", + "6AED17B4-50C7-4a40-A5A7-48AF55DD8EAC" +}; + +static int get_guid_index(uint32_t socket_id, uint32_t error_type) { + if (2 * socket_id + error_type >= FLASH_ENTRY_NUM) + return -1; + return 2 * socket_id + error_type; +} + +static void parse_fault_addr_info(struct fault_addr_info* info_struct, unsigned long long fault_addr) +{ + info_struct->processer_id = fault_addr & FAULT_ADDR_PROCESSOR_ID_MASK; + fault_addr >>= FAULT_ADDR_PROCESSOR_ID_LEN; + info_struct->die_id = fault_addr & FAULT_ADDR_DIE_ID_MASK; + fault_addr >>= FAULT_ADDR_DIE_ID_LEN; + info_struct->stack_id = fault_addr & FAULT_ADDR_STACK_ID_MASK; + fault_addr >>= FAULT_ADDR_STACK_ID_LEN; + info_struct->sid = fault_addr & FAULT_ADDR_SID_MASK; + fault_addr >>= FAULT_ADDR_SID_LEN; + info_struct->channel_id = fault_addr & FAULT_ADDR_CHANNEL_ID_MASK; + fault_addr >>= FAULT_ADDR_CHANNEL_ID_LEN; + info_struct->bankgroup_id = fault_addr & FAULT_ADDR_BANKGROUP_ID_MASK; + fault_addr >>= FAULT_ADDR_BANKGROUP_ID_LEN; + info_struct->bank_id = fault_addr & FAULT_ADDR_BANK_ID_MASK; + fault_addr >>= FAULT_ADDR_BANK_ID_LEN; + info_struct->row_id = fault_addr & FAULT_ADDR_ROW_ID_MASK; + fault_addr >>= FAULT_ADDR_ROW_ID_LEN; + info_struct->column_id = fault_addr & FAULT_ADDR_COLUMN_ID_MASK; + fault_addr >>= FAULT_ADDR_COLUMN_ID_LEN; /* drop the column field just extracted; each field is shifted out by its own *_LEN */ + info_struct->error_type = fault_addr & FAULT_ADDR_ERROR_TYPE_MASK; + fault_addr >>= FAULT_ADDR_ERROR_TYPE_LEN; + info_struct->repair_type = fault_addr & FAULT_ADDR_REPAIR_TYPE_MASK; + fault_addr >>= FAULT_ADDR_REPAIR_TYPE_LEN; + info_struct->reserved = fault_addr & FAULT_ADDR_RESERVED_MASK; + fault_addr >>= FAULT_ADDR_RESERVED_LEN; + info_struct->crc8 = (uint32_t)fault_addr; +} + +static bool variable_existed(char *name, char *guid) +{ + char filename[PATH_MAX]; + int fd; + + 
snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); + + // open var file + fd = open(filename, O_RDONLY); + if (fd < 0) { + log(LOG_WARNING, "open file %s failed\n", filename); + return false; + } + close(fd); + return true; +} + +static uint32_t read_variable_attribute(char *name, char *guid) { + char filename[PATH_MAX]; + int fd; + size_t readsize; + uint32_t attribute = (uint32_t)-1; + + snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); + + // open var file + fd = open(filename, O_RDONLY); + if (fd < 0) { + log(LOG_ERROR, "open %s failed\n", filename); + return attribute; + } + + // read attributes from first 4 bytes + readsize = read(fd, &attribute, sizeof(uint32_t)); + if (readsize != sizeof(uint32_t)) { + log(LOG_ERROR, "read attribute of %s failed\n", filename); + } + + close(fd); + return attribute; +} + +static int efivarfs_set_mutable(char *name, char *guid, bool mutable) +{ + unsigned long orig_attrs, new_attrs; + char filename[PATH_MAX]; + int fd; + + snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); + + fd = open(filename, O_RDONLY); + if (fd < 0) { + log(LOG_ERROR, "open %s failed\n", filename); + goto err; + } + + if (ioctl(fd, FS_IOC_GETFLAGS, &orig_attrs) == -1) { + log(LOG_ERROR, "ioctl FS_IOC_GETFLAGS failed\n"); + goto err; + } + + if (mutable) + new_attrs = orig_attrs & ~(unsigned long)FS_IMMUTABLE_FL; + else + new_attrs = orig_attrs | FS_IMMUTABLE_FL; + + if (new_attrs == orig_attrs) { + close(fd); + return 0; + } + + if (ioctl(fd, FS_IOC_SETFLAGS, &new_attrs) == -1) { + log(LOG_ERROR, "ioctl FS_IOC_SETFLAGS failed\n"); + goto err; + } + close(fd); + return 0; +err: + if (fd >= 0) + close(fd); + return -1; +} + +static int write_variable(char *name, char *guid, void *value, unsigned long size, uint32_t attribute) { + int fd = -1, mode; /* -1 so the err path, which tests fd >= 0, never closes a descriptor that open() has not returned yet */ + size_t writesize; + void *buffer; + unsigned long total; + char filename[PATH_MAX]; + + snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, 
name, guid); + + // prepare attributes(size 4 bytes) and data + total = size + sizeof(uint32_t); + buffer = malloc(total); + if (buffer == NULL) { + log(LOG_ERROR, "malloc data for %s failed\n", filename); + goto err; + } + memcpy(buffer, &attribute, sizeof(uint32_t)); + memcpy(buffer + sizeof(uint32_t), value, size); + + // change attr + if (efivarfs_set_mutable(name, guid, 1) != 0) { + log(LOG_ERROR, "set mutable for %s failed\n", filename); + goto err; + } + + mode = O_WRONLY; + if (attribute & EFI_VARIABLE_APPEND_WRITE) + mode |= O_APPEND; + else + mode |= O_CREAT; + + // open var file + fd = open(filename, mode, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (fd < 0) { + log(LOG_ERROR, "open %s failed\n", filename); + goto err; + } + + // write to var file + writesize = write(fd, buffer, total); + if (writesize != total) { + log(LOG_ERROR, "write %s failed\n", filename); + goto err; + } + + close(fd); + free(buffer); + if (efivarfs_set_mutable(name, guid, 0) != 0) { + log(LOG_ERROR, "set immutable for %s failed\n", filename); + } + return 0; +err: + if (fd >= 0) + close(fd); + if (buffer) + free(buffer); + if (efivarfs_set_mutable(name, guid, 0) != 0) { + log(LOG_ERROR, "set immutable for %s failed\n", filename); + } + return -1; +} + +static int append_variable(char *name, char *guid, void *data, unsigned long size) { + // prepare append attribute + uint32_t attribute = read_variable_attribute(name, guid); + if (attribute == (uint32_t)-1) { + log(LOG_ERROR, "read %s-%s attribute failed\n", name, guid); + return -1; + } + attribute |= EFI_VARIABLE_APPEND_WRITE; + + return write_variable(name, guid, data, size, attribute); +} + +static size_t get_var_size(char *name, char *guid) { + char filename[PATH_MAX]; + int fd; + struct stat stat; + + snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); + + // open var file + fd = open(filename, O_RDONLY); + if (fd < 0) { + log(LOG_WARNING, "open %s failed\n", filename); + goto err; + } + // read stat + 
if (fstat(fd, &stat) != 0) { + log(LOG_WARNING, "fstat %s failed\n", filename); + goto err; + } + close(fd); + return stat.st_size; +err: + if (fd >= 0) + close(fd); + return (size_t)-1; +} + +int init_all_flash() { + for (int i = 0; i < FLASH_ENTRY_NUM; i++) { + // check existed entry + if (variable_existed(flash_names[i], flash_guids[i])) { + total_size += get_var_size(flash_names[i], flash_guids[i]); + continue; + } + // create new entry + uint32_t attribute = EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS; + char *data = ""; + unsigned long size = 1; + int ret = write_variable(flash_names[i], flash_guids[i], data, size, attribute); + if (ret) { + log(LOG_ERROR, "init %s-%s failed, fault info storage funtion not enabled\n", flash_names[i], flash_guids[i]); + return -1; + } + total_size += sizeof(uint32_t) + 1; + } + // check total entry size + log(LOG_DEBUG, "current fault info total size: %luKB, flash max threshold: %uKB\n", + total_size / KB_SIZE, MAX_VAR_SIZE / KB_SIZE); + if (total_size > MAX_VAR_SIZE) { + log(LOG_ERROR, "fault info storage reach threshold, cannot save new record\n"); + } + return 0; +} + +static int write_fault_info_to_flash(const struct hisi_common_error_section *err) { + int ret, guid_index; + uint32_t reg_size; + uint64_t fault_addr; + + // check flash usage threshold + if (total_size + sizeof(uint64_t) > MAX_VAR_SIZE) { + log(LOG_WARNING, "fault info storage reach threshold, cannot save new record into flash\n"); + return -1; + } + + // parse physical addr + reg_size = err->reg_array_size / sizeof(uint32_t); + fault_addr = err->reg_array[reg_size - 1]; + fault_addr <<= TYPE_UINT32_WIDTH; + fault_addr += err->reg_array[reg_size - 2]; + + // get guid + struct fault_addr_info info_struct; + parse_fault_addr_info(&info_struct, fault_addr); + guid_index = get_guid_index(info_struct.processer_id, info_struct.error_type); + if (guid_index < 0) { + log(LOG_ERROR, "invalid fault info\n"); + return 
-1; + } + // record physical addr in flash + ret = append_variable(flash_names[guid_index], flash_guids[guid_index], &fault_addr, sizeof(uint64_t)); + if (ret < 0) { + log(LOG_ERROR, "append to %s-%s failed\n", flash_names[guid_index], flash_guids[guid_index]); + return -1; + } + total_size += sizeof(uint64_t); + log(LOG_INFO, "write hbm fault info to flash success\n"); + return 0; +} + +static int write_file(char *path, const char *name, unsigned long long value) +{ + char fname[MAX_PATH]; + char buf[20]; + int ret; + int fd; + + snprintf(fname, MAX_PATH, "%s/%s", path, name); + + fd = open(fname, O_WRONLY); + if (fd < 0) { + log(LOG_WARNING, "HBM ACLS: Cannot to open '%s': %s\n", + fname, strerror(errno)); + return -errno; + } + + snprintf(buf, sizeof(buf), "0x%llx\n", value); + ret = write(fd, buf, strlen(buf)); + if (ret <= 0) + log(LOG_WARNING, "HBM ACLS: Failed to set %s (0x%llx): %s\n", + fname, value, strerror(errno)); + + close(fd); + return ret > 0 ? 0 : -errno; +} + +static int get_hardware_corrupted_size() +{ + FILE *fp; + char line[256]; + int hardware_corrupted_size = -1; + char *key = "HardwareCorrupted:"; + + fp = fopen("/proc/meminfo", "r"); + if (fp == NULL) { + log(LOG_ERROR, "Failed to open /proc/meminfo\n"); + return -1; + } + + while (fgets(line, sizeof(line), fp) != NULL) { + char *pos; + if ((pos = strstr(line, key)) != NULL) { + sscanf(pos, "HardwareCorrupted: %5d kB\n", &hardware_corrupted_size); + break; + } + } + + fclose(fp); + return hardware_corrupted_size; +} + +static uint8_t get_repair_result_code(int ret) +{ + if (ret == -ENOSPC) { + return REPAIR_FAILED_NO_RESOURCE; + } else if (ret == -EIO) { + return REPAIR_FAILED_OTHER_REASON; + } else if (ret == -ENXIO || ret == -EINVAL) { + return REPAIR_FAILED_INVALID_PARAM; + } + return REPAIR_FAILED_OTHER_REASON; +} + +static int notice_BMC(const struct hisi_common_error_section *err, uint8_t repair_result_code) +{ + int sockfd; + struct sockaddr_un addr; + char 
bmc_msg[sizeof(BMC_REPORT_FORMAT)] = {0}; + uint8_t repair_type_code, isolation_type_code; + uint32_t repair_type; + unsigned long long fault_addr; + + sockfd = socket(AF_UNIX, SOCK_STREAM, 0); + if (sockfd < 0) { + log(LOG_ERROR, "Failed to create BMC notice socket\n"); + return -1; + } + + memset(&addr, 0, sizeof(struct sockaddr_un)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, BMC_SOCKET_PATH, sizeof(addr.sun_path) - 1); + if (connect(sockfd, (struct sockaddr *)&addr, sizeof(struct sockaddr_un)) < 0) { + log(LOG_ERROR, "Failed to connect BMC notice socket\n"); + close(sockfd); + return -1; + } + + /* assemble bmc specific msg */ + repair_type_code = 0; + isolation_type_code = 0; + repair_type = err->reg_array[HBM_REPAIR_REQ_TYPE]; + if (repair_type & HBM_CE_ACLS) { + repair_type_code = 0; + isolation_type_code = SINGLE_ADDR_FAULT; + } else if (repair_type & HBM_PSUE_ACLS) { + repair_type_code = 1; + isolation_type_code = SINGLE_ADDR_FAULT; + } else if (repair_type & HBM_CE_SPPR) { + repair_type_code = 2; + isolation_type_code = ROW_FAULT; + } else if (repair_type & HBM_PSUE_SPPR) { + repair_type_code = 3; + isolation_type_code = ROW_FAULT; + } + + const uint32_t reg_size = err->reg_array_size / sizeof(uint32_t); + + fault_addr = err->reg_array[reg_size - 1]; + fault_addr <<= TYPE_UINT32_WIDTH; + fault_addr += err->reg_array[reg_size - 2]; + + log(LOG_DEBUG, "Get the fault addr is %llu\n", fault_addr); + + struct fault_addr_info info_struct; + parse_fault_addr_info(&info_struct, fault_addr); + + log(LOG_DEBUG, "info_struct.processer_id is %u\n", info_struct.processer_id); + log(LOG_DEBUG, "info_struct.die_id is %u\n", info_struct.die_id); + log(LOG_DEBUG, "info_struct.stack_id is %u\n", info_struct.stack_id); + log(LOG_DEBUG, "info_struct.sid is %u\n", info_struct.sid); + log(LOG_DEBUG, "info_struct.channel_id is %u\n", info_struct.channel_id); + log(LOG_DEBUG, "info_struct.bankgroup_id is %u\n", info_struct.bankgroup_id); + log(LOG_DEBUG, 
"info_struct.bank_id is %u\n", info_struct.bank_id); + log(LOG_DEBUG, "info_struct.row_id is %u\n", info_struct.row_id); + log(LOG_DEBUG, "info_struct.column_id is %u\n", info_struct.column_id); + log(LOG_DEBUG, "info_struct.error_type is %u\n", info_struct.error_type); + log(LOG_DEBUG, "info_struct.repair_type is %u\n", info_struct.repair_type); + log(LOG_DEBUG, "info_struct.reserved is %u\n", info_struct.reserved); + log(LOG_DEBUG, "info_struct.crc8 is %u\n", info_struct.crc8); + + snprintf(bmc_msg, sizeof(BMC_REPORT_FORMAT), BMC_REPORT_FORMAT, + repair_type_code, + repair_result_code, + isolation_type_code, + info_struct.processer_id, + info_struct.die_id, + info_struct.stack_id, + info_struct.sid, + info_struct.channel_id, + info_struct.bankgroup_id, + info_struct.bank_id, + info_struct.row_id, + info_struct.column_id + ); + + log(LOG_DEBUG, "Send msg to sysSentry, bmc msg is %s\n", bmc_msg); + + if (write(sockfd, bmc_msg, strlen(bmc_msg)) <= 0) { + log(LOG_ERROR, "Failed to send data to BMC notice socket\n"); + close(sockfd); + return -1; + } + + close(sockfd); + return 0; +} + +static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err) +{ + unsigned long long paddr; + int ret; + bool is_acls = err->reg_array[HBM_REPAIR_REQ_TYPE] & (HBM_CE_ACLS | HBM_PSUE_ACLS); + int required_isolate_size = (is_acls ? 
HBM_ACLS_ADDR_NUM : HBM_SPPR_ADDR_NUM) * DEFAULT_PAGE_SIZE_KB; + int hardware_corrupted_size = get_hardware_corrupted_size(); + if (hardware_corrupted_size < 0) { + log(LOG_ERROR, "Page isolate failed: Get hardware_corrupted_size failed"); + notice_BMC(err, ISOLATE_FAILED_OTHER_REASON); + return -1; + } + if ((required_isolate_size + hardware_corrupted_size) > page_isolation_threshold) { + log(LOG_INFO, "Page isolate failed: the isolation resource is not enough\n"); + notice_BMC(err, ISOLATE_FAILED_OVER_THRESHOLD); + return -1; + } + if (is_acls) { + /* ACLS */ + paddr = err->reg_array[HBM_ADDH]; + paddr <<= TYPE_UINT32_WIDTH; + paddr += err->reg_array[HBM_ADDL]; + + ret = write_file("/sys/kernel/page_eject", "offline_page", paddr); + if (ret < 0) { + notice_BMC(err, ISOLATE_FAILED_OTHER_REASON); + log(LOG_WARNING, "HBM: ACLS offline failed, address is 0x%llx \n", paddr); + return ret; + } + } else { + /* SPPR */ + bool all_success = true; + uint32_t i; + for (i = 0; i < HBM_SPPR_ADDR_NUM; i++) { + paddr = err->reg_array[2 * i + HBM_ADDH]; + paddr <<= TYPE_UINT32_WIDTH; + paddr += err->reg_array[2 * i + HBM_ADDL]; + ret = write_file("/sys/kernel/page_eject", "offline_page", paddr); + if (ret < 0) { + all_success = false; + log(LOG_WARNING, "HBM: SPPR offline failed, address is 0x%llx \n", paddr); + continue; + } + } + if (!all_success) { + notice_BMC(err, ISOLATE_FAILED_OTHER_REASON); + ret = -1; + } + } + return ret < 0 ? ret : 0; +} + +static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr) +{ + int ret; + if (repair_ret < 0) { + log(LOG_WARNING, "HBM %s: Keep page (0x%llx) offline\n", is_acls ? 
"ACLS" : "SPPR", paddr); + /* not much we can do about errors here */ + (void)write_file("/sys/kernel/page_eject", "remove_page", paddr); + return get_repair_result_code(repair_ret); + } + + ret = write_file("/sys/kernel/page_eject", "online_page", paddr); + if (ret < 0) { + log(LOG_WARNING, "HBM %s: Page (0x%llx) online failed\n",is_acls ? "ACLS" : "SPPR", paddr); + return ONLINE_PAGE_FAILED; + } else { + log(LOG_INFO, "HBM %s: Page (0x%llx) repair and online success\n",is_acls ? "ACLS" : "SPPR", paddr); + return ISOLATE_REPAIR_ONLINE_SUCCESS; + } +} + +static int hbmc_hbm_repair(const struct hisi_common_error_section *err, char *path) /* returns 0 or a negative errno; must be int (not uint8_t) because the caller compares the result with -ENXIO */ +{ + unsigned long long paddr; + int ret; + uint8_t repair_result_code; + bool is_acls; + + /* Both ACLS and SPPR only repair the first address */ + paddr = err->reg_array[HBM_ADDH]; + paddr <<= TYPE_UINT32_WIDTH; + paddr += err->reg_array[HBM_ADDL]; + + is_acls = err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_CE_ACLS || + err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_PSUE_ACLS; + + ret = write_file(path, is_acls ? "acls_query" : "sppr_query", paddr); + if (ret < 0) { + notice_BMC(err, get_repair_result_code(ret)); + log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR"); + return ret; + } + + ret = write_file(path, is_acls ? 
"acls_repair" : "sppr_repair", paddr); + + if (is_acls) { + /* ACLS */ + repair_result_code = hbmc_hbm_after_repair(is_acls, ret, paddr); + notice_BMC(err, repair_result_code); + return ret; + } else { + /* SPPR */ + bool all_online_success = true; + uint32_t i; + for (i = 0; i < HBM_SPPR_ADDR_NUM; i++) { + paddr = err->reg_array[2 * i + HBM_ADDH]; + paddr <<= TYPE_UINT32_WIDTH; + paddr += err->reg_array[2 * i + HBM_ADDL]; + + repair_result_code = hbmc_hbm_after_repair(is_acls, ret, paddr); + if (repair_result_code != ISOLATE_REPAIR_ONLINE_SUCCESS) { + all_online_success = false; + } + } + if (ret < 0) { + notice_BMC(err, get_repair_result_code(ret)); + return ret; + } else if (all_online_success) { + notice_BMC(err, ISOLATE_REPAIR_ONLINE_SUCCESS); + return 0; + } else { + notice_BMC(err, ONLINE_PAGE_FAILED); + return ret; + } + } + /* The final return code is not necessary */ + return ret < 0 ? ret : 0; +} + +static int hbmc_get_memory_type(char *path) +{ + int type = HBM_UNKNOWN; + char fname[MAX_PATH]; + char buf[128]; + FILE *file; + + snprintf(fname, MAX_PATH, "%s/%s", path, "memory_type"); + file = fopen(fname, "r"); + if (!file) { + log(LOG_WARNING, "HBM: Cannot to open '%s': %s\n", + fname, strerror(errno)); + return -errno; + } + + if (!fgets(buf, sizeof(buf), file)) { + log(LOG_WARNING, "HBM: Failed to read %s\n", fname); + goto err; + } + + /* Remove the last '\n' */ + buf[strlen(buf) - 1] = 0; + + if (strcmp(buf, "HBM") == 0) + type = HBM_HBM_MEMORY; + else if (strcmp(buf, "DDR") == 0) + type = HBM_DDR_MEMORY; + +err: + fclose(file); + return type; +} + +static void hbm_repair_handler(const struct hisi_common_error_section *err) +{ + log(LOG_DEBUG, "Received ACLS/SPPR flat mode repair request, try to repair\n"); + char *sys_dev_path = "/sys/devices/platform"; + char path[MAX_PATH]; + struct dirent *dent; + DIR *dir; + int ret; + bool find_device = false, find_hbm_mem = false; + + ret = hbmc_hbm_page_isolate(err); + if (ret < 0) { + return; + } + + dir = 
opendir(sys_dev_path); + if (!dir) { + log(LOG_WARNING, "Can't read '%s': %s\n", + sys_dev_path, strerror(errno)); + notice_BMC(err, REPAIR_FAILED_OTHER_REASON); + return; + } + + while ((dent = readdir(dir))) { + if (!strstr(dent->d_name, HBM_MEM_RAS_NAME)) + continue; + find_device = true; + + snprintf(path, MAX_PATH, "%s/%s", sys_dev_path, dent->d_name); + + if (hbmc_get_memory_type(path) == HBM_HBM_MEMORY) { + find_hbm_mem = true; + ret = hbmc_hbm_repair(err, path); + if (ret != -ENXIO) + break; + } + } + if (!find_device) { + log(LOG_ERROR, "Repair driver is not loaded, skip error, error_type is %u\n", + err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); + notice_BMC(err, REPAIR_FAILED_OTHER_REASON); + } else if (!find_hbm_mem) { + log(LOG_ERROR, "No HBM device memory type found, skip error, error_type is %u\n", + err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); + notice_BMC(err, REPAIR_FAILED_OTHER_REASON); + } + + closedir(dir); +} + +static bool hbm_repair_validate(const struct hisi_common_error_section *err) +{ + if (!((err->val_bits & BIT(COMMON_VALID_MODULE_ID)) && + (err->val_bits & BIT(COMMON_VALID_SUBMODULE_ID)) && + (err->val_bits & BIT(COMMON_VALID_REG_ARRAY_SIZE)) + )) { + log(LOG_DEBUG, "Err val_bits validate failed, val_bits is %u\n", err->val_bits); + return false; + } + log(LOG_DEBUG, "err->module_id: %u\n", err->module_id); + log(LOG_DEBUG, "err->submodule_id: %u\n", err->submodule_id); + log(LOG_DEBUG, "err->val_bits: 0x%x\n", err->val_bits); + log(LOG_DEBUG, "err->reg_array_size: %u\n", err->reg_array_size); + + if (err->module_id != HBMC_MODULE_ID || + err->submodule_id != HBMC_SUBMOD_HBM_REPAIR) { + log(LOG_DEBUG, "err module_id or sub_module id doesn't not match\n"); + return false; + } + + uint32_t hbm_repair_reg_type = err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK; + bool is_acls_valid = (hbm_repair_reg_type & (HBM_CE_ACLS | HBM_PSUE_ACLS)) && + (err->reg_array_size == HBM_ACLS_ARRAY_SIZE); + bool is_sppr_valid = 
(hbm_repair_reg_type & (HBM_CE_SPPR | HBM_PSUE_SPPR)) && + (err->reg_array_size == HBM_SPPR_ARRAY_SIZE); + bool is_cache_mode = (hbm_repair_reg_type & HBM_CACHE_MODE) && + (err->reg_array_size == HBM_CACHE_ARRAY_SIZE); + + if (!(is_acls_valid || is_sppr_valid || is_cache_mode)) { + log(LOG_DEBUG, "err type (%u) is unknown or address array length (%u) is invalid\n", + hbm_repair_reg_type, err->reg_array_size); + return false; + } + + log(LOG_INFO, "Received ACLS/SPPR repair request\n"); + return true; +} + +static bool hbm_flat_mode_validate(const struct hisi_common_error_section *err) +{ + uint32_t hbm_repair_reg_type = err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK; + return !(hbm_repair_reg_type & HBM_CACHE_MODE); +} + +int decode_hisi_common_section(struct ras_non_standard_event *event) +{ + const struct hisi_common_error_section *err = (struct hisi_common_error_section *)event->error; + + if (hbm_repair_validate(err)) { + write_fault_info_to_flash(err); + if (hbm_flat_mode_validate(err)) { + hbm_repair_handler(err); + } + } + + return 0; +} diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.h b/src/c/hbm_online_repair/non-standard-hbm-repair.h new file mode 100644 index 0000000..7e8e448 --- /dev/null +++ b/src/c/hbm_online_repair/non-standard-hbm-repair.h @@ -0,0 +1,89 @@ +#ifndef __NON_STANDARD_HBM_REPAIR +#define __NON_STANDARD_HBM_REPAIR + +#include "ras-non-standard-handler.h" + +#define DEFAULT_PAGE_SIZE_KB 4 +#define HBM_MEM_RAS_NAME "HISI0521" +#define HBM_UNKNOWN 0 +#define HBM_HBM_MEMORY 1 +#define HBM_DDR_MEMORY 2 + +#define TYPE_UINT32_WIDTH 32 +#define HBM_REPAIR_REQ_TYPE 0 +#define HBM_CE_ACLS BIT(0) +#define HBM_PSUE_ACLS BIT(1) +#define HBM_CE_SPPR BIT(2) +#define HBM_PSUE_SPPR BIT(3) +#define HBM_CACHE_MODE (BIT(4) | BIT(5) | BIT(6) | BIT(7)) +#define HBM_ERROR_MASK 0b11111111 +#define HBM_ADDL 1 +#define HBM_ADDH 2 +#define HBM_ERROR_TYPE_SIZE 4 +#define HBM_ADDR_SIZE 8 +#define HBM_ACLS_ADDR_NUM 1 +#define HBM_SPPR_ADDR_NUM 16 
+#define HBM_ACLS_ARRAY_SIZE (HBM_ERROR_TYPE_SIZE + HBM_ADDR_SIZE * HBM_ACLS_ADDR_NUM + HBM_ADDR_SIZE) +#define HBM_SPPR_ARRAY_SIZE (HBM_ERROR_TYPE_SIZE + HBM_ADDR_SIZE * HBM_SPPR_ADDR_NUM + HBM_ADDR_SIZE) +#define HBM_CACHE_ARRAY_SIZE (HBM_ERROR_TYPE_SIZE + HBM_ADDR_SIZE) +#define HBMC_MODULE_ID 0x28 +#define HBMC_SUBMOD_HBM_REPAIR 6 +#define COMMON_VALID_MODULE_ID 5 +#define COMMON_VALID_SUBMODULE_ID 6 +#define COMMON_VALID_REG_ARRAY_SIZE 12 + +#define BMC_SOCKET_PATH "/var/run/sysSentry/bmc.sock" +#define BMC_REPORT_FORMAT "REP00%02x%02x%02x0000000000000000%02x%02x%02x00%02x00%02x%02x%02x%08x%08x0000000000" + +#define ISOLATE_FAILED_OVER_THRESHOLD 0b10000001 +#define ISOLATE_FAILED_OTHER_REASON 0b10000010 +#define REPAIR_FAILED_NO_RESOURCE 0b10010100 +#define REPAIR_FAILED_INVALID_PARAM 0b10011000 +#define REPAIR_FAILED_OTHER_REASON 0b10011100 +#define ONLINE_PAGE_FAILED 0b10100000 +#define ISOLATE_REPAIR_ONLINE_SUCCESS 0b00000000 + +#define ROW_FAULT 1 +#define SINGLE_ADDR_FAULT 6 + +#define FAULT_ADDR_PROCESSOR_ID_LEN 2 +#define FAULT_ADDR_DIE_ID_LEN 1 +#define FAULT_ADDR_STACK_ID_LEN 3 +#define FAULT_ADDR_SID_LEN 3 +#define FAULT_ADDR_CHANNEL_ID_LEN 8 +#define FAULT_ADDR_BANKGROUP_ID_LEN 3 +#define FAULT_ADDR_BANK_ID_LEN 3 +#define FAULT_ADDR_ROW_ID_LEN 17 +#define FAULT_ADDR_COLUMN_ID_LEN 10 +#define FAULT_ADDR_ERROR_TYPE_LEN 2 +#define FAULT_ADDR_REPAIR_TYPE_LEN 2 +#define FAULT_ADDR_RESERVED_LEN 2 +#define FAULT_ADDR_CRC8_LEN 8 + +#define FAULT_ADDR_PROCESSOR_ID_MASK ((1 << FAULT_ADDR_PROCESSOR_ID_LEN ) - 1) +#define FAULT_ADDR_DIE_ID_MASK ((1 << FAULT_ADDR_DIE_ID_LEN ) - 1) +#define FAULT_ADDR_STACK_ID_MASK ((1 << FAULT_ADDR_STACK_ID_LEN ) - 1) +#define FAULT_ADDR_SID_MASK ((1 << FAULT_ADDR_SID_LEN ) - 1) +#define FAULT_ADDR_CHANNEL_ID_MASK ((1 << FAULT_ADDR_CHANNEL_ID_LEN ) - 1) +#define FAULT_ADDR_BANKGROUP_ID_MASK ((1 << FAULT_ADDR_BANKGROUP_ID_LEN ) - 1) +#define FAULT_ADDR_BANK_ID_MASK ((1 << FAULT_ADDR_BANK_ID_LEN ) - 1) +#define 
FAULT_ADDR_ROW_ID_MASK ((1 << FAULT_ADDR_ROW_ID_LEN ) - 1) +#define FAULT_ADDR_COLUMN_ID_MASK ((1 << FAULT_ADDR_COLUMN_ID_LEN ) - 1) +#define FAULT_ADDR_ERROR_TYPE_MASK ((1 << FAULT_ADDR_ERROR_TYPE_LEN ) - 1) +#define FAULT_ADDR_REPAIR_TYPE_MASK ((1 << FAULT_ADDR_REPAIR_TYPE_LEN ) - 1) +#define FAULT_ADDR_RESERVED_MASK ((1 << FAULT_ADDR_RESERVED_LEN ) - 1) +#define FAULT_ADDR_CRC8_MASK ((1 << FAULT_ADDR_CRC8_LEN ) - 1) + +#define EFI_VARIABLE_NON_VOLATILE 0x1 +#define EFI_VARIABLE_BOOTSERVICE_ACCESS 0x2 +#define EFI_VARIABLE_RUNTIME_ACCESS 0x4 +#define EFI_VARIABLE_APPEND_WRITE 0x40 + +#define EFIVARFS_PATH "/sys/firmware/efi/efivars" +#define MAX_VAR_SIZE (128 * 1024) +#define FLASH_ENTRY_NUM 8 +#define KB_SIZE 1024 + +extern int init_all_flash(); + +#endif diff --git a/src/c/hbm_online_repair/ras-events.c b/src/c/hbm_online_repair/ras-events.c new file mode 100644 index 0000000..0b12329 --- /dev/null +++ b/src/c/hbm_online_repair/ras-events.c @@ -0,0 +1,534 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "ras-non-standard-handler.h" +#include "logger.h" + +/* + * Polling time, if read() doesn't block. Currently, trace_pipe_raw never + * blocks on read(). So, we need to sleep for a while, to avoid spending + * too much CPU cycles. A fix for it is expected for 3.10. 
+ */ +#define POLLING_TIME 3 + +/* Test for a little-endian machine */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + #define ENDIAN KBUFFER_ENDIAN_LITTLE +#else + #define ENDIAN KBUFFER_ENDIAN_BIG +#endif + +static int get_debugfs_dir(char *debugfs_dir, size_t len) +{ + FILE *fp; + char line[MAX_PATH + 1 + 256]; + + fp = fopen("/proc/mounts","r"); + if (!fp) { + log(LOG_INFO, "Can't open /proc/mounts"); + return errno; + } + + do { + char *p, *type, *dir; + if (!fgets(line, sizeof(line), fp)) + break; + + p = strtok(line, " \t"); + if (!p) + break; + + dir = strtok(NULL, " \t"); + if (!dir) + break; + + type = strtok(NULL, " \t"); + if (!type) + break; + + if (!strcmp(type, "debugfs")) { + fclose(fp); + strncpy(debugfs_dir, dir, len - 1); + debugfs_dir[len - 1] = '\0'; + return 0; + } + } while(1); + + fclose(fp); + log(LOG_INFO, "Can't find debugfs\n"); + return ENOENT; +} + + +static int open_trace(char *trace_dir, char *name, int flags) +{ + int ret; + char fname[MAX_PATH + 1]; + + strcpy(fname, trace_dir); + strcat(fname, "/"); + strcat(fname, name); + + ret = open(fname, flags); + if (ret < 0) + log(LOG_WARNING, "open_trace() failed, fname=%s ret=%d errno=%d\n", fname, ret, errno); + + return ret; +} + +static int create_trace_instance(char *trace_instance_dir) +{ + char fname[MAX_PATH + 1]; + int rc; + + get_debugfs_dir(fname, sizeof(fname)); + strcat(fname, "/tracing/instances/"TOOL_NAME); + rc = mkdir(fname, S_IRWXU); + if (rc < 0 && errno != EEXIST) { + log(LOG_INFO, "Unable to create " TOOL_NAME " instance at %s\n", fname); + return -1; + } + strcpy(trace_instance_dir, fname); + return 0; +} + +struct ras_events *init_trace_instance(void) +{ + struct ras_events *ras = calloc(1, sizeof(*ras)); + if (!ras) { + log(LOG_ERROR, "Can't allocate memory for ras struct\n"); + return NULL; + } + int rc = create_trace_instance(ras->tracing); + if (rc < 0) { + free(ras); + return NULL; + } + return ras; +} + +/* + * Tracing enable/disable code + */ +int 
toggle_ras_event(char *trace_dir, char *group, char *event, int enable) +{ + int fd, rc; + char fname[MAX_PATH + 1]; + + snprintf(fname, sizeof(fname), "%s%s:%s\n", + enable ? "" : "!", + group, event); + + /* Enable RAS events */ + fd = open_trace(trace_dir, "set_event", O_RDWR | O_APPEND); + if (fd < 0) { + log(LOG_WARNING, "Can't open set_event\n"); + rc = -errno; + goto err; + } + + rc = write(fd, fname, strlen(fname)); + close(fd); + if (rc <= 0) { + log(LOG_WARNING, "Can't write to set_event\n"); + rc = -EIO; + goto err; + } + + log(LOG_INFO, "%s:%s event %s\n", + group, event, + enable ? "enabled" : "disabled"); + return 0; +err: + log(LOG_ERROR, "Can't %s %s:%s tracing\n", + enable ? "enable" : "disable", group, event); + return rc; +} + +static int parse_header_page(struct ras_events *ras, struct tep_handle *pevent) +{ + int fd, len, page_size = DEFAULT_PAGE_SIZE; + char buf[page_size]; + + fd = open_trace(ras->tracing, "events/header_page", O_RDONLY); + if (fd < 0) { + log(LOG_WARNING, "Open event header page failed\n"); + return -1; + } + + len = read(fd, buf, page_size); + close(fd); + if (len <= 0) { + log(LOG_WARNING, "Read event header page failed\n"); + return -1; + } + + if (tep_parse_header_page(pevent, buf, len, sizeof(long))) { + log(LOG_WARNING, "Parse event header page failed\n"); + return -1; + } + + return 0; +} + +static void parse_ras_data(struct pcpu_data *pdata, struct kbuffer *kbuf, + void *data, unsigned long long time_stamp) +{ + struct tep_record record; + struct trace_seq s; + + record.ts = time_stamp; + record.size = kbuffer_event_size(kbuf); + record.data = data; + record.offset = kbuffer_curr_offset(kbuf); + record.cpu = pdata->cpu; + + /* note offset is just offset in subbuffer */ + record.missed_events = kbuffer_missed_events(kbuf); + record.record_size = kbuffer_curr_size(kbuf); + + trace_seq_init(&s); + tep_print_event(pdata->ras->pevent, &s, &record, "%s-%s-%d-%s", + TEP_PRINT_NAME, TEP_PRINT_COMM, TEP_PRINT_TIME, 
TEP_PRINT_INFO); + trace_seq_do_printf(&s); + fflush(stdout); + trace_seq_destroy(&s); +} + +static int get_num_cpus() +{ + return sysconf(_SC_NPROCESSORS_ONLN); +} + +static int set_buffer_percent(struct ras_events *ras, int percent) +{ + int res = 0; + int fd; + + fd = open_trace(ras->tracing, "buffer_percent", O_WRONLY); + if (fd >= 0) { + char buf[16]; + ssize_t size; + snprintf(buf, sizeof(buf), "%d", percent); + size = write(fd, buf, strlen(buf)); + if (size <= 0) { + log(LOG_WARNING, "can't write to buffer_percent\n"); + res = -1; + } + close(fd); + } else { + log(LOG_WARNING, "Can't open buffer_percent\n"); + res = -1; + } + + return res; +} + +static int read_ras_event_all_cpus(struct pcpu_data *pdata, + unsigned n_cpus) +{ + ssize_t size; + unsigned long long time_stamp; + void *data; + int ready, i, count_nready; + struct kbuffer *kbuf; + void *page; + struct pollfd fds[n_cpus + 1]; + struct signalfd_siginfo fdsiginfo; + sigset_t mask; + int warnonce[n_cpus]; + char pipe_raw[PATH_MAX]; + + memset(&warnonce, 0, sizeof(warnonce)); + + page = malloc(pdata[0].ras->page_size); + if (!page) { + log(LOG_ERROR, "Can't allocate page\n"); + return -ENOMEM; + } + + kbuf = kbuffer_alloc(KBUFFER_LSIZE_8, ENDIAN); + if (!kbuf) { + log(LOG_ERROR, "Can't allocate kbuf\n"); + free(page); + return -ENOMEM; + } + + /* Fix for poll() on the per_cpu trace_pipe and trace_pipe_raw blocks + * indefinitely with the default buffer_percent in the kernel trace system, + * which is introduced by the following change in the kernel. + * https://lore.kernel.org/all/20221020231427.41be3f26@gandalf.local.home/T/#u. 
+ * Set buffer_percent to 0 so that poll() will return immediately + * when the trace data is available in the ras per_cpu trace pipe_raw + */ + if (set_buffer_percent(pdata[0].ras, 0)) + log(LOG_WARNING, "Set buffer_percent failed\n"); + + for (i = 0; i < (n_cpus + 1); i++) + fds[i].fd = -1; + + for (i = 0; i < n_cpus; i++) { + fds[i].events = POLLIN; + + snprintf(pipe_raw, sizeof(pipe_raw), + "per_cpu/cpu%d/trace_pipe_raw", i); + + fds[i].fd = open_trace(pdata[0].ras->tracing, pipe_raw, O_RDONLY); + if (fds[i].fd < 0) { + log(LOG_ERROR, "Can't open trace_pipe_raw\n"); + goto error; + } + } + + sigemptyset(&mask); + sigaddset(&mask, SIGINT); + sigaddset(&mask, SIGTERM); + sigaddset(&mask, SIGHUP); + sigaddset(&mask, SIGQUIT); + if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1) + log(LOG_WARNING, "sigprocmask\n"); + fds[n_cpus].events = POLLIN; + fds[n_cpus].fd = signalfd(-1, &mask, 0); + if (fds[n_cpus].fd < 0) { + log(LOG_WARNING, "signalfd\n"); + goto error; + } + + log(LOG_INFO, "Listening to events for cpus 0 to %u\n", n_cpus - 1); + + do { + ready = poll(fds, (n_cpus + 1), -1); + if (ready < 0) { + log(LOG_WARNING, "poll\n"); + } + + /* check for the signal */ + if (fds[n_cpus].revents & POLLIN) { + size = read(fds[n_cpus].fd, &fdsiginfo, + sizeof(struct signalfd_siginfo)); + if (size != sizeof(struct signalfd_siginfo)) { + log(LOG_WARNING, "signalfd read\n"); + continue; + } + + if (fdsiginfo.ssi_signo == SIGINT || + fdsiginfo.ssi_signo == SIGTERM || + fdsiginfo.ssi_signo == SIGHUP || + fdsiginfo.ssi_signo == SIGQUIT) { + log(LOG_INFO, "Recevied signal=%d\n", + fdsiginfo.ssi_signo); + goto error; + } else { + log(LOG_INFO, + "Received unexpected signal=%d\n", + fdsiginfo.ssi_signo); + continue; + } + } + + count_nready = 0; + for (i = 0; i < n_cpus; i++) { + if (fds[i].revents & POLLERR) { + if (!warnonce[i]) { + log(LOG_INFO, + "Error on CPU %i\n", i); + warnonce[i]++; + } + continue; + } + if (!(fds[i].revents & POLLIN)) { + count_nready++; + continue; + } + 
size = read(fds[i].fd, page, pdata[i].ras->page_size); + if (size < 0) { + log(LOG_WARNING, "read\n"); + goto error; + } else if (size > 0) { + log(LOG_DEBUG, "cpu %d receive %ld bytes data\n", i, size); + kbuffer_load_subbuffer(kbuf, page); + + while ((data = kbuffer_read_event(kbuf, &time_stamp))) { + if (kbuffer_curr_size(kbuf) < 0) { + log(LOG_ERROR, "invalid kbuf data, discard\n"); + break; + } + + log(LOG_DEBUG, "parse_ras_data\n"); + parse_ras_data(&pdata[i], + kbuf, data, time_stamp); + + /* increment to read next event */ + log(LOG_DEBUG, "kbuffer_next_event\n"); + kbuffer_next_event(kbuf, NULL); + } + } else { + count_nready++; + } + } + + /* + * If count_nready == n_cpus, there is no cpu fd in POLLIN state, + * so we need to break the cycle + */ + if (count_nready == n_cpus) { + log(LOG_ERROR, "no cpu fd in POLLIN state, stop running\n"); + break; + } + } while (1); + +error: + kbuffer_free(kbuf); + free(page); + sigprocmask(SIG_UNBLOCK, &mask, NULL); + + for (i = 0; i < (n_cpus + 1); i++) { + if (fds[i].fd > 0) + close(fds[i].fd); + } + + return -1; +} + +static int init_header_page(struct ras_events *ras, struct tep_handle *pevent) +{ + int rc; + + rc = parse_header_page(ras, pevent); + if (rc) { + log(LOG_ERROR, "cannot read trace header_page: %d\n", rc); + return rc; + } + return 0; +} + +static int init_event_format(struct ras_events *ras, struct tep_handle *pevent, + char *group, char *event) +{ + char *page, fname[MAX_PATH + 1]; + int fd, size, rc, page_size = DEFAULT_PAGE_SIZE; + + // read one page from format + snprintf(fname, sizeof(fname), "events/%s/%s/format", group, event); + fd = open_trace(ras->tracing, fname, O_RDONLY); + if (fd < 0) { + log(LOG_ERROR, + "Can't get %s:%s traces. 
Perhaps this feature is not supported on your system.\n", + group, event); + return errno; + } + + log(LOG_INFO, "page_size: %d\n", page_size); + ras->page_size = page_size; + page = malloc(page_size); + if (!page) { + log(LOG_ERROR, "Can't allocate page to read %s:%s format\n", + group, event); + rc = errno; + close(fd); + return rc; + } + + size = read(fd, page, page_size); + close(fd); + if (size < 0) { + log(LOG_ERROR, "Can't read format\n"); + free(page); + return size; + } + + // parse event format + rc = tep_parse_event(pevent, page, size, group); + if (rc) { + log(LOG_ERROR, "Can't parse event %s:%s\n", group, event); + free(page); + return EINVAL; + } + return 0; +} + +static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent, + char *group, char *event, + tep_event_handler_func func) +{ + int rc; + + rc = init_event_format(ras, pevent, group, event); + if (rc) { + log(LOG_ERROR, "init_event_format for %s:%s failed\n", group, event); + return rc; + } + + /* Registers the special event handlers */ + rc = tep_register_event_handler(pevent, -1, group, event, func, ras); + if (rc < 0) { + log(LOG_ERROR, "Can't register event handler for %s:%s\n", + group, event); + return EINVAL; + } + + return 0; +} + +int handle_ras_events(struct ras_events *ras) +{ + int rc, i; + unsigned cpus; + struct tep_handle *pevent = NULL; + struct pcpu_data *data = NULL; + + pevent = tep_alloc(); + if (!pevent) { + log(LOG_ERROR, "Can't allocate pevent\n"); + rc = errno; + goto err; + } + ras->pevent = pevent; + + rc = init_header_page(ras, pevent); + if (rc) { + log(LOG_ERROR, "init_header_page failed\n"); + goto err; + } + + rc = add_event_handler(ras, pevent, "ras", "non_standard_event", + ras_non_standard_event_handler); + if (rc) { + log(LOG_ERROR, "Can't get traces from %s:%s\n", + "ras", "non_standard_event"); + goto err; + } + log(LOG_INFO, "add_event_handler done\n"); + + cpus = get_num_cpus(); + data = calloc(sizeof(*data), cpus); + if (!data) + goto 
err; + + for (i = 0; i < cpus; i++) { + data[i].ras = ras; + data[i].cpu = i; + } + rc = read_ras_event_all_cpus(data, cpus); + +err: + if (data) + free(data); + if (pevent) + tep_free(pevent); + return rc; +} diff --git a/src/c/hbm_online_repair/ras-events.h b/src/c/hbm_online_repair/ras-events.h new file mode 100644 index 0000000..4218d93 --- /dev/null +++ b/src/c/hbm_online_repair/ras-events.h @@ -0,0 +1,28 @@ +#ifndef __RAS_EVENTS_H +#define __RAS_EVENTS_H + +#include +#include + +#define MAX_PATH 1024 + +#define DEFAULT_PAGE_SIZE 4096 + +struct ras_events { + char tracing[MAX_PATH + 1]; + struct tep_handle *pevent; + int page_size; +}; + +struct pcpu_data { + struct tep_handle *pevent; + struct ras_events *ras; + int cpu; +}; + +/* Function prototypes */ +int toggle_ras_event(char *trace_dir, char *group, char *event, int enable); +int handle_ras_events(struct ras_events *ras); +struct ras_events *init_trace_instance(void); + +#endif diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.c b/src/c/hbm_online_repair/ras-non-standard-handler.c new file mode 100644 index 0000000..1d1fd04 --- /dev/null +++ b/src/c/hbm_online_repair/ras-non-standard-handler.c @@ -0,0 +1,81 @@ +#include +#include +#include +#include +#include +#include +#include "ras-non-standard-handler.h" +#include "logger.h" + +static char *uuid_le(const char *uu) +{ + static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; + if (!uu) { + log(LOG_ERROR, "uuid_le failed: uu is empty"); + return uuid; + } + size_t uu_len = strlen(uu); + if (uu_len < SECTION_TYPE_UUID_LEN) { + log(LOG_ERROR, "uuid_le failed: uu is too short"); + return uuid; + } + + char *p = uuid; + int i; + static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; + + for (i = 0; i < 16; i++) { + p += sprintf(p, "%.2x", (unsigned char) uu[le[i]]); + switch (i) { + case 3: + case 5: + case 7: + case 9: + *p++ = '-'; + break; + } + } + + *p = 0; + + return uuid; +} + +int 
ras_non_standard_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context) +{ + int len; + unsigned long long val; + struct ras_non_standard_event ev; + + ev.sec_type = tep_get_field_raw(s, event, "sec_type", + record, &len, 1); + if(!ev.sec_type) { + log(LOG_WARNING, "get event section type failed"); + return -1; + } + + trace_seq_printf(s, "\n"); + trace_seq_printf(s, "sec_type: %s\n", uuid_le(ev.sec_type)); + + if (tep_get_field_val(s, event, "len", record, &val, 1) < 0) { + log(LOG_WARNING, "tep get field val failed"); + return -1; + } + + ev.length = val; + trace_seq_printf(s, "length: %d\n", ev.length); + + ev.error = tep_get_field_raw(s, event, "buf", record, &len, 1); + if(!ev.error || ev.length != len) { + log(LOG_WARNING, "get event error failed"); + return -1; + } + + if (strcmp(uuid_le(ev.sec_type), HISI_COMMON_SECTION_TYPE_UUID) == 0) { + decode_hisi_common_section(&ev); + } + + return 0; +} diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.h b/src/c/hbm_online_repair/ras-non-standard-handler.h new file mode 100644 index 0000000..0272dc1 --- /dev/null +++ b/src/c/hbm_online_repair/ras-non-standard-handler.h @@ -0,0 +1,25 @@ +#ifndef __RAS_NON_STANDARD_HANDLER_H +#define __RAS_NON_STANDARD_HANDLER_H + +#include +#include "ras-events.h" + +#define BIT(nr) (1UL << (nr)) + +#define SECTION_TYPE_UUID_LEN 16 +#define HISI_COMMON_SECTION_TYPE_UUID "c8b328a8-9917-4af6-9a13-2e08ab2e7586" + +struct ras_non_standard_event { + char timestamp[64]; + const char *sec_type; + const uint8_t *error; + uint32_t length; +}; + +int ras_non_standard_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); + +int decode_hisi_common_section(struct ras_non_standard_event *event); + +#endif diff --git a/src/python/.gitignore b/src/python/.gitignore new file mode 100644 index 0000000..58200d4 --- /dev/null +++ b/src/python/.gitignore @@ -0,0 +1 @@ +__pycache__/ diff 
--git a/src/python/syssentry/bmc_alarm.py b/src/python/syssentry/bmc_alarm.py new file mode 100644 index 0000000..5956538 --- /dev/null +++ b/src/python/syssentry/bmc_alarm.py @@ -0,0 +1,159 @@ +import logging +import socket +from enum import Enum + +from .utils import execute_command + +HEX_CHAR_LEN = 2 +SOCKET_RECEIVE_LEN = 128 +BMC_DATA_HEAD = "REP" +BMC_REPORT_TYPE_BIT = 0 +HBMC_REPAIR_TYPE_BIT = 1 +HBMC_REPAIR_RESULT_BIT = 2 +HBMC_ISOLATION_TYPE_BIT = 3 +HBMC_SEND_HEAD_LEN = 4 # "ipmitool", "raw", "0x30", "0x92" +HBMC_SEND_ROW_BIT = 26 + HBMC_SEND_HEAD_LEN +HBMC_SEND_COL_BIT = 30 + HBMC_SEND_HEAD_LEN +HBMC_REPAIR_TYPE_OFFSET = 7 + +HBMC_SEND_SUCCESS_CODE = "db 07 00" + + +class ReportType(Enum): + HBMC_REPAIR_BMC = 0x00 + + +class HBMCRepairType(Enum): + CE_ACLS = 7 + PS_UCE_ACLS = 8 + CE_SPPR = 9 + PS_UCE_SPPR = 10 + + +class HBMCRepairResultType(Enum): + ISOLATE_FAILED_OVER_THRESHOLD = 0b10000001 + ISOLATE_FAILED_OTHER_REASON = 0b10000010 + REPAIR_FAILED_NO_RESOURCE = 0b10010100 + REPAIR_FAILED_INVALID_PARAM = 0b10011000 + REPAIR_FAILED_OTHER_REASON = 0b10011100 + ONLINE_PAGE_FAILED = 0b10100000 + ISOLATE_REPAIR_ONLINE_SUCCESS = 0b00000000 + + +class HBMCIsolationType(Enum): + ROW_FAULT = 1 + SINGLE_ADDR_FAULT = 6 + + +def find_value_is_in_enum(value: int, enum: Enum): + for item in enum: + if value == item.value: + return True + return False + + +def convert_hex_char_to_int(data, bit): + if len(data) < (bit+1)*HEX_CHAR_LEN: + logging.error(f"Data {data} len is too short, current convert bit is {bit}") + char = data[bit*HEX_CHAR_LEN:(bit+1)*HEX_CHAR_LEN] + try: + value = int(char, 16) + except ValueError: + logging.error(f"Cannot convert char [{char}] to int") + raise ValueError + return value + + +def reverse_byte(data): + return data[3], data[2], data[1], data[0] + + +def parse_hbmc_report(data: str): + logging.debug(f"bmc receive raw data is {data}") + repair_type = convert_hex_char_to_int(data, HBMC_REPAIR_TYPE_BIT) + repair_type +=
HBMC_REPAIR_TYPE_OFFSET + if not find_value_is_in_enum(repair_type, HBMCRepairType): + logging.warning(f"HBMC msg repair type ({repair_type}) is unknown") + raise ValueError + + repair_result = convert_hex_char_to_int(data, HBMC_REPAIR_RESULT_BIT) + if not find_value_is_in_enum(repair_result, HBMCRepairResultType): + logging.warning(f"HBMC msg repair result ({repair_result}) is unknown") + raise ValueError + + isolation_type = convert_hex_char_to_int(data, HBMC_ISOLATION_TYPE_BIT) + if not find_value_is_in_enum(isolation_type, HBMCIsolationType): + logging.warning(f"HBMC msg isolation type ({isolation_type}) is unknown") + raise ValueError + + cmd_list = [ + "ipmitool", + "raw", + "0x30", # Netfn + "0x92", # cmd + "0xdb", + "0x07", + "0x00", + "0x65", # sub command + "0x01", # SystemId + "0x00", # LocalSystemId + "{:#04X}".format(repair_type), + "{:#04X}".format(repair_result), + "{:#04X}".format(isolation_type), + ] + # send the remain data directly + data = data[(HBMC_ISOLATION_TYPE_BIT + 1) * HEX_CHAR_LEN:] + other_info_str = [] + for i in range(len(data) // 2): + other_info_str.append("{:#04X}".format(convert_hex_char_to_int(data, i))) + cmd_list.extend(other_info_str) + + cmd_list[HBMC_SEND_ROW_BIT:HBMC_SEND_ROW_BIT + 4] = reverse_byte(cmd_list[HBMC_SEND_ROW_BIT:HBMC_SEND_ROW_BIT + 4]) + cmd_list[HBMC_SEND_COL_BIT:HBMC_SEND_COL_BIT + 4] = reverse_byte(cmd_list[HBMC_SEND_COL_BIT:HBMC_SEND_COL_BIT + 4]) + + logging.info(f"Send bmc alarm command is {cmd_list}") + + ret = execute_command(cmd_list) + if HBMC_SEND_SUCCESS_CODE not in ret: + logging.warning(f"Send bmc alarm failed, error code is {ret}") + raise ValueError + logging.debug("Send bmc alarm success") + + +PARSE_REPORT_MSG_FUNC_DICT = { + ReportType.HBMC_REPAIR_BMC.value: parse_hbmc_report, +} + + +def bmc_recv(server_socket: socket.socket): + logging.debug("Get hbm socket connection request") + try: + client_socket, _ = server_socket.accept() + logging.debug("cpu alarm fd listen ok") + + data = 
client_socket.recv(SOCKET_RECEIVE_LEN) + data = data.decode() + + data_head = data[0:len(BMC_DATA_HEAD)] + if data_head != BMC_DATA_HEAD: + logging.warning(f"The head of the msg is incorrect, head is {data_head}") + raise ValueError + + # remove the data head + data = data[len(BMC_DATA_HEAD):] + logging.info(f"Remove head data is {data}") + + report_type = convert_hex_char_to_int(data, BMC_REPORT_TYPE_BIT) + if report_type not in PARSE_REPORT_MSG_FUNC_DICT.keys(): + logging.warning(f"The type of the msg ({report_type}) is unknown") + raise ValueError + + PARSE_REPORT_MSG_FUNC_DICT[report_type](data) + + except socket.error: + logging.error("socket error") + return + except (ValueError, OSError, UnicodeError, TypeError, NotImplementedError): + logging.error("server recv bmc msg failed!") + client_socket.close() + return diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py index ea09095..3829849 100644 --- a/src/python/syssentry/syssentry.py +++ b/src/python/syssentry/syssentry.py @@ -48,6 +48,12 @@ try: except ImportError: CPU_EXIST = False +BMC_EXIST = True +try: + from .bmc_alarm import bmc_recv +except ImportError: + BMC_EXIST = False + INSPECTOR = None @@ -89,6 +95,9 @@ RESULT_SOCKET_PATH = "/var/run/sysSentry/result.sock" CPU_ALARM_SOCKET_PATH = "/var/run/sysSentry/report.sock" +BMC_SOCKET_PATH = "/var/run/sysSentry/bmc.sock" + +fd_list = [] def msg_data_process(msg_data): """message data process""" @@ -334,6 +343,41 @@ def cpu_alarm_fd_create(): return cpu_alarm_fd +def bmc_fd_create(): + """create bmc fd""" + if not os.path.exists(SENTRY_RUN_DIR): + logging.debug("%s not exist", SENTRY_RUN_DIR) + return None + + try: + bmc_fd = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + except socket.error: + logging.error("bmc fd create failed") + return None + + bmc_fd.setblocking(False) + if os.path.exists(BMC_SOCKET_PATH): + os.remove(BMC_SOCKET_PATH) + + try: + bmc_fd.bind(BMC_SOCKET_PATH) + except OSError: + logging.error("bmc fd 
bind failed") + bmc_fd.close() + return None + + os.chmod(BMC_SOCKET_PATH, 0o600) + try: + bmc_fd.listen(5) + except OSError: + logging.error("bmc fd listen failed") + bmc_fd.close() + return None + + logging.debug("%s bind and listen", BMC_SOCKET_PATH) + + return bmc_fd + def server_result_recv(server_socket: socket.socket): """server result receive""" @@ -407,35 +451,47 @@ def server_result_fd_create(): return server_result_fd +def close_all_fd(): + for fd in fd_list: + fd.close() + + def main_loop(): """main loop""" + server_fd = server_fd_create() if not server_fd: + close_all_fd() return + fd_list.append(server_fd) server_result_fd = server_result_fd_create() if not server_result_fd: - server_fd.close() + close_all_fd() return + fd_list.append(server_result_fd) heartbeat_fd = heartbeat_fd_create() if not heartbeat_fd: - server_fd.close() - server_result_fd.close() + close_all_fd() return + fd_list.append(heartbeat_fd) cpu_alarm_fd = cpu_alarm_fd_create() if not cpu_alarm_fd: - server_fd.close() - heartbeat_fd.close() - server_result_fd.close() + close_all_fd() + return + fd_list.append(cpu_alarm_fd) + + bmc_fd = bmc_fd_create() + if not bmc_fd: + close_all_fd() return + fd_list.append(bmc_fd) epoll_fd = select.epoll() - epoll_fd.register(server_fd.fileno(), select.EPOLLIN) - epoll_fd.register(server_result_fd.fileno(), select.EPOLLIN) - epoll_fd.register(heartbeat_fd.fileno(), select.EPOLLIN) - epoll_fd.register(cpu_alarm_fd.fileno(), select.EPOLLIN) + for fd in fd_list: + epoll_fd.register(fd.fileno(), select.EPOLLIN) logging.debug("start main loop") # onstart_tasks_handle() @@ -458,6 +514,8 @@ def main_loop(): heartbeat_recv(heartbeat_fd) elif CPU_EXIST and event_fd == cpu_alarm_fd.fileno(): cpu_alarm_recv(cpu_alarm_fd) + elif BMC_EXIST and event_fd == bmc_fd.fileno(): + bmc_recv(bmc_fd) else: continue -- 2.27.0