From 85d6dae9d7c6148f2699ef7da7d2d784043a2ee1 Mon Sep 17 00:00:00 2001 From: luckky Date: Wed, 30 Oct 2024 10:41:11 +0800 Subject: [PATCH] fix hbm online repair notice and efi create --- src/c/hbm_online_repair/hbm_online_repair.c | 5 +- .../non-standard-hbm-repair.c | 194 +++++++++--------- .../non-standard-hbm-repair.h | 2 +- src/c/hbm_online_repair/ras-events.c | 1 - .../ras-non-standard-handler.c | 33 +-- .../ras-non-standard-handler.h | 1 + 6 files changed, 116 insertions(+), 120 deletions(-) diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c index 3ace206..b3b2742 100644 --- a/src/c/hbm_online_repair/hbm_online_repair.c +++ b/src/c/hbm_online_repair/hbm_online_repair.c @@ -127,10 +127,7 @@ int main(int argc, char *argv[]) return -1; } - ret = init_all_flash(); - if (ret < 0) { - log(LOG_ERROR, "flash writer init failed\n"); - } + get_flash_total_size(); handle_ras_events(ras); diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c index b175e14..f26d8ae 100644 --- a/src/c/hbm_online_repair/non-standard-hbm-repair.c +++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c @@ -15,7 +15,7 @@ #include "non-standard-hbm-repair.h" extern int page_isolation_threshold; -size_t total_size = 0; +size_t flash_total_size = 0; struct hisi_common_error_section { uint32_t val_bits; uint8_t version; @@ -122,28 +122,58 @@ static void parse_fault_addr_info(struct fault_addr_info* info_struct, unsigned info_struct->crc8 = (uint32_t)fault_addr; } -static bool variable_existed(char *name, char *guid) +static bool is_variable_existing(char *name, char *guid) { + char filename[PATH_MAX]; + snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); + + return access(filename, F_OK | R_OK) == 0; +} + +static size_t get_var_size(char *name, char *guid) { char filename[PATH_MAX]; int fd; + struct stat stat; snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); // open var file fd = open(filename, O_RDONLY); if (fd < 0) { - log(LOG_WARNING, "open file %s failed\n", filename); - return false; + log(LOG_WARNING, "open %s failed\n", filename); + goto err; + } + // read stat + if (fstat(fd, &stat) != 0) { + log(LOG_WARNING, "fstat %s failed\n", filename); + goto err; } close(fd); - return true; + return stat.st_size; +err: + if (fd >= 0) + close(fd); + return (size_t)-1; } -static uint32_t read_variable_attribute(char *name, char *guid) { +void get_flash_total_size() { + for (int i = 0; i < FLASH_ENTRY_NUM; i++) { + if (is_variable_existing(flash_names[i], flash_guids[i])) { + flash_total_size += get_var_size(flash_names[i], flash_guids[i]); + } + } + // check total entry size + log(LOG_DEBUG, "current fault info total size: %luKB, flash max threshold: %uKB\n", + flash_total_size / KB_SIZE, MAX_VAR_SIZE / KB_SIZE); + if (flash_total_size > MAX_VAR_SIZE) { + log(LOG_WARNING, "fault info storage %zu reach threshold, cannot save new record\n", flash_total_size); + } +} + +static int read_variable_attribute(char *name, char *guid, uint32_t *attribute) { char filename[PATH_MAX]; int fd; size_t readsize; - uint32_t attribute = (uint32_t)-1; snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); @@ -151,17 +181,18 @@ static uint32_t read_variable_attribute(char *name, char *guid) { fd = open(filename, O_RDONLY); if (fd < 0) { log(LOG_ERROR, "open %s failed\n", filename); - return attribute; + return -1; } // read attributes from first 4 bytes - readsize = read(fd, &attribute, sizeof(uint32_t)); + readsize = read(fd, attribute, sizeof(uint32_t)); if (readsize != sizeof(uint32_t)) { log(LOG_ERROR, "read attribute of %s failed\n", filename); + return -1; } close(fd); - return attribute; + return 0; } static int efivarfs_set_mutable(char *name, char *guid, bool mutable) @@ -205,8 +236,8 @@ err: return -1; } -static int write_variable(char *name, char *guid, void *value, unsigned long size, uint32_t attribute) { - int fd, mode; +static int write_variable(char *name, char *guid, void *value, unsigned long size, uint32_t attribute, bool is_existing) { + int fd = -1, mode; size_t writesize; void *buffer; unsigned long total; @@ -225,16 +256,13 @@ static int write_variable(char *name, char *guid, void *value, unsigned long siz memcpy(buffer + sizeof(uint32_t), value, size); // change attr - if (efivarfs_set_mutable(name, guid, 1) != 0) { + if (is_existing && efivarfs_set_mutable(name, guid, 1) != 0) { log(LOG_ERROR, "set mutable for %s failed\n", filename); goto err; } mode = O_WRONLY; - if (attribute & EFI_VARIABLE_APPEND_WRITE) - mode |= O_APPEND; - else - mode |= O_CREAT; + mode |= is_existing ? O_APPEND : O_CREAT; // open var file fd = open(filename, mode, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); @@ -252,7 +280,7 @@ static int write_variable(char *name, char *guid, void *value, unsigned long siz close(fd); free(buffer); - if (efivarfs_set_mutable(name, guid, 0) != 0) { + if (is_existing && efivarfs_set_mutable(name, guid, 0) != 0) { log(LOG_ERROR, "set immutable for %s failed\n", filename); } return 0; @@ -261,86 +289,21 @@ err: close(fd); if (buffer) free(buffer); - if (efivarfs_set_mutable(name, guid, 0) != 0) { + if (is_existing && efivarfs_set_mutable(name, guid, 0) != 0) { log(LOG_ERROR, "set immutable for %s failed\n", filename); } return -1; } -static int append_variable(char *name, char *guid, void *data, unsigned long size) { - // prepare append attribute - uint32_t attribute = read_variable_attribute(name, guid); - if (attribute == (uint32_t)-1) { - log(LOG_ERROR, "read %s-%s attribute failed\n", name, guid); - return -1; - } - attribute |= EFI_VARIABLE_APPEND_WRITE; - - return write_variable(name, guid, data, size, attribute); -} - -static size_t get_var_size(char *name, char *guid) { - char filename[PATH_MAX]; - int fd; - struct stat stat; - - snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); - - // open var file - fd = open(filename, O_RDONLY); - if (fd < 0) { - log(LOG_WARNING, "open %s failed\n", filename); - goto err; - } - // read stat - if (fstat(fd, &stat) != 0) { - log(LOG_WARNING, "fstat %s failed\n", filename); - goto err; - } - close(fd); - return stat.st_size; -err: - if (fd >= 0) - close(fd); - return (size_t)-1; -} - -int init_all_flash() { - for (int i = 0; i < FLASH_ENTRY_NUM; i++) { - // check existed entry - if (variable_existed(flash_names[i], flash_guids[i])) { - total_size += get_var_size(flash_names[i], flash_guids[i]); - continue; - } - // create new entry - uint32_t attribute = EFI_VARIABLE_NON_VOLATILE | - EFI_VARIABLE_BOOTSERVICE_ACCESS | - EFI_VARIABLE_RUNTIME_ACCESS; - char *data = ""; - unsigned long size = 1; - int ret = write_variable(flash_names[i], flash_guids[i], data, size, attribute); - if (ret) { - log(LOG_ERROR, "init %s-%s failed, fault info storage funtion not enabled\n", flash_names[i], flash_guids[i]); - return -1; - } - total_size += sizeof(uint32_t) + 1; - } - // check total entry size - log(LOG_DEBUG, "current fault info total size: %luKB, flash max threshold: %uKB\n", - total_size / KB_SIZE, MAX_VAR_SIZE / KB_SIZE); - if (total_size > MAX_VAR_SIZE) { - log(LOG_ERROR, "fault info storage reach threshold, cannot save new record\n"); - } - return 0; -} - static int write_fault_info_to_flash(const struct hisi_common_error_section *err) { int ret, guid_index; uint32_t reg_size; uint64_t fault_addr; + bool is_existing; + uint32_t attribute = -1; // check flash usage threshold - if (total_size + sizeof(uint64_t) > MAX_VAR_SIZE) { + if (flash_total_size + sizeof(uint64_t) > MAX_VAR_SIZE) { log(LOG_WARNING, "fault info storage reach threshold, cannot save new record into flash\n"); return -1; } @@ -359,14 +322,29 @@ static int write_fault_info_to_flash(const struct hisi_common_error_section *err log(LOG_ERROR, "invalid fault info\n"); return -1; } + + // judge if the efivar is existing to set the attribute + is_existing = is_variable_existing(flash_names[guid_index], flash_guids[guid_index]); + attribute = EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS; + if (is_existing) { + ret = read_variable_attribute(flash_names[guid_index], flash_guids[guid_index], &attribute); + if (ret < 0) { + log(LOG_ERROR, "read variable %s-%s attribute failed, stop writing\n", flash_names[guid_index], flash_guids[guid_index]); + return -1; + } + attribute |= EFI_VARIABLE_APPEND_WRITE; + } + // record physical addr in flash - ret = append_variable(flash_names[guid_index], flash_guids[guid_index], &fault_addr, sizeof(uint64_t)); + ret = write_variable(flash_names[guid_index], flash_guids[guid_index], &fault_addr, sizeof(uint64_t), attribute, is_existing); if (ret < 0) { - log(LOG_ERROR, "append to %s-%s failed\n", flash_names[guid_index], flash_guids[guid_index]); + log(LOG_ERROR, "write to %s-%s failed\n", flash_names[guid_index], flash_guids[guid_index]); return -1; } - total_size += sizeof(uint64_t); - log(LOG_INFO, "write hbm fault info to flash success\n"); + flash_total_size += sizeof(uint64_t); + log(LOG_INFO, "write hbm fault info to flash %s-%s success\n", flash_names[guid_index], flash_guids[guid_index]); return 0; } @@ -421,7 +399,7 @@ static int get_hardware_corrupted_size() return hardware_corrupted_size; } -static uint8_t get_repair_result_code(int ret) +static uint8_t get_repair_failed_result_code(int ret) { if (ret == -ENOSPC) { return REPAIR_FAILED_NO_RESOURCE; @@ -582,11 +560,11 @@ static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err) static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr) { int ret; - if (repair_ret < 0) { + if (repair_ret <= 0) { log(LOG_WARNING, "HBM %s: Keep page (0x%llx) offline\n", is_acls ? "ACLS" : "SPPR", paddr); /* not much we can do about errors here */ (void)write_file("/sys/kernel/page_eject", "remove_page", paddr); - return get_repair_result_code(repair_ret); + return get_repair_failed_result_code(repair_ret); } ret = write_file("/sys/kernel/page_eject", "online_page", paddr); @@ -615,9 +593,13 @@ static uint8_t hbmc_hbm_repair(const struct hisi_common_error_section *err, char err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_PSUE_ACLS; ret = write_file(path, is_acls ? "acls_query" : "sppr_query", paddr); - if (ret < 0) { - notice_BMC(err, get_repair_result_code(ret)); - log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR"); + + /* Only positive num means the error is supported to repair */ + if (ret <= 0) { + if (ret != -ENXIO) { + notice_BMC(err, get_repair_failed_result_code(ret)); + log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR"); + } return ret; } @@ -642,8 +624,9 @@ static uint8_t hbmc_hbm_repair(const struct hisi_common_error_section *err, char all_online_success = false; } } - if (ret < 0) { - notice_BMC(err, get_repair_result_code(ret)); + /* The ret is from the acls/sppr repair, and only positive num means the error is repaired successfully */ + if (ret <= 0) { + notice_BMC(err, get_repair_failed_result_code(ret)); return ret; } else if (all_online_success) { notice_BMC(err, ISOLATE_REPAIR_ONLINE_SUCCESS); @@ -698,7 +681,7 @@ static void hbm_repair_handler(const struct hisi_common_error_section *err) struct dirent *dent; DIR *dir; int ret; - bool find_device = false, find_hbm_mem = false; + bool find_device = false, find_hbm_mem = false, addr_in_hbm_device = false; ret = hbmc_hbm_page_isolate(err); if (ret < 0) { @@ -723,10 +706,13 @@ static void hbm_repair_handler(const struct hisi_common_error_section *err) if (hbmc_get_memory_type(path) == HBM_HBM_MEMORY) { find_hbm_mem = true; ret = hbmc_hbm_repair(err, path); - if (ret != -ENXIO) + if (ret != -ENXIO) { + addr_in_hbm_device = true; break; + } } } + if (!find_device) { log(LOG_ERROR, "Repair driver is not loaded, skip error, error_type is %u\n", err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); @@ -735,6 +721,10 @@ static void hbm_repair_handler(const struct hisi_common_error_section *err) log(LOG_ERROR, "No HBM device memory type found, skip error, error_type is %u\n", err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); notice_BMC(err, REPAIR_FAILED_OTHER_REASON); + } else if (!addr_in_hbm_device) { + log(LOG_ERROR, "Err addr is not in device, skip error, error_type is %u\n", + err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); + notice_BMC(err, REPAIR_FAILED_INVALID_PARAM); } closedir(dir); @@ -769,7 +759,7 @@ static bool hbm_repair_validate(const struct hisi_common_error_section *err) (err->reg_array_size == HBM_CACHE_ARRAY_SIZE); if (!(is_acls_valid || is_sppr_valid || is_cache_mode)) { - log(LOG_DEBUG, "err type (%u) is unknown or address array length (%u) is invalid\n", + log(LOG_WARNING, "err type (%u) is unknown or address array length (%u) is invalid\n", hbm_repair_reg_type, err->reg_array_size); return false; } diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.h b/src/c/hbm_online_repair/non-standard-hbm-repair.h index 7e8e448..ecb04fe 100644 --- a/src/c/hbm_online_repair/non-standard-hbm-repair.h +++ b/src/c/hbm_online_repair/non-standard-hbm-repair.h @@ -84,6 +84,6 @@ #define FLASH_ENTRY_NUM 8 #define KB_SIZE 1024 -extern int init_all_flash(); +extern void get_flash_total_size(); #endif diff --git a/src/c/hbm_online_repair/ras-events.c b/src/c/hbm_online_repair/ras-events.c index 0b12329..4d281ad 100644 --- a/src/c/hbm_online_repair/ras-events.c +++ b/src/c/hbm_online_repair/ras-events.c @@ -348,7 +348,6 @@ static int read_ras_event_all_cpus(struct pcpu_data *pdata, "Error on CPU %i\n", i); warnonce[i]++; } - continue; } if (!(fds[i].revents & POLLIN)) { count_nready++; diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.c b/src/c/hbm_online_repair/ras-non-standard-handler.c index 1d1fd04..48ffa70 100644 --- a/src/c/hbm_online_repair/ras-non-standard-handler.c +++ b/src/c/hbm_online_repair/ras-non-standard-handler.c @@ -7,17 +7,21 @@ #include "ras-non-standard-handler.h" #include "logger.h" -static char *uuid_le(const char *uu) +static int uuid_le(const char *uu, char* uuid) { - static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; if (!uu) { log(LOG_ERROR, "uuid_le failed: uu is empty"); - return uuid; + return -1; } size_t uu_len = strlen(uu); - if (uu_len < SECTION_TYPE_UUID_LEN) { - log(LOG_ERROR, "uuid_le failed: uu is too short"); - return uuid; + if (uu_len != SECTION_TYPE_UUID_LEN) { + log(LOG_ERROR, "uuid_le failed: uu len is incorrect"); + return -1; + } + size_t uuid_len = strlen(uuid); + if (uuid_len != strlen(UUID_STR_TYPE)) { + log(LOG_ERROR, "uuid_le failed: uuid len is incorrect"); + return -1; } char *p = uuid; @@ -38,7 +42,7 @@ static char *uuid_le(const char *uu) *p = 0; - return uuid; + return 0; } int ras_non_standard_event_handler(struct trace_seq *s, @@ -52,15 +56,20 @@ int ras_non_standard_event_handler(struct trace_seq *s, ev.sec_type = tep_get_field_raw(s, event, "sec_type", record, &len, 1); if(!ev.sec_type) { - log(LOG_WARNING, "get event section type failed"); + log(LOG_WARNING, "get event section type failed\n"); return -1; } trace_seq_printf(s, "\n"); - trace_seq_printf(s, "sec_type: %s\n", uuid_le(ev.sec_type)); + char uuid[sizeof(UUID_STR_TYPE)] = UUID_STR_TYPE; + if (uuid_le(ev.sec_type, uuid) < 0) { + log(LOG_WARNING, "get uuid failed\n"); + return -1; + } + trace_seq_printf(s, "sec_type: %s\n", uuid); if (tep_get_field_val(s, event, "len", record, &val, 1) < 0) { - log(LOG_WARNING, "tep get field val failed"); + log(LOG_WARNING, "tep get field val failed\n"); return -1; } @@ -69,11 +78,11 @@ int ras_non_standard_event_handler(struct trace_seq *s, ev.error = tep_get_field_raw(s, event, "buf", record, &len, 1); if(!ev.error || ev.length != len) { - log(LOG_WARNING, "get event error failed"); + log(LOG_WARNING, "get event error failed\n"); return -1; } - if (strcmp(uuid_le(ev.sec_type), HISI_COMMON_SECTION_TYPE_UUID) == 0) { + if (strcmp(uuid, HISI_COMMON_SECTION_TYPE_UUID) == 0) { decode_hisi_common_section(&ev); } diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.h b/src/c/hbm_online_repair/ras-non-standard-handler.h index 0272dc1..15a37ee 100644 --- a/src/c/hbm_online_repair/ras-non-standard-handler.h +++ b/src/c/hbm_online_repair/ras-non-standard-handler.h @@ -7,6 +7,7 @@ #define BIT(nr) (1UL << (nr)) #define SECTION_TYPE_UUID_LEN 16 +#define UUID_STR_TYPE "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" #define HISI_COMMON_SECTION_TYPE_UUID "c8b328a8-9917-4af6-9a13-2e08ab2e7586" struct ras_non_standard_event { -- 2.43.0