Skip to content

Commit

Permalink
ras-arm-handler: Parse and log ARM Processor Error Info table
Browse files Browse the repository at this point in the history
Parse and log ARM Processor Error Info table data, UEFI 2.9A/2.10
specs section N.2.4.4.1.

[mchehab: fix a typo]
Suggested-by: Mauro Carvalho Chehab <[email protected]>
Signed-off-by: Shiju Jose <[email protected]>
Signed-off-by: Mauro Carvalho Chehab <[email protected]>
  • Loading branch information
shijujose4 authored and mchehab committed Jul 17, 2024
1 parent 126561e commit d0773a8
Show file tree
Hide file tree
Showing 3 changed files with 349 additions and 0 deletions.
334 changes: 334 additions & 0 deletions ras-arm-handler.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,61 @@
/* NOTE(review): the users of the next two macros are outside this view. */
#define ARM_ERR_VALID_FLAGS BIT(1)
#define BIT2 2

/*
 * Validation bits of one ARM Processor Error Information entry; they are
 * tested against the entry's validation_bits field before the matching
 * field is decoded.
 * NOTE(review): ARM_INFO_VALID_MULTI_ERR is not referenced in the visible
 * code; the error-count check uses ARM_ERR_VALID_ERROR_COUNT instead.
 */
#define ARM_INFO_VALID_MULTI_ERR BIT(0)
#define ARM_INFO_VALID_FLAGS BIT(1)
#define ARM_INFO_VALID_ERR_INFO BIT(2)
#define ARM_INFO_VALID_VIRT_ADDR BIT(3)
#define ARM_INFO_VALID_PHYSICAL_ADDR BIT(4)

/*
 * Presumably the bits of the entry's flags field (same order as
 * arm_proc_error_flags_strs[]); not referenced in the visible code.
 */
#define ARM_INFO_FLAGS_FIRST BIT(0)
#define ARM_INFO_FLAGS_LAST BIT(1)
#define ARM_INFO_FLAGS_PROPAGATED BIT(2)
#define ARM_INFO_FLAGS_OVERFLOW BIT(3)

/* Error type bits, tested against the entry's type field. */
#define ARM_ERR_TYPE_MASK 0x1E /* GENMASK(4,1) */
#define ARM_CACHE_ERROR BIT(1)
#define ARM_TLB_ERROR BIT(2)
#define ARM_BUS_ERROR BIT(3)
#define ARM_VENDOR_ERROR BIT(4)

/*
 * Validity flags held in the low bits of the 64-bit error information
 * value; each one gates the decoding of the field of the same name below.
 */
#define ARM_ERR_VALID_TRANSACTION_TYPE BIT(0)
#define ARM_ERR_VALID_OPERATION_TYPE BIT(1)
#define ARM_ERR_VALID_LEVEL BIT(2)
#define ARM_ERR_VALID_PROC_CONTEXT_CORRUPT BIT(3)
#define ARM_ERR_VALID_CORRECTED BIT(4)
#define ARM_ERR_VALID_PRECISE_PC BIT(5)
#define ARM_ERR_VALID_RESTARTABLE_PC BIT(6)
#define ARM_ERR_VALID_PARTICIPATION_TYPE BIT(7)
#define ARM_ERR_VALID_TIME_OUT BIT(8)
#define ARM_ERR_VALID_ADDRESS_SPACE BIT(9)
#define ARM_ERR_VALID_MEM_ATTRIBUTES BIT(10)
#define ARM_ERR_VALID_ACCESS_MODE BIT(11)

/*
 * Field layout of the error information value. Each field is extracted as
 * (error_info >> <FIELD>_SHIFT) & <FIELD>_MASK, so every mask is given
 * relative to bit 0 of the already shifted value.
 */
#define ARM_ERR_TRANSACTION_SHIFT 16
#define ARM_ERR_TRANSACTION_MASK 0x3 /* GENMASK(1,0) */
#define ARM_ERR_OPERATION_SHIFT 18
#define ARM_ERR_OPERATION_MASK 0xF /* GENMASK(3,0) */
#define ARM_ERR_LEVEL_SHIFT 22
#define ARM_ERR_LEVEL_MASK 0x7 /* GENMASK(2,0) */
#define ARM_ERR_PC_CORRUPT_SHIFT 25
#define ARM_ERR_PC_CORRUPT_MASK 0x1 /* GENMASK(0,0) */
#define ARM_ERR_CORRECTED_SHIFT 26
#define ARM_ERR_CORRECTED_MASK 0x1 /* GENMASK(0,0) */
#define ARM_ERR_PRECISE_PC_SHIFT 27
#define ARM_ERR_PRECISE_PC_MASK 0x1 /* GENMASK(0,0) */
#define ARM_ERR_RESTARTABLE_PC_SHIFT 28
#define ARM_ERR_RESTARTABLE_PC_MASK 0x1 /* GENMASK(0,0) */
/* The remaining fields are only decoded for bus errors. */
#define ARM_ERR_PARTICIPATION_TYPE_SHIFT 29
#define ARM_ERR_PARTICIPATION_TYPE_MASK 0x3 /* GENMASK(1,0) */
#define ARM_ERR_TIME_OUT_SHIFT 31
#define ARM_ERR_TIME_OUT_MASK 0x1 /* GENMASK(0,0) */
#define ARM_ERR_ADDRESS_SPACE_SHIFT 32
#define ARM_ERR_ADDRESS_SPACE_MASK 0x3 /* GENMASK(1,0) */
#define ARM_ERR_MEM_ATTRIBUTES_SHIFT 34
#define ARM_ERR_MEM_ATTRIBUTES_MASK 0x1FF /* GENMASK(8,0) */
#define ARM_ERR_ACCESS_MODE_SHIFT 43
#define ARM_ERR_ACCESS_MODE_MASK 0x1 /* GENMASK(0,0) */

void display_raw_data(struct trace_seq *s,
const uint8_t *buf,
uint32_t datalen)
Expand All @@ -48,6 +103,283 @@ void display_raw_data(struct trace_seq *s,
}
}

/*
 * Names of the ARM processor error types, indexed by bit position in the
 * type field of an error info entry. No error type is assigned to bit 0
 * (ARM_CACHE_ERROR starts at BIT(1)), hence the empty first entry.
 */
static const char * const arm_proc_error_type_strs[] = {
	[0] = "",
	[1] = "cache error",
	[2] = "TLB error",
	[3] = "bus error",
	[4] = "micro-architectural error",
};

/*
 * Names of the error flags, indexed by bit position in the flags field of
 * an error info entry (same order as the ARM_INFO_FLAGS_* bits).
 *
 * The entries carry no leading or trailing blanks: the separator between
 * names is supplied by the " %s" format used when the bits are decoded.
 */
static const char * const arm_proc_error_flags_strs[] = {
	[0] = "first error",
	[1] = "last error",
	[2] = "propagated error",
	[3] = "overflow",
};

/*
 * Transaction type names, indexed by the transaction type value taken
 * from the error information field. The field is two bits wide, so a
 * value of 3 has no name here and callers must bounds-check.
 */
static const char * const arm_err_trans_type_strs[] = {
	[0] = "Instruction",
	[1] = "Data Access",
	[2] = "Generic",
};

/*
 * Operation type names for bus errors, indexed by the operation type value
 * taken from the error information field. The field is four bits wide;
 * values without an entry here are not printed by the caller.
 */
static const char * const arm_bus_err_op_strs[] = {
	[0] = "Generic error (type cannot be determined)",
	[1] = "Generic read (type of instruction or data request cannot be determined)",
	[2] = "Generic write (type of instruction or data request cannot be determined)",
	[3] = "Data read",
	[4] = "Data write",
	[5] = "Instruction fetch",
	[6] = "Prefetch",
};

/*
 * Operation type names for cache errors, indexed by the operation type
 * value taken from the error information field. The field is four bits
 * wide; values without an entry here are not printed by the caller.
 */
static const char * const arm_cache_err_op_strs[] = {
	[0] = "Generic error (type cannot be determined)",
	[1] = "Generic read (type of instruction or data request cannot be determined)",
	[2] = "Generic write (type of instruction or data request cannot be determined)",
	[3] = "Data read",
	[4] = "Data write",
	[5] = "Instruction fetch",
	[6] = "Prefetch",
	[7] = "Eviction",
	[8] = "Snooping (processor initiated a cache snoop that resulted in an error)",
	[9] = "Snooped (processor raised a cache error caused by another processor or device snooping its cache)",
	[10] = "Management",
};

/*
 * Operation type names for TLB errors, indexed by the operation type value
 * taken from the error information field. The field is four bits wide;
 * values without an entry here are not printed by the caller.
 */
static const char * const arm_tlb_err_op_strs[] = {
	[0] = "Generic error (type cannot be determined)",
	[1] = "Generic read (type of instruction or data request cannot be determined)",
	[2] = "Generic write (type of instruction or data request cannot be determined)",
	[3] = "Data read",
	[4] = "Data write",
	[5] = "Instruction fetch",
	[6] = "Prefetch",
	[7] = "Local management operation (processor initiated a TLB management operation that resulted in an error)",
	[8] = "External management operation (processor raised a TLB error caused by another processor or device broadcasting TLB operations)",
};

/*
 * Participation type names for bus errors, indexed by the two-bit
 * participation type value taken from the error information field.
 */
static const char * const arm_bus_err_part_type_strs[] = {
	[0] = "Local processor originated request",
	[1] = "Local processor responded to request",
	[2] = "Local processor observed",
	[3] = "Generic",
};

/*
 * Address space names for bus errors, indexed by the two-bit address
 * space value taken from the error information field.
 */
static const char * const arm_bus_err_addr_space_strs[] = {
	[0] = "External Memory Access",
	[1] = "Internal Memory Access",
	[2] = "Unknown",
	[3] = "Device Memory Access",
};

/*
 * decode_err_data_bits - write the name of every bit set in a bit field
 * @buf:      destination string, filled through mce_snprintf()
 * @data:     bit field to decode
 * @data_str: table of names, indexed by bit position
 * @str_size: number of entries in @data_str; bits at or above this index
 *            are ignored
 *
 * Returns 0 on success, or -1 if @buf or @data_str is NULL or @str_size
 * is 0.
 *
 * NOTE(review): mce_snprintf() is defined outside this view. If it derives
 * the destination capacity from sizeof(buf), that yields the size of a
 * pointer here rather than the size of the caller's array - confirm.
 */
static int decode_err_data_bits(char *buf, unsigned long data,
				const char **data_str, size_t str_size)
{
	/* size_t matches @str_size and avoids a signed/unsigned comparison */
	size_t bit;

	if (!buf || !data_str || !str_size)
		return -1;

	for (bit = 0; bit < str_size; bit++) {
		if (data & BIT(bit))
			mce_snprintf(buf, " %s", data_str[bit]);
	}

	return 0;
}

static void parse_arm_err_info(struct trace_seq *s, uint32_t type, uint64_t error_info)
{
uint8_t trans_type, op_type, level, participation_type, address_space;
uint16_t mem_attributes;
bool proc_context_corrupt, corrected, precise_pc, restartable_pc;
bool time_out, access_mode;

/*
* Vendor type errors have error information values that are vendor
* specific.
*/
if (type & ARM_VENDOR_ERROR)
return;

if (error_info & ARM_ERR_VALID_TRANSACTION_TYPE) {
trans_type = ((error_info >> ARM_ERR_TRANSACTION_SHIFT)
& ARM_ERR_TRANSACTION_MASK);
if (trans_type < ARRAY_SIZE(arm_err_trans_type_strs))
trace_seq_printf(s, " transaction type:%s",
arm_err_trans_type_strs[trans_type]);
}

if (error_info & ARM_ERR_VALID_OPERATION_TYPE) {
op_type = ((error_info >> ARM_ERR_OPERATION_SHIFT)
& ARM_ERR_OPERATION_MASK);
if (type & ARM_CACHE_ERROR) {
if (op_type < ARRAY_SIZE(arm_cache_err_op_strs))
trace_seq_printf(s, " cache error, operation type:%s",
arm_cache_err_op_strs[op_type]);
}
if (type & ARM_TLB_ERROR) {
if (op_type < ARRAY_SIZE(arm_tlb_err_op_strs)) {
trace_seq_printf(s, " TLB error, operation type: %s",
arm_tlb_err_op_strs[op_type]);
}
}
if (type & ARM_BUS_ERROR) {
if (op_type < ARRAY_SIZE(arm_bus_err_op_strs)) {
trace_seq_printf(s, " bus error, operation type: %s",
arm_bus_err_op_strs[op_type]);
}
}
}

if (error_info & ARM_ERR_VALID_LEVEL) {
level = ((error_info >> ARM_ERR_LEVEL_SHIFT)
& ARM_ERR_LEVEL_MASK);
if (type & ARM_CACHE_ERROR)
trace_seq_printf(s, " cache level: %d", level);

if (type & ARM_TLB_ERROR)
trace_seq_printf(s, " TLB level: %d", level);

if (type & ARM_BUS_ERROR)
trace_seq_printf(s, " affinity level at which the bus error occurred: %d",
level);
}

if (error_info & ARM_ERR_VALID_PROC_CONTEXT_CORRUPT) {
proc_context_corrupt = ((error_info >> ARM_ERR_PC_CORRUPT_SHIFT)
& ARM_ERR_PC_CORRUPT_MASK);
if (proc_context_corrupt)
trace_seq_printf(s, " processor context corrupted");
else
trace_seq_printf(s, " processor context not corrupted");
}

if (error_info & ARM_ERR_VALID_CORRECTED) {
corrected = ((error_info >> ARM_ERR_CORRECTED_SHIFT)
& ARM_ERR_CORRECTED_MASK);
if (corrected)
trace_seq_printf(s, " the error has been corrected");
else
trace_seq_printf(s, " the error has not been corrected");
}

if (error_info & ARM_ERR_VALID_PRECISE_PC) {
precise_pc = ((error_info >> ARM_ERR_PRECISE_PC_SHIFT)
& ARM_ERR_PRECISE_PC_MASK);
if (precise_pc)
trace_seq_printf(s, " PC is precise");
else
trace_seq_printf(s, " PC is imprecise");
}

if (error_info & ARM_ERR_VALID_RESTARTABLE_PC) {
restartable_pc = ((error_info >> ARM_ERR_RESTARTABLE_PC_SHIFT)
& ARM_ERR_RESTARTABLE_PC_MASK);
if (restartable_pc)
trace_seq_printf(s, " Program execution can be restarted reliably at the PC associated with the error");
}

/* The rest of the fields are specific to bus errors */
if (type != ARM_BUS_ERROR)
return;

if (error_info & ARM_ERR_VALID_PARTICIPATION_TYPE) {
participation_type = ((error_info >> ARM_ERR_PARTICIPATION_TYPE_SHIFT)
& ARM_ERR_PARTICIPATION_TYPE_MASK);
if (participation_type < ARRAY_SIZE(arm_bus_err_part_type_strs)) {
trace_seq_printf(s, " participation type: %s",
arm_bus_err_part_type_strs[participation_type]);
}
}

if (error_info & ARM_ERR_VALID_TIME_OUT) {
time_out = ((error_info >> ARM_ERR_TIME_OUT_SHIFT)
& ARM_ERR_TIME_OUT_MASK);
if (time_out)
trace_seq_printf(s, " request timed out");
}

if (error_info & ARM_ERR_VALID_ADDRESS_SPACE) {
address_space = ((error_info >> ARM_ERR_ADDRESS_SPACE_SHIFT)
& ARM_ERR_ADDRESS_SPACE_MASK);
if (address_space < ARRAY_SIZE(arm_bus_err_addr_space_strs)) {
trace_seq_printf(s, " address space: %s",
arm_bus_err_addr_space_strs[address_space]);
}
}

if (error_info & ARM_ERR_VALID_MEM_ATTRIBUTES) {
mem_attributes = ((error_info >> ARM_ERR_MEM_ATTRIBUTES_SHIFT)
& ARM_ERR_MEM_ATTRIBUTES_MASK);
trace_seq_printf(s, " memory access attributes:0x%x",
mem_attributes);
}

if (error_info & ARM_ERR_VALID_ACCESS_MODE) {
access_mode = ((error_info >> ARM_ERR_ACCESS_MODE_SHIFT)
& ARM_ERR_ACCESS_MODE_MASK);
if (access_mode)
trace_seq_printf(s, " access mode: normal");
else
trace_seq_printf(s, " access mode: secure");
}
}

static int parse_arm_processor_err_info(struct trace_seq *s, struct ras_arm_event *ev)
{
int err_info_size = sizeof(struct ras_arm_err_info);
struct ras_arm_err_info *err_info;
int i, num_pei;

if (ev->pei_len % err_info_size != 0) {
log(TERM, LOG_ERR,
"The event data does not match to the ARM Processor Error Information Structure\n");
return -1;
}
num_pei = ev->pei_len / err_info_size;
err_info = (struct ras_arm_err_info *)(ev->pei_error);

trace_seq_printf(s, "\nARM processor error info:\n");
for (i = 0; i < num_pei; ++i) {
decode_err_data_bits(ev->error_types, err_info->type,
(const char **)arm_proc_error_type_strs,
ARRAY_SIZE(arm_proc_error_type_strs));
trace_seq_printf(s, " error_types:%s", ev->error_types);

if (err_info->validation_bits & ARM_ERR_VALID_ERROR_COUNT) {
ev->error_count = err_info->multiple_error + 1;
trace_seq_printf(s, " error_count:%d", ev->error_count);
}
if (err_info->validation_bits & ARM_INFO_VALID_FLAGS) {
decode_err_data_bits(ev->error_flags, err_info->flags,
(const char **)arm_proc_error_flags_strs,
ARRAY_SIZE(arm_proc_error_flags_strs));
trace_seq_printf(s, " error_flags:%s", ev->error_flags);
}
if (err_info->validation_bits & ARM_INFO_VALID_ERR_INFO) {
ev->error_info = err_info->error_info;
trace_seq_printf(s, " error_info: 0x%016llx",
(unsigned long long)ev->error_info);
parse_arm_err_info(s, err_info->type, ev->error_info);
}
if (err_info->validation_bits & ARM_INFO_VALID_VIRT_ADDR) {
ev->virt_fault_addr = err_info->virt_fault_addr;
trace_seq_printf(s, " virtual fault address: 0x%016llx",
(unsigned long long)err_info->virt_fault_addr);
}
if (err_info->validation_bits & ARM_INFO_VALID_PHYSICAL_ADDR) {
ev->phy_fault_addr = err_info->physical_fault_addr;
trace_seq_printf(s, " physical fault address: 0x%016llx",
(unsigned long long)err_info->physical_fault_addr);
}
trace_seq_printf(s, "\n");
err_info += 1;
}

return 0;
}

#ifdef HAVE_CPU_FAULT_ISOLATION
static int is_core_failure(struct ras_arm_err_info *err_info)
{
Expand Down Expand Up @@ -226,6 +558,8 @@ int ras_arm_event_handler(struct trace_seq *s,
}
display_raw_data(s, ev.pei_error, ev.pei_len);

parse_arm_processor_err_info(s, &ev);

if (tep_get_field_val(s, event, "ctx_len", record, &val, 1) < 0)
return -1;
ev.ctx_len = val;
Expand Down
10 changes: 10 additions & 0 deletions ras-record.c
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,11 @@ static const struct db_fields arm_event_fields[] = {
{ .name = "err_info", .type = "BLOB" },
{ .name = "context_info", .type = "BLOB" },
{ .name = "vendor_info", .type = "BLOB" },
{ .name = "error_type", .type = "TEXT" },
{ .name = "error_flags", .type = "TEXT" },
{ .name = "error_info", .type = "INTEGER" },
{ .name = "virt_fault_addr", .type = "INTEGER" },
{ .name = "phy_fault_addr", .type = "INTEGER" },
};

static const struct db_table_descriptor arm_event_tab = {
Expand Down Expand Up @@ -244,6 +249,11 @@ int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev)
ev->ctx_error, ev->ctx_len, NULL);
sqlite3_bind_blob(priv->stmt_arm_record, 9,
ev->vsei_error, ev->oem_len, NULL);
sqlite3_bind_text(priv->stmt_arm_record, 10, ev->error_types, -1, NULL);
sqlite3_bind_text(priv->stmt_arm_record, 11, ev->error_flags, -1, NULL);
sqlite3_bind_int64(priv->stmt_arm_record, 12, ev->error_info);
sqlite3_bind_int64(priv->stmt_arm_record, 13, ev->virt_fault_addr);
sqlite3_bind_int64(priv->stmt_arm_record, 14, ev->phy_fault_addr);

rc = sqlite3_step(priv->stmt_arm_record);
if (rc != SQLITE_OK && rc != SQLITE_DONE)
Expand Down
5 changes: 5 additions & 0 deletions ras-record.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,11 @@ struct ras_arm_event {
uint32_t ctx_len;
const uint8_t *vsei_error;
uint32_t oem_len;
char error_types[512];
char error_flags[512];
uint64_t error_info;
uint64_t virt_fault_addr;
uint64_t phy_fault_addr;
};

struct devlink_event {
Expand Down

0 comments on commit d0773a8

Please sign in to comment.