From 926c2b39c6386d0a1bf4232977f9fd7e37850361 Mon Sep 17 00:00:00 2001 From: Avadhut Naik Date: Tue, 21 Nov 2023 14:04:19 -0600 Subject: [PATCH] rasdaemon: Add support for vendor-specific machine check error information Some CPU vendors may provide additional vendor-specific machine check error information. AMD, for example, provides FRU Text through SYND 1/2 registers if BIT 9 of SMCA_CONFIG register is set. Add support to display the additional vendor-specific error information, if any. Signed-off-by: Avadhut Naik --- mce-amd-smca.c | 12 ++++++++++++ ras-mce-handler.c | 21 +++++++++++++++++++++ ras-mce-handler.h | 3 +++ 3 files changed, 36 insertions(+) diff --git a/mce-amd-smca.c b/mce-amd-smca.c index 55620e2..6b2b92b 100644 --- a/mce-amd-smca.c +++ b/mce-amd-smca.c @@ -965,6 +965,18 @@ void decode_smca_error(struct mce_event *e, struct mce_priv *m) channel, csrow); } + + if (e->vdata_len) { + uint64_t smca_config = e->vdata[2]; + + /* + * BIT 9 of the CONFIG register of a few SMCA Bank types indicates + * presence of FRU Text in SYND 1 / 2 registers + */ + if (smca_config & BIT(9)) + memcpy(e->frutext, e->vdata, 16); + } + } int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e) diff --git a/ras-mce-handler.c b/ras-mce-handler.c index 5ad9888..410541e 100644 --- a/ras-mce-handler.c +++ b/ras-mce-handler.c @@ -372,6 +372,24 @@ static void report_mce_event(struct ras_events *ras, trace_seq_printf(s, ", apicid= %x", e->apicid); + if (!e->vdata_len) + return; + + if (strlen(e->frutext)) { + trace_seq_printf(s, ", FRU Text= %s", e->frutext); + trace_seq_printf(s, ", Vendor Data= "); + for (int i = 2; i < e->vdata_len/8; i++) { + trace_seq_printf(s, "0x%lx", e->vdata[i]); + trace_seq_printf(s, " "); + } + } else { + trace_seq_printf(s, ", Vendor Data= "); + for (int i = 0; i < e->vdata_len/8; i ++) { + trace_seq_printf(s, "0x%lx", e->vdata[i]); + trace_seq_printf(s, " "); + } + } + /* * FIXME: The original mcelog userspace tool uses DMI to map from * 
address to DIMM. From the comments there, the code there doesn't @@ -548,6 +566,9 @@ int ras_mce_event_handler(struct trace_seq *s, return -1; e.ipid = val; + /* Get Vendor-specific Data, if any */ + e.vdata = tep_get_field_raw(s, event, "v_data", record, &e.vdata_len, 1); + switch (mce->cputype) { case CPU_GENERIC: break; diff --git a/ras-mce-handler.h b/ras-mce-handler.h index 83407e4..976fb4f 100644 --- a/ras-mce-handler.h +++ b/ras-mce-handler.h @@ -75,8 +75,11 @@ struct mce_event { uint8_t cpuvendor; uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */ uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */ + int32_t vdata_len; + const uint64_t *vdata; /* Parsed data */ + char frutext[17]; char timestamp[64]; char bank_name[64]; char error_msg[4096];