Skip to content

Commit

Permalink
Add support for fatal error monitoring
Browse files Browse the repository at this point in the history
- The application monitors APML_ALERT_L; upon assertion of the
  GPIO pin, the BMC collects the MCA MSR dump and creates a CPER record.

- Depending on the user configuration, the application initiates
  system recovery.

- ras_config.json contains the configuration parameters with their
  default values. Users can get and set the configuration parameters
  using D-Bus calls to the getAttribute and setAttribute methods.

- The application also harvests the last transaction address and
  debug log IDs.

- The application is intended to be supported for 1P and 2P platforms.

Signed-off-by: Abinaya Dhandapani <[email protected]>
  • Loading branch information
Abinaya Dhandapani committed Nov 21, 2024
1 parent bf04998 commit d274822
Show file tree
Hide file tree
Showing 15 changed files with 3,078 additions and 0 deletions.
70 changes: 70 additions & 0 deletions config/ras_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
{
"Configuration": [
{
"ApmlRetries": {
"Description": "Number of APML retry count",
"Value": 10,
"MaxBoundLimit": "50"
}
},
{
"SystemRecovery": {
"Description": "System recovery mode",
"Value": "NO_RESET"
}
},
{
"HarvestMicrocode": {
"Description": "Harvest microcode version",
"Value": true
}
},
{
"HarvestPPIN": {
"Description": "Harvest PPIN",
"Value": true
}
},
{
"ResetSignal": {
"Description": "Reset Signal Type",
"Value": "SYS_RST"
}
},
{
"SigIdOffset": {
"Description": "List of Signature ID offsets",
"Value": [
"0x30",
"0x34",
"0x28",
"0x2c",
"0x08",
"0x0c",
"null",
"null"
]
}
},
{
"AifsArmed": {
"Description": "If this field is true, AIFS flow is triggered",
"Value": false
}
},
{
"AifsSignatureId": {
"Description": "List of signature Id to check if Aifs is triggered",
"Value": {
"EX-WDT": "0xaea0000000000108000500b020009a00000000004d000000"
}
}
},
{
"DisableAifsResetOnSyncfloodCounter": {
"Description": "Disable AIFS Reset on syncflood counter",
"Value": true
}
}
]
}
48 changes: 48 additions & 0 deletions inc/apml_manager.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#include "cper_generator.hpp"
#include "interface_manager_base.hpp"

class ApmlInterfaceManager : public RasManagerBase
{
public:
virtual void init();

virtual void configure();

ApmlInterfaceManager(
sdbusplus::asio::object_server& objectServer,
std::shared_ptr<sdbusplus::asio::connection>& systemBus,
boost::asio::io_service& io) :
RasManagerBase(objectServer, systemBus, io)
{}

protected:
std::vector<uint8_t> blockId;
uint32_t familyId;
std::mutex harvest_in_progress_mtx;
bool p0AlertProcessed = false;
bool p1AlertProcessed = false;
uint64_t recordId = 1;
uint16_t debugLogIdOffset;
uint32_t SignatureID[8];
virtual void interfaceActiveMonitor();
virtual void getCpuId();
virtual void findProgramId();
virtual void harvestFatalError(uint8_t);
void triggerWarmReset() override;
void clearSbrmiAlertMask(uint8_t);

void performPlatformInitialization();

oob_status_t readRegister(uint8_t, uint32_t, uint8_t*);

void writeRegister(uint8_t, uint32_t, uint32_t);
bool compare_with_bitwise_AND(const uint32_t*, const std::string&);
bool checkSignatureIdMatch();
std::vector<uint32_t> hexstring_to_vector(const std::string&);
bool harvestMcaValidityCheck(uint8_t, uint16_t*, uint16_t*);
template <typename T>
void harvestMcaDataBanks(uint8_t, uint16_t, uint16_t, CperGenerator<T>&);
void getLastTransAddr(EFI_AMD_FATAL_ERROR_DATA*, uint8_t);
void dumpContextInfo(EFI_AMD_FATAL_ERROR_DATA*, uint8_t);
void harvestDebugLogDump(EFI_AMD_FATAL_ERROR_DATA*, uint8_t, uint8_t);
};
73 changes: 73 additions & 0 deletions inc/config_manager.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#pragma once

#include <com/amd/RAS/Configuration/common.hpp>
#include <com/amd/RAS/Configuration/server.hpp>
#include <sdbusplus/asio/object_server.hpp>
#include <sdbusplus/server.hpp>
// NOTE(review): presumably the D-Bus object path the configuration
// interface is served on -- confirm against the implementation file.
static constexpr auto objectPath = "/com/amd/RAS";

// Shorthands for the generated sdbusplus types.
using AttributeType =
    sdbusplus::common::com::amd::ras::Configuration::AttributeType;
using Base = sdbusplus::com::amd::RAS::server::Configuration;

using AttributeName = std::string;

// Every value kind an attribute may hold.
using AttributeValue =
    std::variant<bool, std::string, int64_t, std::vector<std::string>,
                 std::map<std::string, std::string>>;

// Attribute name -> (type, string, value, int64_t).
// NOTE(review): the string and int64_t slots look like the "Description" and
// "MaxBoundLimit" fields of ras_config.json -- confirm.
using ConfigTable =
    std::map<AttributeName,
             std::tuple<AttributeType, std::string, AttributeValue, int64_t>>;

/**
 * @brief Deleter that lets std::unique_ptr hold an sd_event reference.
 */
struct EventDeleter
{
    /**
     * @brief Release the reference held on @p event.
     *
     * @param event Event loop object handed over by std::unique_ptr.
     */
    void operator()(sd_event* event) const
    {
        // The pointer is received by value, so assigning the result back to
        // it had no effect on the caller; the return value is deliberately
        // discarded instead.
        static_cast<void>(sd_event_unref(event));
    }
};

/** @brief Owning handle for an sd_event, released through EventDeleter. */
using EventPtr = std::unique_ptr<sd_event, EventDeleter>;

/**
 * @brief Definition of the RasConfiguration class.
 *
 * Derives from the generated com.amd.RAS.Configuration server binding
 * (aliased as Base) and overrides its setAttribute / getAttribute methods.
 * Attribute names are AttributeName (std::string) and attribute values are
 * AttributeValue (a std::variant).
 */

class RasConfiguration : public Base
{
  public:
    /**
     * Constructor for Configuration.
     *
     * @param objectServer Reference to the object server.
     * @param systemBus Reference to the system D-Bus connection.
     */
    RasConfiguration(sdbusplus::asio::object_server& objectServer,
                     std::shared_ptr<sdbusplus::asio::connection>& systemBus);

    /**
     * Set the value of an attribute.
     *
     * @param attribute The name of the attribute.
     * @param value The value to set.
     */
    void setAttribute(AttributeName attribute, AttributeValue value) override;

    /**
     * Get the value of an attribute.
     *
     * @param attribute The name of the attribute.
     * @return The value of the attribute.
     */
    AttributeValue getAttribute(AttributeName attribute) override;

  private:
    sdbusplus::asio::object_server& objServer;

    // Held by value so this object shares ownership of the connection; a
    // reference to the caller's shared_ptr would dangle if that shared_ptr
    // were destroyed or reseated first.
    std::shared_ptr<sdbusplus::asio::connection> systemBus;
};
65 changes: 65 additions & 0 deletions inc/cper_generator.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#pragma once

#include "ras.hpp"

extern "C"
{
#include "apml.h"
#include "apml_common.h"
#include "esmi_cpuid_msr.h"
#include "esmi_mailbox.h"
#include "esmi_rmi.h"
}

/** @class CperGenerator
* @brief Implementation of CPER record creation
*/
/**
 * @brief Builds CPER records of record type T.
 *
 * Member functions are only declared here; their definitions are outside
 * this header.
 *
 * @tparam T Record type passed to the member functions as
 *           std::shared_ptr<T>.
 */
template <typename T>
class CperGenerator
{
  protected:
    uint32_t boardId;
    uint8_t numOfCpu;
    uint64_t recordId;
    uint8_t progId;
    uint32_t familyId;
    int errCount;

  public:
    /**
     * @brief Store the platform parameters; recordId starts at 1.
     *
     * boardId has no matching constructor argument, so it is explicitly
     * zeroed here rather than left indeterminate.
     *
     * @param numOfCpu Value stored in numOfCpu.
     * @param progId   Value stored in progId.
     * @param familyId Value stored in familyId.
     * @param errCount Value stored in errCount.
     */
    CperGenerator(uint8_t numOfCpu, uint8_t progId, uint32_t familyId,
                  int errCount) :
        boardId(0), numOfCpu(numOfCpu), recordId(1), progId(progId),
        familyId(familyId), errCount(errCount)
    {}

    void dumpCperHeaderSection(const std::shared_ptr<T>& data,
                               uint16_t sectionCount, uint32_t errorSeverity,
                               std::string errorType);

    void calculateTimeStamp(const std::shared_ptr<T>& data);

    void dumpErrorDescriptorSection(const std::shared_ptr<T>&, uint16_t,
                                    std::string);

    void dumpProcessorErrorSection(const std::shared_ptr<FatalCperRecord>&,
                                   uint8_t, CpuId*);

    void cperFileWrite(const std::shared_ptr<T>&, std::string, uint16_t);

    void dumpContextInfo(const std::shared_ptr<FatalCperRecord>&, uint16_t,
                         uint16_t, uint8_t, std::vector<uint8_t>, uint64_t*,
                         uint32_t*, int64_t*);

    void getLastTransAddr(const std::shared_ptr<FatalCperRecord>&, uint8_t);

    void harvestDebugLogDump(const std::shared_ptr<FatalCperRecord>&, uint8_t,
                             uint8_t, int64_t*, uint16_t&);

    std::string getCperFilename(int);

    void dumpProcErrorSection(const std::shared_ptr<T>&, uint8_t,
                              struct ras_rt_valid_err_inst, uint8_t, uint16_t,
                              uint32_t*, uint64_t*, int64_t,
                              std::map<std::string, std::string>*,
                              std::map<std::string, std::string>*);
};
67 changes: 67 additions & 0 deletions inc/interface_manager_base.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#include "config_manager.hpp"
#include "ras.hpp"

#include <boost/asio/io_service.hpp>
#include <boost/asio/posix/stream_descriptor.hpp>
#include <gpiod.hpp>
#include <nlohmann/json.hpp>

class RasManagerBase : public RasConfiguration
{
public:
RasManagerBase(sdbusplus::asio::object_server& objectServer,
std::shared_ptr<sdbusplus::asio::connection>& systemBus,
boost::asio::io_service& io) :
RasConfiguration(objectServer, systemBus), io(io),
p0_apmlAlertEvent(io), p1_apmlAlertEvent(io)
{}

virtual void init() = 0;

virtual void configure() = 0;

virtual ~RasManagerBase();

void p0AlertEventHandler();
void p1AlertEventHandler();

protected:
boost::asio::io_service& io;
uint8_t numOfCpu;
CpuId* cpuId;
uint32_t* uCode;
uint64_t* ppin;
std::string* inventoryPath;
unsigned int boardId;
uint8_t progId;
int errCount = 0;
std::shared_ptr<FatalCperRecord> rcd = NULL;
boost::asio::posix::stream_descriptor p0_apmlAlertEvent;
boost::asio::posix::stream_descriptor p1_apmlAlertEvent;
gpiod::line p0_apmlAlertLine;
gpiod::line p1_apmlAlertLine;

void getNumberOfCpu();
void getBoardId();
void createIndexFile();
void createConfigFile();
void getMicrocodeRev();
void getPpinFuse();
template <typename T>
T getProperty(sdbusplus::bus::bus&, const char*, const char*, const char*,
const char*);
void requestGPIOEvents(const std::string&, const std::function<void()>&,
gpiod::line&,
boost::asio::posix::stream_descriptor&);
void rasRecoveryAction(uint8_t);
void triggerColdReset();
void triggerRsmrstReset();
void triggerSysReset();
virtual void triggerWarmReset() = 0;
void requestHostTransition(std::string);

virtual void interfaceActiveMonitor() = 0;
virtual void getCpuId() = 0;
virtual void findProgramId() = 0;
virtual void harvestFatalError(uint8_t) = 0;
};
Loading

0 comments on commit d274822

Please sign in to comment.