From 3e2c8edffb1e3b731098554a8b369f4090b1dd77 Mon Sep 17 00:00:00 2001 From: Abinaya Dhandapani Date: Wed, 13 Nov 2024 10:10:30 -0600 Subject: [PATCH 1/3] Add README, OWNERS, and LICENSE files - Added README.md with project overview. - Added OWNERS file to define project ownership and contributors. - Added LICENSE file to specify project licensing terms. Signed-off-by: Abinaya Dhandapani --- LICENSE | 201 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ OWNERS | 43 ++++++++++++ README.md | 21 +++++- 3 files changed, 264 insertions(+), 1 deletion(-) create mode 100644 LICENSE create mode 100644 OWNERS diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/OWNERS b/OWNERS new file mode 100644 index 0000000..b66c08d --- /dev/null +++ b/OWNERS @@ -0,0 +1,43 @@ +# OWNERS +# ------ +# +# The OWNERS file maintains the list of individuals responsible for various +# parts of this repository, including code review and approval. We use the +# Gerrit 'owners' plugin, which consumes this file, along with some extra +# keywords for our own purposes and tooling. +# +# For details on the configuration used by 'owners' see: +# https://gerrit.googlesource.com/plugins/owners/+/refs/heads/master/owners/src/main/resources/Documentation/config.md +# +# An OWNERS file must be in the root of a repository but may also be present +# in any subdirectory. The contents of the subdirectory OWNERS file are +# combined with parent directories unless 'inherit: false' is set. +# +# The owners file is YAML and has [up to] 4 top-level keywords. +# * owners: A list of individuals who have approval authority on the +# repository. +# +# * reviewers: A list of individuals who have requested review notification +# on the repository. +# +# * matchers: A list of specific file/path matchers for granular 'owners' and +# 'reviewers'. See 'owners' plugin documentation. +# +# * openbmc: A list of openbmc-specific meta-data about owners and reviewers. +# - name: preferred name of the individual. +# - email: preferred email address of the individual. 
+# - discord: Discord nickname of the individual. +# +# It is expected that these 4 sections will be listed in the order above and +# data within them will be kept sorted. + +owners: +- abinaya.dhandapani@amd.com +- jayanth.othayoth@amd.com + +reviewers: +- supreeth.venkatesh@amd.com + +matchers: + +openbmc: diff --git a/README.md b/README.md index 4978617..2265ac7 100644 --- a/README.md +++ b/README.md @@ -1 +1,20 @@ -# AMD BMC RAS +# AMD BMC RAS + +The amd-bmc-ras service is intended to discover, configure and exercise OOB +RAS capabilities supported by the processors. The application creates error +records from RAS telemetry extracted from the processor over APML. + +## Features + +The application waits on the APML_ALERT_L GPIO pin to check if any events are +detected. When a fatal error is detected in the system, the SMU responds to +ErrEvent by signaling ALERT_L on APML. The BMC then checks the SB-RMI RasStatus +register via APML to confirm an MCA error has caused the ALERT_L assertion. The +application collects the MCA / MSR dump via APML and creates a CPER record. System +recovery is handled as per the user's preference from the config file. + +## Configuration + +amd-ras is configured per the +[meson build files](https://mesonbuild.com/Build-options.html). Available +options are documented in `meson_options.txt`. From bf04998ba62b809d54230996bdbba07f27ce4df7 Mon Sep 17 00:00:00 2001 From: Abinaya Dhandapani Date: Tue, 19 Nov 2024 06:03:28 -0600 Subject: [PATCH 2/3] Add clang-format and prettier Added clang-18 formatter. 
Signed-off-by: Abinaya Dhandapani --- .clang-format | 136 +++++++++++++++++++++++++++++++++++++++++++++++ .prettierrc.yaml | 7 +++ 2 files changed, 143 insertions(+) create mode 100644 .clang-format create mode 100644 .prettierrc.yaml diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..28e3328 --- /dev/null +++ b/.clang-format @@ -0,0 +1,136 @@ +--- +Language: Cpp +# BasedOnStyle: LLVM +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: + Kind: Always + OverEmptyLines: 1 +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: Empty +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: true +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: true +BinPackParameters: true +BitFieldColonSpacing: None +BraceWrapping: + AfterCaseLabel: true + AfterClass: true + AfterControlStatement: true + AfterEnum: true + AfterExternBlock: true + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: true + AfterStruct: true + AfterUnion: true + BeforeCatch: true + BeforeElse: true + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false +BreakAfterAttributes: Never +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: AfterColon +BreakInheritanceList: AfterColon +BreakStringLiterals: false +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: 
false +DisableFormat: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^[<"](gtest|gmock)' + Priority: 7 + - Regex: '^"config.h"' + Priority: -1 + - Regex: '^".*\.h"' + Priority: 1 + - Regex: '^".*\.hpp"' + Priority: 2 + - Regex: '^<.*\.h>' + Priority: 3 + - Regex: '^<.*\.hpp>' + Priority: 4 + - Regex: '^<.*' + Priority: 5 + - Regex: '.*' + Priority: 6 +IndentCaseLabels: true +IndentExternBlock: NoIndent +IndentRequiresClause: true +IndentWidth: 4 +IndentWrappedFunctionNames: true +InsertNewlineAtEOF: true +KeepEmptyLinesAtTheStartOfBlocks: false +LambdaBodyIndentation: Signature +LineEnding: LF +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PackConstructorInitializers: BinPack +PenaltyBreakAssignment: 25 +PenaltyBreakBeforeFirstCallParameter: 50 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PenaltyIndentedWhitespace: 1 +PointerAlignment: Left +QualifierAlignment: Left +ReferenceAlignment: Left +ReflowComments: true +RequiresClausePosition: OwnLine +RequiresExpressionIndentation: Keyword +SortIncludes: CaseSensitive +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: Never +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Latest +TabWidth: 4 +UseTab: Never +... 
+ diff --git a/.prettierrc.yaml b/.prettierrc.yaml new file mode 100644 index 0000000..a69b879 --- /dev/null +++ b/.prettierrc.yaml @@ -0,0 +1,7 @@ +tabWidth: 4 +printWidth: 80 +proseWrap: "always" +overrides: + - files: "*.md" + options: + tabWidth: 2 From a13485affa55d484fbd013ec89d3e1763d372509 Mon Sep 17 00:00:00 2001 From: Abinaya Dhandapani Date: Wed, 13 Nov 2024 10:12:33 -0600 Subject: [PATCH 3/3] Add support for fatal error monitoring - The application monitors APML_ALERT_L and upon assertion of the GPIO PIN , BMC collects the MCA MSR dump and creates CPER record. - Depending on the user configuration , the application initiates system recovery. - ras-config.json contains the configuration parameters with default value. User can get and set the configuration parameters using d-bus calls to the methods getAttribute and setAttribute. - The application also contains harvesting of last transaction address, debug log ID's. - The application is intended to be supported for 1P and 2P platforms. Tested fields: Tested in sp5 platform. 
root@sp5:/var/lib/amd-ras# ls current_index ras-error0.cper ras-error2.cper ras-error4.cper ras-error6.cper ras-error8.cper ras-config.json ras-error1.cper ras-error3.cper ras-error5.cper ras-error7.cper ras-error9.cper Signed-off-by: Abinaya Dhandapani --- config/ras_config.json | 70 ++ inc/apml_manager.hpp | 181 +++++ inc/config_manager.hpp | 62 ++ inc/cper_generator.hpp | 117 ++++ inc/interface_manager_base.hpp | 240 +++++++ inc/ras.hpp | 89 +++ meson.build | 85 +++ meson_options.txt | 2 + service_files/com.amd.RAS.service | 10 + src/apml_manager.cpp | 1089 +++++++++++++++++++++++++++++ src/config_manager.cpp | 119 ++++ src/cper_generator.cpp | 492 +++++++++++++ src/interface_manager_base.cpp | 513 ++++++++++++++ src/main.cpp | 56 ++ 14 files changed, 3125 insertions(+) create mode 100644 config/ras_config.json create mode 100644 inc/apml_manager.hpp create mode 100644 inc/config_manager.hpp create mode 100644 inc/cper_generator.hpp create mode 100644 inc/interface_manager_base.hpp create mode 100644 inc/ras.hpp create mode 100644 meson.build create mode 100644 meson_options.txt create mode 100644 service_files/com.amd.RAS.service create mode 100644 src/apml_manager.cpp create mode 100644 src/config_manager.cpp create mode 100644 src/cper_generator.cpp create mode 100644 src/interface_manager_base.cpp create mode 100644 src/main.cpp diff --git a/config/ras_config.json b/config/ras_config.json new file mode 100644 index 0000000..8a0d1d5 --- /dev/null +++ b/config/ras_config.json @@ -0,0 +1,70 @@ +{ + "Configuration": [ + { + "ApmlRetries": { + "Description": "Number of APML retry count", + "Value": 10, + "MaxBoundLimit": "50" + } + }, + { + "SystemRecovery": { + "Description": "System recovery mode", + "Value": "NO_RESET" + } + }, + { + "HarvestMicrocode": { + "Description": "Harvest microcode version", + "Value": true + } + }, + { + "HarvestPPIN": { + "Description": "Harvest PPIN", + "Value": true + } + }, + { + "ResetSignal": { + "Description": "Reset Signal 
Type", + "Value": "SYS_RST" + } + }, + { + "SigIdOffset": { + "Description": "List of Signature ID offsets", + "Value": [ + "0x30", + "0x34", + "0x28", + "0x2c", + "0x08", + "0x0c", + "null", + "null" + ] + } + }, + { + "AifsArmed": { + "Description": "If this field is true, AIFS flow is triggered", + "Value": false + } + }, + { + "AifsSignatureId": { + "Description": "List of signature Id to check if Aifs is triggered", + "Value": { + "EX-WDT": "0xaea0000000000108000500b020009a00000000004d000000" + } + } + }, + { + "DisableAifsResetOnSyncfloodCounter": { + "Description": "Disable AIFS Reset on syncflood counter", + "Value": true + } + } + ] +} diff --git a/inc/apml_manager.hpp b/inc/apml_manager.hpp new file mode 100644 index 0000000..2c343a9 --- /dev/null +++ b/inc/apml_manager.hpp @@ -0,0 +1,181 @@ +#include "cper_generator.hpp" +#include "interface_manager_base.hpp" + +class ApmlInterfaceManager : public RasManagerBase +{ + public: + /** + * @brief Initializes the APML interface manager. + * + * This function performs any necessary initialization for the APML + * interface manager. + */ + virtual void init(); + + /** + * @brief Configures the APML interface manager. + * + * This function configures the settings for the ADDC enablement. + */ + virtual void configure(); + + /** + * @brief Constructor for ApmlInterfaceManager. + * + * Initializes the ApmlInterfaceManager with the given object server, + * system bus connection, and I/O service. + * + * @param[in] objectServer Reference to an object server for managing + * D-Bus objects. + * @param[in] systemBus Shared pointer to a D-Bus connection. + * @param[in] io Reference to an I/O service for asynchronous operations. 
+ */ + ApmlInterfaceManager( + sdbusplus::asio::object_server& objectServer, + std::shared_ptr& systemBus, + boost::asio::io_service& io) : + RasManagerBase(objectServer, systemBus, io) + {} + + protected: + std::vector blockId; // Vector to hold block IDs + uint32_t familyId; // Family ID + std::mutex harvest_in_progress_mtx; // Mutex for synchronization + bool p0AlertProcessed = false; // Flag for P0 alert processing + bool p1AlertProcessed = false; // Flag for P1 alert processing + uint64_t recordId = 1; // Record ID + uint16_t debugLogIdOffset; // Offset for debug log ID + uint32_t SignatureID[8]; // Array to hold signature IDs + + /** + * @brief Monitors if APML interface is up + * + * This function monitors the status of the APML interface. + */ + virtual void interfaceActiveMonitor(); + + /** + * @brief Retrieves the CPU ID. + * + * This function retrieves the CPU ID from the system. + */ + virtual void getCpuId(); + + /** + * @brief Finds the program ID. + * + * This function locates the program ID associated with the system. + */ + virtual void findProgramId(); + + /** + * @brief Harvests fatal error information. + * + * This function processes a fatal error based on its type. + * + * @param[in] errorType The type of fatal error to harvest. + */ + virtual void harvestFatalError(uint8_t); + + /** + * @brief Triggers a warm reset of the system. + * + * This function initiates a warm reset operation. + */ + void triggerWarmReset() override; + + /** + * @brief Clears the SBRMI alert mask. + * + * Requests de-assertion of APML_ALERT_L signal by clearing + * SBRMI::Status[SwAlertSts] + * + * @param[in] soc_num - The socket number. + */ + void clearSbrmiAlertMask(uint8_t soc_num); + + /** + * @brief Performs platform initialization tasks. + * + * This function executes necessary initialization tasks specific to + * the platform. + */ + void performPlatformInitialization(); + + /** + * @brief Reads a register from the specified address. 
+ * + * This function reads a value from a register at a given address. + */ + oob_status_t readRegister(uint8_t, uint32_t, uint8_t*); + + /** + * @brief Writes a value to a register at a specified address. + * + * This function writes a value to a register at a given address. + * + */ + void writeRegister(uint8_t, uint32_t, uint32_t); + + /** + * @brief Compares values using bitwise AND operation with expected values. + * + * This function checks if there is a match between values and expected + * results using bitwise AND operation. + * + * @param[in] values Pointer to an array of values. + * @param[in] expected The expected string representation of values. + * + * @return True if there is a match, false otherwise. + */ + bool compare_with_bitwise_AND(const uint32_t* values, + const std::string& expected); + + /** + * @brief Checks if the signature ID matches expected values. + * + * This function verifies if the stored signature ID matches expected + * values. + * + * @return True if there is a match, false otherwise. + */ + bool checkSignatureIdMatch(); + + /** + * @brief Converts a hexadecimal string to a vector of uint32_t. + * + * This function takes a hexadecimal string and converts it into a + * vector of unsigned 32-bit integers. + * + * @param[in] hexString The hexadecimal string to convert. + * + * @return A vector containing converted values. + */ + std::vector hexstring_to_vector(const std::string& hexString); + + /** + * @brief MCA data harvesting. + * + */ + bool harvestMcaValidityCheck(uint8_t type, uint16_t* param1, + uint16_t* param2); + + /** + * @brief Harvests MCA data banks. 
+ * + */ + template + void harvestMcaDataBanks(uint8_t bank, uint16_t, uint16_t, + CperGenerator&); + + /** + * @brief Harvests last transaction address using command 5Ch + * + */ + void getLastTransAddr(EFI_AMD_FATAL_ERROR_DATA* errorData, uint8_t type); + + void dumpContextInfo(EFI_AMD_FATAL_ERROR_DATA* errorData, uint8_t type); + + void harvestDebugLogDump(EFI_AMD_FATAL_ERROR_DATA* errorData, uint8_t, + uint8_t); +}; diff --git a/inc/config_manager.hpp b/inc/config_manager.hpp new file mode 100644 index 0000000..b7f0ca7 --- /dev/null +++ b/inc/config_manager.hpp @@ -0,0 +1,62 @@ +#pragma once + +#include +#include +#include +#include +static constexpr auto objectPath = "/com/amd/RAS"; + +// Type alias for attribute type from the D-Bus configuration. +using AttributeType = + sdbusplus::common::com::amd::ras::Configuration::AttributeType; + +// Type alias for the D-Bus server binding of the Configuration interface. +using Base = sdbusplus::com::amd::RAS::server::Configuration; + +// Type aliases for attribute name, attribute value and the configuration table. +using AttributeName = std::string; +using AttributeValue = + std::variant, + std::map>; +using ConfigTable = + std::map, + std::map>, + int64_t>>; + +// Custom deleter that releases an sd_event reference via sd_event_unref(). +struct EventDeleter +{ + void operator()(sd_event* event) const + { + event = sd_event_unref(event); + } +}; + +// Unique pointer type for sd_event with custom deleter. +using EventPtr = std::unique_ptr; + +/** + * @brief Definition of the RasConfiguration class. + * + * @tparam AttributeName The type for attribute names (usually std::string). + * @tparam AttributeValue The variant type for attribute values. + * @tparam ConfigTable The map type for storing attribute information. 
+ */ + +class RasConfiguration : public Base +{ + public: + RasConfiguration(sdbusplus::asio::object_server& objectServer, + std::shared_ptr& systemBus); + + void setAttribute(AttributeName attribute, AttributeValue value) override; + + AttributeValue getAttribute(AttributeName attribute) override; + + private: + sdbusplus::asio::object_server& objServer; + std::shared_ptr& systemBus; +}; diff --git a/inc/cper_generator.hpp b/inc/cper_generator.hpp new file mode 100644 index 0000000..2344678 --- /dev/null +++ b/inc/cper_generator.hpp @@ -0,0 +1,117 @@ +#pragma once + +#include "ras.hpp" + +extern "C" +{ +#include "apml.h" +#include "apml_common.h" +#include "esmi_cpuid_msr.h" +#include "esmi_mailbox.h" +#include "esmi_rmi.h" +} + +/** @class CperGenerator + * @brief Implementation of CPER record creation + */ +template +class CperGenerator +{ + protected: + uint32_t boardId; + uint8_t numOfCpu; + uint64_t recordId; + uint8_t progId; + uint32_t familyId; + int errCount; + + public: + /** + * @brief Constructs a CperGenerator object. + * + * @param[in] numOfCpu The total number of sockets in the system. + * @param[in] progId The program ID. + * @param[in] familyId The family ID. + * @param[in] errCount The preserved error count number for numbering cper + * files. + */ + CperGenerator(uint8_t numOfCpu, uint8_t progId, uint32_t familyId, + int errCount) : + numOfCpu(numOfCpu), recordId(1), progId(progId), familyId(familyId), + errCount(errCount) + {} + + /** @brief Push contents of the error record header of the CPER file + * @param[in] data - Shared pointer to the CPER record object. + * @param[in] sectionCount - Number of error sections in the CPER record. + * @param[in] errorSeverity - Error Severity - fatal , correctable or + * uncorrectable. + * @param[in] errorType - Fatal or runtime - MCA, DRAM or PCIE AER errors. 
+ */ + void dumpCperHeaderSection(const std::shared_ptr& data, + uint16_t sectionCount, uint32_t errorSeverity, + std::string errorType); + + /** @brief Function to calculate and set the timestamp for a given data + * object + * @param[in] data - Shared pointer to the CPER record object. + */ + void calculateTimeStamp(const std::shared_ptr& data); + + /** @brief Function to dump the error descriptor section of a given data + * object + * @param[in] data - Shared pointer to CPER record object + * @param[in] ErrorType - The type of error. + */ + void dumpErrorDescriptorSection(const std::shared_ptr&, uint16_t, + std::string); + + /** @brief Function to dump processor error section to CPER record. + * @param[in] socNum - socket number. + * @param[in] - pointer of the structure variable CpuId + */ + void dumpProcessorErrorSection(const std::shared_ptr&, + uint8_t, CpuId*); + + /** @brief Function to write error information to a CPER file. + * @param[in] data - Shared pointer to the CPER record object + * @param[in] ErrorType - The type of error. + * @param[in] SectionCount - The number of error descriptor sections. + */ + void cperFileWrite(const std::shared_ptr&, std::string, uint16_t); + + /** @brief Function to dump context information to CPER record. + * @param[in] data - Shared pointer to the CPER record object + * @param[in] - Number of valid MCA banks. + * @param[in] - Number of valid bytes per MCA bank. + * @param[in] - Socket Number + * @param[in] - The number of fatal error sections. + * @param[in] - A vector of uint8_t representing block IDs + * @param[in] - The PPIN value read from APML commands + * @param[in] - Microcode version read from apml commands + * @param[in] - The total number of apml retries during failure. 
+ */ + void dumpContextInfo(const std::shared_ptr&, uint16_t, + uint16_t, uint8_t, std::vector, uint64_t*, + uint32_t*, int64_t*); + + /** @brief Function to dump the last transaction address during a fatal + * error + * @param[in] data - Shared pointer to the CPER record object + * @param[in] socNum - socket number. + */ + void getLastTransAddr(const std::shared_ptr&, uint8_t); + + /** @brief Function to harvest number of valid debug log instances during a + * Syncflood. + * @param[in] data - Shared pointer to the CPER record object + * @param[in] socNum - socket number. + * @param[in] - A vector of uint8_t representing block IDs + * @param[in] - PPIn The processor pin read from apml commands + * @param[in] - Offset for the debug log ID in the CPER record + */ + void harvestDebugLogDump(const std::shared_ptr&, uint8_t, + uint8_t, int64_t*, uint16_t&); + + std::string getCperFilename(int); +}; diff --git a/inc/interface_manager_base.hpp b/inc/interface_manager_base.hpp new file mode 100644 index 0000000..06a0602 --- /dev/null +++ b/inc/interface_manager_base.hpp @@ -0,0 +1,240 @@ +#include "config_manager.hpp" +#include "ras.hpp" + +#include +#include +#include +#include + +/* Base class for managing Ras (Reliability, Availability, and + Serviceability) configurations.*/ +class RasManagerBase : public RasConfiguration +{ + public: + /** + * @brief Constructs a RasManagerBase object. + * + * This constructor initializes the base class RasConfiguration and sets up + * the object with the provided parameters, including the IO service and + * alert event objects. + * + * @param[in] objectServer Reference to the object server for DBus + * integration. + * @param[in] systemBus Shared pointer to the system bus for DBus + * communication. + * @param[in] io Reference to the boost::asio::io_service used for + * asynchronous operations. 
+     */
+    RasManagerBase(sdbusplus::asio::object_server& objectServer,
+                   std::shared_ptr& systemBus,
+                   boost::asio::io_service& io) :
+        RasConfiguration(objectServer, systemBus), io(io),
+        p0_apmlAlertEvent(io), p1_apmlAlertEvent(io)
+    {}
+
+    /**
+     * @brief Initializes the RasManagerBase object.
+     *
+     * This is a pure virtual function, intended to be implemented by derived
+     * classes to perform any necessary initialization specific to the subclass.
+     */
+    virtual void init() = 0;
+
+    /**
+     * @brief Configures the RasManagerBase object.
+     *
+     * This is a pure virtual function, intended to be implemented by derived
+     * classes to configure the RasManagerBase object for specific use cases.
+     */
+    virtual void configure() = 0;
+
+    /**
+     * @brief Destructor for RasManagerBase.
+     *
+     * Virtual destructor ensures proper cleanup of derived class objects.
+     */
+    virtual ~RasManagerBase();
+
+    /**
+     * @brief Handler for P0 alert events.
+     *
+     * This function is invoked when an alert event occurs on P0. The function
+     * handles the event by processing the necessary response.
+     */
+    void p0AlertEventHandler();
+
+    /**
+     * @brief Handler for P1 alert events.
+     *
+     * This function is invoked when an alert event occurs on P1. The function
+     * handles the event by processing the necessary response.
+     */
+    void p1AlertEventHandler();
+
+  protected:
+    /*
+     * Every member below carries a default so that nothing is read
+     * uninitialized before the corresponding getter/harvest step has run.
+     */
+    boost::asio::io_service& io;
+    uint8_t numOfCpu = 0;                 // filled in by getNumberOfCpu()
+    CpuId* cpuId = nullptr;               // per-socket CPUID, see getCpuId()
+    uint32_t* uCode = nullptr;            // filled in by getMicrocodeRev()
+    uint64_t* ppin = nullptr;             // filled in by getPpinFuse()
+    std::string* inventoryPath = nullptr;
+    unsigned int boardId = 0;             // filled in by getBoardId()
+    uint8_t progId = 0;                   // set by findProgramId()
+    int errCount = 0;
+    // CPER record under construction; element type inferred from the
+    // rcd->sectionDescriptor / rcd->errorRecord usage - TODO confirm.
+    std::shared_ptr<FatalCperRecord> rcd = nullptr;
+
+    /**
+     * @brief Stream descriptor for handling P0 APML alert events.
+     *
+     * This stream descriptor listens for alert events related to the P0 sensor
+     * and triggers actions upon detection.
+     */
+    boost::asio::posix::stream_descriptor p0_apmlAlertEvent;
+
+    /**
+     * @brief Stream descriptor for handling P1 APML alert events.
+ * + * This stream descriptor listens for alert events related to the P1 sensor + * and triggers actions upon detection. + */ + boost::asio::posix::stream_descriptor p1_apmlAlertEvent; + + /** + * @brief GPIO line for handling P0 alert events. + * + * This GPIO line is used to detect hardware alerts for P0 and trigger + * events for processing. + */ + gpiod::line p0_apmlAlertLine; + + /** + * @brief GPIO line for handling P1 alert events. + * + * This GPIO line is used to detect hardware alerts for P1 and trigger + * events for processing. + */ + gpiod::line p1_apmlAlertLine; + + /** + * @brief Retrieves the number of CPUs in the system. + * + * This function queries the system to obtain the number of CPUs and stores + * the result in the numOfCpu member variable. + */ + void getNumberOfCpu(); + + /** + * @brief Retrieves the board ID. + * + * This function queries the system to obtain the board ID and stores the + * result in the boardId member variable. + */ + void getBoardId(); + + /** + * @brief Creates an index file. + * + * This function generates an index file for CPER record tracking + */ + void createIndexFile(); + + /** + * @brief Creates a configuration file. + * + * This function generates a RAS configuration file. + */ + void createConfigFile(); + + /** + * @brief Retrieves the CPU microcode revision. + * + * This function queries the CPU for its microcode revision and stores it in + * the uCode member variable. + */ + void getMicrocodeRev(); + + /** + * @brief Retrieves the PPIN fuse value. + * + * This function queries the system for the PPIN fuse value and stores it in + * the ppin member variable. + */ + void getPpinFuse(); + + /** + * @brief Fetches a property from DBus. + * + * This template function retrieves a property from DBus based on the given + * object path, interface, and property name. + * + * @param[in] bus The DBus connection to query. + * @param[in] path The object path of the DBus object. 
+ * @param[in] interface The interface of the DBus object. + * @param[in] property The name of the property to retrieve. + * @param[in] dbusMethod The DBus method to call to fetch the property. + * + * @return The property value of type T, if found. + */ + template + T getProperty(sdbusplus::bus::bus&, const char*, const char*, const char*, + const char*); + + /** + * @brief Requests GPIO events for hardware alert handling. + * + * This function configures a GPIO line and stream descriptor to listen for + * events. It triggers the provided callback function upon event detection. + * + * @param[in] gpioPin The GPIO pin to monitor. + * @param[in] callback The function to call when an event is detected. + * @param[in] line The GPIO line to use for event detection. + * @param[in] stream The stream descriptor used to listen for events. + */ + void requestGPIOEvents(const std::string&, const std::function&, + gpiod::line&, + boost::asio::posix::stream_descriptor&); + + void rasRecoveryAction(uint8_t); + + /** + * @brief Triggers a cold reset of the system. + * + * This function triggers a cold reset. + */ + void triggerColdReset(); + + /** + * @brief Triggers a reset through the RSMRST signal. + * + * This function triggers a reset using the RSMRST signal. + */ + void triggerRsmrstReset(); + + /** + * @brief Triggers a reset through the SYS RST signal. + * + * This function triggers a reset using the SYS_RST signal (system reset). + */ + void triggerSysReset(); + + /** + * @brief Requests a system transition. + * + * This function requests a transition for the host system, + * such as a change in state or mode (e.g., shutdown, reboot). + * + * @param[in] transitionType A string that specifies the type + * of transition to request. 
+ */ + void requestHostTransition(std::string); + + virtual void triggerWarmReset() = 0; + + virtual void interfaceActiveMonitor() = 0; + + virtual void getCpuId() = 0; + + virtual void findProgramId() = 0; + + virtual void harvestFatalError(uint8_t) = 0; +}; diff --git a/inc/ras.hpp b/inc/ras.hpp new file mode 100644 index 0000000..424ffd5 --- /dev/null +++ b/inc/ras.hpp @@ -0,0 +1,89 @@ +#pragma once + +#include "libcper/Cper.h" + +#include +#include +#include +#include + +#include + +constexpr int BASE_16 = 16; +constexpr int INDEX_0 = 0; +constexpr int INDEX_1 = 1; +constexpr int INDEX_2 = 2; +constexpr int INDEX_3 = 3; +constexpr int INDEX_4 = 4; +constexpr int INDEX_5 = 5; +constexpr int INDEX_6 = 6; +constexpr int INDEX_8 = 8; +constexpr int INDEX_12 = 12; +constexpr int INDEX_16 = 16; +constexpr int INDEX_19 = 19; +constexpr int INDEX_20 = 20; +constexpr int TURIN_FAMILY_ID = 0x1A; +constexpr int GENOA_FAMILY_ID = 0x19; +constexpr int RAS_STATUS_REGISTER = 0x4C; +constexpr int CPER_SEV_FATAL = 1; +constexpr int INT_255 = 0xFF; +constexpr int SOCKET_0 = 0; +constexpr int SOCKET_1 = 1; +constexpr int MAX_MCA_BANKS = 32; +constexpr int SHIFT_24 = 24; +constexpr int SHIFT_4 = 4; +constexpr int BYTE_4 = 4; +constexpr int BYTE_2 = 2; +constexpr int BAD_DATA = 0xBAADDA7A; +static const std::string FATAL_ERR = "FATAL"; + +struct CpuId +{ + uint32_t eax; + uint32_t ebx; + uint32_t ecx; + uint32_t edx; +}; + +#define CCM_COUNT 8 +#define DEBUG_LOG_DUMP_REGION 12124 +#define MCA_BANK_MAX_OFFSET 128 +#define MCA_BANKS 32 +#define LAST_TRANS_ADDR_OFFSET 4 + +typedef struct +{ + UINT32 McaData[MCA_BANK_MAX_OFFSET]; +} CRASHDUMP_T; + +typedef struct +{ + UINT32 WdtData[LAST_TRANS_ADDR_OFFSET]; +} LAST_TRANS_ADDR; + +typedef struct +{ + LAST_TRANS_ADDR LastTransAddr[CCM_COUNT]; +} DF_DUMP; + +typedef struct +{ + EFI_IA32_X64_PROCESSOR_ERROR_RECORD ProcError; + UINT32 SignatureID[8]; + UINT32 Reserved[8]; + UINT16 RegisterContextType; + UINT16 RegisterArraySize; + UINT32 
MicrocodeVersion; + UINT64 Ppin; + CRASHDUMP_T CrashDumpData[MCA_BANKS]; + DF_DUMP DfDumpData; + UINT32 Reserved1[96]; + UINT32 DebugLogIdData[DEBUG_LOG_DUMP_REGION]; +} __attribute__((packed)) EFI_AMD_FATAL_ERROR_DATA; + +typedef struct +{ + EFI_COMMON_ERROR_RECORD_HEADER header; + EFI_ERROR_SECTION_DESCRIPTOR* sectionDescriptor; + EFI_AMD_FATAL_ERROR_DATA* errorRecord; +} __attribute__((packed)) FatalCperRecord; diff --git a/meson.build b/meson.build new file mode 100644 index 0000000..33f9601 --- /dev/null +++ b/meson.build @@ -0,0 +1,85 @@ +project( + 'amd-bmc-ras', + 'cpp', + default_options: [ + 'warning_level=3', + 'werror=true', + 'cpp_std=c++23' + ], + license: 'Apache-2.0', + version: '1.0', +) + + +conf_data = configuration_data() + +# Define the directory paths +ras_dir = '/var/lib/amd-ras/' +index_file = '/var/lib/amd-ras/current_index' +config_file = '/var/lib/amd-ras/ras_config.json' + +cpp_args = [ + '-DRAS_DIR="' + ras_dir + '"', + '-DINDEX_FILE="' + index_file + '"', + '-DCONFIG_FILE="' + config_file + '"' +] + +cpp = meson.get_compiler('cpp') + +# Meson requires an absolute path for find_library(). 
+libdir = join_paths(meson.current_source_dir(), 'lib')  # current_source_dir() has no trailing '/'
+
+apml_dep = cpp.find_library('apml64', dirs : libdir) # ./lib/libapml64.lib
+cper_dep = cpp.find_library('cper-generate',dirs : libdir)
+boost_dep = dependency('boost')
+
+deps = [
+    dependency('libgpiodcxx', default_options: ['bindings=cxx']),
+    dependency('nlohmann_json'),
+    dependency('phosphor-logging'),
+    dependency('sdbusplus'),
+    dependency('sdeventplus'),
+    dependency('systemd'),
+    dependency('threads'),
+    apml_dep,
+    cper_dep,
+    boost_dep,
+]
+
+apml = get_option('apml-interface')
+if apml
+    add_project_arguments('-DAPML', language: 'cpp')
+endif
+
+pldm = get_option('pldm-interface')
+if pldm
+    add_project_arguments('-DPLDM', language: 'cpp')
+endif
+
+executable(
+    'amd-bmc-ras',
+    'src/apml_manager.cpp',
+    'src/config_manager.cpp',
+    'src/cper_generator.cpp',
+    'src/interface_manager_base.cpp',
+    'src/main.cpp',
+    include_directories: include_directories('inc'),
+    cpp_args: cpp_args,
+    dependencies: deps,
+    install: true,
+    install_dir: get_option('bindir'))
+
+ras_config_dir = join_paths(get_option('datadir'), 'ras-config')
+install_data(
+    join_paths(meson.current_source_dir(), 'config', 'ras_config.json'),
+    install_dir: ras_config_dir,
+    rename: 'ras_config.json'
+)
+
+systemd = dependency('systemd')
+
+install_data(
+    ['service_files/com.amd.RAS.service'],
+    install_dir: systemd.get_pkgconfig_variable('systemdsystemunitdir')
+)
+
+
diff --git a/meson_options.txt b/meson_options.txt
new file mode 100644
index 0000000..f5e5ae2
--- /dev/null
+++ b/meson_options.txt
@@ -0,0 +1,2 @@
+option('apml-interface', type: 'boolean', value: true, description: 'Enable APML interface')
+option('pldm-interface', type: 'boolean', value: false, description: 'Enable PLDM interface')
diff --git a/service_files/com.amd.RAS.service b/service_files/com.amd.RAS.service
new file mode 100644
index 0000000..e6327db
--- /dev/null
+++ b/service_files/com.amd.RAS.service
@@ -0,0 +1,10 @@
+[Unit]
+Description=Crash dump manager
+After=xyz.openbmc_project.Chassis.Control.Power.service + +[Service] +Restart=always +ExecStart=/usr/bin/amd-bmc-ras + +[Install] +WantedBy=multi-user.target diff --git a/src/apml_manager.cpp b/src/apml_manager.cpp new file mode 100644 index 0000000..c674c2f --- /dev/null +++ b/src/apml_manager.cpp @@ -0,0 +1,1089 @@ +#include "apml_manager.hpp" + +#include + +#include + +constexpr int EPYC_PROG_SEG_ID = 0x01; +constexpr int FAILURE_SIGNATURE_ID = 0x04; +constexpr int FATAL_ERROR = 1; +constexpr int FATAL_SECTION_COUNT = 2; +constexpr int SYS_MGMT_CTRL_ERR = 0x04; +constexpr int RESET_HANG_ERR = 0x02; +constexpr int SBRMI_CONTROL_REGISTER = 0x1; +constexpr int SHIFT_23 = 23; +constexpr int SHIFT_25 = 25; +constexpr int TWO_SOCKET = 2; +constexpr char EVENT_SUBSCRIPTION_FILE[] = + "/var/lib/bmcweb/eventservice_config.json"; + +enum BlockID : unsigned short +{ + BLOCK_ID_1 = 1, + BLOCK_ID_2, + BLOCK_ID_3, + BLOCK_ID_23 = 23, + BLOCK_ID_24, + BLOCK_ID_33 = 33, + BLOCK_ID_36, + BLOCK_ID_37, + BLOCK_ID_38, + BLOCK_ID_39, + BLOCK_ID_40 +}; + +void ApmlInterfaceManager::init() +{ + getNumberOfCpu(); + + interfaceActiveMonitor(); + + getCpuId(); + + getBoardId(); + + findProgramId(); +} + +void ApmlInterfaceManager::configure() +{ + createIndexFile(); + + createConfigFile(); + + // Retrieve microcode version attribute and check if it's a boolean + AttributeValue uCodeVersion = getAttribute("HarvestMicrocode"); + bool* uCodeVersionFlag = std::get_if(&uCodeVersion); + + // Retrieve PPIN attribute and check if it's a boolean + AttributeValue harvestPpin = getAttribute("HarvestPPIN"); + bool* harvestPpinFlag = std::get_if(&harvestPpin); + + // If microcode harvesting is enabled, retrieve microcode revision + if (*uCodeVersionFlag == true) + { + getMicrocodeRev(); + } + + // If PPIN harvesting is enabled, retrieve PPIN fuse + if (*harvestPpinFlag == true) + { + getPpinFuse(); + } + + // Request GPIO events for P0 alert handling + requestGPIOEvents("P0_I3C_APML_ALERT_L", + 
std::bind(&RasManagerBase::p0AlertEventHandler, this), + p0_apmlAlertLine, p0_apmlAlertEvent); + + // Request GPIO events for P1 alert handling + if (numOfCpu == TWO_SOCKET) + { + requestGPIOEvents("P1_I3C_APML_ALERT_L", + std::bind(&RasManagerBase::p1AlertEventHandler, this), + p1_apmlAlertLine, p1_apmlAlertEvent); + } +} + +void ApmlInterfaceManager::interfaceActiveMonitor() +{ + oob_status_t ret = OOB_MAILBOX_CMD_UNKNOWN; + + uint32_t d_out = 0; + + while (ret != OOB_SUCCESS) + { + ret = get_bmc_ras_oob_config(INDEX_0, &d_out); + + if (ret == OOB_MAILBOX_CMD_UNKNOWN) + { + ret = esmi_get_processor_info(INDEX_0, plat_info); + } + sleep(INDEX_1); + } + performPlatformInitialization(); +} + +void ApmlInterfaceManager::writeRegister(uint8_t info, uint32_t reg, + uint32_t value) +{ + oob_status_t ret; + + ret = esmi_oob_write_byte(info, reg, SBRMI, value); + if (ret != OOB_SUCCESS) + { + lg2::error("Failed to write register: {REG}", "REG", lg2::hex, reg); + return; + } + lg2::debug("Write to register {REGISTER} is successful", "REGISTER", reg); +} + +void ApmlInterfaceManager::triggerWarmReset() +{ + oob_status_t ret; + uint32_t ack_resp = 0; + /* In a 2P config, it is recommended to only send this command to P0 + Hence, sending the Signal only to socket 0*/ + ret = reset_on_sync_flood(INDEX_0, &ack_resp); + if (ret) + { + lg2::error("Failed to request reset after sync flood"); + } + else + { + lg2::info("Warm reset triggered"); + } +} + +oob_status_t ApmlInterfaceManager::readRegister(uint8_t info, uint32_t reg, + uint8_t* value) +{ + oob_status_t ret; + uint16_t retryCount = 10; + + while (retryCount > 0) + { + ret = esmi_oob_read_byte(info, reg, SBRMI, value); + if (ret == OOB_SUCCESS) + { + break; + } + + lg2::error("Failed to read register: {REGISTER} Retrying\n", "REGISTER", + lg2::hex, reg); + + usleep(1000 * 1000); + retryCount--; + } + if (ret != OOB_SUCCESS) + { + lg2::error("Failed to read register: {REGISTER}\n", "REGISTER", + lg2::hex, reg); + } + + 
return ret; +} + +void ApmlInterfaceManager::clearSbrmiAlertMask(uint8_t socNum) +{ + oob_status_t ret; + + lg2::info("Clear Alert Mask bit of SBRMI Control register"); + + uint8_t buffer; + + ret = readRegister(socNum, SBRMI_CONTROL_REGISTER, &buffer); + + if (ret == OOB_SUCCESS) + { + buffer = buffer & 0xFE; + writeRegister(socNum, SBRMI_CONTROL_REGISTER, + static_cast(buffer)); + } +} + +void ApmlInterfaceManager::performPlatformInitialization() +{ + oob_status_t ret = OOB_MAILBOX_CMD_UNKNOWN; + struct processor_info platInfo[INDEX_1]; + + while (ret != OOB_SUCCESS) + { + uint8_t soc_num = 0; + ret = esmi_get_processor_info(soc_num, platInfo); + + if (ret == OOB_SUCCESS) + { + familyId = platInfo->family; + break; + } + sleep(INDEX_1); + } + + if (ret == OOB_SUCCESS) + { + if (platInfo->family == GENOA_FAMILY_ID) + { + blockId = {BLOCK_ID_33}; + } + else if (platInfo->family == TURIN_FAMILY_ID) + { + for (uint8_t i = 0; i < numOfCpu; i++) + { + clearSbrmiAlertMask(i); + } + + blockId = {BLOCK_ID_1, BLOCK_ID_2, BLOCK_ID_3, BLOCK_ID_23, + BLOCK_ID_24, BLOCK_ID_33, BLOCK_ID_36, BLOCK_ID_37, + BLOCK_ID_38, BLOCK_ID_40}; + } + } + else + { + sd_journal_print(LOG_ERR, + "Failed to perform platform initialization\n"); + } +} + +void ApmlInterfaceManager::getCpuId() +{ + for (int i = 0; i < numOfCpu; i++) + { + uint32_t core_id = 0; + oob_status_t ret; + cpuId[i].eax = 1; + cpuId[i].ebx = 0; + cpuId[i].ecx = 0; + cpuId[i].edx = 0; + + ret = esmi_oob_cpuid(i, core_id, &cpuId[i].eax, &cpuId[i].ebx, + &cpuId[i].ecx, &cpuId[i].edx); + + if (ret) + { + lg2::error("Failed to get the CPUID for socket {CPU}", "CPU", i); + } + } +} + +void ApmlInterfaceManager::findProgramId() +{ + oob_status_t ret; + uint8_t socNum = 0; + + struct processor_info platInfo[INDEX_1]; + + ret = esmi_get_processor_info(socNum, platInfo); + + if (ret == OOB_SUCCESS) + { + progId = EPYC_PROG_SEG_ID; + } +} + +bool ApmlInterfaceManager::harvestMcaValidityCheck( + uint8_t info, uint16_t* numbanks, 
uint16_t* bytespermca) +{ + oob_status_t ret = OOB_MAILBOX_CMD_UNKNOWN; + uint16_t retries = 0; + bool mcaValidityCheck = true; + + AttributeValue apmlRetry = getAttribute("ApmlRetries"); + int64_t* apmlRetryCount = std::get_if(&apmlRetry); + + while (ret != OOB_SUCCESS) + { + retries++; + + ret = read_bmc_ras_mca_validity_check(info, bytespermca, numbanks); + + if (retries > *apmlRetryCount) + { + lg2::error( + "Socket {SOCK}: Failed to get MCA banks with valid status", + "SOCK", info); + break; + } + + if ((*numbanks == 0) || (*numbanks > MAX_MCA_BANKS)) + { + lg2::error("Socket {SOCKET}: Invalid MCA bank validity status. " + "Retry Count: {RETRY_COUNT}", + "SOCKET", info, "RETRY_COUNT", retries); + ret = OOB_MAILBOX_CMD_UNKNOWN; + usleep(1000 * 1000); + continue; + } + } + + if ((*numbanks <= 0) || (*numbanks > MAX_MCA_BANKS)) + { + mcaValidityCheck = false; + } + + return mcaValidityCheck; +} + +inline std::string getCperFilename(int num) +{ + return "ras-error" + std::to_string(num) + ".cper"; +} + +void ApmlInterfaceManager::getLastTransAddr( + EFI_AMD_FATAL_ERROR_DATA* fatal_error_data, uint8_t info) +{ + oob_status_t ret; + uint8_t blk_id = 0; + uint16_t n = 0; + uint16_t maxOffset32; + uint32_t data; + struct ras_df_err_chk err_chk; + union ras_df_err_dump df_err = {0}; + + ret = read_ras_df_err_validity_check(info, blk_id, &err_chk); + + if (ret) + { + sd_journal_print(LOG_ERR, "Failed to read RAS DF validity check\n"); + } + else + { + if (err_chk.df_block_instances != 0) + { + maxOffset32 = ((err_chk.err_log_len % BYTE_4) ? 
INDEX_1 : INDEX_0) + + (err_chk.err_log_len >> BYTE_2); + while (n < err_chk.df_block_instances) + { + for (int offset = 0; offset < maxOffset32; offset++) + { + memset(&data, 0, sizeof(data)); + /* Offset */ + df_err.input[INDEX_0] = offset * BYTE_4; + /* DF block ID */ + df_err.input[INDEX_1] = blk_id; + /* DF block ID instance */ + df_err.input[INDEX_2] = n; + + ret = read_ras_df_err_dump(info, df_err, &data); + + if (ret != OOB_SUCCESS) + { + // retry + AttributeValue apmlRetry = getAttribute("ApmlRetries"); + int64_t* retryCount = std::get_if(&apmlRetry); + int64_t retries = 0; + while (ret != OOB_SUCCESS) + { + retries++; + memset(&data, 0, sizeof(data)); + memset(&df_err, 0, sizeof(df_err)); + + /* Offset */ + df_err.input[INDEX_0] = offset * BYTE_4; + /* DF block ID */ + df_err.input[INDEX_1] = blk_id; + /* DF block ID instance */ + df_err.input[INDEX_2] = n; + + ret = read_ras_df_err_dump(info, df_err, &data); + + if (retries > *retryCount) + { + break; + } + sleep(INDEX_1); + } + + if (ret != OOB_SUCCESS) + { + data = 0; + } + } + fatal_error_data->DfDumpData.LastTransAddr[n] + .WdtData[offset] = data; + } + n++; + } + } + } +} + +void ApmlInterfaceManager::harvestDebugLogDump( + EFI_AMD_FATAL_ERROR_DATA* fatal_error_data, uint8_t info, uint8_t blk_id) +{ + oob_status_t ret = OOB_MAILBOX_CMD_UNKNOWN; + uint16_t retries = 0; + uint32_t data; + struct ras_df_err_chk err_chk; + union ras_df_err_dump df_err = {0}; + + AttributeValue apmlRetry = getAttribute("ApmlRetries"); + int64_t* apmlRetryCount = std::get_if(&apmlRetry); + + while (ret != OOB_SUCCESS) + { + retries++; + + ret = read_ras_df_err_validity_check(info, blk_id, &err_chk); + + if (ret == OOB_SUCCESS) + { + sd_journal_print(LOG_INFO, + "Socket : %d , Debug Log ID : %d , Block Instance " + "= %d, Err Log Length = %d\n", + info, blk_id, err_chk.df_block_instances, + err_chk.err_log_len); + break; + } + + if (retries > *apmlRetryCount) + { + sd_journal_print(LOG_ERR, + "Socket %d: Failed to get 
valid debug log for Dbg " + "Log ID %d . Error: %d\n", + info, blk_id, ret); + + /*If 5Bh command fails ,0xBAADDA7A is written thrice in the PCIE + * dump region*/ + fatal_error_data->DebugLogIdData[debugLogIdOffset++] = blk_id; + fatal_error_data->DebugLogIdData[debugLogIdOffset++] = BAD_DATA; + fatal_error_data->DebugLogIdData[debugLogIdOffset++] = BAD_DATA; + fatal_error_data->DebugLogIdData[debugLogIdOffset++] = BAD_DATA; + break; + } + } + + if (ret == OOB_SUCCESS) + { + if (err_chk.df_block_instances != 0) + { + uint16_t n = 0; + uint16_t maxOffset32; + + uint32_t DbgLogIdHeader = + (static_cast(err_chk.err_log_len) << INDEX_16) | + (static_cast(err_chk.df_block_instances) << INDEX_8) | + static_cast(blk_id); + + if (info == SOCKET_0) + { + fatal_error_data->DebugLogIdData[debugLogIdOffset++] = + DbgLogIdHeader; + } + else if (info == SOCKET_1) + { + fatal_error_data->DebugLogIdData[debugLogIdOffset++] = + DbgLogIdHeader; + } + + maxOffset32 = ((err_chk.err_log_len % BYTE_4) ? INDEX_1 : INDEX_0) + + (err_chk.err_log_len >> BYTE_2); + + while (n < err_chk.df_block_instances) + { + bool apmlHang = false; + + for (int offset = 0; offset < maxOffset32; offset++) + { + if (apmlHang == false) + { + memset(&data, 0, sizeof(data)); + memset(&df_err, 0, sizeof(df_err)); + + /* Offset */ + df_err.input[INDEX_0] = offset * BYTE_4; + /* DF block ID */ + df_err.input[INDEX_1] = blk_id; + /* DF block ID instance */ + df_err.input[INDEX_2] = n; + + ret = read_ras_df_err_dump(info, df_err, &data); + + if (ret != OOB_SUCCESS) + { + // retry + AttributeValue apmlRetry = + getAttribute("ApmlRetries"); + int64_t* retryCount = + std::get_if(&apmlRetry); + int64_t retries = 0; + while (ret != OOB_SUCCESS) + { + retries++; + memset(&data, 0, sizeof(data)); + memset(&df_err, 0, sizeof(df_err)); + + /* Offset */ + df_err.input[INDEX_0] = offset * BYTE_4; + /* DF block ID */ + df_err.input[INDEX_1] = blk_id; + /* DF block ID instance */ + df_err.input[INDEX_2] = n; + + ret = 
read_ras_df_err_dump(info, df_err, &data); + + if (retries > *retryCount) + { + break; + } + sleep(INDEX_1); + } + + if (ret != OOB_SUCCESS) + { + sd_journal_print(LOG_ERR, + "Failed to read debug log " + "dump for debug log ID : %d\n", + blk_id); + data = BAD_DATA; + /*the Dump APML command fails in the middle + of the iterative loop, then write BAADDA7A + for the remaining iterations in the for + loop*/ + apmlHang = true; + } + } + } + + if (info == SOCKET_0) + { + fatal_error_data->DebugLogIdData[debugLogIdOffset++] = + data; + } + else if (info == SOCKET_1) + { + fatal_error_data->DebugLogIdData[debugLogIdOffset++] = + data; + } + } + n++; + } + } + } +} + +void ApmlInterfaceManager::dumpContextInfo( + EFI_AMD_FATAL_ERROR_DATA* fatal_error_data, uint8_t info) +{ + if ((info == SOCKET_1) && (numOfCpu != TWO_SOCKET)) + { + return; + } + + getLastTransAddr(fatal_error_data, info); + + uint8_t blk_id; + + debugLogIdOffset = 0; + + for (blk_id = 0; blk_id < blockId.size(); blk_id++) + { + harvestDebugLogDump(fatal_error_data, info, blockId[blk_id]); + } +} + +template +void ApmlInterfaceManager::harvestMcaDataBanks(uint8_t info, uint16_t numbanks, + uint16_t bytespermca, + CperGenerator& cperGenerator) +{ + uint16_t n = 0; + uint16_t maxOffset32; + uint32_t buffer; + struct mca_bank mca_dump; + oob_status_t ret = OOB_MAILBOX_CMD_UNKNOWN; + bool ValidSignatureID = false; + + int syndOffsetLo = 0; + int syndOffsetHi = 0; + int ipidOffsetLo = 0; + int ipidOffsetHi = 0; + int statusOffsetLo = 0; + int statusOffsetHi = 0; + + uint32_t mcaStatusLo = 0; + uint32_t mcaStatusHi = 0; + uint32_t mcaIpidLo = 0; + uint32_t mcaIpidHi = 0; + uint32_t mcaSyndLo = 0; + uint32_t mcaSyndHi = 0; + + AttributeValue sigIdOffsetVal = getAttribute("SigIdOffset"); + std::vector* sigIDOffset = + std::get_if>(&sigIdOffsetVal); + + AttributeValue apmlRetry = getAttribute("ApmlRetries"); + int64_t* apmlRetryCount = std::get_if(&apmlRetry); + + cperGenerator.dumpCperHeaderSection(rcd, 
FATAL_SECTION_COUNT, + CPER_SEV_FATAL, FATAL_ERR); + + rcd->sectionDescriptor = + new EFI_ERROR_SECTION_DESCRIPTOR[FATAL_SECTION_COUNT]; + std::memset(rcd->sectionDescriptor, 0, + FATAL_SECTION_COUNT * sizeof(EFI_ERROR_SECTION_DESCRIPTOR)); + + rcd->errorRecord = new EFI_AMD_FATAL_ERROR_DATA[FATAL_SECTION_COUNT]; + std::memset(rcd->errorRecord, 0, + FATAL_SECTION_COUNT * sizeof(EFI_AMD_FATAL_ERROR_DATA)); + + cperGenerator.dumpErrorDescriptorSection(rcd, FATAL_SECTION_COUNT, + FATAL_ERR); + + cperGenerator.dumpProcessorErrorSection(rcd, info, cpuId); + + cperGenerator.dumpContextInfo(rcd, numbanks, bytespermca, info, blockId, + ppin, uCode, apmlRetryCount); + + syndOffsetLo = std::stoul((*sigIDOffset)[INDEX_0], nullptr, BASE_16); + syndOffsetHi = std::stoul((*sigIDOffset)[INDEX_1], nullptr, BASE_16); + ipidOffsetLo = std::stoul((*sigIDOffset)[INDEX_2], nullptr, BASE_16); + ipidOffsetHi = std::stoul((*sigIDOffset)[INDEX_3], nullptr, BASE_16); + statusOffsetLo = std::stoul((*sigIDOffset)[INDEX_4], nullptr, BASE_16); + statusOffsetHi = std::stoul((*sigIDOffset)[INDEX_5], nullptr, BASE_16); + + maxOffset32 = ((bytespermca % BYTE_4) ? 
INDEX_1 : INDEX_0) + + (bytespermca >> BYTE_2); + lg2::info("Number of Valid MCA bank: {NUMBANKS}", "NUMBANKS", numbanks); + lg2::info("Number of 32 Bit Words:{MAX_OFFSET}", "MAX_OFFSET", maxOffset32); + + while (n < numbanks) + { + for (int offset = 0; offset < maxOffset32; offset++) + { + memset(&buffer, 0, sizeof(buffer)); + memset(&mca_dump, 0, sizeof(mca_dump)); + mca_dump.index = n; + mca_dump.offset = offset * BYTE_4; + + ret = read_bmc_ras_mca_msr_dump(info, mca_dump, &buffer); + + if (ret != OOB_SUCCESS) + { + while (*apmlRetryCount > 0) + { + memset(&buffer, 0, sizeof(buffer)); + memset(&mca_dump, 0, sizeof(mca_dump)); + mca_dump.index = n; + mca_dump.offset = offset * BYTE_4; + + ret = read_bmc_ras_mca_msr_dump(info, mca_dump, &buffer); + + if (ret == OOB_SUCCESS) + { + break; + } + (*apmlRetryCount)--; + usleep(1000 * 1000); + } + if (ret != OOB_SUCCESS) + { + lg2::error("Socket {SOCKET} : Failed to get MCA bank data " + "from Bank:{N}, Offset:{OFFSET}", + "SOCKET", info, "N", n, "OFFSET", lg2::hex, + offset); + rcd->errorRecord[info].CrashDumpData[n].McaData[offset] = + BAD_DATA; // Write BAADDA7A pattern on error + continue; + } + + } // if (ret != OOB_SUCCESS) + + rcd->errorRecord[info].CrashDumpData[n].McaData[offset] = buffer; + + if (mca_dump.offset == statusOffsetLo) + { + mcaStatusLo = buffer; + } + if (mca_dump.offset == statusOffsetHi) + { + mcaStatusHi = buffer; + + /*Bit 23 and bit 25 of MCA_STATUS_HI + should be set for a valid signature ID*/ + if ((mcaStatusHi & (INDEX_1 << SHIFT_25)) && + (mcaStatusHi & (INDEX_1 << SHIFT_23))) + { + ValidSignatureID = true; + } + } + if (mca_dump.offset == ipidOffsetLo) + { + mcaIpidLo = buffer; + } + if (mca_dump.offset == ipidOffsetHi) + { + mcaIpidHi = buffer; + } + if (mca_dump.offset == syndOffsetLo) + { + mcaSyndLo = buffer; + } + if (mca_dump.offset == syndOffsetHi) + { + mcaSyndHi = buffer; + } + + } // for loop + + if (ValidSignatureID == true) + { + rcd->errorRecord[info].SignatureID[INDEX_0] = 
mcaSyndLo; + rcd->errorRecord[info].SignatureID[INDEX_1] = mcaSyndHi; + rcd->errorRecord[info].SignatureID[INDEX_2] = mcaIpidLo; + rcd->errorRecord[info].SignatureID[INDEX_3] = mcaIpidHi; + rcd->errorRecord[info].SignatureID[INDEX_4] = mcaStatusLo; + rcd->errorRecord[info].SignatureID[INDEX_5] = mcaStatusHi; + + rcd->errorRecord[info].ProcError.ValidFields = + rcd->errorRecord[info].ProcError.ValidFields | + FAILURE_SIGNATURE_ID; + + ValidSignatureID = false; + } + else + { + mcaSyndLo = 0; + mcaSyndHi = 0; + mcaIpidLo = 0; + mcaIpidHi = 0; + mcaStatusLo = 0; + mcaStatusHi = 0; + } + n++; + } +} + +void ApmlInterfaceManager::harvestFatalError(uint8_t info) +{ + std::unique_lock lock(harvest_in_progress_mtx); + + uint16_t bytespermca = 0; + uint16_t numbanks = 0; + bool controlFabricError = false; + bool fchHangError = false; + uint8_t buf; + bool resetReady = false; + + CperGenerator cperGenerator(numOfCpu, progId, familyId, + errCount); + + // Check if APML ALERT is because of RAS + if (read_sbrmi_ras_status(info, &buf) == OOB_SUCCESS) + { + lg2::debug("Read RAS status register. Value: {BUF}", "BUF", buf); + + // check RAS Status Register + if (buf & INT_255) + { + lg2::error("The alert signaled is due to a RAS fatal error"); + + if (buf & SYS_MGMT_CTRL_ERR) + { + /*if RasStatus[reset_ctrl_err] is set in any of the processors, + proceed to cold reset, regardless of the status of the other P + */ + + std::string ras_err_msg = + "Fatal error detected in the control fabric. " + "BMC may trigger a reset based on policy set. "; + + sd_journal_send( + "MESSAGE=%s", ras_err_msg.c_str(), "PRIORITY=%i", LOG_ERR, + "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError", + "REDFISH_MESSAGE_ARGS=%s", ras_err_msg.c_str(), NULL); + + p0AlertProcessed = true; + p1AlertProcessed = true; + controlFabricError = true; + } + else if (buf & RESET_HANG_ERR) + { + std::string ras_err_msg = + "System hang while resetting in syncflood." 
+ "Suggested next step is to do an additional manual " + "immediate reset"; + + sd_journal_send( + "MESSAGE=%s", ras_err_msg.c_str(), "PRIORITY=%i", LOG_ERR, + "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError", + "REDFISH_MESSAGE_ARGS=%s", ras_err_msg.c_str(), NULL); + + fchHangError = true; + } + else if (buf & FATAL_ERROR) + { + std::string ras_err_msg = "RAS FATAL Error detected. " + "System may reset after harvesting " + "MCA data based on policy set. "; + + sd_journal_send( + "MESSAGE=%s", ras_err_msg.c_str(), "PRIORITY=%i", LOG_ERR, + "REDFISH_MESSAGE_ID=%s", "OpenBMC.0.1.CPUError", + "REDFISH_MESSAGE_ARGS=%s", ras_err_msg.c_str(), NULL); + + if (true == + harvestMcaValidityCheck(info, &numbanks, &bytespermca)) + { + harvestMcaDataBanks(info, numbanks, bytespermca, + cperGenerator); + } + } + + if (info == SOCKET_0) + { + p0AlertProcessed = true; + } + + if (info == SOCKET_1) + { + p1AlertProcessed = true; + } + + // Clear RAS status register + // 0x4c is a SB-RMI register acting as write to clear + // check PPR to determine whether potential bug in PPR or in + // implementation of SMU? 
+ + writeRegister(info, RAS_STATUS_REGISTER, buf); + + if (fchHangError == true) + { + // return true; + } + + if (numOfCpu == TWO_SOCKET) + { + if ((p0AlertProcessed == true) && (p1AlertProcessed == true)) + { + resetReady = true; + } + } + else + { + resetReady = true; + } + + if (resetReady == true) + { + if (controlFabricError == false) + { + // Create CPER file and write CPER data to the file + cperGenerator.cperFileWrite(rcd, FATAL_ERR, + FATAL_SECTION_COUNT); + } + + bool recoveryAction = true; + + AttributeValue aifsArmed = getAttribute("AifsArmed"); + bool* aifsArmedFlag = std::get_if(&aifsArmed); + + if ((*aifsArmedFlag == true) && + (checkSignatureIdMatch() == true)) + { + sd_journal_print(LOG_INFO, "AIFS armed for the system\n"); + + std::ifstream inputFile(EVENT_SUBSCRIPTION_FILE); + + /*Check if there is any active subscriptions for + the local AIFS flow*/ + if (inputFile.is_open()) + { + nlohmann::json jsonData; + inputFile >> jsonData; + + if (jsonData.find("Subscriptions") != jsonData.end()) + { + const auto& subscriptionsArray = + jsonData["Subscriptions"]; + if (subscriptionsArray.is_array()) + { + for (const auto& subscription : + subscriptionsArray) + { + const auto& messageIds = + subscription["MessageIds"]; + if (messageIds.is_array()) + { + bool messageIdFound = std::any_of( + messageIds.begin(), + messageIds.end(), + [](const std::string& messageId) { + return messageId == + "AifsFailureMatch"; + }); + if (messageIdFound) + { + recoveryAction = false; + + struct ras_override_delay d_in = { + 0, 0, 0}; + bool ack_resp; + d_in.stop_delay_counter = 1; + oob_status_t ret; + + AttributeValue disableResetCounter = + getAttribute( + "DisableAifsResetOnSyncfloodCounter"); + bool* disableResetCntr = + std::get_if( + &disableResetCounter); + + if (*disableResetCntr == true) + { + sd_journal_print( + LOG_INFO, + "Disable Aifs Delay " + "Reset on Syncflood " + "counter is true. 
" + "Sending Delay Reset " + "on Syncflood override " + "APML command\n"); + ret = + override_delay_reset_on_sync_flood( + info, d_in, &ack_resp); + + if (ret) + { + sd_journal_print( + LOG_ERR, + "Failed to " + "override " + "delay value reset " + "on " + "syncflood " + "Err[%d]: %s \n", + ret, + esmi_get_err_msg(ret)); + } + else + { + sd_journal_print( + LOG_INFO, + "Successfully sent " + "Reset delay on " + "Syncflood " + "command\n"); + } + } + sd_journal_send( + "PRIORITY=%i", LOG_INFO, + "REDFISH_MESSAGE_ID=%s", + "OpenBMC.0.1." + "AifsFailureMatch", + NULL); + break; + } + } + } + } + } + inputFile.close(); + } + } + if (recoveryAction == true) + { + rasRecoveryAction(buf); + } + + if (rcd->sectionDescriptor != nullptr) + { + delete[] rcd->sectionDescriptor; + rcd->sectionDescriptor = nullptr; + } + if (rcd->errorRecord != nullptr) + { + delete[] rcd->errorRecord; + rcd->errorRecord = nullptr; + } + + rcd = nullptr; + + p0AlertProcessed = false; + p1AlertProcessed = false; + } + } + } + else + { + lg2::debug("Nothing to Harvest. Not RAS Error"); + } +} + +std::vector + ApmlInterfaceManager::hexstring_to_vector(const std::string& hexString) +{ + std::vector result; + + // Skip the "0x" prefix if present + size_t start = + (hexString.substr(INDEX_0, INDEX_2) == "0x") ? 
INDEX_2 : INDEX_0; + + // Process the string in chunks of 8 characters (32 bits) + for (size_t i = start; i < hexString.length(); i += INDEX_8) + { + std::string chunk = hexString.substr(i, INDEX_8); + std::istringstream iss(chunk); + uint32_t value = 0; + iss >> std::hex >> value; + if (iss) + { + result.push_back(value); + } + else + { + break; + } + } + + // Pad the result vector with leading zeros if necessary + while (result.size() < 8) + { + result.insert(result.begin(), 0); + } + + return result; +} + +bool ApmlInterfaceManager::compare_with_bitwise_AND( + const uint32_t* Var, const std::string& hexString) +{ + std::vector hexVector = hexstring_to_vector(hexString); + std::vector result(8); + + // Pad the Var array with leading zeros if necessary + std::vector varVector(8); + + std::copy(Var, Var + 8, varVector.begin()); + + // Reverse the order of elements in varVector + std::reverse(varVector.begin(), varVector.end()); + + // Perform the bitwise AND operation + for (size_t i = 0; i < 8; i++) + { + result[i] = varVector[i] & hexVector[i]; + } + + // Compare the result with the original hexVector + return std::equal(result.begin(), result.end(), hexVector.begin(), + hexVector.end()); +} + +bool ApmlInterfaceManager::checkSignatureIdMatch() +{ + bool ret = false; + + AttributeValue configSigId = getAttribute("AifsSignatureId"); + std::map* configSigIdList = + std::get_if>(&configSigId); + + uint32_t P0_tempVar[8]; + std::memcpy(P0_tempVar, SignatureID, sizeof(P0_tempVar)); + + uint32_t P1_tempVar[8]; + std::memcpy(P1_tempVar, SignatureID, sizeof(P1_tempVar)); + + for (const auto& pair : *configSigIdList) + { + bool equal = compare_with_bitwise_AND(P0_tempVar, pair.second); + + if (equal == true) + { + sd_journal_print(LOG_INFO, "Signature ID matched with the config " + "file signature ID list\n"); + ret = true; + break; + } + + equal = compare_with_bitwise_AND(P1_tempVar, pair.second); + if (equal == true) + { + sd_journal_print(LOG_INFO, "Signature ID 
matched with the config " + "file signature ID list\n"); + ret = true; + break; + } + } + return ret; +} diff --git a/src/config_manager.cpp b/src/config_manager.cpp new file mode 100644 index 0000000..c66de71 --- /dev/null +++ b/src/config_manager.cpp @@ -0,0 +1,119 @@ +#include "config_manager.hpp" + +#include "ras.hpp" + +#include + +#include + +/** + * @brief Sets the attribute in the configuration + * + * This function updates the specified attribute in configuration JSON file. + * If the attribute + * is found, its value is updated and saved back to the file. + * + * @param[in] attribute The name of the attribute to set. + * @param[in] value The value to set for the specified attribute. + */ +void RasConfiguration::setAttribute(AttributeName attribute, + AttributeValue value) +{ + nlohmann::json j; + + auto configMap = rasConfigTable(); + + try + { + std::ifstream jsonFile(CONFIG_FILE); + if (!jsonFile.is_open()) + { + throw std::runtime_error("Could not open JSON file"); + } + + jsonFile >> j; + jsonFile.close(); + + bool attributeFound = false; + for (auto& configItem : j["Configuration"]) + { + auto it = configItem.find(attribute); + if (it != configItem.end()) + { + std::visit([&](auto&& arg) { it.value()["Value"] = arg; }, + value); + attributeFound = true; + break; + } + } + if (attributeFound) + { + for (auto& [key, tuple] : configMap) + { + if (key == attribute) + { + std::get<2>(tuple) = value; + break; + } + } + lg2::info("Attribute updated successfully"); + } + else + { + lg2::error("Attribute not found"); + } + } + catch (const std::exception& e) + { + lg2::error("Error : {ERROR}", "ERROR", e.what()); + } + + rasConfigTable(configMap); + + std::ofstream jsonFileOut(CONFIG_FILE); + jsonFileOut << j.dump(4); + jsonFileOut.close(); +} + +/** + * @brief Retrieves the value of a specified attribute from the configuration. + * + * This function searches for the specified attribute in the configuration + * map and returns its value. 
+ * + * @param[in] attribute The name of the attribute to retrieve. + * @return The value of the specified attribute, or a default-constructed + * AttributeValue if not found. + */ +AttributeValue RasConfiguration::getAttribute(AttributeName attribute) +{ + auto configMap = rasConfigTable(); + AttributeValue value; + + for (auto& [key, tuple] : configMap) + { + if (key == attribute) + { + value = std::get<2>(tuple); + break; + } + } + return value; +} + +/** + * @brief Constructor for RasConfiguration + * + * This constructor initializes a RasConfiguration object with a given object + * server and system bus connection. + * + * @param[in] objectServer Reference to an object server for managing D-Bus + * objects. + * @param[in] systemBus Shared pointer to a D-Bus connection. + */ +RasConfiguration::RasConfiguration( + sdbusplus::asio::object_server& objectServer, + std::shared_ptr& systemBus) : + sdbusplus::com::amd::RAS::server::Configuration(*systemBus, objectPath), + objServer(objectServer), systemBus(systemBus) +{} diff --git a/src/cper_generator.cpp b/src/cper_generator.cpp new file mode 100644 index 0000000..6bfb268 --- /dev/null +++ b/src/cper_generator.cpp @@ -0,0 +1,492 @@ +#include "cper_generator.hpp" + +#include "ras.hpp" + +#include + +constexpr int ADDC_GEN_NUMBER_1 = 0x01; +constexpr int ADDC_GEN_NUMBER_2 = 0x02; +constexpr int CPER_RECORD_REV = 0x0100; +constexpr int CPU_ID_VALID = 0x02; +constexpr int CTX_OOB_CRASH = 0x01; +constexpr int LOCAL_APIC_ID_VALID = 0x01; +constexpr int CPER_PRIMARY = 1; +constexpr int CPER_SIG_END = 0xffffffff; +constexpr int CPER_VALID_PLATFORM_ID = 0x0001; +constexpr int CPER_VALID_TIMESTAMP = 0x0002; +constexpr int CPER_VALID_PARTITION_ID = 0x0004; +constexpr int CPER_MINOR_REV = 0x1; +constexpr int FRU_ID_VALID = 0x01; +constexpr int FRU_TEXT_VALID = 0x02; +constexpr int MAX_ERROR_FILE = 10; +constexpr int PROC_CONTEXT_STRUCT_VALID = 0x100; + +EFI_GUID gEfiEventCreatorIdGuid = { + 0x61FA3FAC, + 0xCB80, + 0x4292, + 
{0x8B, 0xFB, 0xD6, 0x43, 0xB1, 0xDE, 0x17, 0xF4}}; + +EFI_GUID gEfiAmdCrashdumpGuid = { + 0x32AC0C78, + 0x2623, + 0x48F6, + {0xB0, 0xD0, 0x73, 0x65, 0x72, 0x5F, 0xD6, 0xAE}}; + +EFI_GUID gEfiEventNotificationTypeMceGuid = { + 0xE8F56FFE, + 0x919C, + 0x4cc5, + {0xBA, 0x88, 0x65, 0xAB, 0xE1, 0x49, 0x13, 0xBB}}; + +template +void CperGenerator::calculateTimeStamp(const std::shared_ptr& data) +{ + using namespace std; + using namespace std::chrono; + typedef duration>::type> days; + + system_clock::time_point now = system_clock::now(); + system_clock::duration tp = now.time_since_epoch(); + + days d = duration_cast(tp); + tp -= d; + hours h = duration_cast(tp); + tp -= h; + minutes m = duration_cast(tp); + tp -= m; + seconds s = duration_cast(tp); + tp -= s; + + time_t tt = system_clock::to_time_t(now); + tm utc_tm = *gmtime(&tt); + + data->header.TimeStamp.Seconds = utc_tm.tm_sec; + data->header.TimeStamp.Minutes = utc_tm.tm_min; + data->header.TimeStamp.Hours = utc_tm.tm_hour; + data->header.TimeStamp.Flag = 1; + data->header.TimeStamp.Day = utc_tm.tm_mday; + data->header.TimeStamp.Month = utc_tm.tm_mon + 1; + data->header.TimeStamp.Year = utc_tm.tm_year; + data->header.TimeStamp.Century = 20 + utc_tm.tm_year / 100; + data->header.TimeStamp.Year = data->header.TimeStamp.Year % 100; +} + +template +void CperGenerator::dumpCperHeaderSection( + const std::shared_ptr& data, uint16_t sectionCount, + uint32_t errorSeverity, std::string errorType) +{ + data->header.SignatureStart = 0x52455043; // CPER + + data->header.Revision = CPER_RECORD_REV; /*(0x100)*/ + + data->header.SignatureEnd = CPER_SIG_END; /*(0xFFFFFFFF)*/ + + /*Number of valid sections associated with the record*/ + data->header.SectionCount = sectionCount; + + /*0 - Non-fatal uncorrected ; 1 - Fatal ; 2 - Corrected*/ + data->header.ErrorSeverity = errorSeverity; + + /*Bit 0 = 1 -> PlatformID field contains valid info + Bit 1 = 1 -> TimeStamp field contains valid info + Bit 2 = 1 -> PartitionID field contains 
valid info*/ + data->header.ValidationBits = + (CPER_VALID_PLATFORM_ID | CPER_VALID_TIMESTAMP); + + /*Size of whole CPER record*/ + if (errorType == FATAL_ERR) + { + data->header.RecordLength = + sizeof(EFI_COMMON_ERROR_RECORD_HEADER) + + (sizeof(EFI_ERROR_SECTION_DESCRIPTOR) * sectionCount) + + (sizeof(EFI_AMD_FATAL_ERROR_DATA) * sectionCount); + } + + /*TimeStamp when OOB controller received the event*/ + calculateTimeStamp(data); + + data->header.PlatformID.Data1 = boardId; + + memcpy(&data->header.CreatorID, &gEfiEventCreatorIdGuid, sizeof(EFI_GUID)); + + memcpy(&data->header.NotificationType, &gEfiEventNotificationTypeMceGuid, + sizeof(EFI_GUID)); + + /*Starts at 1 and increments at each time when cper file is generated*/ + data->header.RecordID = recordId++; +} + +template +void CperGenerator::dumpErrorDescriptorSection( + const std::shared_ptr& data, uint16_t sectionCount, + std::string errorType) +{ + for (int i = 0; i < sectionCount; i++) + { + if (errorType == FATAL_ERR) + { + data->sectionDescriptor[i].SectionOffset = + sizeof(EFI_COMMON_ERROR_RECORD_HEADER) + + (sectionCount * sizeof(EFI_ERROR_SECTION_DESCRIPTOR)) + + (i * sizeof(EFI_AMD_FATAL_ERROR_DATA)); + + data->sectionDescriptor[i].SectionLength = + sizeof(EFI_AMD_FATAL_ERROR_DATA); + + memcpy(&data->sectionDescriptor[i].SectionType, + &gEfiAmdCrashdumpGuid, sizeof(EFI_GUID)); + + data->sectionDescriptor[i].Severity = CPER_SEV_FATAL; + + data->sectionDescriptor[i].FruString[INDEX_0] = 'P'; + data->sectionDescriptor[i].FruString[INDEX_1] = '0' + i; + } + + if (familyId == TURIN_FAMILY_ID) + { + data->sectionDescriptor[i].Revision = + ((((uint16_t)(ADDC_GEN_NUMBER_2 & INT_255) << SHIFT_4) | progId) + << INDEX_8) | + CPER_MINOR_REV; + } + else if (familyId == GENOA_FAMILY_ID) + { + data->sectionDescriptor[i].Revision = + ((((uint16_t)(ADDC_GEN_NUMBER_1 & INT_255) << SHIFT_4) | progId) + << INDEX_8) | + CPER_MINOR_REV; + } + + data->sectionDescriptor[i].SecValidMask = FRU_ID_VALID | FRU_TEXT_VALID; + 
data->sectionDescriptor[i].SectionFlags = CPER_PRIMARY; + } +} + +template +void CperGenerator::dumpProcessorErrorSection( + const std::shared_ptr& fatalPtr, uint8_t info, + CpuId* cpuId) +{ + for (int i = 0; i < numOfCpu; i++) + { + fatalPtr->errorRecord[i].ProcError.ValidFields = + CPU_ID_VALID | LOCAL_APIC_ID_VALID; + fatalPtr->errorRecord[i].ProcError.CpuIdInfo[INDEX_0] = cpuId[i].eax; + fatalPtr->errorRecord[i].ProcError.CpuIdInfo[INDEX_2] = cpuId[i].ebx; + fatalPtr->errorRecord[i].ProcError.CpuIdInfo[INDEX_4] = cpuId[i].ecx; + fatalPtr->errorRecord[i].ProcError.CpuIdInfo[INDEX_6] = cpuId[i].edx; + fatalPtr->errorRecord[i].ProcError.ApicId = + ((cpuId[i].ebx >> SHIFT_24) & INT_255); + + if (i == info) + { + fatalPtr->errorRecord[i].ProcError.ValidFields |= + PROC_CONTEXT_STRUCT_VALID; + } + } +} + +template +void CperGenerator::getLastTransAddr( + const std::shared_ptr& fatalPtr, uint8_t info) +{ + oob_status_t ret; + uint8_t blk_id = 0; + uint16_t n = 0; + uint16_t maxOffset32; + uint32_t data; + struct ras_df_err_chk err_chk; + union ras_df_err_dump df_err = {0}; + + ret = read_ras_df_err_validity_check(info, blk_id, &err_chk); + + if (ret) + { + lg2::error("Failed to read RAS DF validity check"); + } + else + { + if (err_chk.df_block_instances != 0) + { + lg2::info("Harvesting last transaction address"); + + maxOffset32 = ((err_chk.err_log_len % BYTE_4) ? 
INDEX_1 : INDEX_0) + + (err_chk.err_log_len >> BYTE_2); + while (n < err_chk.df_block_instances) + { + for (int offset = 0; offset < maxOffset32; offset++) + { + memset(&data, 0, sizeof(data)); + /* Offset */ + df_err.input[INDEX_0] = offset * BYTE_4; + /* DF block ID */ + df_err.input[INDEX_1] = blk_id; + /* DF block ID instance */ + df_err.input[INDEX_2] = n; + + ret = read_ras_df_err_dump(info, df_err, &data); + + fatalPtr->errorRecord[info] + .DfDumpData.LastTransAddr[n] + .WdtData[offset] = data; + } + n++; + } + } + } +} + +template +void CperGenerator::harvestDebugLogDump( + const std::shared_ptr& fatalPtr, uint8_t info, + uint8_t blk_id, int64_t* apmlRetryCount, uint16_t& debugLogIdOffset) +{ + oob_status_t ret = OOB_MAILBOX_CMD_UNKNOWN; + uint16_t retries = 0; + uint16_t n = 0; + uint16_t maxOffset32; + uint32_t data; + struct ras_df_err_chk err_chk; + union ras_df_err_dump df_err = {0}; + + while (ret != OOB_SUCCESS) + { + retries++; + + ret = read_ras_df_err_validity_check(info, blk_id, &err_chk); + + if (ret == OOB_SUCCESS) + { + lg2::info( + "Socket: {SOCKET},Debug Log ID : {DBG_ID} read successful", + "SOCKET", info, "DBG_ID", blk_id); + break; + } + + if (retries > *apmlRetryCount) + { + lg2::error("Socket: {SOCKET},Debug Log ID : {DBG_ID} read failed", + "SOCKET", info, "DBG_ID", blk_id); + + /*If 5Bh command fails ,0xBAADDA7A is written thrice in the PCIE + * dump region*/ + fatalPtr->errorRecord[info].DebugLogIdData[debugLogIdOffset++] = + blk_id; + fatalPtr->errorRecord[info].DebugLogIdData[debugLogIdOffset++] = + BAD_DATA; + fatalPtr->errorRecord[info].DebugLogIdData[debugLogIdOffset++] = + BAD_DATA; + fatalPtr->errorRecord[info].DebugLogIdData[debugLogIdOffset++] = + BAD_DATA; + + break; + } + } + + if (ret == OOB_SUCCESS) + { + if (err_chk.df_block_instances != 0) + { + uint32_t DbgLogIdHeader = + (static_cast(err_chk.err_log_len) << INDEX_16) | + (static_cast(err_chk.df_block_instances) << INDEX_8) | + static_cast(blk_id); + + 
fatalPtr->errorRecord[info].DebugLogIdData[debugLogIdOffset++] = + DbgLogIdHeader; + + maxOffset32 = ((err_chk.err_log_len % BYTE_4) ? INDEX_1 : INDEX_0) + + (err_chk.err_log_len >> BYTE_2); + + while (n < err_chk.df_block_instances) + { + bool apmlHang = false; + + for (int offset = 0; offset < maxOffset32; offset++) + { + lg2::info("Harvtesing debug log ID dumps"); + + if (apmlHang == false) + { + memset(&data, 0, sizeof(data)); + memset(&df_err, 0, sizeof(df_err)); + + /* Offset */ + df_err.input[INDEX_0] = offset * BYTE_4; + /* DF block ID */ + df_err.input[INDEX_1] = blk_id; + /* DF block ID instance */ + df_err.input[INDEX_2] = n; + + ret = read_ras_df_err_dump(info, df_err, &data); + + if (ret != OOB_SUCCESS) + { + // retry + uint16_t retryCount = *apmlRetryCount; + + while (retryCount > 0) + { + memset(&data, 0, sizeof(data)); + memset(&df_err, 0, sizeof(df_err)); + + /* Offset */ + df_err.input[INDEX_0] = offset * BYTE_4; + /* DF block ID */ + df_err.input[INDEX_1] = blk_id; + /* DF block ID instance */ + df_err.input[INDEX_2] = n; + + ret = read_ras_df_err_dump(info, df_err, &data); + + if (ret == OOB_SUCCESS) + { + break; + } + retryCount--; + usleep(1000 * 1000); + } + + if (ret != OOB_SUCCESS) + { + lg2::error("Failed to read debug log dump for " + "debug log ID : {BLK_ID}", + "BLK_ID", blk_id); + data = BAD_DATA; + /*the Dump APML command fails in the middle of + the iterative loop, then write BAADDA7A for + the remaining iterations in the for loop*/ + apmlHang = true; + } + } + } + + fatalPtr->errorRecord[info] + .DebugLogIdData[debugLogIdOffset++] = data; + } + n++; + } + } + } +} + +template +void CperGenerator::dumpContextInfo( + const std::shared_ptr& fatalPtr, uint16_t numbanks, + uint16_t bytespermca, uint8_t info, std::vector blockId, + uint64_t* ppin, uint32_t* uCode, int64_t* apmlRetryCount) +{ + for (int i = 0; i < numOfCpu; i++) + { + uint8_t blk_id; + + getLastTransAddr(fatalPtr, i); + + uint16_t debugLogIdOffset = 0; + + for (blk_id = 0; 
blk_id < blockId.size(); blk_id++) + { + harvestDebugLogDump(fatalPtr, i, blockId[blk_id], apmlRetryCount, + debugLogIdOffset); + } + + fatalPtr->errorRecord[i].Ppin = ppin[i]; + fatalPtr->errorRecord[i].MicrocodeVersion = uCode[i]; + + if (i == info) + { + fatalPtr->errorRecord[i].RegisterContextType = CTX_OOB_CRASH; + fatalPtr->errorRecord[i].RegisterArraySize = numbanks * bytespermca; + } + } +} + +template +std::string CperGenerator::getCperFilename(int num) +{ + return "ras-error" + std::to_string(num) + ".cper"; +} + +template +void CperGenerator::cperFileWrite(const std::shared_ptr& data, + std::string errorType, + uint16_t sectionCount) +{ + static std::mutex index_file_mtx; + std::unique_lock lock(index_file_mtx); + + std::string cperFileName; + FILE* file; + + std::shared_ptr fatalPtr; + + if constexpr (std::is_same_v) + { + fatalPtr = std::static_pointer_cast(data); + } + + // Generate the CPER file name based on the error count + cperFileName = getCperFilename(errCount); + + /* Iterate through the RAS directory and remove any + existing file with the same name*/ + for (const auto& entry : std::filesystem::directory_iterator(RAS_DIR)) + { + std::string filename = entry.path().filename().string(); + if (filename.size() >= cperFileName.size() && + filename.substr(filename.size() - cperFileName.size()) == + cperFileName) + { + std::filesystem::remove(entry.path()); + } + } + + std::string cperFilePath = RAS_DIR + cperFileName; + + file = fopen(cperFilePath.c_str(), "w"); + + if (errorType == FATAL_ERR) + { + if ((fatalPtr) && (file != NULL)) + { + lg2::info("Generating CPER file for the fatal error"); + + // Write the header of the FatalCperRecord to the file + fwrite(&fatalPtr->header, sizeof(EFI_COMMON_ERROR_RECORD_HEADER), 1, + file); + + // Write the section descriptors to the file + fwrite(fatalPtr->sectionDescriptor, + sizeof(EFI_ERROR_SECTION_DESCRIPTOR) * sectionCount, 1, + file); + + // Write the error record data to the file + 
fwrite(fatalPtr->errorRecord, + sizeof(EFI_AMD_FATAL_ERROR_DATA) * sectionCount, 1, file); + } + } + + fclose(file); + + errCount++; + + if (errCount >= MAX_ERROR_FILE) + { + /*The maximum number of error files supported is 10. + The counter will be rotated once it reaches max count*/ + errCount = (errCount % MAX_ERROR_FILE); + } + + file = fopen(INDEX_FILE, "w"); + if (file != NULL) + { + fprintf(file, "%d", errCount); + fclose(file); + } + + lock.unlock(); +} + +template class CperGenerator; diff --git a/src/interface_manager_base.cpp b/src/interface_manager_base.cpp new file mode 100644 index 0000000..e9a8b45 --- /dev/null +++ b/src/interface_manager_base.cpp @@ -0,0 +1,513 @@ +#include "interface_manager_base.hpp" + +#include + +constexpr int COMMAND_LEN = 3; +constexpr int SYS_MGMT_CTRL_ERR = 0x04; +constexpr char SRC_CONFIG_FILE[] = "/usr/share/ras-config/ras_config.json"; +constexpr char INVENTORY_SERVICE[] = "xyz.openbmc_project.Inventory.Manager"; +constexpr char CPU_INVENTORY_INTERFACE[] = + "xyz.openbmc_project.Inventory.Item.Cpu"; +constexpr char COMMAND_NUM_OF_CPU[] = "/sbin/fw_printenv -n num_of_cpu"; +static const std::string COMMAND_BOARD_ID = "/sbin/fw_printenv -n board_id"; + +void RasManagerBase::getNumberOfCpu() +{ + FILE* pf; + char data[COMMAND_LEN]; + std::stringstream ss; + + pf = popen(COMMAND_NUM_OF_CPU, "r"); + if (pf) + { + if (fgets(data, COMMAND_LEN, pf)) + { + numOfCpu = std::stoi(data); + + lg2::info("Number of Cpu: {CPU}", "CPU", numOfCpu); + cpuId = new CpuId[numOfCpu]; + + uCode = new uint32_t[numOfCpu]; + std::memset(uCode, 0, numOfCpu * sizeof(uint32_t)); + + ppin = new uint64_t[numOfCpu]; + std::memset(ppin, 0, numOfCpu * sizeof(uint64_t)); + + inventoryPath = new std::string[numOfCpu]; + + for (int i = 0; i < numOfCpu; i++) + { + inventoryPath[i] = + "/xyz/openbmc_project/inventory/system/processor/P" + + std::to_string(i); + } + } + else + { + throw std::runtime_error("Error reading data from the process."); + } + pclose(pf); 
+ } + else + { + throw std::runtime_error("Error opening the process."); + } +} + +void RasManagerBase::getBoardId() +{ + FILE* pf; + char data[COMMAND_LEN]; + std::stringstream ss; + + // Setup pipe for reading and execute to get u-boot environment + // variable board_id. + pf = popen(COMMAND_BOARD_ID.data(), "r"); + // Error handling + if (pf) + { + // Get the data from the process execution + if (fgets(data, COMMAND_LEN, pf)) + { + ss << std::hex << (std::string)data; + ss >> boardId; + + lg2::debug("Board ID: {BOARD_ID}", "BOARD_ID", boardId); + } + // the data is now in 'data' + pclose(pf); + } +} + +void RasManagerBase::createIndexFile() +{ + try + { + struct stat buffer; + + // Create the RAS directory if it doesn't exist + if (stat(RAS_DIR, &buffer) != 0) + { + if (mkdir(RAS_DIR, 0777) != 0) + { + throw std::runtime_error( + "Failed to create ras-error-logging directory"); + } + } + + memset(&buffer, 0, sizeof(buffer)); + + // Create or read the index file + if (stat(INDEX_FILE, &buffer) != 0) + { + std::ofstream file(INDEX_FILE); + if (file.is_open()) + { + file << "0"; + file.close(); + } + else + { + throw std::runtime_error("Failed to create index file"); + } + } + else + { + std::ifstream file(INDEX_FILE); + if (file.is_open()) + { + if (!(file >> errCount) || errCount < INDEX_0) + { + throw std::runtime_error( + "Failed to read CPER index number"); + } + file.close(); + } + else + { + throw std::runtime_error("Failed to read from index file"); + } + } + } + catch (const std::exception& e) + { + lg2::error("Exception: {ERROR}", "ERROR", e.what()); + } +} + +void RasManagerBase::createConfigFile() +{ + struct stat buffer; + + /*Create Cdump Config file to store the system recovery*/ + if (stat(CONFIG_FILE, &buffer) != 0) + { + std::string copyCommand = + std::string("cp ") + SRC_CONFIG_FILE + " " + CONFIG_FILE; + + int result = system(copyCommand.c_str()); + if (result != 0) + { + lg2::error("Error copying RAS config file."); + } + } + + std::ifstream 
jsonRead(CONFIG_FILE); + nlohmann::json data = nlohmann::json::parse(jsonRead); + + ConfigTable configMap; + + for (const auto& item : data["Configuration"]) + { + AttributeType attributeType; + std::string key; + std::string description; + std::variant, + std::map> + value; + int64_t maxBoundValue = 0; + + if (item.is_object() && item.size() == 1) + { + key = item.begin().key(); + + const auto& obj = item[key]; + description = obj["Description"]; + if (value.index() == 0) + { + attributeType = sdbusplus::common::com::amd::ras:: + Configuration::AttributeType::Boolean; + } + else if (value.index() == 1) + { + attributeType = sdbusplus::common::com::amd::ras:: + Configuration::AttributeType::String; + } + else if (value.index() == 2) + { + attributeType = sdbusplus::common::com::amd::ras:: + Configuration::AttributeType::Integer; + } + else if (value.index() == 3) + { + attributeType = sdbusplus::common::com::amd::ras:: + Configuration::AttributeType::ArrayOfStrings; + } + else if (value.index() == 4) + { + attributeType = sdbusplus::common::com::amd::ras:: + Configuration::AttributeType::KeyValueMap; + } + + // Determine the type of the value and construct the std::variant + // accordingly + if (obj["Value"].is_boolean()) + { + value = obj["Value"].get(); + } + else if (obj["Value"].is_string()) + { + value = obj["Value"].get(); + } + else if (obj["Value"].is_number_integer()) + { + value = obj["Value"].get(); + } + else if (obj["Value"].is_array()) + { + value = obj["Value"].get>(); + } + else if (obj["Value"].is_object()) + { + value = obj["Value"].get>(); + } + } + + configMap[key] = + std::make_tuple(attributeType, description, value, maxBoundValue); + } + + rasConfigTable(configMap); + + jsonRead.close(); +} + +void RasManagerBase::getMicrocodeRev() +{ + sdbusplus::bus::bus bus = sdbusplus::bus::new_default(); + + for (int i = 0; i < numOfCpu; i++) + { + std::string microCode = getProperty( + bus, INVENTORY_SERVICE, inventoryPath[i].c_str(), + 
CPU_INVENTORY_INTERFACE, "Microcode"); + + if (microCode.empty()) + { + lg2::error("Failed to read ucode revision"); + } + else + { + uCode[i] = std::stoul(microCode, nullptr, BASE_16); + } + } +} + +void RasManagerBase::getPpinFuse() +{ + sdbusplus::bus::bus bus = sdbusplus::bus::new_default(); + + for (int i = 0; i < numOfCpu; i++) + { + std::string Ppin = getProperty( + bus, INVENTORY_SERVICE, inventoryPath[i].c_str(), + CPU_INVENTORY_INTERFACE, "PPIN"); + + if (Ppin.empty()) + { + lg2::error("Failed to read ppin"); + } + else + { + ppin[i] = std::stoul(Ppin, nullptr, BASE_16); + } + } +} + +template +T RasManagerBase::getProperty(sdbusplus::bus::bus& bus, const char* service, + const char* path, const char* interface, + const char* propertyName) +{ + auto method = bus.new_method_call(service, path, + "org.freedesktop.DBus.Properties", "Get"); + method.append(interface, propertyName); + std::variant value{}; + try + { + auto reply = bus.call(method); + reply.read(value); + } + catch (const sdbusplus::exception::SdBusError& ex) + { + lg2::info("GetProperty call failed"); + } + return std::get(value); +} + +void RasManagerBase::requestGPIOEvents( + const std::string& name, const std::function& handler, + gpiod::line& gpioLine, + boost::asio::posix::stream_descriptor& gpioEventDescriptor) +{ + try + { + // Find the GPIO line + gpioLine = gpiod::find_line(name); + if (!gpioLine) + { + throw std::runtime_error("Failed to find GPIO line: " + name); + } + + // Request events for the GPIO line + gpioLine.request( + {"RAS", gpiod::line_request::EVENT_BOTH_EDGES, INDEX_0}); + + // Get the GPIO line file descriptor + int gpioLineFd = gpioLine.event_get_fd(); + if (gpioLineFd < 0) + { + throw std::runtime_error( + "Failed to get GPIO line file descriptor: " + name); + } + + // Assign the file descriptor to gpioEventDescriptor + gpioEventDescriptor.assign(gpioLineFd); + + // Set up asynchronous wait for events + gpioEventDescriptor.async_wait( + 
boost::asio::posix::stream_descriptor::wait_read, + [&name, handler](const boost::system::error_code ec) { + if (ec) + { + throw std::runtime_error( + "Error in fd handler: " + ec.message()); + } + handler(); + }); + } + catch (const std::exception& e) + { + lg2::error("Exception: {ERROR}", "ERROR", e.what()); + } +} + +void RasManagerBase::p0AlertEventHandler() +{ + gpiod::line_event gpioLineEvent = p0_apmlAlertLine.event_read(); + + if (gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE) + { + lg2::debug("Falling Edge: P0 APML Alert received"); + + if (rcd == nullptr) + { + rcd = std::make_shared(); + } + + harvestFatalError(SOCKET_0); + } + p0_apmlAlertEvent.async_wait( + boost::asio::posix::stream_descriptor::wait_read, + [this](const boost::system::error_code ec) { + if (ec) + { + lg2::error("P0 APML alert handler error: {ERROR}", "ERROR", + ec.message().c_str()); + return; + } + p0AlertEventHandler(); + }); +} + +void RasManagerBase::p1AlertEventHandler() +{ + gpiod::line_event gpioLineEvent = p1_apmlAlertLine.event_read(); + + if (gpioLineEvent.event_type == gpiod::line_event::FALLING_EDGE) + { + lg2::debug("Falling Edge: P1 APML Alert received"); + + if (rcd == nullptr) + { + rcd = std::make_shared(); + } + + harvestFatalError(SOCKET_1); + } + p1_apmlAlertEvent.async_wait( + boost::asio::posix::stream_descriptor::wait_read, + [this](const boost::system::error_code ec) { + if (ec) + { + lg2::error("P1 APML alert handler error: {ERROR}", "ERROR", + ec.message().c_str()); + return; + } + p1AlertEventHandler(); + }); +} + +void RasManagerBase::rasRecoveryAction(uint8_t buf) +{ + AttributeValue SystemRecoveryVal = getAttribute("SystemRecovery"); + std::string* SystemRecovery = std::get_if(&SystemRecoveryVal); + + if (*SystemRecovery == "WARM_RESET") + { + if ((buf & SYS_MGMT_CTRL_ERR)) + { + triggerColdReset(); + } + else + { + triggerWarmReset(); + } + } + else if (*SystemRecovery == "COLD_RESET") + { + triggerColdReset(); + } + else if 
(*SystemRecovery == "NO_RESET") + { + lg2::info("NO RESET triggered"); + } + else + { + lg2::error("CdumpResetPolicy is not valid"); + } +} + +void RasManagerBase::triggerRsmrstReset() +{ + boost::system::error_code ec; + boost::asio::io_context io_conn; + auto conn = std::make_shared(io_conn); + + conn->async_method_call( + [](boost::system::error_code ec) { + if (ec) + { + sd_journal_print( + LOG_ERR, "Failed to trigger cold reset of the system\n"); + } + }, + "xyz.openbmc_project.State.Host", + "/xyz/openbmc_project/control/host0/SOCReset", + "xyz.openbmc_project.Control.Host.SOCReset", "SOCReset"); + + sleep(1); + sdbusplus::bus::bus bus = sdbusplus::bus::new_default(); + std::string CurrentHostState = getProperty( + bus, "xyz.openbmc_project.State.Host", + "/xyz/openbmc_project/state/host0", "xyz.openbmc_project.State.Host", + "CurrentHostState"); + + if (CurrentHostState.compare( + "xyz.openbmc_project.State.Host.HostState.Off") == 0) + { + std::string command = "xyz.openbmc_project.State.Host.Transition.On"; + requestHostTransition(command); + } +} + +void RasManagerBase::requestHostTransition(std::string command) +{ + boost::system::error_code ec; + boost::asio::io_context io; + auto conn = std::make_shared(io); + + conn->async_method_call( + [](boost::system::error_code ec) { + if (ec) + { + sd_journal_print( + LOG_ERR, "Failed to trigger cold reset of the system\n"); + } + }, + "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0", + "org.freedesktop.DBus.Properties", "Set", + "xyz.openbmc_project.State.Host", "RequestedHostTransition", + std::variant{command}); +} + +void RasManagerBase::triggerSysReset() +{ + std::string command = "xyz.openbmc_project.State.Host.Transition.Reboot"; + + requestHostTransition(command); +} + +void RasManagerBase::triggerColdReset() +{ + AttributeValue ResetSignalVal = getAttribute("ResetSignal"); + std::string* ResetSignal = std::get_if(&ResetSignalVal); + + if (*ResetSignal == "RSMRST") + { + 
sd_journal_print(LOG_INFO, "RSMRST RESET triggered\n"); + triggerRsmrstReset(); + } + else if (*ResetSignal == "SYS_RST") + { + sd_journal_print(LOG_INFO, "SYS RESET triggered\n"); + triggerSysReset(); + } +} +RasManagerBase::~RasManagerBase() +{ + delete[] cpuId; + delete[] uCode; + delete[] ppin; + delete[] inventoryPath; +} diff --git a/src/main.cpp b/src/main.cpp new file mode 100644 index 0000000..d6ef04e --- /dev/null +++ b/src/main.cpp @@ -0,0 +1,56 @@ +/* + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http:www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "apml_manager.hpp" + +int main() +{ + // Setup connection to D-Bus + boost::asio::io_service io; + + // Create a shared connection to the system bus + auto systemBus = std::make_shared(io); + + // Request a unique name on the D-Bus + systemBus->request_name("com.amd.RAS"); + + // Create an object server for managing D-Bus objects + sdbusplus::asio::object_server objectServer(systemBus); + + RasManagerBase* rasManagerObj = nullptr; + +#ifdef APML + // Create an instance of ApmlInterfaceManager if APML is defined + rasManagerObj = new ApmlInterfaceManager(objectServer, systemBus, io); + + rasManagerObj->init(); + + rasManagerObj->configure(); +#endif + +#ifdef PLDM + // Log an error message if PLDM capabilities are not enabled + lg2::error("TODO: PLDM RAS capabilities are yet to be enabled"); +#endif + + io.run(); + + // Clean up the RAS manager object if it was created + if (rasManagerObj != nullptr) + { + delete rasManagerObj; + } + + return 0; +}