diff --git a/.github/scripts/build-gh-pages.sh b/.github/scripts/build-gh-pages.sh new file mode 100644 index 0000000..894d71d --- /dev/null +++ b/.github/scripts/build-gh-pages.sh @@ -0,0 +1,54 @@ +#!/bin/bash + + +set -e + +echo "Installing apt packages" +sudo apt-get update >/dev/null +sudo apt-get install -y wget git cmake graphviz >/dev/null + +echo "Installing Doxygen" +wget -q https://www.doxygen.nl/files/doxygen-1.12.0.linux.bin.tar.gz +tar -xzf doxygen-1.12.0.linux.bin.tar.gz >/dev/null +export PATH="$PWD/doxygen-1.12.0/bin:$PATH" + +#List of branches to build docs for +#TODO: Remove doxygen branch once tested +BRANCHES="doxygen master develop" + +build-docs() ( + #errexit is ignored inside this function when it is called with ||, so stop explicitly on failure + git checkout "$1" || exit 1 + #The CMake Doxygen stuff is weird, and doesn't + #properly clean up and/or overwrite old outputs. + #So to make sure we get the correct doc configs, + #we need to delete everything + #We put the docs themselves into a hidden directory + #so they don't get included in this glob + rm -rf ./* + + cmake ../ -DBUILD_DOCS=ON -DDOCS_ONLY=ON \ + -DFENIX_DOCS_MAN=OFF "-DFENIX_BRANCH=$1" \ + "-DFENIX_DOCS_OUTPUT=$PWD/.docs" || exit 1 + make docs +) + +git clone https://www.github.com/sandialabs/Fenix.git +mkdir Fenix/build +cd Fenix/build + +for branch in $BRANCHES; do + echo "Building docs for $branch" + + #TODO: Fail if any branch fails to build, + # once the develop and master branches have doxygen + # merged in + build-docs "$branch" || true + + echo + echo +done + +if [ -n "$GITHUB_ENV" ]; then + echo "DOCS_DIR=$PWD/.docs" >> "$GITHUB_ENV" +fi diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..1f6609f --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,40 @@ +name: Publish GH Pages + +on: + push: + branches: + - master + - develop + - doxygen # TODO: Remove after testing + +#Only one of this workflow runs at a time +concurrency: + group: docs + cancel-in-progress: true + +jobs: + build-pages: + runs-on: ubuntu-latest + steps: + - 
name: Checkout + uses: actions/checkout@v4 + + - name: Build pages + run: /bin/bash .github/scripts/build-gh-pages.sh + + - name: Upload documentation artifact + uses: actions/upload-pages-artifact@v3 + with: + path: ${{ env.DOCS_DIR }} + + deploy-docs: + needs: build-pages + runs-on: ubuntu-latest + permissions: + pages: write + id-token: write + + steps: + - name: Deploy documentation to GH Pages + uses: actions/deploy-pages@v4 + diff --git a/CMakeLists.txt b/CMakeLists.txt index 170b576..ecaac8b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,8 +16,9 @@ set(FENIX_VERSION_MAJOR 1) set(FENIX_VERSION_MINOR 0) option(BUILD_EXAMPLES "Builds example programs from the examples directory" OFF) -option(BUILD_TESTING "Builds tests and test modes of files" ON) - +option(BUILD_TESTING "Builds tests and test modes of files" ON) +option(BUILD_DOCS "Builds documentation if doxygen is found" ON) +option(DOCS_ONLY "Only build documentation" OFF) #Solves an issue with some system environments putting their MPI headers before #the headers CMake includes. 
Forces non-system MPI headers when incorrect headers @@ -25,28 +26,32 @@ option(BUILD_TESTING "Builds tests and test modes of files" ON) option(FENIX_SYSTEM_INC_FIX "Attempts to force overriding any system MPI headers" ON) option(FENIX_PROPAGATE_INC_FIX "Attempt overriding system MPI headers in linking projects" ON) -find_package(MPI REQUIRED) -if(${FENIX_SYSTEM_INC_FIX}) - include(cmake/systemMPIOverride.cmake) -endif() +if(NOT DOCS_ONLY) + find_package(MPI REQUIRED) + if(FENIX_SYSTEM_INC_FIX) + include(cmake/systemMPIOverride.cmake) + endif() -add_subdirectory(src) + add_subdirectory(src) + include(CTest) + list(APPEND MPIEXEC_PREFLAGS "--with-ft;mpi") -include(CTest) -list(APPEND MPIEXEC_PREFLAGS "--with-ft;mpi") + if(BUILD_EXAMPLES) + add_subdirectory(examples) + endif() -if(BUILD_EXAMPLES) - add_subdirectory(examples) -endif() + if(BUILD_TESTING) + add_subdirectory(test) + endif() - -if(BUILD_TESTING) - add_subdirectory(test) endif() +if(BUILD_DOCS) + add_subdirectory(doc) +endif() configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/include/fenix-config.h.in diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt new file mode 100644 index 0000000..10c780f --- /dev/null +++ b/doc/CMakeLists.txt @@ -0,0 +1,39 @@ +find_package(Doxygen) + +set(FENIX_DOCS_OUTPUT ${CMAKE_CURRENT_BINARY_DIR} CACHE PATH "Documentation output directory") +set(FENIX_DOCS_MAN "YES" CACHE BOOL "Option to disable man page generation for CI builds") +set(FENIX_BRANCH "local" CACHE STRING "Git branch being documented, or local if not building for Github Pages") + +if(NOT DOXYGEN_FOUND) + message(STATUS "Doxygen not found, `make docs` disabled") + return() +endif() + +list(APPEND DOXYGEN_EXAMPLE_PATH markdown) +list(APPEND DOXYGEN_IMAGE_PATH images) + +set(DOXYGEN_USE_MDFILE_AS_MAINPAGE markdown/Introduction.md) +set(DOXYGEN_LAYOUT_FILE DoxygenLayout.xml) +set(DOXYGEN_OUTPUT_DIRECTORY ${FENIX_DOCS_OUTPUT}) + +set(DOXYGEN_GENERATE_MAN ${FENIX_DOCS_MAN}) + +set(DOXYGEN_QUIET YES) 
+set(DOXYGEN_WARN_IF_UNDOCUMENTED NO) +set(DOXYGEN_WARN_IF_DOC_ERROR YES) +set(DOXYGEN_WARN_NO_PARAMDOC YES) +set(DOXYGEN_SHOW_INCLUDE_FILES NO) +set(DOXYGEN_WARN_IF_UNDOC_ENUM_VAL NO) + +list(APPEND DOXYGEN_ALIASES "returnstatus=@return FENIX_SUCCESS if successful, any [return code](@ref ReturnCodes) otherwise.") +list(APPEND DOXYGEN_ALIASES "unimplemented=@qualifier UNIMPLEMENTED @brief @htmlonly @endhtmlonly UNIMPLEMENTED @htmlonly @endhtmlonly") + +add_subdirectory(html) + +doxygen_add_docs(docs + markdown/Introduction.md fake_init.h ../include ../src + ALL + COMMENT "Generate Fenix documentation") +message(STATUS "Run `make docs` to build documentation") + +install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/man DESTINATION ${CMAKE_INSTALL_PREFIX}) diff --git a/doc/DoxygenLayout.xml b/doc/DoxygenLayout.xml new file mode 100644 index 0000000..d636ef1 --- /dev/null +++ b/doc/DoxygenLayout.xml @@ -0,0 +1,265 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/doc/fake_init.h b/doc/fake_init.h new file mode 100644 index 0000000..a9afa16 --- /dev/null +++ b/doc/fake_init.h @@ -0,0 +1,4 @@ +//!@weakgroup ProcessRecovery +//!@{ +void Fenix_Init(int* role, MPI_Comm comm, MPI_Comm* newcomm, int** argc, char*** argv, int spare_ranks, int spawn, MPI_Info info, int* error); +//!@} diff --git a/doc/html/CMakeLists.txt b/doc/html/CMakeLists.txt new file mode 100644 index 0000000..70677f8 --- /dev/null +++ b/doc/html/CMakeLists.txt @@ -0,0 +1,52 @@ +set(DOXYGEN_GENERATE_HTML YES PARENT_SCOPE) + 
+set(DOXYGEN_TOC_INCLUDE_HEADINGS 0 PARENT_SCOPE) +set(DOXYGEN_DISABLE_INDEX YES PARENT_SCOPE) +set(DOXYGEN_GENERATE_TREEVIEW YES PARENT_SCOPE) +set(DOXYGEN_FULL_SIDEBAR NO PARENT_SCOPE) + +file(GLOB CSS_FILES ./*.css) +set(DOXYGEN_HTML_EXTRA_STYLESHEET ${CSS_FILES} PARENT_SCOPE) +set(DOXYGEN_HTML_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/header.html PARENT_SCOPE) + +if(NOT FENIX_BRANCH STREQUAL "local") + message(STATUS "Building documentation for branch ${FENIX_BRANCH}") + set(DOXYGEN_HTML_OUTPUT ${FENIX_BRANCH} PARENT_SCOPE) + #PARENT_SCOPE does not set the variable in this scope; mirror it here for the version list below + set(DOXYGEN_HTML_OUTPUT ${FENIX_BRANCH}) + set(DOXYGEN_PROJECT_NUMBER "@${FENIX_BRANCH}" PARENT_SCOPE) +endif() + +file(GLOB DOC_INDEXES RELATIVE ${DOXYGEN_OUTPUT_DIRECTORY} CONFIGURE_DEPENDS ${DOXYGEN_OUTPUT_DIRECTORY}/*/index.html) +foreach(DOC_INDEX ${DOC_INDEXES}) + string(REGEX REPLACE "/index\\.html$" "" DOC_VERSION ${DOC_INDEX}) + list(APPEND DOC_VERSIONS ${DOC_VERSION}) +endforeach() +if("html" IN_LIST DOC_VERSIONS) + list(REMOVE_ITEM DOC_VERSIONS "html") +endif() + +message(STATUS "Existing documentation versions: ${DOC_VERSIONS}") + +list(APPEND DOC_VERSIONS ${DOXYGEN_HTML_OUTPUT}) +list(REMOVE_DUPLICATES DOC_VERSIONS) +list(SORT DOC_VERSIONS) +if("master" IN_LIST DOC_VERSIONS) + list(REMOVE_ITEM DOC_VERSIONS "master") + list(PREPEND DOC_VERSIONS "master") +endif() + +set(DOC_DEFAULT_VERSION "develop") +if(NOT DOC_DEFAULT_VERSION IN_LIST DOC_VERSIONS) + set(DOC_DEFAULT_VERSION ${FENIX_BRANCH}) +endif() +list(REMOVE_ITEM DOC_VERSIONS ${DOC_DEFAULT_VERSION}) +list(PREPEND DOC_VERSIONS ${DOC_DEFAULT_VERSION}) + +foreach(DOC_VERSION ${DOC_VERSIONS}) + set(DOC_VERSION_SELECT "${DOC_VERSION_SELECT} ") +endforeach() + +configure_file(index.html.in ${DOXYGEN_OUTPUT_DIRECTORY}/index.html) +configure_file(version_selector.html.in ${DOXYGEN_OUTPUT_DIRECTORY}/version_selector.html) +configure_file(version_select_handler.js ${DOXYGEN_OUTPUT_DIRECTORY}/version_select_handler.js COPYONLY) diff --git a/doc/html/DoxygenStyle.css b/doc/html/DoxygenStyle.css new file mode 100644 index 
0000000..770b1c6 --- /dev/null +++ b/doc/html/DoxygenStyle.css @@ -0,0 +1,41 @@ +/*Move qualifiers (e.g. collective, unimplemented) to being above function name instead of bottom right*/ +/* It's too easy to miss as-is, especially the unimplemented tag.*/ +table.mlabels { + direction: rtl; + writing-mode: vertical-rl; +} +/*Undo the weird writing-mode changes at each mlabels table member*/ +table.mlabels td.mlabels-right { + writing-mode: horizontal-tb; + text-align: left; + width: auto; +} +table.mlabels td.mlabels-left { + writing-mode: horizontal-tb; + text-align: left; + width: auto; +} +/*Undo the table direction change in the subtable of function parameters*/ +table.mlabels table.memname { + float: left; + direction: ltr; +} + +/*Make the qualifier labels slightly larger, and bold.*/ +table.mlabels td.mlabels-right span.mlabel { + font-weight: bold; + font-size: 12px; +} + + +/* + * Hide the "UNIMPLEMENTED" tag within the function's detailed description + * It's visible already. +*/ +div.memdoc span.mlabel { + display: none; +} +/*Let long entries in the parameter table break between any two characters*/ +table.params { + word-break: break-all; +} diff --git a/doc/html/header.html b/doc/html/header.html new file mode 100644 index 0000000..edf8ba4 --- /dev/null +++ b/doc/html/header.html @@ -0,0 +1,81 @@ + + + + + + + + +$projectname: $title +$title + + + + + + + + + + + + + + +$treeview +$search +$mathjax +$darkmode + +$extrastylesheet + + + + + +
+ + + +
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
$projectname $projectnumber +
+
$projectbrief
+
+
$projectbrief
+
$searchbox
$searchbox
+
+ + diff --git a/doc/html/index.html.in b/doc/html/index.html.in new file mode 100644 index 0000000..e10ca8c --- /dev/null +++ b/doc/html/index.html.in @@ -0,0 +1,12 @@ + + + + + + + Redirecting... + + +

If you are not redirected automatically, click here.

+ + diff --git a/doc/html/version_select_handler.js b/doc/html/version_select_handler.js new file mode 100644 index 0000000..159d913 --- /dev/null +++ b/doc/html/version_select_handler.js @@ -0,0 +1,19 @@ +$(function () { + var window_location = window.location.pathname.split('/'); + var current_page = window_location.pop(); + var current_version = window_location.pop(); + var base_path = window_location.join('/'); + $.get(base_path + '/version_selector.html', function (data) { + // Inject version selector HTML into the page + $('#projectnumber').html(data); + + // Event listener to handle version selection + document.getElementById('versionSelector').addEventListener('change', function () { + var selectedVersion = this.value; + window.location.href = base_path + '/' + selectedVersion + '/' + current_page + window.location.hash; + }); + + // Set the selected option based on the current version + $('#versionSelector').val(current_version); + }); +}); diff --git a/doc/html/version_selector.html.in b/doc/html/version_selector.html.in new file mode 100644 index 0000000..ca92356 --- /dev/null +++ b/doc/html/version_selector.html.in @@ -0,0 +1,3 @@ +
diff --git a/doc/images/fenix_process_flow.png b/doc/images/fenix_process_flow.png new file mode 100644 index 0000000..e94029a Binary files /dev/null and b/doc/images/fenix_process_flow.png differ diff --git a/doc/markdown/DataRecovery.md b/doc/markdown/DataRecovery.md new file mode 100644 index 0000000..f5e8d3b --- /dev/null +++ b/doc/markdown/DataRecovery.md @@ -0,0 +1,116 @@ +Fenix provides options for redundant storage of application data +to facilitate application data recovery in a transparent manner. +Fenix contains functions to control consistency of collections of +such data, as well as their level of persistence. Functions with +the prefix \c Fenix\_Data\_ perform store, versioning, restore, +and other relevant operations and form the Fenix data recovery API. +The user can select a specific set of application data, identified +by its location in memory, label it using [Fenix_Data_member_create](@ref Fenix_Data_member_create), +and copy it into Fenix's redundant storage space through +[Fenix_Data_member(i)store(v)](@ref Fenix_Data_member_store) at a +point in time. Subsequently, #Fenix_Data_commit finalizes all +preceding Fenix store operations involving this data group and +assigns a unique time stamp to the resulting data *snapshot*, +marking the data as potentially recoverable after a loss of ranks. +Individual pieces of data can then be restored whenever they are +needed with #Fenix_Data_member_restore, for example after a failure +occurs. We note that Fenix's data storage and recovery facility +aims primarily to support in-memory recovery. + +Populating redundant data storage using Fenix may involve the +dispersion of data created by one rank to other ranks within the +system, making the store operation semantically a collective +operation. However, Fenix does not require store operations to be +globally synchronizing. 
For example, execution of + #Fenix_Data_member_store for a particular collection of data +could potentially be finished in some ranks, but not yet in others. +And if certain ranks nominally participating in the storage +operations have no actual data movement responsibility, Fenix is +allowed to let them exit the operation immediately. Consequently, +Fenix data storage functions should not be used for synchronization +purposes. + +Multiple distinct pieces (members) of data assigned to Fenix-managed +redundant storage can be associated with a specific instance of +a Fenix *data group* to form a semantic unit. Committing such a +group ensures that the data involved is available for recovery. + +----- + +## Data Groups + +A Fenix *data group* provides dual functionality. First, it serves +as a container for a set of data objects (*members*) that are +committed together, and hence provides transaction semantics. +Second, it recognizes that #Fenix_Data_member_store is an operation +carried out collectively by a group of ranks, but not necessarily +by all active ranks in the MPI environment. Hence, it adopts the +convenient MPI vehicle of \c communicators to indicate the subset +of ranks involved. Data groups are composed of members that +describe the actual application data and the redundancy policy +to be used for securely storing the members. + +Data groups can and should be recreated after each failure (i.e. do not +conditionally skip the creation after initialization). + +See #Fenix_Data_group_create +for creating a data group. + +----- + +## Data Redundancy Policies + +Fenix internally uses an extensible system for defining data +policies to keep the door open to easily adding new data policies +and configuring them on a per-data-group basis. We currently +support a single, configurable, memory-based policy. 
+ +### In Memory Redundancy Policy (IMR) + +IMR is referenced with the FENIX_DATA_POLICY_IN_MEMORY_RAID definition, +and takes as input an array of integers with the following usage: + +* Mode: (1 or 5) Chooses storage mimicking the given RAID style. +* Separation: Sets the rank separation for groups used to store redundant data. + Users should choose a separation that attempts to ensure the ranks + chosen for grouping are not colocated on nodes/racks to minimize the + chance of multiple ranks in a group failing together. +* GroupSize: For Mode 5 only, sets the size of the parity groups, minimum 3. + +The policy is designed to localize recovery as much as possible. Communication +amongst group members is required (as failure during recovery operations +can lead to inconsistent beliefs about which ranks have recovered data), +but groups without recovering ranks may then all recover locally rather +than communicating further. Groups need not wait for ranks outside of +their group to enter or exit recovery. + +* **Mode 1**: Groups ranks into dyadically paired partners of Rank N and + Rank (N+Separation). For odd-size communicators, a single + group of size 3 will also be formed from the first, middle, and last + ranks. Each rank stores a copy of its own data and a copy of + its partner's. For groups of three, partner data storage is + chained. Should both partners fail (or any two for groups of + three) before recovery operations have completed, data will be + unrecoverable. + + **Memory Usage**: Each rank stores a copy of its own data and of its + partner's data for each timestamp, where checkpoint depth D + stores D+1 checkpoints. Therefore for data size M, + (D+1)*M*2 bytes are used. + + **Computation**: None. + +* **Mode 5**: Groups ranks into parity groups of size GroupSize. + Groups are formed of Rank N, N+Separation, N+2*Separation. + If any two ranks in a group fail before recovery operations + have completed, data will be unrecoverable. 
+ + **Memory Usage**: Each rank stores a copy of its own data and + M/(GroupSize-1) parity bytes per timestamp. Therefore, + (D+1)*M*(GroupSize/(GroupSize-1)) bytes are used. + + **Computation**: O(M) parity bit calculations. + +These options enable users to trade reliability and computation for memory +space, which may be necessary for applications with large memory usage. + diff --git a/doc/markdown/IMR.md b/doc/markdown/IMR.md new file mode 100644 index 0000000..76f852a --- /dev/null +++ b/doc/markdown/IMR.md @@ -0,0 +1,6 @@ +# In Memory Redundancy (IMR) {#md_IMR} + +Fenix supports one data storage policy, IMR, +which stores data through either a RAID-1-like +buddy rank mechanism or a RAID-5-like parity +mechanism. diff --git a/doc/markdown/Introduction.md b/doc/markdown/Introduction.md new file mode 100644 index 0000000..1e8d68b --- /dev/null +++ b/doc/markdown/Introduction.md @@ -0,0 +1,51 @@ +Fenix is a software library compatible with the Message Passing +Interface (MPI) to support fault recovery without application +shutdown. Fenix has two components: process recovery and data +recovery. Process recovery is used to repair communicators whose +ranks suffered failure detected by the MPI runtime. Data recovery +is an optional feature that can be used to implement a +high-performance in-memory checkpoint/restart mechanism. + +Below is a brief overview of these two components, but see the +[Process Recovery](@ref ProcessRecovery) and [Data Recovery](@ref DataRecovery) +topics for more details. + +## Process Recovery + +The core feature of process recovery is creation of a resilient +communicator that will automatically repair itself. This recovery +is achieved by setting aside some number of ranks as *spare ranks*. +When a failure is detected, the spare ranks are used to replace +the failed ranks. + +The exact process of recovery is subject to some nuances of the OpenMPI +ULFM specification, which Fenix is implemented on top of. 
For example, +messages may have locally succeeded while failing on other participating +ranks. + +![An example process flow diagram for recovery using Fenix](fenix_process_flow.png){html: width=300px} + +The default recovery pattern is to perform a `longjmp` to the location of +#Fenix_Init following communicator repairs. This emulates the typical offline +checkpoint/restart pattern, but without the need to restart the application. +However, `longjmp` has some nebulous behavior in many applications. Fenix also +supports a non-jumping recovery pattern. This is more predictable across compilers +and optimizations, but requires checking the return value of every MPI call to +detect failed operations (though communicator repair is still automatic). A +good practice for C++ applications is to use the non-jumping pattern, but add +a Fenix error-handler callback to throw an exception on failure. + +## Data Recovery + +Fenix provides its own redundant data storage API to facilitate +data recovery along with process recovery, but the user can choose +other data recovery options to meet a variety of application needs. +For example, data could be recovered by approximately interpolating +values from unaffected, topologically neighboring ranks instead of +by reading stored redundant data. In addition, the user may decide +to use external libraries such as +[VeloC](https://veloc.readthedocs.io/en/latest/). + +> Any Fenix function without a return type, e.g. #Fenix_Init, may be +> implemented via macros, in which case it cannot be used to resolve +> function pointers. diff --git a/doc/markdown/ProcessRecovery.md b/doc/markdown/ProcessRecovery.md new file mode 100644 index 0000000..56f338a --- /dev/null +++ b/doc/markdown/ProcessRecovery.md @@ -0,0 +1,116 @@ +Process recovery within Fenix can be broken down into three steps: detection, +communicator recovery, and application recovery. 
+ +--- + +## Detecting Failures + +Fenix is built on top of ULFM MPI, so specific fault detection mechanisms and +options can be found in the [ULFM +documentation](https://docs.open-mpi.org/en/v5.0.x/features/ulfm.html#). At a +high level, this means that Fenix will detect failures when an MPI function +call is made which involves a failed rank. Detection is not collectively +consistent, meaning some ranks may fail to complete a collective while other +ranks finish successfully. Once a failure is detected, Fenix will 'revoke' the +communicator that the failed operation was using and the top-level communicator +output by #Fenix_Init (these communicators are usually the same). The +revocation is permanent, and means that all future operations on the +communicator by any rank will fail. This allows knowledge of the failed rank to +be propagated to all ranks in the communicator, even if some ranks would never +have directly communicated with the failed rank. + +Since failures can only be detected during MPI function calls, applications with +long periods of communication-free computation will experience delays in beginning +recovery. Such applications may benefit from inserting periodic calls to +#Fenix_Process_detect_failures to allow ranks to participate in global recovery +operations with less delay. + +Fenix will only detect and respond to failures that occur on the communicator +provided by #Fenix_Init or any communicators derived from it. Faults on other +communicators will, by default, abort the application. Note that having +multiple derived communicators is not currently recommended, and may lead to +deadlock. In fact, even one derived communicator may lead to deadlock if not +used carefully. If you have a use case that requires multiple communicators, +please contact us about your use case -- we can provide guidance and may be +able to update Fenix to support it. 
+ +**Advanced:** Applications may wish to handle some failures themselves - either +ignoring them or implementing custom recovery logic in certain code regions. +This is not generally recommended. Significant care must be taken to ensure +that the application does not attempt to enter two incompatible recovery steps. +However, if you wish to do this, you can include "fenix_ext.h" and manually set +`fenix.ignore_errs` to a non-zero value. This will cause Fenix's error handler +to simply return any errors it encounters as the return code of the application's +MPI function call. Alternatively, applications may temporarily replace the +communicator's error handler to avoid Fenix recovery. If you have a use case +that would benefit from this, you can contact us for guidance and/or to request +some specific error handling features. + +--- + +## Communicator Recovery + +Once a failure has been detected, Fenix will begin the collective process of +rebuilding the resilient communicator provided by #Fenix_Init. There are two +ways to rebuild: replacing failed ranks with spares, or shrinking the +communicator to exclude the failed ranks. If there are any spares available, +Fenix will use those to replace the failed ranks and maintain the original +communicator size and guarantee that surviving processes keep the same rank ID. +If there are not enough spares, some processes may have a different rank ID on +the new communicator, and Fenix will warn the user about this by setting the +error code for #Fenix_Init to #FENIX_WARNING_SPARE_RANKS_DEPLETED. + +**Advanced:** Communicator recovery is collective, blocking, and not +interruptible. ULFM exposes some functions (e.g. MPIX_Comm_agree, +MPIX_Comm_shrink) that are also not interruptible -- meaning they will continue +despite any failures or revocations. If multiple collective, non-interruptible +operations are started by different ranks in different orders, the application +will deadlock. 
This is similar to what would happen if a non-resilient +application called multiple collectives (e.g. `MPI_Allreduce`) in different +orders. However, the preemptive and inconsistent nature of failure recovery +makes it more complex to reason about ordering between ranks. Fenix uses these +ULFM functions internally, so care is taken to ensure that the order of +operations is consistent across ranks. Before any such operation begins, Fenix +first uses MPIX_Comm_agree on the resilient communicator provided by +#Fenix_Init to agree on which 'location' will proceed - if there is any +disagreement, all ranks will enter recovery as if they had detected a failure. +Applications which wish to use these functions themselves should follow this +pattern, providing a unique 'location' value for any operations that may be +interrupted. + +--- + +## Application Recovery + +Once a new communicator has been constructed, application recovery begins. +There are two recovery modes: jumping (default) and non-jumping. With jumping +recovery, Fenix will automatically `longjmp` to the #Fenix_Init call site once +communicator recovery is complete. This allows for very simple recovery logic, +since it mimics the traditional teardown-restart pattern. However, `longjmp` +has many undefined semantics according to the C and C++ specifications and may +result in unexpected behavior due to compiler assumptions and optimizations. +Additionally, some applications may be able to more efficiently recover by +continuing inline. Users can initialize Fenix as non-jumping (see test/no_jump) +to instead return an error code from the triggering MPI function call after +communicator recovery. This may require more intrusive code changes (checking +return statuses of each MPI call). + +Fenix also allows applications to register one or more callback functions with +#Fenix_Callback_register and #Fenix_Callback_pop, which removes the most +recently registered callback. 
These callbacks are invoked after communicator +recovery, just before control returns to the application. Callbacks are +executed in the reverse order they were registered. + +For C++ applications, it is recommended to use Fenix in non-jumping mode and to +register a callback that throws an exception. At its simplest, wrapping +everything between #Fenix_Init and #Fenix_Finalize in a single try-catch can +give the same simple recovery logic as jumping mode, but without the undefined +behavior of `longjmp`. + +#Fenix_Init outputs a role, from #Fenix_Rank_role, which helps inform the +application about the recovery state of the rank. It is important to note that +all spare ranks are captured inside #Fenix_Init until they are used for +recovery. Therefore, after recovery, recovered ranks will not have the same +callbacks registered -- recovered ranks will need to manually invoke any +callbacks that use MPI functions. These roles also help the application more +generally modify its behavior based on each rank's recovery state. diff --git a/include/fenix.h b/include/fenix.h index 77d573b..0a1d783 100644 --- a/include/fenix.h +++ b/include/fenix.h @@ -66,6 +66,18 @@ extern "C" { #include "fenix_data_subset.h" #include "fenix_process_recovery.h" +/** + * @file + * @brief Contains all API function calls and Fenix types. + * This is the only header file a user should include. + */ + +/** + * @defgroup ReturnCodes Return Codes + * @brief All possible return codes from Fenix functions. + * Errors are negative, warnings are positive. 
+ * @{ + */ #define FENIX_SUCCESS 0 #define FENIX_ERROR_UNINITIALIZED -9 #define FENIX_ERROR_NOCATEGORY -10 @@ -91,40 +103,113 @@ extern "C" { #define FENIX_ERROR_CANCELLED -50 #define FENIX_WARNING_SPARE_RANKS_DEPLETED 100 #define FENIX_WARNING_PARTIAL_RESTORE 101 +/**@}*/ -#define FENIX_DATA_GROUP_WORLD_ID 10 -#define FENIX_GROUP_ID_MAX 11 -#define FENIX_TIME_STAMP_MAX 12 -#define FENIX_DATA_MEMBER_ALL 15 -#define FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER 11 -#define FENIX_DATA_MEMBER_ATTRIBUTE_COUNT 12 -#define FENIX_DATA_MEMBER_ATTRIBUTE_DATATYPE 13 -#define FENIX_DATA_MEMBER_ATTRIBUTE_SIZE 14 -#define FENIX_DATA_SNAPSHOT_LATEST -1 -#define FENIX_DATA_SNAPSHOT_ALL 16 -#define FENIX_DATA_SUBSET_CREATED 2 - +//!@internal @brief Agreement code for error handler #define FENIX_ERRHANDLER_LOC 1 +//!@internal @brief Agreement code for finalize #define FENIX_FINALIZE_LOC 2 +//!@internal @brief Agreement code for data commit barrier #define FENIX_DATA_COMMIT_BARRIER_LOC 4 -#define FENIX_DATA_POLICY_IN_MEMORY_RAID 13 +/** + * @defgroup ProcessRecovery Process Recovery + * @brief Functions for managing process recovery in Fenix. + * @details @include{doc} ProcessRecovery.md + * @{ + */ + +/** + * @brief All possible roles returned by Fenix_Init + * + * Describes the current process's state in reference + * to process recovery. + * + * It is important to note that FENIX_ROLE_RECOVERED_RANK + * is only guaranteed to be the value after a single failure, + * so users ought not use the role to directly ensure a valid + * state if they desire to be resilient to failures during their + * failure recovery process. 
+ */ typedef enum { + //!No failures have occurred yet FENIX_ROLE_INITIAL_RANK = 0, + //!This rank was a spare before the most recent failure, or was just spawned FENIX_ROLE_RECOVERED_RANK = 1, + //!This rank was not a spare before the most recent failure FENIX_ROLE_SURVIVOR_RANK = 2 } Fenix_Rank_role; -typedef struct { - MPI_Request mpi_send_req; - MPI_Request mpi_recv_req; -} Fenix_Request; - -extern const Fenix_Data_subset FENIX_DATA_SUBSET_FULL; -extern const Fenix_Data_subset FENIX_DATA_SUBSET_EMPTY; - +/** + * @fn void Fenix_Init(int* role, MPI_Comm comm, MPI_Comm* newcomm, int** argc, char*** argv, int spare_ranks, int spawn, MPI_Info info, int* error); + * @brief Build a resilient communicator and set the restart point. + * + * This function must be called by all ranks in \c comm, after MPI initialization. All calling ranks must + * pass the same values for the parameters \c comm, \c spare_ranks, \c spawn, and \c info. \c Fenix_Init + * must be called exactly once by each rank. This function is used (1) to activate the Fenix library, (2) + * to specify extra resources in case of rank failure, and (3) to create a logical resumption point in case + * of rank failure. + * + * For C, the program may rely on the state of any variables defined and set before the call to \c Fenix_Init. + * But note that the code executed before \c Fenix_Init is executed by all ranks in the system (including spare + * ranks, see below). For C++, the state of objects declared before \c Fenix_Init but within the same scope as + * \c Fenix_Init is compiler-dependent, and it is recommended to place \c Fenix_Init within a subscope excluding + * any variables expected to not be destructed. + * + * It is recommended to access argc and argv only after executing \c Fenix_Init, since command line arguments + * passed to this function that apply to Fenix may be removed by \c Fenix_Init. + * + * \c Fenix_Init is blocking in the following sense. 
If it is entered for the first time via a regular, explicit + * function call, it must be entered by all ranks in communicator \c comm. If it is entered after an error + * intercepted by Fenix (it is the default execution resumption point, see _info below), no ranks are allowed + * to exit from it until all *non-failed* ranks have returned control to it. **Note**: Typically control is + * returned automatically through revocation of the resilient communicator, which means ranks which have long + * delays between MPI function calls or ranks which only use communicators unaffected by failure may lead to + * long delays between a failure and its recovery. + * + * Ranks to be used as spare ranks by Fenix will be available to the application only before \c Fenix_Init, + * or after they are used to replace a failed rank, in which case they turn into active ranks. This document + * refers to the latter as \c RECOVERED ranks (see #Fenix_Rank_role). Note that all spare + * ranks that have not been used to recover from failures (and, therefore, are still reserved by Fenix and kept + * inside \c Fenix_Init) will automatically call \c MPI_Finalize and exit when all active ranks have entered the + * #Fenix_Finalize call. + * + * No Fenix functions may be called before \c Fenix_Init, except #Fenix_Initialized. + * + * @param[out] role The current role of this rank (see #Fenix_Rank_role) + * @param[in] comm The base communicator to construct a resilient communicator from, + * which must include any spare ranks (see below) the user deems necessary. + * MPI_COMM_WORLD is a valid value, but MPI_COMM_SELF is not. + * @param[out] newcomm Resilient output communicator, managed by Fenix and derived + * from comm, to be used by the application instead of comm.
+ * @param[inout] argc Pointer to application main's argc parameter + * @param[inout] argv Pointer to application main's argv parameter + * @param[in] spare_ranks The number of ranks in comm that are exempted by Fenix + * in the construction of the resilient communicator by Fenix_Init. These ranks + * are kept in reserve to substitute for failed ranks. Failed ranks in resilient + * communicators are replaced by spare or spawned ranks. + * @param[in] spawn *Unimplemented*: Whether to enable spawning new ranks to replace + * failed ranks when spares are unavailable. + * @param[in] info Fenix recovery configuration parameters, may be MPI_INFO_NULL. + * Supports the "FENIX_RESUME_MODE" key, used to indicate where execution should resume upon + * rank failure for all active (non-spare) ranks in any resilient communicators, not only for + * those ranks in communicators that failed. The following values associated with the + * "FENIX_RESUME_MODE" key are supported: + * - "Fenix_init" (default): execution resumes at logical exit of Fenix_Init. + * - "NO_JUMP": execution continues from the failing MPI call. Errors are otherwise handled + * as normal, but return the error code as well. Applications should typically + * either check for return codes or assign an error callback through Fenix. + * @param[out] error The return status of \c Fenix_Init
+ * Used to signal that a non-fatal error or special condition was encountered in the execution of + * Fenix_Init, or FENIX_SUCCESS otherwise. It has the same value across all ranks released by + * Fenix_Init. If spawning is explicitly disabled (_spawn equals false) and spare ranks have been + * depleted, Fenix will repair resilience communicators by shrinking them and will report such + * shrinkage in this return parameter through the value FENIX_WARNING_SPARE_RANKS_DEPLETED. + */ + +//!@internal #define Fenix_Init(_role, _comm, _newcomm, _argc, _argv, _spare_ranks, \ _spawn, _info, _error) \ { \ @@ -138,100 +223,475 @@ extern const Fenix_Data_subset FENIX_DATA_SUBSET_EMPTY; __fenix_postinit( _error ); \ } -int Fenix_Initialized(int *); +/** + * @brief Sets flag to true if Fenix_Init has been called, else false. + * @param[out] flag Pointer to the flag to be set. + * @returnstatus + */ +int Fenix_Initialized(int *flag); + +/** + * @brief Register a callback to be invoked after failure process recovery. + * + * This function registers a callback to be invoked after a failure has been recovered by Fenix, + * and right before resuming application execution (e.g. returning from #Fenix_Init by default). + * If this function is called more than once, the different callbacks will be called in the + * reverse order that they were registered (i.e. as a callback stack). + * + * Callback functions are passed the newly-repaired resilient communicator, the error code returned + * by MPI in the communication action which caused a failure recovery, and the user-provided \c void* + * callback data. + * + * Callbacks will only be invoked by survivor ranks, since spare ranks or respawned ranks had no way + * to register them before a failure. + * + * @param[in] recover the callback function to register. + * @param[in] callback_data The user-provided data which will be passed to the callback. 
+ * + * @returnstatus + */ int Fenix_Callback_register(void (*recover)(MPI_Comm, int, void *), void *callback_data); +/** + * @brief Pop the most recently registered callback from the callback stack. + * @returnstatus + */ int Fenix_Callback_pop(); +/** + * @brief Check for any failed ranks + * + * @param[in] do_recovery If true, Fenix will attempt to recover from any detected failures. + * Else, it will ignore any failures and simply return the MPI return code. + * @return MPI_SUCCESS if no failures were detected, else the MPI return code. + */ +int Fenix_Process_detect_failures(int do_recovery); + +//!@unimplemented Returns the number of ranks with a given #Fenix_Rank_role int Fenix_get_number_of_ranks_with_role(int, int *); +//!@unimplemented Returns the #Fenix_Rank_role for a given rank int Fenix_get_role(MPI_Comm comm, int rank, int *role); +/** + * @brief Get the list of ranks that failed in the most recent failure. + * @param[out] fail_list Set to a list of failed ranks. + * @return The number of failed ranks. + */ +int Fenix_Process_fail_list(int** fail_list); + +/** + * @brief Check a pre-recovery request without error + * @param[in] request The request to check + * @param[out] status The status of the request + * @return True if the request was cancelled or has unknown completion status, + * false if it completed successfully. + */ +int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status); + + +/** + * @brief Clean up Fenix state. Each active rank must call \c Fenix_Finalize before exiting. + * + * This function cleans up all Fenix state, if any. If an MPI program using the Fenix library terminates + * normally (i.e. not due to a call to \c MPI_Abort, or an unrecoverable error) then each rank must call + * \c Fenix_Finalize before it exits. It must be called before \c MPI_Finalize, and after #Fenix_Init. + * There shall be no function calls after this function, except #Fenix_Initialized. 
+ * + * As noted in the description of #Fenix_Init, all spare ranks that have not been used to + * recover from failures (and therefore are still reserved by Fenix and kept inside #Fenix_Init) will call + * \c MPI_Finalize and exit when all active ranks have called \c Fenix_Finalize. + * + * **Advice**: Sometimes users may want to remove ranks proactively from the execution, for example because + * monitoring data shows that failure of a rank is imminent or that a rank is executing un-manageably slowly. + * This can be accomplished by calling \c exit on the targeted ranks, followed by an invocation of MPI_Barrier. + * The removed ranks will be reported as failed and error handling will progress appropriately. No calls to finalize + * are needed in this case. + */ int Fenix_Finalize(); -int Fenix_Data_group_create(int group_id, MPI_Comm, int start_time_stamp, +/**@}*/ + + +/** + * @defgroup DataRecovery Data Recovery + * @brief Functions for storing and restoring data in Fenix. + * @details @include{doc} DataRecovery.md + * + * @{ + */ +#define FENIX_DATA_GROUP_WORLD_ID 10 +#define FENIX_GROUP_ID_MAX 11 +#define FENIX_TIME_STAMP_MAX 12 +#define FENIX_DATA_MEMBER_ALL 15 +#define FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER 11 +#define FENIX_DATA_MEMBER_ATTRIBUTE_COUNT 12 +#define FENIX_DATA_MEMBER_ATTRIBUTE_DATATYPE 13 +#define FENIX_DATA_MEMBER_ATTRIBUTE_SIZE 14 +#define FENIX_DATA_SNAPSHOT_LATEST -1 +#define FENIX_DATA_SNAPSHOT_ALL 16 +#define FENIX_DATA_SUBSET_CREATED 2 + +#define FENIX_DATA_POLICY_IN_MEMORY_RAID 13 + +/** + * @unimplemented As MPI_Request, but for Fenix asynchronous data recovery calls + */ +typedef struct { + MPI_Request mpi_send_req; + MPI_Request mpi_recv_req; +} Fenix_Request; + +//!@brief A standin for checkpointing/recovering all available data in a member. +extern const Fenix_Data_subset FENIX_DATA_SUBSET_FULL; + +//!@brief A standin for checkpointing/recovering none of the available data in a member. 
+extern const Fenix_Data_subset FENIX_DATA_SUBSET_EMPTY; + + +/** + * @brief Create a Data Group + * @qualifier collective + * + * If a group with this group_id was already created in the past and has not been deleted, the + * parameters of this call are ignored and this function simply serves to coordinate with any + * ranks that have not yet created this group (e.g. due to a failure). + * + * All calling ranks must pass the same values for the parameters \c group_id, \c comm, + * \c start_time_stamp, \c policy_name, and \c policy_value. + * + * @param group_id A unique identifier to this group. + * @param comm A resilient communicator on which the group is formed. + * @param start_time_stamp The time_stamp to be used for the first commit in this group. + * @param depth + * @parblock + * The number of successive snapshots of this group that are retained by Fenix, in + * addition to the most recent one, and that can be recovered by calling Fenix data member + * restore functions. + * + * For example, a depth of 0 means Fenix will keep only the necessary data to restore the + * most recent snapshot, freeing or overwriting older snapshots automatically. A depth + * of -1 is currently not supported, but would ordinarily indicate that no snapshots should + * be removed automatically. + * @endparblock + * @param policy_name Currently, may only be FENIX_DATA_POLICY_IN_MEMORY_RAID + * @param policy_value Pointer to data passed along to the policy. + * See the specific policy for more information. + * @param flag pointer to store policy-specific status or errors + * @return FENIX_SUCCESS, or an error value. 
+ */ +int Fenix_Data_group_create(int group_id, MPI_Comm comm, int start_time_stamp, int depth, int policy_name, void* policy_value, int* flag); +/** + * @brief Create a data member for store/restore operations + * @qualifier collective + * @qualifier local + * + * All calling ranks in the group's communicator must pass the same values for the parameters + * \c member_id, \c datatype, and \c group_id. + * + * @param group_id Identifier to a data group within which to create the member. + * @param member_id An integer unique within the data group that identifies the data in + * \c buffer. Must be nonnegative and less than FENIX_MEMBER_ID_MAX, which is + * guaranteed to be at least 2^30. + * @param buffer Address of the data to be copied to redundant storage maintained by Fenix. + * Note that this parameter may also be specified using #Fenix_Data_member_attr_set, which + * is critical for non-survivor ranks after a failure, since they hold an invalid address + * that was generated on the failed rank and must update it. + * @param count The maximum number of contiguous elements of type \c datatype of the data to be + * stored. Need not be the same in all calling ranks. + * @param datatype The MPI_Datatype of the elements in \c buffer + * + * @return FENIX_SUCCESS, or an error value. + */ int Fenix_Data_member_create(int group_id, int member_id, void *buffer, int count, MPI_Datatype datatype); +/** + * @brief Get the storage policy of a data group + * + * @param group_id Identifier of the data group to query + * @param policy_name The identifier of the policy name of the data group. + * @param policy_value A location within which to store the policy_values this group's + * policy was configured with. + * @param flag A location set to true if a policy value was extracted, else false. + * @return FENIX_SUCCESS, or an error value.
+ */ int Fenix_Data_group_get_redundancy_policy(int group_id, int* policy_name, void *policy_value, int *flag); +//!@unimplemented Block on completion of the store operation specified by the request. int Fenix_Data_wait(Fenix_Request request); + +//!@unimplemented Query completion of the store operation specified by the request. int Fenix_Data_test(Fenix_Request request, int *flag); + +/** + * @brief Store a particular group member into the group's resilient storage space, in uncommitted storage. + * @qualifier collective + * + * The user can safely modify the member's data buffer after this call, as the current state is copied immediately. + * Multiple calls may be used to incrementally store data (using subset_specifiers), or overwrite old data prior to a commit. + * + * @param group_id All ranks must provide the same group_id + * @param member_id All ranks must provide the same member_id + * @param subset_specifier Which subset of the data to store. It is always valid for every rank to provide the same + * subset_specifier; depending on the group's policy, varying combinations of specifiers may be possible. + * @return FENIX_SUCCESS, or an error value. + */ int Fenix_Data_member_store(int group_id, int member_id, Fenix_Data_subset subset_specifier); -int Fenix_Data_member_storev(int member_id, int group_id, + +//!@unimplemented As [store](#Fenix_Data_member_store), but subsets may vary rank-to-rank. +int Fenix_Data_member_storev(int group_id, int member_id, Fenix_Data_subset subset_specifier); -int Fenix_Data_member_istore(int member_id, int group_id, +//!@unimplemented As [store](#Fenix_Data_member_store), but asynchronous. +int Fenix_Data_member_istore(int group_id, int member_id, Fenix_Data_subset subset_specifier, Fenix_Request *request); -int Fenix_Data_member_istorev(int member_id, int group_id, +//!@unimplemented As [istore](#Fenix_Data_member_istore), but subsets may vary rank-to-rank.
+int Fenix_Data_member_istorev(int group_id, int member_id, Fenix_Data_subset subset_specifier, Fenix_Request *request); +/** + * @brief Commit stored data members to the group's next snapshot. + * @qualifier collective + * @qualifier local + * + * This function is used to freeze the current state of a data group, + * together with all its application data that has been stored in Fenix’ + * redundant storage, and label it with a time stamp, thus creating a + * snapshot of the stored application data. Only data that has been + * committed is eligible for recovery through #Fenix_Data_member_restore. + * An application needs to call #Fenix_Data_wait for all pending asynchronous + * [Fenix_Data_member_istore(v)](@ref Fenix_Data_member_istore) operations + * in the group before committing. + * + * @param[in] group_id The group to commit + * @param[out] time_stamp The time stamp of the new snapshot + * @returnstatus + */ int Fenix_Data_commit(int group_id, int *time_stamp); +/** + * @brief As [commit](#Fenix_Data_commit), but ensures a globally consistent commit. + * @qualifier collective + * + * This function does not function as a traditional barrier. + * The commit will proceed if all *non-failed* ranks reach the barrier. + * This allows for commits to be made when a rank fails after storing all + * of its data into resilient storage. + * + * @param[in] group_id The group to commit + * @param[out] time_stamp The time stamp of the new snapshot + * @returnstatus + */ int Fenix_Data_commit_barrier(int group_id, int *time_stamp); +//!@unimplemented Block until all ranks in the group have reached this point. int Fenix_Data_barrier(int group_id); +/** + * @brief Restore the data of a group member from a snapshot. + * @qualifier collective + * + * All ranks in the group’s resilient communicator must pass the + * same values for the parameters group_id, member_id, and time_stamp. + * This function is used to retrieve data from consistent snapshot + * members. 
This function can only be used if the size of the + * communicator used to store the data is the same as that at the time + * of data recovery (this implies non-shrinking communicator recovery + * in case of a rank loss). + * + * If the size of the buffer needing to receive the recovery data is + * unknown for a particular rank, it can be queried using + * #Fenix_Data_member_attr_get. + * + * @param[in] group_id The group to restore from + * @param[in] member_id The member to restore + * @param[out] target_buffer The buffer to store the restored data + * @param[in] max_count The maximum number of elements to restore + * @param[in] time_stamp The time stamp of the snapshot to restore from + * @param[out] found_data The subset of the data that was found in the snapshot + * @returnstatus + */ int Fenix_Data_member_restore(int group_id, int member_id, void *target_buffer, int max_count, int time_stamp, Fenix_Data_subset* found_data); +/** + * @brief Local-only version of Fenix_Data_member_restore + * + * This function restores the data of a group member from the local + * snapshot. + * + * @param[in] group_id The group to restore from + * @param[in] member_id The member to restore + * @param[out] target_buffer The buffer to store the restored data + * @param[in] max_count The maximum number of elements to restore + * @param[in] time_stamp The time stamp of the snapshot to restore from + * @param[out] found_data The subset of the data that was found in the snapshot + * @returnstatus + */ int Fenix_Data_member_lrestore(int group_id, int member_id, void *target_buffer, int max_count, int time_stamp, Fenix_Data_subset* found_data); +//!@unimplemented As #Fenix_Data_member_restore, but restores from a specific rank's data. int Fenix_Data_member_restore_from_rank(int member_id, void *data, int max_count, int time_stamp, int group_id, int source_rank); +/** + * @brief Create a data subset for use in store operations. 
+ * + * Creates a subset based on num_blocks pairs of + * {start_offset,end_offset}, + * {start_offset+stride,end_offset+stride}, + * {start_offset+2*stride,end_offset+2*stride}, + * etc. + * + * The value of start_offset must be smaller than or equal + * to the value of end_offset to indicate non-negative block + * size. Otherwise, the function returns an error code. + * + * Created subsets must be deleted with #Fenix_Data_subset_delete + * to free memory. + * + * @param[in] num_blocks The number of contiguous data blocks. + * @param[in] start_offset The index of the first element in the first data block. + * @param[in] end_offset The index of the last element in the first data block. + * @param[in] stride Regular shift between successive data blocks. + * @param[out] subset_specifier The created subset. + * @returnstatus + */ int Fenix_Data_subset_create(int num_blocks, int start_offset, int end_offset, int stride, Fenix_Data_subset *subset_specifier); +/** + * @brief As #Fenix_Data_subset_create, but with varying start and end offsets. + * + * Creates a subset based on num_blocks pairs of {start_offset,end_offset}. + * The value of start_offset must be smaller than or equal to end_offset + * to indicate non-negative block size. Otherwise, the function returns an + * error code. + * + * Created subsets must be deleted with #Fenix_Data_subset_delete + * to free memory. + * + * @param[in] num_blocks The number of contiguous data blocks. + * @param[in] array_start_offsets The index of the first element in each data block. + * @param[in] array_end_offsets The index of the last element in each data block. + * @param[out] subset_specifier The created subset. + */ int Fenix_Data_subset_createv(int num_blocks, int *array_start_offsets, int *array_end_offsets, Fenix_Data_subset *subset_specifier); +/** + * @brief Delete a data subset. + * + * Frees the memory associated with a data subset object. + * + * @param[in] subset_specifier The subset to delete. 
+ * @returnstatus + */ int Fenix_Data_subset_delete(Fenix_Data_subset *subset_specifier); +//!@unimplemented Get the number of members in a data group. int Fenix_Data_group_get_number_of_members(int group_id, int *number_of_members); -int Fenix_Data_group_get_member_at_position(int position, int *member_id, - int group_id); - +//!@unimplemented Get member ID based on member index +int Fenix_Data_group_get_member_at_position(int group_id, int *member_id, + int position); + +/** + * @brief Get the number of locally-available snapshots in a data group. + * + * May include snapshots that are inconsistent across the group. + * + * @param[in] group_id The group to query + * @param[out] number_of_snapshots The number of snapshots in the group + * @returnstatus + */ int Fenix_Data_group_get_number_of_snapshots(int group_id, int *number_of_snapshots); +/** + * @brief Get the time stamp of a snapshot at a given index. + * + * Snapshots are indexed in the reverse of the order in which the user committed them + * (e.g. the most recent available snapshot has position=0). + * + * @param[in] group_id The group to query + * @param[in] position The index of the snapshot, which must be in [0, number_of_snapshots) + * @param[out] time_stamp The time stamp of the snapshot + * @returnstatus + */ int Fenix_Data_group_get_snapshot_at_position(int group_id, int position, int *time_stamp); +//!@unimplemented Get the value of a member's attribute. int Fenix_Data_member_attr_get(int group_id, int member_id, int attributename, void *attributevalue, int *flag, int source_rank); +/** + * @brief Set the value of a member's attribute. + * + * Valid names are #FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER, #FENIX_DATA_MEMBER_ATTRIBUTE_COUNT, + * and #FENIX_DATA_MEMBER_ATTRIBUTE_DATATYPE. + * + * The COUNT and DATATYPE attributes may only be set before the first store operation. + * Contrary to the Fenix specification, returning to #Fenix_Init after a failure does not + * allow the user to set these attributes again.
+ * + * @param[in] group_id The group to update + * @param[in] member_id The member to update + * @param[in] attribute_name The attribute to update + * @param[in] attribute_value The new value of the attribute + * @param[out] flag Set to true if the attribute was set, else false + * @returnstatus + */ int Fenix_Data_member_attr_set(int group_id, int member_id, int attribute_name, void *attribute_value, int *flag); +/** + * @brief Delete a snapshot from a data group. + * @qualifier local + * + * @param[in] group_id The group to delete from + * @param[in] time_stamp The time stamp of the snapshot to delete + * @returnstatus + */ int Fenix_Data_snapshot_delete(int group_id, int time_stamp); +/** + * @brief Delete a data group. + * @qualifier local + * + * @param[in] group_id The group to delete + * @returnstatus + */ int Fenix_Data_group_delete(int group_id); +/** + * @brief Delete a data member. + * @qualifier local + * + * @param[in] group_id The group to delete from + * @param[in] member_id The member to delete + * @returnstatus + */ int Fenix_Data_member_delete(int group_id, int member_id); - -int Fenix_Process_fail_list(int** fail_list); - -int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status); - -int Fenix_Process_detect_failures(int do_recovery); +/**@}*/ #if defined(c_plusplus) || defined(__cplusplus) }