[ FSU ] Enables Asynchronous FSU for forwarding
This PR enables asynchronous mode for FSU (Flash Storage Utilization)
to improve forwarding performance.
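
For reference, FSU is driven entirely by model properties on the application
side. A minimal excerpt based on the SimpleFC changes in this diff (withKey is
the app's small key=value helper; the lookahead of 1 and the FP16 tensor type
are simply the values the test app uses, not required settings):

  model->setProperty(
    {withKey("memory_swap", "true"),            // enable FSU
     withKey("memory_swap_lookahead", "1"),     // lookahead window for prefetching
     withKey("model_tensor_type", "FP16-FP16")});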

It splits tensor loading and unloading into separate operations, which
were previously handled together and difficult to manage. It also fixes
the execution order for forwarding in INFERENCE mode and forces the
trainable option to false when requesting weights and tensors.

New functions are added to load and unload tensors, and to check whether
loading and unloading have completed.
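
A rough sketch of how a forwarding loop can drive the new hooks is shown
below. It is illustrative only: LoadTensors, checkLoadComplete, and
UnloadTensors match the NetworkGraph additions in this diff, but the
surrounding loop and the blocking behavior are simplified assumptions, not
the actual scheduler.

  // Illustrative sketch, not the actual nntrainer scheduler: prefetch the
  // next execution order from flash while the current one computes.
  void forwardWithAsyncFSU(nntrainer::NetworkGraph &graph,
                           unsigned int num_orders) {
    graph.LoadTensors(0); // start loading the first order up front
    for (unsigned int order = 0; order < num_orders; ++order) {
      if (order + 1 < num_orders)
        graph.LoadTensors(order + 1); // asynchronous prefetch of the next order
      graph.checkLoadComplete(order); // assumed to block until the load is done
      // ... execute the layers scheduled at this execution order ...
      graph.UnloadTensors(order);     // asynchronous eviction of this order
    }
  }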

It also treats the weight pool and the tensor pool differently according
to the ExecutionMode: FSU is not applied to the tensor pool in INFERENCE
mode.
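
Conceptually, the resulting pool policy can be paraphrased as follows (a
paraphrase for clarity, not code from this diff):

  // Weights may always be swapped via FSU, but the tensor pool stays
  // in memory when the ExecutionMode is INFERENCE.
  bool use_fsu_for_weight_pool = fsu_enabled;
  bool use_fsu_for_tensor_pool =
    fsu_enabled && (exec_mode != ExecutionMode::INFERENCE);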

Resolves:

**Self evaluation:**
1. Build test:	 [X]Passed [ ]Failed [ ]Skipped
2. Run test:	 [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon <[email protected]>
jijoongmoon committed Dec 10, 2024
1 parent 9c6723f commit cd17a66
Showing 21 changed files with 487 additions and 157 deletions.
30 changes: 0 additions & 30 deletions Applications/SimpleFC/README.md

This file was deleted.

80 changes: 38 additions & 42 deletions Applications/SimpleFC/jni/main.cpp
@@ -1,13 +1,12 @@
 // SPDX-License-Identifier: Apache-2.0
 /**
- * Copyright (C) 2020 Jihoon Lee <jhoon.it.lee@samsung.com>
+ * Copyright (C) 2024 Jijoong Moon <jijoong.moon@samsung.com>
  *
  * @file main.cpp
- * @date 24 Jun 2021
- * @todo move resnet model creating to separate sourcefile
- * @brief task runner for the resnet
+ * @date 10 Dec 2024
+ * @brief Test Application for Asynch FSU
  * @see https://github.com/nnstreamer/nntrainer
- * @author Jihoon Lee <jhoon.it.lee@samsung.com>
+ * @author Jijoong Moon <jijoong.moon@samsung.com>
  * @bug No known bugs except for NYI items
  */
 #include <array>
@@ -76,32 +75,32 @@ static std::string withKey(const std::string &key,
 }
 
 /**
- * @brief Create resnet 18
+ * @brief Create network
  *
- * @return vector of layers that contain full graph of resnet18
+ * @return vector of layers that contain full graph of asynch
  */
 std::vector<LayerHandle> createGraph() {
   using ml::train::createLayer;
 
   std::vector<LayerHandle> layers;
 
   layers.push_back(createLayer(
-    "input", {withKey("name", "input0"), withKey("input_shape", "1:1:32")}));
+    "input", {withKey("name", "input0"), withKey("input_shape", "1:1:320")}));
 
-  layers.push_back(
-    createLayer("fully_connected",
-                {withKey("unit", 10)}));
+  layers.push_back(createLayer("fully_connected",
+                               {withKey("unit", 100),
+                                withKey("weight_initializer", "xavier_uniform"),
+                                withKey("bias_initializer", "zeros")}));
 
-  layers.push_back(
-    createLayer("fully_connected",
-                {withKey("unit", 10)}));
+  layers.push_back(createLayer("fully_connected",
+                               {withKey("unit", 100),
+                                withKey("weight_initializer", "xavier_uniform"),
+                                withKey("bias_initializer", "zeros")}));
 
   return layers;
 }
 
-/// @todo update createResnet18 to be more generic
 ModelHandle create() {
   /// @todo support "LOSS : cross" for TF_Lite Exporter
   ModelHandle model = ml::train::createModel(ml::train::ModelType::NEURAL_NET,
                                              {withKey("loss", "mse")});

@@ -126,17 +125,19 @@ int validData_cb(float **input, float **label, bool *last, void *user_data) {
   return 0;
 }
 
-/// @todo maybe make num_class also a parameter
 void createAndRun(unsigned int epochs, unsigned int batch_size,
                   UserDataType &train_user_data,
                   UserDataType &valid_user_data) {
 
   // setup model
   ModelHandle model = create();
-  model->setProperty({withKey("batch_size", batch_size),
-                      withKey("epochs", epochs),
-                      withKey("save_path", "model_full.bin"),
-                      withKey("memory_swap","true")});
+  model->setProperty(
+    {withKey("batch_size", batch_size), withKey("epochs", epochs),
+     // withKey("save_path", "model_full.bin")});
+     // withKey("save_path", "model_full.bin"), withKey("memory_swap",
+     // "true")});
+     withKey("memory_swap", "true"), withKey("memory_swap_lookahead", "1"),
+     withKey("model_tensor_type", "FP16-FP16")});
 
   auto optimizer = ml::train::createOptimizer("sgd", {"learning_rate=0.001"});
   model->setOptimizer(std::move(optimizer));
@@ -156,28 +157,24 @@ void createAndRun(unsigned int epochs, unsigned int batch_size,
   auto dataset_valid = ml::train::createDataset(
     ml::train::DatasetType::GENERATOR, validData_cb, valid_user_data.get());
 
-  // model->setDataset(ml::train::DatasetModeType::MODE_TRAIN,
-  //                   std::move(dataset_train));
-  // model->setDataset(ml::train::DatasetModeType::MODE_VALID,
-  //                   std::move(dataset_valid));
-
-  // if (transfer_learning)
-  //   model->load(pretrained_bin_path);
-  // model->train();
+  // to test asynch fsu, we do need save the model weight data in file
+  model->save("simplefc_weight_fp16_fp16_100.bin",
+              ml::train::ModelFormat::MODEL_FORMAT_BIN);
+  model->load("./simplefc_weight_fp16_fp16_100.bin");
 
   model->summarize(std::cout, ML_TRAIN_SUMMARY_MODEL);
 
-  uint feature_size = 32;
+  uint feature_size = 320;
 
-  float input [32];
-  float label [1];
+  float input[320];
+  float label[1];
 
-  for(uint j=0;j<feature_size;++j)
+  for (uint j = 0; j < feature_size; ++j)
     input[j] = j;
 
-  std::vector<float*> in;
-  std::vector<float*> l;
-  std::vector<float*> answer;
+  std::vector<float *> in;
+  std::vector<float *> l;
+  std::vector<float *> answer;
 
   in.push_back(input);
   l.push_back(label);
@@ -187,19 +184,18 @@ void createAndRun(unsigned int epochs, unsigned int batch_size,
   in.clear();
   l.clear();
 
-  std::cout << "done"<<std::endl;
-
+  std::cout << "done" << std::endl;
 }
 
 std::array<UserDataType, 2>
 createFakeDataGenerator(unsigned int batch_size,
                         unsigned int simulated_data_size,
                         unsigned int data_split) {
   UserDataType train_data(new nntrainer::util::RandomDataLoader(
-    {{batch_size, 1, 1, 32}}, {{batch_size, 1, 1, 10}},
+    {{batch_size, 1, 1, 320}}, {{batch_size, 1, 1, 100}},
     simulated_data_size / data_split));
   UserDataType valid_data(new nntrainer::util::RandomDataLoader(
-    {{batch_size, 1, 1, 32}}, {{batch_size, 1, 1, 10}},
+    {{batch_size, 1, 1, 320}}, {{batch_size, 1, 1, 100}},
     simulated_data_size / data_split));
 
   return {std::move(train_data), std::move(valid_data)};
@@ -231,9 +227,9 @@ int main(int argc, char *argv[]) {
 
   std::string data_dir = "fake";
   uint batch_size = 1;
-  uint data_split =1;
+  uint data_split = 1;
   uint epoch = 1;
 
   std::array<UserDataType, 2> user_datas;
 
   try {
12 changes: 6 additions & 6 deletions Applications/SimpleFC/jni/meson.build
@@ -1,28 +1,28 @@
-resnet_sources = [
+app_sources = [
   'main.cpp',
   cifar_path / 'cifar_dataloader.cpp'
 ]
 
-resnet_dependencies = [app_utils_dep,
+app_dependencies = [app_utils_dep,
   iniparser_dep,
   nntrainer_dep,
   nntrainer_ccapi_dep
 ]
 
 if get_option('enable-test')
-  resnet_dependencies += [gtest_dep]
+  app_dependencies += [gtest_dep]
 endif
 
 e = executable('nntrainer_simplefc',
-  resnet_sources,
+  app_sources,
   include_directories: [include_directories('.'), cifar_include_dir],
-  dependencies: resnet_dependencies,
+  dependencies: app_dependencies,
   install: get_option('install-app'),
   install_dir: application_install_dir
 )
 
 if get_option('enable-long-test')
   testenv = environment()
   testenv.set('OPENBLAS_NUM_THREADS', '4')
-  test('app_resnet18', e, args: ['fake', '1', '128', '1'], env: testenv, timeout: 300)
+  test('app_asynch_fsu', e, args: ['fake', '1', '128', '1'], env: testenv, timeout: 300)
 endif
1 change: 0 additions & 1 deletion meson.build
@@ -90,7 +90,6 @@ if get_option('enable-fp16')
     extra_defines += '-DENABLE_FP16=1'
     extra_defines += '-DUSE__FP16=1'
     extra_defines += '-DUSE_NEON=1'
-    extra_defines += '-DUSE_MMAP=1'
 elif arch == 'aarch64'
   ## About FP16 in GCC (from GCC-9.1 manual)
   # https://gcc.gnu.org/onlinedocs/gcc-9.1.0/gcc/Half-Precision.html
32 changes: 29 additions & 3 deletions nntrainer/graph/network_graph.cpp
@@ -229,6 +229,14 @@ int NetworkGraph::checkCompiledGraph() {
 void NetworkGraph::markNodesForBackwarding() {
   /** accumulate all the nodes which must support backwarding */
   std::unordered_set<std::string> must_support_backwarding;
+  if (exec_mode == ExecutionMode::INFERENCE) {
+    for (auto iter = cbegin(); iter != cend(); iter++) {
+      auto lnode = (*iter);
+      lnode->needsCalcGradient(false);
+      lnode->needsCalcDerivative(false);
+    }
+    return;
+  }
 
   /**
    * if a node is trainable, then all the nodes ahead of it must support
@@ -867,14 +875,16 @@ NetworkGraph::finalizeContext(const std::shared_ptr<LayerNode> &lnode,
   }
   lnode->setDataType(init_context.getWeightDataType(),
                      init_context.getActivationDataType());
+
+  bool trainable = lnode->getTrainable();
+  if (exec_mode == ExecutionMode::INFERENCE)
+    trainable = false;
   lnode->configureRunContext(
     // TODO: update weights spec for trainable based on layer trainable prop
     tensor_manager->requestWeights(gnode, init_context.getWeightsSpec(),
-                                   lnode->getTrainable(), shared_weight_names),
+                                   trainable, shared_weight_names),
     inputs, outputs,
     tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(),
-                                   lnode->getTrainable(), shared_tensor_names),
+                                   trainable, shared_tensor_names),
     init_context.getLossScale());
 
   return outputs;
@@ -1552,6 +1562,22 @@ void NetworkGraph::flushCacheExcept(unsigned int order) {
   tensor_manager->flushCacheExcept(order);
 }
 
+void NetworkGraph::LoadTensors(unsigned int order) {
+  tensor_manager->LoadTensors(order);
+}
+
+bool NetworkGraph::checkLoadComplete(unsigned int order) {
+  return tensor_manager->checkLoadComplete(order);
+}
+
+bool NetworkGraph::checkUnloadComplete(unsigned int order) {
+  return tensor_manager->checkUnloadComplete(order);
+}
+
+void NetworkGraph::UnloadTensors(unsigned int order) {
+  tensor_manager->UnloadTensors(order);
+}
+
 void NetworkGraph::requestOptimizerVariable(
   std::function<std::vector<TensorDim>(const TensorDim &)> cb,
   bool request_only_trainable) {
36 changes: 34 additions & 2 deletions nntrainer/graph/network_graph.h
@@ -370,8 +370,12 @@ class NetworkGraph {
    * @brief Allocate memory for all the managed weights
    */
   void allocateWeights(bool init = true) {
-    tensor_manager->allocateWeights(
-      std::get<3>(backward_iter_end->getExecutionOrder()), init);
+    unsigned int max_exec_order =
+      std::get<3>(backward_iter_end->getExecutionOrder());
+
+    if (exec_mode == ExecutionMode::INFERENCE)
+      max_exec_order = std::get<0>(forward_iter_end->getExecutionOrder());
+    tensor_manager->allocateWeights(max_exec_order, init);
   }
 
   /**
@@ -447,6 +451,34 @@ class NetworkGraph {
    */
   void flushCacheExcept(const unsigned int order);
 
+  /**
+   * @brief Load data of order to the device
+   *
+   * @param order execution order
+   */
+  void LoadTensors(const unsigned int order);
+
+  /**
+   * @brief check data of order is loaded
+   *
+   * @param order execution order
+   */
+  bool checkLoadComplete(const unsigned int order);
+
+  /**
+   * @brief check data of order is unloaded
+   *
+   * @param order execution order
+   */
+  bool checkUnloadComplete(const unsigned int order);
+
+  /**
+   * @brief Unload data of order from the device
+   *
+   * @param order execution order
+   */
+  void UnloadTensors(const unsigned int order);
+
 #ifdef ENABLE_TEST
   /**
    * @brief Get layer node's tensor execution orders
7 changes: 4 additions & 3 deletions nntrainer/layers/layer_node.cpp
@@ -489,7 +489,8 @@ void LayerNode::read(std::ifstream &file, bool opt_var,
 
   for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
     /// @note shared weights are only be read at the first acecss
-    if (run_context->isGradientLastAccess(i)) {
+    // if (run_context->isGradientLastAccess(i)) {
+    if (run_context->isGradientFirstAccess(i)) {
       if (layer->getType() == BatchNormalizationLayer::type) {
         if ((mode == ml::train::ExecutionMode::TRAIN) &&
             (this->getWeightDataType() != TensorDim::DataType::FP32)) {
@@ -526,7 +527,7 @@ void LayerNode::save(std::ofstream &file, bool opt_var,
 
   if (opt_var) {
     for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
-      if (run_context->isGradientLastAccess(i) && getTrainable()) {
+      if (run_context->isGradientFirstAccess(i) && getTrainable()) {
         // @note save optimizer variables
         if (run_context->weightHasGradient(i)) {
           for (unsigned int j = 0; j < run_context->getNumWeightOptVar(i);
@@ -539,7 +540,7 @@ void LayerNode::save(std::ofstream &file, bool opt_var,
   } else {
     // @note shared weights are only be saved at the first access
     for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
-      if (run_context->isGradientLastAccess(i)) {
+      if (run_context->isGradientFirstAccess(i)) {
 
         /** @note For batch normalization layer, we do need full precision for
          * training and the data type of weight is full precision. But for
2 changes: 0 additions & 2 deletions nntrainer/models/model_common_properties.cpp
@@ -34,8 +34,6 @@ MemorySwap::MemorySwap(bool value) { set(value); }
 
 MemorySwapPath::MemorySwapPath(const std::string &value) { set(value); }
 
-MemorySwapMode::MemorySwapMode(const std::string &value) { set(value); }
-
 MemorySwapLookahead::MemorySwapLookahead(const unsigned int &value) {
   set(value);
 }
