Support mixed precision training #2455

Closed · wants to merge 8 commits

7 changes: 3 additions & 4 deletions api/ccapi/include/tensor_dim.h
@@ -55,7 +55,8 @@ class TensorDim {
QINT4, /** quantized int 4*/
QINT8, /** quantized int 8*/
FP16, /** half precision */
FP32 /** single precision */
FP32, /** single precision */
NONE, /** not specified */
};

/**
@@ -97,9 +98,7 @@ class TensorDim {
*/
TensorType(Format fm, DataType d_type,
StorageOrder order = StorageOrder::ROW_MAJOR) :
format(fm),
data_type(d_type),
storage_order(order){};
format(fm), data_type(d_type), storage_order(order){};
};

/**
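
Note: the new NONE entry is used later in this PR as a "not specified" sentinel when requesting weights (see the finalizeContext hunk in network_graph.cpp below). A small self-contained sketch of that pattern, with a local enum standing in for TensorDim::DataType:

#include <cassert>

// Local stand-in for TensorDim::DataType; NONE means "not specified".
enum class DataType { QINT4, QINT8, FP16, FP32, NONE };

// Pattern from finalizeContext below: pass the activation type only when it
// differs from the weight type, otherwise pass NONE.
DataType extraTypeFor(DataType weight_type, DataType activation_type) {
  return activation_type != weight_type ? activation_type : DataType::NONE;
}

int main() {
  assert(extraTypeFor(DataType::FP32, DataType::FP16) == DataType::FP16);
  assert(extraTypeFor(DataType::FP32, DataType::FP32) == DataType::NONE);
  return 0;
}
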
1 change: 1 addition & 0 deletions debian/nntrainer-dev.install
@@ -23,6 +23,7 @@
/usr/include/nntrainer/layer_context.h
/usr/include/nntrainer/layer_devel.h
/usr/include/nntrainer/layer_impl.h
/usr/include/nntrainer/loss_layer.h
# custom layer kits
/usr/include/nntrainer/app_context.h
# logger
115 changes: 103 additions & 12 deletions nntrainer/graph/network_graph.cpp
@@ -6,6 +6,7 @@
* @date 19 Oct 2020
* @see https://github.com/nnstreamer/nntrainer
* @author Jijoong Moon <[email protected]>
* @author Jiho Chu <[email protected]>
* @bug No known bugs except for NYI items
* @brief This is Network Graph Class for Neural Network
*
@@ -85,6 +86,15 @@ int NetworkGraph::compile(const std::string &loss_type) {
status = checkCompiledGraph();
NN_RETURN_STATUS();

/**
* @note This can be integrated into the addLossLayer method once the loss
* layer is no longer added to the model directly.
*/
for (auto iter = cbegin(); iter != cend(); iter++) {
auto &ln = *iter;
ln->setLossScale(loss_scale);
}

compiled = true;

return status;
@@ -353,10 +363,15 @@ sharedConstTensors NetworkGraph::forwarding(
bool training,
std::function<void(std::shared_ptr<LayerNode>, bool)> forwarding_op,
std::function<bool(void *userdata)> stop_cb, void *userdata) {

for (auto w : clip_weights) {
Collaborator:

I wonder if we have to enable the gradient clip property as well to use mixed precision training. I guess this PR doesn't consider the case where mixed precision and gradient clipping are enabled together.
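
Note: in this revision the master-weight and loss-scale handling is attached to the clip_weights list (the applyMaster loop here and the early return in backwarding), so it takes effect only for weights that have clip-by-global-norm enabled. A generic, self-contained sketch of the step order the PR follows, not the nntrainer API: unscale the gradients, validate them, clip by global norm, then apply them to the FP32 master weights.

#include <cmath>
#include <cstddef>
#include <vector>

// Generic mixed-precision update step: gradients were computed on a loss
// multiplied by loss_scale, so they are divided back, checked for overflow,
// clipped by global norm, and only then applied to the FP32 master weights.
bool applyMixedPrecisionStep(std::vector<float> &master,
                             std::vector<float> &grad, float loss_scale,
                             float clip_norm, float lr) {
  float sq_sum = 0.0f;
  for (float &g : grad) {
    g /= loss_scale; // unscale
    if (!std::isfinite(g))
      return false; // overflow: skip this update and shrink the scale
    sq_sum += g * g;
  }
  const float norm = std::sqrt(sq_sum);
  const float factor = norm > clip_norm ? clip_norm / norm : 1.0f;
  for (std::size_t i = 0; i < master.size(); ++i)
    master[i] -= lr * factor * grad[i]; // SGD-style apply on the master copy
  return true; // caller may now grow the loss scale
}
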

w->applyMaster();
}

for (auto iter = cbegin(); iter != cend() && !stop_cb(userdata); iter++) {
auto &ln = *iter;
PROFILE_TIME_START(profile_keys.at(ln->getType()));
forwarding_op(*iter, training);
forwarding_op(ln, training);
PROFILE_TIME_END(profile_keys.at(ln->getType()));
}

@@ -397,7 +412,7 @@ void NetworkGraph::backwarding(
int iteration,
std::function<void(std::shared_ptr<LayerNode>, int)> &backwarding_op,
std::function<void(Weight &, int)> &apply_grad_clip_op,
std::function<bool(void *userdata)> stop_cb, void *userdata) const {
std::function<bool(void *userdata)> stop_cb, void *userdata) {
/**
* last layer backwarding is run out of this loop
*/
@@ -426,6 +441,60 @@
if (clip_weights.empty())
return;

/**
* Mixed precision training needs gradient clipping and loss scaling,
* because all weights are updated through the clipping path.
* Loss scaling also helps avoid unexpected training results.
*/
auto update_loss_scale = [&](float scale) {
ml_logd("set loss scale = %f", scale);
for (auto iter = cbegin(); iter != cend(); iter++) {
auto &ln = *iter;
ln->setLossScale(scale);
}
loss_scale = scale;
};

auto check_weights = [](std::vector<Weight *> &weights) {
bool valid = true;
for (auto &w : weights) {
auto grad = w->getGradient();
if (grad.checkDataValidation(false) == false) {
grad.setZero();
valid = false;
}
}
return valid;
};

// check that the first layer's derivative is valid
// the loss scale is adjusted between 1.0f and 256.0f
// @todo provide a max scale property
auto &ln = *(cbegin() + 1);
if (loss_scale != 0.0f && !ln->getRunContext().validateDerivatives()) {
@jijoongmoon (Collaborator), Mar 7, 2024:

For performance, we should check for NaN in the derivatives right after they are computed, to save latency: the iteration can resume from that point and we do not need to compute forwarding again from the start. So, how about implementing it after Line 431? After computing backwarding, we can check the derivative, and if it has NaN, we can set the loss scale and recompute forwarding only for the rest of the layers. We could also think about trading memory cost for latency by keeping the activation data from forwarding.
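
Note: a minimal sketch of the flow suggested above, using hypothetical helper names rather than nntrainer API; the derivative is validated immediately after each layer's backward computation so that an overflow aborts the update early.

#include <cmath>
#include <vector>

// Hypothetical helper: true only if every derivative value is finite.
bool derivativesFinite(const std::vector<float> &deriv) {
  for (float v : deriv)
    if (!std::isfinite(v))
      return false;
  return true;
}

// Sketch of the proposed loop (layer_derivative and loss_scale are
// illustrative names, not nntrainer API):
// for each layer, from the last to the first:
//   backwarding_op(layer, iteration);
//   if (!derivativesFinite(layer_derivative)) { // check right after backward
//     loss_scale = loss_scale > 1.5f ? loss_scale - 0.5f : 1.0f; // back off
//     break; // skip the remaining updates for this iteration
//   }
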

// Training results are not applied if the data is invalid
float scale = loss_scale > 1.5f ? loss_scale - 0.5f : 1.0f;
ml_logd(
"Derivative validation failed. Skip applying gradient. loss_scale(%f)",
scale);
check_weights(clip_weights);
update_loss_scale(scale);
return;
} else {
for (unsigned int idx = 0; idx < clip_weights.size(); idx++) {
auto const &w = clip_weights[idx];
w->applyScaler(loss_scale);

if (!check_weights(clip_weights)) {
float scale = loss_scale > 1.5f ? loss_scale - 0.5f : 1.0f;
ml_loge("gradient validation failed. skip update. loss_scale(%f)",
scale);
update_loss_scale(scale);
return;
}
}
}

/** calculate the global norm */
Tensor global_norm_t(
TensorDim({1u, 1u, 1u, (unsigned int)clip_weights.size()}));
@@ -434,6 +503,7 @@
auto const &w = clip_weights[idx];
global_norm_data[idx] = w->getGradientNorm();
}

float global_norm = global_norm_t.l2norm();
/** apply the gradient with the above global norm */
for (auto w : clip_weights) {
@@ -443,6 +513,12 @@
for (auto w : clip_weights) {
apply_grad_clip_op(*w, iteration);
}

// update loss scale
if (loss_scale != 0.0f) {
float scale = loss_scale + 2.0f;
update_loss_scale(scale);
}
}

LayerNode *NetworkGraph::computeBackwardEnd() {
@@ -605,6 +681,14 @@ NetworkGraph::canExecuteInPlace(const std::shared_ptr<LayerNode> &lnode) {
(lnode->getType() == LayerNormalizationLayer::type);
};

/**
* if the layer's input and output type is not FP32, then it cannot be
* inplace. We assume that the input is always FP32.
*/
if (lnode->getInputConnections().empty() &&
!istrequal(getTensorType()[2], "FP32"))
return InPlace::NONE;

/**
* @note Conditions to decide if this layer node can be in-place:
* 1. if the layer is a no-op, then it can operate in-place as it is not
@@ -686,15 +770,6 @@ NetworkGraph::canExecuteInPlace(const std::shared_ptr<LayerNode> &lnode) {
return InPlace::RESTRICTING;
}

/**
* if the layer's input and output type is not FP32, then it cannot be
* inplace. We assume that the input is always FP32.
*/
if (lnode->getInputConnections().empty()) {
if (!istrequal(getTensorType()[2], "FP32"))
return InPlace::NONE;
}

return InPlace::NONE;
}

@@ -876,7 +951,11 @@ NetworkGraph::finalizeContext(const std::shared_ptr<LayerNode> &lnode,
lnode->configureRunContext(
// TODO: update weights spec for trainable based on layer trainable prop
tensor_manager->requestWeights(gnode, init_context.getWeightsSpec(),
lnode->getTrainable(), shared_weight_names),
lnode->getTrainable(), shared_weight_names,
init_context.getActivationDataType() !=
init_context.getWeightDataType()
? init_context.getActivationDataType()
: TensorDim::DataType::NONE),
inputs, outputs,
tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(),
lnode->getTrainable(), shared_tensor_names));
@@ -1551,13 +1630,25 @@ void NetworkGraph::flushCacheExcept(unsigned int order) {
void NetworkGraph::requestOptimizerVariable(
std::function<std::vector<TensorDim>(const TensorDim &)> cb,
bool request_only_trainable) {
bool need_master = !istrequal(getTensorType()[1], getTensorType()[2]);
for (auto const &w : tensor_manager->getWeights()) {
if (w->isGradientLastAccess() && w->hasGradient()) {
const TensorDim &dim = w->getDim();
std::vector<TensorDim> dims = cb(dim);
w->setOptimizerVariables(tensor_manager->requestWeightOptimizerVariables(
dims, w->getName(), TensorLifespan::MAX_LIFESPAN,
w->isGradientClipByGlobalNorm(), Tensor::Initializer::ZEROS));
if (need_master) {
for (auto &dim : dims)
dim.setDataType(
str_converter<enum_class_prop_tag, nntrainer::TensorDataTypeInfo>::
from_string(getTensorType()[1]));
w->setOptimizerMasterVariables(
tensor_manager->requestWeightOptimizerVariables(
dims, w->getName(), TensorLifespan::MAX_LIFESPAN,
w->isGradientClipByGlobalNorm(), Tensor::Initializer::ZEROS,
need_master));
}
}
}
}
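
Note: the loss-scale schedule implemented in NetworkGraph::backwarding above is additive. A failed derivative or gradient validation lowers the scale by 0.5 (floored at 1.0), while a clean step raises it by 2.0; the 256.0f upper bound mentioned in the code comment is still a @todo and is not enforced. A self-contained sketch of just that schedule:

// Additive loss-scale schedule as written in this PR's backwarding path.
float nextLossScale(float loss_scale, bool step_was_valid) {
  if (!step_was_valid)
    return loss_scale > 1.5f ? loss_scale - 0.5f : 1.0f; // back off on failure
  return loss_scale + 2.0f; // grow after a successful update
}
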
13 changes: 9 additions & 4 deletions nntrainer/graph/network_graph.h
@@ -51,7 +51,8 @@ class NetworkGraph {
optimize_memory(true),
exec_mode(ExecutionMode::TRAIN),
tensor_format("NCHW"),
tensor_dtype(split("FP32-FP32", getRegex("\\-"))) {}
tensor_dtype(split("FP32-FP32", getRegex("\\-"))),
loss_scale(0.0f) {}

/**
* @brief Constructor of NeuralNetwork Graph Class
@@ -61,7 +62,8 @@
NetworkGraph(bool enable_swap, const std::string &swap_path = "",
unsigned int lookahead = 0,
const std::string &tensor_format_ = "NCHW",
const std::string &tensor_dtype_ = "FP32-FP32") :
const std::string &tensor_dtype_ = "FP32-FP32",
const float scale = 0.0f) :
tensor_manager(std::make_shared<Manager>(enable_swap, swap_path, lookahead,
tensor_format_, tensor_dtype_)),
graph(),
@@ -73,7 +75,8 @@
optimize_memory(true),
exec_mode(ExecutionMode::TRAIN),
tensor_format(tensor_format_),
tensor_dtype(split(tensor_dtype_, getRegex("\\-"))) {}
tensor_dtype(split(tensor_dtype_, getRegex("\\-"))),
loss_scale(scale) {}

/**
* @brief Destructor of the NeuralNetwork Graph class
@@ -212,7 +215,7 @@ class NetworkGraph {
std::function<void(Weight &, int)> &apply_grad_clip_op,
std::function<bool(void *userdata)> stop_cb =
[](void *user_data) { return false; },
void *user_data = nullptr) const;
void *user_data = nullptr);

/**
* @brief get begin iterator for the graph
@@ -482,6 +485,8 @@ class NetworkGraph {
std::vector<Weight *>
clip_weights; /**< weights with global norm based clipping enabled */

float loss_scale; /**< loss scale factor for the graph */

/**
* @brief topological sort
* @param[in] ith index of LayerNode
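
Note: a hedged usage sketch of the extended constructor; the argument values below, including the "FP16-FP16" dtype string and the 128.0f initial scale, are illustrative assumptions rather than values taken from this PR (the default scale of 0.0f leaves loss scaling disabled).

// Illustrative construction only; argument values are assumptions and the
// internal network_graph.h header is assumed to be available.
nntrainer::NetworkGraph graph(/*enable_swap=*/false, /*swap_path=*/"",
                              /*lookahead=*/0, /*tensor_format_=*/"NCHW",
                              /*tensor_dtype_=*/"FP16-FP16",
                              /*scale=*/128.0f);
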
17 changes: 14 additions & 3 deletions nntrainer/layers/bn_layer.cpp
@@ -111,6 +111,12 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) {
context.requestWeight(dim, bnparams_beta, WeightRegularizer::NONE, 1.0f,
bias_decay, "beta", true);

/**
* @note declare the weight dimension with the activation data type
*/
TensorDim w_dim = dim;
w_dim.setDataType(in_dim.getDataType());

/**
* caches the deviation -> input - avg(input)
* @todo check if avoiding this storage and adding dependency on input (no
@@ -121,7 +127,7 @@
TensorLifespan::ITERATION_LIFESPAN);
/** caches the inverse standard deviation */
wt_idx[BNParams::invstd] =
context.requestTensor(dim, "invstd", Tensor::Initializer::NONE, false,
context.requestTensor(w_dim, "invstd", Tensor::Initializer::NONE, false,
TensorLifespan::ITERATION_LIFESPAN);
/**
* Temporary tensor to store the full sized tensors in order to allow batch
@@ -136,13 +142,13 @@
* caches variance + epsilon as well.
*/
wt_idx[BNParams::cvar] =
context.requestTensor(dim, "cvar", Tensor::Initializer::NONE, false,
context.requestTensor(w_dim, "cvar", Tensor::Initializer::NONE, false,
TensorLifespan::ITERATION_LIFESPAN);
/**
* Temporary tensor to store the reduced tensors along the axes_to_reduce.
*/
wt_idx[BNParams::t_reduced] =
context.requestTensor(dim, "tensor_reduced", Tensor::Initializer::NONE,
context.requestTensor(w_dim, "tensor_reduced", Tensor::Initializer::NONE,
false, TensorLifespan::FORWARD_DERIV_LIFESPAN);
}

@@ -176,6 +182,11 @@ void BatchNormalizationLayer::forwarding(RunLayerContext &context,
Tensor &cvar = context.getTensor(wt_idx[BNParams::cvar]);

if (training) {
t_reduced.setZero();
deviation.setZero();
invstd.setZero();
cvar.setZero();

input_.average(axes_to_reduce, t_reduced);
input_.subtract(t_reduced, deviation);
