diff --git a/Applications/LLaMA/jni/custom_multi_head_attention_layer.cpp b/Applications/LLaMA/jni/custom_multi_head_attention_layer.cpp index 2a7bcbae28..aad10c1f5f 100644 --- a/Applications/LLaMA/jni/custom_multi_head_attention_layer.cpp +++ b/Applications/LLaMA/jni/custom_multi_head_attention_layer.cpp @@ -272,58 +272,58 @@ void MultiHeadAttentionLayer::finalize(InitLayerContext &context) { {batch_size, 1, query_height, num_heads * projected_query_dim_prop}, activation_type); weight_idx[AttentionParams::projected_query] = context.requestTensor( - projected_query_dim, "projected_query", Tensor::Initializer::NONE, true, + projected_query_dim, "projected_query", Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); /** tensor for output of key fc */ TensorDim projected_key_dim( {batch_size, 1, key_height, num_heads * projected_key_dim_prop}, activation_type); - weight_idx[AttentionParams::projected_key] = context.requestTensor( - projected_key_dim, "projected_key", Tensor::Initializer::NONE, true, - TensorLifespan::ITERATION_LIFESPAN); + weight_idx[AttentionParams::projected_key] = + context.requestTensor(projected_key_dim, "projected_key", Initializer::NONE, + true, TensorLifespan::ITERATION_LIFESPAN); /** tensor for output of value fc */ TensorDim projected_value_dim( {batch_size, 1, value_height, num_heads * projected_value_dim_prop}, activation_type); weight_idx[AttentionParams::projected_value] = context.requestTensor( - projected_value_dim, "projected_value", Tensor::Initializer::NONE, true, + projected_value_dim, "projected_value", Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); TensorDim cache_key_dim( {batch_size, 1, max_timestep, num_heads * projected_key_dim_prop}, activation_type); weight_idx[AttentionParams::cache_key] = - context.requestTensor(cache_key_dim, "cache_key", Tensor::Initializer::NONE, - true, TensorLifespan::MAX_LIFESPAN); + context.requestTensor(cache_key_dim, "cache_key", Initializer::NONE, true, + TensorLifespan::MAX_LIFESPAN); TensorDim cache_value_dim( {batch_size, 1, max_timestep, num_heads * projected_value_dim_prop}, activation_type); - weight_idx[AttentionParams::cache_value] = context.requestTensor( - cache_value_dim, "cache_value", Tensor::Initializer::NONE, true, - TensorLifespan::MAX_LIFESPAN); + weight_idx[AttentionParams::cache_value] = + context.requestTensor(cache_value_dim, "cache_value", Initializer::NONE, + true, TensorLifespan::MAX_LIFESPAN); if (provide_attention_mask) { /** Intended comment for bool type mask */ // TensorDim attention_mask_dim( // {batch_size, num_heads, query_height, key_height}); // weight_idx[AttentionParams::attention_mask] = context.requestTensor( - // attention_mask_dim, "attention_mask", Tensor::Initializer::NONE, false, + // attention_mask_dim, "attention_mask", Initializer::NONE, false, // TensorLifespan::FORWARD_FUNC_LIFESPAN); } /** tensor for attention weight */ TensorDim attention_weight_dim( {batch_size, num_heads, query_height, key_height}, activation_type); weight_idx[AttentionParams::attention_weight] = context.requestTensor( - attention_weight_dim, "attention_weight", Tensor::Initializer::NONE, true, + attention_weight_dim, "attention_weight", Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); if (dropout_rate > epsilon) { /** tensor for dropout mask */ TensorDim dropout_mask_dim( {batch_size, num_heads, query_height, key_height}, activation_type); - weight_idx[AttentionParams::dropout_mask] = context.requestTensor( - dropout_mask_dim, "dropout_mask", 
Tensor::Initializer::NONE, false, - TensorLifespan::ITERATION_LIFESPAN); + weight_idx[AttentionParams::dropout_mask] = + context.requestTensor(dropout_mask_dim, "dropout_mask", Initializer::NONE, + false, TensorLifespan::ITERATION_LIFESPAN); } /** tensor for attention output */ @@ -331,7 +331,7 @@ void MultiHeadAttentionLayer::finalize(InitLayerContext &context) { {batch_size, 1, query_height, num_heads * projected_value_dim_prop}, activation_type); weight_idx[AttentionParams::attention_output] = context.requestTensor( - attention_output_dim, "attention_output", Tensor::Initializer::NONE, true, + attention_output_dim, "attention_output", Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); TensorDim output_dim({batch_size, 1, query_height, output_shape}, diff --git a/Applications/LLaMA/jni/rms_norm.h b/Applications/LLaMA/jni/rms_norm.h index 1180db82e6..8f769527ab 100644 --- a/Applications/LLaMA/jni/rms_norm.h +++ b/Applications/LLaMA/jni/rms_norm.h @@ -38,8 +38,8 @@ class RMS_NORM_GAMMA_INIT final /** * @brief Construct a RMS_NORM_GAMMA_INIT object */ - RMS_NORM_GAMMA_INIT(nntrainer::Tensor::Initializer value = - nntrainer::Tensor::Initializer::ONES) { + RMS_NORM_GAMMA_INIT( + nntrainer::Initializer value = nntrainer::Initializer::ONES) { set(value); }; diff --git a/Applications/YOLOv2/jni/yolo_v2_loss.cpp b/Applications/YOLOv2/jni/yolo_v2_loss.cpp index 8421dd24ee..67b262d283 100644 --- a/Applications/YOLOv2/jni/yolo_v2_loss.cpp +++ b/Applications/YOLOv2/jni/yolo_v2_loss.cpp @@ -319,141 +319,136 @@ void YoloV2LossLayer::finalize(nntrainer::InitLayerContext &context) { nntrainer::TensorDim bbox_x_pred_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV2LossParams::bbox_x_pred] = context.requestTensor( - bbox_x_pred_dim, "bbox_x_pred", nntrainer::Tensor::Initializer::NONE, true, + bbox_x_pred_dim, "bbox_x_pred", nntrainer::Initializer::NONE, true, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_y_pred_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV2LossParams::bbox_y_pred] = context.requestTensor( - bbox_y_pred_dim, "bbox_y_pred", nntrainer::Tensor::Initializer::NONE, true, + bbox_y_pred_dim, "bbox_y_pred", nntrainer::Initializer::NONE, true, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_w_pred_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV2LossParams::bbox_w_pred] = context.requestTensor( - bbox_w_pred_dim, "bbox_w_pred", nntrainer::Tensor::Initializer::NONE, true, + bbox_w_pred_dim, "bbox_w_pred", nntrainer::Initializer::NONE, true, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_h_pred_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV2LossParams::bbox_h_pred] = context.requestTensor( - bbox_h_pred_dim, "bbox_h_pred", nntrainer::Tensor::Initializer::NONE, true, + bbox_h_pred_dim, "bbox_h_pred", nntrainer::Initializer::NONE, true, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim confidence_pred_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); - wt_idx[YoloV2LossParams::confidence_pred] = - context.requestTensor(confidence_pred_dim, "confidence_pred", - nntrainer::Tensor::Initializer::NONE, true, - nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); + wt_idx[YoloV2LossParams::confidence_pred] = context.requestTensor( + confidence_pred_dim, "confidence_pred", nntrainer::Initializer::NONE, true, + 
nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim class_pred_dim(batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, class_number); wt_idx[YoloV2LossParams::class_pred] = context.requestTensor( - class_pred_dim, "class_pred", nntrainer::Tensor::Initializer::NONE, true, + class_pred_dim, "class_pred", nntrainer::Initializer::NONE, true, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_w_pred_anchor_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); - wt_idx[YoloV2LossParams::bbox_w_pred_anchor] = - context.requestTensor(bbox_w_pred_anchor_dim, "bbox_w_pred_anchor", - nntrainer::Tensor::Initializer::NONE, false, - nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); + wt_idx[YoloV2LossParams::bbox_w_pred_anchor] = context.requestTensor( + bbox_w_pred_anchor_dim, "bbox_w_pred_anchor", nntrainer::Initializer::NONE, + false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_h_pred_anchor_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); - wt_idx[YoloV2LossParams::bbox_h_pred_anchor] = - context.requestTensor(bbox_h_pred_anchor_dim, "bbox_h_pred_anchor", - nntrainer::Tensor::Initializer::NONE, false, - nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); + wt_idx[YoloV2LossParams::bbox_h_pred_anchor] = context.requestTensor( + bbox_h_pred_anchor_dim, "bbox_h_pred_anchor", nntrainer::Initializer::NONE, + false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_x_gt_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV2LossParams::bbox_x_gt] = context.requestTensor( - bbox_x_gt_dim, "bbox_x_gt", nntrainer::Tensor::Initializer::NONE, false, + bbox_x_gt_dim, "bbox_x_gt", nntrainer::Initializer::NONE, false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_y_gt_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV2LossParams::bbox_y_gt] = context.requestTensor( - bbox_y_gt_dim, "bbox_y_gt", nntrainer::Tensor::Initializer::NONE, false, + bbox_y_gt_dim, "bbox_y_gt", nntrainer::Initializer::NONE, false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_w_gt_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV2LossParams::bbox_w_gt] = context.requestTensor( - bbox_w_gt_dim, "bbox_w_gt", nntrainer::Tensor::Initializer::NONE, false, + bbox_w_gt_dim, "bbox_w_gt", nntrainer::Initializer::NONE, false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_h_gt_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV2LossParams::bbox_h_gt] = context.requestTensor( - bbox_h_gt_dim, "bbox_h_gt", nntrainer::Tensor::Initializer::NONE, false, + bbox_h_gt_dim, "bbox_h_gt", nntrainer::Initializer::NONE, false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim confidence_gt_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV2LossParams::confidence_gt] = context.requestTensor( - confidence_gt_dim, "confidence_gt", nntrainer::Tensor::Initializer::NONE, - false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); + confidence_gt_dim, "confidence_gt", nntrainer::Initializer::NONE, false, + nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim class_gt_dim(batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, class_number); wt_idx[YoloV2LossParams::class_gt] = 
context.requestTensor( - class_gt_dim, "class_gt", nntrainer::Tensor::Initializer::NONE, false, + class_gt_dim, "class_gt", nntrainer::Initializer::NONE, false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_class_mask_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); - wt_idx[YoloV2LossParams::bbox_class_mask] = - context.requestTensor(bbox_class_mask_dim, "bbox_class_mask", - nntrainer::Tensor::Initializer::NONE, false, - nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); + wt_idx[YoloV2LossParams::bbox_class_mask] = context.requestTensor( + bbox_class_mask_dim, "bbox_class_mask", nntrainer::Initializer::NONE, false, + nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim iou_mask_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV2LossParams::iou_mask] = context.requestTensor( - iou_mask_dim, "iou_mask", nntrainer::Tensor::Initializer::NONE, false, + iou_mask_dim, "iou_mask", nntrainer::Initializer::NONE, false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox1_width_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV2LossParams::bbox1_width] = context.requestTensor( - bbox1_width_dim, "bbox1_width", nntrainer::Tensor::Initializer::NONE, false, + bbox1_width_dim, "bbox1_width", nntrainer::Initializer::NONE, false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox1_height_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV2LossParams::bbox1_height] = context.requestTensor( - bbox1_height_dim, "bbox1_height", nntrainer::Tensor::Initializer::NONE, - false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); + bbox1_height_dim, "bbox1_height", nntrainer::Initializer::NONE, false, + nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim is_xy_min_max_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 4); wt_idx[YoloV2LossParams::is_xy_min_max] = context.requestTensor( - is_xy_min_max_dim, "is_xy_min_max", nntrainer::Tensor::Initializer::NONE, - false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); + is_xy_min_max_dim, "is_xy_min_max", nntrainer::Initializer::NONE, false, + nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim intersection_width_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); - wt_idx[YoloV2LossParams::intersection_width] = - context.requestTensor(intersection_width_dim, "intersection_width", - nntrainer::Tensor::Initializer::NONE, false, - nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); + wt_idx[YoloV2LossParams::intersection_width] = context.requestTensor( + intersection_width_dim, "intersection_width", nntrainer::Initializer::NONE, + false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim intersection_height_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV2LossParams::intersection_height] = context.requestTensor(intersection_height_dim, "intersection_height", - nntrainer::Tensor::Initializer::NONE, false, + nntrainer::Initializer::NONE, false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim unions_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV2LossParams::unions] = context.requestTensor( - unions_dim, "unions", nntrainer::Tensor::Initializer::NONE, false, + unions_dim, "unions", nntrainer::Initializer::NONE, false, 
nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); } diff --git a/Applications/YOLOv3/jni/yolo_v3_loss.cpp b/Applications/YOLOv3/jni/yolo_v3_loss.cpp index 0187e21f87..dc4300a0a7 100644 --- a/Applications/YOLOv3/jni/yolo_v3_loss.cpp +++ b/Applications/YOLOv3/jni/yolo_v3_loss.cpp @@ -335,141 +335,136 @@ void YoloV3LossLayer::finalize(nntrainer::InitLayerContext &context) { nntrainer::TensorDim bbox_x_pred_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV3LossParams::bbox_x_pred] = context.requestTensor( - bbox_x_pred_dim, "bbox_x_pred", nntrainer::Tensor::Initializer::NONE, true, + bbox_x_pred_dim, "bbox_x_pred", nntrainer::Initializer::NONE, true, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_y_pred_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV3LossParams::bbox_y_pred] = context.requestTensor( - bbox_y_pred_dim, "bbox_y_pred", nntrainer::Tensor::Initializer::NONE, true, + bbox_y_pred_dim, "bbox_y_pred", nntrainer::Initializer::NONE, true, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_w_pred_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV3LossParams::bbox_w_pred] = context.requestTensor( - bbox_w_pred_dim, "bbox_w_pred", nntrainer::Tensor::Initializer::NONE, true, + bbox_w_pred_dim, "bbox_w_pred", nntrainer::Initializer::NONE, true, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_h_pred_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV3LossParams::bbox_h_pred] = context.requestTensor( - bbox_h_pred_dim, "bbox_h_pred", nntrainer::Tensor::Initializer::NONE, true, + bbox_h_pred_dim, "bbox_h_pred", nntrainer::Initializer::NONE, true, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim confidence_pred_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); - wt_idx[YoloV3LossParams::confidence_pred] = - context.requestTensor(confidence_pred_dim, "confidence_pred", - nntrainer::Tensor::Initializer::NONE, true, - nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); + wt_idx[YoloV3LossParams::confidence_pred] = context.requestTensor( + confidence_pred_dim, "confidence_pred", nntrainer::Initializer::NONE, true, + nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim class_pred_dim(batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, class_number); wt_idx[YoloV3LossParams::class_pred] = context.requestTensor( - class_pred_dim, "class_pred", nntrainer::Tensor::Initializer::NONE, true, + class_pred_dim, "class_pred", nntrainer::Initializer::NONE, true, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_w_pred_anchor_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); - wt_idx[YoloV3LossParams::bbox_w_pred_anchor] = - context.requestTensor(bbox_w_pred_anchor_dim, "bbox_w_pred_anchor", - nntrainer::Tensor::Initializer::NONE, false, - nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); + wt_idx[YoloV3LossParams::bbox_w_pred_anchor] = context.requestTensor( + bbox_w_pred_anchor_dim, "bbox_w_pred_anchor", nntrainer::Initializer::NONE, + false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_h_pred_anchor_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); - wt_idx[YoloV3LossParams::bbox_h_pred_anchor] = - context.requestTensor(bbox_h_pred_anchor_dim, "bbox_h_pred_anchor", - 
nntrainer::Tensor::Initializer::NONE, false, - nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); + wt_idx[YoloV3LossParams::bbox_h_pred_anchor] = context.requestTensor( + bbox_h_pred_anchor_dim, "bbox_h_pred_anchor", nntrainer::Initializer::NONE, + false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_x_gt_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV3LossParams::bbox_x_gt] = context.requestTensor( - bbox_x_gt_dim, "bbox_x_gt", nntrainer::Tensor::Initializer::NONE, false, + bbox_x_gt_dim, "bbox_x_gt", nntrainer::Initializer::NONE, false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_y_gt_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV3LossParams::bbox_y_gt] = context.requestTensor( - bbox_y_gt_dim, "bbox_y_gt", nntrainer::Tensor::Initializer::NONE, false, + bbox_y_gt_dim, "bbox_y_gt", nntrainer::Initializer::NONE, false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_w_gt_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV3LossParams::bbox_w_gt] = context.requestTensor( - bbox_w_gt_dim, "bbox_w_gt", nntrainer::Tensor::Initializer::NONE, false, + bbox_w_gt_dim, "bbox_w_gt", nntrainer::Initializer::NONE, false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_h_gt_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV3LossParams::bbox_h_gt] = context.requestTensor( - bbox_h_gt_dim, "bbox_h_gt", nntrainer::Tensor::Initializer::NONE, false, + bbox_h_gt_dim, "bbox_h_gt", nntrainer::Initializer::NONE, false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim confidence_gt_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV3LossParams::confidence_gt] = context.requestTensor( - confidence_gt_dim, "confidence_gt", nntrainer::Tensor::Initializer::NONE, - false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); + confidence_gt_dim, "confidence_gt", nntrainer::Initializer::NONE, false, + nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim class_gt_dim(batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, class_number); wt_idx[YoloV3LossParams::class_gt] = context.requestTensor( - class_gt_dim, "class_gt", nntrainer::Tensor::Initializer::NONE, false, + class_gt_dim, "class_gt", nntrainer::Initializer::NONE, false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox_class_mask_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); - wt_idx[YoloV3LossParams::bbox_class_mask] = - context.requestTensor(bbox_class_mask_dim, "bbox_class_mask", - nntrainer::Tensor::Initializer::NONE, false, - nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); + wt_idx[YoloV3LossParams::bbox_class_mask] = context.requestTensor( + bbox_class_mask_dim, "bbox_class_mask", nntrainer::Initializer::NONE, false, + nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim iou_mask_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV3LossParams::iou_mask] = context.requestTensor( - iou_mask_dim, "iou_mask", nntrainer::Tensor::Initializer::NONE, false, + iou_mask_dim, "iou_mask", nntrainer::Initializer::NONE, false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox1_width_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); 
wt_idx[YoloV3LossParams::bbox1_width] = context.requestTensor( - bbox1_width_dim, "bbox1_width", nntrainer::Tensor::Initializer::NONE, false, + bbox1_width_dim, "bbox1_width", nntrainer::Initializer::NONE, false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim bbox1_height_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV3LossParams::bbox1_height] = context.requestTensor( - bbox1_height_dim, "bbox1_height", nntrainer::Tensor::Initializer::NONE, - false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); + bbox1_height_dim, "bbox1_height", nntrainer::Initializer::NONE, false, + nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim is_xy_min_max_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 4); wt_idx[YoloV3LossParams::is_xy_min_max] = context.requestTensor( - is_xy_min_max_dim, "is_xy_min_max", nntrainer::Tensor::Initializer::NONE, - false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); + is_xy_min_max_dim, "is_xy_min_max", nntrainer::Initializer::NONE, false, + nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim intersection_width_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); - wt_idx[YoloV3LossParams::intersection_width] = - context.requestTensor(intersection_width_dim, "intersection_width", - nntrainer::Tensor::Initializer::NONE, false, - nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); + wt_idx[YoloV3LossParams::intersection_width] = context.requestTensor( + intersection_width_dim, "intersection_width", nntrainer::Initializer::NONE, + false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim intersection_height_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV3LossParams::intersection_height] = context.requestTensor(intersection_height_dim, "intersection_height", - nntrainer::Tensor::Initializer::NONE, false, + nntrainer::Initializer::NONE, false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); nntrainer::TensorDim unions_dim( batch_size, grid_height_number * grid_width_number, NUM_ANCHOR, 1); wt_idx[YoloV3LossParams::unions] = context.requestTensor( - unions_dim, "unions", nntrainer::Tensor::Initializer::NONE, false, + unions_dim, "unions", nntrainer::Initializer::NONE, false, nntrainer::TensorLifespan::FORWARD_DERIV_LIFESPAN); } diff --git a/api/ccapi/include/tensor_api.h b/api/ccapi/include/tensor_api.h index 087d3b1f25..b4fc20cf5e 100644 --- a/api/ccapi/include/tensor_api.h +++ b/api/ccapi/include/tensor_api.h @@ -48,9 +48,10 @@ class Tensor : public nntrainer::Var_Grad { * @param needg If the tensor needs gradient * @param name Name for this tensor */ - explicit Tensor(const TensorDim &dim, - const iTensor::Initializer init = iTensor::Initializer::ZEROS, - bool ng = false, std::string name = ""){}; + explicit Tensor( + const TensorDim &dim, + const nntrainer::Initializer init = nntrainer::Initializer::ZEROS, + bool ng = false, std::string name = ""){}; /** * @brief Swap for weight diff --git a/debian/nntrainer-dev.install b/debian/nntrainer-dev.install index 4fd55b3774..73cc5f924d 100644 --- a/debian/nntrainer-dev.install +++ b/debian/nntrainer-dev.install @@ -9,7 +9,6 @@ # tensor headers /usr/include/nntrainer/memory_data.h /usr/include/nntrainer/tensor.h -/usr/include/nntrainer/tensor_v2.h /usr/include/nntrainer/tensor_base.h /usr/include/nntrainer/float_tensor.h /usr/include/nntrainer/tensor_wrap_specs.h diff --git a/nntrainer/compiler/ini_interpreter.cpp 
b/nntrainer/compiler/ini_interpreter.cpp index 1d82ef693d..146e62ed1e 100644 --- a/nntrainer/compiler/ini_interpreter.cpp +++ b/nntrainer/compiler/ini_interpreter.cpp @@ -49,8 +49,7 @@ namespace nntrainer { IniGraphInterpreter::IniGraphInterpreter( const AppContext &app_context_, std::function pathResolver_) : - app_context(app_context_), - pathResolver(pathResolver_) {} + app_context(app_context_), pathResolver(pathResolver_) {} IniGraphInterpreter::~IniGraphInterpreter() {} @@ -235,8 +234,8 @@ referenced // /** TODO #361: this needs update in model file to be of dictionary format // */ // // if (preload) { -// // layer->weight_initializer = Tensor::Initializer::FILE_INITIALIZER; -// // layer->bias_initializer = Tensor::Initializer::FILE_INITIALIZER; +// // layer->weight_initializer = Initializer::FILE_INITIALIZER; +// // layer->bias_initializer = Initializer::FILE_INITIALIZER; // // layer->initializer_file = backbone.save_path; // // } // } diff --git a/nntrainer/graph/network_graph.cpp b/nntrainer/graph/network_graph.cpp index 2d4cfdc769..54afe6353d 100644 --- a/nntrainer/graph/network_graph.cpp +++ b/nntrainer/graph/network_graph.cpp @@ -1557,7 +1557,7 @@ void NetworkGraph::requestOptimizerVariable( std::vector dims = cb(dim); w->setOptimizerVariables(tensor_manager->requestWeightOptimizerVariables( dims, w->getName(), TensorLifespan::MAX_LIFESPAN, - w->isGradientClipByGlobalNorm(), Tensor::Initializer::ZEROS)); + w->isGradientClipByGlobalNorm(), Initializer::ZEROS)); } } } diff --git a/nntrainer/layers/acti_func.h b/nntrainer/layers/acti_func.h index 9e43219ee5..c6c3576414 100644 --- a/nntrainer/layers/acti_func.h +++ b/nntrainer/layers/acti_func.h @@ -16,6 +16,7 @@ #define __ACTI_FUNC_H__ #ifdef __cplusplus +#include #include namespace nntrainer { diff --git a/nntrainer/layers/attention_layer.cpp b/nntrainer/layers/attention_layer.cpp index 1309214bca..eab36a9af3 100644 --- a/nntrainer/layers/attention_layer.cpp +++ b/nntrainer/layers/attention_layer.cpp @@ -65,8 +65,8 @@ void AttentionLayer::finalize(InitLayerContext &context) { auto weights_dim = query_dim; weights_dim.width(value_dim.height()); wt_idx[AttentionParams::weights] = - context.requestTensor(weights_dim, "weights", Tensor::Initializer::NONE, - false, TensorLifespan::ITERATION_LIFESPAN); + context.requestTensor(weights_dim, "weights", Initializer::NONE, false, + TensorLifespan::ITERATION_LIFESPAN); context.setOutputDimensions({query_dim}); diff --git a/nntrainer/layers/bn_layer.cpp b/nntrainer/layers/bn_layer.cpp index 1723ac677f..d74f4395cf 100644 --- a/nntrainer/layers/bn_layer.cpp +++ b/nntrainer/layers/bn_layer.cpp @@ -117,11 +117,11 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) { * more in-place calculation) can save memory during memory optimization. */ wt_idx[BNParams::deviation] = - context.requestTensor(in_dim, "deviation", Tensor::Initializer::NONE, false, + context.requestTensor(in_dim, "deviation", Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); /** caches the inverse standard deviation */ wt_idx[BNParams::invstd] = - context.requestTensor(dim, "invstd", Tensor::Initializer::NONE, false, + context.requestTensor(dim, "invstd", Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); /** * Temporary tensor to store the full sized tensors in order to allow batch @@ -130,20 +130,19 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) { * as the output of this layer need not be stored all the time. 
*/ wt_idx[BNParams::t_full] = - context.requestTensor(in_dim, "tensor_full", Tensor::Initializer::NONE, - false, TensorLifespan::CALC_DERIV_LIFESPAN); + context.requestTensor(in_dim, "tensor_full", Initializer::NONE, false, + TensorLifespan::CALC_DERIV_LIFESPAN); /** * caches variance + epsilon as well. */ - wt_idx[BNParams::cvar] = - context.requestTensor(dim, "cvar", Tensor::Initializer::NONE, false, - TensorLifespan::ITERATION_LIFESPAN); + wt_idx[BNParams::cvar] = context.requestTensor( + dim, "cvar", Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); /** * Temporary tensor to store the reduced tensors along the axes_to_reduce. */ wt_idx[BNParams::t_reduced] = - context.requestTensor(dim, "tensor_reduced", Tensor::Initializer::NONE, - false, TensorLifespan::FORWARD_DERIV_LIFESPAN); + context.requestTensor(dim, "tensor_reduced", Initializer::NONE, false, + TensorLifespan::FORWARD_DERIV_LIFESPAN); } void BatchNormalizationLayer::setProperty( diff --git a/nntrainer/layers/centroid_knn.cpp b/nntrainer/layers/centroid_knn.cpp index 611dca1d97..1ccfa15c26 100644 --- a/nntrainer/layers/centroid_knn.cpp +++ b/nntrainer/layers/centroid_knn.cpp @@ -62,11 +62,11 @@ void CentroidKNN::finalize(nntrainer::InitLayerContext &context) { auto samples_seen = nntrainer::TensorDim({num_class}); weight_idx[KNNParams::map] = context.requestWeight( - map_dim, nntrainer::Tensor::Initializer::ZEROS, - nntrainer::WeightRegularizer::NONE, 1.0f, 0.0f, "map", false); + map_dim, nntrainer::Initializer::ZEROS, nntrainer::WeightRegularizer::NONE, + 1.0f, 0.0f, "map", false); weight_idx[KNNParams::num_samples] = context.requestWeight( - samples_seen, nntrainer::Tensor::Initializer::ZEROS, + samples_seen, nntrainer::Initializer::ZEROS, nntrainer::WeightRegularizer::NONE, 1.0f, 0.0f, "num_samples", false); } diff --git a/nntrainer/layers/cl_layers/fc_layer_cl.cpp b/nntrainer/layers/cl_layers/fc_layer_cl.cpp index 890450bebe..0e3cb178f0 100644 --- a/nntrainer/layers/cl_layers/fc_layer_cl.cpp +++ b/nntrainer/layers/cl_layers/fc_layer_cl.cpp @@ -124,7 +124,8 @@ void FullyConnectedLayerCl::forwarding(RunLayerContext &context, unsigned int axis = context.getWeightObject(weight_idx[FCParams::weight]).getOutputAxis(); - weight.dequantize(weight_, axis); + // Dequantize is currently disabled + // weight.dequantize(weight_, axis); dotCl(input_, weight_, hidden_, context); } else { diff --git a/nntrainer/layers/cl_layers/rmsnorm_layer_cl.h b/nntrainer/layers/cl_layers/rmsnorm_layer_cl.h index cd2fc9dea9..4b34729409 100644 --- a/nntrainer/layers/cl_layers/rmsnorm_layer_cl.h +++ b/nntrainer/layers/cl_layers/rmsnorm_layer_cl.h @@ -24,7 +24,7 @@ namespace nntrainer { -namespace props{ +namespace props { /** * @brief RMS_NORM_GAMMA_INIT_GPU Initialization Enumeration Information @@ -36,15 +36,14 @@ class RMS_NORM_GAMMA_INIT_GPU final /** * @brief Construct a RMS_NORM_GAMMA_INIT object */ - RMS_NORM_GAMMA_INIT_GPU(::nntrainer::Tensor::Initializer value = - ::nntrainer::Tensor::Initializer::ONES) { + RMS_NORM_GAMMA_INIT_GPU( + ::nntrainer::Initializer value = ::nntrainer::Initializer::ONES) { set(value); }; using prop_tag = enum_class_prop_tag; static constexpr const char *key = "gamma_initializer"; }; -}; - +}; // namespace props /** * @class RMSNormLayer @@ -111,9 +110,7 @@ class RMSNormLayerCl : public LayerImpl { /** * @copydoc Layer::getType() */ - const std::string getType() const override { - return RMSNormLayerCl::type; - }; + const std::string getType() const override { return RMSNormLayerCl::type; }; static 
opencl::Kernel kernel_rmsnorm; static opencl::Kernel kernel_rmsnorm_fp16; @@ -127,10 +124,8 @@ class RMSNormLayerCl : public LayerImpl { * @param[in] RunLayerContext reference */ - - void rmsnormProcess(Tensor const &input, Tensor &result, Tensor const &gamma, const float epsilon, - RunLayerContext &context); - + void rmsnormProcess(Tensor const &input, Tensor &result, Tensor const &gamma, + const float epsilon, RunLayerContext &context); /** * @brief Process data and dimensions for FP16 rms norm operation @@ -141,15 +136,13 @@ class RMSNormLayerCl : public LayerImpl { * @param[in] RunLayerContext reference */ - - void rmsnormProcess_fp16(Tensor const &input, Tensor &result, Tensor const &gamma, const float epsilon, - RunLayerContext &context); + void rmsnormProcess_fp16(Tensor const &input, Tensor &result, + Tensor const &gamma, const float epsilon, + RunLayerContext &context); /** * @copydoc Layer::supportBackwarding() */ - bool supportBackwarding() const override { - return false; - } + bool supportBackwarding() const override { return false; } /** * @copydoc Layer::setProperty(const std::vector &values) @@ -167,4 +160,3 @@ class RMSNormLayerCl : public LayerImpl { #endif /* __cplusplus */ #endif /* __RMSNORM_LAYER_CL__ */ - diff --git a/nntrainer/layers/common_properties.cpp b/nntrainer/layers/common_properties.cpp index 737d47609f..755f4407c6 100644 --- a/nntrainer/layers/common_properties.cpp +++ b/nntrainer/layers/common_properties.cpp @@ -306,21 +306,17 @@ RecurrentActivation::RecurrentActivation(ActivationTypeInfo::Enum value) { set(value); }; -WeightInitializer::WeightInitializer(Tensor::Initializer value) { set(value); } +WeightInitializer::WeightInitializer(Initializer value) { set(value); } -BiasInitializer::BiasInitializer(Tensor::Initializer value) { set(value); } +BiasInitializer::BiasInitializer(Initializer value) { set(value); } -BNPARAMS_MU_INIT::BNPARAMS_MU_INIT(Tensor::Initializer value) { set(value); } +BNPARAMS_MU_INIT::BNPARAMS_MU_INIT(Initializer value) { set(value); } -BNPARAMS_VAR_INIT::BNPARAMS_VAR_INIT(Tensor::Initializer value) { set(value); } +BNPARAMS_VAR_INIT::BNPARAMS_VAR_INIT(Initializer value) { set(value); } -BNPARAMS_GAMMA_INIT::BNPARAMS_GAMMA_INIT(Tensor::Initializer value) { - set(value); -} +BNPARAMS_GAMMA_INIT::BNPARAMS_GAMMA_INIT(Initializer value) { set(value); } -BNPARAMS_BETA_INIT::BNPARAMS_BETA_INIT(Tensor::Initializer value) { - set(value); -} +BNPARAMS_BETA_INIT::BNPARAMS_BETA_INIT(Initializer value) { set(value); } BasicRegularizer::BasicRegularizer(nntrainer::WeightRegularizer value) { set(value); diff --git a/nntrainer/layers/common_properties.h b/nntrainer/layers/common_properties.h index c5a514b637..2591ab454b 100644 --- a/nntrainer/layers/common_properties.h +++ b/nntrainer/layers/common_properties.h @@ -969,7 +969,7 @@ class RecurrentActivation final : public EnumProperty { * @brief Enumeration of tensor initialization type */ struct InitializerInfo { - using Enum = Tensor::Initializer; + using Enum = Initializer; static constexpr std::initializer_list EnumList = { Enum::ZEROS, Enum::ONES, Enum::LECUN_NORMAL, Enum::LECUN_UNIFORM, Enum::XAVIER_NORMAL, Enum::XAVIER_UNIFORM, @@ -990,8 +990,7 @@ class WeightInitializer final : public EnumProperty { /** * @brief Construct a WeightInitializer object */ - WeightInitializer( - Tensor::Initializer value = Tensor::Initializer::XAVIER_UNIFORM); + WeightInitializer(Initializer value = Initializer::XAVIER_UNIFORM); using prop_tag = enum_class_prop_tag; static constexpr const char *key = 
"weight_initializer"; }; @@ -1005,7 +1004,7 @@ class BiasInitializer final : public EnumProperty { /** * @brief Construct a BiasInitializer object */ - BiasInitializer(Tensor::Initializer value = Tensor::Initializer::ZEROS); + BiasInitializer(Initializer value = Initializer::ZEROS); using prop_tag = enum_class_prop_tag; static constexpr const char *key = "bias_initializer"; }; @@ -1019,7 +1018,7 @@ class BNPARAMS_MU_INIT final : public EnumProperty { /** * @brief Construct a BNPARAMS_MU_INIT object */ - BNPARAMS_MU_INIT(Tensor::Initializer value = Tensor::Initializer::ZEROS); + BNPARAMS_MU_INIT(Initializer value = Initializer::ZEROS); using prop_tag = enum_class_prop_tag; static constexpr const char *key = "moving_mean_initializer"; }; @@ -1033,7 +1032,7 @@ class BNPARAMS_VAR_INIT final : public EnumProperty { /** * @brief Construct a BNPARAMS_VAR_INIT object */ - BNPARAMS_VAR_INIT(Tensor::Initializer value = Tensor::Initializer::ONES); + BNPARAMS_VAR_INIT(Initializer value = Initializer::ONES); using prop_tag = enum_class_prop_tag; static constexpr const char *key = "moving_variance_initializer"; }; @@ -1047,7 +1046,7 @@ class BNPARAMS_GAMMA_INIT final : public EnumProperty { /** * @brief Construct a BNPARAMS_GAMMA_INIT object */ - BNPARAMS_GAMMA_INIT(Tensor::Initializer value = Tensor::Initializer::ONES); + BNPARAMS_GAMMA_INIT(Initializer value = Initializer::ONES); using prop_tag = enum_class_prop_tag; static constexpr const char *key = "gamma_initializer"; }; @@ -1061,7 +1060,7 @@ class BNPARAMS_BETA_INIT final : public EnumProperty { /** * @brief Construct a BNPARAMS_BETA_INIT object */ - BNPARAMS_BETA_INIT(Tensor::Initializer value = Tensor::Initializer::ZEROS); + BNPARAMS_BETA_INIT(Initializer value = Initializer::ZEROS); using prop_tag = enum_class_prop_tag; static constexpr const char *key = "beta_initializer"; }; diff --git a/nntrainer/layers/dropout.cpp b/nntrainer/layers/dropout.cpp index c00c31d10b..63307345b0 100644 --- a/nntrainer/layers/dropout.cpp +++ b/nntrainer/layers/dropout.cpp @@ -28,9 +28,8 @@ void DropOutLayer::finalize(InitLayerContext &context) { mask_idx.reserve(input_dims.size()); for (auto &t : input_dims) { - mask_idx.push_back( - context.requestTensor(t, "Mask", Tensor::Initializer::NONE, false, - TensorLifespan::ITERATION_LIFESPAN)); + mask_idx.push_back(context.requestTensor( + t, "Mask", Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN)); } } diff --git a/nntrainer/layers/fc_layer.cpp b/nntrainer/layers/fc_layer.cpp index 436a936439..67a41f50ed 100644 --- a/nntrainer/layers/fc_layer.cpp +++ b/nntrainer/layers/fc_layer.cpp @@ -136,20 +136,20 @@ void FullyConnectedLayer::finalize(InitLayerContext &context) { is_nchw ? 
0b0001 : 0b0100); lora_idx[LORAParams::loraA] = context.requestWeight( - loraA_dim, Tensor::Initializer::ZEROS, weight_regularizer, + loraA_dim, Initializer::ZEROS, weight_regularizer, weight_regularizer_constant, weight_decay, "loraA", true); lora_idx[LORAParams::loraB] = context.requestWeight( - loraB_dim, Tensor::Initializer::LECUN_NORMAL, weight_regularizer, + loraB_dim, Initializer::LECUN_NORMAL, weight_regularizer, weight_regularizer_constant, weight_decay, "loraB", true); - lora_idx[LORAParams::loraTmp] = context.requestTensor( - loraTmp_dim, "hidden_tmp_lora", Tensor::Initializer::NONE, true, - TensorLifespan::FORWARD_DERIV_LIFESPAN); + lora_idx[LORAParams::loraTmp] = + context.requestTensor(loraTmp_dim, "hidden_tmp_lora", Initializer::NONE, + true, TensorLifespan::FORWARD_DERIV_LIFESPAN); lora_idx[LORAParams::loraOut] = - context.requestTensor(bias_dim, "hidden_lora", Tensor::Initializer::NONE, - true, TensorLifespan::FORWARD_FUNC_LIFESPAN); + context.requestTensor(bias_dim, "hidden_lora", Initializer::NONE, true, + TensorLifespan::FORWARD_FUNC_LIFESPAN); } } @@ -181,7 +181,7 @@ void FullyConnectedLayer::forwarding(RunLayerContext &context, bool training) { unsigned int axis = context.getWeightObject(weight_idx[FCParams::weight]).getOutputAxis(); - weight.dequantize(weight_, axis); + // weight.dequantize(weight_, axis); input_.dot(weight_, hidden_, false, false); } else { input_.dot(weight, hidden_, false, false); diff --git a/nntrainer/layers/gru.cpp b/nntrainer/layers/gru.cpp index 1b90247b9a..f9ec829f72 100644 --- a/nntrainer/layers/gru.cpp +++ b/nntrainer/layers/gru.cpp @@ -64,9 +64,9 @@ GRULayer::GRULayer() : } void GRULayer::finalize(InitLayerContext &context) { - const Tensor::Initializer weight_initializer = + const Initializer weight_initializer = std::get(*layer_impl_props).get(); - const Tensor::Initializer bias_initializer = + const Initializer bias_initializer = std::get(*layer_impl_props).get(); const WeightRegularizer weight_regularizer = std::get(*layer_impl_props).get(); @@ -148,27 +148,27 @@ void GRULayer::finalize(InitLayerContext &context) { // hidden_state_dim = [ batch, 1, max_timestep, unit ] TensorDim hidden_state_dim(batch_size, 1, max_timestep, unit); - wt_idx[GRUParams::hidden_state] = context.requestTensor( - hidden_state_dim, "hidden_state", Tensor::Initializer::NONE, true, - TensorLifespan::ITERATION_LIFESPAN); + wt_idx[GRUParams::hidden_state] = + context.requestTensor(hidden_state_dim, "hidden_state", Initializer::NONE, + true, TensorLifespan::ITERATION_LIFESPAN); // zrg_dim = [ batch, 1, max_timestep, NUM_GATE * unit ] TensorDim zrg_dim(batch_size, 1, max_timestep, NUM_GATE * unit); wt_idx[GRUParams::zrg] = - context.requestTensor(zrg_dim, "zrg", Tensor::Initializer::NONE, true, + context.requestTensor(zrg_dim, "zrg", Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); // h_prev_dim = [ batch, 1, 1, unit ] TensorDim h_prev_dim = TensorDim({batch_size, 1, 1, unit}); wt_idx[GRUParams::h_prev] = - context.requestTensor(h_prev_dim, "h_prev", Tensor::Initializer::NONE, - false, TensorLifespan::FORWARD_FUNC_LIFESPAN); + context.requestTensor(h_prev_dim, "h_prev", Initializer::NONE, false, + TensorLifespan::FORWARD_FUNC_LIFESPAN); if (dropout_rate > epsilon) { TensorDim dropout_mask_dim(batch_size, 1, max_timestep, unit); - wt_idx[GRUParams::dropout_mask] = context.requestTensor( - output_dim, "dropout_mask", Tensor::Initializer::NONE, false, - TensorLifespan::ITERATION_LIFESPAN); + wt_idx[GRUParams::dropout_mask] = + 
context.requestTensor(output_dim, "dropout_mask", Initializer::NONE, + false, TensorLifespan::ITERATION_LIFESPAN); } acti_func.setActiFunc(hidden_state_activation_type); diff --git a/nntrainer/layers/grucell.cpp b/nntrainer/layers/grucell.cpp index 57b840e482..e260bd898a 100644 --- a/nntrainer/layers/grucell.cpp +++ b/nntrainer/layers/grucell.cpp @@ -276,9 +276,9 @@ GRUCellLayer::GRUCellLayer() : } void GRUCellLayer::finalize(InitLayerContext &context) { - const Tensor::Initializer weight_initializer = + const Initializer weight_initializer = std::get(*layer_impl_props).get(); - const Tensor::Initializer bias_initializer = + const Initializer bias_initializer = std::get(*layer_impl_props).get(); const WeightRegularizer weight_regularizer = std::get(*layer_impl_props).get(); @@ -368,15 +368,15 @@ void GRUCellLayer::finalize(InitLayerContext &context) { // zrg_dim = [ batch_size, 1, 1, NUM_GATE * unit ] TensorDim zrg_dim(batch_size, 1, 1, NUM_GATE * unit); wt_idx[GRUCellParams::zrg] = - context.requestTensor(zrg_dim, "zrg", Tensor::Initializer::NONE, true, + context.requestTensor(zrg_dim, "zrg", Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); if (dropout_rate > epsilon) { // dropout_mask_dim = [ batch_size, 1, 1, unit ] TensorDim dropout_mask_dim(batch_size, 1, 1, unit); - wt_idx[GRUCellParams::dropout_mask] = context.requestTensor( - dropout_mask_dim, "dropout_mask", Tensor::Initializer::NONE, false, - TensorLifespan::ITERATION_LIFESPAN); + wt_idx[GRUCellParams::dropout_mask] = + context.requestTensor(dropout_mask_dim, "dropout_mask", Initializer::NONE, + false, TensorLifespan::ITERATION_LIFESPAN); } acti_func.setActiFunc(hidden_state_activation_type); diff --git a/nntrainer/layers/layer_context.cpp b/nntrainer/layers/layer_context.cpp index d71221c352..53951d4f69 100644 --- a/nntrainer/layers/layer_context.cpp +++ b/nntrainer/layers/layer_context.cpp @@ -244,7 +244,7 @@ const Tensor &RunLayerContext::getOutput(unsigned int idx) const { */ const Tensor RunLayerContext::getOutputGrad(unsigned int idx) const { if (!outputs[idx]->hasGradient()) { - return Tensor(outputs[idx]->getDim(), true, Tensor::Initializer::ZEROS); + return Tensor(outputs[idx]->getDim(), true, Initializer::ZEROS); } return const_cast(this)->getOutputGradUnsafe(idx); } diff --git a/nntrainer/layers/layer_context.h b/nntrainer/layers/layer_context.h index b8b8ffccd8..993e98fd01 100644 --- a/nntrainer/layers/layer_context.h +++ b/nntrainer/layers/layer_context.h @@ -184,8 +184,7 @@ class InitLayerContext { * @todo Consider providing a guarantee that the returned indices will always * start from 0 and will always be incremental. 
*/ - unsigned int requestWeight(const TensorDim &dim, - const Tensor::Initializer init, + unsigned int requestWeight(const TensorDim &dim, const Initializer init, const WeightRegularizer reg, const float reg_const, const float decay, const std::string &name, bool trainable = true, unsigned int out_axis = 3) { @@ -231,7 +230,7 @@ class InitLayerContext { */ unsigned int requestTensor(const TensorDim &dim, const std::string &name, - const Tensor::Initializer init = Tensor::Initializer::NONE, + const Initializer init = Initializer::NONE, bool trainable = false, TensorLifespan lifespan = TensorLifespan::ITERATION_LIFESPAN, bool private_ = true) { @@ -441,7 +440,7 @@ class RunLayerContext { } unsigned int o_ax = getWeightObject(idx).getOutputAxis(); - t_w.dequantize(w, o_ax); + // t_w.dequantize(w, o_ax); return; } diff --git a/nntrainer/layers/layer_normalization_layer.cpp b/nntrainer/layers/layer_normalization_layer.cpp index 466ca93bb7..a115e82b62 100644 --- a/nntrainer/layers/layer_normalization_layer.cpp +++ b/nntrainer/layers/layer_normalization_layer.cpp @@ -98,25 +98,25 @@ void LayerNormalizationLayer::finalize(InitLayerContext &context) { /** caches the deviation -> input - avg(input) */ wt_idx[LNParams::deviation] = - context.requestTensor(input_dim, "deviation", Tensor::Initializer::NONE, - false, TensorLifespan::ITERATION_LIFESPAN); + context.requestTensor(input_dim, "deviation", Initializer::NONE, false, + TensorLifespan::ITERATION_LIFESPAN); /** caches variance + epsilon as well */ wt_idx[LNParams::variance] = - context.requestTensor(remain_dim, "variance", Tensor::Initializer::NONE, - false, TensorLifespan::ITERATION_LIFESPAN); + context.requestTensor(remain_dim, "variance", Initializer::NONE, false, + TensorLifespan::ITERATION_LIFESPAN); /** caches the inverse standard deviation */ wt_idx[LNParams::inv_std_dev] = - context.requestTensor(remain_dim, "inv_std_dev", Tensor::Initializer::NONE, - false, TensorLifespan::ITERATION_LIFESPAN); + context.requestTensor(remain_dim, "inv_std_dev", Initializer::NONE, false, + TensorLifespan::ITERATION_LIFESPAN); /** temporary tensor (origin size) */ - wt_idx[LNParams::temp_origin_size] = context.requestTensor( - input_dim, "temp_origin_size", Tensor::Initializer::NONE, false, - TensorLifespan::CALC_DERIV_LIFESPAN); + wt_idx[LNParams::temp_origin_size] = + context.requestTensor(input_dim, "temp_origin_size", Initializer::NONE, + false, TensorLifespan::CALC_DERIV_LIFESPAN); /** temporary tensor (normalized size) */ - wt_idx[LNParams::temp_normalized_size] = context.requestTensor( - remain_dim, "temp_normalized_size", Tensor::Initializer::NONE, false, - TensorLifespan::CALC_DERIV_LIFESPAN); + wt_idx[LNParams::temp_normalized_size] = + context.requestTensor(remain_dim, "temp_normalized_size", Initializer::NONE, + false, TensorLifespan::CALC_DERIV_LIFESPAN); } void LayerNormalizationLayer::setProperty( diff --git a/nntrainer/layers/lstm.cpp b/nntrainer/layers/lstm.cpp index d5f13a1fc5..a46b3d4c32 100644 --- a/nntrainer/layers/lstm.cpp +++ b/nntrainer/layers/lstm.cpp @@ -409,9 +409,9 @@ LSTMLayer::LSTMLayer() : } void LSTMLayer::finalize(InitLayerContext &context) { - const Tensor::Initializer weight_initializer = + const Initializer weight_initializer = std::get(*layer_impl_props).get(); - const Tensor::Initializer bias_initializer = + const Initializer bias_initializer = std::get(*layer_impl_props).get(); const nntrainer::WeightRegularizer weight_regularizer = std::get(*layer_impl_props).get(); @@ -511,21 +511,21 @@ void 
LSTMLayer::finalize(InitLayerContext &context) { // hidden_state_dim : [ batch_size, 1, max_timestep, unit ] const TensorDim hidden_state_dim(batch_size, 1, max_timestep, unit, weight_tensor_type); - wt_idx[LSTMParams::hidden_state] = context.requestTensor( - hidden_state_dim, "hidden_state", Tensor::Initializer::NONE, true, - TensorLifespan::ITERATION_LIFESPAN); + wt_idx[LSTMParams::hidden_state] = + context.requestTensor(hidden_state_dim, "hidden_state", Initializer::NONE, + true, TensorLifespan::ITERATION_LIFESPAN); // cell_state_dim : [ batch_size, 1, max_timestep, unit ] const TensorDim cell_state_dim(batch_size, 1, max_timestep, unit, weight_tensor_type); - wt_idx[LSTMParams::cell_state] = context.requestTensor( - cell_state_dim, "cell_state", Tensor::Initializer::NONE, true, - TensorLifespan::ITERATION_LIFESPAN); + wt_idx[LSTMParams::cell_state] = + context.requestTensor(cell_state_dim, "cell_state", Initializer::NONE, true, + TensorLifespan::ITERATION_LIFESPAN); // ifgo_dim : [ batch_size, 1, max_timestep, NUM_GATE * unit ] const TensorDim ifgo_dim(batch_size, 1, max_timestep, NUM_GATE * unit, weight_tensor_type); wt_idx[LSTMParams::ifgo] = - context.requestTensor(ifgo_dim, "ifgo", Tensor::Initializer::NONE, true, + context.requestTensor(ifgo_dim, "ifgo", Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); if (bidirectional) { @@ -579,30 +579,30 @@ void LSTMLayer::finalize(InitLayerContext &context) { const TensorDim reverse_hidden_state_dim(batch_size, 1, max_timestep, unit, weight_tensor_type); wt_idx[LSTMParams::reverse_hidden_state] = context.requestTensor( - reverse_hidden_state_dim, "reverse_hidden_state", - Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); + reverse_hidden_state_dim, "reverse_hidden_state", Initializer::NONE, true, + TensorLifespan::ITERATION_LIFESPAN); // reverse_cell_state_dim : [ batch_size, 1, max_timestep, unit ] const TensorDim reverse_cell_state_dim(batch_size, 1, max_timestep, unit, weight_tensor_type); wt_idx[LSTMParams::reverse_cell_state] = context.requestTensor( - reverse_cell_state_dim, "reverse_cell_state", Tensor::Initializer::NONE, - true, TensorLifespan::ITERATION_LIFESPAN); + reverse_cell_state_dim, "reverse_cell_state", Initializer::NONE, true, + TensorLifespan::ITERATION_LIFESPAN); // reverse_ifgo_dim : [ batch_size, 1, max_timestep, NUM_GATE * unit ] const TensorDim reverse_ifgo_dim(batch_size, 1, max_timestep, NUM_GATE * unit, weight_tensor_type); - wt_idx[LSTMParams::reverse_ifgo] = context.requestTensor( - reverse_ifgo_dim, "reverse_ifgo", Tensor::Initializer::NONE, true, - TensorLifespan::ITERATION_LIFESPAN); + wt_idx[LSTMParams::reverse_ifgo] = + context.requestTensor(reverse_ifgo_dim, "reverse_ifgo", Initializer::NONE, + true, TensorLifespan::ITERATION_LIFESPAN); } if (dropout_rate > epsilon) { // dropout_mask_dim = [ batch, 1, time_iteration, unit ] const TensorDim dropout_mask_dim(batch_size, 1, max_timestep, unit, weight_tensor_type); - wt_idx[LSTMParams::dropout_mask] = context.requestTensor( - dropout_mask_dim, "dropout_mask", Tensor::Initializer::NONE, false, - TensorLifespan::ITERATION_LIFESPAN); + wt_idx[LSTMParams::dropout_mask] = + context.requestTensor(dropout_mask_dim, "dropout_mask", Initializer::NONE, + false, TensorLifespan::ITERATION_LIFESPAN); } if (context.getActivationDataType() == TensorDim::DataType::FP32) { diff --git a/nntrainer/layers/lstmcell.cpp b/nntrainer/layers/lstmcell.cpp index 4a578e7d8a..a9cad5d260 100644 --- a/nntrainer/layers/lstmcell.cpp +++ 
b/nntrainer/layers/lstmcell.cpp @@ -34,9 +34,9 @@ LSTMCellLayer::LSTMCellLayer() : lstmcell_props(props::DropOutRate()) { } void LSTMCellLayer::finalize(InitLayerContext &context) { - const Tensor::Initializer weight_initializer = + const Initializer weight_initializer = std::get(*layer_impl_props).get(); - const Tensor::Initializer bias_initializer = + const Initializer bias_initializer = std::get(*layer_impl_props).get(); const WeightRegularizer weight_regularizer = std::get(*layer_impl_props).get(); @@ -151,16 +151,16 @@ void LSTMCellLayer::finalize(InitLayerContext &context) { const TensorDim ifgo_dim(batch_size, 1, 1, NUM_GATE * unit, weight_tensor_type); wt_idx[LSTMCellParams::ifgo] = - context.requestTensor(ifgo_dim, "ifgo", Tensor::Initializer::NONE, true, + context.requestTensor(ifgo_dim, "ifgo", Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); if (dropout_rate > epsilon) { // dropout_mask_dim = [ batch_size, 1, 1, unit ] const TensorDim dropout_mask_dim(batch_size, 1, 1, unit, weight_tensor_type); - wt_idx[LSTMCellParams::dropout_mask] = context.requestTensor( - dropout_mask_dim, "dropout_mask", Tensor::Initializer::NONE, false, - TensorLifespan::ITERATION_LIFESPAN); + wt_idx[LSTMCellParams::dropout_mask] = + context.requestTensor(dropout_mask_dim, "dropout_mask", Initializer::NONE, + false, TensorLifespan::ITERATION_LIFESPAN); } if (context.getActivationDataType() == TensorDim::DataType::FP32) { diff --git a/nntrainer/layers/mol_attention_layer.cpp b/nntrainer/layers/mol_attention_layer.cpp index efacd24849..3d3fb77865 100644 --- a/nntrainer/layers/mol_attention_layer.cpp +++ b/nntrainer/layers/mol_attention_layer.cpp @@ -111,44 +111,44 @@ void MoLAttentionLayer::finalize(InitLayerContext &context) { TensorDim fc_out_dim = query_dim; fc_out_dim.width(fc_w_dim.width()); wt_idx[MoLAttentionParams::fc_out] = - context.requestTensor(fc_out_dim, "fc_out", Tensor::Initializer::NONE, - false, TensorLifespan::FORWARD_FUNC_LIFESPAN); + context.requestTensor(fc_out_dim, "fc_out", Initializer::NONE, false, + TensorLifespan::FORWARD_FUNC_LIFESPAN); wt_idx[MoLAttentionParams::fc_tanh] = - context.requestTensor(fc_out_dim, "fc_tanh", Tensor::Initializer::NONE, - false, TensorLifespan::ITERATION_LIFESPAN); + context.requestTensor(fc_out_dim, "fc_tanh", Initializer::NONE, false, + TensorLifespan::ITERATION_LIFESPAN); TensorDim fc_proj_out_dim = fc_out_dim; fc_proj_out_dim.width(fc_proj_w_dim.width()); - wt_idx[MoLAttentionParams::fc_proj_out] = context.requestTensor( - fc_proj_out_dim, "fc_proj_out", Tensor::Initializer::NONE, false, - TensorLifespan::ITERATION_LIFESPAN); + wt_idx[MoLAttentionParams::fc_proj_out] = + context.requestTensor(fc_proj_out_dim, "fc_proj_out", Initializer::NONE, + false, TensorLifespan::ITERATION_LIFESPAN); TensorDim scores_dim = TensorDim({value_dim.batch(), 1, 1, value_dim.height()}); wt_idx[MoLAttentionParams::scores] = - context.requestTensor(scores_dim, "scores", Tensor::Initializer::NONE, - false, TensorLifespan::ITERATION_LIFESPAN); + context.requestTensor(scores_dim, "scores", Initializer::NONE, false, + TensorLifespan::ITERATION_LIFESPAN); TensorDim prob_dim = value_dim; prob_dim.width(mol_k); wt_idx[MoLAttentionParams::prob] = - context.requestTensor(prob_dim, "prob", Tensor::Initializer::NONE, false, + context.requestTensor(prob_dim, "prob", Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); wt_idx[MoLAttentionParams::prob_left] = - context.requestTensor(prob_dim, "prob_left", Tensor::Initializer::NONE, - false, 
TensorLifespan::ITERATION_LIFESPAN); + context.requestTensor(prob_dim, "prob_left", Initializer::NONE, false, + TensorLifespan::ITERATION_LIFESPAN); wt_idx[MoLAttentionParams::prob_right] = - context.requestTensor(prob_dim, "prob_right", Tensor::Initializer::NONE, - false, TensorLifespan::ITERATION_LIFESPAN); + context.requestTensor(prob_dim, "prob_right", Initializer::NONE, false, + TensorLifespan::ITERATION_LIFESPAN); wt_idx[MoLAttentionParams::u_neg_div] = - context.requestTensor(prob_dim, "u_neg_div", Tensor::Initializer::NONE, - false, TensorLifespan::ITERATION_LIFESPAN); + context.requestTensor(prob_dim, "u_neg_div", Initializer::NONE, false, + TensorLifespan::ITERATION_LIFESPAN); wt_idx[MoLAttentionParams::u_pos_div] = - context.requestTensor(prob_dim, "u_pos_div", Tensor::Initializer::NONE, - false, TensorLifespan::ITERATION_LIFESPAN); + context.requestTensor(prob_dim, "u_pos_div", Initializer::NONE, false, + TensorLifespan::ITERATION_LIFESPAN); wt_idx[MoLAttentionParams::dstate] = - context.requestTensor(state_dim, "dstate", Tensor::Initializer::NONE, false, + context.requestTensor(state_dim, "dstate", Initializer::NONE, false, TensorLifespan::BACKWARD_FUNC_LIFESPAN); if (context.getNumRequestedOutputs() == 2) diff --git a/nntrainer/layers/multi_head_attention_layer.cpp b/nntrainer/layers/multi_head_attention_layer.cpp index 0d4b73b67f..bdb38fcf92 100644 --- a/nntrainer/layers/multi_head_attention_layer.cpp +++ b/nntrainer/layers/multi_head_attention_layer.cpp @@ -261,52 +261,52 @@ void MultiHeadAttentionLayer::finalize(InitLayerContext &context) { {batch_size, 1, query_height, num_heads * projected_query_dim_prop}, activation_type); weight_idx[AttentionParams::projected_query] = context.requestTensor( - projected_query_dim, "projected_query", Tensor::Initializer::NONE, true, + projected_query_dim, "projected_query", Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); /** tensor for output of key fc */ TensorDim projected_key_dim( {batch_size, 1, key_height, num_heads * projected_key_dim_prop}, activation_type); - weight_idx[AttentionParams::projected_key] = context.requestTensor( - projected_key_dim, "projected_key", Tensor::Initializer::NONE, true, - TensorLifespan::ITERATION_LIFESPAN); + weight_idx[AttentionParams::projected_key] = + context.requestTensor(projected_key_dim, "projected_key", Initializer::NONE, + true, TensorLifespan::ITERATION_LIFESPAN); /** tensor for output of value fc */ TensorDim projected_value_dim( {batch_size, 1, value_height, num_heads * projected_value_dim_prop}, activation_type); weight_idx[AttentionParams::projected_value] = context.requestTensor( - projected_value_dim, "projected_value", Tensor::Initializer::NONE, true, + projected_value_dim, "projected_value", Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); - weight_idx[AttentionParams::cache_key] = context.requestTensor( - projected_key_dim, "cache_key", Tensor::Initializer::NONE, true, - TensorLifespan::MAX_LIFESPAN); + weight_idx[AttentionParams::cache_key] = + context.requestTensor(projected_key_dim, "cache_key", Initializer::NONE, + true, TensorLifespan::MAX_LIFESPAN); - weight_idx[AttentionParams::cache_value] = context.requestTensor( - projected_value_dim, "cache_value", Tensor::Initializer::NONE, true, - TensorLifespan::MAX_LIFESPAN); + weight_idx[AttentionParams::cache_value] = + context.requestTensor(projected_value_dim, "cache_value", Initializer::NONE, + true, TensorLifespan::MAX_LIFESPAN); if (provide_attention_mask) { /** Intended comment for bool type mask */ 
// TensorDim attention_mask_dim( // {batch_size, num_heads, query_height, key_height}); // weight_idx[AttentionParams::attention_mask] = context.requestTensor( - // attention_mask_dim, "attention_mask", Tensor::Initializer::NONE, false, + // attention_mask_dim, "attention_mask", Initializer::NONE, false, // TensorLifespan::FORWARD_FUNC_LIFESPAN); } /** tensor for attention weight */ TensorDim attention_weight_dim( {batch_size, num_heads, query_height, key_height}, activation_type); weight_idx[AttentionParams::attention_weight] = context.requestTensor( - attention_weight_dim, "attention_weight", Tensor::Initializer::NONE, true, + attention_weight_dim, "attention_weight", Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); if (dropout_rate > epsilon) { /** tensor for dropout mask */ TensorDim dropout_mask_dim( {batch_size, num_heads, query_height, key_height}, activation_type); - weight_idx[AttentionParams::dropout_mask] = context.requestTensor( - dropout_mask_dim, "dropout_mask", Tensor::Initializer::NONE, false, - TensorLifespan::ITERATION_LIFESPAN); + weight_idx[AttentionParams::dropout_mask] = + context.requestTensor(dropout_mask_dim, "dropout_mask", Initializer::NONE, + false, TensorLifespan::ITERATION_LIFESPAN); } /** tensor for attention output */ @@ -314,7 +314,7 @@ void MultiHeadAttentionLayer::finalize(InitLayerContext &context) { {batch_size, 1, query_height, num_heads * projected_value_dim_prop}, activation_type); weight_idx[AttentionParams::attention_output] = context.requestTensor( - attention_output_dim, "attention_output", Tensor::Initializer::NONE, true, + attention_output_dim, "attention_output", Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); TensorDim output_dim({batch_size, 1, query_height, output_shape}, @@ -570,9 +570,7 @@ void MultiHeadAttentionLayer::incremental_forwarding(RunLayerContext &context, Tensor &key = context.getInput(INOUT_INDEX::KEY); Tensor &value = context.getInput(INOUT_INDEX::VALUE); - Tensor empty_tensor; - - empty_tensor.setTensorType(value.getTensorType()); + Tensor empty_tensor("empty", value.getFormat(), value.getDataType()); Tensor &mask = provide_attention_mask ? 
context.getInput(INOUT_INDEX::MASK) : empty_tensor; diff --git a/nntrainer/layers/pooling2d_layer.cpp b/nntrainer/layers/pooling2d_layer.cpp index a68e42e8d0..52f5ee5066 100644 --- a/nntrainer/layers/pooling2d_layer.cpp +++ b/nntrainer/layers/pooling2d_layer.cpp @@ -112,13 +112,13 @@ void Pooling2DLayer::finalize(InitLayerContext &context) { */ if (pooling_type == props::PoolingTypeInfo::Enum::global_max) { pool_helper_idx = - context.requestTensor(in_dim, "helper_idx", Tensor::Initializer::NONE, - false, TensorLifespan::ITERATION_LIFESPAN); + context.requestTensor(in_dim, "helper_idx", Initializer::NONE, false, + TensorLifespan::ITERATION_LIFESPAN); pool_helper_size.resize(in_dim.batch() * in_dim.channel()); } else { pool_helper_idx = - context.requestTensor(out_dim, "helper_idx", Tensor::Initializer::NONE, - false, TensorLifespan::ITERATION_LIFESPAN); + context.requestTensor(out_dim, "helper_idx", Initializer::NONE, false, + TensorLifespan::ITERATION_LIFESPAN); } } diff --git a/nntrainer/layers/positional_encoding_layer.cpp b/nntrainer/layers/positional_encoding_layer.cpp index 6295bbad76..5f98b41e8e 100644 --- a/nntrainer/layers/positional_encoding_layer.cpp +++ b/nntrainer/layers/positional_encoding_layer.cpp @@ -47,7 +47,7 @@ void PositionalEncodingLayer::finalize(InitLayerContext &context) { {context.getFormat(), context.getWeightDataType()}); weight_idx[PositionalEncodingParams::positional_encoding] = context.requestTensor(pe_dim, "positional_encoding", - nntrainer::Tensor::Initializer::NONE, false, + nntrainer::Initializer::NONE, false, nntrainer::TensorLifespan::MAX_LIFESPAN); } diff --git a/nntrainer/layers/rnn.cpp b/nntrainer/layers/rnn.cpp index e5fb70a6ed..5e50a8484a 100644 --- a/nntrainer/layers/rnn.cpp +++ b/nntrainer/layers/rnn.cpp @@ -53,9 +53,9 @@ void RNNLayer::finalize(InitLayerContext &context) { std::get(*layer_impl_props); const float weight_regularizer_constant = std::get(*layer_impl_props); - const Tensor::Initializer weight_initializer = + const Initializer weight_initializer = std::get(*layer_impl_props); - const Tensor::Initializer bias_initializer = + const Initializer bias_initializer = std::get(*layer_impl_props); auto &weight_decay = std::get(*layer_impl_props); auto &bias_decay = std::get(*layer_impl_props); @@ -128,18 +128,18 @@ void RNNLayer::finalize(InitLayerContext &context) { // hidden_state_dim : [ batch_size, 1, max_timestep, unit ] const TensorDim hidden_state_dim(batch_size, 1, max_timestep, unit); - wt_idx[RNNParams::hidden_state] = context.requestTensor( - hidden_state_dim, "hidden_state", Tensor::Initializer::NONE, true, - TensorLifespan::ITERATION_LIFESPAN); + wt_idx[RNNParams::hidden_state] = + context.requestTensor(hidden_state_dim, "hidden_state", Initializer::NONE, + true, TensorLifespan::ITERATION_LIFESPAN); if (dropout_rate > epsilon) { // dropout_mask_dim = [ batch, 1, (return_sequences ? time_iteration : 1), // unit ] const TensorDim dropout_mask_dim(batch_size, 1, return_sequences ? 
max_timestep : 1, unit); - wt_idx[RNNParams::dropout_mask] = context.requestTensor( - dropout_mask_dim, "dropout_mask", Tensor::Initializer::NONE, false, - TensorLifespan::ITERATION_LIFESPAN); + wt_idx[RNNParams::dropout_mask] = + context.requestTensor(dropout_mask_dim, "dropout_mask", Initializer::NONE, + false, TensorLifespan::ITERATION_LIFESPAN); } acti_func.setActiFunc(hidden_state_activation_type); diff --git a/nntrainer/layers/rnncell.cpp b/nntrainer/layers/rnncell.cpp index 9a2191f97d..eefbfa1b6f 100644 --- a/nntrainer/layers/rnncell.cpp +++ b/nntrainer/layers/rnncell.cpp @@ -54,9 +54,9 @@ void RNNCellLayer::finalize(InitLayerContext &context) { std::get(*layer_impl_props); const float weight_regularizer_constant = std::get(*layer_impl_props); - const Tensor::Initializer weight_initializer = + const Initializer weight_initializer = std::get(*layer_impl_props); - const Tensor::Initializer bias_initializer = + const Initializer bias_initializer = std::get(*layer_impl_props); auto &weight_decay = std::get(*layer_impl_props); auto &bias_decay = std::get(*layer_impl_props); @@ -138,9 +138,9 @@ void RNNCellLayer::finalize(InitLayerContext &context) { if (dropout_rate > epsilon) { // dropout_mask_dim = [ batch, 1, 1, unit ] const TensorDim dropout_mask_dim(batch_size, 1, 1, unit); - wt_idx[RNNCellParams::dropout_mask] = context.requestTensor( - dropout_mask_dim, "dropout_mask", Tensor::Initializer::NONE, false, - TensorLifespan::ITERATION_LIFESPAN); + wt_idx[RNNCellParams::dropout_mask] = + context.requestTensor(dropout_mask_dim, "dropout_mask", Initializer::NONE, + false, TensorLifespan::ITERATION_LIFESPAN); } acti_func.setActiFunc(hidden_state_activation_type); diff --git a/nntrainer/layers/time_dist.cpp b/nntrainer/layers/time_dist.cpp index 80451416df..fe2a2173b1 100644 --- a/nntrainer/layers/time_dist.cpp +++ b/nntrainer/layers/time_dist.cpp @@ -205,9 +205,8 @@ void TimeDistLayer::forwarding(RunLayerContext &context, bool training) { // TODO: This transposed Input Tensor could be resued for backwarding Tensor in = transposeTensor(input_); - Tensor out = - Tensor({ho_dim[2], 1, ho_dim[0], ho_dim[3]}, true, - Tensor::Initializer::NONE, context.getName() + ":inter_output"); + Tensor out = Tensor({ho_dim[2], 1, ho_dim[0], ho_dim[3]}, true, + Initializer::NONE, context.getName() + ":inter_output"); TensorDim i_dim = in_dim; i_dim.channel(1); @@ -223,8 +222,8 @@ void TimeDistLayer::forwarding(RunLayerContext &context, bool training) { h_g = transposeTensor(hidden_g); } - Var_Grad in_var(i_dim, Tensor::Initializer::NONE, false, false, "input"); - Var_Grad out_var(h_dim, Tensor::Initializer::NONE, + Var_Grad in_var(i_dim, Initializer::NONE, false, false, "input"); + Var_Grad out_var(h_dim, Initializer::NONE, dist_layer->requireLabel() && context.isLabelAvailable(SINGLE_INOUT_IDX), false, "output"); @@ -280,8 +279,8 @@ void TimeDistLayer::calcDerivative(RunLayerContext &context) { TensorDim r_dim = {ret_dim[2], 1, 1, ret_dim[3]}; TensorDim d_dim = {der_dim[2], 1, 1, der_dim[3]}; - Var_Grad in_var(r_dim, Tensor::Initializer::NONE, true, false, "input"); - Var_Grad out_var(d_dim, Tensor::Initializer::NONE, true, false, "output"); + Var_Grad in_var(r_dim, Initializer::NONE, true, false, "input"); + Var_Grad out_var(d_dim, Initializer::NONE, true, false, "output"); fillWeightsFromContext(context); fillTensorsFromContext(context); @@ -346,8 +345,8 @@ void TimeDistLayer::calcGradient(RunLayerContext &context) { Tensor d_iter = derivative_.getSharedDataTensor( d_dim, i * d_dim.batch() * 
d_dim.width(), true, derivative_.getName()); - Var_Grad in_var(i_dim, Tensor::Initializer::NONE, true, false, "input"); - Var_Grad out_var(d_dim, Tensor::Initializer::NONE, true, false, "output"); + Var_Grad in_var(i_dim, Initializer::NONE, true, false, "input"); + Var_Grad out_var(d_dim, Initializer::NONE, true, false, "output"); in_var.initializeVariable(in_iter); out_var.initializeGradient(d_iter); @@ -388,8 +387,8 @@ void TimeDistLayer::setBatch(RunLayerContext &context, unsigned int batch) { TensorDim i_dim = {in_dim[2], 1, 1, in_dim[3]}; TensorDim o_dim = {out_dim[2], 1, 1, out_dim[3]}; - Var_Grad in_var(i_dim, Tensor::Initializer::NONE, true, false, "input"); - Var_Grad out_var(o_dim, Tensor::Initializer::NONE, true, false, "output"); + Var_Grad in_var(i_dim, Initializer::NONE, true, false, "input"); + Var_Grad out_var(o_dim, Initializer::NONE, true, false, "output"); fillWeightsFromContext(context); fillTensorsFromContext(context); diff --git a/nntrainer/layers/zoneout_lstmcell.cpp b/nntrainer/layers/zoneout_lstmcell.cpp index 419a02e17f..20976f8b3c 100644 --- a/nntrainer/layers/zoneout_lstmcell.cpp +++ b/nntrainer/layers/zoneout_lstmcell.cpp @@ -58,9 +58,9 @@ bool ZoneoutLSTMCellLayer::CellStateZoneOutRate::isValid( } void ZoneoutLSTMCellLayer::finalize(InitLayerContext &context) { - const Tensor::Initializer weight_initializer = + const Initializer weight_initializer = std::get(*layer_impl_props).get(); - const Tensor::Initializer bias_initializer = + const Initializer bias_initializer = std::get(*layer_impl_props).get(); const WeightRegularizer weight_regularizer = std::get(*layer_impl_props).get(); @@ -187,7 +187,7 @@ void ZoneoutLSTMCellLayer::finalize(InitLayerContext &context) { * ] */ const TensorDim ifgo_dim(batch_size, 1, 1, NUM_GATE * unit); wt_idx[ZoneoutLSTMParams::ifgo] = - context.requestTensor(ifgo_dim, "ifgo", Tensor::Initializer::NONE, true, + context.requestTensor(ifgo_dim, "ifgo", Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); // hidden_state_zoneout_mask_dim = [ max_timestep @@ -196,15 +196,14 @@ void ZoneoutLSTMCellLayer::finalize(InitLayerContext &context) { unit); if (test) { wt_idx[ZoneoutLSTMParams::hidden_state_zoneout_mask] = - context.requestWeight(hidden_state_zoneout_mask_dim, - Tensor::Initializer::NONE, WeightRegularizer::NONE, - 1.0f, 0.0f, "hidden_state_zoneout_mask", false); + context.requestWeight(hidden_state_zoneout_mask_dim, Initializer::NONE, + WeightRegularizer::NONE, 1.0f, 0.0f, + "hidden_state_zoneout_mask", false); } else { wt_idx[ZoneoutLSTMParams::hidden_state_zoneout_mask] = context.requestTensor(hidden_state_zoneout_mask_dim, - "hidden_state_zoneout_mask", - Tensor::Initializer::NONE, false, - TensorLifespan::ITERATION_LIFESPAN, false); + "hidden_state_zoneout_mask", Initializer::NONE, + false, TensorLifespan::ITERATION_LIFESPAN, false); } // cell_state_zoneout_mask_dim = [ max_timestep * batch_size, 1, 1, unit ] @@ -212,19 +211,18 @@ void ZoneoutLSTMCellLayer::finalize(InitLayerContext &context) { unit); if (test) { wt_idx[ZoneoutLSTMParams::cell_state_zoneout_mask] = context.requestWeight( - cell_state_zoneout_mask_dim, Tensor::Initializer::NONE, - WeightRegularizer::NONE, 1.0f, 0.0f, "cell_state_zoneout_mask", false); + cell_state_zoneout_mask_dim, Initializer::NONE, WeightRegularizer::NONE, + 1.0f, 0.0f, "cell_state_zoneout_mask", false); } else { wt_idx[ZoneoutLSTMParams::cell_state_zoneout_mask] = context.requestTensor( - cell_state_zoneout_mask_dim, "cell_state_zoneout_mask", - Tensor::Initializer::NONE, false, 
TensorLifespan::ITERATION_LIFESPAN, - false); + cell_state_zoneout_mask_dim, "cell_state_zoneout_mask", Initializer::NONE, + false, TensorLifespan::ITERATION_LIFESPAN, false); } // lstm_cell_state_dim = [ batch_size, 1, 1, unit ] const TensorDim lstm_cell_state_dim(batch_size, 1, 1, unit); wt_idx[ZoneoutLSTMParams::lstm_cell_state] = context.requestTensor( - lstm_cell_state_dim, "lstm_cell_state", Tensor::Initializer::NONE, true, + lstm_cell_state_dim, "lstm_cell_state", Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); acti_func.setActiFunc(hidden_state_activation_type); @@ -441,7 +439,9 @@ void ZoneoutLSTMCellLayer::calcGradient(RunLayerContext &context) { Tensor hidden_state_zoneout_mask = hs_zoneout_mask.getBatchSlice(timestep, 1); hidden_state_zoneout_mask.reshape({batch_size, 1, 1, unit}); Tensor prev_hidden_state_zoneout_mask = hidden_state_zoneout_mask.apply( - (std::function) [epsilon = epsilon](float x) { return x < epsilon; }); + (std::function)[epsilon = epsilon](float x) { + return x < epsilon; + }); d_hidden_state.multiply(prev_hidden_state_zoneout_mask, d_prev_hidden_state_residual); @@ -456,7 +456,9 @@ void ZoneoutLSTMCellLayer::calcGradient(RunLayerContext &context) { Tensor cell_state_zoneout_mask = cs_zoneout_mask.getBatchSlice(timestep, 1); cell_state_zoneout_mask.reshape({batch_size, 1, 1, unit}); Tensor prev_cell_state_zoneout_mask = cell_state_zoneout_mask.apply( - (std::function) [epsilon = epsilon](float x) { return x < epsilon; }); + (std::function)[epsilon = epsilon](float x) { + return x < epsilon; + }); d_cell_state.multiply(prev_cell_state_zoneout_mask, d_prev_cell_state_residual); diff --git a/nntrainer/tensor/float_tensor.cpp b/nntrainer/tensor/float_tensor.cpp index 915d1b9466..c0781de954 100644 --- a/nntrainer/tensor/float_tensor.cpp +++ b/nntrainer/tensor/float_tensor.cpp @@ -14,6 +14,7 @@ #include #include +#include #include namespace nntrainer { @@ -36,57 +37,6 @@ FloatTensor::FloatTensor(const TensorDim &d, const void *buf) : } } -FloatTensor::FloatTensor( - std::vector>>> const &d, - Tformat fm) { - if (d.empty() || d[0].empty() || d[0][0].empty() || d[0][0][0].empty()) { - throw std::out_of_range( - "[Tensor] trying to initialize FloatTensor from empty vector"); - } - - dim.setTensorDim(0, d.size()); - if (fm == Tformat::NCHW) { - dim.setTensorDim(1, d[0].size()); - dim.setTensorDim(2, d[0][0].size()); - dim.setTensorDim(3, d[0][0][0].size()); - } else { - dim.setTensorDim(2, d[0].size()); - dim.setTensorDim(3, d[0][0].size()); - dim.setTensorDim(1, d[0][0][0].size()); - } - - dim.setTensorType({fm, Tdatatype::FP32}); - - strides = dim.computeStrides(); - contiguous = true; - initializer = Initializer::NONE; - - MemoryData *mem_data = - new MemoryData((void *)(new float[dim.getDataLen()]())); - data = std::shared_ptr(mem_data, [](MemoryData *mem_data) { - delete[] mem_data->getAddr(); - }); - - offset = 0; - - // if fm == Tformat::NCHW, then dim[0] == batch , dim[1] == channel, dim[2] - // == height, dim[3] == width. 
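
The zoneout_lstmcell.cpp hunks above only re-wrap the lambda that Tensor::apply uses to derive the previous-state mask; the computation is still an elementwise x < epsilon test. A small stand-alone sketch of that masking step over a plain buffer, with names that are illustrative only and not nntrainer API:

#include <algorithm>
#include <iostream>
#include <vector>

// Elementwise x < epsilon, mirroring the lambda the patch re-formats:
// entries the zoneout mask left near zero map to 1.0 in the complementary mask.
std::vector<float> prev_state_mask(const std::vector<float> &zoneout_mask,
                                   float epsilon = 1e-6f) {
  std::vector<float> out(zoneout_mask.size());
  std::transform(zoneout_mask.begin(), zoneout_mask.end(), out.begin(),
                 [epsilon](float x) { return x < epsilon ? 1.0f : 0.0f; });
  return out;
}

int main() {
  const std::vector<float> mask = {0.0f, 1.0f, 0.0f, 1.0f};
  for (float v : prev_state_mask(mask))
    std::cout << v << ' ';          // prints: 1 0 1 0
  std::cout << '\n';
  return 0;
}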
and if fm == Tformat::NHWC, dim[0] == batch, - // dim[1] == height, dim[2] == width, dim[3] == channel - if (fm == Tformat::NCHW) { - for (unsigned int i = 0; i < batch(); ++i) - for (unsigned int j = 0; j < channel(); ++j) - for (unsigned int k = 0; k < height(); ++k) - for (unsigned int l = 0; l < width(); ++l) - this->setValue(i, j, k, l, d[i][j][k][l]); - } else { - for (unsigned int i = 0; i < batch(); ++i) - for (unsigned int j = 0; j < height(); ++j) - for (unsigned int k = 0; k < width(); ++k) - for (unsigned int l = 0; l < channel(); ++l) - this->setValue(i, l, j, k, d[i][j][k][l]); - } -} - bool FloatTensor::operator==(const FloatTensor &rhs) const { const float *_data = (float *)getData(); const float *_rdata = (float *)rhs.getData(); @@ -282,9 +232,9 @@ void FloatTensor::initialize(Initializer init) { initialize(); } -TensorV2 &FloatTensor::apply(std::function f, - TensorV2 &output) const { - CREATE_V2_IF_EMPTY_DIMS(output, dim, nullptr); +Tensor &FloatTensor::apply(std::function f, + Tensor &output) const { + CREATE_IF_EMPTY_DIMS(output, dim, nullptr); if (contiguous && output.getContiguous()) { const float *data = (float *)getData(); @@ -317,9 +267,9 @@ TensorV2 &FloatTensor::apply(std::function f, return output; } -TensorV2 FloatTensor::multiply_strided(TensorV2 const &m, TensorV2 &output, - const float beta) const { - CREATE_V2_IF_EMPTY_DIMS(output, dim, nullptr); +Tensor FloatTensor::multiply_strided(Tensor const &m, Tensor &output, + const float beta) const { + CREATE_IF_EMPTY_DIMS(output, dim, nullptr); if (size() != m.size() || size() != output.size()) throw std::invalid_argument( @@ -386,28 +336,18 @@ int FloatTensor::multiply_i(float const &value) { return ML_ERROR_NONE; } -TensorV2 &FloatTensor::multiply(float const &value, TensorV2 &out) const { +Tensor &FloatTensor::multiply(float const &value, Tensor &out) const { auto f = std::bind(std::multiplies(), std::placeholders::_1, value); apply(f, out); return out; } -TensorV2 &FloatTensor::multiply(TensorV2 const &m, TensorV2 &output, - const float beta) const { - auto f = [&](const BroadcastInfoV2 &e, const float *buf, const float *m_buf, +Tensor &FloatTensor::multiply(Tensor const &m, Tensor &output, + const float beta) const { + auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf, float *out_buf) { - if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1 && - std::fpclassify(beta) == FP_ZERO) { - std::transform(buf, buf + e.buffer_size, m_buf, out_buf, - std::multiplies()); - } else { - for (unsigned int i = 0; i < e.buffer_size; ++i) { - *out_buf = *buf * *m_buf + beta * *out_buf; - buf += strides[3]; - m_buf += e.strides[3]; - out_buf += output.getStrides()[3]; - } - } + ele_mul(e.buffer_size, buf, m_buf, out_buf, 1, beta, e.strides[3], + strides[3]); }; NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument) @@ -427,34 +367,24 @@ TensorV2 &FloatTensor::multiply(TensorV2 const &m, TensorV2 &output, return output; } -TensorV2 &FloatTensor::divide(float const &value, TensorV2 &output) const { +Tensor &FloatTensor::divide(float const &value, Tensor &output) const { auto f = std::bind(std::divides(), std::placeholders::_1, value); apply(f, output); return output; } -TensorV2 &FloatTensor::divide(TensorV2 const &m, TensorV2 &output) const { - auto f = [&](const BroadcastInfoV2 &e, const float *buf, const float *m_buf, +Tensor &FloatTensor::divide(Tensor const &m, Tensor &output) const { + auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf, 
float *out_buf) { - if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1) { - std::transform(buf, buf + e.buffer_size, m_buf, out_buf, - std::divides()); - } else { - for (unsigned int i = 0; i < e.buffer_size; ++i) { - *out_buf = *buf / *m_buf; - buf += strides[3]; - m_buf += e.strides[3]; - out_buf += output.getStrides()[3]; - } - } + ele_div(e.buffer_size, buf, m_buf, out_buf, 1, 0, e.strides[3], strides[3]); }; apply_broadcast(m, f, output); return output; } -TensorV2 &FloatTensor::add_strided(TensorV2 const &input, TensorV2 &output, - const float beta) const { +Tensor &FloatTensor::add_strided(Tensor const &input, Tensor &output, + const float beta) const { NNTR_THROW_IF(getData() == nullptr, std::invalid_argument) << getName() << " is not allocated"; NNTR_THROW_IF(input.getData() == nullptr, std::invalid_argument) @@ -507,54 +437,68 @@ TensorV2 &FloatTensor::add_strided(TensorV2 const &input, TensorV2 &output, return output; } -TensorV2 &FloatTensor::add(float const &value, TensorV2 &output) const { +int FloatTensor::add_i(Tensor const &m, Tensor &output, float const alpha) { + auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf, + float *out_buf) { + saxpy(e.buffer_size, alpha, m_buf, e.strides[3], out_buf, strides[3]); + }; + + try { + apply_broadcast(m, f, output); + } catch (std::exception &err) { + ml_loge("%s %s", typeid(err).name(), err.what()); + return ML_ERROR_INVALID_PARAMETER; + } + return ML_ERROR_NONE; +} + +int FloatTensor::add_i_partial(unsigned int len, unsigned int addr_idx, + Tensor &m, unsigned int incX, unsigned int incY, + const Tensor alphas, unsigned int alpha_idx) { + saxpy(len, alphas.getValue(alpha_idx), m.getData(), incX, + (float *)getAddress(addr_idx), incY); + + return ML_ERROR_NONE; +} + +Tensor &FloatTensor::add(float const &value, Tensor &output) const { auto f = std::bind(std::plus(), std::placeholders::_1, value); apply(f, output); return output; } -TensorV2 &FloatTensor::add(TensorV2 const &m, TensorV2 &output, - float const alpha) const { - auto f = [&](const BroadcastInfoV2 &e, const float *buf, const float *m_buf, +Tensor &FloatTensor::add(Tensor const &m, Tensor &output, + float const alpha) const { + auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf, float *out_buf) { - if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 && - std::fpclassify(alpha) == FP_ZERO) { - std::transform(buf, buf + e.buffer_size, m_buf, out_buf, - std::plus()); - } else { - for (unsigned int i = 0; i < e.buffer_size; ++i) { - *out_buf = *buf + *m_buf * alpha; - buf += strides[3]; - m_buf += e.strides[3]; - out_buf += strides[3]; - } - } + ele_add(e.buffer_size, buf, m_buf, out_buf, alpha, 0, e.strides[3], + strides[3]); }; apply_broadcast(m, f, output); return output; } -TensorV2 &FloatTensor::subtract(float const &value, TensorV2 &output) const { +Tensor &FloatTensor::subtract(float const &value, Tensor &output) const { auto f = std::bind(std::minus(), std::placeholders::_1, value); apply(f, output); return output; } -void FloatTensor::sum_by_batch(TensorV2 &output) const { +void FloatTensor::sum_by_batch(Tensor &output) const { size_t feat_len = dim.getFeatureLen(); size_t batch = dim.batch(); const float *data = (float *)getData(); float *out_data = output.getData(); - TensorV2 ones(1, 1, 1, feat_len, this->getFormat()); + Tensor ones(1, 1, 1, feat_len, this->getFormat()); ones.setValue(1.0); sgemv(CblasRowMajor, CblasNoTrans, batch, feat_len, 1, data, feat_len, ones.getData(), 1, 0.0, 
out_data, 1); } -TensorV2 &FloatTensor::sum(unsigned int axis, TensorV2 &output, float alpha, - float beta) const { +Tensor &FloatTensor::sum(unsigned int axis, Tensor &output, float alpha, + float beta) const { const float *data = (float *)getData(); NNTR_THROW_IF(!contiguous, std::invalid_argument) @@ -564,35 +508,35 @@ TensorV2 &FloatTensor::sum(unsigned int axis, TensorV2 &output, float alpha, throw std::out_of_range("Error: axis is invalid"); if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) { - CREATE_V2_IF_EMPTY_DIMS(output, dim); + CREATE_IF_EMPTY_DIMS(output, dim); scopy(size(), (float *)getData(), 1, output.getData(), 1); return output; } switch (axis) { case 0: { - CREATE_V2_IF_EMPTY_DIMS(output, 1, dim.channel(), dim.height(), dim.width(), - getTensorType()); + CREATE_IF_EMPTY_DIMS(output, 1, dim.channel(), dim.height(), dim.width(), + getTensorType()); size_t feat_len = dim.getFeatureLen(); size_t batch = dim.batch(); - TensorV2 ones(1, 1, 1, batch, getTensorType()); + Tensor ones(1, 1, 1, batch, getTensorType()); ones.setValue(alpha); sgemv(CblasRowMajor, CblasTrans, batch, feat_len, 1, data, feat_len, ones.getData(), 1, beta, output.getData(), 1); } break; case 1: { - CREATE_V2_IF_EMPTY_DIMS(output, dim[0], 1, dim[2], dim[3], getTensorType()); + CREATE_IF_EMPTY_DIMS(output, dim[0], 1, dim[2], dim[3], getTensorType()); if (this->getFormat() == Tformat::NHWC) { unsigned int feat_len = output.getDim().getDataLen(); unsigned int t_axis = dim[1]; - TensorV2 ones(1, 1, 1, t_axis, getTensorType()); + Tensor ones(1, 1, 1, t_axis, getTensorType()); ones.setValue(alpha); sgemv(CblasRowMajor, CblasNoTrans, feat_len, t_axis, 1, data, t_axis, ones.getData(), 1, beta, output.getData(), 1); } else { unsigned int feat_len = dim[2] * dim[3]; unsigned int t_axis = dim[1]; - TensorV2 ones(1, 1, 1, t_axis, getTensorType()); + Tensor ones(1, 1, 1, t_axis, getTensorType()); ones.setValue(alpha); float *rdata = output.getData(); for (unsigned int k = 0; k < dim[0]; ++k) { @@ -603,11 +547,11 @@ TensorV2 &FloatTensor::sum(unsigned int axis, TensorV2 &output, float alpha, } } break; case 2: { - CREATE_V2_IF_EMPTY_DIMS(output, dim[0], dim[1], 1, dim[3], getTensorType()); + CREATE_IF_EMPTY_DIMS(output, dim[0], dim[1], 1, dim[3], getTensorType()); if (this->getFormat() == Tformat::NHWC) { unsigned int feat_len = dim[1] * dim[3]; unsigned int t_axis = dim[2]; - TensorV2 ones(1, 1, 1, t_axis, getTensorType()); + Tensor ones(1, 1, 1, t_axis, getTensorType()); ones.setValue(alpha); float *rdata = output.getData(); for (unsigned int k = 0; k < dim[0]; ++k) { @@ -618,7 +562,7 @@ TensorV2 &FloatTensor::sum(unsigned int axis, TensorV2 &output, float alpha, } else { unsigned int t_3 = dim[3]; unsigned int t_axis = dim[2]; - TensorV2 ones(1, 1, 1, t_axis, getTensorType()); + Tensor ones(1, 1, 1, t_axis, getTensorType()); ones.setValue(alpha); if (dim.getStorageOrder() == TStorageOrder::ROW_MAJOR) { @@ -641,12 +585,12 @@ TensorV2 &FloatTensor::sum(unsigned int axis, TensorV2 &output, float alpha, } } break; case 3: { - CREATE_V2_IF_EMPTY_DIMS(output, dim[0], dim[1], dim[2], 1, - this->getTensorType()); + CREATE_IF_EMPTY_DIMS(output, dim[0], dim[1], dim[2], 1, + this->getTensorType()); if (this->getFormat() == Tformat::NHWC) { unsigned int t_3 = dim[1]; unsigned int t_axis = dim[3]; - TensorV2 ones(1, 1, 1, t_axis, getTensorType()); + Tensor ones(1, 1, 1, t_axis, getTensorType()); ones.setValue(alpha); float *rdata = output.getData(); for (unsigned int k = 0; k < dim[0]; ++k) { @@ -660,7 +604,7 @@ TensorV2 
&FloatTensor::sum(unsigned int axis, TensorV2 &output, float alpha, } else { unsigned int m = output.getDim().getDataLen(); unsigned int n = dim[3]; - TensorV2 ones(1, 1, 1, n, getTensorType()); + Tensor ones(1, 1, 1, n, getTensorType()); ones.setValue(alpha); if (dim.getStorageOrder() == TStorageOrder::ROW_MAJOR) { @@ -692,19 +636,19 @@ float FloatTensor::l2norm() const { return snrm2(size(), (float *)getData(), 1); } -TensorV2 &FloatTensor::pow(float exponent, TensorV2 &output) const { +Tensor &FloatTensor::pow(float exponent, Tensor &output) const { auto f = [exponent](float in) { return powf(in, exponent); }; apply(f, output); return output; } -TensorV2 &FloatTensor::erf(TensorV2 &output) const { +Tensor &FloatTensor::erf(Tensor &output) const { auto f = [](float in) { return std::erf(in); }; apply(f, output); return output; } -void FloatTensor::sin(TensorV2 &out, float alpha) { +void FloatTensor::sin(Tensor &out, float alpha) { if (!contiguous) { auto f = [alpha](float val) -> float { return std::sin(alpha * val); }; apply(f, out); @@ -713,7 +657,7 @@ void FloatTensor::sin(TensorV2 &out, float alpha) { } } -void FloatTensor::cos(TensorV2 &out, float alpha) { +void FloatTensor::cos(Tensor &out, float alpha) { if (!contiguous) { auto f = [alpha](float val) -> float { return std::cos(alpha * val); }; apply(f, out); @@ -722,8 +666,16 @@ void FloatTensor::cos(TensorV2 &out, float alpha) { } } -TensorV2 &FloatTensor::dot(TensorV2 const &input, TensorV2 &output, bool trans, - bool trans_in, float beta) const { +void FloatTensor::inv_sqrt(Tensor &out) { + if (!contiguous) { + apply([](float val) -> float { return 1 / std::sqrt(val); }, out); + } else { + inv_sqrt_inplace(out.size(), out.getData()); + } +} + +Tensor &FloatTensor::dot(Tensor const &input, Tensor &output, bool trans, + bool trans_in, float beta) const { // Comment out with intension to support the calculation wrt. batch and height // direction. 
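
The multiply/divide/add hunks above swap the hand-rolled broadcast loops for ele_mul, ele_div and ele_add calls that take the buffer size, an alpha/beta pair and the two innermost strides. The exact helper signature is inferred from these call sites, so treat the following as a sketch of the intended semantics (out = alpha * lhs * rhs + beta * out, walked element by element with per-operand strides), not the real blas_interface declaration:

#include <cstddef>
#include <iostream>
#include <vector>

// Sketch of an ele_mul-style kernel: strided, fused multiply with alpha
// scaling and beta accumulation, as the patched call sites appear to assume.
void ele_mul_sketch(std::size_t n, const float *lhs, const float *rhs,
                    float *out, float alpha = 1.0f, float beta = 0.0f,
                    std::size_t rhs_stride = 1, std::size_t lhs_stride = 1) {
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = alpha * lhs[i * lhs_stride] * rhs[i * rhs_stride] + beta * out[i];
  }
}

int main() {
  std::vector<float> a = {1, 2, 3, 4};
  std::vector<float> b = {10, 20, 30, 40};
  std::vector<float> out = {1, 1, 1, 1};
  // beta = 0.5 keeps half of the previous output, mirroring the beta that the
  // Tensor::multiply(m, output, beta) overload forwards to the helper.
  ele_mul_sketch(a.size(), a.data(), b.data(), out.data(), 1.0f, 0.5f);
  for (float v : out)
    std::cout << v << ' ';          // prints: 10.5 40.5 90.5 160.5
  std::cout << '\n';
  return 0;
}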
It supposes to have this->dim as [ BxCxH,W ] and input.dim is // [BxCxH,W] as well if (input.dim.rank() > 2) { @@ -782,12 +734,12 @@ TensorV2 &FloatTensor::dot(TensorV2 const &input, TensorV2 &output, bool trans, return output; } -void FloatTensor::copy(const TensorV2 &from) { +void FloatTensor::copy(const Tensor &from) { reshape(from.getDim()); copy(from.getData()); } -void FloatTensor::copyData(const TensorV2 &from) { +void FloatTensor::copyData(const Tensor &from) { NNTR_THROW_IF(!contiguous, std::invalid_argument) << getName() << " is not contiguous, cannot copy."; @@ -812,6 +764,18 @@ void FloatTensor::copyData(const TensorV2 &from) { } } +void FloatTensor::copy_with_stride(const Tensor &input, Tensor &output) { + for (unsigned int b = 0; b < output.batch(); ++b) { + for (unsigned int c = 0; c < output.channel(); ++c) { + for (unsigned int h = 0; h < output.height(); ++h) { + for (unsigned int w = 0; w < output.width(); ++w) { + output.setValue(b, c, h, w, input.getValue(b, c, h, w)); + } + } + } + } +} + std::vector FloatTensor::argmax() const { std::vector result; const float *data = (float *)getData(); @@ -844,8 +808,8 @@ float FloatTensor::minValue() const { return *std::min_element(data, data + size()); } -TensorV2 &FloatTensor::transpose(const std::string &direction, - TensorV2 &output) const { +Tensor &FloatTensor::transpose(const std::string &direction, + Tensor &output) const { unsigned int SL, SI, SJ, SK; output.reshape(dim.transpose(direction)); @@ -921,7 +885,7 @@ void FloatTensor::dropout_mask(float dropout) { } } -void FloatTensor::filter_mask(const TensorV2 &mask_len, bool reverse) { +void FloatTensor::filter_mask(const Tensor &mask_len, bool reverse) { float fill_mask_val = 0.0; float en_mask_val = 1.0 - fill_mask_val; @@ -942,7 +906,7 @@ void FloatTensor::filter_mask(const TensorV2 &mask_len, bool reverse) { } } -void FloatTensor::zoneout_mask(TensorV2 &opposite, float zoneout) { +void FloatTensor::zoneout_mask(Tensor &opposite, float zoneout) { opposite.setRandBernoulli(zoneout); float *data = (float *)getData(); @@ -957,7 +921,7 @@ void FloatTensor::zoneout_mask(TensorV2 &opposite, float zoneout) { } } -std::vector FloatTensor::split(std::vector sizes, int axis) { +std::vector FloatTensor::split(std::vector sizes, int axis) { size_t num_size = sizes.size(); if (axis == -1) { @@ -977,7 +941,7 @@ std::vector FloatTensor::split(std::vector sizes, int axis) { } bool is_format_nchw = (dim.getFormat() == Tformat::NCHW) ? 
true : false; - std::vector ret; + std::vector ret; auto iter_value = [this, is_format_nchw]( std::array &loc, @@ -1059,17 +1023,16 @@ std::vector FloatTensor::split(std::vector sizes, int axis) { return ret; } -TensorV2 FloatTensor::cat(const std::vector &tensors, int axis) { +Tensor FloatTensor::concat(const std::vector &tensors, int axis) { if (axis == -1) { axis = 3; } - TensorV2 ret; auto ref_dim = tensors.front().getDim(); bool is_format_nchw = (ref_dim.getFormat() == Tformat::NCHW); ref_dim.setTensorDim(axis, 1); NNTR_THROW_IF(!std::all_of(tensors.begin(), tensors.end(), - [&ref_dim, axis](const TensorV2 &t) { + [&ref_dim, axis](const Tensor &t) { auto cur_dim = t.getDim(); cur_dim.setTensorDim(axis, 1); return ref_dim == cur_dim; @@ -1079,12 +1042,12 @@ TensorV2 FloatTensor::cat(const std::vector &tensors, int axis) { << ref_dim << " axis : " << axis; auto axis_dim = std::accumulate(tensors.begin(), tensors.end(), 0u, - [axis](unsigned cur, const TensorV2 &t) { + [axis](unsigned cur, const Tensor &t) { return cur += t.getDim().getTensorDim(axis); }); auto iter_value = [is_format_nchw](std::array &loc, - const std::array &start_loc, TensorV2 &t, + const std::array &start_loc, Tensor &t, const std::array &ref_dim_arr) -> float & { auto &value = is_format_nchw ? t.getValue(loc[0], loc[1], loc[2], loc[3]) @@ -1104,7 +1067,7 @@ TensorV2 FloatTensor::cat(const std::vector &tensors, int axis) { auto ret_dim = ref_dim; ret_dim.setTensorDim(axis, axis_dim); - ret = TensorV2(ret_dim); + Tensor ret = Tensor(ret_dim); std::array loc = {0, 0, 0, 0}; for (auto &t : tensors) { @@ -1143,7 +1106,6 @@ TensorV2 FloatTensor::cat(const std::vector &tensors, int axis) { } void FloatTensor::print(std::ostream &out) const { - printInstance(out, this); const float *data = (float *)getData(); unsigned int len = size(); out << "data addr: " << data << '\n'; @@ -1203,11 +1165,11 @@ void FloatTensor::copy(const void *buf) { } void FloatTensor::apply_broadcast_util( - TensorV2 const &m, - std::function v_func, - TensorV2 &output, const BroadcastInfoV2 &e, int cur_axis, size_t offset, + Tensor &output, const BroadcastInfo &e, int cur_axis, size_t offset, size_t m_offset) const { const float *buf = (float *)this->getData(); @@ -1235,12 +1197,12 @@ void FloatTensor::apply_broadcast_util( } void FloatTensor::apply_broadcast( - TensorV2 const &m, - std::function v_func, - TensorV2 &output) const { - CREATE_V2_IF_EMPTY_DIMS(output, dim); + Tensor &output) const { + CREATE_IF_EMPTY_DIMS(output, dim); NNTR_THROW_IF(getData() == nullptr, std::invalid_argument) << getName() << " is not allocated"; @@ -1253,7 +1215,7 @@ void FloatTensor::apply_broadcast( /// note that buffer_size, the last stride is only used in v_func but it /// might be changed if (dim == m.getDim()) { - BroadcastInfoV2 e; + BroadcastInfo e; e.buffer_size = size(); e.strides[3] = 1; e.tensor_type = getTensorType(); diff --git a/nntrainer/tensor/float_tensor.h b/nntrainer/tensor/float_tensor.h index 6eae7d0d9c..dd976d91d9 100644 --- a/nntrainer/tensor/float_tensor.h +++ b/nntrainer/tensor/float_tensor.h @@ -14,7 +14,6 @@ #ifdef __cplusplus #include -#include #ifdef DEBUG #define EXCEPT_WHEN_DEBUG @@ -62,7 +61,60 @@ class FloatTensor : public TensorBase { */ FloatTensor( std::vector>>> const &d, - Tformat fm); + Tformat fm) { + if (d.empty() || d[0].empty() || d[0][0].empty() || d[0][0][0].empty()) { + throw std::out_of_range( + "[Tensor] trying to initialize FloatTensor from empty vector"); + } + + dim.setTensorDim(0, d.size()); + if (fm == 
Tformat::NCHW) { + dim.setTensorDim(1, d[0].size()); + dim.setTensorDim(2, d[0][0].size()); + dim.setTensorDim(3, d[0][0][0].size()); + } else { + dim.setTensorDim(2, d[0].size()); + dim.setTensorDim(3, d[0][0].size()); + dim.setTensorDim(1, d[0][0][0].size()); + } + + dim.setTensorType({fm, Tdatatype::FP32}); + + strides = dim.computeStrides(); + contiguous = true; + initializer = Initializer::NONE; + + MemoryData *mem_data = + new MemoryData((void *)(new float[dim.getDataLen()]())); + data = std::shared_ptr(mem_data, [](MemoryData *mem_data) { + delete[] mem_data->getAddr(); + }); + + offset = 0; + + // if fm == Tformat::NCHW, then dim[0] == batch , dim[1] == channel, dim[2] + // == height, dim[3] == width. and if fm == Tformat::NHWC, dim[0] == batch, + // dim[1] == height, dim[2] == width, dim[3] == channel + if (fm == Tformat::NCHW) { + for (unsigned int i = 0; i < batch(); ++i) + for (unsigned int j = 0; j < channel(); ++j) + for (unsigned int k = 0; k < height(); ++k) + for (unsigned int l = 0; l < width(); ++l) + this->setValue(i, j, k, l, d[i][j][k][l]); + } else { + for (unsigned int i = 0; i < batch(); ++i) + for (unsigned int j = 0; j < height(); ++j) + for (unsigned int k = 0; k < width(); ++k) + for (unsigned int l = 0; l < channel(); ++l) + this->setValue(i, l, j, k, d[i][j][k][l]); + } + } + + /** + * @brief Construct a new FloatTensor object + * @param rhs TensorBase object to copy + */ + FloatTensor(TensorBase &rhs) : TensorBase(rhs) {} /** * @brief Basic Destructor @@ -84,22 +136,22 @@ class FloatTensor : public TensorBase { bool operator!=(const FloatTensor &rhs) const { return !(*this == rhs); } /** - * @copydoc TensorV2::allocate() + * @copydoc Tensor::allocate() */ void allocate() override; /** - * @copydoc TensorV2::deallocate() + * @copydoc Tensor::deallocate() */ void deallocate() override; /** - * @copydoc TensorV2::getData() + * @copydoc Tensor::getData() */ void *getData() const override; /** - * @copydoc TensorV2::getData(size_t idx) + * @copydoc Tensor::getData(size_t idx) */ void *getData(size_t idx) const override; @@ -148,24 +200,24 @@ class FloatTensor : public TensorBase { unsigned int w); /** - * @copydoc TensorV2::setValue(float value) + * @copydoc Tensor::setValue(float value) */ void setValue(float value) override; /** - * @copydoc TensorV2::setValue(b, c, h, w, value) + * @copydoc Tensor::setValue(b, c, h, w, value) */ void setValue(unsigned int b, unsigned int c, unsigned int h, unsigned int w, float value) override; /** - * @copydoc TensorV2::addValue(b, c, h, w, value, beta) + * @copydoc Tensor::addValue(b, c, h, w, value, beta) */ void addValue(unsigned int b, unsigned int c, unsigned int h, unsigned int w, float value, float beta) override; /** - * @copydoc TensorV2::setZero() + * @copydoc Tensor::setZero() */ void setZero() override; @@ -186,180 +238,203 @@ class FloatTensor : public TensorBase { }; /** - * @copydoc TensorV2::setRandNormal() + * @copydoc Tensor::setRandNormal() */ - void setRandNormal(float mean = 0.0f, float stddev = 0.05f); + void setRandNormal(float mean = 0.0f, float stddev = 0.05f) override; /** - * @copydoc TensorV2::setRandUniform() + * @copydoc Tensor::setRandUniform() */ - void setRandUniform(float min = -0.05f, float max = 0.05f); + void setRandUniform(float min = -0.05f, float max = 0.05f) override; /** - * @copydoc TensorV2::setRandBernoulli() + * @copydoc Tensor::setRandBernoulli() */ - void setRandBernoulli(float probability = 0.5f); + void setRandBernoulli(float probability = 0.5f) override; /** - * @copydoc 
TensorV2::initialize() + * @copydoc Tensor::initialize() */ void initialize() override; /** - * @copydoc TensorV2::initialize(Initializer init) + * @copydoc Tensor::initialize(Initializer init) */ void initialize(Initializer init) override; /** - * @copydoc TensorV2::apply(std::function f, TensorV2 &output) + * @copydoc Tensor::apply(std::function f, Tensor &output) */ - TensorV2 &apply(std::function f, - TensorV2 &output) const override; + Tensor &apply(std::function f, Tensor &output) const override; /** - * @copydoc TensorV2::multiply_strided(TensorV2 const &m, TensorV2 &output, + * @copydoc Tensor::multiply_strided(Tensor const &m, Tensor &output, * const float beta) */ - TensorV2 multiply_strided(TensorV2 const &m, TensorV2 &output, - const float beta) const override; + Tensor multiply_strided(Tensor const &m, Tensor &output, + const float beta) const override; /** - * @copydoc TensorV2::multiply_i(float const &value) + * @copydoc Tensor::multiply_i(float const &value) */ int multiply_i(float const &value) override; /** - * @copydoc TensorV2::multiply(float const &value, TensorV2 &out) + * @copydoc Tensor::multiply(float const &value, Tensor &out) */ - TensorV2 &multiply(float const &value, TensorV2 &out) const override; + Tensor &multiply(float const &value, Tensor &out) const override; /** - * @copydoc TensorV2::multiply(TensorV2 const &m, TensorV2 &output, const + * @copydoc Tensor::multiply(Tensor const &m, Tensor &output, const * float beta = 0.0) */ - TensorV2 &multiply(TensorV2 const &m, TensorV2 &output, - const float beta = 0.0) const override; + Tensor &multiply(Tensor const &m, Tensor &output, + const float beta = 0.0) const override; /** - * @copydoc TensorV2::divide(float const &value, TensorV2 &output) + * @copydoc Tensor::divide(float const &value, Tensor &output) */ - TensorV2 ÷(float const &value, TensorV2 &output) const override; + Tensor ÷(float const &value, Tensor &output) const override; /** - * @copydoc TensorV2::divide(TensorV2 const &m, TensorV2 &output) + * @copydoc Tensor::divide(Tensor const &m, Tensor &output) */ - TensorV2 ÷(TensorV2 const &m, TensorV2 &output) const override; + Tensor ÷(Tensor const &m, Tensor &output) const override; /** - * @copydoc TensorV2::add_strided(TensorV2 const &input, TensorV2 &output, + * @copydoc Tensor::add_strided(Tensor const &input, Tensor &output, * const float beta) */ - TensorV2 &add_strided(TensorV2 const &input, TensorV2 &output, - const float beta) const override; + Tensor &add_strided(Tensor const &input, Tensor &output, + const float beta) const override; + + /** + * @copydoc Tensor::add_i(Tensor const &m, float const alpha) + */ + int add_i(Tensor const &m, Tensor &output, float const alpha) override; /** - * @copydoc TensorV2::add(float const &value, TensorV2 &output) + * @copydoc Tensor::add_i_partial() */ - TensorV2 &add(float const &value, TensorV2 &output) const override; + int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m, + unsigned int incX, unsigned int incY, const Tensor alphas, + unsigned int alpha_idx) override; /** - * @copydoc TensorV2::add(TensorV2 const &m, TensorV2 &output, float const + * @copydoc Tensor::add(float const &value, Tensor &output) + */ + Tensor &add(float const &value, Tensor &output) const override; + + /** + * @copydoc Tensor::add(Tensor const &m, Tensor &output, float const * alpha) */ - TensorV2 &add(TensorV2 const &m, TensorV2 &output, - float const alpha) const override; + Tensor &add(Tensor const &m, Tensor &output, + float const alpha) const override; 
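
float_tensor.h above gains add_i and add_i_partial overrides whose .cpp bodies earlier in this patch reduce to a saxpy call. As a reminder of the contract those declarations promise, here is the textbook saxpy with explicit increments; the wrapper name is hypothetical and only the y := alpha * x + y semantics are taken from the patch:

#include <iostream>
#include <vector>

// Classic BLAS-style saxpy: y[i*incY] += alpha * x[i*incX].
// FloatTensor::add_i / add_i_partial in this patch delegate to a call with
// exactly this contract (via apply_broadcast for the broadcasting case).
void saxpy_sketch(unsigned int n, float alpha, const float *x,
                  unsigned int incX, float *y, unsigned int incY) {
  for (unsigned int i = 0; i < n; ++i)
    y[i * incY] += alpha * x[i * incX];
}

int main() {
  std::vector<float> x = {1, 2, 3, 4};
  std::vector<float> y = {10, 10, 10, 10};
  saxpy_sketch(4, 0.5f, x.data(), 1, y.data(), 1);
  for (float v : y)
    std::cout << v << ' ';          // prints: 10.5 11 11.5 12
  std::cout << '\n';
  return 0;
}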
/** - * @copydoc TensorV2::subtract(float const &value, TensorV2 &output) + * @copydoc Tensor::subtract(float const &value, Tensor &output) */ - TensorV2 &subtract(float const &value, TensorV2 &output) const override; + Tensor &subtract(float const &value, Tensor &output) const override; /** - * @copydoc TensorBase::sum_by_batch(TensorV2 &output) + * @copydoc TensorBase::sum_by_batch(Tensor &output) */ - void sum_by_batch(TensorV2 &output) const override; + void sum_by_batch(Tensor &output) const override; /** - * @copydoc TensorV2::sum(unsigned int axis, TensorV2 &output, float alpha, + * @copydoc Tensor::sum(unsigned int axis, Tensor &output, float alpha, * float beta) const */ - TensorV2 &sum(unsigned int axis, TensorV2 &output, float alpha, - float beta) const override; + Tensor &sum(unsigned int axis, Tensor &output, float alpha, + float beta) const override; /** - * @copydoc TensorV2::l2norm + * @copydoc Tensor::l2norm */ float l2norm() const override; /** - * @copydoc TensorV2::pow(float exponent, TensorV2 &output) + * @copydoc Tensor::pow(float exponent, Tensor &output) + */ + Tensor &pow(float exponent, Tensor &output) const override; + + /** + * @copydoc Tensor::erf(Tensor &output) */ - TensorV2 &pow(float exponent, TensorV2 &output) const override; + Tensor &erf(Tensor &output) const override; /** - * @copydoc TensorV2::erf(TensorV2 &output) + * @copydoc Tensor::sin(Tensor &out, float alpha) */ - TensorV2 &erf(TensorV2 &output) const override; + void sin(Tensor &out, float alpha) override; /** - * @copydoc TensorV2::sin(TensorV2 &out, float alpha) + * @copydoc Tensor::cos(Tensor &out, float alpha) */ - void sin(TensorV2 &out, float alpha) override; + void cos(Tensor &out, float alpha) override; /** - * @copydoc TensorV2::cos(TensorV2 &out, float alpha) + * @copydoc TensorBase::inv_sqrt(Tensor &out) */ - void cos(TensorV2 &out, float alpha) override; + void inv_sqrt(Tensor &out) override; /** - * @copydoc TensorV2::dot(TensorV2 const &input, TensorV2 &output, bool + * @copydoc Tensor::dot(Tensor const &input, Tensor &output, bool * trans, bool trans_in, float beta) */ - TensorV2 &dot(TensorV2 const &input, TensorV2 &output, bool trans, - bool trans_in, float beta) const override; + Tensor &dot(Tensor const &input, Tensor &output, bool trans, bool trans_in, + float beta) const override; /** - * @copydoc TensorV2::dropout_mask(float dropout) + * @copydoc Tensor::dropout_mask(float dropout) */ void dropout_mask(float dropout) override; /** - * @copydoc TensorV2::filter_mask(const TensorV2 &mask_len, bool reverse) + * @copydoc Tensor::filter_mask(const Tensor &mask_len, bool reverse) */ - void filter_mask(const TensorV2 &mask_len, bool reverse) override; + void filter_mask(const Tensor &mask_len, bool reverse) override; /** - * @copydoc TensorV2::zoneout_mask(TensorV2 &opposite, float zoneout) + * @copydoc Tensor::zoneout_mask(Tensor &opposite, float zoneout) */ - void zoneout_mask(TensorV2 &opposite, float zoneout) override; + void zoneout_mask(Tensor &opposite, float zoneout) override; /** - * @copydoc TensorV2::split(std::vector sizes, int axis) + * @copydoc Tensor::split(std::vector sizes, int axis) */ - std::vector split(std::vector sizes, int axis) override; + std::vector split(std::vector sizes, int axis) override; /** - * @copydoc TensorV2::cat(const std::vector &tensors, int axis) + * @copydoc Tensor::cat(const std::vector &tensors, int axis) */ - static TensorV2 cat(const std::vector &tensors, int axis); + Tensor concat(const std::vector &tensors, int axis) override; /** - 
* @copydoc TensorV2::copy(const TensorV2 &from) + * @copydoc Tensor::copy(const Tensor &from) */ - void copy(const TensorV2 &from); + void copy(const Tensor &from) override; /** - * @copydoc TensorV2::copyData(const TensorV2 &from) + * @copydoc Tensor::copyData(const Tensor &from) */ - void copyData(const TensorV2 &from); + void copyData(const Tensor &from) override; /** - * @copydoc TensorV2::argmax() + * @brief Copy the Tensor + * @param[in] input Tensor to be copied + * @param[out] output output Tensor + */ + void copy_with_stride(const Tensor &input, Tensor &output) override; + + /** + * @copydoc Tensor::argmax() */ std::vector argmax() const override; /** - * @copydoc TensorV2::max_abs() + * @copydoc Tensor::max_abs() */ float max_abs() const override; /** @@ -373,13 +448,13 @@ class FloatTensor : public TensorBase { float minValue() const override; /** - * @copydoc TensorV2::transpose(const std::string &direction, TensorV2 &out) + * @copydoc Tensor::transpose(const std::string &direction, Tensor &out) */ - TensorV2 &transpose(const std::string &direction, - TensorV2 &output) const override; + Tensor &transpose(const std::string &direction, + Tensor &output) const override; /** - * @copydoc TensorV2::print(std::ostream &out) + * @copydoc Tensor::print(std::ostream &out) */ void print(std::ostream &out) const override; @@ -403,13 +478,14 @@ class FloatTensor : public TensorBase { * @retval #ML_ERROR_NONE Successful * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter */ - void apply_broadcast_util( - TensorV2 const &m, - std::function - v_func, - TensorV2 &output, const BroadcastInfoV2 &e, int cur_axis = -1, - size_t offset = 0, size_t m_offset = 0) const; + void + apply_broadcast_util(Tensor const &m, + std::function + v_func, + Tensor &output, const BroadcastInfo &e, + int cur_axis = -1, size_t offset = 0, + size_t m_offset = 0) const; /** * @brief Applies the given operator to the tensor with the passed argument @@ -419,12 +495,11 @@ class FloatTensor : public TensorBase { * @retval #ML_ERROR_NONE Successful * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter */ - void - apply_broadcast(TensorV2 const &m, - std::function - v_func, - TensorV2 &output) const; + void apply_broadcast(Tensor const &m, + std::function + v_func, + Tensor &output) const; }; } // namespace nntrainer diff --git a/nntrainer/tensor/half_tensor.cpp b/nntrainer/tensor/half_tensor.cpp index cff0691895..f34955f38f 100644 --- a/nntrainer/tensor/half_tensor.cpp +++ b/nntrainer/tensor/half_tensor.cpp @@ -14,6 +14,7 @@ #include #include +#include #include namespace nntrainer { @@ -36,58 +37,6 @@ HalfTensor::HalfTensor(const TensorDim &d, const void *buf) : } } -HalfTensor::HalfTensor( - std::vector>>> const &d, - Tformat fm) { - - if (d.empty() || d[0].empty() || d[0][0].empty() || d[0][0][0].empty()) { - throw std::out_of_range( - "[Tensor] trying to initialize HalfTensor from empty vector"); - } - - dim.setTensorDim(0, d.size()); - if (fm == Tformat::NCHW) { - dim.setTensorDim(1, d[0].size()); - dim.setTensorDim(2, d[0][0].size()); - dim.setTensorDim(3, d[0][0][0].size()); - } else { - dim.setTensorDim(2, d[0].size()); - dim.setTensorDim(3, d[0][0].size()); - dim.setTensorDim(1, d[0][0][0].size()); - } - - dim.setTensorType({fm, Tdatatype::FP16}); - - strides = dim.computeStrides(); - contiguous = true; - initializer = Initializer::NONE; - - MemoryData *mem_data = - new MemoryData((void *)(new _FP16[dim.getDataLen()]())); - data = std::shared_ptr(mem_data, [](MemoryData *mem_data) { - delete[] 
mem_data->getAddr<_FP16>(); - }); - - offset = 0; - - // if fm == Tformat::NCHW, then dim[0] == batch , dim[1] == channel, dim[2] - // == height, dim[3] == width. and if fm == Tformat::NHWC, dim[0] == batch, - // dim[1] == height, dim[2] == width, dim[3] == channel - if (fm == Tformat::NCHW) { - for (unsigned int i = 0; i < batch(); ++i) - for (unsigned int j = 0; j < channel(); ++j) - for (unsigned int k = 0; k < height(); ++k) - for (unsigned int l = 0; l < width(); ++l) - this->setValue(i, j, k, l, d[i][j][k][l]); - } else { - for (unsigned int i = 0; i < batch(); ++i) - for (unsigned int j = 0; j < height(); ++j) - for (unsigned int k = 0; k < width(); ++k) - for (unsigned int l = 0; l < channel(); ++l) - this->setValue(i, l, j, k, d[i][j][k][l]); - } -} - bool HalfTensor::operator==(const HalfTensor &rhs) const { const _FP16 *_data = (_FP16 *)getData(); const _FP16 *_rdata = (_FP16 *)rhs.getData(); @@ -282,9 +231,8 @@ void HalfTensor::initialize(Initializer init) { initialize(); } -TensorV2 &HalfTensor::apply(std::function<_FP16(_FP16)> f, - TensorV2 &output) const { - CREATE_V2_IF_EMPTY_DIMS(output, dim, nullptr); +Tensor &HalfTensor::apply(std::function<_FP16(_FP16)> f, Tensor &output) const { + CREATE_IF_EMPTY_DIMS(output, dim, nullptr); if (contiguous && output.getContiguous()) { const _FP16 *data = (_FP16 *)getData(); @@ -317,9 +265,9 @@ TensorV2 &HalfTensor::apply(std::function<_FP16(_FP16)> f, return output; } -TensorV2 HalfTensor::multiply_strided(TensorV2 const &m, TensorV2 &output, - const float beta) const { - CREATE_V2_IF_EMPTY_DIMS(output, dim, nullptr); +Tensor HalfTensor::multiply_strided(Tensor const &m, Tensor &output, + const float beta) const { + CREATE_IF_EMPTY_DIMS(output, dim, nullptr); if (size() != m.size() || size() != output.size()) throw std::invalid_argument( @@ -385,28 +333,19 @@ int HalfTensor::multiply_i(float const &value) { return ML_ERROR_NONE; } -TensorV2 &HalfTensor::multiply(float const &value, TensorV2 &out) const { +Tensor &HalfTensor::multiply(float const &value, Tensor &out) const { auto f = std::bind(std::multiplies<_FP16>(), std::placeholders::_1, static_cast<_FP16>(value)); apply(f, out); return out; } -TensorV2 &HalfTensor::multiply(TensorV2 const &m, TensorV2 &output, - const float beta) const { - auto f = [&](const BroadcastInfoV2 &e, const _FP16 *buf, const _FP16 *m_buf, +Tensor &HalfTensor::multiply(Tensor const &m, Tensor &output, + const float beta) const { + auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf, _FP16 *out_buf) { - if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1 && - std::fpclassify(beta) == FP_ZERO) { - ele_mul(e.buffer_size, buf, m_buf, out_buf); - } else { - for (unsigned int i = 0; i < e.buffer_size; ++i) { - *out_buf = *buf * *m_buf + static_cast<_FP16>(beta) * *out_buf; - buf += strides[3]; - m_buf += e.strides[3]; - out_buf += output.getStrides()[3]; - } - } + ele_mul(e.buffer_size, buf, m_buf, out_buf, 1, beta, e.strides[3], + strides[3]); }; NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument) @@ -422,8 +361,8 @@ TensorV2 &HalfTensor::multiply(TensorV2 const &m, TensorV2 &output, return output; } -TensorV2 &HalfTensor::add_strided(TensorV2 const &input, TensorV2 &output, - const float beta) const { +Tensor &HalfTensor::add_strided(Tensor const &input, Tensor &output, + const float beta) const { if (size() != input.size() || size() != output.size()) throw std::invalid_argument( "Strided multiplication does not support broadcasting"); @@ -480,54 
+419,71 @@ TensorV2 &HalfTensor::add_strided(TensorV2 const &input, TensorV2 &output, return output; } -TensorV2 &HalfTensor::add(float const &value, TensorV2 &output) const { +int HalfTensor::add_i(Tensor const &m, Tensor &output, float const alpha) { + auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf, + _FP16 *out_buf) { + saxpy(e.buffer_size, alpha, m_buf, e.strides[3], out_buf, strides[3]); + /// @todo: saxpy is not valid for _FP16 + }; + + try { + apply_broadcast(m, f, output); + } catch (std::exception &err) { + ml_loge("%s %s", typeid(err).name(), err.what()); + return ML_ERROR_INVALID_PARAMETER; + } + return ML_ERROR_NONE; +} + +int HalfTensor::add_i_partial(unsigned int len, unsigned int addr_idx, + Tensor &m, unsigned int incX, unsigned int incY, + const Tensor alphas, unsigned int alpha_idx) { + saxpy(len, alphas.getValue<_FP16>(alpha_idx), m.getData<_FP16>(), incX, + (_FP16 *)getAddress(addr_idx), incY); + + return ML_ERROR_NONE; +} + +Tensor &HalfTensor::add(float const &value, Tensor &output) const { auto f = std::bind(std::plus<_FP16>(), std::placeholders::_1, static_cast<_FP16>(value)); apply(f, output); return output; } -TensorV2 &HalfTensor::add(TensorV2 const &m, TensorV2 &output, - float const alpha) const { - auto f = [&](const BroadcastInfoV2 &e, const _FP16 *buf, const _FP16 *m_buf, +Tensor &HalfTensor::add(Tensor const &m, Tensor &output, + float const alpha) const { + auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf, _FP16 *out_buf) { - if (e.strides[3] == 1 && strides[3] == 1 && strides[3] == 1 && alpha == 1) { - ele_add(e.buffer_size, buf, m_buf, out_buf); - } else { - for (unsigned int i = 0; i < e.buffer_size; ++i) { - *out_buf = *buf + *m_buf * static_cast<_FP16>(alpha); - buf += strides[3]; - m_buf += e.strides[3]; - out_buf += strides[3]; - } - } + ele_add(e.buffer_size, buf, m_buf, out_buf, alpha, 0, e.strides[3], + strides[3]); }; apply_broadcast(m, f, output); return output; } -TensorV2 &HalfTensor::subtract(float const &value, TensorV2 &output) const { +Tensor &HalfTensor::subtract(float const &value, Tensor &output) const { auto f = std::bind(std::minus<_FP16>(), std::placeholders::_1, static_cast<_FP16>(value)); apply(f, output); return output; } -void HalfTensor::sum_by_batch(TensorV2 &output) const { +void HalfTensor::sum_by_batch(Tensor &output) const { size_t feat_len = dim.getFeatureLen(); size_t batch = dim.batch(); const _FP16 *data = (_FP16 *)getData(); _FP16 *out_data = output.getData<_FP16>(); - TensorV2 ones(1, 1, 1, feat_len, this->getTensorType()); + Tensor ones(1, 1, 1, feat_len, this->getTensorType()); ones.setValue((_FP16)1.0); sgemv(CblasRowMajor, CblasNoTrans, batch, feat_len, 1, data, feat_len, ones.getData<_FP16>(), 1, 0.0, out_data, 1); } -TensorV2 &HalfTensor::sum(unsigned int axis, TensorV2 &output, float alpha, - float beta) const { +Tensor &HalfTensor::sum(unsigned int axis, Tensor &output, float alpha, + float beta) const { const _FP16 *data = (_FP16 *)getData(); @@ -538,35 +494,35 @@ TensorV2 &HalfTensor::sum(unsigned int axis, TensorV2 &output, float alpha, throw std::out_of_range("Error: axis is invalid"); if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) { - CREATE_V2_IF_EMPTY_DIMS(output, dim); + CREATE_IF_EMPTY_DIMS(output, dim); scopy(size(), (_FP16 *)getData(), 1, output.getData<_FP16>(), 1); return output; } switch (axis) { case 0: { - CREATE_V2_IF_EMPTY_DIMS(output, 1, dim.channel(), dim.height(), dim.width(), - this->getTensorType()); + 
CREATE_IF_EMPTY_DIMS(output, 1, dim.channel(), dim.height(), dim.width(), + this->getTensorType()); size_t feat_len = dim.getFeatureLen(); size_t batch = dim.batch(); - TensorV2 ones(1, 1, 1, batch, this->getTensorType()); + Tensor ones(1, 1, 1, batch, this->getTensorType()); ones.setValue(alpha); sgemv(CblasRowMajor, CblasTrans, batch, feat_len, 1, data, feat_len, ones.getData<_FP16>(), 1, beta, output.getData<_FP16>(), 1); } break; case 1: { - CREATE_V2_IF_EMPTY_DIMS(output, dim[0], 1, dim[2], dim[3], getTensorType()); + CREATE_IF_EMPTY_DIMS(output, dim[0], 1, dim[2], dim[3], getTensorType()); if (this->getFormat() == Tformat::NHWC) { unsigned int feat_len = output.getDim().getDataLen(); unsigned int t_axis = dim[1]; - TensorV2 ones(1, 1, 1, t_axis, this->getTensorType()); + Tensor ones(1, 1, 1, t_axis, this->getTensorType()); ones.setValue(alpha); sgemv(CblasRowMajor, CblasNoTrans, feat_len, t_axis, 1, data, t_axis, ones.getData<_FP16>(), 1, beta, output.getData<_FP16>(), 1); } else { unsigned int feat_len = dim[2] * dim[3]; unsigned int t_axis = dim[1]; - TensorV2 ones(1, 1, 1, t_axis, getTensorType()); + Tensor ones(1, 1, 1, t_axis, getTensorType()); ones.setValue(alpha); _FP16 *rdata = output.getData<_FP16>(); for (unsigned int k = 0; k < dim[0]; ++k) { @@ -577,12 +533,12 @@ TensorV2 &HalfTensor::sum(unsigned int axis, TensorV2 &output, float alpha, } } break; case 2: { - CREATE_V2_IF_EMPTY_DIMS(output, dim[0], dim[1], 1, dim[3], getTensorType()); + CREATE_IF_EMPTY_DIMS(output, dim[0], dim[1], 1, dim[3], getTensorType()); if (this->getFormat() == Tformat::NHWC) { unsigned int feat_len = dim[1] * dim[3]; unsigned int t_axis = dim[2]; - TensorV2 ones(1, 1, 1, t_axis, getTensorType()); + Tensor ones(1, 1, 1, t_axis, getTensorType()); ones.setValue(alpha); _FP16 *rdata = output.getData<_FP16>(); for (unsigned int k = 0; k < dim[0]; ++k) { @@ -593,7 +549,7 @@ TensorV2 &HalfTensor::sum(unsigned int axis, TensorV2 &output, float alpha, } else { unsigned int t_3 = dim[3]; unsigned int t_axis = dim[2]; - TensorV2 ones(1, 1, 1, t_axis, getTensorType()); + Tensor ones(1, 1, 1, t_axis, getTensorType()); ones.setValue(alpha); _FP16 *rdata = output.getData<_FP16>(); for (unsigned int k = 0; k < dim[0]; ++k) { @@ -607,11 +563,11 @@ TensorV2 &HalfTensor::sum(unsigned int axis, TensorV2 &output, float alpha, } } break; case 3: { - CREATE_V2_IF_EMPTY_DIMS(output, dim[0], dim[1], dim[2], 1, getTensorType()); + CREATE_IF_EMPTY_DIMS(output, dim[0], dim[1], dim[2], 1, getTensorType()); if (this->getFormat() == Tformat::NHWC) { unsigned int t_3 = dim[1]; unsigned int t_axis = dim[3]; - TensorV2 ones(1, 1, 1, t_axis, getTensorType()); + Tensor ones(1, 1, 1, t_axis, getTensorType()); ones.setValue(alpha); _FP16 *rdata = output.getData<_FP16>(); for (unsigned int k = 0; k < dim[0]; ++k) { @@ -625,7 +581,7 @@ TensorV2 &HalfTensor::sum(unsigned int axis, TensorV2 &output, float alpha, } else { unsigned int m = output.getDim().getDataLen(); unsigned int n = dim[3]; - TensorV2 ones(1, 1, 1, n, getTensorType()); + Tensor ones(1, 1, 1, n, getTensorType()); ones.setValue(alpha); sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n, ones.getData<_FP16>(), 1, beta, output.getData<_FP16>(), 1); @@ -642,7 +598,7 @@ float HalfTensor::l2norm() const { return snrm2(size(), (_FP16 *)getData(), 1); } -TensorV2 &HalfTensor::pow(float exponent, TensorV2 &output) const { +Tensor &HalfTensor::pow(float exponent, Tensor &output) const { auto f = [exponent](float in) { return static_cast<_FP16>(powf(in, exponent)); }; @@ -650,7 
+606,7 @@ TensorV2 &HalfTensor::pow(float exponent, TensorV2 &output) const {
   return output;
 }

-TensorV2 &HalfTensor::erf(TensorV2 &output) const {
+Tensor &HalfTensor::erf(Tensor &output) const {
   auto f = [](_FP16 in) {
     return static_cast<_FP16>(std::erf(static_cast<float>(in)));
   };
@@ -658,8 +614,20 @@ TensorV2 &HalfTensor::erf(TensorV2 &output) const {
   return output;
 }

-TensorV2 &HalfTensor::dot(TensorV2 const &input, TensorV2 &output, bool trans,
-                          bool trans_in, float beta) const {
+void HalfTensor::inv_sqrt(Tensor &out) {
+  if (!contiguous) {
+    apply(
+      [](_FP16 val) -> _FP16 {
+        return static_cast<_FP16>(1 / std::sqrt(static_cast<float>(val)));
+      },
+      out);
+  } else {
+    inv_sqrt_inplace(out.size(), out.getData<_FP16>());
+  }
+}
+
+Tensor &HalfTensor::dot(Tensor const &input, Tensor &output, bool trans,
+                        bool trans_in, float beta) const {
   // Comment out with intension to support the calculation wrt. batch and height
   // direction. It supposes to have this->dim as [ BxCxH,W ] and input.dim is
   // [BxCxH,W] as well if (input.dim.rank() > 2) {
@@ -729,7 +697,7 @@ void HalfTensor::dropout_mask(float dropout) {
   }
 }

-void HalfTensor::filter_mask(const TensorV2 &mask_len, bool reverse) {
+void HalfTensor::filter_mask(const Tensor &mask_len, bool reverse) {
   float fill_mask_val = 0.0;
   float en_mask_val = 1.0 - fill_mask_val;
@@ -750,7 +718,7 @@ void HalfTensor::filter_mask(const TensorV2 &mask_len, bool reverse) {
   }
 }

-void HalfTensor::zoneout_mask(TensorV2 &opposite, float zoneout) {
+void HalfTensor::zoneout_mask(Tensor &opposite, float zoneout) {
   _FP16 zoneout_fp16 = (_FP16)zoneout;
   opposite.setRandBernoulli(zoneout_fp16);
@@ -766,7 +734,7 @@ void HalfTensor::zoneout_mask(TensorV2 &opposite, float zoneout) {
   }
 }

-std::vector<TensorV2> HalfTensor::split(std::vector<size_t> sizes, int axis) {
+std::vector<Tensor> HalfTensor::split(std::vector<size_t> sizes, int axis) {
   size_t num_size = sizes.size();

   if (axis == -1) {
@@ -786,7 +754,7 @@ std::vector<TensorV2> HalfTensor::split(std::vector<size_t> sizes, int axis) {
   }

   bool is_format_nchw = (dim.getFormat() == Tformat::NCHW) ? true : false;

-  std::vector<TensorV2> ret;
+  std::vector<Tensor> ret;

   auto iter_value = [this, is_format_nchw](
                       std::array &loc,
@@ -868,16 +836,15 @@ std::vector<TensorV2> HalfTensor::split(std::vector<size_t> sizes, int axis) {
   return ret;
 }

-TensorV2 HalfTensor::cat(const std::vector<TensorV2> &tensors, int axis) {
+Tensor HalfTensor::concat(const std::vector<Tensor> &tensors, int axis) {
   if (axis == -1) {
     axis = 3;
   }
-  TensorV2 ret;
   auto ref_dim = tensors.front().getDim();
   bool is_format_nchw = (ref_dim.getFormat() == Tformat::NCHW);
   ref_dim.setTensorDim(axis, 1);
   NNTR_THROW_IF(!std::all_of(tensors.begin(), tensors.end(),
-                             [&ref_dim, axis](const TensorV2 &t) {
+                             [&ref_dim, axis](const Tensor &t) {
                                auto cur_dim = t.getDim();
                                cur_dim.setTensorDim(axis, 1);
                                return ref_dim == cur_dim;
@@ -887,12 +854,12 @@ TensorV2 HalfTensor::cat(const std::vector<TensorV2> &tensors, int axis) {
    << ref_dim << " axis : " << axis;

   auto axis_dim = std::accumulate(tensors.begin(), tensors.end(), 0u,
-                                  [axis](unsigned cur, const TensorV2 &t) {
+                                  [axis](unsigned cur, const Tensor &t) {
                                     return cur += t.getDim().getTensorDim(axis);
                                   });

   auto iter_value = [is_format_nchw](std::array &loc,
-                                     const std::array &start_loc, TensorV2 &t,
+                                     const std::array &start_loc, Tensor &t,
                                      const std::array &ref_dim_arr) -> _FP16 & {
     auto &value = is_format_nchw ?
t.getValue<_FP16>(loc[0], loc[1], loc[2], loc[3]) @@ -912,7 +879,7 @@ TensorV2 HalfTensor::cat(const std::vector &tensors, int axis) { auto ret_dim = ref_dim; ret_dim.setTensorDim(axis, axis_dim); - ret = TensorV2(ret_dim); + Tensor output = Tensor(ret_dim); std::array loc = {0, 0, 0, 0}; for (auto &t : tensors) { @@ -931,7 +898,7 @@ TensorV2 HalfTensor::cat(const std::vector &tensors, int axis) { } for (size_t i = 0u, sz = t.size(); i < sz; ++i) { - iter_value(loc, start_loc, ret, tensor_dim_arr) = t.getValue<_FP16>(i); + iter_value(loc, start_loc, output, tensor_dim_arr) = t.getValue<_FP16>(i); } if (is_format_nchw) { @@ -946,11 +913,10 @@ TensorV2 HalfTensor::cat(const std::vector &tensors, int axis) { } } } - return ret; + return output; } void HalfTensor::print(std::ostream &out) const { - printInstance(out, this); const _FP16 *data = (_FP16 *)getData(); unsigned int len = size(); out << "data addr: " << data << '\n'; @@ -999,39 +965,29 @@ void HalfTensor::print(std::ostream &out) const { out.copyfmt(init); } -TensorV2 &HalfTensor::divide(float const &value, TensorV2 &output) const { +Tensor &HalfTensor::divide(float const &value, Tensor &output) const { auto f = std::bind(std::divides<_FP16>(), std::placeholders::_1, static_cast<_FP16>(value)); apply(f, output); return output; } -TensorV2 &HalfTensor::divide(TensorV2 const &m, TensorV2 &output) const { - auto f = [&](const BroadcastInfoV2 &e, const _FP16 *buf, const _FP16 *m_buf, +Tensor &HalfTensor::divide(Tensor const &m, Tensor &output) const { + auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf, _FP16 *out_buf) { - if (e.strides[3] == 1 && output.getStrides()[3] == 1 && strides[3] == 1) { - std::transform(buf, buf + e.buffer_size, m_buf, out_buf, - std::divides<_FP16>()); - } else { - for (unsigned int i = 0; i < e.buffer_size; ++i) { - *out_buf = *buf / *m_buf; - buf += strides[3]; - m_buf += e.strides[3]; - out_buf += output.getStrides()[3]; - } - } + ele_div(e.buffer_size, buf, m_buf, out_buf, 1, 0, e.strides[3], strides[3]); }; apply_broadcast(m, f, output); return output; } -void HalfTensor::copy(const TensorV2 &from) { +void HalfTensor::copy(const Tensor &from) { reshape(from.getDim()); copy(from.getData<_FP16>()); } -void HalfTensor::copyData(const TensorV2 &from) { +void HalfTensor::copyData(const Tensor &from) { if (!contiguous) { throw std::runtime_error("Cannot copy non-contiguous tensor"); } @@ -1052,6 +1008,18 @@ void HalfTensor::copyData(const TensorV2 &from) { } } +void HalfTensor::copy_with_stride(const Tensor &input, Tensor &output) { + for (unsigned int b = 0; b < output.batch(); ++b) { + for (unsigned int c = 0; c < output.channel(); ++c) { + for (unsigned int h = 0; h < output.height(); ++h) { + for (unsigned int w = 0; w < output.width(); ++w) { + output.setValue(b, c, h, w, input.getValue<_FP16>(b, c, h, w)); + } + } + } + } +} + std::vector HalfTensor::argmax() const { std::vector result; const _FP16 *data = (_FP16 *)getData(); @@ -1085,8 +1053,8 @@ float HalfTensor::minValue() const { return (float)*std::min_element(data, data + size()); } -TensorV2 &HalfTensor::transpose(const std::string &direction, - TensorV2 &output) const { +Tensor &HalfTensor::transpose(const std::string &direction, + Tensor &output) const { unsigned int SL, SI, SJ, SK; output.reshape(dim.transpose(direction)); @@ -1110,7 +1078,14 @@ TensorV2 &HalfTensor::transpose(const std::string &direction, } } else { if (is_format_nchw) { - transposeloop(l, i, k, j, SL, SI, SK, SJ); + for (unsigned int b = 0; b < batch(); 
++b) { + for (unsigned int c = 0; c < channel(); ++c) { + transpose_matrix( + height(), width(), (_FP16 *)getData() + getIndex(b, c, 0, 0), + width(), (_FP16 *)output.getData() + output.getIndex(b, c, 0, 0), + output.width()); + } + } } else { transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI); } @@ -1163,12 +1138,12 @@ void HalfTensor::copy(const void *buf) { } void HalfTensor::apply_broadcast( - TensorV2 const &m, - std::function v_func, - TensorV2 &output) const { - CREATE_V2_IF_EMPTY_DIMS(output, dim, nullptr); + Tensor &output) const { + CREATE_IF_EMPTY_DIMS(output, dim, nullptr); NNTR_THROW_IF(getData() == nullptr, std::invalid_argument) << getName() << " is not allocated"; @@ -1181,7 +1156,7 @@ void HalfTensor::apply_broadcast( /// note that buffer_size, the last stride is only used in v_func but it /// might be changed if (dim == m.getDim()) { - BroadcastInfoV2 e; + BroadcastInfo e; e.buffer_size = size(); e.strides[3] = 1; v_func(e, (_FP16 *)getData(), m.getData<_FP16>(), output.getData<_FP16>()); @@ -1192,11 +1167,11 @@ void HalfTensor::apply_broadcast( } void HalfTensor::apply_broadcast_util( - TensorV2 const &m, - std::function v_func, - TensorV2 &output, const BroadcastInfoV2 &e, int cur_axis, size_t offset, + Tensor &output, const BroadcastInfo &e, int cur_axis, size_t offset, size_t m_offset) const { const _FP16 *buf = (_FP16 *)this->getData(); diff --git a/nntrainer/tensor/half_tensor.h b/nntrainer/tensor/half_tensor.h index 57451e3517..e0dfd77748 100644 --- a/nntrainer/tensor/half_tensor.h +++ b/nntrainer/tensor/half_tensor.h @@ -14,7 +14,6 @@ #ifdef __cplusplus #include -#include #ifdef DEBUG #define EXCEPT_WHEN_DEBUG @@ -61,7 +60,61 @@ class HalfTensor : public TensorBase { * @param fm format for the Tensor */ HalfTensor(std::vector>>> const &d, - Tformat fm); + Tformat fm) { + if (d.empty() || d[0].empty() || d[0][0].empty() || d[0][0][0].empty()) { + throw std::out_of_range( + "[Tensor] trying to initialize HalfTensor from empty vector"); + } + + dim.setTensorDim(0, d.size()); + if (fm == Tformat::NCHW) { + dim.setTensorDim(1, d[0].size()); + dim.setTensorDim(2, d[0][0].size()); + dim.setTensorDim(3, d[0][0][0].size()); + } else { + dim.setTensorDim(2, d[0].size()); + dim.setTensorDim(3, d[0][0].size()); + dim.setTensorDim(1, d[0][0][0].size()); + } + + dim.setTensorType({fm, Tdatatype::FP16}); + + strides = dim.computeStrides(); + contiguous = true; + initializer = Initializer::NONE; + + MemoryData *mem_data = + new MemoryData((void *)(new _FP16[dim.getDataLen()]())); + data = std::shared_ptr(mem_data, [](MemoryData *mem_data) { + delete[] mem_data->getAddr<_FP16>(); + }); + + offset = 0; + + // if fm == Tformat::NCHW, then dim[0] == batch , dim[1] == channel, dim[2] + // == height, dim[3] == width. 
and if fm == Tformat::NHWC, dim[0] == batch, + // dim[1] == height, dim[2] == width, dim[3] == channel + if (fm == Tformat::NCHW) { + for (unsigned int i = 0; i < batch(); ++i) + for (unsigned int j = 0; j < channel(); ++j) + for (unsigned int k = 0; k < height(); ++k) + for (unsigned int l = 0; l < width(); ++l) + this->setValue(i, j, k, l, d[i][j][k][l]); + } else { + for (unsigned int i = 0; i < batch(); ++i) + for (unsigned int j = 0; j < height(); ++j) + for (unsigned int k = 0; k < width(); ++k) + for (unsigned int l = 0; l < channel(); ++l) + this->setValue(i, l, j, k, d[i][j][k][l]); + } + } + + /** + * @brief Construct a new FloatTensor object + * + * @param rhs TensorBase object to copy + */ + HalfTensor(TensorBase &rhs) : TensorBase(rhs) {} /** * @brief Basic Destructor @@ -83,22 +136,22 @@ class HalfTensor : public TensorBase { bool operator!=(const HalfTensor &rhs) const { return !(*this == rhs); } /** - * @copydoc TensorV2::allocate() + * @copydoc Tensor::allocate() */ void allocate() override; /** - * @copydoc TensorV2::deallocate() + * @copydoc Tensor::deallocate() */ void deallocate() override; /** - * @copydoc TensorV2::getData() + * @copydoc Tensor::getData() */ void *getData() const override; /** - * @copydoc TensorV2::getData(size_t idx) + * @copydoc Tensor::getData(size_t idx) */ void *getData(size_t idx) const override; @@ -147,24 +200,24 @@ class HalfTensor : public TensorBase { unsigned int w); /** - * @copydoc TensorV2::setValue(float value) + * @copydoc Tensor::setValue(float value) */ void setValue(float value) override; /** - * @copydoc TensorV2::setValue(b, c, h, w, value) + * @copydoc Tensor::setValue(b, c, h, w, value) */ void setValue(unsigned int b, unsigned int c, unsigned int h, unsigned int w, float value) override; /** - * @copydoc TensorV2::addValue(b, c, h, w, value, beta) + * @copydoc Tensor::addValue(b, c, h, w, value, beta) */ void addValue(unsigned int b, unsigned int c, unsigned int h, unsigned int w, float value, float beta) override; /** - * @copydoc TensorV2::setZero() + * @copydoc Tensor::setZero() */ void setZero() override; @@ -185,170 +238,193 @@ class HalfTensor : public TensorBase { }; /** - * @copydoc TensorV2::setRandNormal() + * @copydoc Tensor::setRandNormal() */ - void setRandNormal(float mean = 0.0f, float stddev = 0.05f); + void setRandNormal(float mean = 0.0f, float stddev = 0.05f) override; /** - * @copydoc TensorV2::setRandUniform() + * @copydoc Tensor::setRandUniform() */ - void setRandUniform(float min = -0.05f, float max = 0.05f); + void setRandUniform(float min = -0.05f, float max = 0.05f) override; /** - * @copydoc TensorV2::setRandBernoulli() + * @copydoc Tensor::setRandBernoulli() */ - void setRandBernoulli(float probability = 0.5f); + void setRandBernoulli(float probability = 0.5f) override; /** - * @copydoc TensorV2::initialize() + * @copydoc Tensor::initialize() */ void initialize() override; /** - * @copydoc TensorV2::initialize(Initializer init) + * @copydoc Tensor::initialize(Initializer init) */ void initialize(Initializer init) override; /** - * @copydoc TensorV2::apply(std::function f, TensorV2 &output) + * @copydoc Tensor::apply(std::function f, Tensor &output) */ - TensorV2 &apply(std::function<_FP16(_FP16)> f, - TensorV2 &output) const override; + Tensor &apply(std::function<_FP16(_FP16)> f, Tensor &output) const override; /** - * @copydoc TensorV2::multiply_strided(TensorV2 const &m, TensorV2 &output, + * @copydoc Tensor::multiply_strided(Tensor const &m, Tensor &output, * const float beta) */ - TensorV2 
multiply_strided(TensorV2 const &m, TensorV2 &output,
-                            const float beta) const override;
+  Tensor multiply_strided(Tensor const &m, Tensor &output,
+                          const float beta) const override;

   /**
-   * @copydoc TensorV2::multiply_i(float const &value)
+   * @copydoc Tensor::multiply_i(float const &value)
    */
   int multiply_i(float const &value) override;

   /**
-   * @copydoc TensorV2::multiply(float const &value, TensorV2 &out)
+   * @copydoc Tensor::multiply(float const &value, Tensor &out)
    */
-  TensorV2 &multiply(float const &value, TensorV2 &out) const override;
+  Tensor &multiply(float const &value, Tensor &out) const override;

   /**
-   * @copydoc TensorV2::multiply(TensorV2 const &m, TensorV2 &output, const
+   * @copydoc Tensor::multiply(Tensor const &m, Tensor &output, const
    * float beta = 0.0)
    */
-  TensorV2 &multiply(TensorV2 const &m, TensorV2 &output,
-                     const float beta = 0.0) const override;
+  Tensor &multiply(Tensor const &m, Tensor &output,
+                   const float beta = 0.0) const override;

   /**
-   * @copydoc TensorV2::divide(float const &value, TensorV2 &output)
+   * @copydoc Tensor::divide(float const &value, Tensor &output)
    */
-  TensorV2 &divide(float const &value, TensorV2 &output) const override;
+  Tensor &divide(float const &value, Tensor &output) const override;

   /**
-   * @copydoc TensorV2::divide(TensorV2 const &m, TensorV2 &output)
+   * @copydoc Tensor::divide(Tensor const &m, Tensor &output)
    */
-  TensorV2 &divide(TensorV2 const &m, TensorV2 &output) const override;
+  Tensor &divide(Tensor const &m, Tensor &output) const override;

   /**
-   * @copydoc TensorV2::add_strided(TensorV2 const &input, TensorV2 &output,
+   * @copydoc Tensor::add_strided(Tensor const &input, Tensor &output,
    * const float beta)
    */
-  TensorV2 &add_strided(TensorV2 const &input, TensorV2 &output,
-                        const float beta) const override;
+  Tensor &add_strided(Tensor const &input, Tensor &output,
+                      const float beta) const override;
+
+  /**
+   * @copydoc Tensor::add_i(Tensor const &m, float const alpha)
+   */
+  int add_i(Tensor const &m, Tensor &output, float const alpha) override;

   /**
-   * @copydoc TensorV2::add(float const &value, TensorV2 &output)
+   * @copydoc Tensor::add_i_partial()
    */
-  TensorV2 &add(float const &value, TensorV2 &output) const override;
+  int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m,
+                    unsigned int incX, unsigned int incY, const Tensor alphas,
+                    unsigned int alpha_idx) override;

   /**
-   * @copydoc TensorV2::add(TensorV2 const &m, TensorV2 &output, float const
+   * @copydoc Tensor::add(float const &value, Tensor &output)
+   */
+  Tensor &add(float const &value, Tensor &output) const override;
+
+  /**
+   * @copydoc Tensor::add(Tensor const &m, Tensor &output, float const
    * alpha)
    */
-  TensorV2 &add(TensorV2 const &m, TensorV2 &output,
-                float const alpha) const override;
+  Tensor &add(Tensor const &m, Tensor &output,
+              float const alpha) const override;

   /**
-   * @copydoc TensorV2::subtract(float const &value, TensorV2 &output)
+   * @copydoc Tensor::subtract(float const &value, Tensor &output)
    */
-  TensorV2 &subtract(float const &value, TensorV2 &output) const override;
+  Tensor &subtract(float const &value, Tensor &output) const override;

   /**
-   * @copydoc TensorBase::sum_by_batch(TensorV2 &output)
+   * @copydoc TensorBase::sum_by_batch(Tensor &output)
    */
-  void sum_by_batch(TensorV2 &output) const override;
+  void sum_by_batch(Tensor &output) const override;

   /**
-   * @copydoc TensorV2::sum(unsigned int axis, TensorV2 &output, float alpha,
+   * @copydoc Tensor::sum(unsigned int axis, Tensor &output, float alpha,
    * float beta) const
    */
-  TensorV2 &sum(unsigned int axis, TensorV2 &output, float alpha,
-                float beta) const override;
+  Tensor &sum(unsigned int axis, Tensor &output, float alpha,
+              float beta) const override;

   /**
-   * @copydoc TensorV2::l2norm
+   * @copydoc Tensor::l2norm
    */
   float l2norm() const override;

   /**
-   * @copydoc TensorV2::pow(float exponent, TensorV2 &output)
+   * @copydoc Tensor::pow(float exponent, Tensor &output)
+   */
+  Tensor &pow(float exponent, Tensor &output) const override;
+
+  /**
+   * @copydoc Tensor::erf(Tensor &output)
    */
-  TensorV2 &pow(float exponent, TensorV2 &output) const override;
+  Tensor &erf(Tensor &output) const override;

   /**
-   * @copydoc TensorV2::erf(TensorV2 &output)
+   * @copydoc TensorBase::inv_sqrt(Tensor &out)
    */
-  TensorV2 &erf(TensorV2 &output) const override;
+  void inv_sqrt(Tensor &out) override;

   /**
-   * @copydoc TensorV2::dot(TensorV2 const &input, TensorV2 &output, bool
+   * @copydoc Tensor::dot(Tensor const &input, Tensor &output, bool
    * trans, bool trans_in, float beta)
    */
-  TensorV2 &dot(TensorV2 const &input, TensorV2 &output, bool trans,
-                bool trans_in, float beta) const override;
+  Tensor &dot(Tensor const &input, Tensor &output, bool trans, bool trans_in,
+              float beta) const override;

   /**
-   * @copydoc TensorV2::dropout_mask(float dropout)
+   * @copydoc Tensor::dropout_mask(float dropout)
    */
   void dropout_mask(float dropout) override;

   /**
-   * @copydoc TensorV2::filter_mask(const TensorV2 &mask_len, bool reverse)
+   * @copydoc Tensor::filter_mask(const Tensor &mask_len, bool reverse)
    */
-  void filter_mask(const TensorV2 &mask_len, bool reverse) override;
+  void filter_mask(const Tensor &mask_len, bool reverse) override;

   /**
-   * @copydoc TensorV2::zoneout_mask(TensorV2 &opposite, float zoneout)
+   * @copydoc Tensor::zoneout_mask(Tensor &opposite, float zoneout)
    */
-  void zoneout_mask(TensorV2 &opposite, float zoneout) override;
+  void zoneout_mask(Tensor &opposite, float zoneout) override;

   /**
-   * @copydoc TensorV2::split(std::vector<size_t> sizes, int axis)
+   * @copydoc Tensor::split(std::vector<size_t> sizes, int axis)
    */
-  std::vector<TensorV2> split(std::vector<size_t> sizes, int axis) override;
+  std::vector<Tensor> split(std::vector<size_t> sizes, int axis) override;

   /**
-   * @copydoc TensorV2::cat(const std::vector<TensorV2> &tensors, int axis)
+   * @copydoc Tensor::cat(const std::vector<Tensor> &tensors, int axis)
    */
-  static TensorV2 cat(const std::vector<TensorV2> &tensors, int axis);
+  Tensor concat(const std::vector<Tensor> &tensors, int axis) override;

   /**
-   * @copydoc TensorV2::copy(const TensorV2 &from)
+   * @copydoc Tensor::copy(const Tensor &from)
    */
-  void copy(const TensorV2 &from);
+  void copy(const Tensor &from) override;

   /**
-   * @copydoc TensorV2::copyData(const TensorV2 &from)
+   * @copydoc Tensor::copyData(const Tensor &from)
    */
-  void copyData(const TensorV2 &from);
+  void copyData(const Tensor &from) override;

   /**
-   * @copydoc TensorV2::argmax()
+   * @brief Copy the Tensor
+   * @param[in] input Tensor to be copied
+   * @param[out] output output Tensor
+   */
+  void copy_with_stride(const Tensor &input, Tensor &output) override;
+
+  /**
+   * @copydoc Tensor::argmax()
    */
   std::vector<unsigned int> argmax() const override;

   /**
-   * @copydoc TensorV2::max_abs()
+   * @copydoc Tensor::max_abs()
    */
   float max_abs() const override;

@@ -363,13 +439,13 @@ class HalfTensor : public TensorBase {
   float minValue() const override;

   /**
-   * @copydoc TensorV2::transpose(const std::string &direction, TensorV2 &out)
+   * @copydoc Tensor::transpose(const std::string &direction, Tensor &out)
    */
-  TensorV2 &transpose(const std::string &direction,
-                      TensorV2 &output) const override;
+ Tensor &transpose(const std::string &direction, + Tensor &output) const override; /** - * @copydoc TensorV2::print(std::ostream &out) + * @copydoc Tensor::print(std::ostream &out) */ void print(std::ostream &out) const override; @@ -393,13 +469,14 @@ class HalfTensor : public TensorBase { * @retval #ML_ERROR_NONE Successful * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter */ - void apply_broadcast_util( - TensorV2 const &m, - std::function - v_func, - TensorV2 &output, const BroadcastInfoV2 &e, int cur_axis = -1, - size_t offset = 0, size_t m_offset = 0) const; + void + apply_broadcast_util(Tensor const &m, + std::function + v_func, + Tensor &output, const BroadcastInfo &e, + int cur_axis = -1, size_t offset = 0, + size_t m_offset = 0) const; /** * @brief Applies the given operator to the tensor with the passed argument @@ -409,12 +486,11 @@ class HalfTensor : public TensorBase { * @retval #ML_ERROR_NONE Successful * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter */ - void - apply_broadcast(TensorV2 const &m, - std::function - v_func, - TensorV2 &output) const; + void apply_broadcast(Tensor const &m, + std::function + v_func, + Tensor &output) const; }; } // namespace nntrainer diff --git a/nntrainer/tensor/manager.cpp b/nntrainer/tensor/manager.cpp index 9a0d235ba9..8b47235791 100644 --- a/nntrainer/tensor/manager.cpp +++ b/nntrainer/tensor/manager.cpp @@ -430,7 +430,7 @@ std::vector Manager::requestWeights( */ grad = tensor_pool.requestOrExtend(shared_name + Var_Grad::grad_suffix, dim_g, grad_exec_order, grad_ls, - Tensor::Initializer::ZEROS); + Initializer::ZEROS); } } else { /** case requesting fresh weights */ @@ -446,8 +446,8 @@ std::vector Manager::requestWeights( if (Weight::isGradientClipByGlobalNorm(clip_by_global_norm)) is_wgrad = false; grad = tensor_pool.request(name + Var_Grad::grad_suffix, dim_g, - grad_exec_order, grad_ls, - Tensor::Initializer::ZEROS, is_wgrad); + grad_exec_order, grad_ls, Initializer::ZEROS, + is_wgrad); } } @@ -515,17 +515,16 @@ std::vector Manager::requestTensors( if (need_grad && tspan > TensorLifespan::FORWARD_FUNC_LIFESPAN) { grad = tensor_pool.requestOrExtend(shared_name + Var_Grad::grad_suffix, dim, grad_exec_order, tspan, - Tensor::Initializer::ZEROS); + Initializer::ZEROS); } } else { var = tensor_pool.request(name, dim, var_exec_order, tspan, t_init); if (need_grad && tspan > TensorLifespan::FORWARD_FUNC_LIFESPAN) { - grad = - tensor_pool.request(name + Var_Grad::grad_suffix, /// name - dim, grad_exec_order, tspan, - Tensor::Initializer::ZEROS /// tensor initializer - ); + grad = tensor_pool.request(name + Var_Grad::grad_suffix, /// name + dim, grad_exec_order, tspan, + Initializer::ZEROS /// tensor initializer + ); } } @@ -668,8 +667,7 @@ bool Manager::isSecondLastAccess(const std::string &name, */ std::vector Manager::requestWeightOptimizerVariables( const std::vector &dims, const std::string &name, - const TensorLifespan &lifespan, bool is_grad_clip, - Tensor::Initializer initializer) { + const TensorLifespan &lifespan, bool is_grad_clip, Initializer initializer) { std::vector ret; ret.reserve(dims.size()); diff --git a/nntrainer/tensor/manager.h b/nntrainer/tensor/manager.h index 8ae5aa890a..9ccde77113 100644 --- a/nntrainer/tensor/manager.h +++ b/nntrainer/tensor/manager.h @@ -225,7 +225,7 @@ class Manager { std::vector requestWeightOptimizerVariables( const std::vector &dims, const std::string &name, const TensorLifespan &lifespan, bool is_grad_clip, - Tensor::Initializer initializer = Tensor::Initializer::NONE); + 
Initializer initializer = Initializer::NONE); /** * @brief Create tensors with the given spec diff --git a/nntrainer/tensor/meson.build b/nntrainer/tensor/meson.build index 202b730060..d1d824b61d 100644 --- a/nntrainer/tensor/meson.build +++ b/nntrainer/tensor/meson.build @@ -6,7 +6,6 @@ tensor_sources = [ 'lazy_tensor.cpp', 'manager.cpp', 'tensor.cpp', - 'tensor_v2.cpp', 'tensor_base.cpp', 'float_tensor.cpp', 'tensor_dim.cpp', @@ -25,7 +24,6 @@ tensor_sources = [ tensor_headers = [ 'memory_data.h', 'tensor.h', - 'tensor_v2.h', 'tensor_base.h', 'float_tensor.h', 'weight.h', diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp index 0997a5ee37..a1d3525602 100644 --- a/nntrainer/tensor/tensor.cpp +++ b/nntrainer/tensor/tensor.cpp @@ -1,571 +1,186 @@ +// SPDX-License-Identifier: Apache-2.0 /** - * Copyright (C) 2019 Samsung Electronics Co., Ltd. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * * @file tensor.cpp - * @date 04 December 2019 - * @brief This is Tensor class for calculation + * @date 01 December 2023 + * @brief This is a Tensor class * @see https://github.com/nnstreamer/nntrainer * @author Jijoong Moon + * @author Donghyeon Jeong * @bug No known bugs except for NYI items - * */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - +#include #include #include -#include - -#define transposeloop(cl, ci, cj, ck, sl, si, sj, sk) \ - do { \ - unsigned int i, j, k, l; \ - int inidx = 0, outidx = 0; \ - for (cl = 0; cl < sl; cl++) \ - for (ci = 0; ci < si; ci++) \ - for (cj = 0; cj < sj; cj++) \ - for (ck = 0; ck < sk; ck++) { \ - outidx = si * sj * sk * cl + sj * sk * ci + sk * cj + ck; \ - inidx = l * SI * SJ * SK + i * SJ * SK + j * SK + k; \ - outptr[outidx] = inptr[inidx]; \ - } \ - } while (0); - -#define transposeloop_nhwc(cl, ci, cj, ck, sl, si, sj, sk) \ - do { \ - unsigned int i, j, k, l; \ - int inidx = 0, outidx = 0; \ - for (cl = 0; cl < sl; cl++) \ - for (ci = 0; ci < si; ci++) \ - for (cj = 0; cj < sj; cj++) \ - for (ck = 0; ck < sk; ck++) { \ - outidx = si * sj * sk * cl + sj * sk * ci + sk * cj + ck; \ - inidx = l * SJ * SK * SI + j * SK * SI + k * SI + i; \ - outptr[outidx] = inptr[inidx]; \ - } \ - } while (0); -namespace nntrainer { +#ifdef ENABLE_FP16 +#include +#endif -/** - * @struct External Loop Info for broadcasted info - * @brief External Loop Info for broadcasted iteration. Please refer to - * DISABLED_private_external_loop_n in unittest_nntrainer_tensor. - * @note This should better be implemented in iterator fashion before used - * extensively. - */ -struct Tensor::BroadcastInfo { - - /** - * @brief Construct a new External Loop Info object - * - */ - BroadcastInfo() : - buffer_size(0), - buffer_axis(-1), - strides{0, 0, 0, 0}, - tensor_type(nntrainer::TensorDim::TensorType()) {} - - unsigned int buffer_size; /**< virtual size of the buffer */ - int buffer_axis; /**< the smallest axis that should be looped. 
- -1 means no loop needed*/ - std::array - strides; /**< modified strides for the loop */ - nntrainer::TensorDim::TensorType tensor_type; -}; +namespace nntrainer { -Tensor::Tensor(const TensorDim &d, bool alloc_now, Tensor::Initializer init, - std::string name_) : - Tensor(name_, d.getFormat()) { - if (d.getDataLen() != 0) { - dim = d; - strides = d.computeStrides(); - initializer = init; - if (alloc_now) - allocate(); - } -} +Tensor::Tensor(std::string name_, Tformat fm, Tdatatype d_type) { + itensor = nullptr; -Tensor::Tensor(const TensorDim &d, const void *buf) : Tensor(d, true) { - if (d.getDataLen() != 0) { - if (buf != nullptr) - copy(buf); + if (d_type == Tdatatype::FP32) { + itensor = std::shared_ptr(new FloatTensor(name_, fm), + std::default_delete()); + } else if (d_type == Tdatatype::FP16) { +#ifdef ENABLE_FP16 + itensor = std::shared_ptr(new HalfTensor(name_, fm), + std::default_delete()); +#else + throw std::invalid_argument("Error: enable-fp16 is not enabled"); +#endif + } else { + throw std::invalid_argument( + "Error: Tensor cannot be constructed because the given d_type is not " + "compatible with itensor. The supported d_types are: FP32, FP16 " + "(if built with ENABLE_FP16)."); } } -/** - * @class SrcSharedTensor - * @brief Source of the shared tensor - */ -class SrcSharedTensor { -public: - /** - * @brief Constructor for the class - */ - SrcSharedTensor() : src(nullptr), off(0) {} - - SrcSharedTensor(const Tensor *tensor, size_t offset) : - src(tensor), off(offset) {} - - /** - * @brief Get the allocated src tensor - */ - const Tensor *tensor() const { - if (!src) - throw std::runtime_error("Accessing empty src tensor"); - - return src; - } - - /** - * @brief Get the offset from the source tensor - */ - size_t offset() const { return off; } - -private: - const Tensor *src; /**< Tensor of the source */ - size_t off; /**< offset from the source data ptr */ -}; - -void Tensor::allocate() { - if (empty() || data) - /// already allocated - return; +Tensor::Tensor(const TensorDim &d, bool alloc_now, Initializer init, + std::string name) { + itensor = nullptr; - if (src_tensor) { - /// allocate data based on the source tensor - data = src_tensor->tensor()->data; - offset = src_tensor->tensor()->offset + src_tensor->offset(); - /** as this memory is shared, do NOT initialize */ - } else { - /// allocate new memory for the tensor data - - MemoryData *mem_data; - - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - mem_data = new MemoryData((void *)(new float[dim.getDataLen()]{})); - data = std::shared_ptr(mem_data, [](auto *mem_data) { - delete[] mem_data->template getAddr(); - delete mem_data; - }); - - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { + if (d.getDataType() == Tdatatype::FP32) { + itensor = + std::shared_ptr(new FloatTensor(d, alloc_now, init, name), + std::default_delete()); + } else if (d.getDataType() == Tdatatype::FP16) { #ifdef ENABLE_FP16 - mem_data = new MemoryData((void *)(new _FP16[dim.getDataLen()]{})); - data = std::shared_ptr(mem_data, [](auto *mem_data) { - delete[] mem_data->template getAddr<_FP16>(); - delete mem_data; - }); + itensor = + std::shared_ptr(new HalfTensor(d, alloc_now, init, name), + std::default_delete()); #else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); + throw std::invalid_argument("Error: enable-fp16 is not enabled"); #endif - } else if (getDataType() == ml::train::TensorDim::DataType::QINT8) { - mem_data = new MemoryData((void *)(new uint8_t[dim.getDataLen()]{})); - data = 
std::shared_ptr(mem_data, [](auto *mem_data) { - delete[] mem_data->template getAddr(); - delete mem_data; - }); - } else if (getDataType() == ml::train::TensorDim::DataType::QINT4) { - mem_data = - new MemoryData((void *)(new uint8_t[(dim.getDataLen() + 1) / 2]{})); - data = std::shared_ptr(mem_data, [](auto *mem_data) { - delete[] mem_data->template getAddr(); - delete mem_data; - }); - } - offset = 0; - initialize(); + } else { + throw std::invalid_argument( + "Error: Tensor cannot be constructed because the given d_type is not " + "compatible with itensor. The supported d_types are: FP32, FP16 " + "(if built with ENABLE_FP16)."); } } -bool Tensor::operator==(const Tensor &rhs) const { - if (this->dim != rhs.dim) - return false; - - size_t len = size(); - - if (len != rhs.size()) - return false; - - if (contiguous != rhs.contiguous) - return false; - - if (strides != rhs.strides) - return false; - - if (getScaleFactors() != rhs.getScaleFactors()) - return false; - - if (getZeroPoints() != rhs.getZeroPoints()) - return false; +Tensor::Tensor(const TensorDim &d, const void *buf) { + itensor = nullptr; - if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - const float *_data = getData(); - const float *_rdata = rhs.getData(); - for (size_t i = 0; i < len; ++i) { - /** not checking sign change is intentional to avoid float calculation - * errors around 0 */ - if (std::isnan(_data[i]) || std::isnan(_rdata[i]) || - std::fabs(_data[i] - _rdata[i]) > epsilon) - return false; - } - } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { + if (d.getDataType() == Tdatatype::FP32) { + itensor = std::shared_ptr(new FloatTensor(d, buf), + std::default_delete()); + } else if (d.getDataType() == Tdatatype::FP16) { #ifdef ENABLE_FP16 - const _FP16 *_data = getData<_FP16>(); - const _FP16 *_rdata = rhs.getData<_FP16>(); - for (size_t i = 0; i < len; ++i) { - // @todo: need to check if float casting valid - if ((std::isnan((float)_data[i]) && !std::isnan((float)_rdata[i])) || - (!std::isnan((float)_data[i]) && std::isnan((float)_rdata[i])) || - std::fabs((float)(_data[i] - _rdata[i])) > epsilon) - return false; - } + itensor = std::shared_ptr(new HalfTensor(d, buf), + std::default_delete()); #else throw std::invalid_argument("Error: enable-fp16 is not enabled"); #endif - } else if (dim.getDataType() == ml::train::TensorDim::DataType::QINT8) { - const uint8_t *_data = getData(); - const uint8_t *_rdata = rhs.getData(); - for (size_t i = 0; i < len; ++i) { - if (_data[i] != _rdata[i]) - return false; - } - } else if (dim.getDataType() == ml::train::TensorDim::DataType::QINT4) { - const uint8_t *_data = getData(); - const uint8_t *_rdata = rhs.getData(); - uint8_t data, rdata; - for (size_t i = 0; i < len; ++i) { - /** not checking sign change is intentional to avoid float calculation - * errors around 0 */ - data = decode_qint(_data[i / 2], (i % 2 == 0)); - rdata = decode_qint(_rdata[i / 2], (i % 2 == 0)); - - if (data != rdata) - return false; - } + } else { + throw std::invalid_argument( + "Error: Tensor cannot be constructed because the given d_type is not " + "compatible with itensor. 
The supported d_types are: FP32, FP16 " + "(if built with ENABLE_FP16)."); } - - return true; } -void Tensor::setRandNormal(float mean, float std) { - if (this->getDataType() == ml::train::TensorDim::DataType::FP32) { - setDist>( - std::normal_distribution(mean, std)); - } else if (this->getDataType() == ml::train::TensorDim::DataType::FP16) { +Tensor::Tensor(const Tensor &rhs) { + if (rhs.getDataType() == Tdatatype::FP32) { + itensor = std::shared_ptr(new FloatTensor(*rhs.itensor), + std::default_delete()); + } else if (rhs.getDataType() == Tdatatype::FP16) { #ifdef ENABLE_FP16 - setDist<_FP16, std::normal_distribution>( - std::normal_distribution(mean, std)); + itensor = std::shared_ptr(new HalfTensor(*rhs.itensor), + std::default_delete()); #else throw std::invalid_argument("Error: enable-fp16 is not enabled"); #endif - } else if (this->getDataType() == ml::train::TensorDim::DataType::QINT8) { - throw std::invalid_argument("Error: RandNormal is invalid for QINT8"); - } else if (this->getDataType() == ml::train::TensorDim::DataType::QINT4) { - throw std::invalid_argument("Error: RandNormal is invalid for QINT4"); } } -void Tensor::setRandUniform(float min, float max) { - if (this->getDataType() == ml::train::TensorDim::DataType::FP32) { - setDist>( - std::uniform_real_distribution(min, max)); - } else if (this->getDataType() == ml::train::TensorDim::DataType::FP16) { +Tensor &Tensor::operator=(const Tensor &rhs) { + if (rhs.getDataType() == Tdatatype::FP32) { + itensor = std::shared_ptr(new FloatTensor(*rhs.itensor), + std::default_delete()); + } else if (rhs.getDataType() == Tdatatype::FP16) { #ifdef ENABLE_FP16 - setDist<_FP16, std::uniform_real_distribution>( - std::uniform_real_distribution(min, max)); + itensor = std::shared_ptr(new HalfTensor(*rhs.itensor), + std::default_delete()); #else throw std::invalid_argument("Error: enable-fp16 is not enabled"); #endif - } else if (this->getDataType() == ml::train::TensorDim::DataType::QINT8) { - throw std::invalid_argument("Error: RandUniform is invalid for QINT8"); - } else if (this->getDataType() == ml::train::TensorDim::DataType::QINT4) { - throw std::invalid_argument("Error: RandUniform is invalid for QINT4"); } + return *this; } -void Tensor::setRandBernoulli(float probability) { - if (this->getDataType() == ml::train::TensorDim::DataType::FP32) { - setDist( - std::bernoulli_distribution(probability)); - } else if (this->getDataType() == ml::train::TensorDim::DataType::FP16) { +bool Tensor::operator==(const Tensor &rhs) const { + /// compares tensor information + if (*itensor == *rhs.itensor) { + /// compares tensor data + if (getDataType() == Tdatatype::FP32) { + return *std::dynamic_pointer_cast(itensor) == + *std::dynamic_pointer_cast(rhs.itensor); + } else if (getDataType() == Tdatatype::FP16) { #ifdef ENABLE_FP16 - setDist<_FP16, std::bernoulli_distribution>( - std::bernoulli_distribution(probability)); + return *std::dynamic_pointer_cast(itensor) == + *std::dynamic_pointer_cast(rhs.itensor); #else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); + throw std::invalid_argument( + "Error: HalfTensor cannot be created or used when FP16 is not enabled. 
" + "Please check if the tensor data type is set properly."); #endif - } else if (this->getDataType() == ml::train::TensorDim::DataType::QINT8) { - throw std::invalid_argument("Error: setRandBernoulli is invalid for QINT8"); - } else if (this->getDataType() == ml::train::TensorDim::DataType::QINT4) { - throw std::invalid_argument("Error: setRandBernoulli is invalid for QINT4"); + } } + return false; } -void Tensor::initialize() { - if (empty() || !isAllocated()) - return; +void Tensor::allocate() { itensor->allocate(); } - unsigned int fan_in, fan_out; - - /// @fixme: when unit is equal to one, this does not work, we need to rely on - /// effective dimension then actual numbers here. For now, some heuristics - /// added to infer what would be fan_in/fan_out - if (dim.batch() * dim.channel() * dim.height() == 1) { - fan_out = fan_in = dim.width(); - } else if (dim.batch() * dim.channel() == 1) { /// fc layer - 2-D tensor - fan_in = dim.height(); - fan_out = dim.width(); - } else { /// conv2d filters - 4d tensor, @todo extend this to > 4 - auto field_size = dim.height() * dim.width(); - - // this also handles below cases. - // 1. fan_in = fan_out = 1 as well. - // 2. batch == 1, channel == 1 and height == 1, theoretical rank of 1 - fan_in = dim.channel() * field_size; - fan_out = dim.batch() * field_size; - } +void Tensor::deallocate() { itensor->deallocate(); } - switch (initializer) { - case Tensor::Initializer::ZEROS: - setZero(); - break; - case Tensor::Initializer::ONES: - setValue(1.0f); - break; - case Tensor::Initializer::LECUN_NORMAL: - setRandNormal(0.0f, sqrtFloat(1.0f / fan_in)); - break; - case Tensor::Initializer::XAVIER_NORMAL: - setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in + fan_out))); - break; - case Tensor::Initializer::HE_NORMAL: - setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in))); - break; - case Tensor::Initializer::LECUN_UNIFORM: - setRandUniform(-1.0f * sqrtFloat(1.0f / fan_in), sqrtFloat(1.0f / fan_in)); - break; - case Tensor::Initializer::XAVIER_UNIFORM: - setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in + fan_out)), - sqrtFloat(6.0 / (fan_in + fan_out))); - break; - case Tensor::Initializer::HE_UNIFORM: - setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in)), - sqrtFloat(6.0 / (fan_in))); - break; - default: - break; - } +bool Tensor::isAllocated() { return itensor->isAllocated(); } - putData(); +void Tensor::setValue(float value) { itensor->setValue(value); } + +void Tensor::setValue(unsigned int b, unsigned int c, unsigned int h, + unsigned int w, float value) { + itensor->setValue(b, c, h, w, value); } -int Tensor::multiply_i_strided(Tensor const &m, const float beta) { - try { - this->multiply_strided(m, *this, beta); - } catch (std::exception &err) { - ml_loge("%s %s", typeid(err).name(), err.what()); - return ML_ERROR_INVALID_PARAMETER; - } +void Tensor::addValue(unsigned int b, unsigned int c, unsigned int h, + unsigned int w, float value, float beta) noexcept { + itensor->addValue(b, c, h, w, value, beta); +} - return ML_ERROR_NONE; +void Tensor::setZero() { itensor->setZero(); } + +void Tensor::setRandNormal(float mean, float stddev) { + itensor->setRandNormal(mean, stddev); } -Tensor Tensor::multiply_strided(Tensor const &m, const float beta) const { - Tensor t; - return this->multiply_strided(m, t, beta); +void Tensor::setRandUniform(float min, float max) { + itensor->setRandUniform(min, max); } -Tensor &Tensor::multiply_strided(Tensor const &m, Tensor &output, - const float beta) const { - /** TODO: throw than create new dimenions */ - 
CREATE_IF_EMPTY_DIMS(output, dim, nullptr); +void Tensor::setRandBernoulli(float probability) { + itensor->setRandBernoulli(probability); +} - if (size() != m.size() || size() != output.size()) - throw std::invalid_argument( - "Strided multiplication does not support broadcasting"); - - if (getDataType() == Tdatatype::FP32) { - NNTR_THROW_IF(getData() == nullptr, std::invalid_argument) - << getName() << " is not allocated"; - NNTR_THROW_IF(m.getData() == nullptr, std::invalid_argument) - << m.getName() << " is not allocated"; - NNTR_THROW_IF(output.getData() == nullptr, std::invalid_argument) - << output.getName() << " is not allocated"; - } else if (getDataType() == Tdatatype::FP16) { -#ifdef ENABLE_FP16 - NNTR_THROW_IF(getData<_FP16>() == nullptr, std::invalid_argument) - << getName() << " is not allocated"; - NNTR_THROW_IF(m.getData<_FP16>() == nullptr, std::invalid_argument) - << m.getName() << " is not allocated"; - NNTR_THROW_IF(output.getData<_FP16>() == nullptr, std::invalid_argument) - << output.getName() << " is not allocated"; -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } +void Tensor::initialize() { itensor->initialize(); } - // Format NCHW Case - if (this->getFormat() == Tformat::NCHW) { - if (getDataType() == Tdatatype::FP32) { - if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 || - beta != 0.0) { - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int c = 0; c < channel(); ++c) { - for (unsigned int h = 0; h < height(); ++h) { - for (unsigned int w = 0; w < width(); ++w) { - output.addValue(b, c, h, w, - getValue(b, c, h, w) * - m.getValue(b, c, h, w), - beta); - } - } - } - } - } else { - /** @todo optimize this with combining these loops where stride is 1 - */ - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int c = 0; c < channel(); ++c) { - for (unsigned int h = 0; h < height(); ++h) { - float *out_data = output.getAddress(b, c, h, 0); - const float *m_data = m.getAddress(b, c, h, 0); - const float *in_data = getAddress(b, c, h, 0); - std::transform(in_data, in_data + width(), m_data, out_data, - std::multiplies()); - } - } - } - } - } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 || - beta != 0.0) { - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int c = 0; c < channel(); ++c) { - for (unsigned int h = 0; h < height(); ++h) { - for (unsigned int w = 0; w < width(); ++w) { - output.addValue(b, c, h, w, - getValue<_FP16>(b, c, h, w) * - m.getValue<_FP16>(b, c, h, w), - beta); - } - } - } - } - } else { - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int c = 0; c < channel(); ++c) { - for (unsigned int h = 0; h < height(); ++h) { - _FP16 *out_data = output.getAddress<_FP16>(b, c, h, 0); - const _FP16 *m_data = m.getAddress<_FP16>(b, c, h, 0); - const _FP16 *in_data = getAddress<_FP16>(b, c, h, 0); - std::transform(in_data, in_data + width(), m_data, out_data, - std::multiplies<_FP16>()); - } - } - } - } -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - } else { // Format NHWC Case - if (getDataType() == Tdatatype::FP32) { - if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 || - beta != 0.0) { - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int h = 0; h < height(); ++h) { - for (unsigned int w = 0; w < width(); ++w) { - for (unsigned int c = 0; c < channel(); ++c) { - 
output.addValue(b, c, h, w, - getValue(b, c, h, w) * - m.getValue(b, c, h, w), - beta); - } - } - } - } - } else { - /** @todo optimize this with combining these loops where - * stride is 1 */ - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int h = 0; h < height(); ++h) { - for (unsigned int w = 0; w < width(); ++w) { - float *out_data = output.getAddress(b, 0, h, w); - const float *m_data = m.getAddress(b, 0, h, w); - const float *in_data = getAddress(b, 0, h, w); - std::transform(in_data, in_data + channel(), m_data, out_data, - std::multiplies()); - } - } - } - } - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 || - beta != 0.0) { - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int h = 0; h < height(); ++h) { - for (unsigned int w = 0; w < width(); ++w) { - for (unsigned int c = 0; c < channel(); ++c) { - output.addValue(b, c, h, w, - getValue<_FP16>(b, c, h, w) * - m.getValue<_FP16>(b, c, h, w), - beta); - } - } - } - } - } else { - /** @todo optimize this with combining these loops where - * stride is 1 */ - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int h = 0; h < height(); ++h) { - for (unsigned int w = 0; w < width(); ++w) { - _FP16 *out_data = output.getAddress<_FP16>(b, 0, h, w); - const _FP16 *m_data = m.getAddress<_FP16>(b, 0, h, w); - const _FP16 *in_data = getAddress<_FP16>(b, 0, h, w); - std::transform(in_data, in_data + channel(), m_data, out_data, - std::multiplies<_FP16>()); - } - } - } - } -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - } +void Tensor::initialize(Initializer init) { itensor->initialize(init); } - return output; +Tensor Tensor::apply(std::function f) const { return f(*this); } + +Tensor &Tensor::apply(std::function f, + Tensor &output) const { + return f(*this, output); } -int Tensor::add_i_strided(Tensor const &m, const float beta) { +int Tensor::multiply_i_strided(Tensor const &m, const float beta) { try { - this->add_strided(m, *this, beta); + this->multiply_strided(m, *this, beta); } catch (std::exception &err) { ml_loge("%s %s", typeid(err).name(), err.what()); return ML_ERROR_INVALID_PARAMETER; @@ -574,214 +189,31 @@ int Tensor::add_i_strided(Tensor const &m, const float beta) { return ML_ERROR_NONE; } -Tensor Tensor::add_strided(Tensor const &m, const float beta) const { - Tensor t; - return this->add_strided(m, t, beta); +Tensor Tensor::multiply_strided(Tensor const &m, const float beta) const { + Tensor t("", getFormat(), getDataType()); + return this->multiply_strided(m, t, beta); } -Tensor &Tensor::add_strided(Tensor const &m, Tensor &output, - const float beta) const { - /** TODO: throw than create new dimenions */ - CREATE_IF_EMPTY_DIMS(output, dim, nullptr); - - if (size() != m.size() || size() != output.size()) - throw std::invalid_argument( - "Strided addition does not support broadcasting"); - - if (getDataType() == Tdatatype::FP32) { - NNTR_THROW_IF(getData() == nullptr, std::invalid_argument) - << getName() << " is not allocated"; - NNTR_THROW_IF(m.getData() == nullptr, std::invalid_argument) - << m.getName() << " is not allocated"; - NNTR_THROW_IF(output.getData() == nullptr, std::invalid_argument) - << output.getName() << " is not allocated"; - } else if (getDataType() == Tdatatype::FP16) { -#ifdef ENABLE_FP16 - NNTR_THROW_IF(getData<_FP16>() == nullptr, std::invalid_argument) - << getName() << " is not allocated"; - 
NNTR_THROW_IF(m.getData<_FP16>() == nullptr, std::invalid_argument) - << m.getName() << " is not allocated"; - NNTR_THROW_IF(output.getData<_FP16>() == nullptr, std::invalid_argument) - << output.getName() << " is not allocated"; -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - - // Format NCHW Case - if (this->getFormat() == Tformat::NCHW) { - if (getDataType() == Tdatatype::FP32) { - if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 || - beta != 0.0) { - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int c = 0; c < channel(); ++c) { - for (unsigned int h = 0; h < height(); ++h) { - for (unsigned int w = 0; w < width(); ++w) { - output.setValue(b, c, h, w, - getValue(b, c, h, w) + - m.getValue(b, c, h, w) * beta); - } - } - } - } - } else { - /** @todo optimize this with combining these loops where stride is 1 */ - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int c = 0; c < channel(); ++c) { - for (unsigned int h = 0; h < height(); ++h) { - float *out_data = output.getAddress(b, c, h, 0); - const float *m_data = m.getAddress(b, c, h, 0); - const float *in_data = getAddress(b, c, h, 0); - std::transform(in_data, in_data + width(), m_data, out_data, - std::plus()); - } - } - } - } - } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 || - beta != 0.0) { - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int c = 0; c < channel(); ++c) { - for (unsigned int h = 0; h < height(); ++h) { - for (unsigned int w = 0; w < width(); ++w) { - output.setValue(b, c, h, w, - getValue<_FP16>(b, c, h, w) + - m.getValue<_FP16>(b, c, h, w) * beta); - } - } - } - } - } else { - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int c = 0; c < channel(); ++c) { - for (unsigned int h = 0; h < height(); ++h) { - _FP16 *out_data = output.getAddress<_FP16>(b, c, h, 0); - const _FP16 *m_data = m.getAddress<_FP16>(b, c, h, 0); - const _FP16 *in_data = getAddress<_FP16>(b, c, h, 0); - std::transform(in_data, in_data + width(), m_data, out_data, - std::plus<_FP16>()); - } - } - } - } -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - } else { // Format NHWC Case - if (getDataType() == Tdatatype::FP32) { - if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 || - beta != 0.0) { - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int h = 0; h < height(); ++h) { - for (unsigned int w = 0; w < width(); ++w) { - for (unsigned int c = 0; c < channel(); ++c) { - output.setValue(b, c, h, w, - getValue(b, c, h, w) + - m.getValue(b, c, h, w) * beta); - } - } - } - } - } else { - /** @todo optimize this with combining these loops where - * stride is 1 */ - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int h = 0; h < height(); ++h) { - for (unsigned int w = 0; w < width(); ++w) { - float *out_data = output.getAddress(b, 0, h, w); - const float *m_data = m.getAddress(b, 0, h, w); - const float *in_data = getAddress(b, 0, h, w); - std::transform(in_data, in_data + channel(), m_data, out_data, - std::plus()); - } - } - } - } - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - if (strides[3] != 1 || m.strides[3] != 1 || output.strides[3] != 1 || - beta != 0.0) { - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int h = 0; h < height(); ++h) { - for (unsigned int w = 0; w < width(); ++w) { - 
for (unsigned int c = 0; c < channel(); ++c) { - output.setValue(b, c, h, w, - getValue<_FP16>(b, c, h, w) + - m.getValue<_FP16>(b, c, h, w) * beta); - } - } - } - } - } else { - /** @todo optimize this with combining these loops where - * stride is 1 */ - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int h = 0; h < height(); ++h) { - for (unsigned int w = 0; w < width(); ++w) { - _FP16 *out_data = output.getAddress<_FP16>(b, 0, h, w); - const _FP16 *m_data = m.getAddress<_FP16>(b, 0, h, w); - const _FP16 *in_data = getAddress<_FP16>(b, 0, h, w); - std::transform(in_data, in_data + channel(), m_data, out_data, - std::plus<_FP16>()); - } - } - } - } -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - } +Tensor &Tensor::multiply_strided(Tensor const &m, Tensor &output, + const float beta) const { + itensor->multiply_strided(m, output, beta); return output; } int Tensor::multiply_i(float const &value) { - NNTR_THROW_IF(!contiguous, std::invalid_argument) + NNTR_THROW_IF(!getContiguous(), std::invalid_argument) << getName() << " is not contiguous, cannot multiply"; - /// @note this is not depending on multiply_i as there is an optimized - /// version for multiply_i - if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - float *data = getData(); - unsigned int len = size(); - - sscal(len, value, data, 1); - } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - _FP16 *data = getData<_FP16>(); - unsigned int len = size(); - sscal(len, value, data, 1); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - return ML_ERROR_NONE; + return itensor->multiply_i(value); } Tensor Tensor::multiply(float const &value) const { - Tensor t; + Tensor t("", getFormat(), getDataType()); return multiply(value, t); } Tensor &Tensor::multiply(float const &value, Tensor &out) const { - /// @todo add unittest - if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - auto f = std::bind(std::multiplies(), std::placeholders::_1, value); - apply(f, out); - return out; - } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - auto f = std::bind(std::multiplies<_FP16>(), std::placeholders::_1, - static_cast<_FP16>(value)); - apply<_FP16>(f, out); - return out; -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } + itensor->multiply(value, out); return out; } @@ -798,48 +230,26 @@ int Tensor::multiply_i(Tensor const &m, const float beta) { Tensor Tensor::multiply(Tensor const &m, const float beta) const { Tensor t("", this->getFormat()); - return this->multiply(m, t, beta); + return multiply(m, t, beta); } Tensor &Tensor::multiply(Tensor const &m, Tensor &output, const float beta) const { - /** - * @note this does not work correctly with differently strided inputs. - * Use multiply_strided alternatively - */ NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument) << "Tensor Format of " << getName() << ":" << ((bool)(this->getFormat()) ? "NHWC" : "NCHW") << " is not match. (" << ((bool)(m.getFormat()) ? 
"NHWC" : "NCHW") << ")"; - NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous, + NNTR_THROW_IF(!getContiguous() || !m.getContiguous() || + !output.getContiguous(), std::invalid_argument) << getName() << " is not contiguous, cannot multiply"; - NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous, + NNTR_THROW_IF(!getContiguous() || !m.getContiguous() || + !output.getContiguous(), std::invalid_argument) << getName() << " is not contiguous, cannot multiply"; - if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf, - float *out_buf) { - ele_mul(e.buffer_size, buf, m_buf, out_buf, 1, beta, e.strides[3], - strides[3]); - }; - apply_broadcast(m, f, output); - - } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf, - _FP16 *out_buf) { - ele_mul(e.buffer_size, buf, m_buf, out_buf, 1, beta, e.strides[3], - strides[3]); - }; - apply_broadcast(m, f, output); - -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } + itensor->multiply(m, output, beta); return output; } @@ -852,33 +262,19 @@ int Tensor::divide_i(float const &value) { } Tensor Tensor::divide(float const &value) const { - Tensor t; - return divide(value, t); + Tensor output("", getFormat(), getDataType()); + return divide(value, output); } -Tensor &Tensor::divide(float const &value, Tensor &out) const { - /// @todo add unittest, _FP16 ZeroDivisionError +Tensor &Tensor::divide(float const &value, Tensor &output) const { + /// @todo add unittest, ZeroDivisionError if (value == 0.0f) { std::stringstream ss; ss << "[Tensor] divide by value failed, value: " << value; throw std::invalid_argument(ss.str().c_str()); } - - if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - auto f = std::bind(std::divides(), std::placeholders::_1, value); - apply(f, out); - return out; - } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - auto f = std::bind(std::divides<_FP16>(), std::placeholders::_1, - static_cast<_FP16>(value)); - apply<_FP16>(f, out); - return out; -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - return out; + itensor->divide(value, output); + return output; } int Tensor::divide_i(Tensor const &m) { @@ -893,34 +289,45 @@ int Tensor::divide_i(Tensor const &m) { } Tensor Tensor::divide(Tensor const &m) const { - Tensor t; - return this->divide(m, t); + Tensor output("", getFormat(), getDataType()); + return this->divide(m, output); } Tensor &Tensor::divide(Tensor const &m, Tensor &output) const { - - NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous, + NNTR_THROW_IF(!getContiguous() || !m.getContiguous() || + !output.getContiguous(), std::invalid_argument) << getName() << " is not contiguous, cannot divide"; - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf, - float *out_buf) { - ele_div(e.buffer_size, buf, m_buf, out_buf, 1, 0, e.strides[3], - strides[3]); - }; - apply_broadcast(m, f, output); - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf, - _FP16 *out_buf) { - ele_div(e.buffer_size, buf, m_buf, out_buf, 1, 0, e.strides[3], - strides[3]); - }; - apply_broadcast(m, f, 
output); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif + itensor->divide(m, output); + return output; +} + +int Tensor::add_i_strided(Tensor const &input, const float beta) { + try { + this->add_strided(input, *this, beta); + } catch (std::exception &err) { + ml_loge("%s %s", typeid(err).name(), err.what()); + return ML_ERROR_INVALID_PARAMETER; } + + return ML_ERROR_NONE; +} + +Tensor Tensor::add_strided(Tensor const &input, const float beta) const { + Tensor output("", getFormat(), getDataType()); + return this->add_strided(input, output, beta); +} + +Tensor &Tensor::add_strided(Tensor const &input, Tensor &output, + const float beta) const { + CREATE_IF_EMPTY_DIMS(output, getDim(), nullptr); + + if (size() != input.size() || size() != output.size()) + throw std::invalid_argument( + "Strided addition does not support broadcasting"); + + itensor->add_strided(input, output, beta); + return output; } @@ -930,123 +337,37 @@ int Tensor::add_i(float const &value) { } Tensor Tensor::add(float const &value) const { - Tensor t; + Tensor t("", getFormat(), getDataType()); return add(value, t); } -Tensor &Tensor::add(float const &value, Tensor &out) const { - /// @todo add unittest - if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - auto f = std::bind(std::plus(), std::placeholders::_1, value); - apply(f, out); - return out; - } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - auto f = std::bind(std::plus<_FP16>(), std::placeholders::_1, - static_cast<_FP16>(value)); - apply<_FP16>(f, out); - return out; -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - return out; +Tensor &Tensor::add(float const &value, Tensor &output) const { + itensor->add(value, output); + return output; } int Tensor::add_i(Tensor const &m, float const alpha) { - /// @todo: add axis rather doing add over the last two dimensions always - /// operator i has optimized version - if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf, - float *out_buf) { - saxpy(e.buffer_size, alpha, m_buf, e.strides[3], out_buf, strides[3]); - }; - - /// @todo: enable this after add_strided supports broadcast - // NNTR_THROW_IF(!contiguous || !m.contiguous, std::invalid_argument) - // << getName() << " is not contiguous, cannot add"; - - try { - apply_broadcast(m, f, *this); - } catch (std::exception &err) { - ml_loge("%s %s", typeid(err).name(), err.what()); - return ML_ERROR_INVALID_PARAMETER; - } - - } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf, - _FP16 *out_buf) { - saxpy(e.buffer_size, alpha, m_buf, e.strides[3], out_buf, strides[3]); - /// @todo: saxpy is not valid for _FP16 - }; - - /// @todo: enable this after add_strided supports broadcast - // NNTR_THROW_IF(!contiguous || !m.contiguous, std::invalid_argument) - // << getName() << " is not contiguous, cannot add"; - - try { - apply_broadcast(m, f, *this); - } catch (std::exception &err) { - ml_loge("%s %s", typeid(err).name(), err.what()); - return ML_ERROR_INVALID_PARAMETER; - } - -#else - ml_loge("%s", "Error: enable-fp16 is not enabled"); - return ML_ERROR_INVALID_PARAMETER; -#endif - } - return ML_ERROR_NONE; + return itensor->add_i(m, *this, alpha); } int Tensor::add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m, unsigned int incX, 
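Unlike add(), the strided variants require the operands to hold the same number of elements (no broadcasting), and add_i_strided() converts any failure into an error code rather than letting the exception escape. A hedged sketch of that contract:

// hedged illustration, not part of the patch
nntrainer::Tensor x(1, 1, 2, 2), y(1, 1, 2, 2);
x.setValue(1.0f);
y.setValue(2.0f);
nntrainer::Tensor z = x.add_strided(y, 1.0f); // z[i] = x[i] + 1.0 * y[i] = 3
int status = x.add_i_strided(y, 0.5f);        // x[i] += 0.5 * y[i] -> 2
// status is ML_ERROR_NONE on success, ML_ERROR_INVALID_PARAMETER on mismatch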
unsigned int incY, const Tensor alphas, unsigned int alpha_idx) { - if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - saxpy(len, alphas.getValue(alpha_idx), m.getData(), incX, - getAddress(addr_idx), incY); - } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - saxpy(len, alphas.getValue<_FP16>(alpha_idx), m.getData<_FP16>(), incX, - getAddress<_FP16>(addr_idx), incY); -#else - ml_loge("%s", "Error: enable-fp16 is not enabled"); - return ML_ERROR_INVALID_PARAMETER; -#endif - } - return ML_ERROR_NONE; + return itensor->add_i_partial(len, addr_idx, m, incX, incY, alphas, + alpha_idx); } Tensor Tensor::add(Tensor const &m, float const alpha) const { - Tensor t; + Tensor t("", getFormat(), getDataType()); return this->add(m, t, alpha); } Tensor &Tensor::add(Tensor const &m, Tensor &output, float const alpha) const { - NNTR_THROW_IF(!contiguous || !m.contiguous || !output.contiguous, + NNTR_THROW_IF(!itensor->getContiguous() || !m.getContiguous() || + !output.getContiguous(), std::invalid_argument) << getName() << " is not contiguous, cannot add"; - - if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf, - float *out_buf) { - ele_add(e.buffer_size, buf, m_buf, out_buf, alpha, 0, e.strides[3], - strides[3]); - }; - apply_broadcast(m, f, output); - } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf, - _FP16 *out_buf) { - ele_add(e.buffer_size, buf, m_buf, out_buf, alpha, 0, e.strides[3], - strides[3]); - }; - apply_broadcast(m, f, output); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } + itensor->add(m, output, alpha); return output; } @@ -1056,27 +377,13 @@ int Tensor::subtract_i(float const &value) { } Tensor Tensor::subtract(float const &value) const { - Tensor t; - return subtract(value, t); + Tensor output("", getFormat(), getDataType()); + return subtract(value, output); } -Tensor &Tensor::subtract(float const &value, Tensor &out) const { - /// @todo add unittest - if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - auto f = std::bind(std::minus(), std::placeholders::_1, value); - apply(f, out); - return out; - } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - auto f = std::bind(std::minus<_FP16>(), std::placeholders::_1, - static_cast<_FP16>(value)); - apply<_FP16>(f, out); - return out; -#else - ml_loge("%s", "Error: enable-fp16 is not enabled"); -#endif - } - return out; // shouldn't reach +Tensor &Tensor::subtract(float const &value, Tensor &output) const { + itensor->subtract(value, output); + return output; } int Tensor::subtract_i(Tensor const &m) { return add_i(m, -1); } @@ -1086,2160 +393,85 @@ Tensor Tensor::subtract(Tensor const &m) const { return this->subtract(m, t); } -Tensor &Tensor::subtract(Tensor const &m, Tensor &out) const { - NNTR_THROW_IF(!contiguous || !m.contiguous || !out.contiguous, - std::invalid_argument) - << getName() << " is not contiguous, cannot add"; +Tensor &Tensor::subtract(Tensor const &m, Tensor &output) const { + return add(m, output, -1); +} - if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf, - float *out_buf) { - ele_sub(e.buffer_size, buf, m_buf, out_buf, 1, 0, e.strides[3], - strides[3]); - }; - 
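add() and add_i() follow the axpy convention: the second operand is scaled by alpha before it is accumulated, so add(m, out, alpha) computes out = this + alpha * m and add_i(m, alpha) updates the tensor in place. A hedged sketch:

// hedged illustration, not part of the patch
nntrainer::Tensor p(1, 1, 1, 4), q(1, 1, 1, 4);
p.setValue(1.0f);
q.setValue(10.0f);
nntrainer::Tensor r = p.add(q, 0.1f); // r[i] = 1 + 0.1 * 10 = 2
p.add_i(q, -0.1f);                    // p[i] = 1 - 0.1 * 10 = 0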
apply_broadcast(m, f, out); - } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - auto f = [&](const BroadcastInfo &e, const _FP16 *buf, const _FP16 *m_buf, - _FP16 *out_buf) { - ele_sub(e.buffer_size, buf, m_buf, out_buf, 1, 0, e.strides[3], - strides[3]); - }; - apply_broadcast(m, f, out); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - return out; +/** + * This is to sum the Tensor data according to the dim.batch(). + * Therefore the result has M(dim.batch(), 1, 1, 1) dimension. + */ +Tensor Tensor::sum_by_batch() const { + NNTR_THROW_IF(!getContiguous(), std::invalid_argument) + << getName() << " is not contiguous, cannot sum"; + + Tensor output(batch(), 1, 1, 1, this->getFormat(), getDataType()); + itensor->sum_by_batch(output); + return output; } -int Tensor::pow_i(float exponent) { - pow(exponent, *this); - return ML_ERROR_NONE; -} - -Tensor Tensor::pow(float exponent) const { - Tensor t; - return pow(exponent, t); -} - -Tensor &Tensor::pow(float exponent, Tensor &out) const { - if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - auto f = [exponent](float in) { return powf(in, exponent); }; - apply(f, out); - return out; - } - if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - auto f = [exponent](_FP16 in) { - return static_cast<_FP16>(powf(in, exponent)); - }; - apply<_FP16>(f, out); - return out; -#else - ml_loge("%s", "Error: enable-fp16 is not enabled"); -#endif - } - return out; -} - -Tensor Tensor::getBatchSlice(size_t offset, unsigned int size) const { - TensorDim dim_ = dim; - dim_.batch(size); - - return getSharedDataTensor(dim_, offset * this->dim.getFeatureLen()); -} - -void Tensor::createSharedDataTensor(const Tensor &src, Tensor &dest, - size_t offset) { - /** - * - If src already has data allocaed, then directly make dest tensor based on - * the src tensor. - * - If src.data does not exist (meaning tensor does not memory allocated), - * and src.src_tensor does not exist (meaning the src tensor does not depened - * on another tensor), then create a SrcSharedTensor around the src. - * - If src.src_tensor exists, then use the src.src_tensor to create the - * required SrcSharedTensor to avoid recursive dependency. - * - * @note src.data and src.src_tensor CAN co-exist. src.src_tensor is stored - * if the batch size of src is updated and needs reallocation. 
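getBatchSlice() and getSharedDataTensor() return views rather than copies: the returned tensor shares the parent's buffer (or the parent's source tensor, to avoid recursive dependencies), so writes through the view are visible in the parent. A hedged sketch of the batch-slice behaviour:

// hedged illustration, not part of the patch
nntrainer::Tensor full(4, 1, 2, 2);                 // four batches
full.setValue(0.0f);
nntrainer::Tensor slice = full.getBatchSlice(1, 2); // view of batches 1 and 2
slice.setValue(7.0f);                               // also updates 'full'
// full.getValue(0, 0, 0, 0) == 0.0f, full.getValue(1, 0, 0, 0) == 7.0f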
- */ - dest.data = nullptr; - if (src.data) { - dest.src_tensor = std::make_shared(&src, offset); - dest.allocate(); - } else if (!src.src_tensor) - dest.src_tensor = std::make_shared(&src, offset); - else - dest.src_tensor = std::make_shared( - src.src_tensor->tensor(), offset + src.src_tensor->offset()); -} - -Tensor Tensor::getSharedDataTensor(const TensorDim dim_, size_t offset, - bool reset_stride, - const std::string &name_) const { - Tensor ret = *this; - if (dim_.getFormat() != ret.dim.getFormat()) - throw std::invalid_argument("Tensor format does not match"); - - ret.dim = dim_; - if (!name_.empty()) - ret.name = name_; - - if (dim_.getDataLen() + offset > dim.getDataLen()) - throw std::invalid_argument( - "Creating shared tensor of size bigger than tensor memory."); - - if (reset_stride) - ret.strides = ret.dim.computeStrides(); - - TensorDim new_match_dim = dim_; - new_match_dim.batch(dim.batch()); - if (new_match_dim != dim && !reset_stride) - ret.contiguous = false; - - /** - * In this case, its the caller's responsibility to ensure that allocate() is - * called for the output tensor before operating on the output tensor. - */ - createSharedDataTensor(*this, ret, offset); - - return ret; -} - -std::vector Tensor::split(unsigned num_size, int axis) { - NNTR_THROW_IF(num_size == 0, std::invalid_argument) - << "num size cannot be zero"; - - if (axis == -1) { - axis = 3; - } - - NNTR_THROW_IF(!(0 <= axis && axis < 4), std::invalid_argument) - << "cannot split axis of axis: " << axis; - - NNTR_THROW_IF(dim.getTensorDim(axis) % num_size != 0, std::invalid_argument) - << "axis is not divisible by num_size, axis: " << axis - << " num size: " << num_size; - - std::vector sizes; - sizes.resize(num_size); - - unsigned int sz = dim.getTensorDim(axis) / num_size; - std::fill(sizes.begin(), sizes.end(), sz); - - return split(sizes, axis); -} - -std::vector Tensor::split(std::vector sizes, int axis) { - size_t num_size = sizes.size(); - - NNTR_THROW_IF(num_size == 0, std::invalid_argument) - << "num size cannot be zero"; - - if (axis == -1) { - axis = 3; - } - - NNTR_THROW_IF(!(0 <= axis && axis < 4), std::invalid_argument) - << "cannot split axis of axis: " << axis; - - NNTR_THROW_IF( - std::any_of(sizes.begin(), sizes.end(), [](size_t sz) { return !sz; }), - std::invalid_argument) - << "among given sizes at least one of size is 0"; - - size_t total_size = std::accumulate(sizes.begin(), sizes.end(), 0); - NNTR_THROW_IF(dim.getTensorDim(axis) != total_size, std::invalid_argument) - << "given sum of sizes did not match with origin tensor dim, tensor dim: " - << dim.getTensorDim(axis) << " total size: " << total_size; - - std::vector ret_dims; - ret_dims.reserve(num_size); - for (unsigned int i = 0; i < num_size; ++i) { - ret_dims[i] = dim; - ret_dims[i].setTensorDim(axis, sizes[i]); - } - - bool is_format_nchw = (dim.getFormat() == Tformat::NCHW) ? true : false; - std::vector ret; - - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - auto iter_value = [this, is_format_nchw]( - std::array &loc, - const std::array &end_loc, - const std::array &reset_dim_arr) -> float & { - auto &value = (is_format_nchw) ? 
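split() is the counterpart of cat(): the count form requires the chosen axis to be divisible by the number of pieces, while the size-list form only requires the sizes to add up to the axis length, and axis == -1 is shorthand for the last axis. A hedged sketch (the std::vector<size_t> element type for the size list is an assumption about the header):

// hedged illustration, not part of the patch
nntrainer::Tensor big(1, 1, 2, 6);
std::vector<nntrainer::Tensor> halves = big.split(2, 3);    // two 1x1x2x3 pieces
std::vector<size_t> sizes = {1, 2, 3};
std::vector<nntrainer::Tensor> parts = big.split(sizes, 3); // widths 1, 2 and 3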
getValue(loc[0], loc[1], loc[2], loc[3]) - : getValue(loc[0], loc[3], loc[1], loc[2]); - for (int i = 3; i >= 0; --i) { - loc[i]++; - if (loc[i] == end_loc[i]) { - loc[i] -= reset_dim_arr[i]; - continue; - } - break; - } - return value; - }; - - ret.reserve(num_size); - - unsigned int accumulated_size = 0; - for (unsigned int i = 0; i < num_size; ++i) { - std::array loc = {0, 0, 0, 0}; - - if (is_format_nchw) { - loc[axis] += accumulated_size; - } else { - if (axis == 0) { - loc[0] += accumulated_size; - } else if (axis == 1) { - loc[3] += accumulated_size; - } else if (axis == 2 || axis == 3) { - loc[axis - 1] += accumulated_size; - } - } - - ret.emplace_back(ret_dims[i]); - auto &ret_t = ret.back(); - - std::array end_loc; - - if (is_format_nchw) { - end_loc = {ret_dims[i].batch(), ret_dims[i].channel(), - ret_dims[i].height(), ret_dims[i].width()}; - } else { - end_loc = {ret_dims[i].batch(), ret_dims[i].height(), - ret_dims[i].width(), ret_dims[i].channel()}; - } - - accumulated_size += sizes[i]; - - if (is_format_nchw) { - end_loc[axis] = accumulated_size; - } else { - if (axis == 0) { - end_loc[0] = accumulated_size; - } else if (axis == 1) { - end_loc[3] = accumulated_size; - } else if (axis == 2 || axis == 3) { - end_loc[axis - 1] = accumulated_size; - } - } - - std::array reset_dim_arr; - if (is_format_nchw) { - reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].channel(), - ret_dims[i].height(), ret_dims[i].width()}; - } else { - reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].height(), - ret_dims[i].width(), ret_dims[i].channel()}; - } - - ret_t.apply_i( - [&iter_value, &loc, &end_loc, &reset_dim_arr](float _) { - return iter_value(loc, end_loc, reset_dim_arr); - }); - } - } - if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - auto iter_value = [this, is_format_nchw]( - std::array &loc, - const std::array &end_loc, - const std::array &reset_dim_arr) -> _FP16 & { - auto &value = (is_format_nchw) - ? 
getValue<_FP16>(loc[0], loc[1], loc[2], loc[3]) - : getValue<_FP16>(loc[0], loc[3], loc[1], loc[2]); - for (int i = 3; i >= 0; --i) { - loc[i]++; - if (loc[i] == end_loc[i]) { - loc[i] -= reset_dim_arr[i]; - continue; - } - break; - } - return value; - }; - - ret.reserve(num_size); - - unsigned int accumulated_size = 0; - for (unsigned int i = 0; i < num_size; ++i) { - std::array loc = {0, 0, 0, 0}; - - if (is_format_nchw) { - loc[axis] += accumulated_size; - } else { - if (axis == 0) { - loc[0] += accumulated_size; - } else if (axis == 1) { - loc[3] += accumulated_size; - } else if (axis == 2 || axis == 3) { - loc[axis - 1] += accumulated_size; - } - } - - ret.emplace_back(ret_dims[i]); - auto &ret_t = ret.back(); - - std::array end_loc; - - if (is_format_nchw) { - end_loc = {ret_dims[i].batch(), ret_dims[i].channel(), - ret_dims[i].height(), ret_dims[i].width()}; - } else { - end_loc = {ret_dims[i].batch(), ret_dims[i].height(), - ret_dims[i].width(), ret_dims[i].channel()}; - } - - accumulated_size += sizes[i]; - - if (is_format_nchw) { - end_loc[axis] = accumulated_size; - } else { - if (axis == 0) { - end_loc[0] = accumulated_size; - } else if (axis == 1) { - end_loc[3] = accumulated_size; - } else if (axis == 2 || axis == 3) { - end_loc[axis - 1] = accumulated_size; - } - } - - std::array reset_dim_arr; - if (is_format_nchw) { - reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].channel(), - ret_dims[i].height(), ret_dims[i].width()}; - } else { - reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].height(), - ret_dims[i].width(), ret_dims[i].channel()}; - } - - ret_t.apply_i<_FP16>( - [&iter_value, &loc, &end_loc, &reset_dim_arr](_FP16 _) { - return iter_value(loc, end_loc, reset_dim_arr); - }); - } - -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - - return ret; -} - -Tensor Tensor::cat(const std::vector &tensors, int axis) { - - if (axis == -1) { - axis = 3; - } - - NNTR_THROW_IF(!(0 <= axis && axis < 4), std::invalid_argument) - << "cannot split axis of axis: " << axis; - - NNTR_THROW_IF(tensors.empty(), std::invalid_argument) - << "given tensor vector is empty"; - - Tensor ret; - auto ref_dim = tensors.front().getDim(); - bool is_format_nchw = (ref_dim.getFormat() == Tformat::NCHW); - ref_dim.setTensorDim(axis, 1); - NNTR_THROW_IF(!std::all_of(tensors.begin(), tensors.end(), - [&ref_dim, axis](const Tensor &t) { - auto cur_dim = t.getDim(); - cur_dim.setTensorDim(axis, 1); - return ref_dim == cur_dim; - }), - std::invalid_argument) - << " all tensor must have the same dimension except for the axis, ref_dim: " - << ref_dim << " axis : " << axis; - - auto axis_dim = std::accumulate(tensors.begin(), tensors.end(), 0u, - [axis](unsigned cur, const Tensor &t) { - return cur += t.getDim().getTensorDim(axis); - }); - if (ref_dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - auto iter_value = - [is_format_nchw](std::array &loc, - const std::array &start_loc, Tensor &t, - const std::array &ref_dim_arr) -> float & { - auto &value = is_format_nchw - ? 
t.getValue(loc[0], loc[1], loc[2], loc[3]) - : t.getValue(loc[0], loc[3], loc[1], loc[2]); - - for (int i = 3; i >= 0; --i) { - loc[i]++; - if (loc[i] - start_loc[i] == ref_dim_arr[i]) { - loc[i] = start_loc[i]; - continue; - } - break; - } - return value; - }; - - auto ret_dim = ref_dim; - ret_dim.setTensorDim(axis, axis_dim); - - ret = Tensor(ret_dim); - - std::array loc = {0, 0, 0, 0}; - for (auto &t : tensors) { - std::array start_loc = loc; - std::array tensor_dim_arr; - if (is_format_nchw) { - tensor_dim_arr[0] = t.getDim().getTensorDim(0); - tensor_dim_arr[1] = t.getDim().getTensorDim(1); - tensor_dim_arr[2] = t.getDim().getTensorDim(2); - tensor_dim_arr[3] = t.getDim().getTensorDim(3); - } else { - tensor_dim_arr[0] = t.getDim().getTensorDim(0); - tensor_dim_arr[1] = t.getDim().getTensorDim(2); - tensor_dim_arr[2] = t.getDim().getTensorDim(3); - tensor_dim_arr[3] = t.getDim().getTensorDim(1); - } - - for (size_t i = 0u, sz = t.size(); i < sz; ++i) { - iter_value(loc, start_loc, ret, tensor_dim_arr) = t.getValue(i); - } - - if (is_format_nchw) { - loc[axis] += t.getDim().getTensorDim(axis); - } else { - if (axis == 0) { - loc[0] += t.getDim().getTensorDim(axis); - } else if (axis == 1) { - loc[3] += t.getDim().getTensorDim(axis); - } else if (axis == 2 || axis == 3) { - loc[axis - 1] += t.getDim().getTensorDim(axis); - } - } - } - - // return ret; - } else if (ref_dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - auto iter_value = - [is_format_nchw](std::array &loc, - const std::array &start_loc, Tensor &t, - const std::array &ref_dim_arr) -> _FP16 & { - auto &value = is_format_nchw - ? t.getValue<_FP16>(loc[0], loc[1], loc[2], loc[3]) - : t.getValue<_FP16>(loc[0], loc[3], loc[1], loc[2]); - - for (int i = 3; i >= 0; --i) { - loc[i]++; - if (loc[i] - start_loc[i] == ref_dim_arr[i]) { - loc[i] = start_loc[i]; - continue; - } - break; - } - return value; - }; - - auto ret_dim = ref_dim; - ret_dim.setTensorDim(axis, axis_dim); - - ret = Tensor(ret_dim); - - std::array loc = {0, 0, 0, 0}; - for (auto &t : tensors) { - std::array start_loc = loc; - std::array tensor_dim_arr; - if (is_format_nchw) { - tensor_dim_arr[0] = t.getDim().getTensorDim(0); - tensor_dim_arr[1] = t.getDim().getTensorDim(1); - tensor_dim_arr[2] = t.getDim().getTensorDim(2); - tensor_dim_arr[3] = t.getDim().getTensorDim(3); - } else { - tensor_dim_arr[0] = t.getDim().getTensorDim(0); - tensor_dim_arr[1] = t.getDim().getTensorDim(2); - tensor_dim_arr[2] = t.getDim().getTensorDim(3); - tensor_dim_arr[3] = t.getDim().getTensorDim(1); - } - - for (size_t i = 0u, sz = t.size(); i < sz; ++i) { - iter_value(loc, start_loc, ret, tensor_dim_arr) = t.getValue<_FP16>(i); - } - - if (is_format_nchw) { - loc[axis] += t.getDim().getTensorDim(axis); - } else { - if (axis == 0) { - loc[0] += t.getDim().getTensorDim(axis); - } else if (axis == 1) { - loc[3] += t.getDim().getTensorDim(axis); - } else if (axis == 2 || axis == 3) { - loc[axis - 1] += t.getDim().getTensorDim(axis); - } - } - } - -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - return ret; -} - -void Tensor::makeSharedDataTensor(const Tensor &src, size_t offset) { - if (strides != src.strides) - throw std::invalid_argument( - "Creating shared tensor of different stride than source tensor."); - - if (getDim().getDataLen() + offset > src.getDim().getDataLen()) - throw std::invalid_argument( - "Creating shared tensor of different size or stride than source tensor."); - - /** - * In this case, its the 
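cat() concatenates a list of tensors along one axis; every other axis must match, which the std::all_of check above enforces by zeroing the concatenation axis before comparing dimensions. A hedged sketch (cat is assumed to be callable as a static member):

// hedged illustration, not part of the patch
nntrainer::Tensor a(1, 1, 2, 3), b(1, 1, 2, 5);
a.setValue(1.0f);
b.setValue(2.0f);
nntrainer::Tensor joined = nntrainer::Tensor::cat({a, b}, 3); // shape 1x1x2x8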
caller's responsibility to ensure that allocate() is - * called for the output tensor before operating on the output tensor. - */ - createSharedDataTensor(src, *this, offset); -} - -void Tensor::apply_broadcast( - Tensor const &m, - std::function - v_func, - Tensor &output) const { - CREATE_IF_EMPTY_DIMS(output, dim); - - NNTR_THROW_IF(getData() == nullptr, std::invalid_argument) - << getName() << " is not allocated"; - NNTR_THROW_IF(m.getData() == nullptr, std::invalid_argument) - << m.getName() << " is not allocated"; - NNTR_THROW_IF(output.getData() == nullptr, std::invalid_argument) - << output.getName() << " is not allocated"; - - /// shortcut to cover when dimension matches - /// note that buffer_size, the last stride is only used in v_func but it - /// might be changed - if (dim == m.dim) { - BroadcastInfo e; - e.buffer_size = size(); - e.strides[3] = 1; - e.tensor_type = getTensorType(); - v_func(e, getData(), m.getData(), output.getData()); - return; - } - - return apply_broadcast_util(m, v_func, output, this->computeBroadcastInfo(m)); -} - -#ifdef ENABLE_FP16 -void Tensor::apply_broadcast( - Tensor const &m, - std::function - v_func, - Tensor &output) const { - CREATE_IF_EMPTY_DIMS(output, dim, nullptr); - - NNTR_THROW_IF(getData<_FP16>() == nullptr, std::invalid_argument) - << getName() << " is not allocated"; - NNTR_THROW_IF(m.getData<_FP16>() == nullptr, std::invalid_argument) - << m.getName() << " is not allocated"; - NNTR_THROW_IF(output.getData<_FP16>() == nullptr, std::invalid_argument) - << output.getName() << " is not allocated"; - - /// shortcut to cover when dimension matches - /// note that buffer_size, the last stride is only used in v_func but it - /// might be changed - if (dim == m.dim) { - BroadcastInfo e; - e.buffer_size = size(); - e.strides[3] = 1; - v_func(e, getData<_FP16>(), m.getData<_FP16>(), output.getData<_FP16>()); - return; - } - - return apply_broadcast_util(m, v_func, output, this->computeBroadcastInfo(m)); -} - -void Tensor::apply_broadcast_util( - Tensor const &m, - std::function - v_func, - Tensor &output, const BroadcastInfo &e, int cur_axis, size_t offset, - size_t m_offset) const { - - const _FP16 *buf = this->getData<_FP16>(); - const _FP16 *m_buf = m.getData<_FP16>(); - _FP16 *out_buf = output.getData<_FP16>(); - - if (e.buffer_axis == cur_axis) { - v_func(e, buf + offset, m_buf + m_offset, out_buf + offset); - return; - } - - cur_axis++; - for (unsigned int i = 0; i < dim.getTensorDim(cur_axis); ++i) { - size_t next_offset = offset + i * strides[cur_axis]; - size_t next_m_offset = m_offset + i * e.strides[cur_axis]; - apply_broadcast_util(m, v_func, output, e, cur_axis, next_offset, - next_m_offset); - } -} - -#endif - -void Tensor::apply_broadcast_util( - Tensor const &m, - std::function - v_func, - Tensor &output, const BroadcastInfo &e, int cur_axis, size_t offset, - size_t m_offset) const { - - const float *buf = this->getData(); - const float *m_buf = m.getData(); - float *out_buf = output.getData(); - - if (e.buffer_axis == cur_axis) { - v_func(e, buf + offset, m_buf + m_offset, out_buf + offset); - return; - } - - cur_axis++; - uint continuity[4] = {0, 1, 2, 3}; - if (getFormat() == Tformat::NHWC) { - continuity[1] = 2; - continuity[2] = 3; - continuity[3] = 1; - } - for (unsigned int i = 0; i < dim.getTensorDim(continuity[cur_axis]); ++i) { - size_t next_offset = offset + i * strides[cur_axis]; - size_t next_m_offset = m_offset + i * e.strides[cur_axis]; - apply_broadcast_util(m, v_func, output, e, cur_axis, next_offset, - 
next_m_offset); - } -} - -/** - * This is to sum the Tensor data according to the dim.batch(). - * Therefore the result has M(dim.batch(), 1, 1, 1) dimension. - */ -Tensor Tensor::sum_by_batch() const { - NNTR_THROW_IF(!contiguous, std::invalid_argument) - << getName() << " is not contiguous, cannot sum"; - - Tensor ret(dim.batch(), 1, 1, 1, this->getFormat(), getDataType()); - size_t feat_len = dim.getFeatureLen(); - size_t batch = dim.batch(); - - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - const float *data = getData(); - float *rdata = ret.getData(); - - Tensor ones(1, 1, 1, feat_len, this->getFormat()); - ones.setValue(1.0); - sgemv(CblasRowMajor, CblasNoTrans, batch, feat_len, 1, data, feat_len, - ones.getData(), 1, 0.0, rdata, 1); - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - const _FP16 *data = getData<_FP16>(); - _FP16 *rdata = ret.getData<_FP16>(); - - Tensor ones(1, 1, 1, feat_len, this->getTensorType()); - ones.setValue((_FP16)1.0); - sgemv(CblasRowMajor, CblasNoTrans, batch, feat_len, 1, data, feat_len, - ones.getData<_FP16>(), 1, 0.0, rdata, 1); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - - return ret; -} - -/** - * @brief Calculate sum according to the axis. - */ -Tensor Tensor::sum(unsigned int axis, float alpha) const { - Tensor ret("", this->getFormat(), this->getDataType()); - return sum(axis, ret, alpha, 0); -} - -Tensor &Tensor::sum(unsigned int axis, Tensor &ret, float alpha, - float beta) const { - - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - const float *data = getData(); - - NNTR_THROW_IF(!contiguous, std::invalid_argument) - << getName() << " is not contiguous, cannot sum"; - - if (axis >= 4) - throw std::out_of_range("Error: axis is invalid"); - - if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) { - CREATE_IF_EMPTY_DIMS(ret, dim); - ret.copy(this->getData()); - return ret; - } - - switch (axis) { - case 0: { - CREATE_IF_EMPTY_DIMS(ret, 1, dim.channel(), dim.height(), dim.width(), - this->getTensorType()); - size_t feat_len = dim.getFeatureLen(); - size_t batch = dim.batch(); - Tensor ones(1, 1, 1, batch, this->getFormat()); - ones.setValue(alpha); - sgemv(CblasRowMajor, CblasTrans, batch, feat_len, 1, data, feat_len, - ones.getData(), 1, beta, ret.getData(), 1); - } break; - case 1: { - CREATE_IF_EMPTY_DIMS(ret, dim[0], 1, dim[2], dim[3], getTensorType()); - if (this->getFormat() == Tformat::NHWC) { - unsigned int m = ret.dim.getDataLen(); - unsigned int n = dim[1]; - Tensor ones(1, 1, 1, n, this->getTensorType()); - ones.setValue(alpha); - sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n, - ones.getData(), 1, beta, ret.getData(), 1); - } else { - unsigned int feat_len = dim[2] * dim[3]; - unsigned int t_axis = dim[1]; - Tensor ones(1, 1, 1, t_axis, getTensorType()); - ones.setValue(alpha); - float *rdata = ret.getData(); - for (unsigned int k = 0; k < dim[0]; ++k) { - sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1, - &data[k * dim.getFeatureLen()], feat_len, ones.getData(), - 1, beta, &rdata[k * feat_len], 1); - } - } - } break; - case 2: { - CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], 1, dim[3], getTensorType()); - - if (this->getFormat() == Tformat::NHWC) { - unsigned int feat_len = dim[1] * dim[3]; - unsigned int t_axis = dim[2]; - Tensor ones(1, 1, 1, t_axis, this->getTensorType()); - ones.setValue(alpha); - float *rdata = ret.getData(); - for (unsigned int k = 0; k < dim[0]; ++k) { - sgemv(CblasRowMajor, CblasTrans, 
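The reductions in sum() and sum_by_batch() all rely on the same trick: summing along an axis is a matrix-vector product with a vector filled with alpha, so a single sgemv call performs the reduction and the scaling at once, while beta accumulates into the existing output. A plain, BLAS-free illustration of the identity:

// hedged illustration, not part of the patch: row sums of a 2x3 matrix
float A[2][3] = {{1, 2, 3}, {4, 5, 6}};
float ones[3] = {1, 1, 1};
float row_sum[2] = {0, 0};
for (int i = 0; i < 2; ++i)
  for (int j = 0; j < 3; ++j)
    row_sum[i] += A[i][j] * ones[j]; // row_sum == {6, 15}, i.e. A * 1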
t_axis, feat_len, 1, - &data[k * dim.getFeatureLen()], feat_len, ones.getData(), - 1, beta, &rdata[k * feat_len], 1); - } - } else { - unsigned int t_3 = dim[3]; - unsigned int t_axis = dim[2]; - Tensor ones(1, 1, 1, t_axis, this->getTensorType()); - ones.setValue(alpha); - - if (dim.getStorageOrder() == TStorageOrder::ROW_MAJOR) { - float *rdata = ret.getData(); - for (unsigned int k = 0; k < dim[0]; ++k) { - for (unsigned int c = 0; c < dim[1]; ++c) { - unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2]; - unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[3]; - - sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3, - ones.getData(), 1, beta, &rdata[ridx], 1); - } - } - } else { - sgemv(CblasColMajor, CblasTrans, t_axis, ret.dim.getDataLen(), 1, - data, t_axis, ones.getData(), 1, beta, - ret.getData(), 1); - } - } - } break; - case 3: { - CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], dim[2], 1, - this->getTensorType()); - if (this->getFormat() == Tformat::NHWC) { - unsigned int t_3 = dim[1]; - unsigned int t_axis = dim[3]; - Tensor ones(1, 1, 1, t_axis, this->getTensorType()); - ones.setValue(alpha); - float *rdata = ret.getData(); - for (unsigned int k = 0; k < dim[0]; ++k) { - for (unsigned int c = 0; c < dim[2]; ++c) { - unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[1]; - unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[1]; - sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3, - ones.getData(), 1, beta, &rdata[ridx], 1); - } - } - } else { - unsigned int m = ret.dim.getDataLen(); - unsigned int n = dim[3]; - Tensor ones(1, 1, 1, n); - ones.setValue(alpha); - - if (dim.getStorageOrder() == TStorageOrder::ROW_MAJOR) { - sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n, - ones.getData(), 1, beta, ret.getData(), 1); - } else { - float *rdata = ret.getData(); - - for (unsigned int k = 0; k < dim[0]; ++k) { - for (unsigned int c = 0; c < dim[1]; ++c) { - unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2]; - unsigned int ridx = k * dim[1] * dim[2] + c * dim[2]; - - sgemv(CblasColMajor, CblasNoTrans, dim[2], n, 1, &data[idx], - dim[2], ones.getData(), 1, beta, &rdata[ridx], 1); - } - } - } - } - } break; - default: - throw std::out_of_range("Error: Dimension cannot exceed 3"); - } - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - const _FP16 *data = getData<_FP16>(); - - NNTR_THROW_IF(!contiguous, std::invalid_argument) - << getName() << " is not contiguous, cannot sum"; - - if (axis >= 4) - throw std::out_of_range("Error: axis is invalid"); - - if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) { - CREATE_IF_EMPTY_DIMS(ret, dim); - ret.copy(this->getData<_FP16>()); - return ret; - } - - switch (axis) { - case 0: { - CREATE_IF_EMPTY_DIMS(ret, 1, dim.channel(), dim.height(), dim.width(), - this->getTensorType()); - size_t feat_len = dim.getFeatureLen(); - size_t batch = dim.batch(); - Tensor ones(1, 1, 1, batch, this->getTensorType()); - ones.setValue(alpha); - sgemv(CblasRowMajor, CblasTrans, batch, feat_len, 1, data, feat_len, - ones.getData<_FP16>(), 1, beta, ret.getData<_FP16>(), 1); - } break; - case 1: { - CREATE_IF_EMPTY_DIMS(ret, dim[0], 1, dim[2], dim[3], getTensorType()); - if (this->getFormat() == Tformat::NHWC) { - unsigned int m = ret.dim.getDataLen(); - unsigned int n = dim[1]; - Tensor ones(1, 1, 1, n, this->getTensorType()); - ones.setValue(alpha); - sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n, - ones.getData<_FP16>(), 1, beta, 
ret.getData<_FP16>(), 1); - } else { - unsigned int feat_len = dim[2] * dim[3]; - unsigned int t_axis = dim[1]; - Tensor ones(1, 1, 1, t_axis, getTensorType()); - ones.setValue(alpha); - _FP16 *rdata = ret.getData<_FP16>(); - for (unsigned int k = 0; k < dim[0]; ++k) { - sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1, - &data[k * dim.getFeatureLen()], feat_len, ones.getData<_FP16>(), - 1, beta, &rdata[k * feat_len], 1); - } - } - } break; - case 2: { - CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], 1, dim[3], getTensorType()); - - if (this->getFormat() == Tformat::NHWC) { - unsigned int feat_len = dim[1] * dim[3]; - unsigned int t_axis = dim[2]; - Tensor ones(1, 1, 1, t_axis, getTensorType()); - ones.setValue(alpha); - _FP16 *rdata = ret.getData<_FP16>(); - for (unsigned int k = 0; k < dim[0]; ++k) { - sgemv(CblasRowMajor, CblasTrans, t_axis, feat_len, 1, - &data[k * dim.getFeatureLen()], feat_len, ones.getData<_FP16>(), - 1, beta, &rdata[k * feat_len], 1); - } - } else { - unsigned int t_3 = dim[3]; - unsigned int t_axis = dim[2]; - Tensor ones(1, 1, 1, t_axis, getTensorType()); - ones.setValue(alpha); - _FP16 *rdata = ret.getData<_FP16>(); - for (unsigned int k = 0; k < dim[0]; ++k) { - for (unsigned int c = 0; c < dim[1]; ++c) { - unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2]; - unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[3]; - sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3, - ones.getData<_FP16>(), 1, beta, &rdata[ridx], 1); - } - } - } - } break; - case 3: { - CREATE_IF_EMPTY_DIMS(ret, dim[0], dim[1], dim[2], 1, getTensorType()); - if (this->getFormat() == Tformat::NHWC) { - unsigned int t_3 = dim[1]; - unsigned int t_axis = dim[3]; - Tensor ones(1, 1, 1, t_axis, getTensorType()); - ones.setValue(alpha); - _FP16 *rdata = ret.getData<_FP16>(); - for (unsigned int k = 0; k < dim[0]; ++k) { - for (unsigned int c = 0; c < dim[2]; ++c) { - unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[1]; - unsigned int ridx = k * ret.dim.getFeatureLen() + c * dim[1]; - sgemv(CblasRowMajor, CblasTrans, t_axis, t_3, 1, &data[idx], t_3, - ones.getData<_FP16>(), 1, beta, &rdata[ridx], 1); - } - } - } else { - unsigned int m = ret.dim.getDataLen(); - unsigned int n = dim[3]; - Tensor ones(1, 1, 1, n, getTensorType()); - ones.setValue(alpha); - sgemv(CblasRowMajor, CblasNoTrans, m, n, 1, data, n, - ones.getData<_FP16>(), 1, beta, ret.getData<_FP16>(), 1); - } - } break; - default: - throw std::out_of_range("Error: Dimension cannot exceed 3"); - } -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - return ret; -} - -Tensor Tensor::sum(const std::vector &axes, float alpha) const { - Tensor ret("", this->getFormat()); - return sum(axes, ret, alpha); -} - -void Tensor::mergeAxis(unsigned int axis1, unsigned int axis2) { - std::vector continuous_order = {0, 3, 1, 2}; - NNTR_THROW_IF(!contiguous, std::invalid_argument) - << getName() << " is not contiguous, cannot merge axis"; - - if (axis2 != axis1 + 1) - if (!checkContinuous(axis1, axis2)) - throw std::invalid_argument("axis2 must be axis1 + 1 for merging."); - - dim.setTensorDim(axis2, dim.getTensorDim(axis1) * dim.getTensorDim(axis2)); - dim.setTensorDim(axis1, 1); -} - -Tensor &Tensor::sum(const std::vector &axes, Tensor &output, - float alpha) const { - if (axes.empty()) - throw std::invalid_argument("empty axes given"); - - if (axes.size() == 1) { - this->sum(axes[0], output, alpha); - } else { - /** club axes together */ - Tensor new_reshaped = *this; - 
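For a multi-axis sum the code first merges adjacent ("continuous") axes so that several reductions collapse into one, then reduces the remaining axes one after another; the result is the same as summing each axis in turn. A hedged sketch (the std::vector<unsigned int> element type of the axis list is an assumption about the header):

// hedged illustration, not part of the patch
nntrainer::Tensor t(2, 3, 4, 5);
t.setValue(1.0f);
std::vector<unsigned int> axes = {1, 2};
nntrainer::Tensor s = t.sum(axes, 1.0f); // shape 2x1x1x5, every entry == 3 * 4 = 12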
std::vector continuous_order = {0, 3, 1, 2}; - std::vector new_axes = {axes[0]}; - - for (unsigned int i = 1; i < axes.size(); ++i) { - if (checkContinuous(axes[i - 1], axes[i])) { - new_reshaped.mergeAxis(axes[i - 1], axes[i]); - new_axes.back() = axes[i]; - } else { - new_axes.push_back(axes[i]); - } - } - - Tensor ret = new_reshaped.sum(new_axes[0]); - for (unsigned int i = 1; i < new_axes.size() - 1; ++i) - ret = ret.sum(axes[i]); - ret.sum(new_axes.back(), output, alpha); - } - - return output; -} - -Tensor &Tensor::dotBatched(Tensor const &m, Tensor &result, bool trans, - bool trans_m, float beta) const { - if (!result.isAllocated()) - throw std::invalid_argument( - "Output tensor must be preallocated for dotBatched operation"); - for (unsigned int b = 0; b < batch(); b++) { - /** @todo try using transpose to speedup the operation */ - const Tensor this_b = this->getBatchSlice(b, 1); - Tensor m_b = m.getBatchSlice(b, 1); - Tensor result_b = result.getBatchSlice(b, 1); - - this_b.dot(m_b, result_b, trans, trans_m, beta); - } - - return result; -} - -Tensor Tensor::dot(Tensor const &m, bool trans, bool trans_m) const { - Tensor output("", this->getFormat(), this->getDataType()); - dot(m, output, trans, trans_m); - - return output; -} -/** - * @brief compute the derivative of this in the current tensor - * @todo will have to see if beta effects this computation - */ -Tensor &Tensor::dot_deriv_wrt_1(Tensor const &m, Tensor const &output_deriv, - bool trans, bool trans_m, float beta) { - bool deriv_trans_m = true; - bool deriv_trans = false; - /** @todo handle all cases of trans and trans_m */ - if (!trans && trans_m) { - deriv_trans_m = false; - } - - return output_deriv.dot(m, *this, deriv_trans, deriv_trans_m, beta); -} - -/** - * @brief compute the derivative wrt m in the m tensor - * @note The caller tensor must be the same tensor as the one which called the - * dot() product. - */ -Tensor &Tensor::dot_deriv_wrt_2(Tensor &m_deriv, Tensor const &output_deriv, - bool trans, bool trans_m, float beta) const { - bool deriv_trans_m = false; - bool deriv_trans = true; - /** @todo handle all cases of trans and trans_m */ - - if (!trans && trans_m) { - output_deriv.dot(*this, m_deriv, deriv_trans, deriv_trans_m, beta); - return m_deriv; - } else { - return dot(output_deriv, m_deriv, deriv_trans, deriv_trans_m, beta); - } -} - -Tensor &Tensor::dot_batched_deriv_wrt_1(Tensor const &m, - Tensor const &output_deriv, bool trans, - bool trans_m, float beta) { - bool deriv_trans_m = true; - bool deriv_trans = false; - /** @todo handle all cases of trans and trans_m */ - if (!trans && trans_m) { - deriv_trans_m = false; - } - - return output_deriv.dotBatched(m, *this, deriv_trans, deriv_trans_m, beta); -} - -Tensor &Tensor::dot_batched_deriv_wrt_2(Tensor &m_deriv, - Tensor const &output_deriv, bool trans, - bool trans_m, float beta) const { - bool deriv_trans_m = false; - bool deriv_trans = true; - /** @todo handle all cases of trans and trans_m */ - - if (!trans && trans_m) { - output_deriv.dotBatched(*this, m_deriv, deriv_trans, deriv_trans_m, beta); - return m_deriv; - } else { - return dotBatched(output_deriv, m_deriv, deriv_trans, deriv_trans_m, beta); - } -} - -/** - * @note: This dot product flattens the fist 3 axis for the purpose of - * computation. So, while performing, these matrices are behaving as 2-D - * matrices. The dimensions are restored while returning back the tensor - * in case of trans is false. 
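The dot_deriv_wrt_* helpers above implement the standard matrix-multiplication backprop identities: for C = A.dot(B) with no transposes, dL/dA = dL/dC . B^T and dL/dB = A^T . dL/dC, which is why they re-invoke dot()/dotBatched() with flipped transpose flags. A small worked example, hedged:

// hedged illustration, not part of the patch
//   A = [1 2], B = [3 4]^T          =>  C = A.B = [11]
//   with upstream gradient dL/dC = [1]:
//   dL/dA = dL/dC . B^T = [3 4]
//   dL/dB = A^T . dL/dC = [1 2]^T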
- */ -Tensor &Tensor::dot(Tensor const &m, Tensor &result, bool trans, bool trans_m, - float beta) const { - NNTR_THROW_IF(!contiguous, std::invalid_argument) - << getName() << " is not contiguous. Cannot dot product."; - - // Comment out with intension to support the calculation wrt. batch and height - // direction. It supposes to have this->dim as [ BxCxH,W ] and m.dim is - // [BxCxH,W] as well if (m.dim.rank() > 2) { - // throw exception::not_supported("Error: support only for rank of dot " - // "matrix <= 2"); - // } - - // Comment out with intension to support the calculation wrt. batch and height - // direction of this tensor. It is OK as long as m is 2D - // - if (trans && dim.rank() > 2) { - ml_logw("Warning: support only for rank of dot matrix <= 2 with trans"); - } - unsigned int dim1, dim2, mdim1, mdim2; - if (getFormat() == Tformat::NHWC) { - dim1 = batch() * height() * width(); - dim2 = channel(); - mdim1 = m.batch() * m.height() * m.width(); - mdim2 = m.channel(); - } else { - dim1 = batch() * channel() * height(); - dim2 = width(); - mdim1 = m.batch() * m.channel() * m.height(); - mdim2 = m.width(); - } - - unsigned int M, N, K, lda, ldb, ldc; - - if (!trans && !trans_m) { - if (dim2 != mdim1) - throw std::runtime_error( - "Error: incompatible dimensions for dot product"); - K = mdim1; /** == dim2 */ - N = mdim2; - M = dim1; - if (getFormat() == Tformat::NHWC) { - CREATE_IF_EMPTY_DIMS(result, batch(), N, height(), width(), - getTensorType()); // NHWC Result Tensor - } else { - CREATE_IF_EMPTY_DIMS(result, batch(), channel(), height(), N, - getTensorType()); - } - - // We are not set zero the result because of performance reason. - // However, result is not initialized properly. There might include - // garbage like nan. When we have to use this value as in C = alpha*A*B + - // beta*C, then have to check garbage data of C is not effect or not. - - } else if (!trans && trans_m) { - if (dim2 != mdim2) - throw std::runtime_error( - "Error: incompatible dimensions for dot product"); - K = mdim2; /** == dim2 */ - N = mdim1; - M = dim1; - if (getFormat() == Tformat::NHWC) { - CREATE_IF_EMPTY_DIMS(result, batch(), N, height(), width(), - getTensorType()); - } else { - CREATE_IF_EMPTY_DIMS(result, batch(), channel(), height(), N, - getTensorType()); - } - } else if (trans && !trans_m) { - if (dim1 != mdim1) - throw std::runtime_error( - "Error: incompatible dimensions for dot product"); - K = mdim1; /** == dim1 */ - N = mdim2; - M = dim2; - if (getFormat() == Tformat::NHWC) { - CREATE_IF_EMPTY_DIMS(result, 1, N, M, 1, getTensorType()); - } else { - CREATE_IF_EMPTY_DIMS(result, 1, 1, M, N, getTensorType()); - } - } else { - if (dim1 != mdim2) - throw std::runtime_error( - "Error: incompatible dimensions for dot product"); - K = mdim2; /** == dim1 */ - N = mdim1; - M = dim2; - if (getFormat() == Tformat::NHWC) { - CREATE_IF_EMPTY_DIMS(result, 1, N, M, 1, getTensorType()); - } else { - CREATE_IF_EMPTY_DIMS(result, 1, 1, M, N, getTensorType()); - } - } - lda = dim2; - ldb = mdim2; - ldc = (getFormat() == Tformat::NHWC) ? result.channel() : result.width(); - - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - const float *data = getData(); - const float *mdata = m.getData(); - float *rdata = result.getData(); - const float alpha = 1.0f; - enum CBLAS_TRANSPOSE transA = trans ? CblasTrans : CblasNoTrans; - enum CBLAS_TRANSPOSE transB = trans_m ? 
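As the note above says, dot() treats the tensor as a 2-D matrix by flattening the leading axes: in NCHW an input of shape (B, C, H, W) acts as a (B*C*H) x W matrix, and the contraction dimension K is checked against the other operand accordingly. A hedged sketch with shapes chosen only for illustration:

// hedged illustration, not part of the patch
nntrainer::Tensor lhs(2, 1, 3, 4); // behaves as a 6 x 4 matrix
nntrainer::Tensor rhs(1, 1, 4, 5); // behaves as a 4 x 5 matrix
lhs.setValue(1.0f);
rhs.setValue(1.0f);
nntrainer::Tensor out = lhs.dot(rhs, false, false); // 2x1x3x5, every entry == 4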
CblasTrans : CblasNoTrans; - - /// shortcut handling in case of vector - /// for vector, (1 * K) == (K * 1) in current memory layout... - /// and plaese note that N, K, M is a fixed place holder after considering - /// transpose. - /// For example, there is no case like (1 * K) X (1 * K) while - /// (1 * K) X (1 * M) can be a case - /// case1: (1 * K) X (K * 1) - if (M == 1 && N == 1) { - *rdata = sdot(K, data, 1, mdata, 1) + beta * (*rdata); - } - /// case2: (M * K) X (K * 1) - else if (N == 1) { - sgemv(CblasRowMajor, transA, dim1, dim2, alpha, data, lda, mdata, 1, beta, - rdata, 1); - } - /// case3: (1 * K) X (K * N) = 1 * N = R - /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K) - /// Effectively a translation of sgemv - else if (M == 1) { - transB = transB == CblasTrans ? CblasNoTrans : CblasTrans; - sgemv(CblasRowMajor, transB, mdim1, mdim2, alpha, mdata, ldb, data, 1, - beta, rdata, 1); - } - /// case others: use gemm - else { - sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, data, lda, mdata, - ldb, beta, rdata, ldc); - } - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - const _FP16 *data = getData<_FP16>(); - const _FP16 *mdata = m.getData<_FP16>(); - _FP16 *rdata = result.getData<_FP16>(); - const float alpha = 1.0f; - enum CBLAS_TRANSPOSE transA = trans ? CblasTrans : CblasNoTrans; - enum CBLAS_TRANSPOSE transB = trans_m ? CblasTrans : CblasNoTrans; - - /// shortcut handling in case of vector - /// for vector, (1 * K) == (K * 1) in current memory layout... - /// and plaese note that N, K, M is a fixed place holder after considering - /// transpose. - /// For example, there is no case like (1 * K) X (1 * K) while - /// (1 * K) X (1 * M) can be a case - /// case1: (1 * K) X (K * 1) - if (M == 1 && N == 1) { - *rdata = sdot(K, data, 1, mdata, 1) + static_cast<_FP16>(beta) * (*rdata); - } - /// case2: (M * K) X (K * 1) - else if (N == 1) { - sgemv(CblasRowMajor, transA, dim1, dim2, alpha, data, lda, mdata, 1, beta, - rdata, 1); - } - /// case3: (1 * K) X (K * N) = 1 * N = R - /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K) - /// Effectively a translation of sgemv - else if (M == 1) { - transB = transB == CblasTrans ? CblasNoTrans : CblasTrans; - sgemv(CblasRowMajor, transB, mdim1, mdim2, alpha, mdata, ldb, data, 1, - beta, rdata, 1); - } - /// case others: use sgemm - else { - sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, data, lda, mdata, - ldb, beta, rdata, ldc); - } -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - - return result; -} - -Tensor &Tensor::transpose(const std::string &direction, Tensor &out) const { - NNTR_THROW_IF(!contiguous, std::invalid_argument) - << getName() << " is not contiguous. 
Cannot transpose."; - - if (out.getData() == getData()) { - Tensor tmp = clone(); - return tmp.transpose(direction, out); - } - - unsigned int SL, SI, SJ, SK; - - out.reshape(dim.transpose(direction)); - int indexI = direction[0] - '0'; - int indexJ = direction[2] - '0'; - - SL = dim.batch(), SI = dim.channel(), SJ = dim.height(), SK = dim.width(); - - bool is_format_nchw = (getFormat() == Tformat::NCHW); - - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - const float *inptr = getData(); - float *outptr = out.getData(); - switch (indexI) { - case 0: - if (indexJ == 1) { - if (is_format_nchw) { - transposeloop(l, i, j, k, SL, SI, SJ, SK); - } else { - transposeloop_nhwc(l, j, k, i, SL, SJ, SK, SI); - } - } else { - if (is_format_nchw) { - transposeloop(l, i, k, j, SL, SI, SK, SJ); - } else { - transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI); - } - } - break; - case 1: - if (indexJ == 0) { - if (is_format_nchw) { - transposeloop(l, j, i, k, SL, SJ, SI, SK); - } else { - transposeloop_nhwc(l, i, k, j, SL, SI, SK, SJ); - } - } else { - if (is_format_nchw) { - transposeloop(l, j, k, i, SL, SJ, SK, SI); - } else { - transposeloop_nhwc(l, k, i, j, SL, SK, SI, SJ); - } - } - break; - case 2: - if (indexJ == 0) { - if (is_format_nchw) { - transposeloop(l, k, i, j, SL, SK, SI, SJ); - } else { - transposeloop_nhwc(l, i, j, k, SL, SI, SJ, SK); - } - } else { - if (is_format_nchw) { - transposeloop(l, k, j, i, SL, SK, SJ, SI); - } else { - transposeloop_nhwc(l, j, i, k, SL, SJ, SI, SK); - } - } - break; - } - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - const _FP16 *inptr = getData<_FP16>(); - _FP16 *outptr = out.getData<_FP16>(); - switch (indexI) { - case 0: - if (indexJ == 1) { - if (is_format_nchw) { - transposeloop(l, i, j, k, SL, SI, SJ, SK); - } else { - transposeloop_nhwc(l, j, k, i, SL, SJ, SK, SI); - } - } else { - if (is_format_nchw) { - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int c = 0; c < channel(); ++c) { - transpose_matrix(height(), width(), - getData<_FP16>() + getIndex(b, c, 0, 0), width(), - out.getData<_FP16>() + out.getIndex(b, c, 0, 0), - out.width()); - } - } - } else { - transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI); - } - } - break; - case 1: - if (indexJ == 0) { - if (is_format_nchw) { - transposeloop(l, j, i, k, SL, SJ, SI, SK); - } else { - transposeloop_nhwc(l, i, k, j, SL, SI, SK, SJ); - } - } else { - if (is_format_nchw) { - transposeloop(l, j, k, i, SL, SJ, SK, SI); - } else { - transposeloop_nhwc(l, k, i, j, SL, SK, SI, SJ); - } - } - break; - case 2: - if (indexJ == 0) { - if (is_format_nchw) { - transposeloop(l, k, i, j, SL, SK, SI, SJ); - } else { - transposeloop_nhwc(l, i, j, k, SL, SI, SJ, SK); - } - } else { - if (is_format_nchw) { - transposeloop(l, k, j, i, SL, SK, SJ, SI); - } else { - transposeloop_nhwc(l, j, i, k, SL, SJ, SI, SK); - } - } - break; - } -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - - return out; -} - -Tensor Tensor::transpose(const std::string &direction) const { - Tensor result(dim); - transpose(direction, result); - return result; -} - -Tensor Tensor::dropout_mask(float dropout) const { - Tensor result(dim); - result.dropout_mask(dropout); - return result; -} - -void Tensor::dropout_mask(float dropout) { - setRandUniform(0.0, 1.0); - if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - float scale = 1.0 / (1 - dropout); - float *data_ = getData(); - for (unsigned int i = 0; i < size(); ++i) { - if (data_[i] 
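transpose() takes a direction string over the non-batch axes, where 0, 1 and 2 stand for channel, height and width; for example "0:2:1" keeps the channel axis, swaps height with width, and never moves the batch axis. A hedged sketch:

// hedged illustration, not part of the patch
nntrainer::Tensor img(1, 3, 4, 5);
img.setValue(1.0f);
nntrainer::Tensor swapped = img.transpose("0:2:1"); // shape becomes 1x3x5x4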
>= dropout) - data_[i] = scale; - else - data_[i] = 0.0; - } - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - _FP16 scale = static_cast<_FP16>(1.0 / (1 - dropout)); - _FP16 *data_ = getData<_FP16>(); - for (unsigned int i = 0; i < size(); ++i) { - if (data_[i] >= dropout) - data_[i] = scale; - else - data_[i] = 0; - } -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } -} - -void Tensor::filter_mask(const Tensor &mask_len, bool reverse) { - float fill_mask_val = 0.0; - float en_mask_val = 1.0 - fill_mask_val; - - if (reverse) { - fill_mask_val = 1.0; - en_mask_val = 1.0 - fill_mask_val; - } - - setValue(fill_mask_val); - if (mask_len.batch() != batch()) - throw std::invalid_argument("Number of filter masks mismatched"); - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - for (unsigned int b = 0; b < batch(); b++) { - float *addr = getAddress(b, 0, 0, 0); - const uint *mask_len_val = mask_len.getAddress(b, 0, 0, 0); - std::fill(addr, addr + (*mask_len_val), en_mask_val); - } - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - for (unsigned int b = 0; b < batch(); b++) { - _FP16 *addr = getAddress<_FP16>(b, 0, 0, 0); - const uint *mask_len_val = mask_len.getAddress(b, 0, 0, 0); - std::fill(addr, addr + (*mask_len_val), (_FP16)en_mask_val); - } -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } -} - -Tensor Tensor::zoneout_mask(float zoneout) { - Tensor ret(getDim()); - zoneout_mask(ret, zoneout); - return ret; -} - -void Tensor::zoneout_mask(Tensor &opposite, float zoneout) { - if (dim != opposite.dim) { - throw std::invalid_argument( - "[Tensor::zoneout_mask] opposite dimension does not match"); - } - - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - opposite.setRandBernoulli(zoneout); - - float *data = getData(); - float *opposite_data = opposite.getData(); - - for (unsigned int i = 0; i < size(); ++i) { - if (opposite_data[i] > epsilon) { - data[i] = 0.0f; - } else { - data[i] = 1.0f; - } - } - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - _FP16 zoneout_fp16 = (_FP16)zoneout; - opposite.setRandBernoulli(zoneout_fp16); - - _FP16 *data = getData<_FP16>(); - _FP16 *opposite_data = opposite.getData<_FP16>(); - - for (unsigned int i = 0; i < size(); ++i) { - if (opposite_data[i] > epsilon) { - data[i] = (_FP16)0.0; - } else { - data[i] = (_FP16)1.0; - } - } -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } -} - -Tensor Tensor::apply(std::function f) const { return f(*this); } - -Tensor &Tensor::apply(std::function f, - Tensor &output) const { - return f(*this, output); -} - -void Tensor::print(std::ostream &out) const { - printInstance(out, this); - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - const float *data = getData(); - unsigned int len = size(); - out << "data addr: " << data << '\n'; - out << dim; - - if (len > 100) { - out << '[' << data[0] << ' ' << data[1] << ' ' << data[2] << " ... 
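dropout_mask() follows the inverted-dropout convention: the tensor is filled with uniform noise, positions that survive (value >= p) are set to 1 / (1 - p) so the expected activation stays unchanged, and the rest are zeroed; zoneout_mask() builds a pair of complementary 0/1 masks the same way. A hedged sketch:

// hedged illustration, not part of the patch
nntrainer::Tensor mask(1, 1, 1, 8);
mask.dropout_mask(0.25f); // roughly 75% of entries become 1.0f / 0.75f, the rest 0.0f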
" - << data[len - 3] << ' ' << data[len - 2] << ' ' << data[len - 1] - << ']' << std::endl; - return; - } - - std::ios init(NULL); - init.copyfmt(out); - float max_ = 0.0; - float min_ = 10000000; - if (getFormat() == Tformat::NCHW) { - for (unsigned int k = 0; k < batch(); k++) { - for (unsigned int l = 0; l < channel(); l++) { - for (unsigned int i = 0; i < height(); i++) { - for (unsigned int j = 0; j < width(); j++) { - out << std::setw(10) << std::setprecision(10) - << this->getValue(k, l, i, j) << " "; - } - out << std::endl; - } - out << std::endl; - } - out << "-------" << std::endl; - } - } else { - for (unsigned int k = 0; k < batch(); k++) { - for (unsigned int i = 0; i < height(); i++) { - for (unsigned int j = 0; j < width(); j++) { - for (unsigned int l = 0; l < channel(); l++) { - out << std::setw(10) << std::setprecision(10) - << this->getValue(k, l, i, j) << " "; - } - out << std::endl; - } - out << std::endl; - } - out << "-------" << std::endl; - } - } - out.copyfmt(init); - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - const _FP16 *data = getData<_FP16>(); - unsigned int len = size(); - out << "data addr: " << data << '\n'; - out << dim; - - if (len > 100) { - out << '[' << (float)data[0] << ' ' << (float)data[1] << ' ' - << (float)data[2] << " ... " << (float)data[len - 3] << ' ' - << (float)data[len - 2] << ' ' << (float)data[len - 1] << ']' - << std::endl; - return; - } - - std::ios init(NULL); - init.copyfmt(out); - float max_ = 0.0; - float min_ = 10000000; - if (getFormat() == Tformat::NCHW) { - for (unsigned int k = 0; k < batch(); k++) { - for (unsigned int l = 0; l < channel(); l++) { - for (unsigned int i = 0; i < height(); i++) { - for (unsigned int j = 0; j < width(); j++) { - out << std::setw(10) << std::setprecision(10) - << (float)this->getValue<_FP16>(k, l, i, j) << " "; - if (std::isinf((float)this->getValue<_FP16>(k, l, i, j))) - out << "INF or NAN " << k << ":" << l << ":" << i << ":" << j - << std::endl; - if ((float)this->getValue<_FP16>(k, l, i, j) < min_) - min_ = (float)this->getValue<_FP16>(k, l, i, j); - if ((float)this->getValue<_FP16>(k, l, i, j) > max_) - max_ = (float)this->getValue<_FP16>(k, l, i, j); - } - out << std::endl; - } - out << std::endl; - } - out << "-------" << std::endl; - } - } else { - for (unsigned int k = 0; k < batch(); k++) { - for (unsigned int i = 0; i < height(); i++) { - for (unsigned int j = 0; j < width(); j++) { - for (unsigned int l = 0; l < channel(); l++) { - out << std::setw(10) << std::setprecision(10) - << (float)this->getValue<_FP16>(k, l, i, j) << " "; - } - out << std::endl; - } - out << std::endl; - } - out << "-------" << std::endl; - } - } - out.copyfmt(init); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } else if (getDataType() == ml::train::TensorDim::DataType::QINT8) { - const uint8_t *data = getData(); - unsigned int len = size(); - out << "data addr: " << reinterpret_cast(data) << '\n'; - out << dim; - - if (len > 100) { - out << '[' << (int)data[0] << ' ' << (int)data[1] << ' ' << (int)data[2] - << " ... 
" << (int)data[len - 3] << ' ' << (int)data[len - 2] << ' ' - << (int)data[len - 1] << ']' << std::endl; - return; - } - - std::ios init(NULL); - init.copyfmt(out); - if (getFormat() == Tformat::NCHW) { - for (unsigned int k = 0; k < batch(); k++) { - for (unsigned int l = 0; l < channel(); l++) { - for (unsigned int i = 0; i < height(); i++) { - for (unsigned int j = 0; j < width(); j++) { - out << std::setw(10) << (int)this->getValue(k, l, i, j) - << " "; - } - out << std::endl; - } - out << std::endl; - } - out << "-------" << std::endl; - } - } else { - for (unsigned int k = 0; k < batch(); k++) { - for (unsigned int i = 0; i < height(); i++) { - for (unsigned int j = 0; j < width(); j++) { - for (unsigned int l = 0; l < channel(); l++) { - out << std::setw(10) << (int)this->getValue(k, l, i, j) - << " "; - } - out << std::endl; - } - out << std::endl; - } - out << "-------" << std::endl; - } - out.copyfmt(init); - } - } else if (getDataType() == ml::train::TensorDim::DataType::QINT4) { - const uint8_t *data = getData(); - unsigned int len = (size() + 1) / 2; - out << "data addr: " << (float *)data << '\n'; - out << dim; - - if (len > 100) { - out << '[' << (int)decode_qint(data[0], true) << ' ' - << (int)decode_qint(data[0], false) << ' ' - << (int)decode_qint(data[1], true) << " ... " - << (int)decode_qint(data[len - 2], false) << ' ' - << (int)decode_qint(data[len - 1], true) << ' ' - << (int)decode_qint(data[len - 1], false) << ']' << std::endl; - return; - } - - std::ios init(NULL); - init.copyfmt(out); - if (getFormat() == Tformat::NCHW) { - for (unsigned int k = 0; k < batch(); k++) { - for (unsigned int l = 0; l < channel(); l++) { - for (unsigned int i = 0; i < height(); i++) { - for (unsigned int j = 0; j < width(); j++) { - out << std::setw(3) << (int)this->getValueQint4(k, l, i, j) - << " "; - } - out << std::endl; - } - out << std::endl; - } - out << "-------" << std::endl; - } - } else { - for (unsigned int k = 0; k < batch(); k++) { - for (unsigned int i = 0; i < height(); i++) { - for (unsigned int j = 0; j < width(); j++) { - for (unsigned int l = 0; l < channel(); l++) { - out << std::setw(3) << (int)this->getValueQint4(k, l, i, j) - << " "; - } - out << std::endl; - } - out << std::endl; - } - out << "-------" << std::endl; - } - out.copyfmt(init); - } - } -} - -void Tensor::print_(std::ostream &out, uint opt) const { - printInstance(out, this); - - unsigned int len = size(); - - std::ios init(NULL); - init.copyfmt(out); - if (opt == 0) { - if (getFormat() == Tformat::NCHW) { - out << "{"; - for (unsigned int k = 0; k < batch(); k++) { - out << "{"; - for (unsigned int i = 0; i < channel(); i++) { - out << "{"; - for (unsigned int j = 0; j < height(); j++) { - out << "{"; - for (unsigned int l = 0; l < width(); l++) { - if (l < width() - 1) - out << std::setw(10) << std::setprecision(10) - << this->getValue(k, l, i, j) << ", "; - else - out << std::setw(10) << std::setprecision(10) - << this->getValue(k, l, i, j); - } - if (j < height() - 1) - out << "},"; - else - out << "}"; - out << std::endl; - } - if (i < channel() - 1) - out << "},"; - else - out << "}"; - out << std::endl; - } - if (k < batch() - 1) - out << "},"; - else - out << "}"; - out << std::endl; - } - out << "}"; - } else { - out << "{"; - for (unsigned int k = 0; k < batch(); k++) { - out << "{"; - for (unsigned int i = 0; i < height(); i++) { - out << "{"; - for (unsigned int j = 0; j < width(); j++) { - out << "{"; - for (unsigned int l = 0; l < channel(); l++) { - if (l < channel() - 1) - out << 
std::setw(10) << std::setprecision(10) - << this->getValue(k, l, i, j) << ", "; - else - out << std::setw(10) << std::setprecision(10) - << this->getValue(k, l, i, j); - } - if (j < width() - 1) - out << "},"; - else - out << "}"; - out << std::endl; - } - if (i < height() - 1) - out << "},"; - else - out << "}"; - out << std::endl; - } - if (k < batch() - 1) - out << "},"; - else - out << "}"; - out << std::endl; - } - out << "}"; - } - } else { - for (uint i = 0; i < len; ++i) { - out << getData()[i] << ", "; - } - } - out.copyfmt(init); -} - -std::ostream &operator<<(std::ostream &out, Tensor const &m) { - m.print(out); - return out; -} - -void Tensor::copy(const void *buf) { - NNTR_THROW_IF(!contiguous, std::invalid_argument) - << getName() << "Tensor is not contiguous, cannot copy."; - - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - if (buf == getData()) { - return; - } - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - if (buf == getData<_FP16>()) { - return; - } -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } else if (getDataType() == ml::train::TensorDim::DataType::QINT8) { - if (buf == getData()) { - return; - } - } else if (getDataType() == ml::train::TensorDim::DataType::QINT4) { - if (buf == getData()) { - return; - } - } - - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - scopy(size(), (float *)buf, 1, getData(), 1); - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - scopy(size(), (_FP16 *)buf, 1, getData<_FP16>(), 1); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } else if (getDataType() == ml::train::TensorDim::DataType::QINT8) { - for (unsigned int i = 0; i < size(); ++i) { - getData()[i] = ((uint8_t *)buf)[i]; - } - } else if (getDataType() == ml::train::TensorDim::DataType::QINT4) { - for (unsigned int i = 0; i < (size() + 1) / 2; ++i) { - getData()[i] = ((uint8_t *)buf)[i]; - } - } -} - -void Tensor::copy_with_stride(const Tensor &from) { - - if (dim == from.getDim()) { - if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int c = 0; c < channel(); ++c) { - for (unsigned int h = 0; h < height(); ++h) { - for (unsigned int w = 0; w < width(); ++w) { - setValue(b, c, h, w, from.getValue(b, c, h, w)); - } - } - } - } - } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int c = 0; c < channel(); ++c) { - for (unsigned int h = 0; h < height(); ++h) { - for (unsigned int w = 0; w < width(); ++w) { - setValue(b, c, h, w, from.getValue<_FP16>(b, c, h, w)); - } - } - } - } -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - } else { - Tensor t = Tensor(from.getDim(), true); - if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - for (unsigned int b = 0; b < t.batch(); ++b) { - for (unsigned int c = 0; c < t.channel(); ++c) { - for (unsigned int h = 0; h < t.height(); ++h) { - for (unsigned int w = 0; w < t.width(); ++w) { - t.setValue(b, c, h, w, from.getValue(b, c, h, w)); - } - } - } - } - } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int c = 0; c < channel(); ++c) { - for (unsigned int h = 0; h < height(); ++h) { - for (unsigned int w = 0; w < width(); 
++w) { - setValue(b, c, h, w, from.getValue<_FP16>(b, c, h, w)); - } - } - } - } -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - swap(t, *this); - } -} - -void Tensor::copy(const Tensor &from) { - // todo: enable copy to non-contiguous tensor - if (!contiguous) { - throw std::runtime_error("Cannot copy non-contiguous tensor"); - } - - if (from.size() != 0 && size() == from.size() && - getDataType() == from.getDataType()) { - reshape(from.getDim()); - if (from.getDataType() == ml::train::TensorDim::DataType::FP32) { - copy(from.getData()); - } else if (from.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - copy(from.getData<_FP16>()); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - - } else { - if (from.getDataType() == ml::train::TensorDim::DataType::FP32) { - Tensor t = Tensor(from.getDim(), from.getData()); - swap(t, *this); - } else if (from.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - Tensor t = Tensor(from.getDim(), from.getData<_FP16>()); - swap(t, *this); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - } -} - -void Tensor::copyData(const Tensor &from) { - // todo: enable copy to non-contiguous tensor - if (!contiguous) { - throw std::runtime_error("Cannot copy non-contiguous tensor"); - } - - if (size() != from.size()) - throw std::invalid_argument("Size of tensor to copy must match"); - - if (getDataType() == from.getDataType()) { - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - copy(from.getData()); - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - copy(from.getData<_FP16>()); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } else { - copy(from.getData()); - } - } else { - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - if (from.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - scopy(size(), from.getData<_FP16>(), 1, getData(), 1); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } else if (from.getDataType() == ml::train::TensorDim::DataType::QINT8) { - scopy_int8_to_float32(from.size(), from.getData(), 1, - getData(), 1); - } else if (from.getDataType() == ml::train::TensorDim::DataType::QINT4) { - scopy_int4_to_float32((from.size() + 1) / 2, from.getData(), 1, - getData(), 1); - } - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - if (from.getDataType() == ml::train::TensorDim::DataType::FP32) { - scopy(size(), from.getData(), 1, getData<_FP16>(), 1); - } else if (from.getDataType() == ml::train::TensorDim::DataType::QINT8) { - scopy_int8_to_float16(from.size(), from.getData(), 1, - getData<_FP16>(), 1); - } else if (from.getDataType() == ml::train::TensorDim::DataType::QINT4) { - scopy_int4_to_float16((from.size() + 1) / 2, from.getData(), 1, - getData<_FP16>(), 1); - } -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - } -} - -Tensor Tensor::clone() const { - Tensor t; - t.copy(*this); - t.name = name; - return t; -} - -void Tensor::reshape(const TensorDim &d) { - - NNTR_THROW_IF(!contiguous, std::invalid_argument) - << getName() << " is not contiguous, cannot reshape."; - - NNTR_THROW_IF(d.getDataLen() != dim.getDataLen(), std::invalid_argument) - << "[Tensor]: reshape cannot change the buffer size, trying reshaping " - "\nfrom " - << getDim() 
<< " to " << d; - - // dim = d; - dim.batch(d.batch()); - dim.channel(d.channel()); - dim.height(d.height()); - dim.width(d.width()); - - strides = d.computeStrides(); -} - -void Tensor::fill(const Tensor &from, bool alloc) { - if (alloc && this->empty()) { - this->copy(from); - return; - } - - if (!from.contiguous || !contiguous) { - /// @todo enable this if needed - throw nntrainer::exception::not_supported( - "[Tensor::fill] non-contiguous tensors are not supported"); - } - - if (dim != from.getDim()) { - throw std::invalid_argument("[Tensor::fill] dimension must be the same"); - } - - if (strides != from.getStrides()) { - /// @todo length does not represent buffer size, there should be way to - /// get the buffer size - throw std::invalid_argument("[Tensor::fill] buffer size must be the same"); - } - - if (this->getDataType() == ml::train::TensorDim::DataType::FP32) { - this->copy(from.getData()); - } else if (this->getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - this->copy(from.getData<_FP16>()); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } +Tensor Tensor::sum(unsigned int axis, float alpha) const { + Tensor output("", this->getFormat(), this->getDataType()); + return sum(axis, output, alpha, 0); } -void Tensor::save(std::ostream &file) { - NNTR_THROW_IF(!contiguous, std::invalid_argument) - << getName() << " is not contiguous, cannot save."; - - std::streamsize sz = static_cast(bytes()); - NNTR_THROW_IF(sz < 0, std::invalid_argument) - << "save size: " << bytes() - << " is too big. It cannot be represented by std::streamsize"; - - if (this->getDataType() == ml::train::TensorDim::DataType::FP32) { - checkedWrite(file, (char *)getData(), sz, - "[Tensor::save] operation failed"); - } else if (this->getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - checkedWrite(file, (char *)getData<_FP16>(), - static_cast(size() * sizeof(_FP16)), - "[Tensor::save] operation failed"); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } +Tensor &Tensor::sum(unsigned int axis, Tensor &output, float alpha, + float beta) const { + NNTR_THROW_IF(!getContiguous(), std::invalid_argument) + << getName() << " is not contiguous, cannot sum"; - putData(); + itensor->sum(axis, output, alpha, beta); + return output; } -void Tensor::read(std::ifstream &file, Tdatatype s_type) { - NNTR_THROW_IF(!contiguous, std::invalid_argument) - << getName() << " is not contiguous, cannot read."; - - std::streamsize sz = static_cast(bytes()); - - NNTR_THROW_IF(sz < 0, std::invalid_argument) - << "read size: " << bytes() - << " is too big. 
It cannot be represented by std::streamsize";
+Tensor Tensor::sum(const std::vector<unsigned int> &axes, float alpha) const {
+  Tensor output("", this->getFormat());
+  return sum(axes, output, alpha);
+}
-  if (getDataType() == Tdatatype::QINT4 || getDataType() == Tdatatype::QINT8) {
-    uint8_t axis, zp;
-    unsigned int len = 0;
+Tensor &Tensor::sum(const std::vector<unsigned int> &axes, Tensor &output,
+                    float alpha) const {
+  if (axes.empty())
+    throw std::invalid_argument("empty axes given");
-    file.read((char *)&axis, sizeof(uint8_t));
+  if (axes.size() == 1) {
+    this->sum(axes[0], output, alpha);
+  } else {
-    if (axis == 0)
-      len = batch();
-    else if (axis == 1) {
-      len = channel();
-    } else if (axis == 2) {
-      len = height();
-    } else if (axis == 3) {
-      len = width();
-    }
+    /** club axes together */
+    Tensor new_reshaped = Tensor(getDim());
+    new_reshaped.copy(*this);
+    std::vector<unsigned int> continuous_order = {0, 3, 1, 2};
+    std::vector<unsigned int> new_axes = {axes[0]};
-    // read scale factors
-    for (unsigned int i = 0; i < len; ++i) {
-      if (s_type == Tdatatype::FP32) {
-        float scale;
-        file.read((char *)&scale, sizeof(float));
-        scale_factors_fp32.push_back(scale);
-      } else if (s_type == Tdatatype::FP16) {
-#ifdef ENABLE_FP16
-        _FP16 scale;
-        file.read((char *)&scale, sizeof(_FP16));
-        scale_factors_fp16.push_back(scale);
-#else
-        throw std::invalid_argument("Error: enable-fp16 is not enabled");
-#endif
+    for (unsigned int i = 1; i < axes.size(); ++i) {
+      if (checkContinuous(axes[i - 1], axes[i])) {
+        new_reshaped.mergeAxis(axes[i - 1], axes[i]);
+        new_axes.back() = axes[i];
+      } else {
+        new_axes.push_back(axes[i]);
       }
     }
-    // read zero points and parse if needed
-    if (getDataType() == Tdatatype::QINT4) {
-      for (unsigned int i = 0; i < (len + 1) / 2; ++i) {
-        file.read((char *)&zp, sizeof(uint8_t));
-        zero_points.push_back(decode_qint(zp, true));
-        zero_points.push_back(decode_qint(zp, false));
-      }
-    } else if (getDataType() == Tdatatype::QINT8) {
-      for (unsigned int i = 0; i < len; ++i) {
-        file.read((char *)&zp, sizeof(uint8_t));
-        zero_points.push_back(zp);
-      }
-    }
+    Tensor ret = new_reshaped.sum(new_axes[0]);
+    for (unsigned int i = 1; i < new_axes.size() - 1; ++i)
+      ret = ret.sum(axes[i]);
+    ret.sum(new_axes.back(), output, alpha);
   }
-
-  checkedRead(file, (char *)getData(), sz, "[Tensor::read] operation failed");
-  putData();
+  return output;
 }
-/**
- * @brief Calculate average value according to the axis.
- */
 Tensor Tensor::average(unsigned int axis) const {
-  Tensor t("", this->getFormat(), this->getDataType());
-  return average(axis, t);
+  Tensor output("", this->getFormat(), this->getDataType());
+  return average(axis, output);
 }
-/**
- * @brief Calculate average value according to the axis.
- */ Tensor &Tensor::average(unsigned int axis, Tensor &output) const { if (axis >= TensorDim::MAXDIM) throw std::out_of_range( "negative axis or axis more then MAXDIM is invalid"); - unsigned int axis_size = dim.getDim()[axis]; + unsigned int axis_size = getDim()[axis]; if (axis_size == 1) output.copy(*this); else @@ -3249,8 +481,8 @@ Tensor &Tensor::average(unsigned int axis, Tensor &output) const { } Tensor Tensor::average(const std::vector &axes) const { - Tensor t("", this->getFormat(), this->getDataType()); - return average(axes, t); + Tensor output("", this->getFormat(), this->getDataType()); + return average(axes, output); } Tensor &Tensor::average(const std::vector &axes, @@ -3264,564 +496,581 @@ Tensor &Tensor::average(const std::vector &axes, if (idx >= TensorDim::MAXDIM) { throw std::out_of_range("axis more then MAXDIM is invalid"); } - ret_shape.setTensorDim(idx, dim.getTensorDim(idx)); + ret_shape.setTensorDim(idx, getDim().getTensorDim(idx)); } return this->sum(axes, output, 1.0 / (float)ret_shape.getDataLen()); } -/** - * @brief Calculate average value according to the axis. - */ Tensor Tensor::average() const { - Tensor result = *this; + Tensor output = *this; unsigned int axis = 0; if (this->getFormat() == Tformat::NHWC) { - result.reshape({1, dim.getDataLen(), 1, 1, this->getTensorType()}); + output.reshape({1, getDim().getDataLen(), 1, 1, this->getTensorType()}); axis = 1; } else { - result.reshape({1, 1, 1, dim.getDataLen(), this->getTensorType()}); + output.reshape({1, 1, 1, getDim().getDataLen(), this->getTensorType()}); axis = 3; } - return result.average(axis); + return output.average(axis); } -/** - * @brief Calculate average value according to the axis. - */ Tensor &Tensor::average(Tensor &output) const { Tensor result = *this; - result.reshape({1, 1, 1, dim.getDataLen()}); + result.reshape({1, 1, 1, getDim().getDataLen()}); return result.average(3, output); } -void Tensor::setValue(float val) { - NNTR_THROW_IF(!contiguous, std::invalid_argument) - << getName() << " is not contiguous, cannot set value."; +int Tensor::pow_i(float exponent) { + pow(exponent, *this); + return ML_ERROR_NONE; +} + +Tensor Tensor::pow(float exponent) const { + Tensor output("", getFormat(), getDataType()); + return pow(exponent, output); +} + +Tensor &Tensor::pow(float exponent, Tensor &output) const { + itensor->pow(exponent, output); + return output; +} + +int Tensor::erf_i() { + erf(*this); + return ML_ERROR_NONE; +} + +Tensor Tensor::erf() const { + Tensor output("", getFormat(), getDataType()); + return erf(output); +} + +Tensor &Tensor::erf(Tensor &output) const { + itensor->erf(output); + return output; +} + +void Tensor::sin(Tensor &out, float alpha) { + if (size() != out.size()) + throw std::invalid_argument("Error: Size of out of Tensor::sin must match"); + + itensor->sin(out, alpha); +} + +void Tensor::cos(Tensor &out, float alpha) { + if (size() != out.size()) + throw std::invalid_argument("Error: Size of out of Tensor::cos must match"); + + itensor->cos(out, alpha); +} + +void Tensor::inv_sqrt_i() { itensor->inv_sqrt(*this); } + +LazyTensor Tensor::chain() const { return LazyTensor(*this); } + +float Tensor::l2norm() const { return itensor->l2norm(); } + +void Tensor::normalization_i() { + NNTR_THROW_IF(!getContiguous(), std::invalid_argument) + << getName() << " is not contiguous, cannot do normalization."; + + const float min = minValue(); + const float max = maxValue(); + + if (max == min) { + Tensor tmp = *this; + this->subtract_i(tmp); + } else { + 
this->subtract_i(min); + this->divide_i(max - min); + } +} + +void Tensor::standardization_i() { + Tensor mean_by_batch = this->sum_by_batch(); + mean_by_batch.divide_i(getDim().getFeatureLen()); + + this->subtract_i(mean_by_batch); + Tensor std_dev_by_batch(batch(), 1, 1, 1, getFormat(), getDataType()); + std_dev_by_batch.setZero(); + + /// @todo remove conditional statement if (getDataType() == ml::train::TensorDim::DataType::FP32) { - float *data = getData(); - std::fill(data, data + size(), val); + float *std_dev = std_dev_by_batch.getData(); + + for (unsigned int k = 0; k < batch(); ++k) { + Tensor sub_this = this->getBatchSlice(k, 1); + std_dev[k] = sub_this.l2norm(); + } } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { #ifdef ENABLE_FP16 - _FP16 *data = getData<_FP16>(); - std::fill(data, data + size(), static_cast<_FP16>(val)); + _FP16 *std_dev = std_dev_by_batch.getData<_FP16>(); + + for (unsigned int k = 0; k < batch(); ++k) { + Tensor sub_this = this->getBatchSlice(k, 1); + std_dev[k] = static_cast<_FP16>(sub_this.l2norm()); + } #else throw std::invalid_argument("Error: enable-fp16 is not enabled"); #endif - } else if (getDataType() == ml::train::TensorDim::DataType::QINT8) { - uint8_t *data = getData(); - std::fill(data, data + size(), val); - } else if (getDataType() == ml::train::TensorDim::DataType::QINT4) { - uint8_t *data = getData(); - uint8_t mixed = encode_qint(val, val); - std::fill(data, data + (size() + 1) / 2, mixed); } + + std_dev_by_batch.divide_i(getDim().getFeatureLen()); + this->divide_i(std_dev_by_batch); +} + +Tensor Tensor::dot(Tensor const &input, bool trans, bool trans_in) const { + Tensor output("", this->getFormat(), this->getDataType()); + dot(input, output, trans, trans_in); + + return output; +} + +/** + * @note: This dot product flattens the fist 3 axis for the purpose of + * computation. So, while performing, these matrices are behaving as 2-D + * matrices. The dimensions are restored while returning back the tensor + * in case of trans is false. + */ +Tensor &Tensor::dot(Tensor const &input, Tensor &output, bool trans, + bool trans_in, float beta) const { + NNTR_THROW_IF(!getContiguous(), std::invalid_argument) + << getName() << " is not contiguous. Cannot dot product."; + + itensor->dot(input, output, trans, trans_in, beta); + return output; +} + +Tensor &Tensor::dot_deriv_wrt_1(Tensor const &m, Tensor const &output_deriv, + bool trans, bool trans_m, float beta) { + bool deriv_trans_m = true; + bool deriv_trans = false; + /** @todo handle all cases of trans and trans_m */ + if (!trans && trans_m) { + deriv_trans_m = false; + } + + return output_deriv.dot(m, *this, deriv_trans, deriv_trans_m, beta); +} + +/** + * @brief compute the derivative wrt m in the m tensor + * @note The caller tensor must be the same tensor as the one which called the + * dot() product. 
+ */ +Tensor &Tensor::dot_deriv_wrt_2(Tensor &m_deriv, Tensor const &output_deriv, + bool trans, bool trans_m, float beta) const { + bool deriv_trans_m = false; + bool deriv_trans = true; + /** @todo handle all cases of trans and trans_m */ + + if (!trans && trans_m) { + output_deriv.dot(*this, m_deriv, deriv_trans, deriv_trans_m, beta); + return m_deriv; + } else { + return dot(output_deriv, m_deriv, deriv_trans, deriv_trans_m, beta); + } +} + +Tensor &Tensor::dotBatched(Tensor const &m, Tensor &result, bool trans, + bool trans_m, float beta) const { + if (!result.isAllocated()) + throw std::invalid_argument( + "Output tensor must be preallocated for dotBatched operation"); + for (unsigned int b = 0; b < batch(); b++) { + /** @todo try using transpose to speedup the operation */ + const Tensor this_b = this->getBatchSlice(b, 1); + Tensor m_b = m.getBatchSlice(b, 1); + Tensor result_b = result.getBatchSlice(b, 1); + + this_b.dot(m_b, result_b, trans, trans_m, beta); + } + + return result; +} + +Tensor &Tensor::dot_batched_deriv_wrt_1(Tensor const &m, + Tensor const &output_deriv, bool trans, + bool trans_m, float beta) { + bool deriv_trans_m = true; + bool deriv_trans = false; + /** @todo handle all cases of trans and trans_m */ + if (!trans && trans_m) { + deriv_trans_m = false; + } + + return output_deriv.dotBatched(m, *this, deriv_trans, deriv_trans_m, beta); +} + +Tensor &Tensor::dot_batched_deriv_wrt_2(Tensor &m_deriv, + Tensor const &output_deriv, bool trans, + bool trans_m, float beta) const { + bool deriv_trans_m = false; + bool deriv_trans = true; + /** @todo handle all cases of trans and trans_m */ + + if (!trans && trans_m) { + output_deriv.dotBatched(*this, m_deriv, deriv_trans, deriv_trans_m, beta); + return m_deriv; + } else { + return dotBatched(output_deriv, m_deriv, deriv_trans, deriv_trans_m, beta); + } +} + +Tensor Tensor::dropout_mask(float dropout) const { + Tensor output(getDim()); + output.dropout_mask(dropout); + return output; +} + +void Tensor::dropout_mask(float dropout) { + /// @todo add unittest + NNTR_THROW_IF(dropout < 0 || dropout > 1, std::invalid_argument) + << "[Tensor::dropout_mask] Dropout rate should be between 0 and 1"; + + // if the rate is zero, no change is needed + if (std::fpclassify(dropout) == FP_ZERO) + return; + + setRandUniform(0.0, 1.0); + itensor->dropout_mask(dropout); +} + +void Tensor::filter_mask(const Tensor &mask_len, bool reverse) { + /// @todo add unittest + itensor->filter_mask(mask_len, reverse); } -void Tensor::setZero() { - if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - if (contiguous) - sscal(size(), 0, getData(), 1); - else - apply_i([](float val) -> float { return 0; }); - } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - if (contiguous) - sscal(size(), 0, getData<_FP16>(), 1); - else - apply_i<_FP16>([](_FP16 val) -> _FP16 { return 0; }); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } else if (dim.getDataType() == ml::train::TensorDim::DataType::QINT8) { - apply_i([](uint8_t val) -> uint8_t { return 0; }); - } else if (dim.getDataType() == ml::train::TensorDim::DataType::QINT4) { - setValue(0); - } +Tensor Tensor::zoneout_mask(float zoneout) { + Tensor output(getDim()); + zoneout_mask(output, zoneout); + return output; } -std::vector Tensor::argmax() const { - NNTR_THROW_IF(!contiguous, std::invalid_argument) - << getName() << " is not contiguous, cannot get argmax."; - std::vector result; +void 
Tensor::zoneout_mask(Tensor &opposite, float zoneout) { + NNTR_THROW_IF(getDim() != opposite.getDim(), std::invalid_argument) + << "[Tensor::zoneout_mask] opposite dimension does not match"; - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - const float *data = getData(); - size_t batch_size = batch(); - size_t feature_len = dim.getFeatureLen(); + NNTR_THROW_IF(zoneout < 0 || zoneout > 1, std::invalid_argument) + << "[Tensor::zoneout_mask] Zoneout rate should be between 0 and 1"; - result.resize(batch_size); + // if the rate is zero, no change is needed + if (std::fpclassify(zoneout) == FP_ZERO) + return; - for (unsigned int b = 0; b < batch_size; b++) { - auto max_iter = - std::max_element(data + b * feature_len, data + (b + 1) * feature_len); - result[b] = std::distance(data, max_iter) - (b * feature_len); - } - } - if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - const _FP16 *data = getData<_FP16>(); - size_t batch_size = batch(); - size_t feature_len = dim.getFeatureLen(); + itensor->zoneout_mask(opposite, zoneout); +} - result.resize(batch_size); +std::vector Tensor::split(unsigned num_size, int axis) { + NNTR_THROW_IF(num_size == 0, std::invalid_argument) + << "num size cannot be zero"; - for (unsigned int b = 0; b < batch_size; b++) { - auto max_iter = - std::max_element(data + b * feature_len, data + (b + 1) * feature_len); - result[b] = std::distance(data, max_iter) - (b * feature_len); - } -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif + if (axis == -1) { + axis = 3; } - return result; -} + NNTR_THROW_IF(!(0 <= axis && axis < 4), std::invalid_argument) + << "cannot split axis of axis: " << axis; -int Tensor::erf_i() { - erf(*this); - return ML_ERROR_NONE; -} + NNTR_THROW_IF(getDim().getTensorDim(axis) % num_size != 0, + std::invalid_argument) + << "axis is not divisible by num_size, axis: " << axis + << " num size: " << num_size; -Tensor Tensor::erf() const { - Tensor t; - return erf(t); -} + std::vector sizes; + sizes.resize(num_size); -Tensor &Tensor::erf(Tensor &out) const { - if (dim.getDataType() == ml::train::TensorDim::DataType::FP32) { - auto f = [](float in) { return std::erf(in); }; - apply(f, out); - } else if (dim.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - auto f = [](_FP16 in) { - return static_cast<_FP16>(std::erf(static_cast(in))); - }; - apply<_FP16>(f, out); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - return out; -} + unsigned int sz = getDim().getTensorDim(axis) / num_size; + std::fill(sizes.begin(), sizes.end(), sz); -void Tensor::sin(Tensor &out, float alpha) { - if (size() != out.size()) - throw std::invalid_argument("Error: Size of out of Tensor::sin must match"); - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - if (!contiguous) { - auto f = [alpha](float val) -> float { return std::sin(alpha * val); }; - apply(f, out); - } else { - sine(size(), getData(), out.getData(), alpha); - } - } else - throw std::invalid_argument("Error: Tensor::sin supports fp32 case only."); + return split(sizes, axis); } -void Tensor::cos(Tensor &out, float alpha) { - if (size() != out.size()) - throw std::invalid_argument("Error: Size of out of Tensor::sin must match"); - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - if (!contiguous) { - auto f = [alpha](float val) -> float { return std::cos(alpha * val); }; - apply(f, out); - } else { - cosine(size(), getData(), out.getData(), alpha); 
- } - } else - throw std::invalid_argument("Error: Tensor::cos supports fp32 case only."); -} +std::vector Tensor::split(std::vector sizes, int axis) { + NNTR_THROW_IF(sizes.size() == 0, std::invalid_argument) + << "num size cannot be zero"; -void Tensor::inv_sqrt_i() { - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - if (!contiguous) { - apply_i([](float val) -> float { return 1 / std::sqrt(val); }); - } else { - inv_sqrt_inplace(this->size(), getData()); - } - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - if (!contiguous) { - apply_i<_FP16>([](_FP16 val) -> _FP16 { - return static_cast<_FP16>(1 / std::sqrt(static_cast(val))); - }); - } else { - inv_sqrt_inplace(this->size(), getData<_FP16>()); - } -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } else - throw std::invalid_argument( - "Error: Tensor::inv_sqrt_i only supports fp32, fp16"); + NNTR_THROW_IF(!(-1 <= axis && axis < 4), std::invalid_argument) + << "cannot split axis of axis: " << axis; + + NNTR_THROW_IF( + std::any_of(sizes.begin(), sizes.end(), [](size_t sz) { return !sz; }), + std::invalid_argument) + << "among given sizes at least one of size is 0"; + + return itensor->split(sizes, axis); } -float Tensor::l2norm() const { - NNTR_THROW_IF(!contiguous, std::invalid_argument) - << getName() << " is not contiguous, cannot get l2norm."; - float ret = 0; - unsigned int len = size(); - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - const float *data = getData(); - ret = snrm2(len, data, 1); - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - const _FP16 *data = getData<_FP16>(); - ret = snrm2(len, data, 1); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - return ret; +Tensor Tensor::concat(const std::vector &tensors, int axis) { + NNTR_THROW_IF(!(-1 <= axis && axis < 4), std::invalid_argument) + << "cannot split axis of axis: " << axis; + + NNTR_THROW_IF(tensors.empty(), std::invalid_argument) + << "given tensor vector is empty"; + + return itensor->concat(tensors, axis); } -float Tensor::max_abs() const { - NNTR_THROW_IF(!contiguous, std::invalid_argument) - << getName() << " is not contiguous, cannot get max_abs."; +Tensor Tensor::cat(const std::vector &tensors, int axis) { + Tensor input = tensors[0]; + return input.concat(tensors, axis); +} - unsigned int len = size(); - float ret = 0; - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - const float *data = getData(); +void Tensor::print(std::ostream &out) const { + printInstance(out, this); + itensor->print(out); +} - unsigned int idx = isamax(len, data, 1); - ret = *(data + idx); +void Tensor::putData() const { itensor->putData(); } - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - const _FP16 *data = getData<_FP16>(); +void Tensor::setData(const std::shared_ptr buf, size_t off, + bool init) { + itensor->setMemoryData(buf, off); - unsigned int idx = isamax(len, data, 1); - ret = *(data + idx); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif + if (buf && init) { + initialize(); } - return ret; } -Tensor &Tensor::normalization(Tensor &output) const { - if (output.empty()) - output = Tensor(dim); +const std::shared_ptr Tensor::getMemoryData() const { + return itensor->getMemoryData(); +} - output.copy(*this); - output.normalization_i(); +size_t Tensor::getOffset() const { return itensor->getOffset(); } 
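/*
 * Illustrative usage sketch (editor's note, not part of this patch): how the
 * delegating sum/split/cat entry points above are expected to compose once the
 * refactored Tensor lands. The shapes, the NCHW layout, the default alpha of
 * the multi-axis sum, and the use of setValue() for initialization are
 * assumptions made only for this example.
 */
#include <tensor.h>
#include <vector>

void reduction_and_split_example() {
  nntrainer::Tensor t(2, 3, 4, 5); // batch=2, channel=3, height=4, width=5
  t.setValue(1.0f);                // fill every element with 1

  // Multi-axis reduction: adjacent axes are merged via checkContinuous()/
  // mergeAxis() before each per-axis sum is delegated to the itensor backend.
  nntrainer::Tensor reduced = t.sum({2, 3}); // shape 2x3x1x1, each value 20

  // Split the channel axis into three equal slices, then concatenate them
  // back along the same axis; both calls forward to itensor->split()/concat().
  std::vector<nntrainer::Tensor> parts = t.split(3, 1);
  nntrainer::Tensor joined = nntrainer::Tensor::cat(parts, 1);
}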
- return output; +void Tensor::copy(const Tensor &from) { + /// @todo enable copy to non-contiguous tensor + if (!itensor->getContiguous()) { + throw std::runtime_error("Cannot copy non-contiguous tensor"); + } + + if (from.size() != 0 && size() == from.size() && + getDataType() == from.getDataType()) { + // if tensor size and data type match, copy data + itensor->copy(from); + } else { + Tensor t = Tensor(from.getDim(), from.getData()); + swap(t, *this); + } } -void Tensor::normalization_i() { - NNTR_THROW_IF(!contiguous, std::invalid_argument) - << getName() << " is not contiguous, cannot do normalization."; +void Tensor::copyData(const Tensor &from) { itensor->copyData(from); } - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - const float *data = getData(); - - auto bounds = std::minmax_element(data, data + size()); - const float min = *bounds.first; - const float max = *bounds.second; - - if (max == min) { - Tensor tmp = *this; - this->subtract_i(tmp); - } else { - this->subtract_i(min); - this->divide_i(max - min); - } - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - const _FP16 *data = getData<_FP16>(); - - auto bounds = std::minmax_element(data, data + size()); - const _FP16 min = *bounds.first; - const _FP16 max = *bounds.second; - - if (max == min) { - Tensor tmp = *this; - this->subtract_i(tmp); - } else { - this->subtract_i(min); - this->divide_i(max - min); - } -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif +void Tensor::copy_with_stride(const Tensor &from) { + if (itensor->getDim() == from.getDim()) { + // If the tensor dim matches, copy the data. This also applies to + // uncontigous tensor. + itensor->copy_with_stride(from, *this); + } else { + // replace with a new tensor that has the same data as the given tensor + Tensor t = Tensor(from.getDim(), true); + itensor->copy_with_stride(from, t); + swap(t, *this); } } -LazyTensor Tensor::chain() const { return LazyTensor(*this); } +Tensor Tensor::getBatchSlice(size_t offset, unsigned int size) const { + TensorDim dim_ = getDim(); + dim_.batch(size); -Tensor &Tensor::standardization(Tensor &output) const { - if (output.empty()) - output = Tensor(dim); + return getSharedDataTensor(dim_, offset * this->getDim().getFeatureLen(), + true, ""); +} +Tensor Tensor::clone() const { + Tensor output(getName(), getFormat(), getDataType()); output.copy(*this); - output.standardization_i(); - return output; } -void Tensor::standardization_i() { - Tensor mean_by_batch = this->sum_by_batch(); - mean_by_batch.divide_i(dim.getFeatureLen()); +void Tensor::save(std::ostream &file) { + NNTR_THROW_IF(!getContiguous(), std::invalid_argument) + << getName() << " is not contiguous, cannot save."; - this->subtract_i(mean_by_batch); - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - Tensor std_dev_by_batch(dim.batch(), 1, 1, 1, dim.getFormat(), - dim.getDataType()); - std_dev_by_batch.setZero(); - float *std_dev = std_dev_by_batch.getData(); + std::streamsize sz = static_cast(bytes()); + NNTR_THROW_IF(sz < 0, std::invalid_argument) + << "save size: " << bytes() + << " is too big. 
It cannot be represented by std::streamsize"; - for (unsigned int k = 0; k < dim.batch(); ++k) { - Tensor sub_this = this->getBatchSlice(k, 1); - std_dev[k] = sub_this.l2norm(); - } + checkedWrite(file, getData(), sz, "[Tensor::save] operation failed"); + putData(); +} - std_dev_by_batch.divide_i(dim.getFeatureLen()); - this->divide_i(std_dev_by_batch); - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - Tensor std_dev_by_batch(dim.batch(), 1, 1, 1, dim.getFormat(), - dim.getDataType()); - std_dev_by_batch.setZero(); - _FP16 *std_dev = std_dev_by_batch.getData<_FP16>(); +void Tensor::read(std::ifstream &file) { + NNTR_THROW_IF(!getContiguous(), std::invalid_argument) + << getName() << " is not contiguous, cannot read."; - for (unsigned int k = 0; k < dim.batch(); ++k) { - Tensor sub_this = this->getBatchSlice(k, 1); - std_dev[k] = static_cast<_FP16>(sub_this.l2norm()); - } + std::streamsize sz = static_cast(bytes()); - std_dev_by_batch.divide_i(dim.getFeatureLen()); - this->divide_i(std_dev_by_batch); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } + NNTR_THROW_IF(sz < 0, std::invalid_argument) + << "read size: " << bytes() + << " is too big. It cannot be represented by std::streamsize"; + + checkedRead(file, getData(), sz, "[Tensor::read] operation failed"); + putData(); } -Tensor::BroadcastInfo Tensor::computeBroadcastInfo(const Tensor &m) const { - if (m.size() > this->size()) - throw exception::not_supported("broadcasting *this is not supported"); +std::vector Tensor::argmax() const { + NNTR_THROW_IF(!getContiguous(), std::invalid_argument) + << getName() << " is not contiguous, cannot get argmax."; + return itensor->argmax(); +} - const TensorDim m_dim = m.getDim(); +float Tensor::max_abs() const { + NNTR_THROW_IF(!getContiguous(), std::invalid_argument) + << getName() << " is not contiguous, cannot get max_abs."; + return itensor->max_abs(); +} - BroadcastInfo e; - e.tensor_type = getTensorType(); +float Tensor::maxValue() const { return itensor->maxValue(); } - uint continuity[4] = {0, 1, 2, 3}; - if (getFormat() == Tformat::NHWC) { - continuity[1] = 2; - continuity[2] = 3; - continuity[3] = 1; - } +float Tensor::minValue() const { return itensor->minValue(); } - /// checking if given Tensor's can be broadcasted - for (unsigned int i = 0; i < TensorDim::MAXDIM; ++i) { - if (dim.getTensorDim(continuity[i]) == m_dim.getTensorDim(continuity[i])) { - e.strides[i] = m.strides[i]; - continue; - } +Tensor Tensor::transpose(const std::string &direction) const { + Tensor output(getDim()); + transpose(direction, output); + return output; +} - /// If given dimension is 1, it could be reused, the stride remaining 0 - /// Need to check if dim[i] == 1 && m_dim[i] == 1 first though - /// If so, strides should not change - if (m_dim.getTensorDim(continuity[i]) == 1) { - continue; - } +Tensor &Tensor::transpose(const std::string &direction, Tensor &output) const { + NNTR_THROW_IF(!getContiguous(), std::invalid_argument) + << getName() << " is not contiguous. 
Cannot transpose."; - std::stringstream ss; - ss << "[computeBroadcastInfo] broadcasting only allowed for " - "dimension value of 1 \n" - << "this: " << dim << "target: " << m_dim; - throw std::invalid_argument(ss.str().c_str()); + if (output.getData() == getData()) { + Tensor result = clone(); + return result.transpose(direction, output); } - /// calculate inner loop size - e.buffer_size = 1; - e.buffer_axis = -1; - e.strides[3] = m.strides[3]; - - /// initiate buffer info with matching dimension strategy - for (int axis = 3; axis >= 0; --axis) { - if (dim.getTensorDim(continuity[axis]) != - m_dim.getTensorDim(continuity[axis])) { - e.buffer_axis = axis; - break; - } + itensor->transpose(direction, output); - e.buffer_size *= dim.getTensorDim(continuity[axis]); - } + return output; +} - /// check strategy that uses consecutive ones - if (m_dim.getTensorDim(continuity[3]) == 1) { - unsigned int inner_loop_size = 1; - int axis; - for (axis = 3; axis >= 0; --axis) { - if (m_dim.getTensorDim(continuity[axis]) != 1) { - break; - } +void Tensor::reshape(const TensorDim &d) { itensor->reshape(d); } - inner_loop_size *= dim.getTensorDim(continuity[axis]); - } +void Tensor::fill(const Tensor &from, bool allocate) { + if (allocate && this->empty()) { + this->copy(from); + return; + } - /// if consecutive-one strategy has bigger chunk size, replace the - /// information - if (inner_loop_size > e.buffer_size) { - e.buffer_axis = axis; - e.buffer_size = inner_loop_size; - e.strides[3] = 0; - } + if (!from.getContiguous() || !getContiguous()) { + /// @todo enable this if needed + throw nntrainer::exception::not_supported( + "[Tensor::fill] non-contiguous tensors are not supported"); } - return e; -} - -Tensor Tensor::rotate_180(Tensor in) { - Tensor output(in.getDim()); - if (in.getDataType() == ml::train::TensorDim::DataType::FP32) { - output.setZero(); - for (unsigned int i = 0; i < in.batch(); ++i) { - for (unsigned int j = 0; j < in.channel(); ++j) { - for (unsigned int k = 0; k < in.height(); ++k) { - for (unsigned int l = 0; l < in.width(); ++l) { - output.setValue(i, j, k, l, - in.getValue(i, j, (in.height() - k - 1), - (in.width() - l - 1))); - } - } - } - } - } else if (in.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - output.setZero(); - for (unsigned int i = 0; i < in.batch(); ++i) { - for (unsigned int j = 0; j < in.channel(); ++j) { - for (unsigned int k = 0; k < in.height(); ++k) { - for (unsigned int l = 0; l < in.width(); ++l) { - output.setValue(i, j, k, l, - in.getValue<_FP16>(i, j, (in.height() - k - 1), - (in.width() - l - 1))); - } - } - } - } -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif + if (getDim() != from.getDim()) { + throw std::invalid_argument("[Tensor::fill] dimension must be the same"); } - return output; + + if (getStrides() != from.getStrides()) { + /// @todo length does not represent buffer size, there should be way to + /// get the buffer size + throw std::invalid_argument("[Tensor::fill] buffer size must be the same"); + } + + copyData(from); } -uint8_t Tensor::encode_qint(uint8_t high, uint8_t low) const { - return (high << 4) | (low & 0x0f); +TensorDim Tensor::getDim() const { return itensor->getDim(); } + +TensorDim::TensorType Tensor::getTensorType() const { + return itensor->getTensorType(); }; -uint8_t Tensor::decode_qint(uint8_t val, bool isHigh) const { - if (isHigh) { - val = val >> 4; - } else { - val = val << 4; - val = val >> 4; - } +Initializer Tensor::getInitializer() const { return 
itensor->getInitializer(); }
+
+TensorDim::Format Tensor::getFormat() const { return itensor->getFormat(); }
+
+Tdatatype Tensor::getDataType() const { return itensor->getDataType(); }
-  return val;
+void Tensor::updateBatch(unsigned int batch) { itensor->updateBatch(batch); }
+
+const bool Tensor::getContiguous() const noexcept {
+  return itensor->getContiguous();
 }
-std::vector Tensor::getScaleFactors() const {
-  return scale_factors_fp32;
+const std::array<size_t, TensorDim::MAXDIM>
+Tensor::getStrides() const noexcept {
+  return itensor->getStrides();
 }
-void Tensor::setZeroPoints(std::vector zp) {
-  if (zp.empty()) {
-    throw std::invalid_argument("Error: invalid parameter");
+bool Tensor::checkContinuous(unsigned int np1, unsigned int np2) const {
+  if (np1 > 3 || np2 > 3) {
+    throw std::invalid_argument(
+      "Error: Input value must be within the range of 0 to 3.");
+  }
+
+  if (getFormat() == Tformat::NCHW) {
+    if (np1 + 1 == np2)
+      return true;
+  } else {
+    std::vector<unsigned int> continuous_order_nhwc = {0, 3, 1, 2};
+    if (continuous_order_nhwc[np2] == continuous_order_nhwc[np1] + 1)
+      return true;
 }
-  zero_points = zp;
+  return false;
 }
-std::vector Tensor::getZeroPoints() const { return zero_points; }
+void Tensor::setName(const std::string &name_) { itensor->setName(name_); }
-void Tensor::dequantize(Tensor &output, unsigned int axis) const {
-  if (getDataType() == Tdatatype::FP32 || getDataType() == Tdatatype::FP16) {
-    throw std::invalid_argument("Error: Tensor cannot be dequantized");
-  }
+const std::string &Tensor::getName() const { return itensor->getName(); }
-  if (output.getDataType() == Tdatatype::QINT8 ||
-      output.getDataType() == Tdatatype::QINT4) {
-    throw std::invalid_argument("Error: Target datatype is quantized type");
-  }
+size_t Tensor::getIndex(unsigned int b, unsigned int c, unsigned int h,
+                        unsigned int w) const noexcept {
+  return itensor->getIndex(b, c, h, w);
+}
-  if (getFormat() != output.getFormat())
-    throw std::invalid_argument("Error: TensorType do not match");
+size_t Tensor::size() const { return itensor->size(); }
-  if (batch() != output.batch() || channel() != output.channel() ||
-      width() != output.width() || height() != output.height())
-    throw std::invalid_argument("Error: TensorDim do not match");
+bool Tensor::empty() const { return itensor->empty(); }
-  if (output.getDataType() == Tdatatype::FP32 && scale_factors_fp32.empty()) {
-    throw std::invalid_argument("Error: No scale factors");
-  }
+size_t Tensor::bytes() const { return itensor->bytes(); }
-#ifdef ENABLE_FP16
-  if (output.getDataType() == Tdatatype::FP16 && scale_factors_fp16.empty()) {
-    throw std::invalid_argument("Error: No scale factors");
-  }
-#endif
+size_t Tensor::batch() const { return itensor->batch(); }
-  if (axis == 0 && zero_points.size() != batch()) {
-    throw std::invalid_argument("Error: output axis do not match ");
-  }
+size_t Tensor::channel() const { return itensor->channel(); }
-  if (axis == 1 && zero_points.size() != channel()) {
-    throw std::invalid_argument("Error: output axis do not match ");
-  }
+size_t Tensor::height() const { return itensor->height(); }
-  if (axis == 2 && zero_points.size() != height()) {
-    throw std::invalid_argument("Error: output axis do not match ");
-  }
-  if (axis == 3 && zero_points.size() != width()) {
-    throw std::invalid_argument("Error: output axis do not match ");
-  }
-  size_t b = (axis == 0) ? zero_points.size() : 1;
-  size_t c = (axis == 1) ? zero_points.size() : 1;
-  size_t h = (axis == 2) ? zero_points.size() : 1;
-  size_t w = (axis == 3) ?
zero_points.size() : 1; +size_t Tensor::width() const { return itensor->width(); } - output.copyData(*this); +void Tensor::mergeAxis(unsigned int axis1, unsigned int axis2) { + NNTR_THROW_IF(!getContiguous(), std::invalid_argument) + << getName() << " is not contiguous, cannot merge axis"; - if (output.getDataType() == Tdatatype::FP16) { -#ifdef ENABLE_FP16 - std::vector<_FP16> zero_points_16(zero_points.begin(), zero_points.end()); - Tensor zero_points_fp16_tensor( - {{b, c, h, w}, {getFormat(), Tdatatype::FP16}}, zero_points_16.data()); + if (axis2 != axis1 + 1) + if (!checkContinuous(axis1, axis2)) + throw std::invalid_argument("axis2 must be axis1 + 1 for merging."); - Tensor scale_factors_fp16_tensor( - {{b, c, h, w}, {getFormat(), Tdatatype::FP16}}, - scale_factors_fp16.data()); + itensor->mergeAxis(axis1, axis2); +} - output.subtract_i(zero_points_fp16_tensor); - output.multiply_i(scale_factors_fp16_tensor); +void Tensor::createSharedDataTensor(const Tensor &src, Tensor &dest, + size_t offset) const { + itensor->createSharedDataTensor(src.itensor.get(), dest.itensor.get(), + offset); +} -#else - throw std::invalid_argument("enble-fp16 is not set"); -#endif - } else if (output.getDataType() == Tdatatype::FP32) { - std::vector zero_points_32(zero_points.begin(), zero_points.end()); - Tensor zero_points_fp32_tensor( - {{b, c, h, w}, {getFormat(), Tdatatype::FP32}}, zero_points_32.data()); - Tensor scale_factors_fp32_tensor( - {{b, c, h, w}, {getFormat(), Tdatatype::FP32}}, - scale_factors_fp32.data()); - - output.subtract_i(zero_points_fp32_tensor); - output.multiply_i(scale_factors_fp32_tensor); - } +Tensor Tensor::getSharedDataTensor(const TensorDim dim_, size_t offset, + bool reset_stride, + const std::string &name_) const { + Tensor ret = *this; + itensor->getSharedDataTensor(dim_, offset, reset_stride, name_, + ret.itensor.get()); + return ret; +} - return; +void Tensor::setTensorVar(TensorDim d, void *buf, size_t offset) { + itensor->setTensorVar(d, buf, offset); } -// namespace nntrainer +std::ostream &operator<<(std::ostream &out, Tensor const &input) { + input.print(out); + return out; +} -} /* namespace nntrainer */ +} // namespace nntrainer diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h index 4904e9cfba..b5df3ab9bd 100644 --- a/nntrainer/tensor/tensor.h +++ b/nntrainer/tensor/tensor.h @@ -1,52 +1,18 @@ +// SPDX-License-Identifier: Apache-2.0 /** - * Copyright (C) 2019 Samsung Electronics Co., Ltd. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * * @file tensor.h - * @date 04 December 2019 - * @brief This is Tensor class for calculation + * @date 01 December 2023 + * @brief This is a Tensor class * @see https://github.com/nnstreamer/nntrainer * @author Jijoong Moon + * @author Donghyeon Jeong * @bug No known bugs except for NYI items - * - * @todo deprecate new tensor allocation for out of place operations. 
*/ #ifndef __TENSOR_H__ #define __TENSOR_H__ #ifdef __cplusplus -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#ifdef DEBUG -#define EXCEPT_WHEN_DEBUG -#else -#define EXCEPT_WHEN_DEBUG noexcept -#endif - #define MAKE_SHARED_TENSOR(...) std::make_shared(__VA_ARGS__) #define CREATE_IF_EMPTY_DIMS(tensor, ...) \ @@ -55,51 +21,32 @@ tensor = Tensor(__VA_ARGS__); \ } while (0); -namespace nntrainer { +#include + +#include +#include +#include +#include + +#ifdef ENABLE_FP16 +#include +#endif -using TensorDim = ml::train::TensorDim; -using Tformat = ml::train::TensorDim::Format; -using Tdatatype = ml::train::TensorDim::DataType; -using TStorageOrder = ml::train::TensorDim::StorageOrder; +namespace nntrainer { class LazyTensor; -class SrcSharedTensor; /** - * @class Tensor Class for Calculation - * @brief Tensor Class for Calculation + * @class Tensor Class + * @brief Tensor Class */ class Tensor { public: - /** - * @brief Enumeration of Weight Initialization Type - * @todo support intialization from file - */ - enum class Initializer { - ZEROS, /** Zero initialization */ - ONES, /** One initialization */ - LECUN_NORMAL, /** LeCun normal initialization */ - LECUN_UNIFORM, /** uniform initialization */ - XAVIER_NORMAL, /** Xavier normal initialization */ - XAVIER_UNIFORM, /** Xavier uniform initialization */ - HE_NORMAL, /** He normal initialization */ - HE_UNIFORM, /** He uniform initialization */ - NONE /** No initialization */ - }; - /** * @brief Basic Constructor of Tensor */ Tensor(std::string name_ = "", Tformat fm = Tformat::NCHW, - Tdatatype d_type = Tdatatype::FP32) : - dim(TensorDim(fm, d_type)), - strides(dim.computeStrides()), - contiguous(true), - initializer(Initializer::NONE), - name(name_), - data(nullptr), - offset(0), - src_tensor() {} + Tdatatype d_type = Tdatatype::FP32); /** * @brief Constructor of Tensor with dimension, possibly lazily @@ -125,6 +72,8 @@ class Tensor { * @param[in] d1 Channel * @param[in] d2 Height * @param[in] d3 Width + * @param[in] fm Tensor Format + * @param[in] d_type Tensor Data Type */ Tensor(size_t d0, size_t d1, size_t d2, size_t d3, Tformat fm = Tformat::NCHW, Tdatatype d_type = Tdatatype::FP32) : @@ -135,6 +84,8 @@ class Tensor { * @param[in] d1 Channel * @param[in] d2 Height * @param[in] d3 Width + * @param[in] fm Tensor Format + * @param[in] d_type Tensor Data Type */ Tensor(size_t d1, size_t d2, size_t d3, Tformat fm = Tformat::NCHW, Tdatatype d_type = Tdatatype::FP32) : @@ -144,6 +95,8 @@ class Tensor { * @brief Constructor of Tensor with batch size one and d1 size one * @param[in] d2 Height (NCHW) or Width (NHWC) * @param[in] d3 Width (NCHW) or Channel (NHWC) + * @param[in] fm Tensor Format + * @param[in] d_type Tensor Data Type */ Tensor(size_t d2, size_t d3, Tformat fm = Tformat::NCHW, Tdatatype d_type = Tdatatype::FP32) : @@ -152,6 +105,8 @@ class Tensor { /** * @brief Constructor of Tensor with just Width or Channel * @param[in] d3 Width (NCHW) or Channel (NHWC) + * @param[in] fm Tensor Format + * @param[in] d_type Tensor Data Type */ explicit Tensor(size_t d3, Tformat fm = Tformat::NCHW, Tdatatype d_type = Tdatatype::FP32) : @@ -163,6 +118,7 @@ class Tensor { * @param[in] d1 Channel (NCHW) or Height (NHWC) * @param[in] d2 Height (NCHW) or Width (NHWC) * @param[in] d3 Width (NCHW) or Channel (NHWC) + * @param[in] t_type Tensor Type */ Tensor(size_t d0, size_t d1, size_t d2, size_t d3, ml::train::TensorDim::TensorType t_type) : @@ -173,6 +129,7 @@ class 
Tensor { * @param[in] d1 Channel * @param[in] d2 Height * @param[in] d3 Width + * @param[in] t_type Tensor Type */ Tensor(size_t d1, size_t d2, size_t d3, ml::train::TensorDim::TensorType t_type) : @@ -182,6 +139,7 @@ class Tensor { * @brief Constructor of Tensor with batch size one and d1 size one * @param[in] d2 Height (NCHW) or Width (NHWC) * @param[in] d3 Width (NCHW) or Channel (NHWC) + * @param[in] t_type Tensor Type */ Tensor(size_t d2, size_t d3, ml::train::TensorDim::TensorType t_type) : Tensor(1, (t_type.format == Tformat::NCHW) ? 1 : d3, @@ -190,6 +148,7 @@ class Tensor { /** * @brief Constructor of Tensor with just Width or Channel * @param[in] d3 Width (NCHW) or Channel (NHWC) + * @param[in] t_type Tensor Type */ explicit Tensor(size_t d3, ml::train::TensorDim::TensorType t_type) : Tensor(1, (t_type.format == Tformat::NCHW) ? 1 : d3, 1, @@ -198,62 +157,19 @@ class Tensor { /** * @brief Constructor of Tensor * @param[in] d data for the Tensor. It needs to set format properly. + * @param[in] t_type Tensor Type */ - Tensor(std::vector>>> const &d, ml::train::TensorDim::TensorType t_type) { - if (d.empty() || d[0].empty() || d[0][0].empty() || d[0][0][0].empty()) { - throw std::out_of_range( - "[Tensor] trying to initialize Tensor from empty vector"); - } - // if fm == Tformat::NCHW, then dim[0] == batch , dim[1] == channel, dim[2] - // == height, dim[3] == width. and if fm == Tformat::NHWC, dim[0] == batch, - // dim[1] == height, dim[2] == width, dim[3] == channel - dim.setTensorDim(0, d.size()); - if (t_type.format == Tformat::NCHW) { - dim.setTensorDim(1, d[0].size()); - dim.setTensorDim(2, d[0][0].size()); - dim.setTensorDim(3, d[0][0][0].size()); - } else { - dim.setTensorDim(2, d[0].size()); - dim.setTensorDim(3, d[0][0].size()); - dim.setTensorDim(1, d[0][0][0].size()); - } - - setTensorType(t_type); - - strides = dim.computeStrides(); - - MemoryData *mem_data = - new MemoryData((void *)(new float[dim.getDataLen()]())); - data = std::shared_ptr(mem_data, [](MemoryData *mem_data) { - delete[] mem_data->getAddr(); - }); - offset = 0; - contiguous = true; - initializer = Initializer::NONE; - // if fm == Tformat::NCHW, then dim[0] == batch , dim[1] == channel, dim[2] - // == height, dim[3] == width. and if fm == Tformat::NHWC, dim[0] == batch, - // dim[1] == height, dim[2] == width, dim[3] == channel - if (t_type.format == Tformat::NCHW) { - for (unsigned int i = 0; i < batch(); ++i) - for (unsigned int j = 0; j < channel(); ++j) - for (unsigned int k = 0; k < height(); ++k) - for (unsigned int l = 0; l < width(); ++l) - this->setValue(i, j, k, l, d[i][j][k][l]); - } else { - for (unsigned int i = 0; i < batch(); ++i) - for (unsigned int j = 0; j < height(); ++j) - for (unsigned int k = 0; k < width(); ++k) - for (unsigned int l = 0; l < channel(); ++l) - this->setValue(i, l, j, k, d[i][j][k][l]); - } - }; + itensor = std::shared_ptr(new FloatTensor(d, t_type.format), + std::default_delete()); + } /** * @brief Constructor of Tensor * @note This constructor copies vector again. needs refactoring * @param[in] d data for the Tensor. It needs to set format properly. + * @param[in] t_type Tensor Type */ Tensor(std::vector>> const &d, ml::train::TensorDim::TensorType t_type) : @@ -263,6 +179,7 @@ class Tensor { * @brief Constructor of Tensor * @note This constructor copies vector again. 
needs refactoring * @param[in] d data for the Tensor with batch size one + * @param[in] t_type Tensor Type */ Tensor(std::vector> const &d, ml::train::TensorDim::TensorType t_type) : @@ -273,63 +190,19 @@ class Tensor { * @brief Constructor of Tensor * @note This constructor copies vector again. needs refactoring * @param[in] d data for the Tensor with batch size one + * @param[in] t_type Tensor Type */ Tensor(std::vector>>> const &d, ml::train::TensorDim::TensorType t_type) { - - if (d.empty() || d[0].empty() || d[0][0].empty() || d[0][0][0].empty()) { - throw std::out_of_range( - "[Tensor] trying to initialize Tensor from empty vector"); - } - - dim.setTensorDim(0, d.size()); - if (t_type.format == Tformat::NCHW) { - dim.setTensorDim(1, d[0].size()); - dim.setTensorDim(2, d[0][0].size()); - dim.setTensorDim(3, d[0][0][0].size()); - } else { - dim.setTensorDim(2, d[0].size()); - dim.setTensorDim(3, d[0][0].size()); - dim.setTensorDim(1, d[0][0][0].size()); - } - - setTensorType(t_type); - - strides = dim.computeStrides(); - - MemoryData *mem_data = - new MemoryData((void *)(new _FP16[dim.getDataLen()]())); - data = std::shared_ptr(mem_data, [](MemoryData *mem_data) { - delete[] mem_data->getAddr<_FP16>(); - }); - offset = 0; - contiguous = true; - initializer = Initializer::NONE; - - setDataType(Tdatatype::FP16); - - // if fm == Tformat::NCHW, then dim[0] == batch , dim[1] == channel, dim[2] - // == height, dim[3] == width. and if fm == Tformat::NHWC, dim[0] == batch, - // dim[1] == height, dim[2] == width, dim[3] == channel - if (t_type.format == Tformat::NCHW) { - for (unsigned int i = 0; i < batch(); ++i) - for (unsigned int j = 0; j < channel(); ++j) - for (unsigned int k = 0; k < height(); ++k) - for (unsigned int l = 0; l < width(); ++l) - this->setValue(i, j, k, l, d[i][j][k][l]); - } else { - for (unsigned int i = 0; i < batch(); ++i) - for (unsigned int j = 0; j < height(); ++j) - for (unsigned int k = 0; k < width(); ++k) - for (unsigned int l = 0; l < channel(); ++l) - this->setValue(i, l, j, k, d[i][j][k][l]); - } - }; + itensor = std::shared_ptr(new HalfTensor(d, t_type.format), + std::default_delete()); + } /** * @brief Constructor of Tensor * @note This constructor copies vector again. needs refactoring * @param[in] d data for the Tensor. It needs to set format properly. + * @param[in] t_type Tensor Type */ Tensor(std::vector>> const &d, ml::train::TensorDim::TensorType t_type) : @@ -339,101 +212,23 @@ class Tensor { * @brief Constructor of Tensor * @note This constructor copies vector again. needs refactoring * @param[in] d data for the Tensor with batch size one + * @param[in] t_type Tensor Type */ Tensor(std::vector> const &d, ml::train::TensorDim::TensorType t_type) : Tensor(std::vector::type>{d}, t_type){}; - #endif /** - * @brief Constructor of Tensor - * @param[in] d data for the Tensor. It needs to set format properly. - * @param[in] t_type Tensor type. - */ - Tensor(std::vector>>> const &d, - ml::train::TensorDim::TensorType t_type) { - if (d.empty() || d[0].empty() || d[0][0].empty() || d[0][0][0].empty()) { - throw std::out_of_range( - "[Tensor] trying to initialize Tensor from empty vector"); - } - - if (t_type.data_type != Tdatatype::QINT8 && - t_type.data_type != Tdatatype::QINT4) { - throw std::out_of_range( - "[Tensor] TensorType do not match with input data type"); - } - - // if fm == Tformat::NCHW, then dim[0] == batch , dim[1] == channel, dim[2] - // == height, dim[3] == width. 
and if fm == Tformat::NHWC, dim[0] == batch, - // dim[1] == height, dim[2] == width, dim[3] == channel - dim.setTensorDim(0, d.size()); - if (t_type.format == Tformat::NCHW) { - dim.setTensorDim(1, d[0].size()); - dim.setTensorDim(2, d[0][0].size()); - dim.setTensorDim(3, d[0][0][0].size()); - } else { - dim.setTensorDim(2, d[0].size()); - dim.setTensorDim(3, d[0][0].size()); - dim.setTensorDim(1, d[0][0][0].size()); - } - - setTensorType(t_type); - - strides = dim.computeStrides(); - - MemoryData *mem_data = - (t_type.data_type == Tdatatype::QINT8) - ? new MemoryData((void *)(new uint8_t[dim.getDataLen()]())) - : new MemoryData((void *)(new uint8_t[(dim.getDataLen() + 1) / 2]())); - data = std::shared_ptr(mem_data, [](MemoryData *mem_data) { - delete[] mem_data->getAddr(); - }); - offset = 0; - contiguous = true; - initializer = Initializer::NONE; - - // if fm == Tformat::NCHW, then dim[0] == batch , dim[1] == channel, dim[2] - // == height, dim[3] == width. and if fm == Tformat::NHWC, dim[0] == batch, - // dim[1] == height, dim[2] == width, dim[3] == channel - if (t_type.format == Tformat::NCHW) { - for (unsigned int i = 0; i < batch(); ++i) - for (unsigned int j = 0; j < channel(); ++j) - for (unsigned int k = 0; k < height(); ++k) - for (unsigned int l = 0; l < width(); ++l) - this->setValue(i, j, k, l, d[i][j][k][l]); - } else { - for (unsigned int i = 0; i < batch(); ++i) - for (unsigned int j = 0; j < height(); ++j) - for (unsigned int k = 0; k < width(); ++k) - for (unsigned int l = 0; l < channel(); ++l) - this->setValue(i, l, j, k, d[i][j][k][l]); - } - }; - - /** - * @brief Constructor of Tensor - * @note This constructor copies vector again. needs refactoring - * @param[in] d data for the Tensor. It needs to set format properly. - */ - Tensor(std::vector>> const &d, - ml::train::TensorDim::TensorType t_type) : - Tensor(std::vector::type>{d}, t_type){}; - - /** - * @brief Constructor of Tensor - * @note This constructor copies vector again. needs refactoring - * @param[in] d data for the Tensor with batch size one + * @brief Basic Destructor */ - Tensor(std::vector> const &d, - ml::train::TensorDim::TensorType t_type) : - Tensor(std::vector::type>{d}, t_type){}; + ~Tensor() = default; /** * @brief Copy constructor of Tensor. * @param[in] Tensor & */ - Tensor(const Tensor &rhs) = default; + Tensor(const Tensor &rhs); /** * @brief Move constructor of Tensor. @@ -445,7 +240,7 @@ class Tensor { * @brief Copy assignment operator. * @param[in] rhs Tensor to be copied. */ - Tensor &operator=(const Tensor &rhs) = default; + Tensor &operator=(const Tensor &rhs); /** * @brief Move assignment operator. 
@@ -453,16 +248,28 @@ class Tensor { */ Tensor &operator=(Tensor &&rhs) noexcept = default; + /** + * @brief Comparison operator overload + * @param[in] rhs Tensor to be compared with + */ + bool operator==(const Tensor &rhs) const; + + /** + * @brief Comparison operator overload + * @param[in] rhs Tensor to be compared with + */ + bool operator!=(const Tensor &rhs) const { return !(*this == rhs); } + /** * @brief Construct a new Tensor object from a buffer * This will not copy buffer to a new tensor but directly uses it * - * @param buf buffer - * @param bytes buffer size in bytes - * @param d tensor dim - * @param offset offset to be used from current - * @return Tensor object - * @throws std::invalid_argument if buf is null + * @param[in] buf buffer + * @param[in] bytes buffer size in bytes + * @param[in] d tensor dim + * @param[in] offset offset to be used from current + * @return Tensor object + * @throws std::invalid_argument if buf is null */ template static Tensor Map(T *buf, unsigned int bytes, const TensorDim &d, @@ -477,117 +284,92 @@ class Tensor { "Creating shared tensor of size bigger than tensor memory."); } - Tensor tmp; - tmp.dim = d; - tmp.strides = d.computeStrides(); - /// Tensor does not own the memory - tmp.data = std::shared_ptr(new MemoryData((void *)buf), - std::default_delete()); - tmp.offset = offset; - - return tmp; + Tensor output("", d.getFormat(), d.getDataType()); + output.setTensorVar(d, buf, offset); + return output; }; - friend void swap(Tensor &lhs, Tensor &rhs) noexcept { - std::swap(lhs.dim, rhs.dim); - std::swap(lhs.strides, rhs.strides); - std::swap(lhs.contiguous, rhs.contiguous); - std::swap(lhs.initializer, rhs.initializer); - std::swap(lhs.data, rhs.data); - std::swap(lhs.name, rhs.name); - } - /** - * @brief Comparison operator overload - * @param[in] rhs Tensor to be compared with + * @brief Allocate memory for this tensor */ - bool operator==(const Tensor &rhs) const; + void allocate(); /** - * @brief Comparison operator overload - * @param[in] rhs Tensor to be compared with + * @brief Deallocate memory for this tensor + * @note This will not necessary free the memory as tensors share memory */ - bool operator!=(const Tensor &rhs) const { return !(*this == rhs); } + void deallocate(); /** - * @brief Allocate memory for this tensor + * @brief Check if the tensor has memory allocated/assigned/associated */ - void allocate(); + bool isAllocated(); /** - * @brief Deallocate memory for this tensor - * @note This will not necessary free the memory as tensors share memory + * @brief return Data pointer of Tensor + * @retval template T pointer */ - void deallocate() { - data = nullptr; - offset = 0; + template T *getData() const { + return (T *)itensor->getData(); } /** - * @brief Check if the tensor has memory allocated/assigned/associated + * @brief return Data pointer of Tensor + * @retval template T pointer */ - bool isAllocated() const { return data != nullptr; } + template T *getData(size_t idx) const { + return (T *)itensor->getData(idx); + } /** - * @brief return value at specific location - * @param[in] batch batch location - * @param[in] c channel location - * @param[in] h height location - * @param[in] w width location + * @brief i data index + * @retval template T pointer (address of ith data) */ - template - const T &getValue(unsigned int batch, unsigned int c, unsigned int h, - unsigned int w) const noexcept { - return getValue(getIndex(batch, c, h, w)); + template T *getAddress(unsigned int i) { + return (T *)itensor->getAddress(i); } - 
template - T &getValue(unsigned int batch, unsigned int c, unsigned int h, - unsigned int w) noexcept { - return getValue(getIndex(batch, c, h, w)); + /** + * @brief i data index + * @retval template T pointer (address of ith data) + */ + template const T *getAddress(unsigned int i) const { + return (T *)itensor->getAddress(i); } /** - * @brief return value at specific location - * @param[in] idx location + * @brief get address of n-d data */ template - const T &getValue(unsigned int idx) const noexcept { - if (getDataType() == Tdatatype::QINT4) { - return getData()[idx / 2]; - } - return getData()[idx]; + T *getAddress(unsigned int b, unsigned int c, unsigned int h, + unsigned int w) { + return getAddress(getIndex(b, c, h, w)); } /** - * @brief return value at specific location - * @param[in] idx location + * @brief get address of n-d data */ - template T &getValue(unsigned int idx) noexcept { - if (getDataType() == Tdatatype::QINT4) { - return getData()[idx / 2]; - } - return getData()[idx]; + template + const T *getAddress(unsigned int b, unsigned int c, unsigned int h, + unsigned int w) const { + return getAddress(getIndex(b, c, h, w)); } /** * @brief return value at specific location * @param[in] idx location - * @retval qint4 value in location */ - uint8_t getValueQint4(unsigned int idx) const noexcept { - uint8_t value = getData()[idx / 2]; - return decode_qint(value, (idx % 2 == 0)); + template + const T &getValue(unsigned int idx) const noexcept { + return getData()[idx]; } /** * @brief return value at specific location * @param[in] idx location - * @retval qint4 value in location */ - uint8_t getValueQint4(unsigned int idx) noexcept { - uint8_t value = getData()[idx / 2]; - return decode_qint(value, (idx % 2 == 0)); + template T &getValue(unsigned int idx) noexcept { + return getData()[idx]; } /** @@ -596,13 +378,11 @@ class Tensor { * @param[in] c channel location * @param[in] h height location * @param[in] w width location - * @retval qint4 value in location */ - uint8_t getValueQint4(unsigned int b, unsigned int c, unsigned int h, - unsigned int w) const noexcept { - size_t idx = getIndex(b, c, h, w); - uint8_t value = getData()[idx / 2]; - return decode_qint(value, (idx % 2 == 0)); + template + const T &getValue(unsigned int b, unsigned int c, unsigned int h, + unsigned int w) const noexcept { + return getValue(getIndex(b, c, h, w)); } /** @@ -611,184 +391,242 @@ class Tensor { * @param[in] c channel location * @param[in] h height location * @param[in] w width location - * @retval qint4 value in location */ - uint8_t getValueQint4(unsigned int b, unsigned int c, unsigned int h, - unsigned int w) noexcept { - size_t idx = getIndex(b, c, h, w); - uint8_t value = getData()[idx / 2]; - return decode_qint(value, (idx % 2 == 0)); + template + T &getValue(unsigned int b, unsigned int c, unsigned int h, + unsigned int w) noexcept { + return getValue(getIndex(b, c, h, w)); } /** - * @brief Get the Value thinking that it is padded - * for example, for the tensor (virtually padded) below, - * getValue(0, 0, 2, 2, 1, 1, .0f) will return 5 - * padding available for height and width axis for now - * 0 0 0 0 0 - * 0 1 2 3 0 - * 0 4 5 6 0 - * 0 7 8 9 0 - * 0 0 0 0 0 - * @param b batch index - * @param c channel index - * @param h height index - * @param w width index - * @param ph padding height - * @param pw padding width - * @return float value + * @brief Fill the Tensor elements with value + * @param[in] value value to be stored */ - template - const T getValuePaddedVirtual(unsigned int 
b, unsigned int c, unsigned int h, - unsigned int w, unsigned int ph, - unsigned int pw, - T pad_value = 0) const EXCEPT_WHEN_DEBUG { -#if DEBUG - unsigned int padded_h = 2 * ph + h; - unsigned int padded_w = 2 * pw + w; - if (h > padded_h && w > padded_w) { - throw std::out_of_range( - "[Tensor::getValuePadded] trying to access out of range"); - } -#endif - - if (ph <= h && h < ph + height() && pw <= w && w < pw + width()) { - return getValue(b, c, h - ph, w - pw); - } - - return pad_value; - } + void setValue(float value); /** - * @brief Multiply value element by element immediately - * @param[in] value multiplier - * @retval #ML_ERROR_INVALID_PARAMETER Tensor dimension is not right - * @retval #ML_ERROR_NONE Successful + * @brief Set the element value + * @param[in] b batch location + * @param[in] c channel location + * @param[in] h height location + * @param[in] w width location + * @param[in] value value to be stored */ - int multiply_i(float const &value); + void setValue(unsigned int b, unsigned int c, unsigned int h, unsigned int w, + float value); /** - * @brief Multiply value element by element - * @param[in] value multiplier - * @retval Calculated Tensor + * @brief Set the element value + * @param[in] offset offset from start location + * @param[in] value value to be stored + * + * @todo This is a temporary workout. Remove this */ - Tensor multiply(float const &value) const; + void setValueInt(unsigned int offset, int value) noexcept { + int *data_int = (int *)getData(); + data_int[offset] = value; + } /** - * @brief multiply value element by element - * @param[in] value multiplier - * @param[out] out out tensor to store the result - * @retval Calculated Tensor + * @brief add the element value to the location + * @param[in] b batch location + * @param[in] c channel location + * @param[in] h height location + * @param[in] w width location + * @param[in] value value to be stored + * @param[in] beta scalar to multiply output with and add */ - Tensor &multiply(float const &value, Tensor &out) const; + void addValue(unsigned int b, unsigned int c, unsigned int h, unsigned int w, + float value, float beta) noexcept; /** - * @brief Multiply Tensor Elementwise - * @param[in] m Tensor to be multiplied - * @param[in] beta scalar to multiply output with and add - * @retval #ML_ERROR_NONE successful + * @brief Fill the Tensor elements with zero */ - int multiply_i(Tensor const &m, const float beta = 0.0); + void setZero(); /** - * @brief Multiply Tensor Element by Element ( Not the MxM ) - * @param[in] m Tensor to be multiplied - * @param[in] beta scalar to multiply output with and add - * @retval Calculated Tensor + * @brief Set the tensor with random normal distribution + * @param[in] mean mean of the distribution + * @param[in] std standard deviation of the distribution */ - Tensor multiply(Tensor const &m, const float beta = 0.0) const; + void setRandNormal(float mean = 0.0f, float stddev = 0.05f); /** - * @brief Multiply Tensor Element by Element ( Not the MxM ) - * @param[in] m Tensor to be multiplied - * @param[out] output Tensor to store the result - * @param[in] beta scalar to multiply output with and add - * @retval Calculated Tensor + * @brief Set the tensor with random uniform distribution + * @param[in] min minimum value for the distribution + * @param[in] max maximum value for the distribution */ - Tensor &multiply(Tensor const &m, Tensor &output, - const float beta = 0.0) const; + void setRandUniform(float min = -0.05f, float max = 0.05f); /** - * @brief Multiply Tensor 
Elementwise - * @param[in] m Tensor to be multiplied - * @param[in] beta scalar to multiply output with and add - * @retval #ML_ERROR_NONE successful - * - * @note support different strided inputs and output - * @note does not support broadcasting - * - * @todo merge this to multiply_i + * @brief Set the tensor with random bernoulli distribution + * @param[in] probability probability value for the distribution */ - int multiply_i_strided(Tensor const &m, const float beta = 0.0); + void setRandBernoulli(float probability = 0.5f); /** - * @brief Multiply Tensor Element by Element ( Not the MxM ) - * @param[in] m Tensor to be multiplied - * @param[in] beta scalar to multiply output with and add - * @retval Calculated Tensor - * - * @note support different strided inputs and output - * @note does not support broadcasting - * - * @todo merge this to multiply + * @brief Initialize the memory of the given tensor */ - Tensor multiply_strided(Tensor const &m, const float beta = 0.0) const; + void initialize(); /** - * @brief Multiply Tensor Element by Element ( Not the MxM ) - * @param[in] m Tensor to be multiplied - * @param[out] output Tensor to store the result - * @param[in] beta scalar to multiply output with and add - * @retval Calculated Tensor - * - * @note support different strided inputs and output - * @note does not support broadcasting - * - * @todo merge this to multiply + * @brief Initialize the memory of the given tensor + * @param init Initiailizer to use for the initialization */ - Tensor &multiply_strided(Tensor const &m, Tensor &output, - const float beta = 0.0) const; + void initialize(Initializer init); /** - * @brief Add Tensor Elementwise - * @param[in] m Tensor to be added - * @param[in] beta scalar to add output with and add - * @retval #ML_ERROR_NONE successful - * - * @note support different strided inputs and output + * @brief Apply instantly to the element + * @param[in] *function function pointer applied + * @return int ML_ERROR_NONE if successful + */ + template int apply_i(std::function f) { + Tensor result = *this; + apply(f, result); + + return ML_ERROR_NONE; + }; + + /** + * @brief Apply function element by element + * @param[in] *function function pointer applied + * @retval Tensor + */ + template Tensor apply(std::function f) const { + Tensor result; + apply(f, result); + + return result; + }; + + /** + * @brief Apply function element by element + * @param[in] *function function pointer applied + * @param[out] output output tensor + * @retval Tensor + */ + template + Tensor &apply(std::function f, Tensor &output) const { + CREATE_IF_EMPTY_DIMS(output, {itensor->getFormat(), itensor->getDataType()}, + nullptr); + + if (itensor->getFormat() != output.itensor->getFormat() || + itensor->getDataType() != itensor->getDataType()) { + /// @todo add unittest + throw std::invalid_argument( + "[Tensor::apply] output dimension does not match"); + } + + itensor->apply(f, output); + + return output; + } + + /** + * @brief Apply function to Tensor + * @param[in] *function function pointer applied + * @retval Tensor + */ + Tensor apply(std::function f) const; + + /** + * @brief Apply function to Tensor + * @param[in] *function function pointer applied + * @param[out] output output tensor + * @retval Tensor + */ + Tensor &apply(std::function f, + Tensor &output) const; + + /** + * @brief Multiply Tensor Elementwise + * @param[in] m Tensor to be multiplied + * @param[in] beta scalar to multiply output with and add + * @retval #ML_ERROR_NONE successful + * + * @note support 
different strided inputs and output * @note does not support broadcasting * - * @todo merge this to add_i + * @todo merge this to multiply_i */ - int add_i_strided(Tensor const &m, const float beta = 0.0); + int multiply_i_strided(Tensor const &m, const float beta = 0.0); /** - * @brief Add Tensor Element by Element - * @param[in] m Tensor to be added - * @param[in] beta Value to be scale the added tensor + * @brief Multiply Tensor Element by Element ( Not the MxM ) + * @param[in] m Tensor to be multiplied + * @param[in] beta scalar to multiply output with and add * @retval Calculated Tensor * * @note support different strided inputs and output * @note does not support broadcasting * - * @todo merge this to add + * @todo merge this to multiply */ - Tensor add_strided(Tensor const &m, const float beta = 0.0) const; + Tensor multiply_strided(Tensor const &m, const float beta = 0.0) const; /** - * @brief Add Tensor Element by Element - * @param[in] m Tensor to be added + * @brief Multiply Tensor Element by Element ( Not the MxM ) + * @param[in] m Tensor to be multiplied * @param[out] output Tensor to store the result - * @param[in] beta Value to be scale the added tensor + * @param[in] beta scalar to multiply output with and add * @retval Calculated Tensor * * @note support different strided inputs and output * @note does not support broadcasting * - * @todo merge this to add + * @todo merge this to multiply */ - Tensor &add_strided(Tensor const &m, Tensor &output, - const float beta = 0.0) const; + Tensor &multiply_strided(Tensor const &m, Tensor &output, + const float beta = 0.0) const; + + /** + * @brief Multiply value element by element immediately + * @param[in] value multiplier + * @retval #ML_ERROR_INVALID_PARAMETER Tensor dimension is not right + * @retval #ML_ERROR_NONE Successful + */ + int multiply_i(float const &value); + + /** + * @brief Multiply value element by element + * @param[in] value multiplier + * @retval Calculated Tensor + */ + Tensor multiply(float const &value) const; + + /** + * @brief multiply value element by element + * @param[in] value multiplier + * @param[out] out out tensor to store the result + * @retval Calculated Tensor + */ + Tensor &multiply(float const &value, Tensor &out) const; + + /** + * @brief Multiply Tensor Elementwise + * @param[in] m Tensor to be multiplied + * @param[in] beta scalar to multiply output with and add + * @retval #ML_ERROR_NONE successful + */ + int multiply_i(Tensor const &m, const float beta = 0.0); + + /** + * @brief Multiply Tensor Element by Element ( Not the MxM ) + * @param[in] m Tensor to be multiplied + * @param[in] beta scalar to multiply output with and add + * @retval Calculated Tensor + */ + Tensor multiply(Tensor const &m, const float beta = 0.0) const; + + /** + * @brief Multiply Tensor Element by Element ( Not the MxM ) + * @param[in] m Tensor to be multiplied + * @param[out] output Tensor to store the result + * @param[in] beta scalar to multiply output with and add + * @retval Calculated Tensor + */ + Tensor &multiply(Tensor const &m, Tensor &output, + const float beta = 0.0) const; /** * @brief Divide value element by element immediately @@ -808,10 +646,10 @@ class Tensor { /** * @brief Divide value element by element * @param[in] value Divisor - * @param[out] out out parameter to store the result + * @param[out] output Tensor to store the result * @retval Calculated Tensor */ - Tensor ÷(float const &value, Tensor &out) const; + Tensor ÷(float const &value, Tensor &output) const; /** * @brief divide Tensor 
Elementwise @@ -836,10 +674,51 @@ class Tensor { Tensor ÷(Tensor const &m, Tensor &output) const; /** - * @brief Add Tensor Element immediately to target tensor without mem copy + * @brief Add Tensor Elementwise + * @param[in] input Tensor to be added + * @param[in] beta scalar to add output with and add + * @retval #ML_ERROR_NONE successful + * + * @note support different strided inputs and output + * @note does not support broadcasting + * + * @todo merge this to add_i + */ + int add_i_strided(Tensor const &input, const float beta = 0.0); + + /** + * @brief Add Tensor Element by Element + * @param[in] input Tensor to be added + * @param[in] beta Value to be scale the input tensor + * @retval Calculated Tensor + * + * @note support different strided inputs and output + * @note does not support broadcasting + * + * @todo merge this to add + */ + Tensor add_strided(Tensor const &input, const float beta = 0.0) const; + + /** + * @brief Add Tensor Element by Element + * @param[in] input Tensor to be added + * @param[out] output Tensor to store the result + * @param[in] beta Value to be scale the input tensor + * @retval Calculated Tensor + * + * @note support different strided inputs and output + * @note does not support broadcasting + * + * @todo merge this to add + */ + Tensor &add_strided(Tensor const &input, Tensor &output, + const float beta = 0.0) const; + + /** + * @brief Add Tensor Element immediately to target tensor without mem copy * @param[in] value value to be added - * @retval #ML_ERROR_NONE Successful - * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter + * @retval #ML_ERROR_NONE Successful + * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter */ int add_i(float const &value); @@ -851,35 +730,35 @@ class Tensor { Tensor add(float const &value) const; /** - * @brief Add Tensor Element by Element - * @param[in] value value to be added - * @param[out] out Tensor to save output without allocating new memory - * @retval Calculated Tensor + * @brief Add Tensor Element by Element + * @param[in] value value to be added + * @param[out] output Tensor to save output without allocating new memory + * @retval Calculated Tensor */ - Tensor &add(float const &value, Tensor &out) const; + Tensor &add(float const &value, Tensor &output) const; /** - * @brief Add Tensor Element by Element without mem copy + * @brief Add Tensor Element by Element without mem copy * @param[in] m Tensor to be added - * @param[out] alpha Values to be scaled - * @retval #ML_ERROR_NONE Successful - * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter + * @param[in] alpha Values to be scaled + * @retval #ML_ERROR_NONE Successful + * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter */ int add_i(Tensor const &m, float const alpha = 1.F); -/** - * @brief Do add_i for specific section - * - * @param len Length of the specific section - * @param addr_idx Starting index of the psecific section - * @param m Input Tensor to be added - * @param incX Incremental index of X - * @param incY Incremental index of Y - * @param alphas Vector of multiple alpha values - * @param alpha_idx Index of alpha in alpha vector - * @retval #ML_ERROR_NONE Successful - * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter - */ + /** + * @brief Do add_i for specific section + * + * @param len Length of the specific section + * @param addr_idx Starting index of the psecific section + * @param m Input Tensor to be added + * @param incX Incremental index of X + * @param incY Incremental index of Y + * @param alphas Vector of multiple 
alpha values + * @param alpha_idx Index of alpha in alpha vector + * @retval #ML_ERROR_NONE Successful + * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter + */ int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m, unsigned int incX, unsigned int incY, const Tensor alphas, unsigned int alpha_idx); @@ -887,23 +766,24 @@ class Tensor { /** * @brief Add Tensor Element by Element * @param[in] m Tensor to be added + * @param[in] alpha Values to be scaled * @retval Calculated Tensor */ Tensor add(Tensor const &m, float const alpha = 1) const; /** - * @brief Add Tensor Element by Element - * @param[in] m Tensor to be added - * @param[out] m Tensor to be out - * @retval Calculated Tensor + * @brief Add Tensor Element by Element + * @param[in] m Tensor to be added + * @param[out] output Tensor to be out + * @param[in] alpha Values to be scaled + * @retval Calculated Tensor */ - Tensor &add(Tensor const &m, Tensor &out, float const alpha = 1) const; + Tensor &add(Tensor const &m, Tensor &output, float const alpha = 1) const; /** * @brief memcpyless version of subtract - * @param[in] value value to subtract - * @retval #ML_ERROR_NONE Successful - * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter + * @retval #ML_ERROR_NONE Successful + * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter */ int subtract_i(float const &value); @@ -915,18 +795,18 @@ class Tensor { Tensor subtract(float const &value) const; /** - * @brief Subtract Tensor Element by Element - * @param[in] value value to be added - * @param[out] out Tensor to save output without allocating new memory - * @retval Calculated Tensor + * @brief Subtract Tensor Element by Element + * @param[in] value value to be added + * @param[out] output Tensor to save output without allocating new memory + * @retval Calculated Tensor */ - Tensor &subtract(float const &value, Tensor &out) const; + Tensor &subtract(float const &value, Tensor &output) const; /** * @brief memcpyless version of subtract * @param[in] m Tensor to be subtracted - * @retval #ML_ERROR_NONE Successful - * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter + * @retval #ML_ERROR_NONE Successful + * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter */ int subtract_i(Tensor const &m); @@ -938,308 +818,165 @@ class Tensor { Tensor subtract(Tensor const &m) const; /** - * @brief Subtract Tensor Element by Element - * @param[in] m Tensor to be added - * @param[out] m Tensor to be out - * @retval Calculated Tensor + * @brief Subtract Tensor Element by Element + * @param[in] m Tensor to be added + * @param[out] output Tensor to be out + * @retval Calculated Tensor */ - Tensor &subtract(Tensor const &m, Tensor &out) const; + Tensor &subtract(Tensor const &m, Tensor &output) const; /** - * @brief Tensor power elementwise - * - * @param exponent exponent - * @return int ML_ERROR_NONE if successful + * @brief sum all the Tensor elements according to the batch + * @retval Calculated Tensor(batch, 1, 1, 1) */ - int pow_i(float exponent); + Tensor sum_by_batch() const; /** - * @brief Tensor power Element by Element - * @param[in] exponent exponent - * @retval Calculated Tensor + * @brief sum all the Tensor elements according to the axis + * 0 : batch direction + * 1 : channel direction + * 2 : height direction + * 3 : width direction + * @param[in] axis Axis to calculate sum along + * @param[in] alpha Scale the sum by this value + * @retval Calculated Tensor */ - Tensor pow(float exponent) const; + Tensor sum(unsigned int axis, float alpha = 1.0) const; /** - * 
@brief Tensor power Element by Element - * @param[in] exponent exponent - * @param[out] out out to store the result - * @retval Calculated Tensor + * @brief sum all the Tensor elements according to the axis + * 0 : batch direction + * 1 : channel direction + * 2 : height direction + * 3 : width direction + * @param[in] axis Axis to calculate sum along + * @param[out] output output tensor + * @param[in] alpha Scale the sum by this value + * @retval Calculated Tensor */ - Tensor &pow(float exponent, Tensor &out) const; + Tensor &sum(unsigned int axis, Tensor &output, float alpha = 1.0, + float beta = 0.0) const; /** - * @brief gaussian error function - * @return int ML_ERROR_NONE if successful + * @brief sum all the Tensor by multiple axes + * + * @param axes axes to sum along + * @param alpha Scale the sum by this value + * @return Tensor */ - int erf_i(); + Tensor sum(const std::vector &axes, float alpha = 1.0) const; /** - * @brief gaussian error function - * @retval Calculated Tensor + * @brief sum all the Tensor by multiple axes + * + * @param axes axes to sum along + * @param[out] output output tensor + * @param alpha Scale the sum by this value + * @return Tensor */ - Tensor erf() const; + Tensor &sum(const std::vector &axes, Tensor &output, + float alpha = 1.0) const; /** - * @brief gaussian error function - * @param[out] out out to store the result - * @retval Calculated Tensor + * @brief Averaging the Tensor elements according to the axis + * 0 : batch direction + * 1 : channel direction + * 2 : height direction + * 3 : width direction + * @retval Calculated Tensor */ - Tensor &erf(Tensor &out) const; + Tensor average(unsigned int axis) const; /** - * @brief sin transform function - * @param[out] out out to store the result + * @brief Averaging the Tensor elements according to the axis + * @retval Calculated Tensor */ - void sin(Tensor &out, float alpha = 1.0); + Tensor &average(unsigned int axis, Tensor &output) const; /** - * @brief cos transform function - * @param[out] out out to store the result + * @brief Average all the Tensor by multiple axes + * @param[in] axes axes to sum along + * @retval Calculated Tensor */ - void cos(Tensor &out, float alpha = 1.0); + Tensor average(const std::vector &axes) const; /** - * @brief inverse squared root function - * + * @brief Average all the Tensor by multiple axes + * @param[in] axes axes to sum along + * @param[out] output output tensor + * @retval Calculated Tensor */ - void inv_sqrt_i(); + Tensor &average(const std::vector &axes, Tensor &output) const; /** - * @brief getter of size of data - * @retval size of data + * @brief Average the Tensor elements by all axis + * @retval Calculated Tensor */ - unsigned int sizeofData() { return dim.getDataTypeSize(); } - - /** - * @brief Dot Product of Tensor ( equal MxM ) - * @details This applies dot of the last dimension of this and second-last - * dimension of passed tensor m. - * @param[in] m Tensor - * @param[in] trans Transpose - * @param[in] trans_m Transpose m - * @retval Calculated Tensor - */ - Tensor dot(Tensor const &m, bool trans = false, bool trans_m = false) const; + Tensor average() const; /** - * @brief Dot Product of Tensor ( equal MxM ) - * @details This applies dot of the last dimension of this and second-last - * dimension of passed tensor m. 
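The relocated reductions keep the axis convention spelled out in their comments (0 = batch, 1 = channel, 2 = height, 3 = width). A small sketch of how they compose, assuming the usual dimension constructor; the shapes in the comments follow the documented contracts:

#include <tensor.h>

void reduction_sketch() {
  nntrainer::Tensor t(2, 1, 3, 4);
  t.setValue(1.0f);

  nntrainer::Tensor per_batch = t.sum_by_batch();  // (2, 1, 1, 1); each entry is 3 * 4 = 12
  nntrainer::Tensor width_sum = t.sum(3);          // sum along width -> (2, 1, 3, 1)
  nntrainer::Tensor hw_mean   = t.average({2, 3}); // mean over height and width -> (2, 1, 1, 1)
}
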
- * @param[in] m Tensor - * @param[in] output output Tensor - * @param[in] trans Transpose - * @param[in] trans_m Transpose m - * @param[in] beta beta + * @brief Averaging the Tensor elements by all axis * @retval Calculated Tensor */ - Tensor &dot(Tensor const &m, Tensor &output, bool trans = false, - bool trans_m = false, float beta = 0.0f) const; - - /** - * @brief compute the derivative of this in the current tensor - * @param m same as given to the dot() - * @param output_deriv the derivative of the output - * @param[in] trans same as given to the dot() - * @param[in] trans_m same as given to the dot() - * @param[in] beta same as given to the dot() - * @note This will compute the derivative in-place and will overwrite existing - * data in the tensor - */ - Tensor &dot_deriv_wrt_1(Tensor const &m, Tensor const &output_deriv, - bool trans = false, bool trans_m = false, - float beta = 0.0f); - - /** - * @brief compute the derivative wrt m in the m tensor - * @param m_deriv tensor where derivative wrt m will be stored - * @param output_deriv the derivative of the output - * @param[in] trans same as given to the dot() - * @param[in] trans_m same as given to the dot() - * @param[in] beta same as given to the dot() - * @note The caller tensor must be the same tensor as the one which called the - * dot() product. - */ - Tensor &dot_deriv_wrt_2(Tensor &m_deriv, Tensor const &output_deriv, - bool trans = false, bool trans_m = false, - float beta = 0.0f) const; - - /** - * @copydoc Tensor::dot(Tensor const &m, Tensor &output, bool trans, - bool trans_m, float beta) const - * @details performs dot operation over a batch of inputs - */ - Tensor &dotBatched(Tensor const &m, Tensor &result, bool trans = false, - bool trans_m = false, float beta = 0.0f) const; - - /** - * @copydoc Tensor::dot_deriv_wrt_1(Tensor const &m, Tensor const - &output_deriv, bool trans, bool trans_m, float beta) - */ - Tensor &dot_batched_deriv_wrt_1(Tensor const &m, Tensor const &output_deriv, - bool trans = false, bool trans_m = false, - float beta = 0.0f); - - /** - * @brief Tensor::dot_deriv_wrt_2(Tensor const &m_deriv, Tensor const - &output_deriv, bool trans, bool trans_m, float beta) const - */ - Tensor &dot_batched_deriv_wrt_2(Tensor &m_deriv, Tensor const &output_deriv, - bool trans = false, bool trans_m = false, - float beta = 0.0f) const; - - /** - * @brief Transpose Tensor - * - * @param direction to transpose ex) 0:2:1 - * @return Tensor - */ - Tensor transpose(const std::string &direction) const; - - /** - * @brief Transpose Tensor - * @param direction to transpose ex) 0:2:1 - * @param[out] Tensor to save to, dimension is always reshaped. - * @retval Tensor& reference to the out - */ - Tensor &transpose(const std::string &direction, Tensor &out) const; - - /** - * @brief Calculate Drop Out Mask : x * 1.0/(1.0-rate) - * @param dropout drop out rate - * @retval Tensor& reference of drop out mask - */ - Tensor dropout_mask(float dropout) const; - - /** - * @brief Calculate Drop Out Mask : x * 1.0/(1.0-rate) inplace - * @param dropout drop out rate - */ - void dropout_mask(float dropout); - - /** - * @brief Calculate filter mask - * @param mask_len length of each mask along the last axis - * @param invert invert the mask - */ - void filter_mask(const Tensor &mask_len, bool reverse = false); - - /** - * @brief Calculate 2 Zone Out Mask - * @details Calculate zone out mask according to the bernoulli distribution. 
- * Zone out mask with rate @a zoneout for inplace and the other zone out mask - * with rate @a (1-zoneout). - * @param zoneout zone out rate - * @retval Tensor zone out mask for opposite tensor - */ - Tensor zoneout_mask(float zoneout); - - /** - * @brief Calculate 2 Zone Out Mask - * @details Calculate zone out mask according to the bernoulli distribution. - * Zone out mask with rate @a zoneout for inplace and the other zone out mask - * with rate @a (1-zoneout). - * @param opposite opposite zone out mask - * @param zoneout zone out rate - */ - void zoneout_mask(Tensor &opposite, float zoneout); - - /** - * @brief sum all the Tensor elements according to the batch - * @retval Calculated Tensor(batch, 1, 1, 1) - */ - Tensor sum_by_batch() const; + Tensor &average(Tensor &output) const; /** - * @brief sum all the Tensor elements according to the axis - * 0 : batch direction - * 1 : channel direction - * 2 : height direction - * 3 : width direction - * @param[in] axis Axis to calculate sum along - * @param[in] alpha Scale the sum by this value - * @retval Calculated Tensor + * @brief Tensor power element without mem copy + * @param[in] exponent exponent + * @retval #ML_ERROR_NONE Successful */ - Tensor sum(unsigned int axis, float alpha = 1.0) const; + int pow_i(float exponent); /** - * @brief sum all the Tensor elements according to the axis - * 0 : batch direction - * 1 : channel direction - * 2 : height direction - * 3 : width direction - * @param[in] axis Axis to calculate sum along - * @param[out] output output tensor - * @param[in] alpha Scale the sum by this value + * @brief Tensor power element by element + * @param[in] exponent exponent * @retval Calculated Tensor */ - Tensor &sum(unsigned int axis, Tensor &output, float alpha = 1.0, - float beta = 0.0) const; + Tensor pow(float exponent) const; /** - * @brief sum all the Tensor by multiple axes - * - * @param axes axes to sum along - * @param alpha Scale the sum by this value - * @return Tensor + * @brief Tensor power element by element + * @param[in] exponent exponent + * @param[out] output out to store the result + * @retval Calculated Tensor */ - Tensor sum(const std::vector &axes, float alpha = 1.0) const; + Tensor &pow(float exponent, Tensor &output) const; /** - * @brief sum all the Tensor by multiple axes - * - * @param axes axes to sum along - * @param[out] output output tensor - * @param alpha Scale the sum by this value - * @return Tensor + * @brief Gauss error function + * @retval #ML_ERROR_NONE Successful */ - Tensor &sum(const std::vector &axes, Tensor &output, - float alpha = 1.0) const; + int erf_i(); /** - * @brief Averaging the Tensor elements according to the axis - * 0 : batch direction - * 1 : channel direction - * 2 : height direction - * 3 : width direction + * @brief Gauss error function * @retval Calculated Tensor */ - Tensor average(unsigned int axis) const; - /** - * @brief Averaging the Tensor elements according to the axis - * - * @retval Calculated Tensor - */ - Tensor &average(unsigned int axis, Tensor &output) const; + Tensor erf() const; /** - * @brief average all the Tensor by multiple axes - * - * @param axes axes to sum along - * @return Tensor + * @brief Gauss error function + * @param[out] output out to store the result + * @retval Calculated Tensor */ - Tensor average(const std::vector &axes) const; + Tensor &erf(Tensor &output) const; /** - * @brief average all the Tensor by multiple axes - * - * @param axes axes to sum along - * @param output output tensor - * @return Tensor + * @brief sin 
transform function + * @param[out] out out to store the result */ - Tensor &average(const std::vector &axes, Tensor &output) const; + void sin(Tensor &out, float alpha = 1.0); /** - * @brief Averaging the Tensor elements by all axis - * @retval Calculated Tensor + * @brief cos transform function + * @param[out] out out to store the result */ - Tensor average() const; + void cos(Tensor &out, float alpha = 1.0); /** - * @brief Averaging the Tensor elements by all axis - * @retval Calculated Tensor + * @brief inverse squared root function */ - Tensor &average(Tensor &output) const; + void inv_sqrt_i(); /** * @brief Anchor a starting point to defer following evaluation @@ -1247,12 +984,6 @@ class Tensor { */ LazyTensor chain() const; - /** - * @brief Softmax the Tensor elements - * @retval Calculated Tensor - */ - Tensor softmax() const; - /** * @brief l2norm the Tensor elements * @retval Calculated l2norm @@ -1284,371 +1015,126 @@ class Tensor { void standardization_i(); /** - * @brief i data index - * @retval address of ith data - */ - template T *getAddress(unsigned int i) { - size_t index = getIndex(batch(), channel(), height(), width()); - if (i > index) { - return nullptr; - } - if (getDataType() == Tdatatype::QINT4) - return &getData()[i / 2]; - return &getData()[i]; - } - - /** - * @brief i data index - * @retval address of ith data - */ - template const T *getAddress(unsigned int i) const { - size_t index = getIndex(batch(), channel(), height(), width()); - if (i > index) { - return nullptr; - } - - if (getDataType() == Tdatatype::QINT4) - return &getData()[i / 2]; - return &getData()[i]; - } - - /** - * @brief get address of n-d data - */ - template - T *getAddress(unsigned int b, unsigned int c, unsigned int h, - unsigned int w) { - return getAddress(getIndex(b, c, h, w)); - } - - /** - * @brief get address of n-d data - */ - template - const T *getAddress(unsigned int b, unsigned int c, unsigned int h, - unsigned int w) const { - return getAddress(getIndex(b, c, h, w)); - } - - /** - * @brief Apply instantly to the element - * - * @param f function to apply - * @return int ML_ERROR_NONE if successful - */ - template int apply_i(std::function f) { - Tensor result = *this; - apply(f, result); - - return ML_ERROR_NONE; - }; - - /** - * @brief Apply function element by element - * @param[in] *function function pointer applied - * @param[out] output output tensor - * @retval Tensor - */ - template - Tensor &apply(std::function f, Tensor &output) const { - CREATE_IF_EMPTY_DIMS(output, dim, nullptr); - - if (dim != output.dim) { - /// @todo add unittest - throw std::invalid_argument( - "[Tensor::apply] output dimension does not match"); - } - - if (contiguous && output.contiguous) { - const T *data = (getData()); - T *rdata = (output.getData()); - - std::transform(data, data + size(), rdata, f); - } else if (strides[3] == 1 && output.strides[3] == 1) { - /** @todo optimize this with combining these loops where stride is 1 */ - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int c = 0; c < channel(); ++c) { - for (unsigned int h = 0; h < height(); ++h) { - T *out_data = output.getAddress(b, c, h, 0); - const T *in_data = getAddress(b, c, h, 0); - std::transform(in_data, in_data + width(), out_data, f); - } - } - } - } else { - for (unsigned int b = 0; b < batch(); ++b) { - for (unsigned int c = 0; c < channel(); ++c) { - for (unsigned int h = 0; h < height(); ++h) { - for (unsigned int w = 0; w < width(); ++w) { - output.setValue(b, c, h, w, f(getValue(b, c, h, w))); - } - } - 
} - } - } - - return output; - }; - - /** - * @brief Apply function element by element - * @param[in] *function function pointer applied - * @retval Tensor - */ - template Tensor apply(std::function f) const { - Tensor result; - apply(f, result); - - return result; - }; - - /** - * @brief Apply function to Tensor - * @param[in] *function function pointer applied - * @retval Tensor - */ - Tensor apply(std::function f) const; - - /** - * @brief Apply function to Tensor - * @param[in] *function function pointer applied - * @param[out] output output tensor - * @retval Tensor - */ - Tensor &apply(std::function f, - Tensor &output) const; - - /** - * @brief Print element - * @param[in] out out stream - * @retval Tensor - */ - void print(std::ostream &out) const; - - /** - * @brief Print element - * @param[in] out out stream - * @param[in] opt print formatting option. opt=0 would pretty print the data, - * else it would print the raw data. - * @retval Tensor - */ - void print_(std::ostream &out, uint opt = 0) const; - - /** - * @brief Get size of current tensor - * @retval unsigned int size of the current tensor - */ - size_t size() const { return dim.getDataLen(); } - - /** - * @brief Get if the tensor is empty - * @retval true if the tensor is empty - */ - bool empty() const { return size() == 0; } - - /** - * @brief Get size of the data in bytes - * @retval size_t Size in bytes - */ - size_t bytes() const { - if (getDataType() == Tdatatype::QINT4) { - return (size() * dim.getDataTypeSize() + 1) / 2; - } - return size() * dim.getDataTypeSize(); - } - - /** - * @brief Set the element value - * @param[in] batch batch location - * @param[in] c channel location - * @param[in] h height location - * @param[in] w width location - * @param[in] value value to be stored - */ - void setValue(unsigned int batch, unsigned int c, unsigned int h, - unsigned int w, float value) noexcept { - if (getDataType() == Tdatatype::FP32) { - getData()[getIndex(batch, c, h, w)] = value; - } else if (getDataType() == Tdatatype::FP16) { -#ifdef ENABLE_FP16 - getData<_FP16>()[getIndex(batch, c, h, w)] = static_cast<_FP16>(value); -#else - ml_loge("%s", "Error: enable-fp16 is not enabled"); -#endif - } else if (getDataType() == Tdatatype::QINT8) { - getData()[getIndex(batch, c, h, w)] = value; - } else if (getDataType() == Tdatatype::QINT4) { - int idx = getIndex(batch, c, h, w); - - if (idx % 2 == 0) { - getData()[idx / 2] = - encode_qint(value, getData()[idx / 2]); - } else { - getData()[idx / 2] = - encode_qint(getData()[idx / 2] >> 4, value); - } - } - } - - /** - * @brief add the element value to the location - * @param[in] batch batch location - * @param[in] c channel location - * @param[in] h height location - * @param[in] w width location - * @param[in] value value to be stored - * @param[in] beta scalar to multiply output with and add - */ - void addValue(unsigned int batch, unsigned int c, unsigned int h, - unsigned int w, float value, float beta) noexcept { - auto const &idx = getIndex(batch, c, h, w); - if (dim.getDataType() == Tdatatype::FP32) { - getData()[idx] *= beta; - getData()[idx] += value; - } else if (dim.getDataType() == Tdatatype::FP16) { -#ifdef ENABLE_FP16 - getData<_FP16>()[idx] *= static_cast<_FP16>(beta); - getData<_FP16>()[idx] += static_cast<_FP16>(value); -#else - ml_loge("%s", "Error: enable-fp16 is not enabled"); -#endif - } else if (getDataType() == Tdatatype::QINT8) { - getData()[idx] *= beta; - getData()[idx] += value; - } - } - - /** - * @brief Set the element value - * @param[in] offset 
offset from start location - * @param[in] value value to be stored - * - * @todo This is a temporary workout. Remove this once multiple datatypes - * are supported. - */ - void setValueInt(unsigned int offset, int value) noexcept { - int *data_int = (int *)getData(); - data_int[offset] = value; - } - - /** - * @brief Fill the Tensor elements with value - * @param[in] value value to be stored - */ - void setValue(float value); - - /** - * @brief Fill the Tensor elements with zero + * @brief Dot Product of Tensor ( equal MxM ) + * @details This applies dot of the last dimension of this and second-last + * dimension of passed input tensor. + * @param[in] input Tensor + * @param[in] trans Transpose + * @param[in] trans_in Transpose input + * @retval Calculated Tensor */ - void setZero(); + Tensor dot(Tensor const &input, bool trans = false, + bool trans_in = false) const; /** - * @brief Set the Dist object - * - * @tparam T distrubution engine - * @param dist distribution engine + * @brief Dot Product of Tensor ( equal MxM ) + * @details This applies dot of the last dimension of this and + * second-last dimension of passed input tensor. + * @param[in] input Tensor + * @param[in] output output Tensor + * @param[in] trans Transpose + * @param[in] trans_in Transpose input + * @param[in] beta beta + * @retval Calculated Tensor */ - template void setDist(Engine dist) { - NNTR_THROW_IF(!contiguous, std::invalid_argument) - << getName() << " Tensor is not contiguous, cannot set distribution"; - - T *data_ = getData(); - unsigned int len = size(); - for (unsigned int i = 0; i < len; ++i) { - data_[i] = (T)dist(rng); - } - }; + Tensor &dot(Tensor const &input, Tensor &output, bool trans = false, + bool trans_in = false, float beta = 0.0f) const; /** - * @brief Set the tensor with random normal distribution - * @param[in] mean mean of the distribution - * @param[in] std standard deviation of the distribution + * @brief compute the derivative of this in the current tensor + * @param input same as given to the dot() + * @param output_deriv the derivative of the output + * @param[in] trans same as given to the dot() + * @param[in] trans_in same as given to the dot() + * @param[in] beta same as given to the dot() + * @note This will compute the derivative in-place and will overwrite + existing + * data in the tensor */ - void setRandNormal(float mean = 0.0f, float std = 0.05f); + Tensor &dot_deriv_wrt_1(Tensor const &input, Tensor const &output_deriv, + bool trans = false, bool trans_in = false, + float beta = 0.0f); /** - * @brief Set the tensor with random uniform distribution - * @param[in] min minimum value for the distribution - * @param[in] max maximum value for the distribution + * @brief compute the derivative wrt m in the input tensor + * @param input_deriv tensor where derivative wrt m will be stored + * @param output_deriv the derivative of the output + * @param[in] trans same as given to the dot() + * @param[in] trans_in same as given to the dot() + * @param[in] beta same as given to the dot() + * @note The caller tensor must be the same tensor as the one which called + the dot() product. 
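The dot()/dot_deriv_wrt_*() trio documented here is meant to be used in pairs: the forward call fixes which operand was which, and the derivative calls reproduce that pairing. A hedged sketch under that reading; the tensor names are hypothetical and the dimension constructor is assumed:

#include <tensor.h>

void dot_and_gradients_sketch() {
  nntrainer::Tensor input(1, 1, 16, 32), weight(1, 1, 32, 8), output(1, 1, 16, 8);
  input.setRandNormal();
  weight.setRandNormal();

  input.dot(weight, output);                 // forward: last dim of input against
                                             // second-to-last dim of weight

  nntrainer::Tensor out_grad(1, 1, 16, 8), in_grad(1, 1, 16, 32), w_grad(1, 1, 32, 8);
  out_grad.setValue(1.0f);

  in_grad.dot_deriv_wrt_1(weight, out_grad); // derivative w.r.t. the first operand, in-place
  input.dot_deriv_wrt_2(w_grad, out_grad);   // caller must be the tensor that called dot()
}
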
*/ - void setRandUniform(float min = -0.05f, float max = 0.05f); + Tensor &dot_deriv_wrt_2(Tensor &input_deriv, Tensor const &output_deriv, + bool trans = false, bool trans_in = false, + float beta = 0.0f) const; /** - * @brief Set the tensor with random bernoulli distribution - * @param[in] probability probability value for the distribution + * @copydoc Tensor::dot(Tensor const &input, Tensor &output, bool trans, + bool trans_in, float beta) const + * @details performs dot operation over a batch of inputs */ - void setRandBernoulli(float probability = 0.5f); + Tensor &dotBatched(Tensor const &input, Tensor &result, bool trans = false, + bool trans_in = false, float beta = 0.0f) const; /** - * @brief Initialize the memory of the given tensor + * @copydoc Tensor::dot_deriv_wrt_1(Tensor const &input, Tensor const + &output_deriv, bool trans, bool trans_in, float beta) */ - void initialize(); + Tensor &dot_batched_deriv_wrt_1(Tensor const &input, + Tensor const &output_deriv, + bool trans = false, bool trans_in = false, + float beta = 0.0f); /** - * @brief Initialize the memory of the given tensor - * @param init Initiailizer to use for the initialization + * @brief Tensor::dot_deriv_wrt_2(Tensor const &input_deriv, Tensor const + &output_deriv, bool trans, bool trans_in, float beta) const */ - void initialize(Initializer init) { - initializer = init; - initialize(); - } + Tensor &dot_batched_deriv_wrt_2(Tensor &input_deriv, + Tensor const &output_deriv, + bool trans = false, bool trans_in = false, + float beta = 0.0f) const; /** - * @brief set the memory format - * @param fm format of Tensor + * @brief Calculate Drop Out Mask : x * 1.0/(1.0-rate) + * @param dropout drop out rate + * @retval Tensor& reference of drop out mask */ - void convertFormat(TensorDim::Format fm) { - if (getFormat() != fm) { - transpose("2:1:0"); - } - - dim.setFormat(fm); - } + Tensor dropout_mask(float dropout) const; /** - * @brief Copy the Tensor - * @param[in] from Tensor to be copied - * - * @note copy can reshape the tensor to match the shape + * @brief Calculate Drop Out Mask : x * 1.0/(1.0-rate) inplace + * @param dropout drop out rate */ - void copy(const Tensor &from); + void dropout_mask(float dropout); /** - * @brief Copy the Tensor - * @param[in] from Tensor to be copied + * @brief Calculate filter mask + * @param mask_len length of each mask along the last axis + * @param invert invert the mask */ - void copyData(const Tensor &from); + void filter_mask(const Tensor &mask_len, bool reverse = false); /** - * @brief Copy the Tensor - * @param[in] from Tensor to be copied + * @brief Calculate 2 Zone Out Mask + * @details Calculate zone out mask according to the bernoulli distribution. + * Zone out mask with rate @a zoneout for inplace and the other zone out mask + * with rate @a (1-zoneout). + * @param zoneout zone out rate + * @retval Tensor zone out mask for opposite tensor */ - void copy_with_stride(const Tensor &from); + Tensor zoneout_mask(float zoneout); /** - * @brief Get slice of the tensor, sliced by batch - * @param[in] offset offset in batch to start the slice - * @param[in] size size of the slice - * @retval slice of this tensor - * @note This function provides a slice of this tensor, and does not create a - * copy + * @brief Calculate 2 Zone Out Mask + * @details Calculate zone out mask according to the bernoulli distribution. + * Zone out mask with rate @a zoneout for inplace and the other zone out mask + * with rate @a (1-zoneout). 
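zoneout_mask() is described above as producing two complementary Bernoulli masks: the calling tensor keeps the mask drawn with rate zoneout, and the returned tensor carries the mask drawn with rate (1 - zoneout). A short sketch of that contract, with hypothetical names and an assumed dimension constructor:

#include <tensor.h>

void zoneout_sketch() {
  nntrainer::Tensor mask(1, 1, 1, 8);              // becomes the in-place zoneout mask
  nntrainer::Tensor opposite = mask.zoneout_mask(0.1f);

  // mask now holds the Bernoulli mask drawn with rate 0.1 and opposite holds the
  // complementary mask drawn with rate (1 - 0.1), as documented above.
}
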
+ * @param opposite opposite zone out mask + * @param zoneout zone out rate */ - Tensor getBatchSlice(size_t offset, unsigned int size) const; + void zoneout_mask(Tensor &opposite, float zoneout); - /** - * @brief Get new tensor which shares memory with current tensor but different - * shape - * - * @param dim new dimension to be set for this tensor - * @param offset offset to be used from the start of the data in elements - * @note The new tensor will share the same data as the current tensor but - * can have different size. - * @note New size added with offset must be less than the size of the original - * tensor. - */ - Tensor getSharedDataTensor(const TensorDim dim, size_t offset, - bool reset_stride = true, - const std::string &name_ = "") const; /** * @brief split tensor along axis. * @@ -1664,221 +1150,151 @@ class Tensor { * @param sizes sizes * @param axis axis * @return Tensor splitted tensor - * @note if the given array sizes is just a 1 unsigned int value, assumes that - * it divide tensor by given size evenly - */ - std::vector split(std::vector sizes, int axis = 0); - - /** - * @brief concatenate tensors along axis - * - * @param tensors tensors to be concatenated to the first tensor - * @param axis axis - * @return Tensor concatenated tensor - */ - static Tensor cat(const std::vector &tensors, int axis = 0); - - /** - * @brief make this tensor share memory with given tensor - * - * @param src Source tensor whose memory is to be shared - * @param offset offset to be used from the start of the data in bytes - * @note This tensor will share the same data as the current tensor but - * can have different size. - * @note This tensor's size added with offset must be less than the size of - * the source tensor. - * @note The stride of the source tensor and this tensor must be same. - */ - void makeSharedDataTensor(const Tensor &src, size_t offset = 0); - - /** - * @brief Convient wrapper for inplace copy of @a this. 
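split() and cat() above act as inverses along a given axis when the split sizes add back up to the original extent. A minimal sketch, assuming the usual dimension constructor; the sizes are only illustrative:

#include <tensor.h>
#include <vector>

void split_concat_sketch() {
  nntrainer::Tensor t(1, 1, 4, 6);
  t.setRandUniform();

  std::vector<nntrainer::Tensor> parts = t.split({2, 4}, 3);    // widths 2 and 4 along axis 3
  nntrainer::Tensor joined = nntrainer::Tensor::cat(parts, 3);  // back to (1, 1, 4, 6)
}
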
- * @retval Copied version of this - */ - Tensor clone() const; - - /** - * @brief Save the Tensor into file - * @param[in] file output file stream - */ - void save(std::ostream &file); - - /** - * @brief Read the Tensor from file - * @param[in] file input file stream - * @param[in] s_type scale factor data type + * @note if the given array sizes is just a 1 unsigned int value, assumes that + * it divide tensor by given size evenly */ - void read(std::ifstream &file, Tdatatype s_type = Tdatatype::FP32); + std::vector split(std::vector sizes, int axis = 0); /** - * @brief return argument index which value is max by batch - * @retval unsigned int argument index + * @brief concatenate tensors along axis + * + * @param tensors tensors to be concatenated to the first tensor + * @param axis axis + * @return Tensor concatenated tensor */ - std::vector argmax() const; + Tensor concat(const std::vector &tensors, int axis = 0); /** - * @brief return max of the absolute values of the tensor - * @retval maximum absolute value + * @brief concatenate tensors along axis + * + * @param tensors tensors to be concatenated to the first tensor + * @param axis axis + * @return Tensor concatenated tensor */ - float max_abs() const; + static Tensor cat(const std::vector &tensors, int axis = 0); /** - * @brief return a copy of the Tensor Dim - * @retval TensorDim + * @brief Print element + * @param[in] out out stream */ - TensorDim getDim() const { return TensorDim(dim); } + void print(std::ostream &out) const; /** - * @brief return Tensor Dim for a given axis - * @retval dimension + * @brief put data of Tensor + * @note It is only effective when memory_swap is used */ - size_t getTensorDim(unsigned int axis); + void putData() const; /** - * @brief return Tensor Type + * @brief Set the memory buffer for the tensor + * + * @param buf the memory buffer + * @param init intialize the buffer */ - TensorDim::TensorType getTensorType() const { return dim.getTensorType(); }; + void setData(const std::shared_ptr buf, size_t off = 0, + bool init = false); /** - * @brief return Tensor batch size - * @retval batch size + * @brief return Data pointer of Tensor + * @retval template T pointer (float pointer as default) */ - size_t batch() const { return dim.batch(); } + const std::shared_ptr getMemoryData() const; /** - * @brief return Tensor batch size - * @retval batch size + * @brief return offset */ - size_t channel() const { return dim.channel(); } + size_t getOffset() const; /** - * @brief return Tensor height size - * @retval height size + * @brief Copy the Tensor + * @param[in] from Tensor to be copied + * + * @note copy can reshape the tensor to match the shape + * @note support copying data from multiple data type */ - size_t height() const { return dim.height(); } + void copy(const Tensor &from); /** - * @brief return Tensor batch size - * @retval width size + * @brief Copy the Tensor + * @param[in] from Tensor to be copied + * @note support copying data from multiple data type */ - size_t width() const { return dim.width(); } + void copyData(const Tensor &from); /** - * @brief return Tensor Data Type Size - * @retval data type size + * @brief Copy the Tensor + * @param[in] from Tensor to be copied + * @note only support copying data from tensor with the same data type */ - uint getDataTypeSize() const { return dim.getDataTypeSize(); } + void copy_with_stride(const Tensor &from); /** - * @brief update batch size for this tensor - * @param batch size - * @note The batchsize of src_tensor need not be related with this 
- * tensor's batch size - * - * @note The memory for this tensor will re-allocated/re-assigned if the - * updated batch size is different than the current batch size. - * - * @note If this tensor is/was the src_tensor for some other, then - * reduction in batch size can make the dependent tensors allocate fail due to - * memory smaller. Caller must handle this in their own end. - * - * @note If this tensor is re-allocated, then the memory might not be - * immediately freed as the tensor already depending on this tensor also - * share the same memory. So, the peak memory consumption in worst case can - * reach the total memory requirements of a model with old batchsize and the - * new batch size. It is recommended to first deallocate all the tensors, - * updateBatch and then allocate again to avoid such issues. + * @brief Get slice of the tensor, sliced by batch + * @param[in] offset offset in batch to start the slice + * @param[in] size size of the slice + * @retval slice of this tensor + * @note This function provides a slice of this tensor, and does not create a + * copy */ - void updateBatch(unsigned int batch) { - if (dim.batch() == batch) { - return; - } - - if (isAllocated()) - throw std::invalid_argument( - "Cannot update batch for an allocated tensor"); - dim.batch(batch); - } + Tensor getBatchSlice(size_t offset, unsigned int size) const; /** - * @brief return Data pointer of Tensor - * @retval template T pointer (float pointer as default) + * @brief Convient wrapper for inplace copy of @a this. + * @retval Copied version of this */ - template T *getData() { - if (!data) - return nullptr; - - data->validate(); - return data->getAddr() + offset; - } + Tensor clone() const; /** - * @brief return Data pointer of Tensor - * @retval template T pointer (float pointer as default) + * @brief Save the Tensor into file + * @param[in] file output file stream */ - template const T *getData() const { - if (!data) - return nullptr; - - data->validate(); - return data->getAddr() + offset; - } + void save(std::ostream &file); /** - * @brief return Data pointer of Tensor - * @retval template T pointer (float pointer as default) + * @brief Read the Tensor from file + * @param[in] file input file stream */ - template T *getData(size_t idx) const { - if (!data) - return nullptr; - - size_t index = idx; - - data->validate(); - return data->getAddr() + offset + index; - } + void read(std::ifstream &file); /** - * @brief setter data type - * @param[in] Data Type + * @brief return argument index which value is max by batch + * @retval unsigned int argument indices */ - void setDataType(Tdatatype d_type) { dim.setDataType(d_type); } + std::vector argmax() const; /** - * @brief setter tensor type - * @param[in] tensor Type + * @brief return max of the absolute values of the tensor + * @retval maximum absolute value */ - void setTensorType(ml::train::TensorDim::TensorType t_type) { - dim.setTensorType(t_type); - } + float max_abs() const; /** - * @brief put data of Tensor - * - * @note It is only effective when memory_swap is used + * @brief return maximum value + * @retval Maximum value of the tensor data */ - void putData() const { - if (!data) - return; - - data->invalidate(); - } + float maxValue() const; /** - * @brief return Data pointer of Tensor - * @retval template T pointer (float pointer as default) + * @brief return minimum value + * @retval Minimum value of the tensor data */ - const std::shared_ptr getMemoryData() const { return data; } + float minValue() const; /** - * @brief return offset + 
* @brief Transpose Tensor + * @param direction to transpose ex) 0:2:1 + * @return Tensor */ - size_t getOffset() const { return offset; } + Tensor transpose(const std::string &direction) const; /** - * @brief i data index - * @retval address of ith data + * @brief Transpose Tensor + * @param direction to transpose ex) 0:2:1 + * @param[out] Tensor to save to, dimension is always reshaped. + * @retval Tensor& reference to the out */ + Tensor &transpose(const std::string &direction, Tensor &out) const; + /** * @brief set Tensor Dim * @param[in] d TensorDim @@ -1898,263 +1314,144 @@ class Tensor { void fill(const Tensor &from, bool allocate = false); /** - * @brief return current stride of tensor. - * @retval int[MAXDIM] strides - */ - const std::array getStrides() const noexcept { - return strides; - } - /** - * @brief Get linear index given the n-d index - */ - inline size_t getIndex(unsigned int b, unsigned int c, unsigned int h, - unsigned int w) const noexcept { - if (getFormat() == Tformat::NCHW) { - if (dim.getStorageOrder() == TStorageOrder::ROW_MAJOR) { - return (b * strides[0] + c * strides[1] + h * strides[2] + - w * strides[3]); - } else { - return b * dim[1] * dim[2] * dim[3] + c * dim[2] * dim[3] + h + - w * dim[2]; - } - - } else { - return (b * strides[0] + h * strides[1] + w * strides[2] + - c * strides[3]); - } - } - - /** - * @brief Check if two given axes are contiguous - */ - bool checkContinuous(unsigned int n, unsigned int np1) const { - std::vector continuous_order_nhwc = {0, 3, 1, 2}; - bool continuous = false; - if (getFormat() == Tformat::NHWC) { - if (continuous_order_nhwc[np1] == continuous_order_nhwc[n] + 1) - continuous = true; - } else { - if (n + 1 == np1) - continuous = true; - } - return continuous; - } - - /** - * @brief Get name of the tensor - * - * @return name of the tensor - */ - void setName(const std::string &name_) { name = name_; } - - /** - * @brief Get name of the tensor - * - * @return name of the tensor + * @brief return a copy of the Tensor Dim + * @retval TensorDim */ - const std::string &getName() const { return name; } + TensorDim getDim() const; /** - * @brief Set the memory buffer for the tensor - * - * @param buf the memory buffer - * @param init intialize the buffer + * @brief return Tensor Type */ - void setData(const std::shared_ptr buf, size_t off = 0, - bool init = false) { - if (buf) { - data = buf; - offset = off; - if (init) - initialize(); - } else { - data = nullptr; - offset = 0; - } - } + TensorDim::TensorType getTensorType() const; /** * @brief Get initializer for the tensor * * @return initializer of the tensor */ - Tensor::Initializer getInitializer() const { return initializer; } + Initializer getInitializer() const; /** * @brief Get format for the tensor - * * @return format of the tensor */ - TensorDim::Format getFormat() const { return dim.getFormat(); } + TensorDim::Format getFormat() const; /** * @brief Get data type for the tensor * * @return data type of the tensor */ - Tdatatype getDataType() const { return dim.getDataType(); } + Tdatatype getDataType() const; /** - * @brief Set fp32 scale factors of the tensor - * @param[in] scales fp32 scale factors + * @brief update batch size for this tensor + * @param batch size + * @note The batchsize of src_tensor need not be related with this + * tensor's batch size + * + * @note The memory for this tensor will re-allocated/re-assigned if the + * updated batch size is different than the current batch size. 
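// Sketch of the transpose() overloads documented above. In nntrainer the
// direction string orders the last three axes (channel:height:width), so
// "0:2:1" swaps height and width while the batch axis stays in place; that
// axis interpretation is an assumption of this sketch, not stated by the diff.
#include <string>
#include <tensor.h>

void transpose_sketch(const nntrainer::Tensor &in) {
  // Allocate-and-return overload.
  nntrainer::Tensor swapped = in.transpose("0:2:1");

  // Write-into overload: out is reshaped to the transposed dimensions and a
  // reference to it is returned.
  nntrainer::Tensor out(in.getDim());
  in.transpose("0:2:1", out);
}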
+ * + * @note If this tensor is/was the src_tensor for some other, then + * reduction in batch size can make the dependent tensors allocate fail due to + * memory smaller. Caller must handle this in their own end. + * + * @note If this tensor is re-allocated, then the memory might not be + * immediately freed as the tensor already depending on this tensor also + * share the same memory. So, the peak memory consumption in worst case can + * reach the total memory requirements of a model with old batchsize and the + * new batch size. It is recommended to first deallocate all the tensors, + * updateBatch and then allocate again to avoid such issues. */ - void setScaleFactors(std::vector scales) { - if (scales.empty()) { - throw std::invalid_argument("Error: invalid parameter"); - } - - scale_factors_fp32 = scales; - } + void updateBatch(unsigned int batch); /** - * @brief Get scale factors of the tensor - * - * @return scale factors of the tensor + * @brief return whether tensor is contiguous or not. + * @retval bool contiguous */ - std::vector getScaleFactors() const; + const bool getContiguous() const noexcept; /** - * @brief Set output axis of the tensor - * @param[in] zp zero points + * @brief return current stride of tensor. + * @retval int[MAXDIM] strides */ - void setZeroPoints(std::vector zp); + const std::array getStrides() const noexcept; -#ifdef ENABLE_FP16 /** - * @brief Set fp16 scale factors of the tensor - * @param[in] scales fp16 scale factors + * @brief Check if two given axes are contiguous + * @param[in] np1 first axis + * @param[in] np2 second axis to compare with first axis + * @retval bool continuous */ - void setScaleFactorsFP16(std::vector<_FP16> scales) { - if (scales.empty()) { - throw std::invalid_argument("Error: invalid parameter"); - } - - scale_factors_fp16 = scales; - } -#endif + bool checkContinuous(unsigned int np1, unsigned int np2) const; /** - * @brief Get zero points of the tensor - * - * @return zero points of the tensor + * @brief Set name of the tensor + * @param[in] name_ tensor name */ - std::vector getZeroPoints() const; + void setName(const std::string &name_); /** - * @brief Dequantize Tensor to output tensor datatype - * @param[out] output Tensor to store the result + * @brief Get name of the tensor + * @retval string name */ - void dequantize(Tensor &output, unsigned int axis) const; - - static constexpr float epsilon = 1e-5; - -private: - /**< handle the data as a std::shared_ptr type */ - TensorDim dim; - std::array strides; - bool contiguous; - Tensor::Initializer initializer; - std::string name; /**< name of the tensor */ - std::shared_ptr data; - size_t offset; - std::vector scale_factors_fp32; -#ifdef ENABLE_FP16 - std::vector<_FP16> scale_factors_fp16; -#endif - std::vector zero_points; + const std::string &getName() const; - /**< - * When using shared_data with tensor, this stores the ptr of the source - * tensor which handles the full memory. If tensor data is already allocated, - * this does not affect the tensor. 
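// Sketch of the re-batching pattern recommended in the updateBatch() note
// above: drop the buffers first so that old and new allocations are never
// alive at the same time, then update the dimension and re-allocate. The
// vector of tensors and the new batch size are illustrative.
#include <vector>
#include <tensor.h>

void rebatch_sketch(std::vector<nntrainer::Tensor> &tensors,
                    unsigned int new_batch) {
  for (auto &t : tensors)
    t.deallocate();           // free the current memory
  for (auto &t : tensors)
    t.updateBatch(new_batch); // only the TensorDim changes here
  for (auto &t : tensors)
    t.allocate();             // acquire memory at the new batch size
}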
If the tensor data is not allocated, and - * src_ptr is valid, this tensor will use the memory allocated by the src_ptr + /** + * @brief Get linear index given the n-d index + */ + size_t getIndex(unsigned int b, unsigned int c, unsigned int h, + unsigned int w) const noexcept; + /** + * @brief Get size of current tensor + * @retval unsigned int size of the current tensor */ - std::shared_ptr src_tensor; + size_t size() const; - struct BroadcastInfo; + /** + * @brief Get if the tensor is empty + * @retval true if the tensor is empty + */ + bool empty() const; /** - * @brief Applies the given operator to the tensor with the passed argument - * @param[in] m Tensor - * @param[in] v_func vectorized function to apply - * @param e broadcast info. - * @param cur_axis current axis. pass default when calling outside. - * @param offset offset for this. pass default when calling outside. - * @param m_offset offset for m. pass default when calling outside. - * @retval #ML_ERROR_NONE Successful - * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter + * @brief Get size of the data in bytes + * @retval size_t Size in bytes */ - void - apply_broadcast_util(Tensor const &m, - std::function - v_func, - Tensor &output, const BroadcastInfo &e, - int cur_axis = -1, size_t offset = 0, - size_t m_offset = 0) const; + size_t bytes() const; /** - * @brief Applies the given operator to the tensor with the passed argument - * - * @param[in] m Tensor - * @param[in] v_func vectorized function to apply - * @retval #ML_ERROR_NONE Successful - * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter + * @brief return Tensor batch size + * @retval batch size */ - void apply_broadcast(Tensor const &m, - std::function - v_func, - Tensor &output) const; -#ifdef ENABLE_FP16 + size_t batch() const; + /** - * @brief Applies the given operator to the tensor with the passed argument - * @param[in] m Tensor - * @param[in] v_func vectorized function to apply - * @param e broadcast info. - * @param cur_axis current axis. pass default when calling outside. - * @param offset offset for this. pass default when calling outside. - * @param m_offset offset for m. pass default when calling outside. - * @retval #ML_ERROR_NONE Successful - * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter + * @brief return Tensor channel size + * @retval channel size */ - void - apply_broadcast_util(Tensor const &m, - std::function - v_func, - Tensor &output, const BroadcastInfo &e, - int cur_axis = -1, size_t offset = 0, - size_t m_offset = 0) const; + size_t channel() const; + /** - * @brief Applies the given operator to the tensor with the passed argument - * - * @param[in] m Tensor - * @param[in] v_func vectorized function to apply - * @retval #ML_ERROR_NONE Successful - * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter + * @brief return Tensor height size + * @retval height size */ - void apply_broadcast(Tensor const &m, - std::function - v_func, - Tensor &output) const; -#endif + size_t height() const; + /** - * @brief compute Loop info for broadcasting and vectorization - * - * @param m target tensor to be calculated against. 
- * @return BroadcastInfo Loopinfo needed to run external loop + * @brief return Tensor width size + * @retval width size */ - BroadcastInfo computeBroadcastInfo(const Tensor &m) const; + size_t width() const; /** - * @brief copy a buffer to @a this, the caller has to ensure that @a this is - * initialized otherwise undefined behavior + * @brief Merge the given two axis for tensor at second axis inplace * - * @param buf buffer to copy from + * @param axis1 first axis to merge + * @param axis2 second axis to merge */ - void copy(const void *buf); + void mergeAxis(unsigned int axis1, unsigned int axis2); /** * @brief Update destination tensor to share memory with source tensor @@ -2167,57 +1464,52 @@ class Tensor { * @note New size added with offset must be less than the size of the original * tensor. */ - static void createSharedDataTensor(const Tensor &src, Tensor &dest, - size_t offset); - - /** - * @brief Reallocate memory for this tensor - * @note This will not necessary free the memory as tensors share memory - * @note This can increase the peak memory consumption when callled on all - * the tensors of a model sequentially. It is advised to first deallocate all - * the tensors and then allocate, than reallocate tensors one by one. - */ - void reallocate() { - deallocate(); - allocate(); - } + void createSharedDataTensor(const Tensor &src, Tensor &dest, + size_t offset) const; /** - * @brief Merge the given two axis for tensor at second axis inplace + * @brief Get new tensor which shares memory with current tensor but different + * shape * - * @param axis1 first axis to merge - * @param axis2 second axis to merge + * @param dim new dimension to be set for this tensor + * @param offset offset to be used from the start of the data in elements + * @note The new tensor will share the same data as the current tensor but + * can have different size. + * @note New size added with offset must be less than the size of the original + * tensor. 
*/ - void mergeAxis(unsigned int axis1, unsigned int axis2); + Tensor getSharedDataTensor(const TensorDim dim_, size_t offset, + bool reset_stride = true, + const std::string &name_ = "") const; /** - * @brief rotate 180 dgree - * @param[in] in input Tensor - * @retVal Tensor rotated tensor (180 degree) + * @brief Swaps Tensor lhs and rhs + * @param[in] lhs Tensor to be swapped + * @param[in] rhs Tensor to be swapped */ - Tensor rotate_180(Tensor in); + friend void swap(Tensor &lhs, Tensor &rhs) noexcept { + std::swap(lhs.itensor, rhs.itensor); + } - /** - * @brief Encode two int4 values to one int8 value - * @param[in] high value for first 4 bits - * @param[in] low value for last 4 bits - * @retval Encoded value - */ - uint8_t encode_qint(uint8_t high, uint8_t low) const; + static constexpr float epsilon = 1e-5; + +private: + std::shared_ptr itensor; /** - * @brief Decode int8 value to a int4 value - * @param[in] idx index to retrieve value - * @retval Decoded value + * @brief Set tensor variables + * + * @param[in] d TensorDim + * @param[in] buf buffer + * @param[in] offset offset to be used */ - uint8_t decode_qint(uint8_t val, bool isHigh) const; - -}; // namespace nntrainer + void setTensorVar(TensorDim d, void *buf, size_t offset); +}; /** * @brief Overriding output stream */ -std::ostream &operator<<(std::ostream &out, Tensor const &m); +std::ostream &operator<<(std::ostream &out, Tensor const &input); typedef std::shared_ptr sharedTensor; @@ -2227,7 +1519,7 @@ typedef std::vector sharedConstTensors; typedef std::vector sharedTensors; -} /* namespace nntrainer */ +} // namespace nntrainer #endif /* __cplusplus */ #endif /* __TENSOR_H__ */ diff --git a/nntrainer/tensor/tensor_base.cpp b/nntrainer/tensor/tensor_base.cpp index b2bcfd444e..ed34654d04 100644 --- a/nntrainer/tensor/tensor_base.cpp +++ b/nntrainer/tensor/tensor_base.cpp @@ -9,8 +9,8 @@ * @bug No known bugs except for NYI items */ +#include #include -#include namespace nntrainer { @@ -176,14 +176,14 @@ void TensorBase::getSharedDataTensor(const TensorDim dim_, size_t offset, createSharedDataTensor(this, ret, offset); } -TensorBase::BroadcastInfoV2 -TensorBase::computeBroadcastInfo(const TensorV2 &m) const { +TensorBase::BroadcastInfo +TensorBase::computeBroadcastInfo(const Tensor &m) const { if (m.size() > this->size()) throw exception::not_supported("broadcasting *this is not supported"); const TensorDim m_dim = m.getDim(); - BroadcastInfoV2 e; + BroadcastInfo e; e.tensor_type = getTensorType(); uint continuity[4] = {0, 1, 2, 3}; @@ -255,7 +255,7 @@ TensorBase::computeBroadcastInfo(const TensorV2 &m) const { } void TensorBase::calculateFlattenDot( - TensorV2 const &input, TensorV2 &output, bool trans, bool trans_in, + Tensor const &input, Tensor &output, bool trans, bool trans_in, unsigned int &first_three_flat, unsigned int &last_axis, unsigned int &input_first_three_flat, unsigned int &input_last_axis, unsigned int &M, unsigned int &N, unsigned int &K, unsigned int &lda, @@ -285,11 +285,11 @@ void TensorBase::calculateFlattenDot( N = input_last_axis; M = first_three_flat; if (getFormat() == Tformat::NHWC) { - CREATE_V2_IF_EMPTY_DIMS(output, batch(), N, height(), width(), - getTensorType()); // NHWC Result Tensor + CREATE_IF_EMPTY_DIMS(output, batch(), N, height(), width(), + getTensorType()); // NHWC Result Tensor } else { - CREATE_V2_IF_EMPTY_DIMS(output, batch(), channel(), height(), N, - getTensorType()); + CREATE_IF_EMPTY_DIMS(output, batch(), channel(), height(), N, + getTensorType()); } // We are not set zero the 
output because of performance reason. @@ -305,11 +305,11 @@ void TensorBase::calculateFlattenDot( N = input_first_three_flat; M = first_three_flat; if (getFormat() == Tformat::NHWC) { - CREATE_V2_IF_EMPTY_DIMS(output, batch(), N, height(), width(), - getTensorType()); + CREATE_IF_EMPTY_DIMS(output, batch(), N, height(), width(), + getTensorType()); } else { - CREATE_V2_IF_EMPTY_DIMS(output, batch(), channel(), height(), N, - getTensorType()); + CREATE_IF_EMPTY_DIMS(output, batch(), channel(), height(), N, + getTensorType()); } } else if (trans && !trans_in) { if (first_three_flat != input_first_three_flat) @@ -319,9 +319,9 @@ void TensorBase::calculateFlattenDot( N = input_last_axis; M = last_axis; if (getFormat() == Tformat::NHWC) { - CREATE_V2_IF_EMPTY_DIMS(output, 1, N, M, 1, getTensorType()); + CREATE_IF_EMPTY_DIMS(output, 1, N, M, 1, getTensorType()); } else { - CREATE_V2_IF_EMPTY_DIMS(output, 1, 1, M, N, getTensorType()); + CREATE_IF_EMPTY_DIMS(output, 1, 1, M, N, getTensorType()); } } else { if (first_three_flat != input_last_axis) @@ -331,9 +331,9 @@ void TensorBase::calculateFlattenDot( N = input_first_three_flat; M = last_axis; if (getFormat() == Tformat::NHWC) { - CREATE_V2_IF_EMPTY_DIMS(output, 1, N, M, 1, getTensorType()); + CREATE_IF_EMPTY_DIMS(output, 1, N, M, 1, getTensorType()); } else { - CREATE_V2_IF_EMPTY_DIMS(output, 1, 1, M, N, getTensorType()); + CREATE_IF_EMPTY_DIMS(output, 1, 1, M, N, getTensorType()); } } diff --git a/nntrainer/tensor/tensor_base.h b/nntrainer/tensor/tensor_base.h index 5a18a7a1e7..c3b4bfb875 100644 --- a/nntrainer/tensor/tensor_base.h +++ b/nntrainer/tensor/tensor_base.h @@ -72,7 +72,7 @@ enum class Initializer { NONE /** No initialization */ }; -class TensorV2; +class Tensor; class SrcSharedTensorBase; /** @@ -114,6 +114,21 @@ class TensorBase { TensorBase(const TensorDim &d, const void *buf = nullptr) : TensorBase(d, true) {} + /** + * @brief Copy constructor of TensorBase. 
+ * @param[in] Tensor & + */ + TensorBase(const TensorBase &rhs) { + dim = rhs.dim; + strides = rhs.strides; + contiguous = rhs.contiguous; + initializer = rhs.initializer; + name = rhs.name; + data = rhs.data; + offset = rhs.offset; + src_tensor = rhs.src_tensor; + } + /** * @brief Comparison operator overload * @param[in] rhs Tensor to be compared with @@ -129,7 +144,7 @@ class TensorBase { bool operator!=(const TensorBase &rhs) const { return !(*this == rhs); } /** - * @copydoc TensorV2::setTensorVar(TensorDim d, void *buf, size_t offset) + * @copydoc Tensor::setTensorVar(TensorDim d, void *buf, size_t offset) */ void setTensorVar(TensorDim d, void *buf, size_t offset); @@ -139,27 +154,27 @@ class TensorBase { virtual ~TensorBase() {} /** - * @copydoc TensorV2::allocate() + * @copydoc Tensor::allocate() */ virtual void allocate() = 0; /** - * @copydoc TensorV2::deallocate() + * @copydoc Tensor::deallocate() */ virtual void deallocate() = 0; /** - * @copydoc TensorV2::isAllocated() + * @copydoc Tensor::isAllocated() */ bool isAllocated() { return data != nullptr; } /** - * @copydoc TensorV2::getData() + * @copydoc Tensor::getData() */ virtual void *getData() const = 0; /** - * @copydoc TensorV2::getData(size_t idx) + * @copydoc Tensor::getData(size_t idx) */ virtual void *getData(size_t idx) const = 0; @@ -176,143 +191,155 @@ class TensorBase { virtual const void *getAddress(unsigned int i) const = 0; /** - * @copydoc TensorV2::setValue(float value) + * @copydoc Tensor::setValue(float value) */ virtual void setValue(float value) = 0; /** - * @copydoc TensorV2::setValue(b, c, h, w, value) + * @copydoc Tensor::setValue(b, c, h, w, value) */ virtual void setValue(unsigned int b, unsigned int c, unsigned int h, unsigned int w, float value) = 0; /** - * @copydoc TensorV2::addValue() + * @copydoc Tensor::addValue() */ virtual void addValue(unsigned int b, unsigned int c, unsigned int h, unsigned int w, float value, float beta) = 0; /** - * @copydoc TensorV2::setZero() + * @copydoc Tensor::setZero() */ virtual void setZero() = 0; /** - * @copydoc TensorV2::setRandNormal() + * @copydoc Tensor::setRandNormal() */ virtual void setRandNormal(float mean, float stddev) = 0; /** - * @copydoc TensorV2::setRandBernoulli() + * @copydoc Tensor::setRandBernoulli() */ virtual void setRandUniform(float min, float max) = 0; /** - * @copydoc TensorV2::setRandBernoulli() + * @copydoc Tensor::setRandBernoulli() */ virtual void setRandBernoulli(float probability) = 0; /** - * @copydoc TensorV2::initialize() + * @copydoc Tensor::initialize() */ virtual void initialize() = 0; /** - * @copydoc TensorV2::initialize(Initializer init) + * @copydoc Tensor::initialize(Initializer init) */ virtual void initialize(Initializer init) = 0; /** - * @copydoc TensorV2::multiply_strided(TensorV2 const &m, TensorV2 &output, + * @copydoc Tensor::multiply_strided(Tensor const &m, Tensor &output, * const float beta) */ - virtual TensorV2 multiply_strided(TensorV2 const &m, TensorV2 &output, - const float beta) const = 0; + virtual Tensor multiply_strided(Tensor const &m, Tensor &output, + const float beta) const = 0; /** - * @copydoc TensorV2::multiply_i(float const &value) + * @copydoc Tensor::multiply_i(float const &value) */ virtual int multiply_i(float const &value) = 0; /** - * @copydoc TensorV2::multiply(float const &value, TensorV2 &out) + * @copydoc Tensor::multiply(float const &value, Tensor &out) */ - virtual TensorV2 &multiply(float const &value, TensorV2 &out) const = 0; + virtual Tensor &multiply(float const &value, 
Tensor &out) const = 0; /** - * @copydoc TensorV2::multiply(TensorV2 const &m, TensorV2 &output, const + * @copydoc Tensor::multiply(Tensor const &m, Tensor &output, const * float beta = 0.0) */ - virtual TensorV2 &multiply(TensorV2 const &m, TensorV2 &output, - const float beta = 0.0) const = 0; + virtual Tensor &multiply(Tensor const &m, Tensor &output, + const float beta = 0.0) const = 0; /** - * @copydoc TensorV2::divide(float const &value, TensorV2 &output) + * @copydoc Tensor::divide(float const &value, Tensor &output) */ - virtual TensorV2 ÷(float const &value, TensorV2 &output) const = 0; + virtual Tensor ÷(float const &value, Tensor &output) const = 0; /** - * @copydoc TensorV2::divide(TensorV2 const &m, TensorV2 &output) + * @copydoc Tensor::divide(Tensor const &m, Tensor &output) */ - virtual TensorV2 ÷(TensorV2 const &m, TensorV2 &output) const = 0; + virtual Tensor ÷(Tensor const &m, Tensor &output) const = 0; /** - * @copydoc TensorV2::add_strided(TensorV2 const &input, TensorV2 &output, + * @copydoc Tensor::add_strided(Tensor const &input, Tensor &output, * const float beta) */ - virtual TensorV2 &add_strided(TensorV2 const &input, TensorV2 &output, - const float beta) const = 0; + virtual Tensor &add_strided(Tensor const &input, Tensor &output, + const float beta) const = 0; /** - * @copydoc TensorV2::add(float const &value, TensorV2 &output) + * @copydoc Tensor::add_i(Tensor const &m, float const alpha) */ - virtual TensorV2 &add(float const &value, TensorV2 &output) const = 0; + virtual int add_i(Tensor const &m, Tensor &output, float const alpha) = 0; /** - * @copydoc TensorV2::add(TensorV2 const &m, TensorV2 &output, float const + * @copydoc Tensor::add_i_partial() + */ + virtual int add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m, + unsigned int incX, unsigned int incY, + const Tensor alphas, unsigned int alpha_idx) = 0; + + /** + * @copydoc Tensor::add(float const &value, Tensor &output) + */ + virtual Tensor &add(float const &value, Tensor &output) const = 0; + + /** + * @copydoc Tensor::add(Tensor const &m, Tensor &output, float const * alpha) */ - virtual TensorV2 &add(TensorV2 const &m, TensorV2 &output, - float const alpha) const = 0; + virtual Tensor &add(Tensor const &m, Tensor &output, + float const alpha) const = 0; /** - * @copydoc TensorV2::subtract(float const &value, TensorV2 &output) + * @copydoc Tensor::subtract(float const &value, Tensor &output) */ - virtual TensorV2 &subtract(float const &value, TensorV2 &output) const = 0; + virtual Tensor &subtract(float const &value, Tensor &output) const = 0; /** * @brief Sum all the Tensor elements according to the batch * @param[out] output Tensor(batch, 1, 1, 1) */ - virtual void sum_by_batch(TensorV2 &output) const = 0; + virtual void sum_by_batch(Tensor &output) const = 0; /** - * @copydoc TensorV2::sum(unsigned int axis, TensorV2 &output, float alpha, + * @copydoc Tensor::sum(unsigned int axis, Tensor &output, float alpha, * float beta) const */ - virtual TensorV2 &sum(unsigned int axis, TensorV2 &output, float alpha, - float beta) const = 0; + virtual Tensor &sum(unsigned int axis, Tensor &output, float alpha, + float beta) const = 0; /** - * @copydoc TensorV2::l2norm + * @copydoc Tensor::l2norm */ virtual float l2norm() const = 0; /** - * @copydoc TensorV2::pow(float exponent, TensorV2 &output) + * @copydoc Tensor::pow(float exponent, Tensor &output) */ - virtual TensorV2 &pow(float exponent, TensorV2 &output) const = 0; + virtual Tensor &pow(float exponent, Tensor &output) const = 0; /** 
- * @copydoc TensorV2::erf(TensorV2 &output) + * @copydoc Tensor::erf(Tensor &output) */ - virtual TensorV2 &erf(TensorV2 &output) const = 0; + virtual Tensor &erf(Tensor &output) const = 0; /** * @brief sin transform function * @param[out] out out to store the result */ - virtual void sin(TensorV2 &out, float alpha = 1.0) { + virtual void sin(Tensor &out, float alpha = 1.0) { throw std::invalid_argument( "Tensor::sin not supported in current tensor data type."); } @@ -321,11 +348,20 @@ class TensorBase { * @brief cos transform function * @param[out] out out to store the result */ - virtual void cos(TensorV2 &out, float alpha = 1.0) { + virtual void cos(Tensor &out, float alpha = 1.0) { throw std::invalid_argument( "Tensor::cos not supported in current tensor data type."); } + /** + * @brief inverse squared root function + * @param[out] out out to store the result + */ + virtual void inv_sqrt(Tensor &out) { + throw std::invalid_argument( + "Tensor::inv_sqrt not supported in current tensor data type."); + } + /** * @brief Dot Product of Tensor ( equal MxM ) * @details This applies dot of the last dimension of this and @@ -337,48 +373,51 @@ class TensorBase { * @param[in] beta beta * @retval Calculated Tensor */ - virtual TensorV2 &dot(TensorV2 const &input, TensorV2 &output, bool trans, - bool trans_in, float beta) const = 0; + virtual Tensor &dot(Tensor const &input, Tensor &output, bool trans, + bool trans_in, float beta) const = 0; /** - * @copydoc TensorV2::dropout_mask(float dropout) + * @copydoc Tensor::dropout_mask(float dropout) */ virtual void dropout_mask(float dropout) = 0; /** - * @copydoc TensorV2::filter_mask(const TensorV2 &mask_len, bool reverse) + * @copydoc Tensor::filter_mask(const Tensor &mask_len, bool reverse) */ - virtual void filter_mask(const TensorV2 &mask_len, bool reverse) = 0; + virtual void filter_mask(const Tensor &mask_len, bool reverse) = 0; /** - * @copydoc TensorV2::zoneout_mask(TensorV2 &opposite, float zoneout) + * @copydoc Tensor::zoneout_mask(Tensor &opposite, float zoneout) */ - virtual void zoneout_mask(TensorV2 &opposite, float zoneout) = 0; + virtual void zoneout_mask(Tensor &opposite, float zoneout) = 0; /** - * @copydoc TensorV2::split(std::vector sizes, int axis) + * @copydoc Tensor::split(std::vector sizes, int axis) */ - virtual std::vector split(std::vector sizes, int axis) = 0; + virtual std::vector split(std::vector sizes, int axis) = 0; /** - * @copydoc TensorV2::print(std::ostream &out) + * @copydoc Tensor::concat(const std::vector &tensors, int axis) + */ + virtual Tensor concat(const std::vector &tensors, int axis) = 0; + + /** + * @copydoc Tensor::print(std::ostream &out) */ virtual void print(std::ostream &out) const = 0; /** - * @copydoc TensorV2::apply(std::function f, TensorV2 &output) + * @copydoc Tensor::apply(std::function f, Tensor &output) */ - virtual TensorV2 &apply(std::function f, - TensorV2 &output) const { + virtual Tensor &apply(std::function f, Tensor &output) const { return output; } #ifdef ENABLE_FP16 /** - * @copydoc TensorV2::apply(std::function f, TensorV2 &output) + * @copydoc Tensor::apply(std::function f, Tensor &output) */ - virtual TensorV2 &apply(std::function<_FP16(_FP16)> f, - TensorV2 &output) const { + virtual Tensor &apply(std::function<_FP16(_FP16)> f, Tensor &output) const { return output; } #endif @@ -389,39 +428,46 @@ class TensorBase { * * @note copy can reshape the tensor to match the shape */ - virtual void copy(const TensorV2 &from) = 0; + virtual void copy(const Tensor &from) = 0; /** * @brief 
Copy the Tensor * @param[in] from Tensor to be copied */ - virtual void copyData(const TensorV2 &from) = 0; + virtual void copyData(const Tensor &from) = 0; + + /** + * @brief Copy the Tensor + * @param[in] input Tensor to be copied + * @param[out] output output Tensor + */ + virtual void copy_with_stride(const Tensor &input, Tensor &output) = 0; /** - * @copydoc TensorV2::argmax() + * @copydoc Tensor::argmax() */ virtual std::vector argmax() const = 0; /** - * @copydoc TensorV2::max_abs() + * @copydoc Tensor::max_abs() */ virtual float max_abs() const = 0; /** - * @copydoc TensorV2::maxValue() + * @copydoc Tensor::maxValue() */ virtual float maxValue() const = 0; /** - * @copydoc TensorV2::minValue() + * @copydoc Tensor::minValue() */ virtual float minValue() const = 0; /** - * @copydoc TensorV2::transpose(const std::string &direction, TensorV2 &out) + * @copydoc Tensor::transpose(const std::string &direction, Tensor &out) */ - virtual TensorV2 &transpose(const std::string &direction, - TensorV2 &out) const = 0; + virtual Tensor &transpose(const std::string &direction, + Tensor &out) const = 0; /** * @brief put data of Tensor @@ -634,12 +680,12 @@ class TensorBase { * @note This should better be implemented in iterator fashion before used * extensively. */ - struct BroadcastInfoV2 { + struct BroadcastInfo { /** * @brief Construct a new External Loop Info object */ - BroadcastInfoV2() : + BroadcastInfo() : buffer_size(0), buffer_axis(-1), strides{0, 0, 0, 0}, @@ -659,7 +705,7 @@ class TensorBase { * @param m target tensor to be calculated against. * @return BroadcastInfo Loopinfo needed to run external loop */ - BroadcastInfoV2 computeBroadcastInfo(const TensorV2 &m) const; + BroadcastInfo computeBroadcastInfo(const Tensor &m) const; /** * @brief Calcuates variables needed to perform tensor flatten dot product @@ -681,7 +727,7 @@ class TensorBase { * * @note op(X) is one of X or X**T */ - void calculateFlattenDot(TensorV2 const &input, TensorV2 &output, bool trans, + void calculateFlattenDot(Tensor const &input, Tensor &output, bool trans, bool trans_in, unsigned int &first_three_flat, unsigned int &last_axis, unsigned int &input_first_three_flat, diff --git a/nntrainer/tensor/tensor_pool.cpp b/nntrainer/tensor/tensor_pool.cpp index d41e293793..0a69f1dce9 100644 --- a/nntrainer/tensor/tensor_pool.cpp +++ b/nntrainer/tensor/tensor_pool.cpp @@ -32,8 +32,7 @@ namespace nntrainer { */ Tensor *TensorPool::request(const std::string &name, const TensorDim &dim, const std::vector &exec_order, - TensorLifespan lifespan, - const Tensor::Initializer &init, + TensorLifespan lifespan, const Initializer &init, bool is_weight_grad) { return registerRequestSpec( {is_weight_grad, std::make_unique(dim, false, init, name), @@ -101,8 +100,7 @@ Tensor *TensorPool::view(const std::string &name, const std::string &reference, /** @note default is_weight_grad for view is false. view is for the * activation. 
*/ return registerRequestSpec( - {false, - std::make_unique(dim, false, Tensor::Initializer::NONE, name), + {false, std::make_unique(dim, false, Initializer::NONE, name), TensorPool::DependentDetails{parent_idx, adjusted_offset}}); } @@ -365,7 +363,7 @@ Tensor *TensorPool::requestOrExtend(const std::string &name, const TensorDim &dim, const std::vector &exec_order, TensorLifespan lifespan, - const Tensor::Initializer &init) { + const Initializer &init) { NNTR_THROW_IF(lifespan == TensorLifespan::UNMANAGED, std::invalid_argument) << "unmanaged life span is not supported"; diff --git a/nntrainer/tensor/tensor_pool.h b/nntrainer/tensor/tensor_pool.h index 7ff49d790c..fd17db7cd2 100644 --- a/nntrainer/tensor/tensor_pool.h +++ b/nntrainer/tensor/tensor_pool.h @@ -43,8 +43,7 @@ class TensorPool { * @brief Constructor of TensorPool */ TensorPool() : - mem_pool(std::make_unique()), - cache_loader(nullptr) {} + mem_pool(std::make_unique()), cache_loader(nullptr) {} /** * @brief Constructor of TensorPool @@ -179,7 +178,7 @@ class TensorPool { Tensor *request(const std::string &name, const TensorDim &dim, const std::vector &exec_order, TensorLifespan lifespan, - const Tensor::Initializer &init = Tensor::Initializer::NONE, + const Initializer &init = Initializer::NONE, bool is_weight_grad = false); /** @@ -237,11 +236,10 @@ class TensorPool { * @return Tensor* ptr to either to the existing tensor or newly created * tensor */ - Tensor * - requestOrExtend(const std::string &name, const TensorDim &dim, - const std::vector &exec_order, - TensorLifespan lifespan, - const Tensor::Initializer &init = Tensor::Initializer::NONE); + Tensor *requestOrExtend(const std::string &name, const TensorDim &dim, + const std::vector &exec_order, + TensorLifespan lifespan, + const Initializer &init = Initializer::NONE); /** * @brief reidentify the source of already created tensor (or view). diff --git a/nntrainer/tensor/tensor_v2.cpp b/nntrainer/tensor/tensor_v2.cpp deleted file mode 100644 index 28cc2b1b67..0000000000 --- a/nntrainer/tensor/tensor_v2.cpp +++ /dev/null @@ -1,1082 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -/** - * @file tensor_v2.cpp - * @date 01 December 2023 - * @brief This is a TensorV2 class - * @see https://github.com/nnstreamer/nntrainer - * @author Jijoong Moon - * @author Donghyeon Jeong - * @bug No known bugs except for NYI items - */ - -#include -#include - -#ifdef ENABLE_FP16 -#include -#endif - -namespace nntrainer { - -TensorV2::TensorV2(std::string name_, Tformat fm, Tdatatype d_type) { - itensor = nullptr; - - if (d_type == Tdatatype::FP32) { - itensor = std::shared_ptr(new FloatTensor(name_, fm), - std::default_delete()); - } else if (d_type == Tdatatype::FP16) { -#ifdef ENABLE_FP16 - itensor = std::shared_ptr(new HalfTensor(name_, fm), - std::default_delete()); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } else { - throw std::invalid_argument( - "Error: TensorV2 cannot be constructed because the given d_type is not " - "compatible with itensor. 
The supported d_types are: FP32, FP16 " - "(if built with ENABLE_FP16)."); - } -} - -TensorV2::TensorV2(const TensorDim &d, bool alloc_now, Initializer init, - std::string name) { - itensor = nullptr; - - if (d.getDataType() == Tdatatype::FP32) { - itensor = - std::shared_ptr(new FloatTensor(d, alloc_now, init, name), - std::default_delete()); - } else if (d.getDataType() == Tdatatype::FP16) { -#ifdef ENABLE_FP16 - itensor = - std::shared_ptr(new HalfTensor(d, alloc_now, init, name), - std::default_delete()); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } else { - throw std::invalid_argument( - "Error: TensorV2 cannot be constructed because the given d_type is not " - "compatible with itensor. The supported d_types are: FP32, FP16 " - "(if built with ENABLE_FP16)."); - } -} - -TensorV2::TensorV2(const TensorDim &d, const void *buf) { - itensor = nullptr; - - if (d.getDataType() == Tdatatype::FP32) { - itensor = std::shared_ptr(new FloatTensor(d, buf), - std::default_delete()); - } else if (d.getDataType() == Tdatatype::FP16) { -#ifdef ENABLE_FP16 - itensor = std::shared_ptr(new HalfTensor(d, buf), - std::default_delete()); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } else { - throw std::invalid_argument( - "Error: TensorV2 cannot be constructed because the given d_type is not " - "compatible with itensor. The supported d_types are: FP32, FP16 " - "(if built with ENABLE_FP16)."); - } -} - -TensorV2::TensorV2( - std::vector>>> const &d, - ml::train::TensorDim::TensorType t_type) { - itensor = std::shared_ptr(new FloatTensor(d, t_type.format), - std::default_delete()); -} - -#ifdef ENABLE_FP16 -TensorV2::TensorV2( - std::vector>>> const &d, - ml::train::TensorDim::TensorType t_type) { - itensor = std::shared_ptr(new HalfTensor(d, t_type.format), - std::default_delete()); -} -#endif - -bool TensorV2::operator==(const TensorV2 &rhs) const { - /// compares tensor information - if (*itensor == *rhs.itensor) { - /// compares tensor data - if (getDataType() == Tdatatype::FP32) { - return *std::dynamic_pointer_cast(itensor) == - *std::dynamic_pointer_cast(rhs.itensor); - } else if (getDataType() == Tdatatype::FP16) { -#ifdef ENABLE_FP16 - return *std::dynamic_pointer_cast(itensor) == - *std::dynamic_pointer_cast(rhs.itensor); -#else - throw std::invalid_argument( - "Error: HalfTensor cannot be created or used when FP16 is not enabled. 
" - "Please check if the tensor data type is set properly."); -#endif - } - } - return false; -} - -void TensorV2::allocate() { itensor->allocate(); } - -void TensorV2::deallocate() { itensor->deallocate(); } - -bool TensorV2::isAllocated() { return itensor->isAllocated(); } - -void TensorV2::setValue(float value) { itensor->setValue(value); } - -void TensorV2::setValue(unsigned int b, unsigned int c, unsigned int h, - unsigned int w, float value) { - itensor->setValue(b, c, h, w, value); -} - -void TensorV2::addValue(unsigned int b, unsigned int c, unsigned int h, - unsigned int w, float value, float beta) noexcept { - itensor->addValue(b, c, h, w, value, beta); -} - -void TensorV2::setZero() { itensor->setZero(); } - -void TensorV2::setRandNormal(float mean, float stddev) { - itensor->setRandNormal(mean, stddev); -} - -void TensorV2::setRandUniform(float min, float max) { - itensor->setRandUniform(min, max); -} - -void TensorV2::setRandBernoulli(float probability) { - itensor->setRandBernoulli(probability); -} - -void TensorV2::initialize() { itensor->initialize(); } - -void TensorV2::initialize(Initializer init) { itensor->initialize(init); } - -TensorV2 TensorV2::apply(std::function f) const { - return f(*this); -} - -TensorV2 &TensorV2::apply(std::function f, - TensorV2 &output) const { - return f(*this, output); -} - -int TensorV2::multiply_i_strided(TensorV2 const &m, const float beta) { - try { - this->multiply_strided(m, *this, beta); - } catch (std::exception &err) { - ml_loge("%s %s", typeid(err).name(), err.what()); - return ML_ERROR_INVALID_PARAMETER; - } - - return ML_ERROR_NONE; -} - -TensorV2 TensorV2::multiply_strided(TensorV2 const &m, const float beta) const { - TensorV2 t; - return this->multiply_strided(m, t, beta); -} - -TensorV2 &TensorV2::multiply_strided(TensorV2 const &m, TensorV2 &output, - const float beta) const { - itensor->multiply_strided(m, output, beta); - return output; -} - -int TensorV2::multiply_i(float const &value) { - NNTR_THROW_IF(!getContiguous(), std::invalid_argument) - << getName() << " is not contiguous, cannot multiply"; - - return itensor->multiply_i(value); -} - -TensorV2 TensorV2::multiply(float const &value) const { - TensorV2 t; - return multiply(value, t); -} - -TensorV2 &TensorV2::multiply(float const &value, TensorV2 &out) const { - itensor->multiply(value, out); - return out; -} - -int TensorV2::multiply_i(TensorV2 const &m, const float beta) { - try { - this->multiply(m, *this, beta); - } catch (std::exception &err) { - ml_loge("%s %s", typeid(err).name(), err.what()); - return ML_ERROR_INVALID_PARAMETER; - } - - return ML_ERROR_NONE; -} - -TensorV2 TensorV2::multiply(TensorV2 const &m, const float beta) const { - TensorV2 t("", this->getFormat()); - return multiply(m, t, beta); -} - -TensorV2 &TensorV2::multiply(TensorV2 const &m, TensorV2 &output, - const float beta) const { - itensor->multiply(m, output, beta); - return output; -} - -int TensorV2::divide_i(float const &value) { - if (value == 0.0f) { - return ML_ERROR_INVALID_PARAMETER; - } - this->divide(value, *this); - return ML_ERROR_NONE; -} - -TensorV2 TensorV2::divide(float const &value) const { - TensorV2 output("", getFormat(), getDataType()); - return divide(value, output); -} - -TensorV2 &TensorV2::divide(float const &value, TensorV2 &output) const { - /// @todo add unittest, ZeroDivisionError - if (value == 0.0f) { - std::stringstream ss; - ss << "[Tensor] divide by value failed, value: " << value; - throw std::invalid_argument(ss.str().c_str()); - } - 
itensor->divide(value, output); - return output; -} - -int TensorV2::divide_i(TensorV2 const &m) { - try { - this->divide(m, *this); - } catch (std::exception &err) { - ml_loge("%s %s", typeid(err).name(), err.what()); - return ML_ERROR_INVALID_PARAMETER; - } - - return ML_ERROR_NONE; -} - -TensorV2 TensorV2::divide(TensorV2 const &m) const { - TensorV2 output("", getFormat(), getDataType()); - return this->divide(m, output); -} - -TensorV2 &TensorV2::divide(TensorV2 const &m, TensorV2 &output) const { - NNTR_THROW_IF(!getContiguous() || !m.getContiguous() || - !output.getContiguous(), - std::invalid_argument) - << getName() << " is not contiguous, cannot divide"; - itensor->divide(m, output); - return output; -} - -int TensorV2::add_i_strided(TensorV2 const &input, const float beta) { - try { - this->add_strided(input, *this, beta); - } catch (std::exception &err) { - ml_loge("%s %s", typeid(err).name(), err.what()); - return ML_ERROR_INVALID_PARAMETER; - } - - return ML_ERROR_NONE; -} - -TensorV2 TensorV2::add_strided(TensorV2 const &input, const float beta) const { - TensorV2 output("", getFormat(), getDataType()); - return this->add_strided(input, output, beta); -} - -TensorV2 &TensorV2::add_strided(TensorV2 const &input, TensorV2 &output, - const float beta) const { - CREATE_V2_IF_EMPTY_DIMS(output, getDim(), nullptr); - - if (size() != input.size() || size() != output.size()) - throw std::invalid_argument( - "Strided addition does not support broadcasting"); - - itensor->add_strided(input, output, beta); - - return output; -} - -int TensorV2::add_i(float const &value) { - this->add(value, *this); - return ML_ERROR_NONE; -} - -TensorV2 TensorV2::add(float const &value) const { - TensorV2 t("", getFormat(), getDataType()); - return add(value, t); -} - -TensorV2 &TensorV2::add(float const &value, TensorV2 &output) const { - itensor->add(value, output); - return output; -} - -int TensorV2::add_i(TensorV2 const &m, float const alpha) { - try { - this->add(m, *this, alpha); - } catch (std::exception &err) { - ml_loge("%s %s", typeid(err).name(), err.what()); - return ML_ERROR_INVALID_PARAMETER; - } - return ML_ERROR_NONE; -} - -TensorV2 TensorV2::add(TensorV2 const &m, float const alpha) const { - TensorV2 t("", getFormat(), getDataType()); - return this->add(m, t, alpha); -} - -TensorV2 &TensorV2::add(TensorV2 const &m, TensorV2 &output, - float const alpha) const { - NNTR_THROW_IF(!itensor->getContiguous() || !m.getContiguous() || - !output.getContiguous(), - std::invalid_argument) - << getName() << " is not contiguous, cannot add"; - itensor->add(m, output, alpha); - return output; -} - -int TensorV2::subtract_i(float const &value) { - this->subtract(value, *this); - return ML_ERROR_NONE; -} - -TensorV2 TensorV2::subtract(float const &value) const { - TensorV2 output("", getFormat(), getDataType()); - return subtract(value, output); -} - -TensorV2 &TensorV2::subtract(float const &value, TensorV2 &output) const { - itensor->subtract(value, output); - return output; -} - -int TensorV2::subtract_i(TensorV2 const &m) { return add_i(m, -1); } - -TensorV2 TensorV2::subtract(TensorV2 const &m) const { return add(m, -1); } - -TensorV2 &TensorV2::subtract(TensorV2 const &m, TensorV2 &output) const { - return add(m, output, -1); -} - -/** - * This is to sum the Tensor data according to the dim.batch(). - * Therefore the result has M(dim.batch(), 1, 1, 1) dimension. 
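// Illustrative sketch of the element-wise API implemented in this deleted
// hunk; it assumes the same methods live on the renamed nntrainer::Tensor
// with the signatures shown here. Scalars and shapes are arbitrary, and the
// in-place *_i variants report failure through an error code rather than by
// throwing.
#include <tensor.h>

void arithmetic_sketch(nntrainer::Tensor &a, const nntrainer::Tensor &b) {
  nntrainer::Tensor sum = a.add(b, 1.0f);       // sum = a + 1.0 * b
  nntrainer::Tensor prod = a.multiply(b, 0.0f); // prod = a * b
  nntrainer::Tensor quot = a.divide(b);         // element-wise a / b

  int status = a.multiply_i(0.5f);              // in-place scale of a
  (void)status;                                 // ML_ERROR_NONE on success
}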
- */ -TensorV2 TensorV2::sum_by_batch() const { - NNTR_THROW_IF(!getContiguous(), std::invalid_argument) - << getName() << " is not contiguous, cannot sum"; - - TensorV2 output(batch(), 1, 1, 1, this->getFormat(), getDataType()); - itensor->sum_by_batch(output); - return output; -} - -TensorV2 TensorV2::sum(unsigned int axis, float alpha) const { - TensorV2 output("", this->getFormat(), this->getDataType()); - return sum(axis, output, alpha, 0); -} - -TensorV2 &TensorV2::sum(unsigned int axis, TensorV2 &output, float alpha, - float beta) const { - NNTR_THROW_IF(!getContiguous(), std::invalid_argument) - << getName() << " is not contiguous, cannot sum"; - - itensor->sum(axis, output, alpha, beta); - return output; -} - -TensorV2 TensorV2::sum(const std::vector &axes, - float alpha) const { - TensorV2 output("", this->getFormat()); - return sum(axes, output, alpha); -} - -TensorV2 &TensorV2::sum(const std::vector &axes, TensorV2 &output, - float alpha) const { - if (axes.empty()) - throw std::invalid_argument("empty axes given"); - - if (axes.size() == 1) { - this->sum(axes[0], output, alpha); - } else { - - /** club axes together */ - TensorV2 new_reshaped = TensorV2(getDim()); - new_reshaped.copy(*this); - std::vector continuous_order = {0, 3, 1, 2}; - std::vector new_axes = {axes[0]}; - - for (unsigned int i = 1; i < axes.size(); ++i) { - if (checkContinuous(axes[i - 1], axes[i])) { - new_reshaped.mergeAxis(axes[i - 1], axes[i]); - new_axes.back() = axes[i]; - } else { - new_axes.push_back(axes[i]); - } - } - - TensorV2 ret = new_reshaped.sum(new_axes[0]); - for (unsigned int i = 1; i < new_axes.size() - 1; ++i) - ret = ret.sum(axes[i]); - ret.sum(new_axes.back(), output, alpha); - } - return output; -} - -TensorV2 TensorV2::average(unsigned int axis) const { - TensorV2 output("", this->getFormat(), this->getDataType()); - return average(axis, output); -} - -TensorV2 &TensorV2::average(unsigned int axis, TensorV2 &output) const { - if (axis >= TensorDim::MAXDIM) - throw std::out_of_range( - "negative axis or axis more then MAXDIM is invalid"); - - unsigned int axis_size = getDim()[axis]; - if (axis_size == 1) - output.copy(*this); - else - this->sum(axis, output, 1.0 / ((float)axis_size)); - - return output; -} - -TensorV2 TensorV2::average(const std::vector &axes) const { - TensorV2 output("", this->getFormat(), this->getDataType()); - return average(axes, output); -} - -TensorV2 &TensorV2::average(const std::vector &axes, - TensorV2 &output) const { - if (axes.empty()) - return this->average(output); - - TensorDim ret_shape(getTensorType()); - - for (const auto &idx : axes) { - if (idx >= TensorDim::MAXDIM) { - throw std::out_of_range("axis more then MAXDIM is invalid"); - } - ret_shape.setTensorDim(idx, getDim().getTensorDim(idx)); - } - - return this->sum(axes, output, 1.0 / (float)ret_shape.getDataLen()); -} - -TensorV2 TensorV2::average() const { - TensorV2 output = *this; - unsigned int axis = 0; - if (this->getFormat() == Tformat::NHWC) { - output.reshape({1, getDim().getDataLen(), 1, 1, this->getTensorType()}); - axis = 1; - } else { - output.reshape({1, 1, 1, getDim().getDataLen(), this->getTensorType()}); - axis = 3; - } - return output.average(axis); -} - -TensorV2 &TensorV2::average(TensorV2 &output) const { - TensorV2 result = *this; - result.reshape({1, 1, 1, getDim().getDataLen()}); - return result.average(3, output); -} - -int TensorV2::pow_i(float exponent) { - pow(exponent, *this); - return ML_ERROR_NONE; -} - -TensorV2 TensorV2::pow(float exponent) const { - TensorV2 
output("", getFormat(), getDataType()); - return pow(exponent, output); -} - -TensorV2 &TensorV2::pow(float exponent, TensorV2 &output) const { - itensor->pow(exponent, output); - return output; -} - -int TensorV2::erf_i() { - erf(*this); - return ML_ERROR_NONE; -} - -TensorV2 TensorV2::erf() const { - TensorV2 output("", getFormat(), getDataType()); - return erf(output); -} - -TensorV2 &TensorV2::erf(TensorV2 &output) const { - itensor->erf(output); - return output; -} - -void TensorV2::sin(TensorV2 &out, float alpha) { - if (size() != out.size()) - throw std::invalid_argument("Error: Size of out of Tensor::sin must match"); - - itensor->sin(out, alpha); -} - -void TensorV2::cos(TensorV2 &out, float alpha) { - if (size() != out.size()) - throw std::invalid_argument("Error: Size of out of Tensor::cos must match"); - - itensor->cos(out, alpha); -} - -float TensorV2::l2norm() const { return itensor->l2norm(); } - -void TensorV2::normalization_i() { - NNTR_THROW_IF(!getContiguous(), std::invalid_argument) - << getName() << " is not contiguous, cannot do normalization."; - - const float min = minValue(); - const float max = maxValue(); - - if (max == min) { - TensorV2 tmp = *this; - this->subtract_i(tmp); - } else { - this->subtract_i(min); - this->divide_i(max - min); - } -} - -void TensorV2::standardization_i() { - TensorV2 mean_by_batch = this->sum_by_batch(); - mean_by_batch.divide_i(getDim().getFeatureLen()); - - this->subtract_i(mean_by_batch); - TensorV2 std_dev_by_batch(batch(), 1, 1, 1, getFormat(), getDataType()); - std_dev_by_batch.setZero(); - - /// @todo remove conditional statement - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - float *std_dev = std_dev_by_batch.getData(); - - for (unsigned int k = 0; k < batch(); ++k) { - TensorV2 sub_this = this->getBatchSlice(k, 1); - std_dev[k] = sub_this.l2norm(); - } - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - _FP16 *std_dev = std_dev_by_batch.getData<_FP16>(); - - for (unsigned int k = 0; k < batch(); ++k) { - TensorV2 sub_this = this->getBatchSlice(k, 1); - std_dev[k] = static_cast<_FP16>(sub_this.l2norm()); - } -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - - std_dev_by_batch.divide_i(getDim().getFeatureLen()); - this->divide_i(std_dev_by_batch); -} - -TensorV2 TensorV2::dot(TensorV2 const &input, bool trans, bool trans_in) const { - TensorV2 output("", this->getFormat(), this->getDataType()); - dot(input, output, trans, trans_in); - - return output; -} - -/** - * @note: This dot product flattens the fist 3 axis for the purpose of - * computation. So, while performing, these matrices are behaving as 2-D - * matrices. The dimensions are restored while returning back the tensor - * in case of trans is false. - */ -TensorV2 &TensorV2::dot(TensorV2 const &input, TensorV2 &output, bool trans, - bool trans_in, float beta) const { - NNTR_THROW_IF(!getContiguous(), std::invalid_argument) - << getName() << " is not contiguous. 
Cannot dot product."; - - itensor->dot(input, output, trans, trans_in, beta); - return output; -} - -TensorV2 &TensorV2::dot_deriv_wrt_1(TensorV2 const &m, - TensorV2 const &output_deriv, bool trans, - bool trans_m, float beta) { - bool deriv_trans_m = true; - bool deriv_trans = false; - /** @todo handle all cases of trans and trans_m */ - if (!trans && trans_m) { - deriv_trans_m = false; - } - - return output_deriv.dot(m, *this, deriv_trans, deriv_trans_m, beta); -} - -/** - * @brief compute the derivative wrt m in the m tensor - * @note The caller tensor must be the same tensor as the one which called the - * dot() product. - */ -TensorV2 &TensorV2::dot_deriv_wrt_2(TensorV2 &m_deriv, - TensorV2 const &output_deriv, bool trans, - bool trans_m, float beta) const { - bool deriv_trans_m = false; - bool deriv_trans = true; - /** @todo handle all cases of trans and trans_m */ - - if (!trans && trans_m) { - output_deriv.dot(*this, m_deriv, deriv_trans, deriv_trans_m, beta); - return m_deriv; - } else { - return dot(output_deriv, m_deriv, deriv_trans, deriv_trans_m, beta); - } -} - -TensorV2 &TensorV2::dotBatched(TensorV2 const &m, TensorV2 &result, bool trans, - bool trans_m, float beta) const { - if (!result.isAllocated()) - throw std::invalid_argument( - "Output tensor must be preallocated for dotBatched operation"); - for (unsigned int b = 0; b < batch(); b++) { - /** @todo try using transpose to speedup the operation */ - const TensorV2 this_b = this->getBatchSlice(b, 1); - TensorV2 m_b = m.getBatchSlice(b, 1); - TensorV2 result_b = result.getBatchSlice(b, 1); - - this_b.dot(m_b, result_b, trans, trans_m, beta); - } - - return result; -} - -TensorV2 &TensorV2::dot_batched_deriv_wrt_1(TensorV2 const &m, - TensorV2 const &output_deriv, - bool trans, bool trans_m, - float beta) { - bool deriv_trans_m = true; - bool deriv_trans = false; - /** @todo handle all cases of trans and trans_m */ - if (!trans && trans_m) { - deriv_trans_m = false; - } - - return output_deriv.dotBatched(m, *this, deriv_trans, deriv_trans_m, beta); -} - -TensorV2 &TensorV2::dot_batched_deriv_wrt_2(TensorV2 &m_deriv, - TensorV2 const &output_deriv, - bool trans, bool trans_m, - float beta) const { - bool deriv_trans_m = false; - bool deriv_trans = true; - /** @todo handle all cases of trans and trans_m */ - - if (!trans && trans_m) { - output_deriv.dotBatched(*this, m_deriv, deriv_trans, deriv_trans_m, beta); - return m_deriv; - } else { - return dotBatched(output_deriv, m_deriv, deriv_trans, deriv_trans_m, beta); - } -} - -TensorV2 TensorV2::dropout_mask(float dropout) const { - TensorV2 output(getDim()); - output.dropout_mask(dropout); - return output; -} - -void TensorV2::dropout_mask(float dropout) { - /// @todo add unittest - NNTR_THROW_IF(dropout < 0 || dropout > 1, std::invalid_argument) - << "[Tensor::dropout_mask] Dropout rate should be between 0 and 1"; - - // if the rate is zero, no change is needed - if (std::fpclassify(dropout) == FP_ZERO) - return; - - setRandUniform(0.0, 1.0); - itensor->dropout_mask(dropout); -} - -void TensorV2::filter_mask(const TensorV2 &mask_len, bool reverse) { - /// @todo add unittest - itensor->filter_mask(mask_len, reverse); -} - -TensorV2 TensorV2::zoneout_mask(float zoneout) { - TensorV2 output(getDim()); - zoneout_mask(output, zoneout); - return output; -} - -void TensorV2::zoneout_mask(TensorV2 &opposite, float zoneout) { - NNTR_THROW_IF(getDim() != opposite.getDim(), std::invalid_argument) - << "[Tensor::zoneout_mask] opposite dimension does not match"; - - 
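// Hypothetical sketch of the dot()/dotBatched() calls implemented in this
// hunk, written against the renamed nntrainer::Tensor. The four-argument
// constructor is assumed from the TensorV2 API, transpose flags and beta are
// passed explicitly because their defaults are not visible here, and the
// shapes assume x is (B, 1, M, K) and w is (B, 1, K, N). dotBatched() needs a
// preallocated result, as the isAllocated() check above enforces.
#include <tensor.h>

void dot_sketch(const nntrainer::Tensor &x, const nntrainer::Tensor &w) {
  // Plain dot product: the first three axes of x are flattened for the GEMM.
  nntrainer::Tensor y = x.dot(w, false, false);

  // Per-batch matrix multiply: each batch slice of x is multiplied with the
  // matching slice of w and written into the matching slice of out.
  nntrainer::Tensor out(x.batch(), 1, x.height(), w.width());
  x.dotBatched(w, out, false, false, 0.0f);
}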
NNTR_THROW_IF(zoneout < 0 || zoneout > 1, std::invalid_argument) - << "[Tensor::zoneout_mask] Zoneout rate should be between 0 and 1"; - - // if the rate is zero, no change is needed - if (std::fpclassify(zoneout) == FP_ZERO) - return; - - itensor->zoneout_mask(opposite, zoneout); -} - -std::vector TensorV2::split(unsigned num_size, int axis) { - NNTR_THROW_IF(num_size == 0, std::invalid_argument) - << "num size cannot be zero"; - - if (axis == -1) { - axis = 3; - } - - NNTR_THROW_IF(!(0 <= axis && axis < 4), std::invalid_argument) - << "cannot split axis of axis: " << axis; - - NNTR_THROW_IF(getDim().getTensorDim(axis) % num_size != 0, - std::invalid_argument) - << "axis is not divisible by num_size, axis: " << axis - << " num size: " << num_size; - - std::vector sizes; - sizes.resize(num_size); - - unsigned int sz = getDim().getTensorDim(axis) / num_size; - std::fill(sizes.begin(), sizes.end(), sz); - - return split(sizes, axis); -} - -std::vector TensorV2::split(std::vector sizes, int axis) { - NNTR_THROW_IF(sizes.size() == 0, std::invalid_argument) - << "num size cannot be zero"; - - NNTR_THROW_IF(!(-1 <= axis && axis < 4), std::invalid_argument) - << "cannot split axis of axis: " << axis; - - NNTR_THROW_IF( - std::any_of(sizes.begin(), sizes.end(), [](size_t sz) { return !sz; }), - std::invalid_argument) - << "among given sizes at least one of size is 0"; - - return itensor->split(sizes, axis); -} - -TensorV2 TensorV2::cat(const std::vector &tensors, int axis) { - NNTR_THROW_IF(!(-1 <= axis && axis < 4), std::invalid_argument) - << "cannot split axis of axis: " << axis; - - NNTR_THROW_IF(tensors.empty(), std::invalid_argument) - << "given tensor vector is empty"; - - TensorV2 output; - Tdatatype dtype = tensors.front().getDim().getDataType(); - - if (dtype == Tdatatype::FP32) { - output = FloatTensor::cat(tensors, axis); - } else if (dtype == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - output = HalfTensor::cat(tensors, axis); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - - return output; -} - -void TensorV2::print(std::ostream &out) const { itensor->print(out); } - -void TensorV2::putData() const { itensor->putData(); } - -void TensorV2::setData(const std::shared_ptr buf, size_t off, - bool init) { - itensor->setMemoryData(buf, off); - - if (buf && init) { - initialize(); - } -} - -const std::shared_ptr TensorV2::getMemoryData() const { - return itensor->getMemoryData(); -} - -size_t TensorV2::getOffset() const { return itensor->getOffset(); } - -void TensorV2::copy(const TensorV2 &from) { - /// @todo enable copy to non-contiguous tensor - if (!itensor->getContiguous()) { - throw std::runtime_error("Cannot copy non-contiguous tensor"); - } - - if (from.size() != 0 && size() == from.size() && - getDataType() == from.getDataType()) { - // if tensor size and data type match, copy data - itensor->copy(from); - } else { - // replace with a new tensor that are the same with the given tensor - if (from.getDataType() == ml::train::TensorDim::DataType::FP32) { - TensorV2 t = TensorV2(from.getDim(), from.getData()); - swap(t, *this); - } else if (from.getDataType() == ml::train::TensorDim::DataType::FP16) { -#ifdef ENABLE_FP16 - TensorV2 t = TensorV2(from.getDim(), from.getData<_FP16>()); - swap(t, *this); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - } -} - -void TensorV2::copyData(const TensorV2 &from) { itensor->copyData(from); } - -void TensorV2::copy_with_stride(const TensorV2 &from) { - 
if (itensor->getDim() == from.getDim()) { - // if the tensor dim matches, copy the data - copy(from); - } else { - // replace with a new tensor that has the same data as the given tensor - TensorV2 t = TensorV2(from.getDim(), true); - for (unsigned int b = 0; b < t.batch(); ++b) { - for (unsigned int c = 0; c < t.channel(); ++c) { - for (unsigned int h = 0; h < t.height(); ++h) { - for (unsigned int w = 0; w < t.width(); ++w) { - if (getDataType() == ml::train::TensorDim::DataType::FP32) { - t.setValue(b, c, h, w, from.getValue(b, c, h, w)); - } else if (getDataType() == ml::train::TensorDim::DataType::FP16) { - /// @todo remove #ifdef ENABLE_FP16 -#ifdef ENABLE_FP16 - t.setValue(b, c, h, w, from.getValue<_FP16>(b, c, h, w)); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - } - } - } - } - swap(t, *this); - } -} - -TensorV2 TensorV2::getBatchSlice(size_t offset, unsigned int size) const { - TensorDim dim_ = getDim(); - dim_.batch(size); - - return getSharedDataTensor(dim_, offset * this->getDim().getFeatureLen(), - true, ""); -} - -TensorV2 TensorV2::clone() const { - TensorV2 output(getName(), getFormat(), getDataType()); - output.copy(*this); - return output; -} - -void TensorV2::save(std::ostream &file) { - NNTR_THROW_IF(!getContiguous(), std::invalid_argument) - << getName() << " is not contiguous, cannot save."; - - std::streamsize sz = static_cast(bytes()); - NNTR_THROW_IF(sz < 0, std::invalid_argument) - << "save size: " << bytes() - << " is too big. It cannot be represented by std::streamsize"; - - checkedWrite(file, getData(), sz, "[Tensor::save] operation failed"); - putData(); -} - -void TensorV2::read(std::ifstream &file) { - NNTR_THROW_IF(!getContiguous(), std::invalid_argument) - << getName() << " is not contiguous, cannot read."; - - std::streamsize sz = static_cast(bytes()); - - NNTR_THROW_IF(sz < 0, std::invalid_argument) - << "read size: " << bytes() - << " is too big. It cannot be represented by std::streamsize"; - - checkedRead(file, getData(), sz, "[Tensor::read] operation failed"); - putData(); -} - -std::vector TensorV2::argmax() const { - NNTR_THROW_IF(!getContiguous(), std::invalid_argument) - << getName() << " is not contiguous, cannot get argmax."; - return itensor->argmax(); -} - -float TensorV2::max_abs() const { - NNTR_THROW_IF(!getContiguous(), std::invalid_argument) - << getName() << " is not contiguous, cannot get max_abs."; - return itensor->max_abs(); -} - -float TensorV2::maxValue() const { return itensor->maxValue(); } - -float TensorV2::minValue() const { return itensor->minValue(); } - -TensorV2 TensorV2::transpose(const std::string &direction) const { - TensorV2 output(getDim()); - transpose(direction, output); - return output; -} - -TensorV2 &TensorV2::transpose(const std::string &direction, - TensorV2 &output) const { - NNTR_THROW_IF(!getContiguous(), std::invalid_argument) - << getName() << " is not contiguous. 
Cannot transpose."; - - if (output.getData() == getData()) { - TensorV2 result = clone(); - return result.transpose(direction, output); - } - - itensor->transpose(direction, output); - - return output; -} - -void TensorV2::reshape(const TensorDim &d) { itensor->reshape(d); } - -void TensorV2::fill(const TensorV2 &from, bool allocate) { - if (allocate && this->empty()) { - this->copy(from); - return; - } - - if (!from.getContiguous() || !getContiguous()) { - /// @todo enable this if needed - throw nntrainer::exception::not_supported( - "[Tensor::fill] non-contiguous tensors are not supported"); - } - - if (getDim() != from.getDim()) { - throw std::invalid_argument("[Tensor::fill] dimension must be the same"); - } - - if (getStrides() != from.getStrides()) { - /// @todo length does not represent buffer size, there should be way to - /// get the buffer size - throw std::invalid_argument("[Tensor::fill] buffer size must be the same"); - } - - copyData(from); -} - -TensorDim TensorV2::getDim() const { return itensor->getDim(); } - -TensorDim::TensorType TensorV2::getTensorType() const { - return itensor->getTensorType(); -}; - -Initializer TensorV2::getInitializer() const { - return itensor->getInitializer(); -} - -TensorDim::Format TensorV2::getFormat() const { return itensor->getFormat(); } - -Tdatatype TensorV2::getDataType() const { return itensor->getDataType(); } - -void TensorV2::updateBatch(unsigned int batch) { itensor->updateBatch(batch); } - -const bool TensorV2::getContiguous() const noexcept { - return itensor->getContiguous(); -} - -const std::array -TensorV2::getStrides() const noexcept { - return itensor->getStrides(); -} - -bool TensorV2::checkContinuous(unsigned int np1, unsigned int np2) const { - if (np1 > 3 || np2 > 3) { - throw std::invalid_argument( - "Error: Input value must be within the range of 0 to 3."); - } - - if (getFormat() == Tformat::NCHW) { - if (np1 + 1 == np2) - return true; - } else { - std::vector continuous_order_nhwc = {0, 3, 1, 2}; - if (continuous_order_nhwc[np2] == continuous_order_nhwc[np1] + 1) - return true; - } - - return false; -} - -void TensorV2::setName(const std::string &name_) { itensor->setName(name_); } - -const std::string &TensorV2::getName() const { return itensor->getName(); } - -size_t TensorV2::getIndex(unsigned int b, unsigned int c, unsigned int h, - unsigned int w) const noexcept { - return itensor->getIndex(b, c, h, w); -} - -size_t TensorV2::size() const { return itensor->size(); } - -bool TensorV2::empty() const { return itensor->empty(); } - -size_t TensorV2::bytes() const { return itensor->bytes(); } - -size_t TensorV2::batch() const { return itensor->batch(); } - -size_t TensorV2::channel() const { return itensor->channel(); } - -size_t TensorV2::height() const { return itensor->height(); } - -size_t TensorV2::width() const { return itensor->width(); } - -void TensorV2::mergeAxis(unsigned int axis1, unsigned int axis2) { - NNTR_THROW_IF(!getContiguous(), std::invalid_argument) - << getName() << " is not contiguous, cannot merge axis"; - - if (axis2 != axis1 + 1) - if (!checkContinuous(axis1, axis2)) - throw std::invalid_argument("axis2 must be axis1 + 1 for merging."); - - itensor->mergeAxis(axis1, axis2); -} - -void TensorV2::createSharedDataTensor(const TensorV2 &src, TensorV2 &dest, - size_t offset) const { - itensor->createSharedDataTensor(src.itensor.get(), dest.itensor.get(), - offset); -} - -TensorV2 TensorV2::getSharedDataTensor(const TensorDim dim_, size_t offset, - bool reset_stride, - const std::string &name_) 
const { - TensorV2 ret = *this; - itensor->getSharedDataTensor(dim_, offset, reset_stride, name_, - ret.itensor.get()); - return ret; -} - -void TensorV2::setTensorVar(TensorDim d, void *buf, size_t offset) { - itensor->setTensorVar(d, buf, offset); -} - -std::ostream &operator<<(std::ostream &out, TensorV2 const &input) { - input.print(out); - return out; -} - -} // namespace nntrainer diff --git a/nntrainer/tensor/tensor_v2.h b/nntrainer/tensor/tensor_v2.h deleted file mode 100644 index 21893475ad..0000000000 --- a/nntrainer/tensor/tensor_v2.h +++ /dev/null @@ -1,1467 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -/** - * @file tensor_v2.h - * @date 01 December 2023 - * @brief This is a TensorV2 class - * @see https://github.com/nnstreamer/nntrainer - * @author Jijoong Moon - * @author Donghyeon Jeong - * @bug No known bugs except for NYI items - */ - -#ifndef __TENSOR_V2_H__ -#define __TENSOR_V2_H__ -#ifdef __cplusplus - -#define CREATE_V2_IF_EMPTY_DIMS(tensor, ...) \ - do { \ - if (tensor.empty()) \ - tensor = TensorV2(__VA_ARGS__); \ - } while (0); - -#include - -#include -#include - -namespace nntrainer { - -/** - * @class TensorV2 Class - * @brief TensorV2 Class - */ -class TensorV2 { -public: - /** - * @brief Basic Constructor of Tensor - */ - TensorV2(std::string name_ = "", Tformat fm = Tformat::NCHW, - Tdatatype d_type = Tdatatype::FP32); - - /** - * @brief Constructor of Tensor with dimension, possibly lazily - * @param d Tensor dim for this tensor - * @param alloc_now If the memory of the tensor must be allocated - * @param init Initializer for the tensor - * @param name Name of the tensor - */ - TensorV2(const TensorDim &d, bool alloc_now, - Initializer init = Initializer::NONE, std::string name = ""); - - /** - * @brief Constructor of Tensor with dimension/buf - * @param d Tensor dim for this tensor - * @param buf buffer - * @note Memory for this tensor is instantaneously allocated - */ - TensorV2(const TensorDim &d, const void *buf = nullptr); - - /** - * @brief Constructor of Tensor - * @param[in] d0 Batch of Tensor - * @param[in] d1 Channel - * @param[in] d2 Height - * @param[in] d3 Width - * @param[in] fm Tensor Format - * @param[in] d_type Tensor Data Type - */ - TensorV2(size_t d0, size_t d1, size_t d2, size_t d3, - Tformat fm = Tformat::NCHW, Tdatatype d_type = Tdatatype::FP32) : - TensorV2(TensorDim(d0, d1, d2, d3, fm, d_type), nullptr){}; - - /** - * @brief Constructor of Tensor - * @param[in] d1 Channel - * @param[in] d2 Height - * @param[in] d3 Width - * @param[in] fm Tensor Format - * @param[in] d_type Tensor Data Type - */ - TensorV2(size_t d1, size_t d2, size_t d3, Tformat fm = Tformat::NCHW, - Tdatatype d_type = Tdatatype::FP32) : - TensorV2(1, d1, d2, d3, fm, d_type){}; - - /** - * @brief Constructor of Tensor with batch size one and d1 size one - * @param[in] d2 Height (NCHW) or Width (NHWC) - * @param[in] d3 Width (NCHW) or Channel (NHWC) - * @param[in] fm Tensor Format - * @param[in] d_type Tensor Data Type - */ - TensorV2(size_t d2, size_t d3, Tformat fm = Tformat::NCHW, - Tdatatype d_type = Tdatatype::FP32) : - TensorV2(1, 1, d2, d3, fm, d_type){}; - - /** - * @brief Constructor of Tensor with just Width or Channel - * @param[in] d3 Width (NCHW) or Channel (NHWC) - * @param[in] fm Tensor Format - * @param[in] d_type Tensor Data Type - */ - explicit TensorV2(size_t d3, Tformat fm = Tformat::NCHW, - Tdatatype d_type = Tdatatype::FP32) : - TensorV2(1, 1, 1, d3, fm, d_type){}; - - /** - * @brief Constructor of Tensor - * @param[in] d0 Batch of Tensor - * 
@param[in] d1 Channel (NCHW) or Height (NHWC) - * @param[in] d2 Height (NCHW) or Width (NHWC) - * @param[in] d3 Width (NCHW) or Channel (NHWC) - * @param[in] t_type Tensor Type - */ - TensorV2(size_t d0, size_t d1, size_t d2, size_t d3, - ml::train::TensorDim::TensorType t_type) : - TensorV2(TensorDim(d0, d1, d2, d3, t_type), nullptr){}; - - /** - * @brief Constructor of Tensor - * @param[in] d1 Channel - * @param[in] d2 Height - * @param[in] d3 Width - * @param[in] t_type Tensor Type - */ - TensorV2(size_t d1, size_t d2, size_t d3, - ml::train::TensorDim::TensorType t_type) : - TensorV2(1, d1, d2, d3, t_type){}; - - /** - * @brief Constructor of Tensor with batch size one and d1 size one - * @param[in] d2 Height (NCHW) or Width (NHWC) - * @param[in] d3 Width (NCHW) or Channel (NHWC) - * @param[in] t_type Tensor Type - */ - TensorV2(size_t d2, size_t d3, ml::train::TensorDim::TensorType t_type) : - TensorV2(1, (t_type.format == Tformat::NCHW) ? 1 : d3, - (t_type.format == Tformat::NCHW) ? d2 : 1, - (t_type.format == Tformat::NCHW) ? d3 : d2, t_type){}; - /** - * @brief Constructor of Tensor with just Width or Channel - * @param[in] d3 Width (NCHW) or Channel (NHWC) - * @param[in] t_type Tensor Type - */ - explicit TensorV2(size_t d3, ml::train::TensorDim::TensorType t_type) : - TensorV2(1, (t_type.format == Tformat::NCHW) ? 1 : d3, 1, - (t_type.format == Tformat::NCHW) ? d3 : 1, t_type){}; - - /** - * @brief Constructor of Tensor - * @param[in] d data for the Tensor. It needs to set format properly. - * @param[in] t_type Tensor Type - */ - TensorV2(std::vector>>> const &d, - ml::train::TensorDim::TensorType t_type); - - /** - * @brief Constructor of Tensor - * @note This constructor copies vector again. needs refactoring - * @param[in] d data for the Tensor. It needs to set format properly. - * @param[in] t_type Tensor Type - */ - TensorV2(std::vector>> const &d, - ml::train::TensorDim::TensorType t_type) : - TensorV2(std::vector::type>{d}, t_type){}; - - /** - * @brief Constructor of Tensor - * @note This constructor copies vector again. needs refactoring - * @param[in] d data for the Tensor with batch size one - * @param[in] t_type Tensor Type - */ - TensorV2(std::vector> const &d, - ml::train::TensorDim::TensorType t_type) : - TensorV2(std::vector::type>{d}, t_type){}; - -#ifdef ENABLE_FP16 - /** - * @brief Constructor of Tensor - * @note This constructor copies vector again. needs refactoring - * @param[in] d data for the Tensor with batch size one - * @param[in] t_type Tensor Type - */ - TensorV2(std::vector>>> const &d, - ml::train::TensorDim::TensorType t_type); - - /** - * @brief Constructor of Tensor - * @note This constructor copies vector again. needs refactoring - * @param[in] d data for the Tensor. It needs to set format properly. - * @param[in] t_type Tensor Type - */ - TensorV2(std::vector>> const &d, - ml::train::TensorDim::TensorType t_type) : - TensorV2(std::vector::type>{d}, t_type){}; - - /** - * @brief Constructor of Tensor - * @note This constructor copies vector again. needs refactoring - * @param[in] d data for the Tensor with batch size one - * @param[in] t_type Tensor Type - */ - TensorV2(std::vector> const &d, - ml::train::TensorDim::TensorType t_type) : - TensorV2(std::vector::type>{d}, t_type){}; - -#endif - - /** - * @brief Basic Destructor - */ - ~TensorV2() = default; - - /** - * @brief Copy constructor of Tensor. - * @param[in] Tensor & - */ - TensorV2(const TensorV2 &rhs) = default; - - /** - * @brief Move constructor of Tensor. 
- * @param[in] Tensor && - */ - TensorV2(TensorV2 &&rhs) noexcept = default; - - /** - * @brief Copy assignment operator. - * @param[in] rhs Tensor to be copied. - */ - TensorV2 &operator=(const TensorV2 &rhs) = default; - - /** - * @brief Move assignment operator. - * @parma[in] rhs Tensor to be moved. - */ - TensorV2 &operator=(TensorV2 &&rhs) noexcept = default; - - /** - * @brief Comparison operator overload - * @param[in] rhs Tensor to be compared with - */ - bool operator==(const TensorV2 &rhs) const; - - /** - * @brief Comparison operator overload - * @param[in] rhs Tensor to be compared with - */ - bool operator!=(const TensorV2 &rhs) const { return !(*this == rhs); } - - /** - * @brief Construct a new Tensor object from a buffer - * This will not copy buffer to a new tensor but directly uses it - * - * @param[in] buf buffer - * @param[in] bytes buffer size in bytes - * @param[in] d tensor dim - * @param[in] offset offset to be used from current - * @return Tensor object - * @throws std::invalid_argument if buf is null - * @note Note that the buffer is not owned by the mapped tensor - */ - template - static TensorV2 Map(T *buf, unsigned int bytes, const TensorDim &d, - size_t offset = 0) { - if (d.getDataLen() == 0 || buf == nullptr) { - throw std::invalid_argument( - "[Tensor::Map] empty tensor dim is not allowed"); - } - - if (d.getDataLen() * sizeof(T) + offset > bytes) { - throw std::invalid_argument( - "Creating shared tensor of size bigger than tensor memory."); - } - - TensorV2 output; - output.setTensorVar(d, buf, offset); - return output; - }; - - /** - * @brief Allocate memory for this tensor - */ - void allocate(); - - /** - * @brief Deallocate memory for this tensor - * @note This will not necessary free the memory as tensors share memory - */ - void deallocate(); - - /** - * @brief Check if the tensor has memory allocated/assigned/associated - */ - bool isAllocated(); - - /** - * @brief return Data pointer of TensorV2 - * @retval template T pointer - */ - template T *getData() const { - return (T *)itensor->getData(); - } - - /** - * @brief return Data pointer of TensorV2 - * @retval template T pointer - */ - template T *getData(size_t idx) const { - return (T *)itensor->getData(idx); - } - - /** - * @brief i data index - * @retval template T pointer (address of ith data) - */ - template T *getAddress(unsigned int i) { - return (T *)itensor->getAddress(i); - } - - /** - * @brief i data index - * @retval template T pointer (address of ith data) - */ - template const T *getAddress(unsigned int i) const { - return (T *)itensor->getAddress(i); - } - - /** - * @brief get address of n-d data - */ - template - T *getAddress(unsigned int b, unsigned int c, unsigned int h, - unsigned int w) { - return getAddress(getIndex(b, c, h, w)); - } - - /** - * @brief get address of n-d data - */ - template - const T *getAddress(unsigned int b, unsigned int c, unsigned int h, - unsigned int w) const { - return getAddress(getIndex(b, c, h, w)); - } - - /** - * @brief return value at specific location - * @param[in] idx location - */ - template - const T &getValue(unsigned int idx) const noexcept { - return getData()[idx]; - } - - /** - * @brief return value at specific location - * @param[in] idx location - */ - template T &getValue(unsigned int idx) noexcept { - return getData()[idx]; - } - - /** - * @brief return value at specific location - * @param[in] b batch location - * @param[in] c channel location - * @param[in] h height location - * @param[in] w width location - */ - template - 
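// [Editorial aside, not part of the patch] A sketch of the Map() helper shown
// above: it wraps an existing buffer as a TensorV2 without copying, so the
// caller keeps ownership and must keep the buffer alive. Include path assumed.
#include <tensor_v2.h>

void map_sketch() {
  float buf[12] = {0.0f};
  nntrainer::TensorDim dim(1, 1, 3, 4); // 12 elements

  // Map() throws std::invalid_argument if the dim is empty or if
  // dim.getDataLen() * sizeof(float) + offset exceeds the given byte size.
  nntrainer::TensorV2 view =
    nntrainer::TensorV2::Map<float>(buf, sizeof(buf), dim, /*offset=*/0);

  view.setValue(0, 0, 0, 0, 1.0f); // writes straight into buf[0]
}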
const T &getValue(unsigned int b, unsigned int c, unsigned int h, - unsigned int w) const noexcept { - return getValue(getIndex(b, c, h, w)); - } - - /** - * @brief return value at specific location - * @param[in] b batch location - * @param[in] c channel location - * @param[in] h height location - * @param[in] w width location - */ - template - T &getValue(unsigned int b, unsigned int c, unsigned int h, - unsigned int w) noexcept { - return getValue(getIndex(b, c, h, w)); - } - - /** - * @brief Fill the Tensor elements with value - * @param[in] value value to be stored - */ - void setValue(float value); - - /** - * @brief Set the element value - * @param[in] b batch location - * @param[in] c channel location - * @param[in] h height location - * @param[in] w width location - * @param[in] value value to be stored - */ - void setValue(unsigned int b, unsigned int c, unsigned int h, unsigned int w, - float value); - - /** - * @brief Set the element value - * @param[in] offset offset from start location - * @param[in] value value to be stored - * - * @todo This is a temporary workout. Remove this - */ - void setValueInt(unsigned int offset, int value) noexcept { - int *data_int = (int *)getData(); - data_int[offset] = value; - } - - /** - * @brief add the element value to the location - * @param[in] b batch location - * @param[in] c channel location - * @param[in] h height location - * @param[in] w width location - * @param[in] value value to be stored - * @param[in] beta scalar to multiply output with and add - */ - void addValue(unsigned int b, unsigned int c, unsigned int h, unsigned int w, - float value, float beta) noexcept; - - /** - * @brief Fill the Tensor elements with zero - */ - void setZero(); - - /** - * @brief Set the tensor with random normal distribution - * @param[in] mean mean of the distribution - * @param[in] std standard deviation of the distribution - */ - void setRandNormal(float mean = 0.0f, float stddev = 0.05f); - - /** - * @brief Set the tensor with random uniform distribution - * @param[in] min minimum value for the distribution - * @param[in] max maximum value for the distribution - */ - void setRandUniform(float min = -0.05f, float max = 0.05f); - - /** - * @brief Set the tensor with random bernoulli distribution - * @param[in] probability probability value for the distribution - */ - void setRandBernoulli(float probability = 0.5f); - - /** - * @brief Initialize the memory of the given tensor - */ - void initialize(); - - /** - * @brief Initialize the memory of the given tensor - * @param init Initiailizer to use for the initialization - */ - void initialize(Initializer init); - - /** - * @brief Apply instantly to the element - * @param[in] *function function pointer applied - * @return int ML_ERROR_NONE if successful - */ - template int apply_i(std::function f) { - TensorV2 result = *this; - apply(f, result); - - return ML_ERROR_NONE; - }; - - /** - * @brief Apply function element by element - * @param[in] *function function pointer applied - * @retval Tensor - */ - template TensorV2 apply(std::function f) const { - TensorV2 result; - apply(f, result); - - return result; - }; - - /** - * @brief Apply function element by element - * @param[in] *function function pointer applied - * @param[out] output output tensor - * @retval Tensor - */ - template - TensorV2 &apply(std::function f, TensorV2 &output) const { - CREATE_V2_IF_EMPTY_DIMS( - output, {itensor->getFormat(), itensor->getDataType()}, nullptr); - - if (itensor->getFormat() != output.itensor->getFormat() || 
- itensor->getDataType() != itensor->getDataType()) { - /// @todo add unittest - throw std::invalid_argument( - "[Tensor::apply] output dimension does not match"); - } - - itensor->apply(f, output); - - return output; - } - - /** - * @brief Apply function to Tensor - * @param[in] *function function pointer applied - * @retval Tensor - */ - TensorV2 apply(std::function f) const; - - /** - * @brief Apply function to Tensor - * @param[in] *function function pointer applied - * @param[out] output output tensor - * @retval Tensor - */ - TensorV2 &apply(std::function f, - TensorV2 &output) const; - - /** - * @brief Multiply Tensor Elementwise - * @param[in] m Tensor to be multiplied - * @param[in] beta scalar to multiply output with and add - * @retval #ML_ERROR_NONE successful - * - * @note support different strided inputs and output - * @note does not support broadcasting - * - * @todo merge this to multiply_i - */ - int multiply_i_strided(TensorV2 const &m, const float beta = 0.0); - - /** - * @brief Multiply Tensor Element by Element ( Not the MxM ) - * @param[in] m Tensor to be multiplied - * @param[in] beta scalar to multiply output with and add - * @retval Calculated Tensor - * - * @note support different strided inputs and output - * @note does not support broadcasting - * - * @todo merge this to multiply - */ - TensorV2 multiply_strided(TensorV2 const &m, const float beta = 0.0) const; - - /** - * @brief Multiply Tensor Element by Element ( Not the MxM ) - * @param[in] m Tensor to be multiplied - * @param[out] output Tensor to store the result - * @param[in] beta scalar to multiply output with and add - * @retval Calculated Tensor - * - * @note support different strided inputs and output - * @note does not support broadcasting - * - * @todo merge this to multiply - */ - TensorV2 &multiply_strided(TensorV2 const &m, TensorV2 &output, - const float beta = 0.0) const; - - /** - * @brief Multiply value element by element immediately - * @param[in] value multiplier - * @retval #ML_ERROR_INVALID_PARAMETER Tensor dimension is not right - * @retval #ML_ERROR_NONE Successful - */ - int multiply_i(float const &value); - - /** - * @brief Multiply value element by element - * @param[in] value multiplier - * @retval Calculated Tensor - */ - TensorV2 multiply(float const &value) const; - - /** - * @brief multiply value element by element - * @param[in] value multiplier - * @param[out] out out tensor to store the result - * @retval Calculated Tensor - */ - TensorV2 &multiply(float const &value, TensorV2 &out) const; - - /** - * @brief Multiply Tensor Elementwise - * @param[in] m Tensor to be multiplied - * @param[in] beta scalar to multiply output with and add - * @retval #ML_ERROR_NONE successful - */ - int multiply_i(TensorV2 const &m, const float beta = 0.0); - - /** - * @brief Multiply Tensor Element by Element ( Not the MxM ) - * @param[in] m Tensor to be multiplied - * @param[in] beta scalar to multiply output with and add - * @retval Calculated Tensor - */ - TensorV2 multiply(TensorV2 const &m, const float beta = 0.0) const; - - /** - * @brief Multiply Tensor Element by Element ( Not the MxM ) - * @param[in] m Tensor to be multiplied - * @param[out] output Tensor to store the result - * @param[in] beta scalar to multiply output with and add - * @retval Calculated Tensor - */ - TensorV2 &multiply(TensorV2 const &m, TensorV2 &output, - const float beta = 0.0) const; - - /** - * @brief Divide value element by element immediately - * @param[in] value divisor - * @retval #ML_ERROR_INVALID_PARAMETER 
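// [Editorial aside, not part of the patch] A sketch of the element-wise
// apply() template and the multiply() family declared above, using the
// removed TensorV2 API; names and include path are assumptions.
#include <tensor_v2.h>
#include <functional>

void apply_multiply_sketch() {
  nntrainer::TensorV2 a(1, 1, 2, 3);
  a.setRandNormal();

  // apply<float>() maps a scalar function over every element into a new tensor.
  nntrainer::TensorV2 squared = a.apply<float>([](float x) { return x * x; });

  // multiply() returns a scaled copy; multiply_i() scales in place.
  nntrainer::TensorV2 half = squared.multiply(0.5f);
  half.multiply_i(2.0f); // back to the values of `squared`
}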
Tensor dimension is not right - * @retval #ML_ERROR_NONE Successful - */ - int divide_i(float const &value); - - /** - * @brief Divide value element by element - * @param[in] value Divisor - * @retval Calculated Tensor - */ - TensorV2 divide(float const &value) const; - - /** - * @brief Divide value element by element - * @param[in] value Divisor - * @param[out] output Tensor to store the result - * @retval Calculated Tensor - */ - TensorV2 &divide(float const &value, TensorV2 &output) const; - - /** - * @brief divide Tensor Elementwise - * @param[in] m Tensor to be multiplied - * @retval #ML_ERROR_NONE successful - */ - int divide_i(TensorV2 const &m); - - /** - * @brief Divide Tensor Element by Element - * @param[in] m Divisor Tensor - * @retval Calculated Tensor - */ - TensorV2 divide(TensorV2 const &m) const; - - /** - * @brief divide Tensor Elementwise - * @param[in] m Tensor to be multiplied - * @param[out] output Tensor to store the result - * @retval Calculated Tensor - */ - TensorV2 &divide(TensorV2 const &m, TensorV2 &output) const; - - /** - * @brief Add Tensor Elementwise - * @param[in] input Tensor to be added - * @param[in] beta scalar to add output with and add - * @retval #ML_ERROR_NONE successful - * - * @note support different strided inputs and output - * @note does not support broadcasting - * - * @todo merge this to add_i - */ - int add_i_strided(TensorV2 const &input, const float beta = 0.0); - - /** - * @brief Add Tensor Element by Element - * @param[in] input Tensor to be added - * @param[in] beta Value to be scale the input tensor - * @retval Calculated Tensor - * - * @note support different strided inputs and output - * @note does not support broadcasting - * - * @todo merge this to add - */ - TensorV2 add_strided(TensorV2 const &input, const float beta = 0.0) const; - - /** - * @brief Add Tensor Element by Element - * @param[in] input Tensor to be added - * @param[out] output Tensor to store the result - * @param[in] beta Value to be scale the input tensor - * @retval Calculated Tensor - * - * @note support different strided inputs and output - * @note does not support broadcasting - * - * @todo merge this to add - */ - TensorV2 &add_strided(TensorV2 const &input, TensorV2 &output, - const float beta = 0.0) const; - - /** - * @brief Add Tensor Element immediately to target tensor without mem copy - * @param[in] value value to be added - * @retval #ML_ERROR_NONE Successful - * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter - */ - int add_i(float const &value); - - /** - * @brief Add value Element by Element - * @param[in] value value to be added - * @retval Calculated Tensor - */ - TensorV2 add(float const &value) const; - - /** - * @brief Add Tensor Element by Element - * @param[in] value value to be added - * @param[out] output Tensor to save output without allocating new memory - * @retval Calculated Tensor - */ - TensorV2 &add(float const &value, TensorV2 &output) const; - - /** - * @brief Add Tensor Element by Element without mem copy - * @param[in] m Tensor to be added - * @param[in] alpha Values to be scaled - * @retval #ML_ERROR_NONE Successful - * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter - */ - int add_i(TensorV2 const &m, float const alpha = 1); - - /** - * @brief Add Tensor Element by Element - * @param[in] m Tensor to be added - * @param[in] alpha Values to be scaled - * @retval Calculated Tensor - */ - TensorV2 add(TensorV2 const &m, float const alpha = 1) const; - - /** - * @brief Add Tensor Element by Element - * @param[in] m Tensor to be
added - * @param[out] output Tensor to be out - * @param[in] alpha Values to be scaled - * @retval Calculated Tensor - */ - TensorV2 &add(TensorV2 const &m, TensorV2 &output, - float const alpha = 1) const; - - /** - * @brief memcpyless version of subtract - * @retval #ML_ERROR_NONE Successful - * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter - */ - int subtract_i(float const &value); - - /** - * @brief subtract value Element by Element - * @param[in] value value to be subtracted - * @retval Calculated Tensor - */ - TensorV2 subtract(float const &value) const; - - /** - * @brief Subtract Tensor Element by Element - * @param[in] value value to be added - * @param[out] output Tensor to save output without allocating new memory - * @retval Calculated Tensor - */ - TensorV2 &subtract(float const &value, TensorV2 &output) const; - - /** - * @brief memcpyless version of subtract - * @param[in] m Tensor to be subtracted - * @retval #ML_ERROR_NONE Successful - * @retval #ML_ERROR_INVALID_PARAMETER Invalid Parameter - */ - int subtract_i(TensorV2 const &m); - - /** - * @brief Substract Tensor Element by Element - * @param[in] m Tensor to be subtracted - * @retval Calculated Tensor - */ - TensorV2 subtract(TensorV2 const &m) const; - - /** - * @brief Subtract Tensor Element by Element - * @param[in] m Tensor to be added - * @param[out] output Tensor to be out - * @retval Calculated Tensor - */ - TensorV2 &subtract(TensorV2 const &m, TensorV2 &output) const; - - /** - * @brief sum all the Tensor elements according to the batch - * @retval Calculated Tensor(batch, 1, 1, 1) - */ - TensorV2 sum_by_batch() const; - - /** - * @brief sum all the Tensor elements according to the axis - * 0 : batch direction - * 1 : channel direction - * 2 : height direction - * 3 : width direction - * @param[in] axis Axis to calculate sum along - * @param[in] alpha Scale the sum by this value - * @retval Calculated Tensor - */ - TensorV2 sum(unsigned int axis, float alpha = 1.0) const; - - /** - * @brief sum all the Tensor elements according to the axis - * 0 : batch direction - * 1 : channel direction - * 2 : height direction - * 3 : width direction - * @param[in] axis Axis to calculate sum along - * @param[out] output output tensor - * @param[in] alpha Scale the sum by this value - * @retval Calculated Tensor - */ - TensorV2 &sum(unsigned int axis, TensorV2 &output, float alpha = 1.0, - float beta = 0.0) const; - - /** - * @brief sum all the Tensor by multiple axes - * - * @param axes axes to sum along - * @param alpha Scale the sum by this value - * @return Tensor - */ - TensorV2 sum(const std::vector &axes, float alpha = 1.0) const; - - /** - * @brief sum all the Tensor by multiple axes - * - * @param axes axes to sum along - * @param[out] output output tensor - * @param alpha Scale the sum by this value - * @return Tensor - */ - TensorV2 &sum(const std::vector &axes, TensorV2 &output, - float alpha = 1.0) const; - - /** - * @brief Averaging the Tensor elements according to the axis - * 0 : batch direction - * 1 : channel direction - * 2 : height direction - * 3 : width direction - * @retval Calculated Tensor - */ - TensorV2 average(unsigned int axis) const; - - /** - * @brief Averaging the Tensor elements according to the axis - * @retval Calculated Tensor - */ - TensorV2 &average(unsigned int axis, TensorV2 &output) const; - - /** - * @brief Average all the Tensor by multiple axes - * @param[in] axes axes to sum along - * @retval Calculated Tensor - */ - TensorV2 average(const std::vector &axes) const; - - /** 
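// [Editorial aside, not part of the patch] A sketch of the axis convention
// documented above for sum() (0: batch, 1: channel, 2: height, 3: width),
// using the removed TensorV2 API; include path assumed.
#include <tensor_v2.h>

void sum_sketch() {
  nntrainer::TensorV2 t(2, 1, 3, 4);
  t.setValue(1.0f); // fill with ones

  // Summing over the width axis collapses it to 1, giving a 2x1x3x1 tensor
  // whose entries all equal 4 (the original width).
  nntrainer::TensorV2 width_sum = t.sum(3);

  // sum_by_batch() keeps only the batch axis: a 2x1x1x1 tensor whose entries
  // equal 1 * 3 * 4 = 12 here.
  nntrainer::TensorV2 per_batch = t.sum_by_batch();
}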
- * @brief Average all the Tensor by multiple axes - * @param[in] axes axes to sum along - * @param[out] output output tensor - * @retval Calculated Tensor - */ - TensorV2 &average(const std::vector &axes, - TensorV2 &output) const; - - /** - * @brief Average the Tensor elements by all axis - * @retval Calculated Tensor - */ - TensorV2 average() const; - - /** - * @brief Averaging the Tensor elements by all axis - * @retval Calculated Tensor - */ - TensorV2 &average(TensorV2 &output) const; - - /** - * @brief Tensor power element without mem copy - * @param[in] exponent exponent - * @retval #ML_ERROR_NONE Successful - */ - int pow_i(float exponent); - - /** - * @brief Tensor power element by element - * @param[in] exponent exponent - * @retval Calculated Tensor - */ - TensorV2 pow(float exponent) const; - - /** - * @brief Tensor power element by element - * @param[in] exponent exponent - * @param[out] output out to store the result - * @retval Calculated Tensor - */ - TensorV2 &pow(float exponent, TensorV2 &output) const; - - /** - * @brief Gauss error function - * @retval #ML_ERROR_NONE Successful - */ - int erf_i(); - - /** - * @brief Gauss error function - * @retval Calculated Tensor - */ - TensorV2 erf() const; - - /** - * @brief Gauss error function - * @param[out] output out to store the result - * @retval Calculated Tensor - */ - TensorV2 &erf(TensorV2 &output) const; - - /** - * @brief sin transform function - * @param[out] out out to store the result - */ - void sin(TensorV2 &out, float alpha = 1.0); - - /** - * @brief cos transform function - * @param[out] out out to store the result - */ - void cos(TensorV2 &out, float alpha = 1.0); - - /** - * @brief l2norm the Tensor elements - * @retval Calculated l2norm - */ - float l2norm() const; - - /** - * @brief Normalize the Tensor elements - * @retval Calculated Tensor - */ - TensorV2 &normalization(TensorV2 &output) const; - - /** - * @brief Standardize the Tensor elements - * @retval Calculated Tensor - */ - TensorV2 &standardization(TensorV2 &output) const; - - /** - * @brief Normalize the Tensor elements in-place - * @retval Calculated Tensor - */ - void normalization_i(); - - /** - * @brief Standardize the Tensor elements in-place - * @retval Calculated Tensor - */ - void standardization_i(); - - /** - * @brief Dot Product of Tensor ( equal MxM ) - * @details This applies dot of the last dimension of this and second-last - * dimension of passed input tensor. - * @param[in] input Tensor - * @param[in] trans Transpose - * @param[in] trans_in Transpose input - * @retval Calculated Tensor - */ - TensorV2 dot(TensorV2 const &input, bool trans = false, - bool trans_in = false) const; - - /** - * @brief Dot Product of Tensor ( equal MxM ) - * @details This applies dot of the last dimension of this and - * second-last dimension of passed input tensor. 
- * @param[in] input Tensor - * @param[in] output output Tensor - * @param[in] trans Transpose - * @param[in] trans_in Transpose input - * @param[in] beta beta - * @retval Calculated Tensor - */ - TensorV2 &dot(TensorV2 const &input, TensorV2 &output, bool trans = false, - bool trans_in = false, float beta = 0.0f) const; - - /** - * @brief compute the derivative of this in the current tensor - * @param input same as given to the dot() - * @param output_deriv the derivative of the output - * @param[in] trans same as given to the dot() - * @param[in] trans_in same as given to the dot() - * @param[in] beta same as given to the dot() - * @note This will compute the derivative in-place and will overwrite - existing - * data in the tensor - */ - TensorV2 &dot_deriv_wrt_1(TensorV2 const &input, TensorV2 const &output_deriv, - bool trans = false, bool trans_in = false, - float beta = 0.0f); - - /** - * @brief compute the derivative wrt m in the input tensor - * @param input_deriv tensor where derivative wrt m will be stored - * @param output_deriv the derivative of the output - * @param[in] trans same as given to the dot() - * @param[in] trans_in same as given to the dot() - * @param[in] beta same as given to the dot() - * @note The caller tensor must be the same tensor as the one which called - the dot() product. - */ - TensorV2 &dot_deriv_wrt_2(TensorV2 &input_deriv, TensorV2 const &output_deriv, - bool trans = false, bool trans_in = false, - float beta = 0.0f) const; - - /** - * @copydoc Tensor::dot(Tensor const &input, Tensor &output, bool trans, - bool trans_in, float beta) const - * @details performs dot operation over a batch of inputs - */ - TensorV2 &dotBatched(TensorV2 const &input, TensorV2 &result, - bool trans = false, bool trans_in = false, - float beta = 0.0f) const; - - /** - * @copydoc Tensor::dot_deriv_wrt_1(Tensor const &input, Tensor const - &output_deriv, bool trans, bool trans_in, float beta) - */ - TensorV2 &dot_batched_deriv_wrt_1(TensorV2 const &input, - TensorV2 const &output_deriv, - bool trans = false, bool trans_in = false, - float beta = 0.0f); - - /** - * @brief Tensor::dot_deriv_wrt_2(Tensor const &input_deriv, Tensor const - &output_deriv, bool trans, bool trans_in, float beta) const - */ - TensorV2 &dot_batched_deriv_wrt_2(TensorV2 &input_deriv, - TensorV2 const &output_deriv, - bool trans = false, bool trans_in = false, - float beta = 0.0f) const; - - /** - * @brief Calculate Drop Out Mask : x * 1.0/(1.0-rate) - * @param dropout drop out rate - * @retval Tensor& reference of drop out mask - */ - TensorV2 dropout_mask(float dropout) const; - - /** - * @brief Calculate Drop Out Mask : x * 1.0/(1.0-rate) inplace - * @param dropout drop out rate - */ - void dropout_mask(float dropout); - - /** - * @brief Calculate filter mask - * @param mask_len length of each mask along the last axis - * @param invert invert the mask - */ - void filter_mask(const TensorV2 &mask_len, bool reverse = false); - - /** - * @brief Calculate 2 Zone Out Mask - * @details Calculate zone out mask according to the bernoulli distribution. - * Zone out mask with rate @a zoneout for inplace and the other zone out mask - * with rate @a (1-zoneout). - * @param zoneout zone out rate - * @retval Tensor zone out mask for opposite tensor - */ - TensorV2 zoneout_mask(float zoneout); - - /** - * @brief Calculate 2 Zone Out Mask - * @details Calculate zone out mask according to the bernoulli distribution. 
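// [Editorial aside, not part of the patch] A sketch of the dot() overloads
// declared above: the last two axes act as a matrix, so (1,1,M,K) dot
// (1,1,K,N) gives (1,1,M,N); trans / trans_in transpose either operand.
// Uses the removed TensorV2 API; include path assumed.
#include <tensor_v2.h>

void dot_sketch() {
  nntrainer::TensorV2 a(1, 1, 2, 3); // M = 2, K = 3
  nntrainer::TensorV2 b(1, 1, 3, 4); // K = 3, N = 4
  a.setRandNormal();
  b.setRandNormal();

  nntrainer::TensorV2 c = a.dot(b); // 1x1x2x4
  nntrainer::TensorV2 gram = a.dot(a, /*trans=*/false, /*trans_in=*/true);
  // gram is a * a^T, a 1x1x2x2 tensor
}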
- * Zone out mask with rate @a zoneout for inplace and the other zone out mask - * with rate @a (1-zoneout). - * @param opposite opposite zone out mask - * @param zoneout zone out rate - */ - void zoneout_mask(TensorV2 &opposite, float zoneout); - - /** - * @brief split tensor along axis. - * - * @param num_size num_size - * @param axis axis - * @return Tensor splitted tensor - */ - std::vector split(unsigned num_size, int axis = 0); - - /** - * @brief split tensor along axis. - * - * @param sizes sizes - * @param axis axis - * @return Tensor splitted tensor - * @note if the given array sizes is just a 1 unsigned int value, assumes that - * it divide tensor by given size evenly - */ - std::vector split(std::vector sizes, int axis = 0); - - /** - * @brief concatenate tensors along axis - * - * @param tensors tensors to be concatenated to the first tensor - * @param axis axis - * @return Tensor concatenated tensor - */ - static TensorV2 cat(const std::vector &tensors, int axis = 0); - - /** - * @brief Print element - * @param[in] out out stream - */ - void print(std::ostream &out) const; - - /** - * @brief put data of Tensor - * @note It is only effective when memory_swap is used - */ - void putData() const; - - /** - * @brief Set the memory buffer for the tensor - * - * @param buf the memory buffer - * @param init intialize the buffer - */ - void setData(const std::shared_ptr buf, size_t off = 0, - bool init = false); - - /** - * @brief return Data pointer of Tensor - * @retval template T pointer (float pointer as default) - */ - const std::shared_ptr getMemoryData() const; - - /** - * @brief return offset - */ - size_t getOffset() const; - - /** - * @brief Copy the Tensor - * @param[in] from Tensor to be copied - * - * @note copy can reshape the tensor to match the shape - * @note support copying data from multiple data type - */ - void copy(const TensorV2 &from); - - /** - * @brief Copy the Tensor - * @param[in] from Tensor to be copied - * @note support copying data from multiple data type - */ - void copyData(const TensorV2 &from); - - /** - * @brief Copy the Tensor - * @param[in] from Tensor to be copied - * @note only support copying data from tensor with the same data type - */ - void copy_with_stride(const TensorV2 &from); - - /** - * @brief Get slice of the tensor, sliced by batch - * @param[in] offset offset in batch to start the slice - * @param[in] size size of the slice - * @retval slice of this tensor - * @note This function provides a slice of this tensor, and does not create a - * copy - */ - TensorV2 getBatchSlice(size_t offset, unsigned int size) const; - - /** - * @brief Convient wrapper for inplace copy of @a this. 
- * @retval Copied version of this - */ - TensorV2 clone() const; - - /** - * @brief Save the Tensor into file - * @param[in] file output file stream - */ - void save(std::ostream &file); - - /** - * @brief Read the Tensor from file - * @param[in] file input file stream - */ - void read(std::ifstream &file); - - /** - * @brief return argument index which value is max by batch - * @retval unsigned int argument indices - */ - std::vector argmax() const; - - /** - * @brief return max of the absolute values of the tensor - * @retval maximum absolute value - */ - float max_abs() const; - - /** - * @brief return maximum value - * @retval Maximum value of the tensor data - */ - float maxValue() const; - - /** - * @brief return minimum value - * @retval Minimum value of the tensor data - */ - float minValue() const; - - /** - * @brief Transpose Tensor - * @param direction to transpose ex) 0:2:1 - * @return Tensor - */ - TensorV2 transpose(const std::string &direction) const; - - /** - * @brief Transpose Tensor - * @param direction to transpose ex) 0:2:1 - * @param[out] Tensor to save to, dimension is always reshaped. - * @retval Tensor& reference to the out - */ - TensorV2 &transpose(const std::string &direction, TensorV2 &out) const; - - /** - * @brief set Tensor Dim - * @param[in] d TensorDim - * @note Throws std::invalid_argument if size mismatch - */ - void reshape(const TensorDim &d); - - /** - * @brief fill tensor data with current value, - * if dimension is not exactly same, it is a hard error in this function - * so, only stride is overriden to @a this - * - * @param from Tensor to fill the data from - * @param allocate if unallocated, allocate with from.getDim() - * @throws std::invalid_argument if dimension and stride does not match - */ - void fill(const TensorV2 &from, bool allocate = false); - - /** - * @brief return a copy of the Tensor Dim - * @retval TensorDim - */ - TensorDim getDim() const; - - /** - * @brief return Tensor Type - */ - TensorDim::TensorType getTensorType() const; - - /** - * @brief Get initializer for the tensor - * - * @return initializer of the tensor - */ - Initializer getInitializer() const; - - /** - * @brief Get format for the tensor - * @return format of the tensor - */ - TensorDim::Format getFormat() const; - - /** - * @brief Get data type for the tensor - * - * @return data type of the tensor - */ - Tdatatype getDataType() const; - - /** - * @brief update batch size for this tensor - * @param batch size - * @note The batchsize of src_tensor need not be related with this - * tensor's batch size - * - * @note The memory for this tensor will re-allocated/re-assigned if the - * updated batch size is different than the current batch size. - * - * @note If this tensor is/was the src_tensor for some other, then - * reduction in batch size can make the dependent tensors allocate fail due to - * memory smaller. Caller must handle this in their own end. - * - * @note If this tensor is re-allocated, then the memory might not be - * immediately freed as the tensor already depending on this tensor also - * share the same memory. So, the peak memory consumption in worst case can - * reach the total memory requirements of a model with old batchsize and the - * new batch size. It is recommended to first deallocate all the tensors, - * updateBatch and then allocate again to avoid such issues. - */ - void updateBatch(unsigned int batch); - - /** - * @brief return whether tensor is contiguous or not. 
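// [Editorial aside, not part of the patch] A sketch of the save()/read() pair
// declared above: both require a contiguous tensor and stream the raw buffer,
// so a round trip through a binary file restores the contents. File name and
// include path are assumptions.
#include <tensor_v2.h>
#include <fstream>

void save_read_sketch() {
  nntrainer::TensorV2 src(1, 1, 2, 2);
  src.setRandUniform();

  std::ofstream out("tensor.bin", std::ios::binary);
  src.save(out);
  out.close();

  nntrainer::TensorV2 dst(1, 1, 2, 2); // same dimensions as the saved tensor
  std::ifstream in("tensor.bin", std::ios::binary);
  dst.read(in);
}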
- * @retval bool contiguous - */ - const bool getContiguous() const noexcept; - - /** - * @brief return current stride of tensor. - * @retval int[MAXDIM] strides - */ - const std::array getStrides() const noexcept; - - /** - * @brief Check if two given axes are contiguous - * @param[in] np1 first axis - * @param[in] np2 second axis to compare with first axis - * @retval bool continuous - */ - bool checkContinuous(unsigned int np1, unsigned int np2) const; - - /** - * @brief Set name of the tensor - * @param[in] name_ tensor name - */ - void setName(const std::string &name_); - - /** - * @brief Get name of the tensor - * @retval string name - */ - const std::string &getName() const; - - /** - * @brief Get linear index given the n-d index - */ - size_t getIndex(unsigned int b, unsigned int c, unsigned int h, - unsigned int w) const noexcept; - /** - * @brief Get size of current tensor - * @retval unsigned int size of the current tensor - */ - size_t size() const; - - /** - * @brief Get if the tensor is empty - * @retval true if the tensor is empty - */ - bool empty() const; - - /** - * @brief Get size of the data in bytes - * @retval size_t Size in bytes - */ - size_t bytes() const; - - /** - * @brief return Tensor batch size - * @retval batch size - */ - size_t batch() const; - - /** - * @brief return Tensor channel size - * @retval channel size - */ - size_t channel() const; - - /** - * @brief return Tensor height size - * @retval height size - */ - size_t height() const; - - /** - * @brief return Tensor width size - * @retval width size - */ - size_t width() const; - - /** - * @brief Merge the given two axis for tensor at second axis inplace - * - * @param axis1 first axis to merge - * @param axis2 second axis to merge - */ - void mergeAxis(unsigned int axis1, unsigned int axis2); - - /** - * @brief Update destination tensor to share memory with source tensor - * - * @param src src tensor containing the memory - * @param dest destination tensor which will share the memory - * @param offset offset to be used from the start of the data in bytes - * @note The new tensor will share the same data as the current tensor but - * can have different size. - * @note New size added with offset must be less than the size of the original - * tensor. - */ - void createSharedDataTensor(const TensorV2 &src, TensorV2 &dest, - size_t offset) const; - - /** - * @brief Get new tensor which shares memory with current tensor but different - * shape - * - * @param dim new dimension to be set for this tensor - * @param offset offset to be used from the start of the data in elements - * @note The new tensor will share the same data as the current tensor but - * can have different size. - * @note New size added with offset must be less than the size of the original - * tensor. 
- */ - TensorV2 getSharedDataTensor(const TensorDim dim_, size_t offset, - bool reset_stride = true, - const std::string &name_ = "") const; - - /** - * @brief Swaps Tensor lhs and rhs - * @param[in] lhs Tensor to be swapped - * @param[in] rhs Tensor to be swapped - */ - friend void swap(TensorV2 &lhs, TensorV2 &rhs) noexcept { - std::swap(lhs.itensor, rhs.itensor); - } - -private: - std::shared_ptr itensor; - - /** - * @brief Set tensor variables - * - * @param[in] d TensorDim - * @param[in] buf buffer - * @param[in] offset offset to be used - */ - void setTensorVar(TensorDim d, void *buf, size_t offset); -}; - -/** - * @brief Overriding output stream - */ -std::ostream &operator<<(std::ostream &out, TensorV2 const &input); - -} // namespace nntrainer - -#endif /* __cplusplus */ -#endif /* __TENSOR_V2_H__ */ diff --git a/nntrainer/tensor/tensor_wrap_specs.h b/nntrainer/tensor/tensor_wrap_specs.h index 6a5195fef5..3f5f9b192d 100644 --- a/nntrainer/tensor/tensor_wrap_specs.h +++ b/nntrainer/tensor/tensor_wrap_specs.h @@ -75,9 +75,8 @@ enum class TensorLifespan { * regularizer_constant, decay, clip gradient constant, need_gradient property, * name, output axis of the tensor object and loss Scale Factor. */ -typedef std::tuple +typedef std::tuple WeightSpec; /** @@ -86,7 +85,7 @@ typedef std::tuple VarGradSpec; @@ -131,8 +130,7 @@ struct TensorSpecV2 { std::string name; /**< Identifier */ TensorDim dim; /**< dimension */ TensorLifespan ls; /**< lifespan */ - Tensor::Initializer initializer = - Tensor::Initializer::NONE; /**< initializer */ + Initializer initializer = Initializer::NONE; /**< initializer */ /** ONLY USED FOR READ_ONLY_VIEW, MAYBE_MODIFYING_VIEW */ unsigned int offset = 0u; /**< tensor offset */ diff --git a/nntrainer/tensor/var_grad.cpp b/nntrainer/tensor/var_grad.cpp index 09dbf6267e..e91c918a9f 100644 --- a/nntrainer/tensor/var_grad.cpp +++ b/nntrainer/tensor/var_grad.cpp @@ -18,7 +18,7 @@ namespace nntrainer { -Var_Grad::Var_Grad(const TensorDim &dim, const Tensor::Initializer init, +Var_Grad::Var_Grad(const TensorDim &dim, const Initializer init, bool need_gradient, bool alloc_now, const std::string &name) : is_dependent(false), @@ -32,15 +32,15 @@ Var_Grad::Var_Grad(const TensorDim &dim, const Tensor::Initializer init, * @todo gradient initializer should be none, and then they should be set * zero right before using by the user itself. */ - grad = std::make_shared(dim, alloc_now, Tensor::Initializer::ZEROS, - grad_name); + grad = + std::make_shared(dim, alloc_now, Initializer::ZEROS, grad_name); else grad = std::make_shared(grad_name); } Var_Grad::Var_Grad(const TensorDim &dim_v, const TensorDim &dim_g, - const Tensor::Initializer init, bool need_gradient, - bool alloc_now, const std::string &name) : + const Initializer init, bool need_gradient, bool alloc_now, + const std::string &name) : is_dependent(false), is_first_access_gradient(false), is_last_access_gradient(false) { @@ -53,8 +53,8 @@ Var_Grad::Var_Grad(const TensorDim &dim_v, const TensorDim &dim_g, * zero right before using by the user itself. 
*/ - grad = std::make_shared(dim_g, alloc_now, - Tensor::Initializer::ZEROS, grad_name); + grad = + std::make_shared(dim_g, alloc_now, Initializer::ZEROS, grad_name); else grad = std::make_shared(grad_name); } diff --git a/nntrainer/tensor/var_grad.h b/nntrainer/tensor/var_grad.h index 52cabbc055..48a4fcf261 100644 --- a/nntrainer/tensor/var_grad.h +++ b/nntrainer/tensor/var_grad.h @@ -55,9 +55,8 @@ class Var_Grad { * @param name Name for this Var_Grad */ explicit Var_Grad(const TensorDim &dim, - const Tensor::Initializer init = Tensor::Initializer::NONE, - bool ng = true, bool alloc_now = false, - const std::string &name = ""); + const Initializer init = Initializer::NONE, bool ng = true, + bool alloc_now = false, const std::string &name = ""); /** * @brief Construct a new Var_Grad object @@ -69,9 +68,8 @@ class Var_Grad { * @param name Name for this Var_Grad */ explicit Var_Grad(const TensorDim &dim_v, const TensorDim &dim_g, - const Tensor::Initializer init = Tensor::Initializer::NONE, - bool ng = true, bool alloc_now = false, - const std::string &name = ""); + const Initializer init = Initializer::NONE, bool ng = true, + bool alloc_now = false, const std::string &name = ""); /** * @brief Construct a new Var_Grad object diff --git a/nntrainer/tensor/weight.cpp b/nntrainer/tensor/weight.cpp index f98c8c8356..b1b3990388 100644 --- a/nntrainer/tensor/weight.cpp +++ b/nntrainer/tensor/weight.cpp @@ -18,7 +18,7 @@ namespace nntrainer { -Weight::Weight(const TensorDim &dim, const Tensor::Initializer init, +Weight::Weight(const TensorDim &dim, const Initializer init, const WeightRegularizer reg, const float reg_const, const float decay_const, const float max_norm, bool train, bool alloc_now_, std::string name, unsigned int axis, @@ -30,14 +30,14 @@ Weight::Weight(const TensorDim &dim, const Tensor::Initializer init, clip_by_global_norm(max_norm), output_axis(axis), loss_scale(loss_scale_) { - if (init == Tensor::Initializer::NONE) + if (init == Initializer::NONE) throw std::invalid_argument("Weight initializer cannot be none"); if (regularizer == WeightRegularizer::UNKNOWN) throw std::invalid_argument("Weight regularizer unknown"); } Weight::Weight(const TensorDim &dim_v, const TensorDim &dim_g, - const Tensor::Initializer init, const WeightRegularizer reg, + const Initializer init, const WeightRegularizer reg, const float reg_const, const float decay_const, const float max_norm, bool train, bool alloc_now_, std::string name, unsigned int axis, float loss_scale_) : @@ -48,7 +48,7 @@ Weight::Weight(const TensorDim &dim_v, const TensorDim &dim_g, clip_by_global_norm(max_norm), output_axis(axis), loss_scale(loss_scale_) { - if (init == Tensor::Initializer::NONE) + if (init == Initializer::NONE) throw std::invalid_argument("Weight initializer cannot be none"); if (regularizer == WeightRegularizer::UNKNOWN) throw std::invalid_argument("Weight regularizer unknown"); diff --git a/nntrainer/tensor/weight.h b/nntrainer/tensor/weight.h index 552f6d5739..36005eaee2 100644 --- a/nntrainer/tensor/weight.h +++ b/nntrainer/tensor/weight.h @@ -59,14 +59,13 @@ class Weight : public Var_Grad { * @param alloc_now The memory for the weight tensors be allocated upon init * @param name Name for this weight */ - explicit Weight( - const TensorDim &dim, - const Tensor::Initializer init = Tensor::Initializer::XAVIER_UNIFORM, - const WeightRegularizer reg = WeightRegularizer::NONE, - const float reg_const = 1.0f, const float decay = 0.0f, - const float clip_by_global_norm = 0.0f, bool ng = true, - bool alloc_now = 
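// [Editorial aside, not part of the patch] After this change, call sites spell
// the initializer enum as nntrainer::Initializer instead of
// nntrainer::Tensor::Initializer, e.g. when constructing a Weight directly as
// the updated unit tests below do. Include path assumed.
#include <weight.h>

void make_weight_sketch() {
  nntrainer::TensorDim dim({1, 1, 4, 4});
  nntrainer::Weight w(dim, nntrainer::Initializer::XAVIER_UNIFORM,
                      nntrainer::WeightRegularizer::NONE,
                      /*reg_const=*/1.0f, /*decay=*/0.0f,
                      /*clip_by_global_norm=*/0.0f,
                      /*ng=*/true, /*alloc_now=*/false, "demo_weight");
}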
false, std::string name = "", unsigned int axis = 3, - float loss_scale_ = 0.0); + explicit Weight(const TensorDim &dim, + const Initializer init = Initializer::XAVIER_UNIFORM, + const WeightRegularizer reg = WeightRegularizer::NONE, + const float reg_const = 1.0f, const float decay = 0.0f, + const float clip_by_global_norm = 0.0f, bool ng = true, + bool alloc_now = false, std::string name = "", + unsigned int axis = 3, float loss_scale_ = 0.0); /** * @brief Construct a new Weight object @@ -80,14 +79,13 @@ class Weight : public Var_Grad { * @param alloc_now The memory for the weight tensors be allocated upon init * @param name Name for this weight */ - explicit Weight( - const TensorDim &dim_v, const TensorDim &dim_g, - const Tensor::Initializer init = Tensor::Initializer::XAVIER_UNIFORM, - const WeightRegularizer reg = WeightRegularizer::NONE, - const float reg_const = 1.0f, const float decay = 0.0f, - const float clip_by_global_norm = 0.0f, bool ng = true, - bool alloc_now = false, std::string name = "", unsigned int axis = 3, - float loss_scale_ = 0.0); + explicit Weight(const TensorDim &dim_v, const TensorDim &dim_g, + const Initializer init = Initializer::XAVIER_UNIFORM, + const WeightRegularizer reg = WeightRegularizer::NONE, + const float reg_const = 1.0f, const float decay = 0.0f, + const float clip_by_global_norm = 0.0f, bool ng = true, + bool alloc_now = false, std::string name = "", + unsigned int axis = 3, float loss_scale_ = 0.0); /** * @brief Construct a new Weight object @@ -97,7 +95,7 @@ class Weight : public Var_Grad { explicit Weight(const Spec &spec, bool alloc_now = false) : Weight(std::get<0>(spec), // TensorDim for Variable std::get<1>(spec), // TensorDim for Gradient - std::get<2>(spec), // Tensor::Initializer + std::get<2>(spec), // Initializer std::get<3>(spec), // WeightRegularizer std::get<4>(spec), // WeightRegularizerConstant std::get<5>(spec), // weight decay constant diff --git a/packaging/nntrainer.spec b/packaging/nntrainer.spec index 36ba371d22..5788e708b5 100644 --- a/packaging/nntrainer.spec +++ b/packaging/nntrainer.spec @@ -527,7 +527,6 @@ cp -r result %{buildroot}%{_datadir}/nntrainer/unittest/ # tensor headers %{_includedir}/nntrainer/memory_data.h %{_includedir}/nntrainer/tensor.h -%{_includedir}/nntrainer/tensor_v2.h %{_includedir}/nntrainer/tensor_base.h %{_includedir}/nntrainer/float_tensor.h %if 0%{?enable_fp16} diff --git a/test/include/nntrainer_test_util.h b/test/include/nntrainer_test_util.h index 74eef4abaa..94601938f9 100644 --- a/test/include/nntrainer_test_util.h +++ b/test/include/nntrainer_test_util.h @@ -38,7 +38,6 @@ #include #include #include -#include /** tolerance is reduced for packaging, but CI runs at full tolerance */ #ifdef REDUCE_TOLERANCE @@ -170,31 +169,6 @@ randUniform(unsigned int batch, unsigned channel, unsigned height, nntrainer::Tformat fm = nntrainer::Tformat::NCHW, nntrainer::Tdatatype d_type = nntrainer::Tdatatype::FP32); -/** - * @brief return a tensor filled with contant value with dimension - */ -nntrainer::TensorV2 -constantV2(float value, unsigned int d0, unsigned d1, unsigned d2, unsigned d3, - nntrainer::Tformat fm = nntrainer::Tformat::NCHW, - nntrainer::Tdatatype d_type = nntrainer::Tdatatype::FP32); - -/** - * @brief return a tensor filled with ranged value with given dimension - */ -nntrainer::TensorV2 -rangedV2(unsigned int batch, unsigned channel, unsigned height, unsigned width, - nntrainer::Tformat fm = nntrainer::Tformat::NCHW, - nntrainer::Tdatatype d_type = nntrainer::Tdatatype::FP32); - -/** - 
* @brief return a tensor filled with random value with given dimension - */ -nntrainer::TensorV2 -randUniformV2(unsigned int batch, unsigned channel, unsigned height, - unsigned width, float min = -1, float max = 1, - nntrainer::Tformat fm = nntrainer::Tformat::NCHW, - nntrainer::Tdatatype d_type = nntrainer::Tdatatype::FP32); - /** * @brief replace string and save in file * @param[in] from string to be replaced diff --git a/test/nntrainer_test_util.cpp b/test/nntrainer_test_util.cpp index bcc33e40c8..260727f212 100644 --- a/test/nntrainer_test_util.cpp +++ b/test/nntrainer_test_util.cpp @@ -213,45 +213,6 @@ nntrainer::Tensor randUniform(unsigned int batch, unsigned int channel, return t; } -nntrainer::TensorV2 constantV2(float value, unsigned int d0, unsigned int d1, - unsigned int d2, unsigned int d3, - nntrainer::Tformat fm, - nntrainer::Tdatatype d_type) { - nntrainer::TensorV2 t(d0, d1, d2, d3, {fm, d_type}); - t.setValue(value); - - return t; -} - -nntrainer::TensorV2 rangedV2(unsigned int batch, unsigned int channel, - unsigned int height, unsigned int width, - nntrainer::Tformat fm, - nntrainer::Tdatatype d_type) { - nntrainer::TensorV2 t(batch, channel, height, width, {fm, d_type}); - if (d_type == nntrainer::Tdatatype::FP32) { - float i = 0; - t = t.apply((std::function)[&](float in) { return i++; }); - } else if (d_type == nntrainer::Tdatatype::FP16) { -#ifdef ENABLE_FP16 - _FP16 i = 0; - t = t.apply((std::function<_FP16(_FP16)>)[&](_FP16 in) { return i++; }); -#else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); -#endif - } - - return t; -} - -nntrainer::TensorV2 randUniformV2(unsigned int batch, unsigned int channel, - unsigned int height, unsigned int width, - float min, float max, nntrainer::Tformat fm, - nntrainer::Tdatatype d_type) { - nntrainer::TensorV2 t(batch, channel, height, width, {fm, d_type}); - t.setRandUniform(min, max); - return t; -} - const std::string getResPath(const std::string &filename, const std::initializer_list fallback_base) { diff --git a/test/unittest/layers/layers_golden_tests.cpp b/test/unittest/layers/layers_golden_tests.cpp index 56d591019b..152f9e0934 100644 --- a/test/unittest/layers/layers_golden_tests.cpp +++ b/test/unittest/layers/layers_golden_tests.cpp @@ -90,7 +90,7 @@ static TensorPacks prepareTensors(const InitLayerContext &context, vg.reserve(dims.size()); for (auto &dim : dims) { - vg.emplace_back(dim, Tensor::Initializer::NONE, true, true, "golden"); + vg.emplace_back(dim, Initializer::NONE, true, true, "golden"); sizeCheckedReadTensor(vg.back().getVariableRef(), file, vg.back().getName()); } @@ -113,8 +113,8 @@ static TensorPacks prepareTensors(const InitLayerContext &context, for (auto &spec : specs) { /// @todo initializer should be depending is as well - vg.emplace_back(spec.variable_spec.dim, Tensor::Initializer::NONE, true, - true, "golden"); + vg.emplace_back(spec.variable_spec.dim, Initializer::NONE, true, true, + "golden"); } return vg; }; diff --git a/test/unittest/layers/unittest_layer_node.cpp b/test/unittest/layers/unittest_layer_node.cpp index 3b41f02f30..9faf44e8af 100644 --- a/test/unittest/layers/unittest_layer_node.cpp +++ b/test/unittest/layers/unittest_layer_node.cpp @@ -123,9 +123,9 @@ TEST(nntrainer_LayerNode, finalize_04_p) { */ TEST(nntrainer_LayerNode, finalize_05_n) { std::unique_ptr lnode; - nntrainer::Var_Grad input = nntrainer::Var_Grad( - nntrainer::TensorDim({1, 1, 1, 1}), nntrainer::Tensor::Initializer::NONE, - true, false, "dummy"); + nntrainer::Var_Grad input = + 
nntrainer::Var_Grad(nntrainer::TensorDim({1, 1, 1, 1}), + nntrainer::Initializer::NONE, true, false, "dummy"); EXPECT_NO_THROW(lnode = nntrainer::createLayerNode(nntrainer::IdentityLayer::type)); @@ -284,16 +284,15 @@ TEST(nntrainer_LayerNode, setWeights_01_n) { */ TEST(nntrainer_LayerNode, setWeights_02_n) { std::unique_ptr lnode; - nntrainer::Weight weight = - nntrainer::Weight(nntrainer::TensorDim({1, 1, 1, 1}), - nntrainer::Tensor::Initializer::XAVIER_UNIFORM, - nntrainer::WeightRegularizer::NONE, 1.0f, 0.0f, 0.0f, - true, false, "weight"); + nntrainer::Weight weight = nntrainer::Weight( + nntrainer::TensorDim({1, 1, 1, 1}), nntrainer::Initializer::XAVIER_UNIFORM, + nntrainer::WeightRegularizer::NONE, 1.0f, 0.0f, 0.0f, true, false, + "weight"); float *float_ptr[2] = {nullptr, nullptr}; const std::vector new_weights({float_ptr[0], float_ptr[1]}); - nntrainer::Var_Grad input = nntrainer::Var_Grad( - nntrainer::TensorDim({1, 1, 1, 1}), nntrainer::Tensor::Initializer::NONE, - true, false, "dummy"); + nntrainer::Var_Grad input = + nntrainer::Var_Grad(nntrainer::TensorDim({1, 1, 1, 1}), + nntrainer::Initializer::NONE, true, false, "dummy"); EXPECT_NO_THROW(lnode = nntrainer::createLayerNode(nntrainer::IdentityLayer::type)); diff --git a/test/unittest/meson.build b/test/unittest/meson.build index b1977ea8d1..931570739a 100644 --- a/test/unittest/meson.build +++ b/test/unittest/meson.build @@ -39,7 +39,6 @@ test_target = [ ['unittest_nntrainer_internal', []], ['unittest_nntrainer_lazy_tensor', []], ['unittest_nntrainer_tensor', []], - ['unittest_nntrainer_tensor_v2', []], ['unittest_nntrainer_tensor_nhwc', []], ['unittest_util_func', []], ['unittest_nntrainer_modelfile', []], @@ -58,7 +57,6 @@ test_target = [ if get_option('enable-fp16') test_target += [['unittest_nntrainer_tensor_fp16', []]] test_target += [['unittest_nntrainer_tensor_pool_fp16', []]] - test_target += [['unittest_nntrainer_tensor_v2_fp16', []]] endif if get_option('enable-profile') diff --git a/test/unittest/unittest_nntrainer_tensor.cpp b/test/unittest/unittest_nntrainer_tensor.cpp index 12c8873055..0f0fda6534 100644 --- a/test/unittest/unittest_nntrainer_tensor.cpp +++ b/test/unittest/unittest_nntrainer_tensor.cpp @@ -199,76 +199,76 @@ TEST(nntrainer_Tensor, Tensor_03_p) { EXPECT_EQ(status, ML_ERROR_NONE); } -TEST(nntrainer_Tensor, Tensor_04_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int height = 3; - int width = 10; - std::vector>> in; - - for (int k = 0; k < batch; ++k) { - std::vector> ttv; - for (int i = 0; i < height; ++i) { - std::vector tv; - for (int j = 0; j < width; ++j) { - tv.push_back(k * height * width + i * width + j); - } - ttv.push_back(tv); - } - in.push_back(ttv); - } - - nntrainer::Tensor tensor = nntrainer::Tensor( - in, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}); - ASSERT_NE(nullptr, tensor.getData()); - - if (tensor.getValue(0, 0, 0, 1) != 1) - status = ML_ERROR_INVALID_PARAMETER; - EXPECT_EQ(status, ML_ERROR_NONE); -} +// TEST(nntrainer_Tensor, Tensor_04_p) { +// int status = ML_ERROR_NONE; +// int batch = 3; +// int height = 3; +// int width = 10; +// std::vector>> in; + +// for (int k = 0; k < batch; ++k) { +// std::vector> ttv; +// for (int i = 0; i < height; ++i) { +// std::vector tv; +// for (int j = 0; j < width; ++j) { +// tv.push_back(k * height * width + i * width + j); +// } +// ttv.push_back(tv); +// } +// in.push_back(ttv); +// } -TEST(nntrainer_Tensor, Tensor_05_p) { - int status = ML_ERROR_NONE; - std::vector>> in = {{{0, 1}, {2, 3}}, - {{4, 5}, {6, 7}}, - {{8, 
9}, {10, 11}}, - {{12, 13}, {14, 15}}}; +// nntrainer::Tensor tensor = nntrainer::Tensor( +// in, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}); +// ASSERT_NE(nullptr, tensor.getData()); - nntrainer::Tensor tensor = nntrainer::Tensor( - in, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4}); - ASSERT_NE(nullptr, tensor.getData()); - - for (size_t b = 0; b < tensor.batch(); ++b) { - for (size_t c = 0; c < tensor.channel(); ++c) { - for (size_t h = 0; h < tensor.height(); ++h) { - for (size_t w = 0; w < tensor.width(); ++w) { - size_t idx = tensor.getIndex(b, c, h, w); - ASSERT_EQ(idx, tensor.getValueQint4(idx)); - } - } - } - } -} - -TEST(nntrainer_Tensor, Tensor_06_p) { - int status = ML_ERROR_NONE; - nntrainer::Tensor tensor = nntrainer::Tensor( - 1, 4, 2, 2, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4}); - ASSERT_NE(nullptr, tensor.getData()); +// if (tensor.getValue(0, 0, 0, 1) != 1) +// status = ML_ERROR_INVALID_PARAMETER; +// EXPECT_EQ(status, ML_ERROR_NONE); +// } - tensor.setValue(2); +// TEST(nntrainer_Tensor, Tensor_05_p) { +// int status = ML_ERROR_NONE; +// std::vector>> in = {{{0, 1}, {2, 3}}, +// {{4, 5}, {6, 7}}, +// {{8, 9}, {10, 11}}, +// {{12, 13}, {14, 15}}}; + +// nntrainer::Tensor tensor = nntrainer::Tensor( +// in, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4}); +// ASSERT_NE(nullptr, tensor.getData()); + +// for (size_t b = 0; b < tensor.batch(); ++b) { +// for (size_t c = 0; c < tensor.channel(); ++c) { +// for (size_t h = 0; h < tensor.height(); ++h) { +// for (size_t w = 0; w < tensor.width(); ++w) { +// size_t idx = tensor.getIndex(b, c, h, w); +// ASSERT_EQ(idx, tensor.getValueQint4(idx)); +// } +// } +// } +// } +// } - for (size_t b = 0; b < tensor.batch(); ++b) { - for (size_t c = 0; c < tensor.channel(); ++c) { - for (size_t h = 0; h < tensor.height(); ++h) { - for (size_t w = 0; w < tensor.width(); ++w) { - size_t idx = tensor.getIndex(b, c, h, w); - ASSERT_EQ(2, tensor.getValueQint4(idx)); - } - } - } - } -} +// TEST(nntrainer_Tensor, Tensor_06_p) { +// int status = ML_ERROR_NONE; +// nntrainer::Tensor tensor = nntrainer::Tensor( +// 1, 4, 2, 2, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4}); +// ASSERT_NE(nullptr, tensor.getData()); + +// tensor.setValue(2); + +// for (size_t b = 0; b < tensor.batch(); ++b) { +// for (size_t c = 0; c < tensor.channel(); ++c) { +// for (size_t h = 0; h < tensor.height(); ++h) { +// for (size_t w = 0; w < tensor.width(); ++w) { +// size_t idx = tensor.getIndex(b, c, h, w); +// ASSERT_EQ(2, tensor.getValueQint4(idx)); +// } +// } +// } +// } +// } TEST(nntrainer_Tensor, multiply_i_01_p) { int status = ML_ERROR_NONE; @@ -3217,19 +3217,19 @@ TEST(nntrainer_Tensor, print_small_size) { EXPECT_EQ(ss.str(), expected.str()); } -// TEST(nntrainer_Tensor, print_large_size) { -// nntrainer::Tensor target = constant(1.2, 3, 10, 10, 10); +TEST(nntrainer_Tensor, print_large_size) { + nntrainer::Tensor target = constant(1.2, 3, 10, 10, 10); -// std::stringstream ss, expected; + std::stringstream ss, expected; -// expected << '<' << typeid(target).name() << " at " << &target << ">\n" -// << "data addr: " << target.getData() << '\n' -// << "Shape: 3:10:10:10\n" -// << "[1.2 1.2 1.2 ... 1.2 1.2 1.2]\n"; -// ss << target; + expected << '<' << typeid(target).name() << " at " << &target << ">\n" + << "data addr: " << target.getData() << '\n' + << "Shape: 3:10:10:10 [ FP32 : NCHW ]\n" + << "[1.2 1.2 1.2 ... 
1.2 1.2 1.2]\n"; + ss << target; -// EXPECT_EQ(ss.str(), expected.str()); -// } + EXPECT_EQ(ss.str(), expected.str()); +} TEST(nntrainer_Tensor, DISABLED_equation_test_01_p) { nntrainer::Tensor a, b, c; @@ -3342,28 +3342,28 @@ TEST(nntrainer_Tensor, allocate_03_p) { EXPECT_TRUE(t.isAllocated()); } -TEST(nntrainer_Tensor, allocate_04_p) { - nntrainer::Tensor t( - {1, 2, 3, 4, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}}, - true); - EXPECT_TRUE(t.isAllocated()); +// TEST(nntrainer_Tensor, allocate_04_p) { +// nntrainer::Tensor t( +// {1, 2, 3, 4, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}}, +// true); +// EXPECT_TRUE(t.isAllocated()); - t.allocate(); - EXPECT_TRUE(t.isAllocated()); -} +// t.allocate(); +// EXPECT_TRUE(t.isAllocated()); +// } -TEST(nntrainer_Tensor, allocate_05_p) { - nntrainer::Tensor t( - {1, 2, 3, 4, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4}}, - true); - EXPECT_TRUE(t.isAllocated()); +// TEST(nntrainer_Tensor, allocate_05_p) { +// nntrainer::Tensor t( +// {1, 2, 3, 4, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4}}, +// true); +// EXPECT_TRUE(t.isAllocated()); - t.allocate(); - EXPECT_TRUE(t.isAllocated()); -} +// t.allocate(); +// EXPECT_TRUE(t.isAllocated()); +// } TEST(nntrainer_Tensor, initialize_01_p) { - nntrainer::Tensor t({1, 2, 3, 4}, true, nntrainer::Tensor::Initializer::ONES); + nntrainer::Tensor t({1, 2, 3, 4}, true, nntrainer::Initializer::ONES); nntrainer::Tensor golden(1, 2, 3, 4); golden.setValue(1); @@ -3379,13 +3379,12 @@ TEST(nntrainer_Tensor, initialize_02_p) { EXPECT_NE(golden, t); - t.initialize(nntrainer::Tensor::Initializer::ONES); + t.initialize(nntrainer::Initializer::ONES); EXPECT_EQ(golden, t); } TEST(nntrainer_Tensor, initialize_03_p) { - nntrainer::Tensor t({1, 2, 3, 4}, false, - nntrainer::Tensor::Initializer::ONES); + nntrainer::Tensor t({1, 2, 3, 4}, false, nntrainer::Initializer::ONES); t.allocate(); nntrainer::Tensor golden(1, 2, 3, 4); @@ -3396,7 +3395,7 @@ TEST(nntrainer_Tensor, initialize_03_p) { TEST(nntrainer_Tensor, initialize_04_p) { nntrainer::Tensor t({1, 2, 3, 4}, false); - t.initialize(nntrainer::Tensor::Initializer::ONES); + t.initialize(nntrainer::Initializer::ONES); t.allocate(); nntrainer::Tensor golden(1, 2, 3, 4); @@ -3417,23 +3416,22 @@ TEST(nntrainer_Tensor, initialize_05_p) { * EXPECT_NE(golden, t); */ - t.initialize(nntrainer::Tensor::Initializer::ONES); + t.initialize(nntrainer::Initializer::ONES); EXPECT_EQ(golden, t); } TEST(nntrainer_Tensor, initialize_06_n) { - nntrainer::Tensor t({1, 2, 3, 4}, true, nntrainer::Tensor::Initializer::ONES); - nntrainer::Tensor golden({1, 2, 3, 4}, true, - nntrainer::Tensor::Initializer::ZEROS); + nntrainer::Tensor t({1, 2, 3, 4}, true, nntrainer::Initializer::ONES); + nntrainer::Tensor golden({1, 2, 3, 4}, true, nntrainer::Initializer::ZEROS); EXPECT_NE(golden, t); - golden.initialize(nntrainer::Tensor::Initializer::ONES); + golden.initialize(nntrainer::Initializer::ONES); EXPECT_EQ(golden, t); } TEST(nntrainer_Tensor, initialize_07_p) { - nntrainer::Tensor t({1, 2, 3, 4}, true, nntrainer::Tensor::Initializer::ONES); + nntrainer::Tensor t({1, 2, 3, 4}, true, nntrainer::Initializer::ONES); nntrainer::Tensor golden(1, 2, 3, 4); golden.setValue(1); @@ -3449,39 +3447,37 @@ TEST(nntrainer_Tensor, initialize_07_p) { } TEST(nntrainer_Tensor, initialize_08_p) { - nntrainer::Tensor t({1, 2, 3, 4}, true, nntrainer::Tensor::Initializer::ONES); + nntrainer::Tensor t({1, 2, 3, 4}, true, nntrainer::Initializer::ONES); nntrainer::Tensor golden(1, 2, 
3, 4); golden.setValue(1); EXPECT_EQ(golden, t); - t.initialize(nntrainer::Tensor::Initializer::HE_NORMAL); + t.initialize(nntrainer::Initializer::HE_NORMAL); EXPECT_NE(golden, t); t.initialize(); EXPECT_NE(golden, t); - t.initialize(nntrainer::Tensor::Initializer::ONES); + t.initialize(nntrainer::Initializer::ONES); EXPECT_EQ(golden, t); t.initialize(); EXPECT_EQ(golden, t); } -TEST(nntrainer_Tensor, initialize_09_p) { - nntrainer::Tensor t( - {1, 2, 3, 4, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4}}, true, - nntrainer::Tensor::Initializer::ONES); - nntrainer::Tensor golden( - {1, 2, 3, 4, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4}}, true, - nntrainer::Tensor::Initializer::ZEROS); - - EXPECT_NE(golden, t); - - golden.initialize(nntrainer::Tensor::Initializer::ONES); - EXPECT_EQ(golden, t); -} +// TEST(nntrainer_Tensor, initialize_09_p) { +// nntrainer::Tensor t( +// {1, 2, 3, 4, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4}}, +// true, nntrainer::Initializer::ONES); +// nntrainer::Tensor golden( +// {1, 2, 3, 4, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4}}, +// true, nntrainer::Initializer::ZEROS); +// EXPECT_NE(golden, t); +// golden.initialize(nntrainer::Initializer::ONES); +// EXPECT_EQ(golden, t); +// } TEST(nntrainer_Tensor, split_01_p) { { @@ -4070,22 +4066,6 @@ TEST(nntrainer_Tensor, TensorWrap_02_n) { EXPECT_THROW(nntrainer::Tensor::Map(dat, 3, {4}), std::invalid_argument); } -TEST(nntrainer_Tensor, TensorPaddedValue_p) { - nntrainer::Tensor a = ranged(1, 1, 3, 3); - float default_padded = -1; - - for (int i = 0; i < 5; ++i) { - for (int j = 0; j < 5; ++j) { - float expected = default_padded; - if (1 <= i && i <= 3 && 1 <= j && j <= 3) { - expected = (i - 1) * 3 + (j - 1); - } - float actual = a.getValuePaddedVirtual(0, 0, i, j, 1, 1, default_padded); - EXPECT_FLOAT_EQ(actual, expected); - } - } -} - TEST(nntrainer_Tensor, add_strided_01_p) { int status = ML_ERROR_NONE; int batch = 3; @@ -4354,111 +4334,111 @@ TEST(nntrainer_Tensor, multiply_strided_06_p) { EXPECT_EQ(status, ML_ERROR_NONE); } -/** - * @brief dequantize FP32 tensor - */ -TEST(nntrainer_Tensor, dequantize_01_n) { - int batch = 1; - int channel = 3; - int height = 4; - int width = 5; - - nntrainer::Tensor input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - input.setScaleFactors({1.5, 1.0, 0.5}); - input.setZeroPoints({1, 4, 7}); +// /** +// * @brief dequantize FP32 tensor +// */ +// TEST(nntrainer_Tensor, dequantize_01_n) { +// int batch = 1; +// int channel = 3; +// int height = 4; +// int width = 5; - nntrainer::Tensor output(batch, channel, height, width); +// nntrainer::Tensor input(batch, channel, height, width); +// GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); +// input.setScaleFactors({1.5, 1.0, 0.5}); +// input.setZeroPoints({1, 4, 7}); - EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); -} +// nntrainer::Tensor output(batch, channel, height, width); -/** - * @brief dequantize tensor with different dimension - */ -TEST(nntrainer_Tensor, dequantize_02_n) { - int batch = 1; - int channel = 3; - int height = 4; - int width = 5; +// EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); +// } - nntrainer::Tensor input( - batch + 1, channel, height + 1, width + 1, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - input.setScaleFactors({1.5, 1.0, 0.5}); - input.setZeroPoints({1, 4, 7}); 
+// /** +// * @brief dequantize tensor with different dimension +// */ +// TEST(nntrainer_Tensor, dequantize_02_n) { +// int batch = 1; +// int channel = 3; +// int height = 4; +// int width = 5; - nntrainer::Tensor output(batch, channel, height, width); +// nntrainer::Tensor input( +// batch + 1, channel, height + 1, width + 1, +// {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}); +// GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); +// input.setScaleFactors({1.5, 1.0, 0.5}); +// input.setZeroPoints({1, 4, 7}); - EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); -} +// nntrainer::Tensor output(batch, channel, height, width); -/** - * @brief dequantize tensor with no scale factors - */ -TEST(nntrainer_Tensor, dequantize_03_n) { - int batch = 1; - int channel = 3; - int height = 4; - int width = 5; - - nntrainer::Tensor input( - batch, channel, height, width, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - - nntrainer::Tensor output(batch, channel, height, width); +// EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); +// } - EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); -} +// /** +// * @brief dequantize tensor with no scale factors +// */ +// TEST(nntrainer_Tensor, dequantize_03_n) { +// int batch = 1; +// int channel = 3; +// int height = 4; +// int width = 5; -/** - * @brief dequantize tensor with incorrect number of scale factors - */ -TEST(nntrainer_Tensor, dequantize_04_n) { - int batch = 1; - int channel = 3; - int height = 4; - int width = 5; +// nntrainer::Tensor input( +// batch, channel, height, width, +// {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}); +// GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - nntrainer::Tensor input( - batch, channel, height, width, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); +// nntrainer::Tensor output(batch, channel, height, width); - nntrainer::Tensor output( - batch, channel, height, width, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32}); +// EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); +// } - input.setScaleFactors({2.0, 1.5, 1.0, 0.5}); - input.setZeroPoints({2, 3, 4, 5}); - EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); - EXPECT_NO_THROW({ input.dequantize(output, 2); }); -} +// /** +// * @brief dequantize tensor with incorrect number of scale factors +// */ +// TEST(nntrainer_Tensor, dequantize_04_n) { +// int batch = 1; +// int channel = 3; +// int height = 4; +// int width = 5; + +// nntrainer::Tensor input( +// batch, channel, height, width, +// {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}); +// GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); + +// nntrainer::Tensor output( +// batch, channel, height, width, +// {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32}); + +// input.setScaleFactors({2.0, 1.5, 1.0, 0.5}); +// input.setZeroPoints({2, 3, 4, 5}); +// EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); +// EXPECT_NO_THROW({ input.dequantize(output, 2); }); +// } -/** - * @brief dequantize tensor to QINT8 - */ -TEST(nntrainer_Tensor, dequantize_05_n) { - int batch = 1; - int channel = 3; - int height = 4; - int width = 5; +// /** +// * @brief dequantize tensor to QINT8 +// */ +// TEST(nntrainer_Tensor, dequantize_05_n) { +// int batch = 1; +// int channel = 
3; +// int height = 4; +// int width = 5; - nntrainer::Tensor input( - batch, channel, height, width, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - input.setScaleFactors({1.5, 1.0, 0.5}); - input.setZeroPoints({1, 4, 7}); +// nntrainer::Tensor input( +// batch, channel, height, width, +// {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}); +// GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); +// input.setScaleFactors({1.5, 1.0, 0.5}); +// input.setZeroPoints({1, 4, 7}); - nntrainer::Tensor output( - batch, channel, height, width, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}); +// nntrainer::Tensor output( +// batch, channel, height, width, +// {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}); - EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); -} +// EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); +// } TEST(nntrainer_Tensor, sin_contiguous_p) { int batch = 1; @@ -4581,16 +4561,7 @@ TEST(nntrainer_Tensor, cos_uncontiguous_p) { shared_input.cos(shared_output); - for (int b = 0; b < batch; b++) { - for (int c = 0; c < channel; c++) { - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - EXPECT_NEAR(shared_output.getValue(b, c, h, w), - ground_truth.getValue(b, c, h, w), eps); - } - } - } - } + EXPECT_EQ(shared_output, ground_truth); } TEST(nntrainer_Tensor, sin_uncontiguous_p) { @@ -4612,6 +4583,7 @@ TEST(nntrainer_Tensor, sin_uncontiguous_p) { MOD); nntrainer::Tensor shared_input = input.getSharedDataTensor(dim, 0, false); + ground_truth.copy_with_stride(shared_input); for (int b = 0; b < batch; b++) { @@ -4627,16 +4599,7 @@ TEST(nntrainer_Tensor, sin_uncontiguous_p) { shared_input.sin(shared_output); - for (int b = 0; b < batch; b++) { - for (int c = 0; c < channel; c++) { - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - EXPECT_NEAR(shared_output.getValue(b, c, h, w), - ground_truth.getValue(b, c, h, w), eps); - } - } - } - } + EXPECT_EQ(shared_output, ground_truth); } TEST(nntrainer_Tensor, sin_unmatched_dim_n) { diff --git a/test/unittest/unittest_nntrainer_tensor_fp16.cpp b/test/unittest/unittest_nntrainer_tensor_fp16.cpp index c0b060108d..619aa77f3a 100644 --- a/test/unittest/unittest_nntrainer_tensor_fp16.cpp +++ b/test/unittest/unittest_nntrainer_tensor_fp16.cpp @@ -4960,8 +4960,7 @@ TEST(nntrainer_Tensor, initialize_01_p) { t_type.format = nntrainer::Tformat::NCHW; t_type.data_type = nntrainer::Tdatatype::FP16; - nntrainer::Tensor t({1, 2, 3, 4, t_type}, true, - nntrainer::Tensor::Initializer::ONES); + nntrainer::Tensor t({1, 2, 3, 4, t_type}, true, nntrainer::Initializer::ONES); nntrainer::Tensor golden(1, 2, 3, 4, t_type); golden.setValue(1); @@ -4981,7 +4980,7 @@ TEST(nntrainer_Tensor, initialize_02_p) { EXPECT_NE(golden, t); - t.initialize(nntrainer::Tensor::Initializer::ONES); + t.initialize(nntrainer::Initializer::ONES); EXPECT_EQ(golden, t); } @@ -4991,7 +4990,7 @@ TEST(nntrainer_Tensor, initialize_03_p) { t_type.data_type = nntrainer::Tdatatype::FP16; nntrainer::Tensor t({1, 2, 3, 4, t_type}, false, - nntrainer::Tensor::Initializer::ONES); + nntrainer::Initializer::ONES); t.allocate(); nntrainer::Tensor golden(1, 2, 3, 4, t_type); @@ -5006,7 +5005,7 @@ TEST(nntrainer_Tensor, initialize_04_p) { t_type.data_type = nntrainer::Tdatatype::FP16; nntrainer::Tensor t({1, 2, 3, 4, t_type}, false); - t.initialize(nntrainer::Tensor::Initializer::ONES); + 
t.initialize(nntrainer::Initializer::ONES); t.allocate(); nntrainer::Tensor golden(1, 2, 3, 4, t_type); @@ -5031,7 +5030,7 @@ TEST(nntrainer_Tensor, initialize_05_p) { * EXPECT_NE(golden, t); */ - t.initialize(nntrainer::Tensor::Initializer::ONES); + t.initialize(nntrainer::Initializer::ONES); EXPECT_EQ(golden, t); } @@ -5040,14 +5039,13 @@ TEST(nntrainer_Tensor, initialize_06_n) { t_type.format = nntrainer::Tformat::NCHW; t_type.data_type = nntrainer::Tdatatype::FP16; - nntrainer::Tensor t({1, 2, 3, 4, t_type}, true, - nntrainer::Tensor::Initializer::ONES); + nntrainer::Tensor t({1, 2, 3, 4, t_type}, true, nntrainer::Initializer::ONES); nntrainer::Tensor golden({1, 2, 3, 4, t_type}, true, - nntrainer::Tensor::Initializer::ZEROS); + nntrainer::Initializer::ZEROS); EXPECT_NE(golden, t); - golden.initialize(nntrainer::Tensor::Initializer::ONES); + golden.initialize(nntrainer::Initializer::ONES); EXPECT_EQ(golden, t); } @@ -5056,9 +5054,7 @@ TEST(nntrainer_Tensor, initialize_07_p) { t_type.format = nntrainer::Tformat::NCHW; t_type.data_type = nntrainer::Tdatatype::FP16; - nntrainer::Tensor t({1, 2, 3, 4, t_type}, true, - nntrainer::Tensor::Initializer::ONES); - + nntrainer::Tensor t({1, 2, 3, 4, t_type}, true, nntrainer::Initializer::ONES); nntrainer::Tensor golden(1, 2, 3, 4, t_type); golden.setValue(1); @@ -5077,8 +5073,7 @@ TEST(nntrainer_Tensor, initialize_08_p) { t_type.format = nntrainer::Tformat::NCHW; t_type.data_type = nntrainer::Tdatatype::FP16; - nntrainer::Tensor t({1, 2, 3, 4, t_type}, true, - nntrainer::Tensor::Initializer::ONES); + nntrainer::Tensor t({1, 2, 3, 4, t_type}, true, nntrainer::Initializer::ONES); nntrainer::Tensor golden(1, 2, 3, 4, t_type); golden.setValue(1.f); @@ -5086,12 +5081,12 @@ TEST(nntrainer_Tensor, initialize_08_p) { /// @todo this test case is not valid anymore, since /// std::uniform_real_distribution does not support _FP16 - // t.initialize(nntrainer::Tensor::Initializer::HE_NORMAL); + // t.initialize(nntrainer::Initializer::HE_NORMAL); // EXPECT_NE(golden, t); // t.initialize(); // EXPECT_NE(golden, t); - t.initialize(nntrainer::Tensor::Initializer::ONES); + t.initialize(nntrainer::Initializer::ONES); EXPECT_EQ(golden, t); t.initialize(); @@ -5796,405 +5791,471 @@ TEST(nntrainer_Tensor, TensorWrap_02_n) { EXPECT_THROW(nntrainer::Tensor::Map(dat, 3, {4}), std::invalid_argument); } -TEST(nntrainer_Tensor, TensorPaddedValue_p) { - nntrainer::Tensor a = - ranged(1, 1, 3, 3, nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16); - _FP16 default_padded = -1; - - for (int i = 0; i < 5; ++i) { - for (int j = 0; j < 5; ++j) { - _FP16 expected = default_padded; - if (1 <= i && i <= 3 && 1 <= j && j <= 3) { - expected = (i - 1) * 3 + (j - 1); - } - _FP16 actual = - a.getValuePaddedVirtual<_FP16>(0, 0, i, j, 1, 1, default_padded); - EXPECT_FLOAT_EQ(actual, expected); - } - } -} +// TEST(nntrainer_Tensor, TensorPaddedValue_p) { +// nntrainer::Tensor a = +// ranged(1, 1, 3, 3, nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16); +// _FP16 default_padded = -1; + +// for (int i = 0; i < 5; ++i) { +// for (int j = 0; j < 5; ++j) { +// _FP16 expected = default_padded; +// if (1 <= i && i <= 3 && 1 <= j && j <= 3) { +// expected = (i - 1) * 3 + (j - 1); +// } +// _FP16 actual = +// a.getValuePaddedVirtual<_FP16>(0, 0, i, j, 1, 1, default_padded); +// EXPECT_FLOAT_EQ(actual, expected); +// } +// } +// } -/** - * @brief dequantize FP16 tensor - */ -TEST(nntrainer_Tensor, dequantize_01_n) { - int batch = 1; - int channel = 3; - int height = 4; - int width = 5; +// /** 
+// * @brief dequantize FP16 tensor +// */ +// TEST(nntrainer_Tensor, dequantize_01_n) { +// int batch = 1; +// int channel = 3; +// int height = 4; +// int width = 5; - nntrainer::Tensor input(batch, channel, height, width, - nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); +// nntrainer::Tensor input(batch, channel, height, width, +// nntrainer::Tformat::NCHW, +// nntrainer::Tdatatype::FP16); +// GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - input.setScaleFactorsFP16({static_cast<_FP16>(1.5), static_cast<_FP16>(1.0), - static_cast<_FP16>(0.5)}); - input.setZeroPoints({1, 4, 7}); +// input.setScaleFactorsFP16({static_cast<_FP16>(1.5), +// static_cast<_FP16>(1.0), +// static_cast<_FP16>(0.5)}); +// input.setZeroPoints({1, 4, 7}); - nntrainer::Tensor output(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); +// nntrainer::Tensor output(batch, channel, height, width, +// nntrainer::Tformat::NCHW, +// nntrainer::Tdatatype::FP16); - EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); -} +// EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); +// } -/** - * @brief dequantize tensor with different dimension - */ -TEST(nntrainer_Tensor, dequantize_02_n) { - int batch = 1; - int channel = 3; - int height = 4; - int width = 5; - - nntrainer::Tensor input( - batch + 1, channel, height + 1, width + 1, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); +// /** +// * @brief dequantize tensor with different dimension +// */ +// TEST(nntrainer_Tensor, dequantize_02_n) { +// int batch = 1; +// int channel = 3; +// int height = 4; +// int width = 5; - input.setScaleFactorsFP16({static_cast<_FP16>(1.5), static_cast<_FP16>(1.0), - static_cast<_FP16>(0.5)}); - input.setZeroPoints({1, 4, 7}); +// nntrainer::Tensor input( +// batch + 1, channel, height + 1, width + 1, +// {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}); +// GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - nntrainer::Tensor output(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); +// input.setScaleFactorsFP16({static_cast<_FP16>(1.5), +// static_cast<_FP16>(1.0), +// static_cast<_FP16>(0.5)}); +// input.setZeroPoints({1, 4, 7}); - EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); -} +// nntrainer::Tensor output(batch, channel, height, width, +// nntrainer::Tformat::NCHW, +// nntrainer::Tdatatype::FP16); -/** - * @brief dequantize tensor with no scale factors - */ -TEST(nntrainer_Tensor, dequantize_03_n) { - int batch = 1; - int channel = 3; - int height = 4; - int width = 5; - - nntrainer::Tensor input( - batch, channel, height, width, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); +// EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); +// } - nntrainer::Tensor output(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); +// /** +// * @brief dequantize tensor with no scale factors +// */ +// TEST(nntrainer_Tensor, dequantize_03_n) { +// int batch = 1; +// int channel = 3; +// int height = 4; +// int width = 5; - EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); -} +// nntrainer::Tensor input( +// batch, channel, height, width, +// {nntrainer::Tformat::NCHW, 
nntrainer::Tdatatype::QINT8}); +// GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); -/** - * @brief dequantize qint8 tensor to fp16 - */ -TEST(nntrainer_Tensor, dequantize_04_p) { - int batch = 1; - int channel = 3; - int height = 4; - int width = 5; - - nntrainer::Tensor input( - batch, channel, height, width, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); +// nntrainer::Tensor output(batch, channel, height, width, +// nntrainer::Tformat::NCHW, +// nntrainer::Tdatatype::FP16); - input.setScaleFactorsFP16({static_cast<_FP16>(1.5), static_cast<_FP16>(1.0), - static_cast<_FP16>(0.5)}); - input.setZeroPoints({0, 0, 0}); - - nntrainer::Tensor output( - {1, 3, 4, 5, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}}, true); - - EXPECT_NO_THROW({ input.dequantize(output, 1); }); - - _FP16 answer_data[] = { - static_cast<_FP16>(1.5), static_cast<_FP16>(1.5), static_cast<_FP16>(1.5), - static_cast<_FP16>(1.5), static_cast<_FP16>(1.5), static_cast<_FP16>(3), - static_cast<_FP16>(3), static_cast<_FP16>(3), static_cast<_FP16>(3), - static_cast<_FP16>(3), static_cast<_FP16>(4.5), static_cast<_FP16>(4.5), - static_cast<_FP16>(4.5), static_cast<_FP16>(4.5), static_cast<_FP16>(4.5), - static_cast<_FP16>(6), static_cast<_FP16>(6), static_cast<_FP16>(6), - static_cast<_FP16>(6), static_cast<_FP16>(6), static_cast<_FP16>(6), - static_cast<_FP16>(6), static_cast<_FP16>(6), static_cast<_FP16>(6), - static_cast<_FP16>(6), static_cast<_FP16>(7), static_cast<_FP16>(7), - static_cast<_FP16>(7), static_cast<_FP16>(7), static_cast<_FP16>(7), - static_cast<_FP16>(8), static_cast<_FP16>(8), static_cast<_FP16>(8), - static_cast<_FP16>(8), static_cast<_FP16>(8), static_cast<_FP16>(9), - static_cast<_FP16>(9), static_cast<_FP16>(9), static_cast<_FP16>(9), - static_cast<_FP16>(9), static_cast<_FP16>(5.5), static_cast<_FP16>(5.5), - static_cast<_FP16>(5.5), static_cast<_FP16>(5.5), static_cast<_FP16>(5.5), - static_cast<_FP16>(6), static_cast<_FP16>(6), static_cast<_FP16>(6), - static_cast<_FP16>(6), static_cast<_FP16>(6), static_cast<_FP16>(6.5), - static_cast<_FP16>(6.5), static_cast<_FP16>(6.5), static_cast<_FP16>(6.5), - static_cast<_FP16>(6.5), static_cast<_FP16>(7), static_cast<_FP16>(7), - static_cast<_FP16>(7), static_cast<_FP16>(7), static_cast<_FP16>(7)}; - - nntrainer::Tensor answer(ml::train::TensorDim(batch, channel, height, width, - {nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16}), - answer_data); - - EXPECT_EQ(output, answer); -} +// EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); +// } -/** - * @brief dequantize qint8 tensor to fp16 - */ -TEST(nntrainer_Tensor, dequantize_05_p) { - size_t batch = 1; - size_t channel = 3; - size_t height = 4; - size_t width = 5; - - nntrainer::Tensor input( - {batch, - channel, - height, - width, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}}, - true, nntrainer::Tensor::Initializer::ZEROS); - nntrainer::Tensor output(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); +// /** +// * @brief dequantize qint8 tensor to fp16 +// */ +// TEST(nntrainer_Tensor, dequantize_04_p) { +// int batch = 1; +// int channel = 3; +// int height = 4; +// int width = 5; + +// nntrainer::Tensor input( +// batch, channel, height, width, +// {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}); +// GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); + +// 
input.setScaleFactorsFP16({static_cast<_FP16>(1.5), +// static_cast<_FP16>(1.0), +// static_cast<_FP16>(0.5)}); +// input.setZeroPoints({0, 0, 0}); + +// nntrainer::Tensor output( +// {1, 3, 4, 5, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}}, +// true); + +// EXPECT_NO_THROW({ input.dequantize(output, 1); }); + +// _FP16 answer_data[] = { +// static_cast<_FP16>(1.5), static_cast<_FP16>(1.5), +// static_cast<_FP16>(1.5), static_cast<_FP16>(1.5), +// static_cast<_FP16>(1.5), static_cast<_FP16>(3), static_cast<_FP16>(3), +// static_cast<_FP16>(3), static_cast<_FP16>(3), static_cast<_FP16>(3), +// static_cast<_FP16>(4.5), static_cast<_FP16>(4.5), +// static_cast<_FP16>(4.5), static_cast<_FP16>(4.5), +// static_cast<_FP16>(4.5), static_cast<_FP16>(6), static_cast<_FP16>(6), +// static_cast<_FP16>(6), static_cast<_FP16>(6), static_cast<_FP16>(6), +// static_cast<_FP16>(6), static_cast<_FP16>(6), static_cast<_FP16>(6), +// static_cast<_FP16>(6), static_cast<_FP16>(6), static_cast<_FP16>(7), +// static_cast<_FP16>(7), static_cast<_FP16>(7), static_cast<_FP16>(7), +// static_cast<_FP16>(7), static_cast<_FP16>(8), static_cast<_FP16>(8), +// static_cast<_FP16>(8), static_cast<_FP16>(8), static_cast<_FP16>(8), +// static_cast<_FP16>(9), static_cast<_FP16>(9), static_cast<_FP16>(9), +// static_cast<_FP16>(9), static_cast<_FP16>(9), static_cast<_FP16>(5.5), +// static_cast<_FP16>(5.5), static_cast<_FP16>(5.5), +// static_cast<_FP16>(5.5), static_cast<_FP16>(5.5), static_cast<_FP16>(6), +// static_cast<_FP16>(6), static_cast<_FP16>(6), static_cast<_FP16>(6), +// static_cast<_FP16>(6), static_cast<_FP16>(6.5), +// static_cast<_FP16>(6.5), static_cast<_FP16>(6.5), +// static_cast<_FP16>(6.5), static_cast<_FP16>(6.5), static_cast<_FP16>(7), +// static_cast<_FP16>(7), static_cast<_FP16>(7), static_cast<_FP16>(7), +// static_cast<_FP16>(7)}; + +// nntrainer::Tensor answer(ml::train::TensorDim(batch, channel, height, +// width, +// {nntrainer::Tformat::NCHW, +// nntrainer::Tdatatype::FP16}), +// answer_data); + +// EXPECT_EQ(output, answer); +// } - // Dequantize by channel - EXPECT_NO_THROW(input.setScaleFactorsFP16( - {static_cast<_FP16>(2), static_cast<_FP16>(-2), static_cast<_FP16>(-4)})); - EXPECT_NO_THROW(input.setZeroPoints({1, 1, 1})); - EXPECT_NO_THROW({ input.dequantize(output, 1); }); - - _FP16 answer_data_1[] = {-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, - -2, -2, -2, -2, -2, -2, -2, -2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; - - nntrainer::Tensor answer1(ml::train::TensorDim(batch, channel, height, width, - {nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16}), - answer_data_1); - - EXPECT_EQ(output, answer1); - - // Dequantize by height - - EXPECT_NO_THROW(input.setScaleFactorsFP16( - {static_cast<_FP16>(4.2), static_cast<_FP16>(2), static_cast<_FP16>(-2), - static_cast<_FP16>(-4.8)})); - EXPECT_NO_THROW(input.setZeroPoints({1, 1, 1, 1})); - EXPECT_NO_THROW({ input.dequantize(output, 2); }); - - _FP16 answer_data_2[] = {static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-2), - static_cast<_FP16>(-2), static_cast<_FP16>(-2), - static_cast<_FP16>(-2), static_cast<_FP16>(-2), - static_cast<_FP16>(2), static_cast<_FP16>(2), - static_cast<_FP16>(2), static_cast<_FP16>(2), - static_cast<_FP16>(2), static_cast<_FP16>(4.8), - static_cast<_FP16>(4.8), static_cast<_FP16>(4.8), - 
static_cast<_FP16>(4.8), static_cast<_FP16>(4.8), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-2), - static_cast<_FP16>(-2), static_cast<_FP16>(-2), - static_cast<_FP16>(-2), static_cast<_FP16>(-2), - static_cast<_FP16>(2), static_cast<_FP16>(2), - static_cast<_FP16>(2), static_cast<_FP16>(2), - static_cast<_FP16>(2), static_cast<_FP16>(4.8), - static_cast<_FP16>(4.8), static_cast<_FP16>(4.8), - static_cast<_FP16>(4.8), static_cast<_FP16>(4.8), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-2), - static_cast<_FP16>(-2), static_cast<_FP16>(-2), - static_cast<_FP16>(-2), static_cast<_FP16>(-2), - static_cast<_FP16>(2), static_cast<_FP16>(2), - static_cast<_FP16>(2), static_cast<_FP16>(2), - static_cast<_FP16>(2), static_cast<_FP16>(4.8), - static_cast<_FP16>(4.8), static_cast<_FP16>(4.8), - static_cast<_FP16>(4.8), static_cast<_FP16>(4.8)}; - nntrainer::Tensor answer2(ml::train::TensorDim(batch, channel, height, width, - {nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16}), - answer_data_2); - - EXPECT_EQ(output, answer2); - - // Dequantize by width - EXPECT_NO_THROW(input.setScaleFactorsFP16( - {static_cast<_FP16>(4.2), static_cast<_FP16>(2), static_cast<_FP16>(-2), - static_cast<_FP16>(-4), static_cast<_FP16>(8)})); - EXPECT_NO_THROW(input.setZeroPoints({1, 1, 1, 1, 1})); - EXPECT_NO_THROW({ input.dequantize(output, 3); }); - - _FP16 answer_data_3[] = {static_cast<_FP16>(-4.2), static_cast<_FP16>(-2), - static_cast<_FP16>(2), static_cast<_FP16>(4), - static_cast<_FP16>(-8), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-2), static_cast<_FP16>(2), - static_cast<_FP16>(4), static_cast<_FP16>(-8), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-2), - static_cast<_FP16>(2), static_cast<_FP16>(4), - static_cast<_FP16>(-8), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-2), static_cast<_FP16>(2), - static_cast<_FP16>(4), static_cast<_FP16>(-8), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-2), - static_cast<_FP16>(2), static_cast<_FP16>(4), - static_cast<_FP16>(-8), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-2), static_cast<_FP16>(2), - static_cast<_FP16>(4), static_cast<_FP16>(-8), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-2), - static_cast<_FP16>(2), static_cast<_FP16>(4), - static_cast<_FP16>(-8), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-2), static_cast<_FP16>(2), - static_cast<_FP16>(4), static_cast<_FP16>(-8), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-2), - static_cast<_FP16>(2), static_cast<_FP16>(4), - static_cast<_FP16>(-8), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-2), static_cast<_FP16>(2), - static_cast<_FP16>(4), static_cast<_FP16>(-8), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-2), - static_cast<_FP16>(2), static_cast<_FP16>(4), - static_cast<_FP16>(-8), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-2), static_cast<_FP16>(2), - static_cast<_FP16>(4), static_cast<_FP16>(-8)}; - - nntrainer::Tensor answer3(ml::train::TensorDim(batch, channel, height, width, - {nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16}), - answer_data_3); - - EXPECT_EQ(output, answer3); -} +// /** +// * @brief dequantize qint8 tensor to fp16 +// */ +// TEST(nntrainer_Tensor, dequantize_05_p) { +// size_t batch = 1; +// size_t channel = 3; +// size_t height = 4; +// size_t width = 5; + +// nntrainer::Tensor input( +// {batch, +// channel, +// 
height, +// width, +// {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}}, +// true, nntrainer::Initializer::ZEROS); +// nntrainer::Tensor output(batch, channel, height, width, +// nntrainer::Tformat::NCHW, +// nntrainer::Tdatatype::FP16); -/** - * @brief dequantize qint4 tensor - */ -TEST(nntrainer_Tensor, dequantize_06_p) { - size_t batch = 1; - size_t channel = 3; - size_t height = 4; - size_t width = 5; - - nntrainer::Tensor input( - {batch, - channel, - height, - width, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4}}, - true, nntrainer::Tensor::Initializer::ZEROS); - nntrainer::Tensor output(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); +// // Dequantize by channel +// EXPECT_NO_THROW(input.setScaleFactorsFP16( +// {static_cast<_FP16>(2), static_cast<_FP16>(-2), +// static_cast<_FP16>(-4)})); +// EXPECT_NO_THROW(input.setZeroPoints({1, 1, 1})); +// EXPECT_NO_THROW({ input.dequantize(output, 1); }); + +// _FP16 answer_data_1[] = {-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, +// -2, -2, -2, -2, -2, -2, -2, -2, 2, 2, 2, 2, +// 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +// 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, +// 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; + +// nntrainer::Tensor answer1(ml::train::TensorDim(batch, channel, height, +// width, +// {nntrainer::Tformat::NCHW, +// nntrainer::Tdatatype::FP16}), +// answer_data_1); + +// EXPECT_EQ(output, answer1); + +// // Dequantize by height + +// EXPECT_NO_THROW(input.setScaleFactorsFP16( +// {static_cast<_FP16>(4.2), static_cast<_FP16>(2), +// static_cast<_FP16>(-2), +// static_cast<_FP16>(-4.8)})); +// EXPECT_NO_THROW(input.setZeroPoints({1, 1, 1, 1})); +// EXPECT_NO_THROW({ input.dequantize(output, 2); }); + +// _FP16 answer_data_2[] = {static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(-2), +// static_cast<_FP16>(-2), static_cast<_FP16>(-2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(2), static_cast<_FP16>(2), +// static_cast<_FP16>(2), static_cast<_FP16>(2), +// static_cast<_FP16>(4.8), +// static_cast<_FP16>(4.8), +// static_cast<_FP16>(4.8), +// static_cast<_FP16>(4.8), +// static_cast<_FP16>(4.8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(-2), +// static_cast<_FP16>(-2), static_cast<_FP16>(-2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(2), static_cast<_FP16>(2), +// static_cast<_FP16>(2), static_cast<_FP16>(2), +// static_cast<_FP16>(4.8), +// static_cast<_FP16>(4.8), +// static_cast<_FP16>(4.8), +// static_cast<_FP16>(4.8), +// static_cast<_FP16>(4.8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(-2), +// static_cast<_FP16>(-2), static_cast<_FP16>(-2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(2), static_cast<_FP16>(2), +// static_cast<_FP16>(2), static_cast<_FP16>(2), +// static_cast<_FP16>(4.8), +// static_cast<_FP16>(4.8), +// static_cast<_FP16>(4.8), +// static_cast<_FP16>(4.8), +// static_cast<_FP16>(4.8)}; +// nntrainer::Tensor answer2(ml::train::TensorDim(batch, channel, height, +// width, +// {nntrainer::Tformat::NCHW, +// 
nntrainer::Tdatatype::FP16}), +// answer_data_2); + +// EXPECT_EQ(output, answer2); + +// // Dequantize by width +// EXPECT_NO_THROW(input.setScaleFactorsFP16( +// {static_cast<_FP16>(4.2), static_cast<_FP16>(2), +// static_cast<_FP16>(-2), +// static_cast<_FP16>(-4), static_cast<_FP16>(8)})); +// EXPECT_NO_THROW(input.setZeroPoints({1, 1, 1, 1, 1})); +// EXPECT_NO_THROW({ input.dequantize(output, 3); }); + +// _FP16 answer_data_3[] = {static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), +// static_cast<_FP16>(2), static_cast<_FP16>(4), +// static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8)}; + +// nntrainer::Tensor answer3(ml::train::TensorDim(batch, channel, height, +// width, +// {nntrainer::Tformat::NCHW, +// nntrainer::Tdatatype::FP16}), +// answer_data_3); + +// EXPECT_EQ(output, answer3); +// } - // Dequantize by channel - EXPECT_NO_THROW(input.setScaleFactorsFP16( - {static_cast<_FP16>(2), static_cast<_FP16>(-2), static_cast<_FP16>(-4)})); - EXPECT_NO_THROW(input.setZeroPoints({1, 1, 1})); - EXPECT_NO_THROW({ input.dequantize(output, 1); }); - - _FP16 answer_data_1[] = {-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, - -2, -2, -2, -2, -2, -2, -2, -2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; - - nntrainer::Tensor answer1(ml::train::TensorDim(batch, channel, height, width, - {nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16}), - answer_data_1); - - EXPECT_EQ(output, answer1); - - // Dequantize by height - EXPECT_NO_THROW(input.setScaleFactorsFP16( - {static_cast<_FP16>(4.2), static_cast<_FP16>(2), static_cast<_FP16>(-2), - static_cast<_FP16>(-4)})); - EXPECT_NO_THROW(input.setZeroPoints({1, 1, 1, 1})); - EXPECT_NO_THROW({ input.dequantize(output, 2); }); - - _FP16 answer_data_2[] = {static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-2), - static_cast<_FP16>(-2), static_cast<_FP16>(-2), - static_cast<_FP16>(-2), static_cast<_FP16>(-2), - static_cast<_FP16>(2), static_cast<_FP16>(2), - 
static_cast<_FP16>(2), static_cast<_FP16>(2), - static_cast<_FP16>(2), static_cast<_FP16>(4), - static_cast<_FP16>(4), static_cast<_FP16>(4), - static_cast<_FP16>(4), static_cast<_FP16>(4), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-2), - static_cast<_FP16>(-2), static_cast<_FP16>(-2), - static_cast<_FP16>(-2), static_cast<_FP16>(-2), - static_cast<_FP16>(2), static_cast<_FP16>(2), - static_cast<_FP16>(2), static_cast<_FP16>(2), - static_cast<_FP16>(2), static_cast<_FP16>(4), - static_cast<_FP16>(4), static_cast<_FP16>(4), - static_cast<_FP16>(4), static_cast<_FP16>(4), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-2), - static_cast<_FP16>(-2), static_cast<_FP16>(-2), - static_cast<_FP16>(-2), static_cast<_FP16>(-2), - static_cast<_FP16>(2), static_cast<_FP16>(2), - static_cast<_FP16>(2), static_cast<_FP16>(2), - static_cast<_FP16>(2), static_cast<_FP16>(4), - static_cast<_FP16>(4), static_cast<_FP16>(4), - static_cast<_FP16>(4), static_cast<_FP16>(4)}; - nntrainer::Tensor answer2(ml::train::TensorDim(batch, channel, height, width, - {nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16}), - answer_data_2); - - EXPECT_EQ(output, answer2); - - // Dequantize by width - EXPECT_NO_THROW(input.setScaleFactorsFP16( - {static_cast<_FP16>(4.2), static_cast<_FP16>(2), static_cast<_FP16>(-2), - static_cast<_FP16>(-4), static_cast<_FP16>(8)})); - EXPECT_NO_THROW(input.setZeroPoints({1, 1, 1, 1, 1})); - EXPECT_NO_THROW({ input.dequantize(output, 3); }); - - _FP16 answer_data_3[] = {static_cast<_FP16>(-4.2), static_cast<_FP16>(-2), - static_cast<_FP16>(2), static_cast<_FP16>(4), - static_cast<_FP16>(-8), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-2), static_cast<_FP16>(2), - static_cast<_FP16>(4), static_cast<_FP16>(-8), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-2), - static_cast<_FP16>(2), static_cast<_FP16>(4), - static_cast<_FP16>(-8), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-2), static_cast<_FP16>(2), - static_cast<_FP16>(4), static_cast<_FP16>(-8), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-2), - static_cast<_FP16>(2), static_cast<_FP16>(4), - static_cast<_FP16>(-8), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-2), static_cast<_FP16>(2), - static_cast<_FP16>(4), static_cast<_FP16>(-8), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-2), - static_cast<_FP16>(2), static_cast<_FP16>(4), - static_cast<_FP16>(-8), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-2), static_cast<_FP16>(2), - static_cast<_FP16>(4), static_cast<_FP16>(-8), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-2), - static_cast<_FP16>(2), static_cast<_FP16>(4), - static_cast<_FP16>(-8), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-2), static_cast<_FP16>(2), - static_cast<_FP16>(4), static_cast<_FP16>(-8), - static_cast<_FP16>(-4.2), static_cast<_FP16>(-2), - static_cast<_FP16>(2), static_cast<_FP16>(4), - static_cast<_FP16>(-8), static_cast<_FP16>(-4.2), - static_cast<_FP16>(-2), static_cast<_FP16>(2), - static_cast<_FP16>(4), static_cast<_FP16>(-8)}; - - nntrainer::Tensor answer3(ml::train::TensorDim(batch, channel, height, width, - {nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16}), - answer_data_3); - - EXPECT_EQ(output, answer3); -} +// /** +// * @brief dequantize qint4 tensor +// */ +// TEST(nntrainer_Tensor, dequantize_06_p) { +// size_t batch = 1; +// size_t channel 
= 3; +// size_t height = 4; +// size_t width = 5; + +// nntrainer::Tensor input( +// {batch, +// channel, +// height, +// width, +// {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT4}}, +// true, nntrainer::Initializer::ZEROS); +// nntrainer::Tensor output(batch, channel, height, width, +// nntrainer::Tformat::NCHW, +// nntrainer::Tdatatype::FP16); +// // Dequantize by channel +// EXPECT_NO_THROW(input.setScaleFactorsFP16( +// {static_cast<_FP16>(2), static_cast<_FP16>(-2), +// static_cast<_FP16>(-4)})); +// EXPECT_NO_THROW(input.setZeroPoints({1, 1, 1})); +// EXPECT_NO_THROW({ input.dequantize(output, 1); }); + +// _FP16 answer_data_1[] = {-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, +// -2, +// -2, -2, -2, -2, -2, -2, -2, -2, 2, 2, 2, 2, +// 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +// 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, +// 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; + +// nntrainer::Tensor answer1(ml::train::TensorDim(batch, channel, height, +// width, +// {nntrainer::Tformat::NCHW, +// nntrainer::Tdatatype::FP16}), +// answer_data_1); + +// EXPECT_EQ(output, answer1); + +// // Dequantize by height +// EXPECT_NO_THROW(input.setScaleFactorsFP16( +// {static_cast<_FP16>(4.2), static_cast<_FP16>(2), +// static_cast<_FP16>(-2), +// static_cast<_FP16>(-4)})); +// EXPECT_NO_THROW(input.setZeroPoints({1, 1, 1, 1})); +// EXPECT_NO_THROW({ input.dequantize(output, 2); }); + +// _FP16 answer_data_2[] = {static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), +// static_cast<_FP16>(-2), +// static_cast<_FP16>(-2), +// static_cast<_FP16>(-2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(2), static_cast<_FP16>(2), +// static_cast<_FP16>(2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(4), +// static_cast<_FP16>(4), static_cast<_FP16>(4), +// static_cast<_FP16>(4), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), +// static_cast<_FP16>(-2), +// static_cast<_FP16>(-2), +// static_cast<_FP16>(-2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(2), static_cast<_FP16>(2), +// static_cast<_FP16>(2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(4), +// static_cast<_FP16>(4), static_cast<_FP16>(4), +// static_cast<_FP16>(4), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), +// static_cast<_FP16>(-2), +// static_cast<_FP16>(-2), +// static_cast<_FP16>(-2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(2), static_cast<_FP16>(2), +// static_cast<_FP16>(2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(4), +// static_cast<_FP16>(4), static_cast<_FP16>(4), +// static_cast<_FP16>(4)}; +// nntrainer::Tensor answer2(ml::train::TensorDim(batch, channel, height, +// width, +// {nntrainer::Tformat::NCHW, +// nntrainer::Tdatatype::FP16}), +// answer_data_2); + +// EXPECT_EQ(output, answer2); + +// // Dequantize by width +// EXPECT_NO_THROW(input.setScaleFactorsFP16( +// {static_cast<_FP16>(4.2), static_cast<_FP16>(2), +// static_cast<_FP16>(-2), +// static_cast<_FP16>(-4), static_cast<_FP16>(8)})); +// EXPECT_NO_THROW(input.setZeroPoints({1, 1, 1, 1, 1})); +// EXPECT_NO_THROW({ input.dequantize(output, 
3); }); + +// _FP16 answer_data_3[] = {static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), +// static_cast<_FP16>(2), static_cast<_FP16>(4), +// static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), static_cast<_FP16>(-8), +// static_cast<_FP16>(-4.2), +// static_cast<_FP16>(-2), static_cast<_FP16>(2), +// static_cast<_FP16>(4), +// static_cast<_FP16>(-8)}; + +// nntrainer::Tensor answer3(ml::train::TensorDim(batch, channel, height, +// width, +// {nntrainer::Tformat::NCHW, +// nntrainer::Tdatatype::FP16}), +// answer_data_3); + +// EXPECT_EQ(output, answer3); +// } GTEST_API_ int main(int argc, char **argv) { int result = -1; diff --git a/test/unittest/unittest_nntrainer_tensor_nhwc.cpp b/test/unittest/unittest_nntrainer_tensor_nhwc.cpp index f65e1b4eda..167acec29c 100644 --- a/test/unittest/unittest_nntrainer_tensor_nhwc.cpp +++ b/test/unittest/unittest_nntrainer_tensor_nhwc.cpp @@ -3592,7 +3592,7 @@ TEST(nntrainer_Tensor, allocate_03_nhwc_p) { TEST(nntrainer_Tensor, initialize_01_nhwc_p) { nntrainer::Tensor t(nntrainer::TensorDim(1, 2, 3, 4, NHWC_, FP32_), true, - nntrainer::Tensor::Initializer::ONES); + nntrainer::Initializer::ONES); nntrainer::Tensor golden(1, 2, 3, 4, NHWC_, FP32_); golden.setValue(1); @@ -3608,13 +3608,13 @@ TEST(nntrainer_Tensor, initialize_02_nhwc_p) { EXPECT_NE(golden, t); - t.initialize(nntrainer::Tensor::Initializer::ONES); + t.initialize(nntrainer::Initializer::ONES); EXPECT_EQ(golden, t); } TEST(nntrainer_Tensor, initialize_03_nhwc_p) { nntrainer::Tensor t(nntrainer::TensorDim(1, 2, 3, 4, NHWC_, FP32_), false, - nntrainer::Tensor::Initializer::ONES); + nntrainer::Initializer::ONES); t.allocate(); nntrainer::Tensor golden(1, 2, 3, 4, NHWC_, FP32_); @@ -3625,7 +3625,7 @@ TEST(nntrainer_Tensor, initialize_03_nhwc_p) { TEST(nntrainer_Tensor, initialize_04_nhwc_p) { nntrainer::Tensor t(nntrainer::TensorDim(1, 2, 3, 4, NHWC_, FP32_), false); - t.initialize(nntrainer::Tensor::Initializer::ONES); + t.initialize(nntrainer::Initializer::ONES); t.allocate(); nntrainer::Tensor golden(1, 2, 3, 4, NHWC_, FP32_); @@ -3646,25 +3646,25 @@ TEST(nntrainer_Tensor, initialize_05_nhwc_p) { * EXPECT_NE(golden, t); */ - t.initialize(nntrainer::Tensor::Initializer::ONES); + t.initialize(nntrainer::Initializer::ONES); EXPECT_EQ(golden, t); } 
TEST(nntrainer_Tensor, initialize_06_nhwc_n) { nntrainer::Tensor t(nntrainer::TensorDim(1, 2, 3, 4, NHWC_, FP32_), true, - nntrainer::Tensor::Initializer::ONES); + nntrainer::Initializer::ONES); nntrainer::Tensor golden(nntrainer::TensorDim(1, 2, 3, 4, NHWC_, FP32_), true, - nntrainer::Tensor::Initializer::ZEROS); + nntrainer::Initializer::ZEROS); EXPECT_NE(golden, t); - golden.initialize(nntrainer::Tensor::Initializer::ONES); + golden.initialize(nntrainer::Initializer::ONES); EXPECT_EQ(golden, t); } TEST(nntrainer_Tensor, initialize_07_nhwc_p) { nntrainer::Tensor t(nntrainer::TensorDim(1, 2, 3, 4, NHWC_, FP32_), true, - nntrainer::Tensor::Initializer::ONES); + nntrainer::Initializer::ONES); nntrainer::Tensor golden(1, 2, 3, 4, NHWC_, FP32_); golden.setValue(1); @@ -3681,20 +3681,20 @@ TEST(nntrainer_Tensor, initialize_07_nhwc_p) { TEST(nntrainer_Tensor, initialize_08_nhwc_p) { nntrainer::Tensor t(nntrainer::TensorDim(1, 2, 3, 4, NHWC_, FP32_), true, - nntrainer::Tensor::Initializer::ONES); + nntrainer::Initializer::ONES); nntrainer::Tensor golden(1, 2, 3, 4, NHWC_, FP32_); golden.setValue(1); EXPECT_EQ(golden, t); - t.initialize(nntrainer::Tensor::Initializer::HE_NORMAL); + t.initialize(nntrainer::Initializer::HE_NORMAL); EXPECT_NE(golden, t); t.initialize(); EXPECT_NE(golden, t); - t.initialize(nntrainer::Tensor::Initializer::ONES); + t.initialize(nntrainer::Initializer::ONES); EXPECT_EQ(golden, t); t.initialize(); @@ -3973,21 +3973,21 @@ TEST(nntrainer_Tensor, TensorWrap_02_nhwc_n) { EXPECT_THROW(nntrainer::Tensor::Map(dat, 3, {4}), std::invalid_argument); } -TEST(nntrainer_Tensor, TensorPaddedValue_nhwc_p) { - nntrainer::Tensor a = ranged(1, 1, 3, 3, NHWC_, FP32_); - float default_padded = -1; +// TEST(nntrainer_Tensor, TensorPaddedValue_nhwc_p) { +// nntrainer::Tensor a = ranged(1, 1, 3, 3, NHWC_, FP32_); +// float default_padded = -1; - for (int i = 0; i < 5; ++i) { - for (int j = 0; j < 5; ++j) { - float expected = default_padded; - if (1 <= i && i <= 3 && 1 <= j && j <= 3) { - expected = (i - 1) * 3 + (j - 1); - } - float actual = a.getValuePaddedVirtual(0, 0, i, j, 1, 1, default_padded); - EXPECT_FLOAT_EQ(actual, expected); - } - } -} +// for (int i = 0; i < 5; ++i) { +// for (int j = 0; j < 5; ++j) { +// float expected = default_padded; +// if (1 <= i && i <= 3 && 1 <= j && j <= 3) { +// expected = (i - 1) * 3 + (j - 1); +// } +// float actual = a.getValuePaddedVirtual(0, 0, i, j, 1, 1, +// default_padded); EXPECT_FLOAT_EQ(actual, expected); +// } +// } +// } TEST(nntrainer_Tensor, zoneout_mask_01_nhwc_n) { const float zoneout_rate = 0.3f; @@ -4702,51 +4702,51 @@ TEST(nntrainer_Tensor, tranpose_dimension_not_match_nhwc_n) { EXPECT_THROW(a.transpose("0:1:2", b), std::invalid_argument); } -/** - * @brief dequantize tensor with different format - */ -TEST(nntrainer_Tensor, dequantize_01_n) { - int batch = 1; - int channel = 3; - int height = 4; - int width = 5; - - nntrainer::Tensor input( - batch, channel, height, width, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - input.setScaleFactors({1.5, 1.0, 0.5}); - input.setZeroPoints({1, 0, 3}); - - nntrainer::Tensor output( - batch, channel, height, width, - {nntrainer::Tformat::NHWC, nntrainer::Tdatatype::FP32}); - - EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); -} - -/** - * @brief dequantize tensor with different format - */ -TEST(nntrainer_Tensor, dequantize_02_n) { - int batch = 1; - int channel = 3; - int height = 4; - int 
width = 5; - - nntrainer::Tensor input( - batch, channel, height, width, - {nntrainer::Tformat::NHWC, nntrainer::Tdatatype::QINT8}); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - input.setScaleFactors({1.5, 1.0, 0.5}); - input.setZeroPoints({1, 0, 3}); - - nntrainer::Tensor output( - batch, channel, height, width, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32}); - - EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); -} +// /** +// * @brief dequantize tensor with different format +// */ +// TEST(nntrainer_Tensor, dequantize_01_n) { +// int batch = 1; +// int channel = 3; +// int height = 4; +// int width = 5; + +// nntrainer::Tensor input( +// batch, channel, height, width, +// {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::QINT8}); +// GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); +// input.setScaleFactors({1.5, 1.0, 0.5}); +// input.setZeroPoints({1, 0, 3}); + +// nntrainer::Tensor output( +// batch, channel, height, width, +// {nntrainer::Tformat::NHWC, nntrainer::Tdatatype::FP32}); + +// EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); +// } + +// /** +// * @brief dequantize tensor with different format +// */ +// TEST(nntrainer_Tensor, dequantize_02_n) { +// int batch = 1; +// int channel = 3; +// int height = 4; +// int width = 5; + +// nntrainer::Tensor input( +// batch, channel, height, width, +// {nntrainer::Tformat::NHWC, nntrainer::Tdatatype::QINT8}); +// GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); +// input.setScaleFactors({1.5, 1.0, 0.5}); +// input.setZeroPoints({1, 0, 3}); + +// nntrainer::Tensor output( +// batch, channel, height, width, +// {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32}); + +// EXPECT_THROW({ input.dequantize(output, 1); }, std::invalid_argument); +// } int main(int argc, char **argv) { int result = -1; diff --git a/test/unittest/unittest_nntrainer_tensor_pool.cpp b/test/unittest/unittest_nntrainer_tensor_pool.cpp index fa57141c08..9035099a15 100644 --- a/test/unittest/unittest_nntrainer_tensor_pool.cpp +++ b/test/unittest/unittest_nntrainer_tensor_pool.cpp @@ -435,127 +435,137 @@ TEST(TensorPool, validate_memory) { EXPECT_NO_THROW(pool.deallocate()); } -/** - * @brief qint8 tensors reuse fp32 tensor memory space - */ -TEST(TensorPool, validate_memory_reuse_01_p) { - // |--------- t1 ---------| - // |-t2-||-t3-||-t4-||-t5-| - nntrainer::TensorPool pool; - nntrainer::Tensor *t1 = nullptr, *t2 = nullptr, *t3 = nullptr, *t4 = nullptr, - *t5 = nullptr; - - EXPECT_NO_THROW( - t1 = pool.request("t1", nntrainer::TensorDim({4}), {0}, - nntrainer::TensorLifespan::FORWARD_FUNC_LIFESPAN)); - EXPECT_NE(t1, nullptr); - EXPECT_FALSE(t1->isAllocated()); - - EXPECT_NO_THROW( - t2 = pool.request("t2", - nntrainer::TensorDim({4}, {nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::QINT8}), - {1}, nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN)); - EXPECT_NE(t2, nullptr); - EXPECT_FALSE(t2->isAllocated()); - - EXPECT_NO_THROW( - t3 = pool.request("t3", - nntrainer::TensorDim({4}, {nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::QINT8}), - {1}, nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN)); - EXPECT_NE(t3, nullptr); - EXPECT_FALSE(t3->isAllocated()); - - EXPECT_NO_THROW( - t4 = pool.request("t4", - nntrainer::TensorDim({4}, {nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::QINT8}), - {1}, nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN)); - EXPECT_NE(t4, nullptr); - EXPECT_FALSE(t4->isAllocated()); - - EXPECT_NO_THROW( - t5 = pool.request("t5", - 
nntrainer::TensorDim({4}, {nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::QINT8}), - {1}, nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN)); - EXPECT_NE(t5, nullptr); - EXPECT_FALSE(t5->isAllocated()); - - EXPECT_NO_THROW(pool.finalize(nntrainer::OptimizedV1Planner(), 0, 2)); - EXPECT_EQ(pool.minMemoryRequirement(), t1->bytes()); - - EXPECT_NO_THROW(pool.allocate()); - - EXPECT_EQ(t1->getAddress(0), (float *)t2->getAddress(0)); - EXPECT_EQ(t1->getAddress(1), (float *)t3->getAddress(0)); - EXPECT_EQ(t1->getAddress(2), (float *)t4->getAddress(0)); - EXPECT_EQ(t1->getAddress(3), (float *)t5->getAddress(0)); - - EXPECT_NO_THROW(pool.deallocate()); -} - -/** - * @brief qint4 tensors reuse fp32 tensor memory space - */ -TEST(TensorPool, validate_memory_reuse_02_p) { - // |--------- t1 ---------| - // |-t2-||-t3-||-t4-||-t5-| - nntrainer::TensorPool pool; - nntrainer::Tensor *t1 = nullptr, *t2 = nullptr, *t3 = nullptr, *t4 = nullptr, - *t5 = nullptr; - - EXPECT_NO_THROW( - t1 = pool.request("t1", nntrainer::TensorDim({4}), {0}, - nntrainer::TensorLifespan::FORWARD_FUNC_LIFESPAN)); - EXPECT_NE(t1, nullptr); - EXPECT_FALSE(t1->isAllocated()); - - EXPECT_NO_THROW( - t2 = pool.request("t2", - nntrainer::TensorDim({8}, {nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::QINT4}), - {1}, nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN)); - EXPECT_NE(t2, nullptr); - EXPECT_FALSE(t2->isAllocated()); - - EXPECT_NO_THROW( - t3 = pool.request("t3", - nntrainer::TensorDim({7}, {nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::QINT4}), - {1}, nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN)); - EXPECT_NE(t3, nullptr); - EXPECT_FALSE(t3->isAllocated()); - - EXPECT_NO_THROW( - t4 = pool.request("t4", - nntrainer::TensorDim({8}, {nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::QINT4}), - {1}, nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN)); - EXPECT_NE(t4, nullptr); - EXPECT_FALSE(t4->isAllocated()); - - EXPECT_NO_THROW( - t5 = pool.request("t5", - nntrainer::TensorDim({7}, {nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::QINT4}), - {1}, nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN)); - EXPECT_NE(t5, nullptr); - EXPECT_FALSE(t5->isAllocated()); - - EXPECT_NO_THROW(pool.finalize(nntrainer::OptimizedV1Planner(), 0, 2)); - EXPECT_EQ(pool.minMemoryRequirement(), t1->bytes()); - - EXPECT_NO_THROW(pool.allocate()); - - EXPECT_EQ(t1->getAddress(0), (float *)t2->getAddress(0)); - EXPECT_EQ(t1->getAddress(1), (float *)t3->getAddress(0)); - EXPECT_EQ(t1->getAddress(2), (float *)t4->getAddress(0)); - EXPECT_EQ(t1->getAddress(3), (float *)t5->getAddress(0)); - - EXPECT_NO_THROW(pool.deallocate()); -} +// /** +// * @brief qint8 tensors reuse fp32 tensor memory space +// */ +// TEST(TensorPool, validate_memory_reuse_01_p) { +// // |--------- t1 ---------| +// // |-t2-||-t3-||-t4-||-t5-| +// nntrainer::TensorPool pool; +// nntrainer::Tensor *t1 = nullptr, *t2 = nullptr, *t3 = nullptr, *t4 = +// nullptr, +// *t5 = nullptr; + +// EXPECT_NO_THROW( +// t1 = pool.request("t1", nntrainer::TensorDim({4}), {0}, +// nntrainer::TensorLifespan::FORWARD_FUNC_LIFESPAN)); +// EXPECT_NE(t1, nullptr); +// EXPECT_FALSE(t1->isAllocated()); + +// EXPECT_NO_THROW( +// t2 = pool.request("t2", +// nntrainer::TensorDim({4}, {nntrainer::Tformat::NCHW, +// nntrainer::Tdatatype::QINT8}), +// {1}, +// nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN)); +// EXPECT_NE(t2, nullptr); +// EXPECT_FALSE(t2->isAllocated()); + +// EXPECT_NO_THROW( +// t3 = pool.request("t3", +// nntrainer::TensorDim({4}, {nntrainer::Tformat::NCHW, +// 
nntrainer::Tdatatype::QINT8}), +// {1}, +// nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN)); +// EXPECT_NE(t3, nullptr); +// EXPECT_FALSE(t3->isAllocated()); + +// EXPECT_NO_THROW( +// t4 = pool.request("t4", +// nntrainer::TensorDim({4}, {nntrainer::Tformat::NCHW, +// nntrainer::Tdatatype::QINT8}), +// {1}, +// nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN)); +// EXPECT_NE(t4, nullptr); +// EXPECT_FALSE(t4->isAllocated()); + +// EXPECT_NO_THROW( +// t5 = pool.request("t5", +// nntrainer::TensorDim({4}, {nntrainer::Tformat::NCHW, +// nntrainer::Tdatatype::QINT8}), +// {1}, +// nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN)); +// EXPECT_NE(t5, nullptr); +// EXPECT_FALSE(t5->isAllocated()); + +// EXPECT_NO_THROW(pool.finalize(nntrainer::OptimizedV1Planner(), 0, 2)); +// EXPECT_EQ(pool.minMemoryRequirement(), t1->bytes()); + +// EXPECT_NO_THROW(pool.allocate()); + +// EXPECT_EQ(t1->getAddress(0), (float *)t2->getAddress(0)); +// EXPECT_EQ(t1->getAddress(1), (float *)t3->getAddress(0)); +// EXPECT_EQ(t1->getAddress(2), (float *)t4->getAddress(0)); +// EXPECT_EQ(t1->getAddress(3), (float *)t5->getAddress(0)); + +// EXPECT_NO_THROW(pool.deallocate()); +// } + +// /** +// * @brief qint4 tensors reuse fp32 tensor memory space +// */ +// TEST(TensorPool, validate_memory_reuse_02_p) { +// // |--------- t1 ---------| +// // |-t2-||-t3-||-t4-||-t5-| +// nntrainer::TensorPool pool; +// nntrainer::Tensor *t1 = nullptr, *t2 = nullptr, *t3 = nullptr, *t4 = +// nullptr, +// *t5 = nullptr; + +// EXPECT_NO_THROW( +// t1 = pool.request("t1", nntrainer::TensorDim({4}), {0}, +// nntrainer::TensorLifespan::FORWARD_FUNC_LIFESPAN)); +// EXPECT_NE(t1, nullptr); +// EXPECT_FALSE(t1->isAllocated()); + +// EXPECT_NO_THROW( +// t2 = pool.request("t2", +// nntrainer::TensorDim({8}, {nntrainer::Tformat::NCHW, +// nntrainer::Tdatatype::QINT4}), +// {1}, +// nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN)); +// EXPECT_NE(t2, nullptr); +// EXPECT_FALSE(t2->isAllocated()); + +// EXPECT_NO_THROW( +// t3 = pool.request("t3", +// nntrainer::TensorDim({7}, {nntrainer::Tformat::NCHW, +// nntrainer::Tdatatype::QINT4}), +// {1}, +// nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN)); +// EXPECT_NE(t3, nullptr); +// EXPECT_FALSE(t3->isAllocated()); + +// EXPECT_NO_THROW( +// t4 = pool.request("t4", +// nntrainer::TensorDim({8}, {nntrainer::Tformat::NCHW, +// nntrainer::Tdatatype::QINT4}), +// {1}, +// nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN)); +// EXPECT_NE(t4, nullptr); +// EXPECT_FALSE(t4->isAllocated()); + +// EXPECT_NO_THROW( +// t5 = pool.request("t5", +// nntrainer::TensorDim({7}, {nntrainer::Tformat::NCHW, +// nntrainer::Tdatatype::QINT4}), +// {1}, +// nntrainer::TensorLifespan::BACKWARD_FUNC_LIFESPAN)); +// EXPECT_NE(t5, nullptr); +// EXPECT_FALSE(t5->isAllocated()); + +// EXPECT_NO_THROW(pool.finalize(nntrainer::OptimizedV1Planner(), 0, 2)); +// EXPECT_EQ(pool.minMemoryRequirement(), t1->bytes()); + +// EXPECT_NO_THROW(pool.allocate()); + +// EXPECT_EQ(t1->getAddress(0), (float *)t2->getAddress(0)); +// EXPECT_EQ(t1->getAddress(1), (float *)t3->getAddress(0)); +// EXPECT_EQ(t1->getAddress(2), (float *)t4->getAddress(0)); +// EXPECT_EQ(t1->getAddress(3), (float *)t5->getAddress(0)); + +// EXPECT_NO_THROW(pool.deallocate()); +// } /** * @brief check if data span of two tensor testOverlap @@ -863,10 +873,9 @@ TEST(TensorPool, createOrExtend_different_dim_n) { TEST(TensorPool, createOrExtend_init_n) { nntrainer::TensorPool pool; - pool.requestOrExtend("t", {10}, {0}, max_ls, - 
nntrainer::Tensor::Initializer::ONES); + pool.requestOrExtend("t", {10}, {0}, max_ls, nntrainer::Initializer::ONES); EXPECT_ANY_THROW(pool.requestOrExtend("t", {10}, {1}, max_ls, - nntrainer::Tensor::Initializer::ZEROS)); + nntrainer::Initializer::ZEROS)); } TEST(TensorPool, createOrExtend_unmanaged_n) { nntrainer::TensorPool pool; diff --git a/test/unittest/unittest_nntrainer_tensor_pool_fp16.cpp b/test/unittest/unittest_nntrainer_tensor_pool_fp16.cpp index 19ab760aa5..fa7ef82c11 100644 --- a/test/unittest/unittest_nntrainer_tensor_pool_fp16.cpp +++ b/test/unittest/unittest_nntrainer_tensor_pool_fp16.cpp @@ -1337,16 +1337,29 @@ static void testNoOverlap(nntrainer::Tensor *t1, nntrainer::Tensor *t2) { * @param t2 t2 tensor 2 */ static void testSubset(nntrainer::Tensor *t1, nntrainer::Tensor *t2) { - _FP16 *t1_start = t1->getData<_FP16>(); - _FP16 *t1_end = t1_start + t1->size(); - - _FP16 *t2_start = t2->getData<_FP16>(); - _FP16 *t2_end = t2_start + t2->size(); - - EXPECT_NE(t1_start, nullptr); - EXPECT_NE(t2_start, nullptr); - EXPECT_TRUE(t1_start <= t2_start && t2_end <= t1_end) - << "t2 is not subset of t1"; + if (t1->getDataType() == ml::train::TensorDim::DataType::FP32) { + float *t1_start = t1->getData(); + float *t1_end = t1_start + t1->size(); + + float *t2_start = t2->getData(); + float *t2_end = t2_start + t2->size(); + + EXPECT_NE(t1_start, nullptr); + EXPECT_NE(t2_start, nullptr); + EXPECT_TRUE(t1_start <= t2_start && t2_end <= t1_end) + << "t2 is not subset of t1"; + } else { + _FP16 *t1_start = t1->getData<_FP16>(); + _FP16 *t1_end = t1_start + t1->size(); + + _FP16 *t2_start = t2->getData<_FP16>(); + _FP16 *t2_end = t2_start + t2->size(); + + EXPECT_NE(t1_start, nullptr); + EXPECT_NE(t2_start, nullptr); + EXPECT_TRUE(t1_start <= t2_start && t2_end <= t1_end) + << "t2 is not subset of t1"; + } } TEST(TensorPool, create_allocate_has_data_01_p) { @@ -2069,21 +2082,21 @@ TEST(TensorPool, createOrExtend_different_type_02_n) { TEST(TensorPool, createOrExtend_init_01_n) { nntrainer::TensorPool pool; pool.requestOrExtend("t", {{10}, FP16_}, {0}, max_ls, - nntrainer::Tensor::Initializer::ONES); + nntrainer::Initializer::ONES); EXPECT_ANY_THROW(pool.requestOrExtend("t", {{10}, FP16_}, {1}, max_ls, - nntrainer::Tensor::Initializer::ZEROS)); + nntrainer::Initializer::ZEROS)); } TEST(TensorPool, createOrExtend_init_02_n) { nntrainer::TensorPool pool; pool.requestOrExtend("t0", {{10}, FP16_}, {0}, max_ls, - nntrainer::Tensor::Initializer::ONES); + nntrainer::Initializer::ONES); EXPECT_ANY_THROW(pool.requestOrExtend("t0", {{10}, FP16_}, {1}, max_ls, - nntrainer::Tensor::Initializer::ZEROS)); + nntrainer::Initializer::ZEROS)); pool.requestOrExtend("t1", {{10}, FP32_}, {0}, max_ls, - nntrainer::Tensor::Initializer::ONES); + nntrainer::Initializer::ONES); EXPECT_ANY_THROW(pool.requestOrExtend("t1", {{10}, FP32_}, {1}, max_ls, - nntrainer::Tensor::Initializer::ZEROS)); + nntrainer::Initializer::ZEROS)); } TEST(TensorPool, createOrExtend_unmanaged_01_n) { diff --git a/test/unittest/unittest_nntrainer_tensor_v2.cpp b/test/unittest/unittest_nntrainer_tensor_v2.cpp deleted file mode 100644 index de7d2d7935..0000000000 --- a/test/unittest/unittest_nntrainer_tensor_v2.cpp +++ /dev/null @@ -1,1860 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -/** - * Copyright (C) 2023 Donghyeon Jeong - * - * @file unittest_nntrainer_tensor_v2.cpp - * @date 16 November 2023 - * @brief Unit test utility for tensor v2. 
- * @see https://github.com/nnstreamer/nntrainer - * @author 2023 Donghyeon Jeong - * @bug No known bugs - */ -#include - -#include "nntrainer_test_util.h" -#include "util_func.h" -#include -#include -#include -#include - -TEST(nntrainer_Tensor, Tensor_01_p) { - int status = ML_ERROR_NONE; - nntrainer::TensorV2 tensor = nntrainer::TensorV2(1, 2, 3); - tensor.setZero(); - ASSERT_NE(nullptr, tensor.getData()); - - if (tensor.getValue(0, 0, 0, 0) != 0.0) - status = ML_ERROR_INVALID_PARAMETER; - EXPECT_EQ(status, ML_ERROR_NONE); -} - -TEST(nntrainer_Tensor, Tensor_02_p) { - int status = ML_ERROR_NONE; - int height = 3; - int width = 10; - std::vector> in; - for (int i = 0; i < height; ++i) { - std::vector tv; - for (int j = 0; j < width; ++j) { - tv.push_back(i * 2.0 + j); - } - in.push_back(tv); - } - - nntrainer::TensorV2 tensor = nntrainer::TensorV2( - in, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32}); - ASSERT_NE(nullptr, tensor.getData()); - - if (tensor.getValue(0, 0, 0, 1) != 1.0) - status = ML_ERROR_INVALID_PARAMETER; - EXPECT_EQ(status, ML_ERROR_NONE); -} - -TEST(nntrainer_Tensor, Tensor_02_nhwc_p) { - int status = ML_ERROR_NONE; - int width = 10; - int channel = 3; - std::vector> in; - for (int i = 0; i < width; ++i) { - std::vector tv; - for (int j = 0; j < channel; ++j) { - tv.push_back(i * 2.0 + j); - } - in.push_back(tv); - } - - nntrainer::TensorV2 tensor = nntrainer::TensorV2( - in, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32}); - ASSERT_NE(nullptr, tensor.getData()); - - if (tensor.getValue(0, 0, 0, 1) != 1.0) - status = ML_ERROR_INVALID_PARAMETER; - EXPECT_EQ(status, ML_ERROR_NONE); -} - -TEST(nntrainer_Tensor, Tensor_03_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int height = 3; - int width = 10; - std::vector>> in; - - for (int k = 0; k < batch; ++k) { - std::vector> ttv; - for (int i = 0; i < height; ++i) { - std::vector tv; - for (int j = 0; j < width; ++j) { - tv.push_back(k * height * width + i * width + j); - } - ttv.push_back(tv); - } - in.push_back(ttv); - } - - nntrainer::TensorV2 tensor = nntrainer::TensorV2( - in, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32}); - ASSERT_NE(nullptr, tensor.getData()); - - if (tensor.getValue(0, 0, 0, 1) != 1.0) - status = ML_ERROR_INVALID_PARAMETER; - EXPECT_EQ(status, ML_ERROR_NONE); -} - -TEST(nntrainer_Tensor, Tensor_04_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int height = 3; - int width = 10; - std::vector>> in; - - for (int k = 0; k < batch; ++k) { - std::vector> ttv; - for (int i = 0; i < height; ++i) { - std::vector tv; - for (int j = 0; j < width; ++j) { - tv.push_back(k * height * width + i * width + j); - } - ttv.push_back(tv); - } - in.push_back(ttv); - } - - nntrainer::TensorV2 t0 = nntrainer::TensorV2( - in, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32}); - - // copy assignment operator - nntrainer::TensorV2 t1 = t0; - - if (t1.getValue(0, 0, 0, 1) != 1.0) - status = ML_ERROR_INVALID_PARAMETER; - EXPECT_EQ(status, ML_ERROR_NONE); - - // comparison operator - EXPECT_EQ(t0, t1); -} - -TEST(nntrainer_Tensor, Tensor_05_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int height = 3; - int width = 10; - std::vector>> in; - - for (int k = 0; k < batch; ++k) { - std::vector> ttv; - for (int i = 0; i < height; ++i) { - std::vector tv; - for (int j = 0; j < width; ++j) { - tv.push_back(k * height * width + i * width + j); - } - ttv.push_back(tv); - } - in.push_back(ttv); - } - - nntrainer::TensorV2 t0 = nntrainer::TensorV2( - in, {nntrainer::Tformat::NCHW, 
nntrainer::Tdatatype::FP32}); - - // copy assignment operator - nntrainer::TensorV2 t1 = nntrainer::TensorV2(batch, height, width); - t1.setRandNormal(2.3, 0.5); - - float val_t0 = t0.getValue(0, 0, 0, 1); - float val_t1 = t1.getValue(0, 0, 0, 1); - - swap(t0, t1); - - if (t0.getValue(0, 0, 0, 1) != val_t1) - status = ML_ERROR_INVALID_PARAMETER; - EXPECT_EQ(status, ML_ERROR_NONE); - - if (t1.getValue(0, 0, 0, 1) != val_t0) - status = ML_ERROR_INVALID_PARAMETER; - EXPECT_EQ(status, ML_ERROR_NONE); -} - -TEST(nntrainer_Tensor, empty_01) { - nntrainer::TensorV2 t; - - EXPECT_TRUE(t.empty()); -} - -TEST(nntrainer_Tensor, empty_02) { - nntrainer::TensorV2 t({1, 2, 3, 4}, false); - - EXPECT_FALSE(t.empty()); -} - -TEST(nntrainer_Tensor, empty_03) { - nntrainer::TensorV2 t({1, 2, 3, 4}, true); - - EXPECT_FALSE(t.empty()); -} - -TEST(nntrainer_Tensor, allocate_01_n) { - nntrainer::TensorV2 t; - EXPECT_FALSE(t.isAllocated()); - - t.allocate(); - EXPECT_FALSE(t.isAllocated()); -} - -TEST(nntrainer_Tensor, allocate_02_p) { - nntrainer::TensorV2 t({1, 2, 3, 4}, false); - EXPECT_FALSE(t.isAllocated()); - - t.allocate(); - EXPECT_TRUE(t.isAllocated()); -} - -TEST(nntrainer_Tensor, allocate_03_p) { - nntrainer::TensorV2 t({1, 2, 3, 4}, true); - EXPECT_TRUE(t.isAllocated()); - - t.allocate(); - EXPECT_TRUE(t.isAllocated()); -} - -TEST(nntrainer_Tensor, initialize_01_p) { - nntrainer::TensorV2 t({1, 2, 3, 4}, true, nntrainer::Initializer::ONES); - - nntrainer::TensorV2 golden(1, 2, 3, 4); - golden.setValue(1); - - EXPECT_EQ(golden, t); -} - -TEST(nntrainer_Tensor, initialize_02_p) { - nntrainer::TensorV2 t({1, 2, 3, 4}, true); - - nntrainer::TensorV2 golden(1, 2, 3, 4); - golden.setValue(1); - - EXPECT_NE(golden, t); - - t.initialize(nntrainer::Initializer::ONES); - EXPECT_EQ(golden, t); -} - -TEST(nntrainer_Tensor, initialize_03_p) { - nntrainer::TensorV2 t({1, 2, 3, 4}, false, nntrainer::Initializer::ONES); - t.allocate(); - - nntrainer::TensorV2 golden(1, 2, 3, 4); - golden.setValue(1); - - EXPECT_EQ(golden, t); -} - -TEST(nntrainer_Tensor, initialize_04_p) { - nntrainer::TensorV2 t({1, 2, 3, 4}, false); - t.initialize(nntrainer::Initializer::ONES); - t.allocate(); - - nntrainer::TensorV2 golden(1, 2, 3, 4); - golden.setValue(1); - - EXPECT_EQ(golden, t); -} - -TEST(nntrainer_Tensor, initialize_05_p) { - nntrainer::TensorV2 t({1, 2, 3, 4}, false); - t.allocate(); - - nntrainer::TensorV2 golden(1, 2, 3, 4); - golden.setValue(1.f); - - /** - * Ideally, it should be NE, but it can be equal due to no initialization - * EXPECT_NE(golden, t); - */ - - t.initialize(nntrainer::Initializer::ONES); - EXPECT_EQ(golden, t); -} - -TEST(nntrainer_Tensor, initialize_06_n) { - nntrainer::TensorV2 t({1, 2, 3, 4}, true, nntrainer::Initializer::ONES); - nntrainer::TensorV2 golden({1, 2, 3, 4}, true, nntrainer::Initializer::ZEROS); - - EXPECT_NE(golden, t); - - golden.initialize(nntrainer::Initializer::ONES); - EXPECT_EQ(golden, t); -} - -TEST(nntrainer_Tensor, initialize_07_p) { - nntrainer::TensorV2 t({1, 2, 3, 4}, true, nntrainer::Initializer::ONES); - - nntrainer::TensorV2 golden(1, 2, 3, 4); - golden.setValue(1); - - EXPECT_EQ(golden, t); - - t.setValue(0, 0, 0, 0, 0); - t.setValue(0, 0, 0, t.size() - 1, 0); - EXPECT_NE(golden, t); - - t.initialize(); - EXPECT_EQ(golden, t); -} - -TEST(nntrainer_Tensor, initialize_08_p) { - nntrainer::TensorV2 t({1, 2, 3, 4}, true, nntrainer::Initializer::ONES); - - nntrainer::TensorV2 golden(1, 2, 3, 4); - golden.setValue(1); - - EXPECT_EQ(golden, t); - - 
t.initialize(nntrainer::Initializer::HE_NORMAL); - EXPECT_NE(golden, t); - - t.initialize(); - EXPECT_NE(golden, t); - - t.initialize(nntrainer::Initializer::ONES); - EXPECT_EQ(golden, t); - - t.initialize(); - EXPECT_EQ(golden, t); -} - -TEST(nntrainer_Tensor, multiply_i_01_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - - nntrainer::TensorV2 original; - original.copy(input); - - status = input.multiply_i(2.0); - EXPECT_EQ(status, ML_ERROR_NONE); - - float *data = original.getData(); - ASSERT_NE(nullptr, data); - float *indata = input.getData(); - ASSERT_NE(nullptr, indata); - - for (int i = 0; i < batch * channel * width * height; ++i) { - EXPECT_FLOAT_EQ(data[i] + data[i], indata[i]); - } -} - -TEST(nntrainer_Tensor, multiply_i_02_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - - nntrainer::TensorV2 original; - original.copy(input); - - status = input.multiply_i(input); - EXPECT_EQ(status, ML_ERROR_NONE); - - float *data = original.getData(); - ASSERT_NE(nullptr, data); - float *indata = input.getData(); - ASSERT_NE(nullptr, indata); - - for (int i = 0; i < batch * channel * width * height; ++i) { - EXPECT_FLOAT_EQ(data[i] * data[i], indata[i]); - } -} - -TEST(nntrainer_Tensor, multiply_i_03_n) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - - nntrainer::TensorV2 target2(batch, channel, height - 2, width - 1); - status = input.multiply_i(target2); - - EXPECT_EQ(status, ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, multiply_i_broadcast_01_p) { - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - nntrainer::TensorV2 m = rangedV2(1, 2, 4, 5); - float answer_data[] = { - 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, - 144, 169, 196, 225, 256, 289, 324, 361, 400, 441, 484, 529, - 576, 625, 676, 729, 784, 841, 900, 961, 1024, 1089, 1156, 1225, - 1296, 1369, 1444, 1521, 0, 41, 84, 129, 176, 225, 276, 329, - 384, 441, 500, 561, 624, 689, 756, 825, 896, 969, 1044, 1121, - 1200, 1281, 1364, 1449, 1536, 1625, 1716, 1809, 1904, 2001, 2100, 2201, - 2304, 2409, 2516, 2625, 2736, 2849, 2964, 3081, 0, 81, 164, 249, - 336, 425, 516, 609, 704, 801, 900, 1001, 1104, 1209, 1316, 1425, - 1536, 1649, 1764, 1881, 2000, 2121, 2244, 2369, 2496, 2625, 2756, 2889, - 3024, 3161, 3300, 3441, 3584, 3729, 3876, 4025, 4176, 4329, 4484, 4641}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - nntrainer::TensorV2 m = rangedV2(3, 1, 4, 5); - float answer_data[] = { - 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, - 144, 169, 196, 225, 256, 289, 324, 361, 0, 21, 44, 69, - 96, 125, 156, 189, 224, 261, 300, 341, 384, 429, 476, 525, - 576, 629, 684, 741, 800, 861, 924, 989, 1056, 1125, 1196, 1269, - 1344, 1421, 1500, 1581, 1664, 1749, 1836, 1925, 2016, 2109, 2204, 2301, - 1200, 1281, 1364, 1449, 1536, 1625, 1716, 1809, 1904, 
2001, 2100, 2201, - 2304, 2409, 2516, 2625, 2736, 2849, 2964, 3081, 3200, 3321, 3444, 3569, - 3696, 3825, 3956, 4089, 4224, 4361, 4500, 4641, 4784, 4929, 5076, 5225, - 5376, 5529, 5684, 5841, 4000, 4141, 4284, 4429, 4576, 4725, 4876, 5029, - 5184, 5341, 5500, 5661, 5824, 5989, 6156, 6325, 6496, 6669, 6844, 7021}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - nntrainer::TensorV2 m = rangedV2(3, 2, 4, 1); - float answer_data[] = { - 0, 0, 0, 0, 0, 5, 6, 7, 8, 9, 20, 22, - 24, 26, 28, 45, 48, 51, 54, 57, 80, 84, 88, 92, - 96, 125, 130, 135, 140, 145, 180, 186, 192, 198, 204, 245, - 252, 259, 266, 273, 320, 328, 336, 344, 352, 405, 414, 423, - 432, 441, 500, 510, 520, 530, 540, 605, 616, 627, 638, 649, - 720, 732, 744, 756, 768, 845, 858, 871, 884, 897, 980, 994, - 1008, 1022, 1036, 1125, 1140, 1155, 1170, 1185, 1280, 1296, 1312, 1328, - 1344, 1445, 1462, 1479, 1496, 1513, 1620, 1638, 1656, 1674, 1692, 1805, - 1824, 1843, 1862, 1881, 2000, 2020, 2040, 2060, 2080, 2205, 2226, 2247, - 2268, 2289, 2420, 2442, 2464, 2486, 2508, 2645, 2668, 2691, 2714, 2737}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - nntrainer::TensorV2 m = rangedV2(3, 1, 1, 5); - float answer_data[] = { - 0, 1, 4, 9, 16, 0, 6, 14, 24, 36, 0, 11, - 24, 39, 56, 0, 16, 34, 54, 76, 0, 21, 44, 69, - 96, 0, 26, 54, 84, 116, 0, 31, 64, 99, 136, 0, - 36, 74, 114, 156, 200, 246, 294, 344, 396, 225, 276, 329, - 384, 441, 250, 306, 364, 424, 486, 275, 336, 399, 464, 531, - 300, 366, 434, 504, 576, 325, 396, 469, 544, 621, 350, 426, - 504, 584, 666, 375, 456, 539, 624, 711, 800, 891, 984, 1079, - 1176, 850, 946, 1044, 1144, 1246, 900, 1001, 1104, 1209, 1316, 950, - 1056, 1164, 1274, 1386, 1000, 1111, 1224, 1339, 1456, 1050, 1166, 1284, - 1404, 1526, 1100, 1221, 1344, 1469, 1596, 1150, 1276, 1404, 1534, 1666}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - nntrainer::TensorV2 m = rangedV2(1, 2, 1, 5); - float answer_data[] = { - 0, 1, 4, 9, 16, 0, 6, 14, 24, 36, 0, 11, 24, 39, - 56, 0, 16, 34, 54, 76, 100, 126, 154, 184, 216, 125, 156, 189, - 224, 261, 150, 186, 224, 264, 306, 175, 216, 259, 304, 351, 0, 41, - 84, 129, 176, 0, 46, 94, 144, 196, 0, 51, 104, 159, 216, 0, - 56, 114, 174, 236, 300, 366, 434, 504, 576, 325, 396, 469, 544, 621, - 350, 426, 504, 584, 666, 375, 456, 539, 624, 711, 0, 81, 164, 249, - 336, 0, 86, 174, 264, 356, 0, 91, 184, 279, 376, 0, 96, 194, - 294, 396, 500, 606, 714, 824, 936, 525, 636, 749, 864, 981, 550, 666, - 784, 904, 1026, 575, 696, 819, 944, 1071}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - nntrainer::TensorV2 m = rangedV2(3, 1, 4, 1); - float answer_data[] = { - 0, 0, 0, 0, 0, 5, 6, 7, 8, 9, 20, 22, - 24, 26, 28, 45, 48, 51, 54, 57, 0, 0, 0, 0, - 0, 25, 26, 27, 28, 29, 60, 62, 64, 66, 68, 105, - 108, 111, 114, 117, 
160, 164, 168, 172, 176, 225, 230, 235, - 240, 245, 300, 306, 312, 318, 324, 385, 392, 399, 406, 413, - 240, 244, 248, 252, 256, 325, 330, 335, 340, 345, 420, 426, - 432, 438, 444, 525, 532, 539, 546, 553, 640, 648, 656, 664, - 672, 765, 774, 783, 792, 801, 900, 910, 920, 930, 940, 1045, - 1056, 1067, 1078, 1089, 800, 808, 816, 824, 832, 945, 954, 963, - 972, 981, 1100, 1110, 1120, 1130, 1140, 1265, 1276, 1287, 1298, 1309}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - nntrainer::TensorV2 m = rangedV2(1, 1, 1, 5); - float answer_data[] = { - 0, 1, 4, 9, 16, 0, 6, 14, 24, 36, 0, 11, 24, 39, 56, - 0, 16, 34, 54, 76, 0, 21, 44, 69, 96, 0, 26, 54, 84, 116, - 0, 31, 64, 99, 136, 0, 36, 74, 114, 156, 0, 41, 84, 129, 176, - 0, 46, 94, 144, 196, 0, 51, 104, 159, 216, 0, 56, 114, 174, 236, - 0, 61, 124, 189, 256, 0, 66, 134, 204, 276, 0, 71, 144, 219, 296, - 0, 76, 154, 234, 316, 0, 81, 164, 249, 336, 0, 86, 174, 264, 356, - 0, 91, 184, 279, 376, 0, 96, 194, 294, 396, 0, 101, 204, 309, 416, - 0, 106, 214, 324, 436, 0, 111, 224, 339, 456, 0, 116, 234, 354, 476}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - nntrainer::TensorV2 m = rangedV2(1, 2, 1, 1); - float answer_data[] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, - 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, - 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, - 112, 113, 114, 115, 116, 117, 118, 119}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - nntrainer::TensorV2 m = rangedV2(3, 1, 1, 1); - float answer_data[] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40, 41, - 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, - 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, - 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 160, 162, 164, 166, - 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, - 196, 198, 200, 202, 204, 206, 208, 210, 212, 214, 216, 218, 220, 222, - 224, 226, 228, 230, 232, 234, 236, 238}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 5, 1, 4); - nntrainer::TensorV2 t = rangedV2(3, 5, 1, 4); - nntrainer::TensorV2 m = rangedV2(3, 1, 1, 4); - float answer_data[] = {0, 1, 4, 9, 0, 5, 12, 21, 0, 9, - 20, 33, 0, 13, 28, 45, 0, 17, 36, 57, - 80, 105, 132, 161, 96, 125, 156, 189, 112, 145, - 180, 217, 128, 165, 204, 245, 144, 185, 228, 273, - 320, 369, 420, 473, 352, 405, 460, 517, 384, 441, - 500, 561, 416, 477, 540, 605, 448, 513, 580, 649}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - 
EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } -} - -TEST(nntrainer_Tensor, multiply_i_broadcast_not_supported_01_n) { - nntrainer::TensorV2 target(3, 1, 3, 1); - nntrainer::TensorV2 target2(3, 1, 3, 3); - - EXPECT_EQ(target.multiply_i(target2), ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, multiply_i_broadcast_not_broadcastable_02_n) { - nntrainer::TensorV2 target(3, 2, 4, 5); - nntrainer::TensorV2 target2(3, 2, 3, 1); - - EXPECT_EQ(target.multiply_i(target2), ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, multiply_01_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - - nntrainer::TensorV2 result = input.multiply(0.0); - if (result.getValue(0, 0, 1, 1) != 0.0) - status = ML_ERROR_RESULT_OUT_OF_RANGE; - EXPECT_EQ(status, ML_ERROR_NONE); -} - -TEST(nntrainer_Tensor, multiply_02_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 result = input.multiply(input); - - float *data = result.getData(); - ASSERT_NE(nullptr, data); - float *indata = input.getData(); - ASSERT_NE(nullptr, indata); - - for (int i = 0; i < batch * height * width; ++i) { - if (data[i] != indata[i] * indata[i]) { - status = ML_ERROR_RESULT_OUT_OF_RANGE; - break; - } - } - - EXPECT_EQ(status, ML_ERROR_NONE); -} - -TEST(nntrainer_Tensor, multiply_03_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 test(batch - 1, height - 1, width - 1); - - EXPECT_THROW({ input.multiply(test); }, std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_04_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(batch, channel, height, 2 * width); - nntrainer::TensorV2 shared_input = - input.getSharedDataTensor(dim, 0, false, ""); - nntrainer::TensorV2 test(dim); - - EXPECT_THROW(shared_input.multiply(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_05_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim); - nntrainer::TensorV2 test(batch, channel, height, 2 * width); - nntrainer::TensorV2 shared_test = test.getSharedDataTensor(dim, 0, false, ""); - - EXPECT_THROW(input.multiply(shared_test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_06_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim, false); - nntrainer::TensorV2 test(dim); - GEN_TEST_INPUT(test, i * (batch * height) + j * (width) + k + 1); - - EXPECT_THROW(input.multiply(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_07_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - 
nntrainer::TensorV2 test(dim, false); - - EXPECT_THROW(input.multiply(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_08_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - nntrainer::TensorV2 test(dim); - GEN_TEST_INPUT(test, i * (batch * height) + j * (width) + k + 2); - nntrainer::TensorV2 output(dim, false); - - EXPECT_THROW(input.multiply(test, output), std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_float_01_p) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 expected(batch, channel, height, width); - GEN_TEST_INPUT(expected, (i * (batch * height) + j * (width) + k + 1) * 2); - - nntrainer::TensorV2 result = input.multiply(2.0); - - EXPECT_EQ(result, expected); -} - -TEST(nntrainer_Tensor, multiply_strided_01_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 result = input.multiply_strided(input); - - float *data = result.getData(); - ASSERT_NE(nullptr, data); - float *indata = input.getData(); - ASSERT_NE(nullptr, indata); - - float *outdata = new float[(input.size())]; - - std::transform(indata, indata + batch * channel * height * width, indata, - outdata, std::multiplies()); - - for (int i = 0; i < batch * height * width; ++i) { - if (data[i] != outdata[i]) { - status = ML_ERROR_RESULT_OUT_OF_RANGE; - break; - } - } - - delete[] outdata; - - EXPECT_EQ(status, ML_ERROR_NONE); -} - -TEST(nntrainer_Tensor, multiply_strided_02_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 test(batch - 1, height - 1, width - 1); - - EXPECT_THROW({ input.multiply_strided(test); }, std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_strided_03_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - // input is not allocated now : alloc_now == false - nntrainer::TensorV2 input(dim, false); - nntrainer::TensorV2 test(dim); - GEN_TEST_INPUT(test, i * (batch * height) + j * (width) + k + 1); - - EXPECT_THROW(input.multiply_strided(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_strided_04_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - // test is not allocated. 
- nntrainer::TensorV2 test(dim, false); - - EXPECT_THROW(input.multiply_strided(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_strided_05_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - nntrainer::TensorV2 test(dim); - GEN_TEST_INPUT(test, i * (batch * height) + j * (width) + k + 1); - // output is not allocated - nntrainer::TensorV2 output(dim, false); - - EXPECT_THROW(input.multiply_strided(test, output), std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_strided_06_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 output(batch, channel, height, width); - GEN_TEST_INPUT(output, i * (batch * height) + j * (width) + k + 1); - - float *indata = input.getData(); - ASSERT_NE(nullptr, indata); - - float *data = output.getData(); - ASSERT_NE(nullptr, data); - - float *outdata_beta = new float[(input.size())]; - float *indata_mul = new float[(input.size())]; - float *outdata = new float[(input.size())]; - - std::transform( - indata, indata + batch * channel * height * width, outdata_beta, - std::bind(std::multiplies(), std::placeholders::_1, 10.0)); - - std::transform(indata, indata + batch * channel * height * width, indata, - indata_mul, std::multiplies()); - std::transform(indata_mul, indata_mul + batch * channel * height * width, - outdata_beta, outdata, std::plus()); - - input.multiply_strided(input, output, 10.0); - - for (int i = 0; i < batch * height * width; ++i) { - if (data[i] != outdata[i]) { - status = ML_ERROR_RESULT_OUT_OF_RANGE; - break; - } - } - - delete[] outdata_beta; - delete[] indata_mul; - delete[] outdata; - - EXPECT_EQ(status, ML_ERROR_NONE); -} - -TEST(nntrainer_Tensor, divide_i_01_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - - nntrainer::TensorV2 original; - original.copy(input); - - status = input.divide_i((float)2.0); - EXPECT_EQ(status, ML_ERROR_NONE); - - float *data = original.getData(); - ASSERT_NE(nullptr, data); - float *indata = input.getData(); - ASSERT_NE(nullptr, indata); - - for (int i = 0; i < batch * height * width * channel; ++i) { - EXPECT_FLOAT_EQ(data[i], indata[i] + indata[i]); - } -} - -TEST(nntrainer_Tensor, divide_i_02_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - status = input.divide_i(input); - EXPECT_EQ(status, ML_ERROR_NONE); - float *indata = input.getData(); - ASSERT_NE(nullptr, indata); - - for (int i = 0; i < batch * height * width * channel; ++i) { - EXPECT_FLOAT_EQ(indata[i], float(1.0)); - } -} - -TEST(nntrainer_Tensor, divide_i_01_n) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - - status = input.divide_i((float)0); - EXPECT_EQ(status, 
ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, divide_i_02_n) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - - nntrainer::TensorV2 original(batch, channel, height - 2, width - 1); - - status = input.divide_i(original); - EXPECT_EQ(status, ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, divide_01_p) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 result = input.divide(1.0); - - float *previous = input.getData(); - ASSERT_NE(nullptr, previous); - float *data = result.getData(); - ASSERT_NE(nullptr, data); - - for (int i = 0; i < batch * height * width * channel; ++i) { - EXPECT_FLOAT_EQ(data[i], previous[i]); - } -} - -TEST(nntrainer_Tensor, divide_02_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - EXPECT_THROW({ input.divide(0.0); }, std::invalid_argument); -} - -TEST(nntrainer_Tensor, divide_04_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(batch, channel, height, 2 * width); - nntrainer::TensorV2 shared_input = - input.getSharedDataTensor(dim, 0, false, ""); - nntrainer::TensorV2 test(dim); - - EXPECT_THROW(shared_input.divide(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, divide_05_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim); - nntrainer::TensorV2 test(batch, channel, height, 2 * width); - nntrainer::TensorV2 shared_test = test.getSharedDataTensor(dim, 0, false, ""); - - EXPECT_THROW(input.divide(shared_test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, divide_06_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim, false); - nntrainer::TensorV2 test(dim); - GEN_TEST_INPUT(test, i * (batch * height) + j * (width) + k + 1); - - EXPECT_THROW(input.divide(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, divide_07_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - nntrainer::TensorV2 test(dim, false); - - EXPECT_THROW(input.divide(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, divide_08_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - nntrainer::TensorV2 test(dim); - GEN_TEST_INPUT(test, i * (batch * height) + j * (width) + k + 2); - nntrainer::TensorV2 output(dim, false); - - EXPECT_THROW(input.divide(test, output), std::invalid_argument); -} - -TEST(nntrainer_Tensor, divide_i_broadcast_01_p) { - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5); - 
nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(1, 2, 4, 5); - m.add_i(1); - float answer_data[] = { - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 41.0, 21.0, - 14.333333, 11.0, 9.0, 7.6666665, 6.714286, 6.0, - 5.4444447, 5.0, 4.6363635, 4.3333335, 4.076923, 3.857143, - 3.6666667, 3.5, 3.3529413, 3.2222223, 3.1052632, 3.0, - 2.9047618, 2.8181818, 2.7391305, 2.6666667, 2.6, 2.5384614, - 2.4814816, 2.4285715, 2.3793104, 2.3333333, 2.2903225, 2.25, - 2.2121212, 2.1764705, 2.142857, 2.1111112, 2.0810812, 2.0526316, - 2.025641, 2.0, 81.0, 41.0, 27.666666, 21.0, - 17.0, 14.333333, 12.428572, 11.0, 9.888889, 9.0, - 8.272727, 7.6666665, 7.1538463, 6.714286, 6.3333335, 6.0, - 5.7058825, 5.4444447, 5.2105265, 5.0, 4.8095236, 4.6363635, - 4.478261, 4.3333335, 4.2, 4.076923, 3.9629629, 3.857143, - 3.7586207, 3.6666667, 3.580645, 3.5, 3.4242425, 3.3529413, - 3.2857144, 3.2222223, 3.162162, 3.1052632, 3.0512822, 3.0}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(3, 1, 4, 5); - m.add_i(1); - float answer_data[] = { - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 21.0, 11.0, 7.6666665, 6.0, - 5.0, 4.3333335, 3.857143, 3.5, 3.2222223, 3.0, - 2.8181818, 2.6666667, 2.5384614, 2.4285715, 2.3333333, 2.25, - 2.1764705, 2.1111112, 2.0526316, 2.0, 1.9523809, 1.9090909, - 1.8695652, 1.8333334, 1.8, 1.7692307, 1.7407408, 1.7142857, - 1.6896552, 1.6666666, 1.6451613, 1.625, 1.6060606, 1.5882353, - 1.5714285, 1.5555556, 1.5405406, 1.5263158, 1.5128205, 1.5, - 2.9047618, 2.8181818, 2.7391305, 2.6666667, 2.6, 2.5384614, - 2.4814816, 2.4285715, 2.3793104, 2.3333333, 2.2903225, 2.25, - 2.2121212, 2.1764705, 2.142857, 2.1111112, 2.0810812, 2.0526316, - 2.025641, 2.0, 1.9756098, 1.9523809, 1.9302325, 1.9090909, - 1.8888888, 1.8695652, 1.8510638, 1.8333334, 1.8163265, 1.8, - 1.7843137, 1.7692307, 1.754717, 1.7407408, 1.7272727, 1.7142857, - 1.7017543, 1.6896552, 1.6779661, 1.6666666, 2.4634147, 2.4285715, - 2.3953488, 2.3636363, 2.3333333, 2.3043478, 2.2765958, 2.25, - 2.2244897, 2.2, 2.1764705, 2.1538463, 2.1320755, 2.1111112, - 2.090909, 2.0714285, 2.0526316, 2.0344827, 2.0169492, 2.0}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(3, 2, 4, 1); - m.add_i(1); - float answer_data[] = { - 1.0, 2.0, 3.0, 4.0, 5.0, 3.0, - 3.5, 4.0, 4.5, 5.0, 3.6666667, 4.0, - 4.3333335, 4.6666665, 5.0, 4.0, 4.25, 4.5, - 4.75, 5.0, 4.2, 4.4, 4.6, 4.8, - 5.0, 4.3333335, 4.5, 4.6666665, 4.8333335, 5.0, - 4.428571, 4.571429, 4.714286, 4.857143, 5.0, 4.5, - 4.625, 4.75, 4.875, 5.0, 4.5555553, 4.6666665, - 4.7777777, 4.888889, 5.0, 4.6, 4.7, 4.8, - 4.9, 5.0, 4.6363635, 4.7272725, 4.818182, 4.909091, - 5.0, 4.6666665, 4.75, 4.8333335, 4.9166665, 5.0, - 4.6923075, 4.769231, 4.8461537, 4.923077, 5.0, 4.714286, - 4.785714, 4.857143, 4.928571, 5.0, 4.733333, 4.8, - 4.866667, 4.9333334, 5.0, 4.75, 
4.8125, 4.875, - 4.9375, 5.0, 4.7647057, 4.8235292, 4.882353, 4.9411764, - 5.0, 4.7777777, 4.8333335, 4.888889, 4.9444447, 5.0, - 4.7894735, 4.8421054, 4.894737, 4.9473686, 5.0, 4.8, - 4.85, 4.9, 4.95, 5.0, 4.8095236, 4.857143, - 4.904762, 4.952381, 5.0, 4.818182, 4.8636365, 4.909091, - 4.9545455, 5.0, 4.826087, 4.869565, 4.9130435, 4.9565215, - 5.0, 4.8333335, 4.875, 4.9166665, 4.9583335, 5.0}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(3, 1, 1, 5); - m.add_i(1); - float answer_data[] = { - 1.0, 1.0, 1.0, 1.0, 1.0, 6.0, - 3.5, 2.6666667, 2.25, 2.0, 11.0, 6.0, - 4.3333335, 3.5, 3.0, 16.0, 8.5, 6.0, - 4.75, 4.0, 21.0, 11.0, 7.6666665, 6.0, - 5.0, 26.0, 13.5, 9.333333, 7.25, 6.0, - 31.0, 16.0, 11.0, 8.5, 7.0, 36.0, - 18.5, 12.666667, 9.75, 8.0, 6.8333335, 6.0, - 5.375, 4.888889, 4.5, 7.6666665, 6.714286, 6.0, - 5.4444447, 5.0, 8.5, 7.428571, 6.625, 6.0, - 5.5, 9.333333, 8.142858, 7.25, 6.5555553, 6.0, - 10.166667, 8.857142, 7.875, 7.111111, 6.5, 11.0, - 9.571428, 8.5, 7.6666665, 7.0, 11.833333, 10.285714, - 9.125, 8.222222, 7.5, 12.666667, 11.0, 9.75, - 8.777778, 8.0, 7.3636365, 6.8333335, 6.3846154, 6.0, - 5.6666665, 7.818182, 7.25, 6.769231, 6.357143, 6.0, - 8.272727, 7.6666665, 7.1538463, 6.714286, 6.3333335, 8.727273, - 8.083333, 7.5384617, 7.071429, 6.6666665, 9.181818, 8.5, - 7.923077, 7.428571, 7.0, 9.636364, 8.916667, 8.307693, - 7.785714, 7.3333335, 10.090909, 9.333333, 8.692307, 8.142858, - 7.6666665, 10.545455, 9.75, 9.076923, 8.5, 8.0}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(1, 2, 1, 5); - m.add_i(1); - float answer_data[] = { - 1.0, 1.0, 1.0, 1.0, 1.0, 6.0, - 3.5, 2.6666667, 2.25, 2.0, 11.0, 6.0, - 4.3333335, 3.5, 3.0, 16.0, 8.5, 6.0, - 4.75, 4.0, 3.5, 3.142857, 2.875, 2.6666667, - 2.5, 4.3333335, 3.857143, 3.5, 3.2222223, 3.0, - 5.1666665, 4.571429, 4.125, 3.7777777, 3.5, 6.0, - 5.285714, 4.75, 4.3333335, 4.0, 41.0, 21.0, - 14.333333, 11.0, 9.0, 46.0, 23.5, 16.0, - 12.25, 10.0, 51.0, 26.0, 17.666666, 13.5, - 11.0, 56.0, 28.5, 19.333334, 14.75, 12.0, - 10.166667, 8.857142, 7.875, 7.111111, 6.5, 11.0, - 9.571428, 8.5, 7.6666665, 7.0, 11.833333, 10.285714, - 9.125, 8.222222, 7.5, 12.666667, 11.0, 9.75, - 8.777778, 8.0, 81.0, 41.0, 27.666666, 21.0, - 17.0, 86.0, 43.5, 29.333334, 22.25, 18.0, - 91.0, 46.0, 31.0, 23.5, 19.0, 96.0, - 48.5, 32.666668, 24.75, 20.0, 16.833334, 14.571428, - 12.875, 11.555555, 10.5, 17.666666, 15.285714, 13.5, - 12.111111, 11.0, 18.5, 16.0, 14.125, 12.666667, - 11.5, 19.333334, 16.714285, 14.75, 13.222222, 12.0}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(3, 1, 4, 1); - m.add_i(1); - float answer_data[] = { - 1.0, 2.0, 3.0, 4.0, 5.0, 3.0, - 3.5, 4.0, 4.5, 5.0, 3.6666667, 4.0, - 4.3333335, 4.6666665, 5.0, 4.0, 4.25, 4.5, - 4.75, 5.0, 21.0, 22.0, 23.0, 24.0, - 25.0, 13.0, 13.5, 14.0, 14.5, 15.0, - 10.333333, 10.666667, 11.0, 
11.333333, 11.666667, 9.0, - 9.25, 9.5, 9.75, 10.0, 8.2, 8.4, - 8.6, 8.8, 9.0, 7.6666665, 7.8333335, 8.0, - 8.166667, 8.333333, 7.285714, 7.428571, 7.571429, 7.714286, - 7.857143, 7.0, 7.125, 7.25, 7.375, 7.5, - 12.2, 12.4, 12.6, 12.8, 13.0, 11.0, - 11.166667, 11.333333, 11.5, 11.666667, 10.142858, 10.285714, - 10.428572, 10.571428, 10.714286, 9.5, 9.625, 9.75, - 9.875, 10.0, 9.0, 9.111111, 9.222222, 9.333333, - 9.444445, 8.6, 8.7, 8.8, 8.9, 9.0, - 8.272727, 8.363636, 8.454545, 8.545455, 8.636364, 8.0, - 8.083333, 8.166667, 8.25, 8.333333, 11.222222, 11.333333, - 11.444445, 11.555555, 11.666667, 10.6, 10.7, 10.8, - 10.9, 11.0, 10.090909, 10.181818, 10.272727, 10.363636, - 10.454545, 9.666667, 9.75, 9.833333, 9.916667, 10.0}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(1, 1, 1, 5); - m.add_i(1); - float answer_data[] = { - 1.0, 1.0, 1.0, 1.0, 1.0, 6.0, 3.5, 2.6666667, 2.25, 2.0, - 11.0, 6.0, 4.3333335, 3.5, 3.0, 16.0, 8.5, 6.0, 4.75, 4.0, - 21.0, 11.0, 7.6666665, 6.0, 5.0, 26.0, 13.5, 9.333333, 7.25, 6.0, - 31.0, 16.0, 11.0, 8.5, 7.0, 36.0, 18.5, 12.666667, 9.75, 8.0, - 41.0, 21.0, 14.333333, 11.0, 9.0, 46.0, 23.5, 16.0, 12.25, 10.0, - 51.0, 26.0, 17.666666, 13.5, 11.0, 56.0, 28.5, 19.333334, 14.75, 12.0, - 61.0, 31.0, 21.0, 16.0, 13.0, 66.0, 33.5, 22.666666, 17.25, 14.0, - 71.0, 36.0, 24.333334, 18.5, 15.0, 76.0, 38.5, 26.0, 19.75, 16.0, - 81.0, 41.0, 27.666666, 21.0, 17.0, 86.0, 43.5, 29.333334, 22.25, 18.0, - 91.0, 46.0, 31.0, 23.5, 19.0, 96.0, 48.5, 32.666668, 24.75, 20.0, - 101.0, 51.0, 34.333332, 26.0, 21.0, 106.0, 53.5, 36.0, 27.25, 22.0, - 111.0, 56.0, 37.666668, 28.5, 23.0, 116.0, 58.5, 39.333332, 29.75, 24.0}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(1, 2, 1, 1); - m.add_i(1); - float answer_data[] = { - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, - 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 10.5, 11.0, 11.5, 12.0, - 12.5, 13.0, 13.5, 14.0, 14.5, 15.0, 15.5, 16.0, 16.5, 17.0, 17.5, 18.0, - 18.5, 19.0, 19.5, 20.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, - 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, - 30.5, 31.0, 31.5, 32.0, 32.5, 33.0, 33.5, 34.0, 34.5, 35.0, 35.5, 36.0, - 36.5, 37.0, 37.5, 38.0, 38.5, 39.0, 39.5, 40.0, 81.0, 82.0, 83.0, 84.0, - 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, - 97.0, 98.0, 99.0, 100.0, 50.5, 51.0, 51.5, 52.0, 52.5, 53.0, 53.5, 54.0, - 54.5, 55.0, 55.5, 56.0, 56.5, 57.0, 57.5, 58.0, 58.5, 59.0, 59.5, 60.0}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(3, 1, 1, 1); - m.add_i(1); - float answer_data[] = { - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, - 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, - 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, - 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, - 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, - 37.0, 38.0, 39.0, 
40.0, 20.5, 21.0, - 21.5, 22.0, 22.5, 23.0, 23.5, 24.0, - 24.5, 25.0, 25.5, 26.0, 26.5, 27.0, - 27.5, 28.0, 28.5, 29.0, 29.5, 30.0, - 30.5, 31.0, 31.5, 32.0, 32.5, 33.0, - 33.5, 34.0, 34.5, 35.0, 35.5, 36.0, - 36.5, 37.0, 37.5, 38.0, 38.5, 39.0, - 39.5, 40.0, 27.0, 27.333334, 27.666666, 28.0, - 28.333334, 28.666666, 29.0, 29.333334, 29.666666, 30.0, - 30.333334, 30.666666, 31.0, 31.333334, 31.666666, 32.0, - 32.333332, 32.666668, 33.0, 33.333332, 33.666668, 34.0, - 34.333332, 34.666668, 35.0, 35.333332, 35.666668, 36.0, - 36.333332, 36.666668, 37.0, 37.333332, 37.666668, 38.0, - 38.333332, 38.666668, 39.0, 39.333332, 39.666668, 40.0}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 5, 1, 4); - nntrainer::TensorV2 t = rangedV2(3, 5, 1, 4); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(3, 1, 1, 4); - m.add_i(1); - float answer_data[] = { - 1.0, 1.0, 1.0, 1.0, 5.0, 3.0, - 2.3333333, 2.0, 9.0, 5.0, 3.6666667, 3.0, - 13.0, 7.0, 5.0, 4.0, 17.0, 9.0, - 6.3333335, 5.0, 4.2, 3.6666667, 3.2857144, 3.0, - 5.0, 4.3333335, 3.857143, 3.5, 5.8, 5.0, - 4.428571, 4.0, 6.6, 5.6666665, 5.0, 4.5, - 7.4, 6.3333335, 5.571429, 5.0, 4.5555553, 4.2, - 3.909091, 3.6666667, 5.0, 4.6, 4.2727275, 4.0, - 5.4444447, 5.0, 4.6363635, 4.3333335, 5.888889, 5.4, - 5.0, 4.6666665, 6.3333335, 5.8, 5.3636365, 5.0}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } -} - -TEST(nntrainer_Tensor, divide_i_broadcast_not_supported_01_n) { - nntrainer::TensorV2 target(3, 1, 3, 1); - nntrainer::TensorV2 target2(3, 1, 3, 3); - - EXPECT_EQ(target.divide_i(target2), ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, divide_i_broadcast_not_broadcastable_02_n) { - nntrainer::TensorV2 target(3, 2, 4, 5); - nntrainer::TensorV2 target2(3, 2, 3, 1); - - EXPECT_EQ(target.divide_i(target2), ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, add_i_01_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int height = 3; - int width = 10; - int channel = 1; - - nntrainer::TensorV2 target(batch, channel, height, width); - GEN_TEST_INPUT(target, i * (batch * height) + j * (width) + k + 1 + channel); - - nntrainer::TensorV2 original(batch, channel, height, width); - original.copy(target); - - status = target.add_i(2.1); - EXPECT_EQ(status, ML_ERROR_NONE); - - float *previous = original.getData(); - ASSERT_NE(nullptr, previous); - float *data = target.getData(); - ASSERT_NE(nullptr, data); - - for (int i = 0; i < batch * height * width; ++i) { - EXPECT_FLOAT_EQ(data[i], previous[i] + (float)2.1); - } -} - -TEST(nntrainer_Tensor, add_i_02_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int height = 3; - int width = 10; - int channel = 1; - - nntrainer::TensorV2 target(batch, channel, height, width); - GEN_TEST_INPUT(target, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 original(batch, height, width); - original.copy(target); - - status = target.add_i(target, 3.0); - EXPECT_EQ(status, ML_ERROR_NONE); - - float *previous = original.getData(); - ASSERT_NE(nullptr, previous); - float *data = target.getData(); - ASSERT_NE(nullptr, data); - - for (int i = 0; i < batch * height * width; ++i) { - EXPECT_FLOAT_EQ(data[i], previous[i] * 4.0); - } -} - -/** - * @brief operand dimension is not right - */ -TEST(nntrainer_Tensor, add_i_01_n) { - int status = ML_ERROR_NONE; - int batch = 3; - int 
height = 3; - int width = 10; - int channel = 1; - - nntrainer::TensorV2 target(batch, channel, height, width); - GEN_TEST_INPUT(target, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 target2(batch, height - 2, width - 3); - - status = target.add_i(target2); - EXPECT_EQ(status, ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, add_i_broadcast_01_p) { - nntrainer::TensorDim ref_dim{3, 2, 4, 5}; - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - nntrainer::TensorV2 m = rangedV2(1, 2, 4, 5); - float answer_data[] = { - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, - 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, - 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 40, 42, - 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, - 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, - 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 80, 82, 84, 86, - 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, - 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, - 144, 146, 148, 150, 152, 154, 156, 158}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - nntrainer::TensorV2 m = rangedV2(3, 1, 4, 5); - float answer_data[] = { - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, - 28, 30, 32, 34, 36, 38, 20, 22, 24, 26, 28, 30, 32, 34, - 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, - 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, - 92, 94, 96, 98, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, - 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, - 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, - 156, 158, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, - 164, 166, 168, 170, 172, 174, 176, 178}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - nntrainer::TensorV2 m = rangedV2(3, 2, 4, 1); - float answer_data[] = { - 0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, - 16, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 30, 31, 32, - 33, 34, 36, 37, 38, 39, 40, 42, 43, 44, 45, 46, 48, 49, - 50, 51, 52, 54, 55, 56, 57, 58, 60, 61, 62, 63, 64, 66, - 67, 68, 69, 70, 72, 73, 74, 75, 76, 78, 79, 80, 81, 82, - 84, 85, 86, 87, 88, 90, 91, 92, 93, 94, 96, 97, 98, 99, - 100, 102, 103, 104, 105, 106, 108, 109, 110, 111, 112, 114, 115, 116, - 117, 118, 120, 121, 122, 123, 124, 126, 127, 128, 129, 130, 132, 133, - 134, 135, 136, 138, 139, 140, 141, 142}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - nntrainer::TensorV2 m = rangedV2(3, 1, 1, 5); - float answer_data[] = { - 0, 2, 4, 6, 8, 5, 7, 9, 11, 13, 10, 12, 14, 16, - 18, 15, 17, 19, 21, 23, 20, 22, 24, 26, 28, 25, 27, 29, - 31, 33, 30, 32, 34, 36, 38, 35, 37, 39, 41, 43, 45, 47, - 49, 51, 53, 50, 52, 54, 56, 58, 55, 57, 59, 61, 63, 60, - 62, 64, 66, 68, 65, 67, 69, 71, 73, 70, 72, 74, 76, 78, - 75, 77, 79, 81, 83, 80, 82, 84, 86, 88, 90, 92, 94, 96, - 98, 95, 97, 99, 101, 103, 100, 102, 104, 106, 108, 105, 107, 109, - 111, 113, 110, 112, 114, 116, 118, 115, 117, 119, 121, 123, 120, 122, - 124, 126, 128, 125, 127, 129, 131, 133}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = 
t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - nntrainer::TensorV2 m = rangedV2(1, 2, 1, 5); - float answer_data[] = { - 0, 2, 4, 6, 8, 5, 7, 9, 11, 13, 10, 12, 14, 16, - 18, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 30, 32, 34, - 36, 38, 35, 37, 39, 41, 43, 40, 42, 44, 46, 48, 40, 42, - 44, 46, 48, 45, 47, 49, 51, 53, 50, 52, 54, 56, 58, 55, - 57, 59, 61, 63, 65, 67, 69, 71, 73, 70, 72, 74, 76, 78, - 75, 77, 79, 81, 83, 80, 82, 84, 86, 88, 80, 82, 84, 86, - 88, 85, 87, 89, 91, 93, 90, 92, 94, 96, 98, 95, 97, 99, - 101, 103, 105, 107, 109, 111, 113, 110, 112, 114, 116, 118, 115, 117, - 119, 121, 123, 120, 122, 124, 126, 128}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - nntrainer::TensorV2 m = rangedV2(3, 1, 4, 1); - float answer_data[] = { - 0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, - 16, 18, 19, 20, 21, 22, 20, 21, 22, 23, 24, 26, 27, 28, - 29, 30, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 44, 45, - 46, 47, 48, 50, 51, 52, 53, 54, 56, 57, 58, 59, 60, 62, - 63, 64, 65, 66, 64, 65, 66, 67, 68, 70, 71, 72, 73, 74, - 76, 77, 78, 79, 80, 82, 83, 84, 85, 86, 88, 89, 90, 91, - 92, 94, 95, 96, 97, 98, 100, 101, 102, 103, 104, 106, 107, 108, - 109, 110, 108, 109, 110, 111, 112, 114, 115, 116, 117, 118, 120, 121, - 122, 123, 124, 126, 127, 128, 129, 130}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - nntrainer::TensorV2 m = rangedV2(1, 1, 1, 5); - float answer_data[] = { - 0, 2, 4, 6, 8, 5, 7, 9, 11, 13, 10, 12, 14, 16, - 18, 15, 17, 19, 21, 23, 20, 22, 24, 26, 28, 25, 27, 29, - 31, 33, 30, 32, 34, 36, 38, 35, 37, 39, 41, 43, 40, 42, - 44, 46, 48, 45, 47, 49, 51, 53, 50, 52, 54, 56, 58, 55, - 57, 59, 61, 63, 60, 62, 64, 66, 68, 65, 67, 69, 71, 73, - 70, 72, 74, 76, 78, 75, 77, 79, 81, 83, 80, 82, 84, 86, - 88, 85, 87, 89, 91, 93, 90, 92, 94, 96, 98, 95, 97, 99, - 101, 103, 100, 102, 104, 106, 108, 105, 107, 109, 111, 113, 110, 112, - 114, 116, 118, 115, 117, 119, 121, 123}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - nntrainer::TensorV2 m = rangedV2(1, 2, 1, 1); - float answer_data[] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, - 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 40, 41, - 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, - 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, - 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 80, 81, 82, 83, - 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, - 98, 99, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, - 113, 114, 115, 116, 117, 118, 119, 120}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - nntrainer::TensorV2 m = rangedV2(3, 1, 1, 1); - float answer_data[] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, - 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 41, 42, - 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 
- 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, - 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 83, 84, 85, - 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, - 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, - 114, 115, 116, 117, 118, 119, 120, 121}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5); - nntrainer::TensorV2 m = rangedV2(1, 1, 1, 1); - m.add_i(1.0); - float answer_data[] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, - 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, - 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, - 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, - 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, - 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, - 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, - 113, 114, 115, 116, 117, 118, 119, 120}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 5, 1, 4); - nntrainer::TensorV2 t = rangedV2(3, 5, 1, 4); - nntrainer::TensorV2 m = rangedV2(3, 1, 1, 4); - float answer_data[] = {0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, - 12, 14, 16, 18, 16, 18, 20, 22, 24, 26, 28, 30, - 28, 30, 32, 34, 32, 34, 36, 38, 36, 38, 40, 42, - 40, 42, 44, 46, 48, 50, 52, 54, 52, 54, 56, 58, - 56, 58, 60, 62, 60, 62, 64, 66, 64, 66, 68, 70}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(1, 1, 2, 1); - nntrainer::TensorV2 t = rangedV2(1, 1, 2, 1); - nntrainer::TensorV2 m = rangedV2(1, 1, 2, 1); - float answer_data[] = {0.0, 2.0}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(16, 1, 1, 1); - nntrainer::TensorV2 t = rangedV2(16, 1, 1, 1); - nntrainer::TensorV2 m = rangedV2(1, 1, 1, 1); - float answer_data[] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, - 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } -} - -TEST(nntrainer_Tensor, add_i_broadcast_not_supported_01_n) { - nntrainer::TensorV2 target(3, 1, 3, 1); - nntrainer::TensorV2 target2(3, 1, 3, 3); - - EXPECT_EQ(target.add_i(target2), ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, add_i_broadcast_not_broadcastable_02_n) { - nntrainer::TensorV2 target(3, 2, 4, 5); - nntrainer::TensorV2 target2(3, 2, 3, 1); - - EXPECT_EQ(target.add_i(target2), ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, add_01_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 result = input.add(1.0); - - float *data = result.getData(); - ASSERT_NE(nullptr, data); - float *indata = input.getData(); - ASSERT_NE(nullptr, indata); - - for (int i = 0; i < batch * height * width; ++i) { - if (data[i] != indata[i] + (float)1.0) { - status = ML_ERROR_RESULT_OUT_OF_RANGE; - break; - } - } 
- - EXPECT_EQ(status, ML_ERROR_NONE); -} - -TEST(nntrainer_Tensor, add_02_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 result = input.add(input); - - float *data = result.getData(); - ASSERT_NE(nullptr, data); - float *indata = input.getData(); - ASSERT_NE(nullptr, indata); - - for (int i = 0; i < batch * height * width; ++i) { - if (data[i] != indata[i] + indata[i]) { - status = ML_ERROR_RESULT_OUT_OF_RANGE; - break; - } - } - - EXPECT_EQ(status, ML_ERROR_NONE); -} - -TEST(nntrainer_Tensor, add_03_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 test(batch - 1, channel, height - 1, width - 1); - - EXPECT_THROW({ input.add(test); }, std::invalid_argument); -} - -TEST(nntrainer_Tensor, add_04_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(batch, channel, height, 2 * width); - nntrainer::TensorV2 shared_input = - input.getSharedDataTensor(dim, 0, false, ""); - nntrainer::TensorV2 test(dim); - - EXPECT_THROW(shared_input.add(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, add_05_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim); - nntrainer::TensorV2 test(batch, channel, height, 2 * width); - nntrainer::TensorV2 shared_test = test.getSharedDataTensor(dim, 0, false, ""); - - EXPECT_THROW(input.add(shared_test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, add_06_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim, false); - nntrainer::TensorV2 test(dim); - GEN_TEST_INPUT(test, i * (batch * height) + j * (width) + k + 1); - - EXPECT_THROW(input.add(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, add_07_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - nntrainer::TensorV2 test(dim, false); - - EXPECT_THROW(input.add(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, add_08_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - nntrainer::TensorV2 test(dim); - GEN_TEST_INPUT(test, i * (batch * height) + j * (width) + k + 2); - nntrainer::TensorV2 output(dim, false); - - EXPECT_THROW(input.add(test, output), std::invalid_argument); -} - -int main(int argc, char **argv) { - int result = -1; - - try { - testing::InitGoogleTest(&argc, argv); - } catch (...) { - std::cerr << "Error during InitGoogleTest" << std::endl; - return 0; - } - - try { - result = RUN_ALL_TESTS(); - } catch (...) 
{
-    std::cerr << "Error during RUN_ALL_TESTS()" << std::endl;
-  }
-
-  return result;
-}
diff --git a/test/unittest/unittest_nntrainer_tensor_v2_fp16.cpp b/test/unittest/unittest_nntrainer_tensor_v2_fp16.cpp
deleted file mode 100644
index d9b5743bd6..0000000000
--- a/test/unittest/unittest_nntrainer_tensor_v2_fp16.cpp
+++ /dev/null
@@ -1,2209 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-/**
- * Copyright (C) 2023 Donghyeon Jeong
- *
- * @file unittest_nntrainer_tensor_v2_fp16.cpp
- * @date 16 November 2023
- * @brief Unit test utility for tensor v2.
- * @see https://github.com/nnstreamer/nntrainer
- * @author 2023 Donghyeon Jeong
- * @bug No known bugs
- */
-#include
-
-#include "nntrainer_test_util.h"
-#include "util_func.h"
-#include
-#include
-#include
-#include
-
-TEST(nntrainer_Tensor, Tensor_01_p) {
-  int status = ML_ERROR_NONE;
-  nntrainer::TensorV2 tensor = nntrainer::TensorV2(
-    1, 2, 3, nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16);
-  tensor.setZero();
-  ASSERT_NE(nullptr, tensor.getData<_FP16>());
-  if (tensor.getValue<_FP16>(0, 0, 0, 0) != 0.0)
-    status = ML_ERROR_INVALID_PARAMETER;
-  EXPECT_EQ(status, ML_ERROR_NONE);
-}
-
-TEST(nntrainer_Tensor, Tensor_02_p) {
-  int status = ML_ERROR_NONE;
-  int height = 3;
-  int width = 10;
-  std::vector<std::vector<_FP16>> in;
-  for (int i = 0; i < height; ++i) {
-    std::vector<_FP16> tv;
-    for (int j = 0; j < width; ++j) {
-      tv.push_back(static_cast<_FP16>(i * 2.0 + j));
-    }
-    in.push_back(tv);
-  }
-
-  nntrainer::TensorV2 tensor = nntrainer::TensorV2(
-    in, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16});
-  ASSERT_NE(nullptr, tensor.getData<_FP16>());
-
-  if (tensor.getValue<_FP16>(0, 0, 0, 1) != 1.0)
-    status = ML_ERROR_INVALID_PARAMETER;
-  EXPECT_EQ(status, ML_ERROR_NONE);
-}
-
-TEST(nntrainer_Tensor, Tensor_02_nhwc_p) {
-  int status = ML_ERROR_NONE;
-  int width = 10;
-  int channel = 3;
-  std::vector<std::vector<_FP16>> in;
-  for (int i = 0; i < width; ++i) {
-    std::vector<_FP16> tv;
-    for (int j = 0; j < channel; ++j) {
-      tv.push_back(static_cast<_FP16>(i * 2.0 + j));
-    }
-    in.push_back(tv);
-  }
-
-  nntrainer::TensorV2 tensor = nntrainer::TensorV2(
-    in, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16});
-  ASSERT_NE(nullptr, tensor.getData<_FP16>());
-
-  if (tensor.getValue<_FP16>(0, 0, 0, 1) != 1.0)
-    status = ML_ERROR_INVALID_PARAMETER;
-  EXPECT_EQ(status, ML_ERROR_NONE);
-}
-
-TEST(nntrainer_Tensor, Tensor_03_p) {
-  int status = ML_ERROR_NONE;
-  int batch = 3;
-  int height = 3;
-  int width = 10;
-  std::vector<std::vector<std::vector<_FP16>>> in;
-
-  for (int k = 0; k < batch; ++k) {
-    std::vector<std::vector<_FP16>> ttv;
-    for (int i = 0; i < height; ++i) {
-      std::vector<_FP16> tv;
-      for (int j = 0; j < width; ++j) {
-        tv.push_back(static_cast<_FP16>(k * height * width + i * width + j));
-      }
-      ttv.push_back(tv);
-    }
-    in.push_back(ttv);
-  }
-
-  nntrainer::TensorV2 tensor = nntrainer::TensorV2(
-    in, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16});
-  ASSERT_NE(nullptr, tensor.getData<_FP16>());
-
-  if (tensor.getValue<_FP16>(0, 0, 0, 1) != 1.0)
-    status = ML_ERROR_INVALID_PARAMETER;
-  EXPECT_EQ(status, ML_ERROR_NONE);
-}
-
-TEST(nntrainer_Tensor, Tensor_04_p) {
-  int status = ML_ERROR_NONE;
-  int batch = 3;
-  int height = 3;
-  int width = 10;
-  std::vector<std::vector<std::vector<_FP16>>> in;
-
-  for (int k = 0; k < batch; ++k) {
-    std::vector<std::vector<_FP16>> ttv;
-    for (int i = 0; i < height; ++i) {
-      std::vector<_FP16> tv;
-      for (int j = 0; j < width; ++j) {
-        tv.push_back(k * height * width + i * width + j);
-      }
-      ttv.push_back(tv);
-    }
-    in.push_back(ttv);
-  }
-
-  nntrainer::TensorV2 t0 = nntrainer::TensorV2(
-    in, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16});
-
-  // copy assignment operator
-  nntrainer::TensorV2 t1 = t0;
-
-  if (t1.getValue<_FP16>(0, 0, 0, 1) != 1.0)
-    status = ML_ERROR_INVALID_PARAMETER;
-  EXPECT_EQ(status, ML_ERROR_NONE);
-
-  // comparison operator
-  EXPECT_EQ(t0, t1);
-}
-
-TEST(nntrainer_Tensor, Tensor_05_p) {
-  int status = ML_ERROR_NONE;
-  int batch = 3;
-  int height = 3;
-  int width = 10;
-  std::vector<std::vector<std::vector<_FP16>>> in;
-
-  for (int k = 0; k < batch; ++k) {
-    std::vector<std::vector<_FP16>> ttv;
-    for (int i = 0; i < height; ++i) {
-      std::vector<_FP16> tv;
-      for (int j = 0; j < width; ++j) {
-        tv.push_back(k * height * width + i * width + j);
-      }
-      ttv.push_back(tv);
-    }
-    in.push_back(ttv);
-  }
-
-  nntrainer::TensorV2 t0 = nntrainer::TensorV2(
-    in, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16});
-
-  // copy assignment operator
-  nntrainer::TensorV2 t1 = nntrainer::TensorV2(
-    batch, height, width, nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16);
-  t1.setRandNormal(2.3, 0.5);
-
-  _FP16 val_t0 = t0.getValue<_FP16>(0, 0, 0, 1);
-  _FP16 val_t1 = t1.getValue<_FP16>(0, 0, 0, 1);
-
-  swap(t0, t1);
-
-  if (t0.getValue<_FP16>(0, 0, 0, 1) != val_t1)
-    status = ML_ERROR_INVALID_PARAMETER;
-  EXPECT_EQ(status, ML_ERROR_NONE);
-
-  if (t1.getValue<_FP16>(0, 0, 0, 1) != val_t0)
-    status = ML_ERROR_INVALID_PARAMETER;
-  EXPECT_EQ(status, ML_ERROR_NONE);
-}
-
-TEST(nntrainer_Tensor, Tensor_06_p) {
-  int status = ML_ERROR_NONE;
-  int batch = 3;
-  int height = 3;
-  int width = 10;
-  std::vector<std::vector<std::vector<float>>> in;
-  std::vector<std::vector<std::vector<_FP16>>> in2;
-
-  for (int k = 0; k < batch; ++k) {
-    std::vector<std::vector<float>> ttv;
-    std::vector<std::vector<_FP16>> ttv2;
-    for (int i = 0; i < height; ++i) {
-      std::vector<float> tv;
-      std::vector<_FP16> tv2;
-      for (int j = 0; j < width; ++j) {
-        tv.push_back(k * height * width + i * width + j);
-        tv2.push_back(k * height * width + i * width + j);
-      }
-      ttv.push_back(tv);
-      ttv2.push_back(tv2);
-    }
-    in.push_back(ttv);
-    in2.push_back(ttv2);
-  }
-
-  nntrainer::TensorV2 t0 = nntrainer::TensorV2(
-    in, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32});
-  nntrainer::TensorV2 t1 = nntrainer::TensorV2(
-    in2, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16});
-
-  EXPECT_NE(t0, t1);
-}
-
-TEST(nntrainer_Tensor, empty_01) {
-  nntrainer::TensorV2 t("", nntrainer::Tformat::NCHW,
-                        nntrainer::Tdatatype::FP16);
-
-  EXPECT_TRUE(t.empty());
-}
-
-TEST(nntrainer_Tensor, empty_02) {
-  nntrainer::TensorV2 t(
-    {{1, 2, 3, 4}, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}},
-    false);
-
-  EXPECT_FALSE(t.empty());
-}
-
-TEST(nntrainer_Tensor, empty_03) {
-  nntrainer::TensorV2 t(
-    {{1, 2, 3, 4}, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}},
-    true);
-
-  EXPECT_FALSE(t.empty());
-}
-
-TEST(nntrainer_Tensor, allocate_01_n) {
-  nntrainer::TensorV2 t;
-  EXPECT_FALSE(t.isAllocated());
-
-  t.allocate();
-  EXPECT_FALSE(t.isAllocated());
-}
-
-TEST(nntrainer_Tensor, allocate_02_p) {
-  nntrainer::TensorV2 t(
-    {{1, 2, 3, 4}, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}},
-    false);
-  EXPECT_FALSE(t.isAllocated());
-
-  t.allocate();
-  EXPECT_TRUE(t.isAllocated());
-}
-
-TEST(nntrainer_Tensor, allocate_03_p) {
-  nntrainer::TensorV2 t(
-    {{1, 2, 3, 4}, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}},
-    true);
-  EXPECT_TRUE(t.isAllocated());
-
-  t.allocate();
-  EXPECT_TRUE(t.isAllocated());
-}
-
-TEST(nntrainer_Tensor, initialize_01_p) {
-  nntrainer::TensorV2 t(
-    {{1, 2, 3, 4}, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}},
-    true, nntrainer::Initializer::ONES);
-
-  nntrainer::TensorV2 golden(1, 2, 3, 4,
nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - golden.setValue(1); - - EXPECT_EQ(golden, t); -} - -TEST(nntrainer_Tensor, initialize_02_p) { - nntrainer::TensorV2 t( - {{1, 2, 3, 4}, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}}, - true); - - nntrainer::TensorV2 golden(1, 2, 3, 4, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - golden.setValue(1); - - EXPECT_NE(golden, t); - - t.initialize(nntrainer::Initializer::ONES); - EXPECT_EQ(golden, t); -} - -TEST(nntrainer_Tensor, initialize_03_p) { - nntrainer::TensorV2 t( - {{1, 2, 3, 4}, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}}, - false, nntrainer::Initializer::ONES); - t.allocate(); - - nntrainer::TensorV2 golden(1, 2, 3, 4, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - golden.setValue(1); - - EXPECT_EQ(golden, t); -} - -TEST(nntrainer_Tensor, initialize_04_p) { - nntrainer::TensorV2 t( - {{1, 2, 3, 4}, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}}, - false); - t.initialize(nntrainer::Initializer::ONES); - t.allocate(); - - nntrainer::TensorV2 golden(1, 2, 3, 4, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - ; - golden.setValue(1); - - EXPECT_EQ(golden, t); -} - -TEST(nntrainer_Tensor, initialize_05_p) { - nntrainer::TensorV2 t( - {{1, 2, 3, 4}, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}}, - false); - t.allocate(); - - nntrainer::TensorV2 golden(1, 2, 3, 4, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - golden.setValue(1.f); - - /** - * Ideally, it should be NE, but it can be equal due to no initialization - * EXPECT_NE(golden, t); - */ - - t.initialize(nntrainer::Initializer::ONES); - EXPECT_EQ(golden, t); -} - -TEST(nntrainer_Tensor, initialize_06_n) { - nntrainer::TensorV2 t( - {{1, 2, 3, 4}, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}}, - true, nntrainer::Initializer::ONES); - nntrainer::TensorV2 golden( - {{1, 2, 3, 4}, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}}, - true, nntrainer::Initializer::ZEROS); - - EXPECT_NE(golden, t); - - golden.initialize(nntrainer::Initializer::ONES); - EXPECT_EQ(golden, t); -} - -TEST(nntrainer_Tensor, initialize_07_p) { - nntrainer::TensorV2 t( - {{1, 2, 3, 4}, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}}, - true, nntrainer::Initializer::ONES); - - nntrainer::TensorV2 golden(1, 2, 3, 4, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - golden.setValue(1); - - EXPECT_EQ(golden, t); - - t.setValue(0, 0, 0, 0, 0); - t.setValue(0, 0, 0, t.size() - 1, 0); - EXPECT_NE(golden, t); - - t.initialize(); - EXPECT_EQ(golden, t); -} - -TEST(nntrainer_Tensor, initialize_08_p) { - nntrainer::TensorV2 t( - {{1, 2, 3, 4}, {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}}, - true, nntrainer::Initializer::ONES); - - nntrainer::TensorV2 golden(1, 2, 3, 4, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - golden.setValue(1); - - EXPECT_EQ(golden, t); - - t.initialize(nntrainer::Initializer::HE_NORMAL); - EXPECT_NE(golden, t); - - t.initialize(); - EXPECT_NE(golden, t); - - t.initialize(nntrainer::Initializer::ONES); - EXPECT_EQ(golden, t); - - t.initialize(); - EXPECT_EQ(golden, t); -} - -TEST(nntrainer_Tensor, multiply_i_01_fp16_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - - nntrainer::TensorV2 original; - 
original.copy(input); - - status = input.multiply_i(2.0); - EXPECT_EQ(status, ML_ERROR_NONE); - - _FP16 *data = original.getData<_FP16>(); - ASSERT_NE(nullptr, data); - _FP16 *indata = input.getData<_FP16>(); - ASSERT_NE(nullptr, indata); - - for (int i = 0; i < batch * height * width * channel; ++i) { - EXPECT_FLOAT_EQ(data[i] + data[i], indata[i]); - } -} - -TEST(nntrainer_Tensor, multiply_i_02_fp16_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - - nntrainer::TensorV2 original; - original.copy(input); - - status = input.multiply_i(input); - EXPECT_EQ(status, ML_ERROR_NONE); - - _FP16 *data = original.getData<_FP16>(); - ASSERT_NE(nullptr, data); - _FP16 *indata = input.getData<_FP16>(); - ASSERT_NE(nullptr, indata); - - for (int i = 0; i < batch * height * width * channel; ++i) { - EXPECT_FLOAT_EQ(data[i] * data[i], indata[i]); - } -} - -TEST(nntrainer_Tensor, multiply_i_03_fp16_n) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - - nntrainer::TensorV2 target2(batch, channel, height - 2, width - 1, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - status = input.multiply_i(target2); - - EXPECT_EQ(status, ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, multiply_i_broadcast_01_fp16_p) { - unsigned int N = 120; - _FP16 *answer_data = new _FP16[N]; - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(1, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - float float_data[] = { - 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, - 144, 169, 196, 225, 256, 289, 324, 361, 400, 441, 484, 529, - 576, 625, 676, 729, 784, 841, 900, 961, 1024, 1089, 1156, 1225, - 1296, 1369, 1444, 1521, 0, 41, 84, 129, 176, 225, 276, 329, - 384, 441, 500, 561, 624, 689, 756, 825, 896, 969, 1044, 1121, - 1200, 1281, 1364, 1449, 1536, 1625, 1716, 1809, 1904, 2001, 2100, 2201, - 2304, 2409, 2516, 2625, 2736, 2849, 2964, 3081, 0, 81, 164, 249, - 336, 425, 516, 609, 704, 801, 900, 1001, 1104, 1209, 1316, 1425, - 1536, 1649, 1764, 1881, 2000, 2121, 2244, 2369, 2496, 2625, 2756, 2889, - 3024, 3161, 3300, 3441, 3584, 3729, 3876, 4025, 4176, 4329, 4484, 4641}; - - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(3, 1, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - float float_data[] = { - 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, - 144, 169, 196, 225, 256, 289, 324, 361, 0, 21, 44, 69, - 96, 125, 156, 189, 224, 261, 300, 341, 384, 429, 476, 525, - 576, 629, 684, 741, 800, 861, 924, 
989, 1056, 1125, 1196, 1269, - 1344, 1421, 1500, 1581, 1664, 1749, 1836, 1925, 2016, 2109, 2204, 2301, - 1200, 1281, 1364, 1449, 1536, 1625, 1716, 1809, 1904, 2001, 2100, 2201, - 2304, 2409, 2516, 2625, 2736, 2849, 2964, 3081, 3200, 3321, 3444, 3569, - 3696, 3825, 3956, 4089, 4224, 4361, 4500, 4641, 4784, 4929, 5076, 5225, - 5376, 5529, 5684, 5841, 4000, 4141, 4284, 4429, 4576, 4725, 4876, 5029, - 5184, 5341, 5500, 5661, 5824, 5989, 6156, 6325, 6496, 6669, 6844, 7021}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(3, 2, 4, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - float float_data[] = { - 0, 0, 0, 0, 0, 5, 6, 7, 8, 9, 20, 22, - 24, 26, 28, 45, 48, 51, 54, 57, 80, 84, 88, 92, - 96, 125, 130, 135, 140, 145, 180, 186, 192, 198, 204, 245, - 252, 259, 266, 273, 320, 328, 336, 344, 352, 405, 414, 423, - 432, 441, 500, 510, 520, 530, 540, 605, 616, 627, 638, 649, - 720, 732, 744, 756, 768, 845, 858, 871, 884, 897, 980, 994, - 1008, 1022, 1036, 1125, 1140, 1155, 1170, 1185, 1280, 1296, 1312, 1328, - 1344, 1445, 1462, 1479, 1496, 1513, 1620, 1638, 1656, 1674, 1692, 1805, - 1824, 1843, 1862, 1881, 2000, 2020, 2040, 2060, 2080, 2205, 2226, 2247, - 2268, 2289, 2420, 2442, 2464, 2486, 2508, 2645, 2668, 2691, 2714, 2737}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(3, 1, 1, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - float float_data[] = { - 0, 1, 4, 9, 16, 0, 6, 14, 24, 36, 0, 11, - 24, 39, 56, 0, 16, 34, 54, 76, 0, 21, 44, 69, - 96, 0, 26, 54, 84, 116, 0, 31, 64, 99, 136, 0, - 36, 74, 114, 156, 200, 246, 294, 344, 396, 225, 276, 329, - 384, 441, 250, 306, 364, 424, 486, 275, 336, 399, 464, 531, - 300, 366, 434, 504, 576, 325, 396, 469, 544, 621, 350, 426, - 504, 584, 666, 375, 456, 539, 624, 711, 800, 891, 984, 1079, - 1176, 850, 946, 1044, 1144, 1246, 900, 1001, 1104, 1209, 1316, 950, - 1056, 1164, 1274, 1386, 1000, 1111, 1224, 1339, 1456, 1050, 1166, 1284, - 1404, 1526, 1100, 1221, 1344, 1469, 1596, 1150, 1276, 1404, 1534, 1666}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(1, 2, 1, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - float float_data[] = { - 0, 1, 4, 9, 16, 0, 6, 14, 24, 36, 0, 11, 24, 39, - 56, 0, 16, 34, 54, 76, 100, 126, 154, 184, 216, 125, 156, 189, - 
224, 261, 150, 186, 224, 264, 306, 175, 216, 259, 304, 351, 0, 41, - 84, 129, 176, 0, 46, 94, 144, 196, 0, 51, 104, 159, 216, 0, - 56, 114, 174, 236, 300, 366, 434, 504, 576, 325, 396, 469, 544, 621, - 350, 426, 504, 584, 666, 375, 456, 539, 624, 711, 0, 81, 164, 249, - 336, 0, 86, 174, 264, 356, 0, 91, 184, 279, 376, 0, 96, 194, - 294, 396, 500, 606, 714, 824, 936, 525, 636, 749, 864, 981, 550, 666, - 784, 904, 1026, 575, 696, 819, 944, 1071}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(3, 1, 4, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - float float_data[] = { - 0, 0, 0, 0, 0, 5, 6, 7, 8, 9, 20, 22, - 24, 26, 28, 45, 48, 51, 54, 57, 0, 0, 0, 0, - 0, 25, 26, 27, 28, 29, 60, 62, 64, 66, 68, 105, - 108, 111, 114, 117, 160, 164, 168, 172, 176, 225, 230, 235, - 240, 245, 300, 306, 312, 318, 324, 385, 392, 399, 406, 413, - 240, 244, 248, 252, 256, 325, 330, 335, 340, 345, 420, 426, - 432, 438, 444, 525, 532, 539, 546, 553, 640, 648, 656, 664, - 672, 765, 774, 783, 792, 801, 900, 910, 920, 930, 940, 1045, - 1056, 1067, 1078, 1089, 800, 808, 816, 824, 832, 945, 954, 963, - 972, 981, 1100, 1110, 1120, 1130, 1140, 1265, 1276, 1287, 1298, 1309}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(1, 1, 1, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - float float_data[] = { - 0, 1, 4, 9, 16, 0, 6, 14, 24, 36, 0, 11, 24, 39, 56, - 0, 16, 34, 54, 76, 0, 21, 44, 69, 96, 0, 26, 54, 84, 116, - 0, 31, 64, 99, 136, 0, 36, 74, 114, 156, 0, 41, 84, 129, 176, - 0, 46, 94, 144, 196, 0, 51, 104, 159, 216, 0, 56, 114, 174, 236, - 0, 61, 124, 189, 256, 0, 66, 134, 204, 276, 0, 71, 144, 219, 296, - 0, 76, 154, 234, 316, 0, 81, 164, 249, 336, 0, 86, 174, 264, 356, - 0, 91, 184, 279, 376, 0, 96, 194, 294, 396, 0, 101, 204, 309, 416, - 0, 106, 214, 324, 436, 0, 111, 224, 339, 456, 0, 116, 234, 354, 476}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(1, 2, 1, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - float float_data[] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, - 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, - 70, 71, 72, 73, 74, 75, 
76, 77, 78, 79, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, - 112, 113, 114, 115, 116, 117, 118, 119}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(3, 1, 1, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - float float_data[] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40, 41, - 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, - 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, - 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 160, 162, 164, 166, - 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, - 196, 198, 200, 202, 204, 206, 208, 210, 212, 214, 216, 218, 220, 222, - 224, 226, 228, 230, 232, 234, 236, 238}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 5, 1, 4, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 t = rangedV2(3, 5, 1, 4, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(3, 1, 1, 4, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - float float_data[] = {0, 1, 4, 9, 0, 5, 12, 21, 0, 9, - 20, 33, 0, 13, 28, 45, 0, 17, 36, 57, - 80, 105, 132, 161, 96, 125, 156, 189, 112, 145, - 180, 217, 128, 165, 204, 245, 144, 185, 228, 273, - 320, 369, 420, 473, 352, 405, 460, 517, 384, 441, - 500, 561, 416, 477, 540, 605, 448, 513, 580, 649}; - std::transform(float_data, float_data + 60, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.multiply_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - delete[] answer_data; -} - -TEST(nntrainer_Tensor, multiply_i_broadcast_not_supported_01_n) { - - nntrainer::TensorV2 target(3, 1, 3, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 target2(3, 1, 3, 3, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - EXPECT_EQ(target.multiply_i(target2), ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, multiply_i_broadcast_not_broadcastable_02_n) { - nntrainer::TensorV2 target(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 target2(3, 2, 3, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - EXPECT_EQ(target.multiply_i(target2), ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, multiply_01_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - - nntrainer::TensorV2 result = input.multiply(0.0); - if (result.getValue<_FP16>(0, 0, 1, 1) != 0.0) - status = ML_ERROR_RESULT_OUT_OF_RANGE; - EXPECT_EQ(status, 
ML_ERROR_NONE); -} - -TEST(nntrainer_Tensor, multiply_02_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 result = input.multiply(input); - - _FP16 *data = result.getData<_FP16>(); - ASSERT_NE(nullptr, data); - _FP16 *indata = input.getData<_FP16>(); - ASSERT_NE(nullptr, indata); - - for (int i = 0; i < batch * height * width; ++i) { - if (data[i] != indata[i] * indata[i]) { - status = ML_ERROR_RESULT_OUT_OF_RANGE; - break; - } - } - - EXPECT_EQ(status, ML_ERROR_NONE); -} - -TEST(nntrainer_Tensor, multiply_03_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 test(batch - 1, height - 1, width - 1, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - EXPECT_THROW({ input.multiply(test); }, std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_04_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(batch, channel, height, 2 * width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 shared_input = - input.getSharedDataTensor(dim, 0, false, ""); - nntrainer::TensorV2 test(dim); - - EXPECT_THROW(shared_input.multiply(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_05_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim); - nntrainer::TensorV2 test(batch, channel, height, 2 * width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 shared_test = test.getSharedDataTensor(dim, 0, false, ""); - - EXPECT_THROW(input.multiply(shared_test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_06_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim, false); - nntrainer::TensorV2 test(dim); - GEN_TEST_INPUT(test, i * (batch * height) + j * (width) + k + 1); - - EXPECT_THROW(input.multiply(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_07_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - nntrainer::TensorV2 test(dim, false); - - EXPECT_THROW(input.multiply(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_08_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - nntrainer::TensorV2 test(dim); - GEN_TEST_INPUT(test, i * (batch * height) + j * (width) + k + 2); - nntrainer::TensorV2 output(dim, false); - - EXPECT_THROW(input.multiply(test, output), std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_float_01_p) { - int batch 
= 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 expected(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(expected, (i * (batch * height) + j * (width) + k + 1) * 2); - - nntrainer::TensorV2 result = input.multiply(2.0); - - EXPECT_EQ(result, expected); -} - -TEST(nntrainer_Tensor, multiply_strided_01_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input( - batch, channel, height, width, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 result = input.multiply_strided(input); - - _FP16 *data = result.getData<_FP16>(); - ASSERT_NE(nullptr, data); - _FP16 *indata = input.getData<_FP16>(); - ASSERT_NE(nullptr, indata); - - _FP16 *outdata = new _FP16[(input.size())]; - - std::transform(indata, indata + batch * height * width * channel, indata, - outdata, std::multiplies<_FP16>()); - - for (int i = 0; i < batch * height * width; ++i) { - if (data[i] != outdata[i]) { - status = ML_ERROR_RESULT_OUT_OF_RANGE; - break; - } - } - - delete[] outdata; - - EXPECT_EQ(status, ML_ERROR_NONE); -} - -TEST(nntrainer_Tensor, multiply_strided_02_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input( - batch, channel, height, width, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 test(batch - 1, height - 1, width - 1); - - EXPECT_THROW({ input.multiply_strided(test); }, std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_strided_03_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim( - batch, channel, height, width, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}); - // input is not allocated now : alloc_now == false - nntrainer::TensorV2 input(dim, false); - nntrainer::TensorV2 test(dim); - GEN_TEST_INPUT(test, i * (batch * height) + j * (width) + k + 1); - - EXPECT_THROW(input.multiply_strided(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_strided_04_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim( - batch, channel, height, width, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}); - - nntrainer::TensorV2 input(dim); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - // test is not allocated. 
- nntrainer::TensorV2 test(dim, false); - - EXPECT_THROW(input.multiply_strided(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_strided_05_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim( - batch, channel, height, width, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}); - - nntrainer::TensorV2 input(dim); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - nntrainer::TensorV2 test(dim); - GEN_TEST_INPUT(test, i * (batch * height) + j * (width) + k + 1); - // output is not allocated - nntrainer::TensorV2 output(dim, false); - - EXPECT_THROW(input.multiply_strided(test, output), std::invalid_argument); -} - -TEST(nntrainer_Tensor, multiply_strided_06_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input( - batch, channel, height, width, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 output( - batch, channel, height, width, - {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}); - GEN_TEST_INPUT(output, i * (batch * height) + j * (width) + k + 1); - - _FP16 *indata = input.getData<_FP16>(); - ASSERT_NE(nullptr, indata); - - _FP16 *outdata_beta = new _FP16[(input.size())]; - _FP16 *indata_mul = new _FP16[(input.size())]; - _FP16 *outdata = new _FP16[(input.size())]; - - std::transform(indata, indata + batch * height * width * channel, - outdata_beta, - std::bind(std::multiplies<_FP16>(), std::placeholders::_1, - static_cast<_FP16>(10.0))); - - std::transform(indata, indata + batch * height * width * channel, indata, - indata_mul, std::multiplies<_FP16>()); - std::transform(indata_mul, indata_mul + batch * height * width * channel, - outdata_beta, outdata, std::plus<_FP16>()); - - input.multiply_strided(input, output, 10.0); - - _FP16 *data = output.getData<_FP16>(); - ASSERT_NE(nullptr, data); - - for (int i = 0; i < batch * height * width; ++i) { - if (data[i] != outdata[i]) { - status = ML_ERROR_RESULT_OUT_OF_RANGE; - break; - } - } - - delete[] outdata_beta; - delete[] indata_mul; - delete[] outdata; - - EXPECT_EQ(status, ML_ERROR_NONE); -} - -TEST(nntrainer_Tensor, divide_i_01_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - - nntrainer::TensorV2 original; - original.copy(input); - - status = input.divide_i(2.0f); - EXPECT_EQ(status, ML_ERROR_NONE); - - _FP16 *data = original.getData<_FP16>(); - ASSERT_NE(nullptr, data); - _FP16 *indata = input.getData<_FP16>(); - ASSERT_NE(nullptr, indata); - - for (int i = 0; i < batch * height * width * channel; ++i) { - EXPECT_FLOAT_EQ(data[i], indata[i] + indata[i]); - } -} - -TEST(nntrainer_Tensor, divide_i_02_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - status = input.divide_i(input); - EXPECT_EQ(status, ML_ERROR_NONE); - _FP16 *indata = input.getData<_FP16>(); - ASSERT_NE(nullptr, indata); - - for (int i = 0; i < batch * height * width * channel; ++i) { - 
EXPECT_FLOAT_EQ(indata[i], _FP16(1.0)); - } -} - -TEST(nntrainer_Tensor, divide_i_01_n) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - - status = input.divide_i((_FP16)0); - EXPECT_EQ(status, ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, divide_i_02_n) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k); - - nntrainer::TensorV2 original(batch, channel, height - 2, width - 1, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - status = input.divide_i(original); - EXPECT_EQ(status, ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, divide_01_p) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 result = input.divide(1.0); - - _FP16 *previous = input.getData<_FP16>(); - ASSERT_NE(nullptr, previous); - _FP16 *data = result.getData<_FP16>(); - ASSERT_NE(nullptr, data); - - for (int i = 0; i < batch * height * width * channel; ++i) { - EXPECT_FLOAT_EQ(data[i], previous[i]); - } -} - -TEST(nntrainer_Tensor, divide_02_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - EXPECT_THROW({ input.divide(0.0); }, std::invalid_argument); -} - -TEST(nntrainer_Tensor, divide_03_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 test(batch - 1, channel, height - 1, width - 1, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - EXPECT_THROW({ input.divide(test); }, std::invalid_argument); -} - -TEST(nntrainer_Tensor, divide_04_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(batch, channel, height, 2 * width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 shared_input = - input.getSharedDataTensor(dim, 0, false, ""); - nntrainer::TensorV2 test(dim); - - EXPECT_THROW(shared_input.divide(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, divide_05_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim); - nntrainer::TensorV2 test(batch, channel, height, 2 * width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 shared_test = test.getSharedDataTensor(dim, 0, false, ""); - - EXPECT_THROW(input.divide(shared_test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, divide_06_n) { - int batch = 3; - int channel = 1; - 
int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim, false); - nntrainer::TensorV2 test(dim); - GEN_TEST_INPUT(test, i * (batch * height) + j * (width) + k + 1); - - EXPECT_THROW(input.divide(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, divide_07_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width); - - nntrainer::TensorV2 input(dim); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - nntrainer::TensorV2 test(dim, false); - - EXPECT_THROW(input.divide(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, divide_08_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - nntrainer::TensorV2 input(dim); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - nntrainer::TensorV2 test(dim); - GEN_TEST_INPUT(test, i * (batch * height) + j * (width) + k + 2); - nntrainer::TensorV2 output(dim, false); - - EXPECT_THROW(input.divide(test, output), std::invalid_argument); -} - -TEST(nntrainer_Tensor, divide_i_broadcast_01_p) { - unsigned int N = 120; - _FP16 *answer_data = new _FP16[N]; - nntrainer::TensorDim ref_dim(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(1, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - m.add_i(1); - float float_data[] = { - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 41.0, 21.0, - 14.333333, 11.0, 9.0, 7.6666665, 6.714286, 6.0, - 5.4444447, 5.0, 4.6363635, 4.3333335, 4.076923, 3.857143, - 3.6666667, 3.5, 3.3529413, 3.2222223, 3.1052632, 3.0, - 2.9047618, 2.8181818, 2.7391305, 2.6666667, 2.6, 2.5384614, - 2.4814816, 2.4285715, 2.3793104, 2.3333333, 2.2903225, 2.25, - 2.2121212, 2.1764705, 2.142857, 2.1111112, 2.0810812, 2.0526316, - 2.025641, 2.0, 81.0, 41.0, 27.666666, 21.0, - 17.0, 14.333333, 12.428572, 11.0, 9.888889, 9.0, - 8.272727, 7.6666665, 7.1538463, 6.714286, 6.3333335, 6.0, - 5.7058825, 5.4444447, 5.2105265, 5.0, 4.8095236, 4.6363635, - 4.478261, 4.3333335, 4.2, 4.076923, 3.9629629, 3.857143, - 3.7586207, 3.6666667, 3.580645, 3.5, 3.4242425, 3.3529413, - 3.2857144, 3.2222223, 3.162162, 3.1052632, 3.0512822, 3.0}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(3, 1, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - m.add_i(1); - float float_data[] = { - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 21.0, 11.0, 7.6666665, 6.0, - 5.0, 4.3333335, 3.857143, 3.5, 3.2222223, 3.0, - 2.8181818, 2.6666667, 2.5384614, 2.4285715, 2.3333333, 2.25, - 2.1764705, 2.1111112, 2.0526316, 2.0, 1.9523809, 1.9090909, - 1.8695652, 1.8333334, 1.8, 1.7692307, 
1.7407408, 1.7142857, - 1.6896552, 1.6666666, 1.6451613, 1.625, 1.6060606, 1.5882353, - 1.5714285, 1.5555556, 1.5405406, 1.5263158, 1.5128205, 1.5, - 2.9047618, 2.8181818, 2.7391305, 2.6666667, 2.6, 2.5384614, - 2.4814816, 2.4285715, 2.3793104, 2.3333333, 2.2903225, 2.25, - 2.2121212, 2.1764705, 2.142857, 2.1111112, 2.0810812, 2.0526316, - 2.025641, 2.0, 1.9756098, 1.9523809, 1.9302325, 1.9090909, - 1.8888888, 1.8695652, 1.8510638, 1.8333334, 1.8163265, 1.8, - 1.7843137, 1.7692307, 1.754717, 1.7407408, 1.7272727, 1.7142857, - 1.7017543, 1.6896552, 1.6779661, 1.6666666, 2.4634147, 2.4285715, - 2.3953488, 2.3636363, 2.3333333, 2.3043478, 2.2765958, 2.25, - 2.2244897, 2.2, 2.1764705, 2.1538463, 2.1320755, 2.1111112, - 2.090909, 2.0714285, 2.0526316, 2.0344827, 2.0169492, 2.0}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(3, 2, 4, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - m.add_i(1); - float float_data[] = { - 1.0, 2.0, 3.0, 4.0, 5.0, 3.0, - 3.5, 4.0, 4.5, 5.0, 3.6666667, 4.0, - 4.3333335, 4.6666665, 5.0, 4.0, 4.25, 4.5, - 4.75, 5.0, 4.2, 4.4, 4.6, 4.8, - 5.0, 4.3333335, 4.5, 4.6666665, 4.8333335, 5.0, - 4.428571, 4.571429, 4.714286, 4.857143, 5.0, 4.5, - 4.625, 4.75, 4.875, 5.0, 4.5555553, 4.6666665, - 4.7777777, 4.888889, 5.0, 4.6, 4.7, 4.8, - 4.9, 5.0, 4.6363635, 4.7272725, 4.818182, 4.909091, - 5.0, 4.6666665, 4.75, 4.8333335, 4.9166665, 5.0, - 4.6923075, 4.769231, 4.8461537, 4.923077, 5.0, 4.714286, - 4.785714, 4.857143, 4.928571, 5.0, 4.733333, 4.8, - 4.866667, 4.9333334, 5.0, 4.75, 4.8125, 4.875, - 4.9375, 5.0, 4.7647057, 4.8235292, 4.882353, 4.9411764, - 5.0, 4.7777777, 4.8333335, 4.888889, 4.9444447, 5.0, - 4.7894735, 4.8421054, 4.894737, 4.9473686, 5.0, 4.8, - 4.85, 4.9, 4.95, 5.0, 4.8095236, 4.857143, - 4.904762, 4.952381, 5.0, 4.818182, 4.8636365, 4.909091, - 4.9545455, 5.0, 4.826087, 4.869565, 4.9130435, 4.9565215, - 5.0, 4.8333335, 4.875, 4.9166665, 4.9583335, 5.0}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(3, 1, 1, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - m.add_i(1); - float float_data[] = { - 1.0, 1.0, 1.0, 1.0, 1.0, 6.0, - 3.5, 2.6666667, 2.25, 2.0, 11.0, 6.0, - 4.3333335, 3.5, 3.0, 16.0, 8.5, 6.0, - 4.75, 4.0, 21.0, 11.0, 7.6666665, 6.0, - 5.0, 26.0, 13.5, 9.333333, 7.25, 6.0, - 31.0, 16.0, 11.0, 8.5, 7.0, 36.0, - 18.5, 12.666667, 9.75, 8.0, 6.8333335, 6.0, - 5.375, 4.888889, 4.5, 7.6666665, 6.714286, 6.0, - 5.4444447, 5.0, 8.5, 7.428571, 6.625, 6.0, - 5.5, 9.333333, 8.142858, 7.25, 6.5555553, 6.0, - 10.166667, 8.857142, 7.875, 7.111111, 6.5, 11.0, - 9.571428, 8.5, 7.6666665, 7.0, 11.833333, 10.285714, - 9.125, 8.222222, 7.5, 12.666667, 11.0, 9.75, - 8.777778, 8.0, 7.3636365, 6.8333335, 6.3846154, 6.0, - 5.6666665, 7.818182, 7.25, 6.769231, 6.357143, 6.0, - 8.272727, 7.6666665, 7.1538463, 6.714286, 6.3333335, 8.727273, - 8.083333, 7.5384617, 7.071429, 
6.6666665, 9.181818, 8.5, - 7.923077, 7.428571, 7.0, 9.636364, 8.916667, 8.307693, - 7.785714, 7.3333335, 10.090909, 9.333333, 8.692307, 8.142858, - 7.6666665, 10.545455, 9.75, 9.076923, 8.5, 8.0}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(1, 2, 1, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - m.add_i(1); - float float_data[] = { - 1.0, 1.0, 1.0, 1.0, 1.0, 6.0, - 3.5, 2.6666667, 2.25, 2.0, 11.0, 6.0, - 4.3333335, 3.5, 3.0, 16.0, 8.5, 6.0, - 4.75, 4.0, 3.5, 3.142857, 2.875, 2.6666667, - 2.5, 4.3333335, 3.857143, 3.5, 3.2222223, 3.0, - 5.1666665, 4.571429, 4.125, 3.7777777, 3.5, 6.0, - 5.285714, 4.75, 4.3333335, 4.0, 41.0, 21.0, - 14.333333, 11.0, 9.0, 46.0, 23.5, 16.0, - 12.25, 10.0, 51.0, 26.0, 17.666666, 13.5, - 11.0, 56.0, 28.5, 19.333334, 14.75, 12.0, - 10.166667, 8.857142, 7.875, 7.111111, 6.5, 11.0, - 9.571428, 8.5, 7.6666665, 7.0, 11.833333, 10.285714, - 9.125, 8.222222, 7.5, 12.666667, 11.0, 9.75, - 8.777778, 8.0, 81.0, 41.0, 27.666666, 21.0, - 17.0, 86.0, 43.5, 29.333334, 22.25, 18.0, - 91.0, 46.0, 31.0, 23.5, 19.0, 96.0, - 48.5, 32.666668, 24.75, 20.0, 16.833334, 14.571428, - 12.875, 11.555555, 10.5, 17.666666, 15.285714, 13.5, - 12.111111, 11.0, 18.5, 16.0, 14.125, 12.666667, - 11.5, 19.333334, 16.714285, 14.75, 13.222222, 12.0}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(3, 1, 4, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - m.add_i(1); - float float_data[] = { - 1.0, 2.0, 3.0, 4.0, 5.0, 3.0, - 3.5, 4.0, 4.5, 5.0, 3.6666667, 4.0, - 4.3333335, 4.6666665, 5.0, 4.0, 4.25, 4.5, - 4.75, 5.0, 21.0, 22.0, 23.0, 24.0, - 25.0, 13.0, 13.5, 14.0, 14.5, 15.0, - 10.333333, 10.666667, 11.0, 11.333333, 11.666667, 9.0, - 9.25, 9.5, 9.75, 10.0, 8.2, 8.4, - 8.6, 8.8, 9.0, 7.6666665, 7.8333335, 8.0, - 8.166667, 8.333333, 7.285714, 7.428571, 7.571429, 7.714286, - 7.857143, 7.0, 7.125, 7.25, 7.375, 7.5, - 12.2, 12.4, 12.6, 12.8, 13.0, 11.0, - 11.166667, 11.333333, 11.5, 11.666667, 10.142858, 10.285714, - 10.428572, 10.571428, 10.714286, 9.5, 9.625, 9.75, - 9.875, 10.0, 9.0, 9.111111, 9.222222, 9.333333, - 9.444445, 8.6, 8.7, 8.8, 8.9, 9.0, - 8.272727, 8.363636, 8.454545, 8.545455, 8.636364, 8.0, - 8.083333, 8.166667, 8.25, 8.333333, 11.222222, 11.333333, - 11.444445, 11.555555, 11.666667, 10.6, 10.7, 10.8, - 10.9, 11.0, 10.090909, 10.181818, 10.272727, 10.363636, - 10.454545, 9.666667, 9.75, 9.833333, 9.916667, 10.0}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(1, 1, 1, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - m.add_i(1); - 
float float_data[] = { - 1.0, 1.0, 1.0, 1.0, 1.0, 6.0, 3.5, 2.6666667, 2.25, 2.0, - 11.0, 6.0, 4.3333335, 3.5, 3.0, 16.0, 8.5, 6.0, 4.75, 4.0, - 21.0, 11.0, 7.6666665, 6.0, 5.0, 26.0, 13.5, 9.333333, 7.25, 6.0, - 31.0, 16.0, 11.0, 8.5, 7.0, 36.0, 18.5, 12.666667, 9.75, 8.0, - 41.0, 21.0, 14.333333, 11.0, 9.0, 46.0, 23.5, 16.0, 12.25, 10.0, - 51.0, 26.0, 17.666666, 13.5, 11.0, 56.0, 28.5, 19.333334, 14.75, 12.0, - 61.0, 31.0, 21.0, 16.0, 13.0, 66.0, 33.5, 22.666666, 17.25, 14.0, - 71.0, 36.0, 24.333334, 18.5, 15.0, 76.0, 38.5, 26.0, 19.75, 16.0, - 81.0, 41.0, 27.666666, 21.0, 17.0, 86.0, 43.5, 29.333334, 22.25, 18.0, - 91.0, 46.0, 31.0, 23.5, 19.0, 96.0, 48.5, 32.666668, 24.75, 20.0, - 101.0, 51.0, 34.333332, 26.0, 21.0, 106.0, 53.5, 36.0, 27.25, 22.0, - 111.0, 56.0, 37.666668, 28.5, 23.0, 116.0, 58.5, 39.333332, 29.75, 24.0}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(1, 2, 1, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - m.add_i(1); - float float_data[] = { - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, - 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 10.5, 11.0, 11.5, 12.0, - 12.5, 13.0, 13.5, 14.0, 14.5, 15.0, 15.5, 16.0, 16.5, 17.0, 17.5, 18.0, - 18.5, 19.0, 19.5, 20.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, - 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, - 30.5, 31.0, 31.5, 32.0, 32.5, 33.0, 33.5, 34.0, 34.5, 35.0, 35.5, 36.0, - 36.5, 37.0, 37.5, 38.0, 38.5, 39.0, 39.5, 40.0, 81.0, 82.0, 83.0, 84.0, - 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, - 97.0, 98.0, 99.0, 100.0, 50.5, 51.0, 51.5, 52.0, 52.5, 53.0, 53.5, 54.0, - 54.5, 55.0, 55.5, 56.0, 56.5, 57.0, 57.5, 58.0, 58.5, 59.0, 59.5, 60.0}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(3, 1, 1, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - m.add_i(1); - float float_data[] = { - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, - 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, - 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, - 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, - 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, - 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, - 37.0, 38.0, 39.0, 40.0, 20.5, 21.0, - 21.5, 22.0, 22.5, 23.0, 23.5, 24.0, - 24.5, 25.0, 25.5, 26.0, 26.5, 27.0, - 27.5, 28.0, 28.5, 29.0, 29.5, 30.0, - 30.5, 31.0, 31.5, 32.0, 32.5, 33.0, - 33.5, 34.0, 34.5, 35.0, 35.5, 36.0, - 36.5, 37.0, 37.5, 38.0, 38.5, 39.0, - 39.5, 40.0, 27.0, 27.333334, 27.666666, 28.0, - 28.333334, 28.666666, 29.0, 29.333334, 29.666666, 30.0, - 30.333334, 30.666666, 31.0, 31.333334, 31.666666, 32.0, - 32.333332, 32.666668, 33.0, 33.333332, 33.666668, 34.0, - 34.333332, 34.666668, 35.0, 35.333332, 35.666668, 36.0, - 36.333332, 36.666668, 37.0, 37.333332, 37.666668, 38.0, - 38.333332, 38.666668, 39.0, 39.333332, 39.666668, 40.0}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - 
nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 5, 1, 4, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 t = rangedV2(3, 5, 1, 4, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - t.add_i(1); - nntrainer::TensorV2 m = rangedV2(3, 1, 1, 4, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - m.add_i(1); - float float_data[] = { - 1.0, 1.0, 1.0, 1.0, 5.0, 3.0, - 2.3333333, 2.0, 9.0, 5.0, 3.6666667, 3.0, - 13.0, 7.0, 5.0, 4.0, 17.0, 9.0, - 6.3333335, 5.0, 4.2, 3.6666667, 3.2857144, 3.0, - 5.0, 4.3333335, 3.857143, 3.5, 5.8, 5.0, - 4.428571, 4.0, 6.6, 5.6666665, 5.0, 4.5, - 7.4, 6.3333335, 5.571429, 5.0, 4.5555553, 4.2, - 3.909091, 3.6666667, 5.0, 4.6, 4.2727275, 4.0, - 5.4444447, 5.0, 4.6363635, 4.3333335, 5.888889, 5.4, - 5.0, 4.6666665, 6.3333335, 5.8, 5.3636365, 5.0}; - std::transform(float_data, float_data + 60, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.divide_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - delete[] answer_data; -} - -TEST(nntrainer_Tensor, divide_i_broadcast_not_supported_01_n) { - nntrainer::TensorV2 target(3, 1, 3, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 target2(3, 1, 3, 3, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - EXPECT_EQ(target.divide_i(target2), ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, divide_i_broadcast_not_broadcastable_02_n) { - nntrainer::TensorV2 target(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 target2(3, 2, 3, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - EXPECT_EQ(target.divide_i(target2), ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, add_i_01_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int height = 3; - int width = 10; - int channel = 1; - - nntrainer::TensorV2 target(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(target, i * (batch * height) + j * (width) + k + 1 + channel); - - nntrainer::TensorV2 original(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - original.copy(target); - - status = target.add_i((_FP16)2.1); - EXPECT_EQ(status, ML_ERROR_NONE); - - _FP16 *previous = original.getData<_FP16>(); - ASSERT_NE(nullptr, previous); - _FP16 *data = target.getData<_FP16>(); - ASSERT_NE(nullptr, data); - - for (int i = 0; i < batch * height * width; ++i) { - EXPECT_FLOAT_EQ(data[i], (_FP16)(previous[i] + (_FP16)2.1)); - } -} - -TEST(nntrainer_Tensor, add_i_02_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int height = 3; - int width = 10; - int channel = 1; - - nntrainer::TensorV2 target(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(target, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 original(batch, height, width, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - original.copy(target); - - status = target.add_i(target, 3.0); - EXPECT_EQ(status, ML_ERROR_NONE); - - _FP16 *previous = original.getData<_FP16>(); - ASSERT_NE(nullptr, previous); - _FP16 *data = target.getData<_FP16>(); - ASSERT_NE(nullptr, data); - - for (int i = 0; i < batch * height * width; ++i) { - EXPECT_FLOAT_EQ(data[i], previous[i] * 4.0); - } -} - -// /** 
-// * @brief operand dimension is not right -// */ -TEST(nntrainer_Tensor, add_i_01_n) { - int status = ML_ERROR_NONE; - int batch = 3; - int height = 3; - int width = 10; - int channel = 1; - - nntrainer::TensorV2 target(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(target, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 target2(batch, height - 2, width - 3, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - status = target.add_i(target2); - EXPECT_EQ(status, ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, add_i_broadcast_01_p) { - unsigned int N = 120; - _FP16 *answer_data = new _FP16[N]; - nntrainer::TensorDim ref_dim(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(1, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - float float_data[] = { - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, - 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, - 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 40, 42, - 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, - 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, - 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 80, 82, 84, 86, - 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, - 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, - 144, 146, 148, 150, 152, 154, 156, 158}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(3, 1, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - float float_data[] = { - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, - 28, 30, 32, 34, 36, 38, 20, 22, 24, 26, 28, 30, 32, 34, - 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, - 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, - 92, 94, 96, 98, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, - 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, - 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, - 156, 158, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, - 164, 166, 168, 170, 172, 174, 176, 178}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(3, 2, 4, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - float float_data[] = { - 0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, - 16, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 30, 31, 32, - 33, 34, 36, 37, 38, 39, 40, 42, 43, 44, 45, 46, 48, 49, - 50, 51, 52, 54, 55, 56, 57, 58, 60, 61, 62, 63, 64, 66, - 67, 68, 69, 70, 72, 73, 74, 75, 76, 78, 79, 80, 81, 82, - 84, 85, 86, 87, 88, 90, 91, 92, 93, 94, 96, 97, 98, 99, - 100, 102, 103, 104, 105, 106, 108, 109, 110, 111, 112, 114, 115, 116, - 117, 118, 120, 121, 122, 123, 124, 126, 127, 128, 129, 130, 132, 133, - 134, 135, 136, 
138, 139, 140, 141, 142}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(3, 1, 1, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - float float_data[] = { - 0, 2, 4, 6, 8, 5, 7, 9, 11, 13, 10, 12, 14, 16, - 18, 15, 17, 19, 21, 23, 20, 22, 24, 26, 28, 25, 27, 29, - 31, 33, 30, 32, 34, 36, 38, 35, 37, 39, 41, 43, 45, 47, - 49, 51, 53, 50, 52, 54, 56, 58, 55, 57, 59, 61, 63, 60, - 62, 64, 66, 68, 65, 67, 69, 71, 73, 70, 72, 74, 76, 78, - 75, 77, 79, 81, 83, 80, 82, 84, 86, 88, 90, 92, 94, 96, - 98, 95, 97, 99, 101, 103, 100, 102, 104, 106, 108, 105, 107, 109, - 111, 113, 110, 112, 114, 116, 118, 115, 117, 119, 121, 123, 120, 122, - 124, 126, 128, 125, 127, 129, 131, 133}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(1, 2, 1, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - float float_data[] = { - 0, 2, 4, 6, 8, 5, 7, 9, 11, 13, 10, 12, 14, 16, - 18, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 30, 32, 34, - 36, 38, 35, 37, 39, 41, 43, 40, 42, 44, 46, 48, 40, 42, - 44, 46, 48, 45, 47, 49, 51, 53, 50, 52, 54, 56, 58, 55, - 57, 59, 61, 63, 65, 67, 69, 71, 73, 70, 72, 74, 76, 78, - 75, 77, 79, 81, 83, 80, 82, 84, 86, 88, 80, 82, 84, 86, - 88, 85, 87, 89, 91, 93, 90, 92, 94, 96, 98, 95, 97, 99, - 101, 103, 105, 107, 109, 111, 113, 110, 112, 114, 116, 118, 115, 117, - 119, 121, 123, 120, 122, 124, 126, 128}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(3, 1, 4, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - float float_data[] = { - 0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, - 16, 18, 19, 20, 21, 22, 20, 21, 22, 23, 24, 26, 27, 28, - 29, 30, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 44, 45, - 46, 47, 48, 50, 51, 52, 53, 54, 56, 57, 58, 59, 60, 62, - 63, 64, 65, 66, 64, 65, 66, 67, 68, 70, 71, 72, 73, 74, - 76, 77, 78, 79, 80, 82, 83, 84, 85, 86, 88, 89, 90, 91, - 92, 94, 95, 96, 97, 98, 100, 101, 102, 103, 104, 106, 107, 108, - 109, 110, 108, 109, 110, 111, 112, 114, 115, 116, 117, 118, 120, 121, - 122, 123, 124, 126, 127, 128, 129, 130}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(1, 1, 1, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - float float_data[] = { - 0, 2, 4, 6, 8, 5, 7, 9, 11, 13, 10, 12, 14, 16, - 18, 15, 17, 19, 21, 23, 20, 22, 24, 26, 28, 25, 
27, 29, - 31, 33, 30, 32, 34, 36, 38, 35, 37, 39, 41, 43, 40, 42, - 44, 46, 48, 45, 47, 49, 51, 53, 50, 52, 54, 56, 58, 55, - 57, 59, 61, 63, 60, 62, 64, 66, 68, 65, 67, 69, 71, 73, - 70, 72, 74, 76, 78, 75, 77, 79, 81, 83, 80, 82, 84, 86, - 88, 85, 87, 89, 91, 93, 90, 92, 94, 96, 98, 95, 97, 99, - 101, 103, 100, 102, 104, 106, 108, 105, 107, 109, 111, 113, 110, 112, - 114, 116, 118, 115, 117, 119, 121, 123}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(1, 2, 1, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - float float_data[] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, - 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 40, 41, - 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, - 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, - 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 80, 81, 82, 83, - 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, - 98, 99, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, - 113, 114, 115, 116, 117, 118, 119, 120}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(3, 1, 1, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - float float_data[] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, - 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 41, 42, - 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, - 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, - 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 83, 84, 85, - 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, - 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, - 114, 115, 116, 117, 118, 119, 120, 121}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorV2 t = rangedV2(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(1, 1, 1, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - m.add_i(1.0); - float float_data[] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, - 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, - 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, - 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, - 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, - 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, - 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, - 113, 114, 115, 116, 117, 118, 119, 120}; - std::transform(float_data, float_data + N, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, 
ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(3, 5, 1, 4, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 t = rangedV2(3, 5, 1, 4, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(3, 1, 1, 4, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - float float_data[] = {0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, - 12, 14, 16, 18, 16, 18, 20, 22, 24, 26, 28, 30, - 28, 30, 32, 34, 32, 34, 36, 38, 36, 38, 40, 42, - 40, 42, 44, 46, 48, 50, 52, 54, 52, 54, 56, 58, - 56, 58, 60, 62, 60, 62, 64, 66, 64, 66, 68, 70}; - std::transform(float_data, float_data + 60, answer_data, - static_cast_func<_FP16>()); - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(1, 1, 2, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 t = rangedV2(1, 1, 2, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(1, 1, 2, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - _FP16 answer_data[] = {static_cast<_FP16>(0.0), static_cast<_FP16>(2.0)}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - { - nntrainer::TensorDim ref_dim(16, 1, 1, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 t = rangedV2(16, 1, 1, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 m = rangedV2(1, 1, 1, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - _FP16 answer_data[] = {static_cast<_FP16>(0.0), static_cast<_FP16>(1.0), - static_cast<_FP16>(2.0), static_cast<_FP16>(3.0), - static_cast<_FP16>(4.0), static_cast<_FP16>(5.0), - static_cast<_FP16>(6.0), static_cast<_FP16>(7.0), - static_cast<_FP16>(8.0), static_cast<_FP16>(9.0), - static_cast<_FP16>(10.0), static_cast<_FP16>(11.0), - static_cast<_FP16>(12.0), static_cast<_FP16>(13.0), - static_cast<_FP16>(14.0), static_cast<_FP16>(15.0)}; - nntrainer::TensorV2 answer(ref_dim, answer_data); - int status = t.add_i(m); - EXPECT_EQ(status, ML_ERROR_NONE); - EXPECT_EQ(t, answer); - } - delete[] answer_data; -} - -TEST(nntrainer_Tensor, add_i_broadcast_not_supported_01_n) { - nntrainer::TensorV2 target(3, 1, 3, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 target2(3, 1, 3, 3, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - EXPECT_EQ(target.add_i(target2), ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, add_i_broadcast_not_broadcastable_02_n) { - nntrainer::TensorV2 target(3, 2, 4, 5, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 target2(3, 2, 3, 1, nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - EXPECT_EQ(target.add_i(target2), ML_ERROR_INVALID_PARAMETER); -} - -TEST(nntrainer_Tensor, add_01_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 result = input.add(1.0); - - _FP16 *data = result.getData<_FP16>(); - ASSERT_NE(nullptr, data); - _FP16 *indata = input.getData<_FP16>(); - ASSERT_NE(nullptr, indata); - - for (int i = 0; i < batch * height * width; ++i) { 
- if (data[i] != (_FP16)(indata[i] + (_FP16)1.0)) { - status = ML_ERROR_RESULT_OUT_OF_RANGE; - break; - } - } - - EXPECT_EQ(status, ML_ERROR_NONE); -} - -TEST(nntrainer_Tensor, add_02_p) { - int status = ML_ERROR_NONE; - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 result = input.add(input); - - _FP16 *data = result.getData<_FP16>(); - ASSERT_NE(nullptr, data); - _FP16 *indata = input.getData<_FP16>(); - ASSERT_NE(nullptr, indata); - - for (int i = 0; i < batch * height * width; ++i) { - if (data[i] != indata[i] + indata[i]) { - status = ML_ERROR_RESULT_OUT_OF_RANGE; - break; - } - } - - EXPECT_EQ(status, ML_ERROR_NONE); -} - -TEST(nntrainer_Tensor, add_03_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorV2 input(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - - nntrainer::TensorV2 test(batch - 1, channel, height - 1, width - 1, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - EXPECT_THROW({ input.add(test); }, std::invalid_argument); -} - -TEST(nntrainer_Tensor, add_04_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - nntrainer::TensorV2 input(batch, channel, height, 2 * width); - nntrainer::TensorV2 shared_input = - input.getSharedDataTensor(dim, 0, false, ""); - nntrainer::TensorV2 test(dim); - - EXPECT_THROW(shared_input.add(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, add_05_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - nntrainer::TensorV2 input(dim); - nntrainer::TensorV2 test(batch, channel, height, 2 * width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - nntrainer::TensorV2 shared_test = test.getSharedDataTensor(dim, 0, false, ""); - - EXPECT_THROW(input.add(shared_test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, add_06_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - nntrainer::TensorV2 input(dim, false); - nntrainer::TensorV2 test(dim); - GEN_TEST_INPUT(test, i * (batch * height) + j * (width) + k + 1); - - EXPECT_THROW(input.add(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, add_07_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - nntrainer::TensorV2 input(dim); - GEN_TEST_INPUT(input, i * (batch * height) + j * (width) + k + 1); - nntrainer::TensorV2 test(dim, false); - - EXPECT_THROW(input.add(test), std::invalid_argument); -} - -TEST(nntrainer_Tensor, add_08_n) { - int batch = 3; - int channel = 1; - int height = 3; - int width = 10; - - nntrainer::TensorDim dim(batch, channel, height, width, - nntrainer::Tformat::NCHW, - nntrainer::Tdatatype::FP16); - - nntrainer::TensorV2 input(dim); - GEN_TEST_INPUT(input, i * (batch * 
height) + j * (width) + k + 1); - nntrainer::TensorV2 test(dim); - GEN_TEST_INPUT(test, i * (batch * height) + j * (width) + k + 2); - nntrainer::TensorV2 output(dim, false); - - EXPECT_THROW(input.add(test, output), std::invalid_argument); -} - -int main(int argc, char **argv) { - int result = -1; - - try { - testing::InitGoogleTest(&argc, argv); - } catch (...) { - std::cerr << "Error during InitGoogleTest" << std::endl; - return 0; - } - - try { - result = RUN_ALL_TESTS(); - } catch (...) { - std::cerr << "Error during RUN_ALL_TESTS()" << std::endl; - } - - return result; -}