[GPU/OpenCL] Initial version of Transpose function for different axes with OpenCL ops

Added a naive OpenCL implementation of the Transpose function using the BLAS kernel interface.
Incorporated the OpenCL kernels for the ops used.
Added unit tests for transpose across different axes.

Signed-off-by: Niket Agarwal <[email protected]>
niket-agarwal committed Aug 27, 2024
1 parent 632c68a commit 4ea3ba8
Showing 16 changed files with 1,227 additions and 37 deletions.
20 changes: 15 additions & 5 deletions api/ccapi/include/layer.h
@@ -101,8 +101,9 @@ enum LayerType {
LAYER_LOSS_CONSTANT_DERIVATIVE, /**< Synthetic loss layer to feed constant
derivative */
LAYER_UPSAMPLE2D, /**< Upsample 2D Layer type */
LAYER_RMSNORM = ML_TRAIN_LAYER_TYPE_RMSNORM, /**<RMS NORM Layer */
LAYER_UNKNOWN = ML_TRAIN_LAYER_TYPE_UNKNOWN /**< Unknown */
LAYER_RMSNORM = ML_TRAIN_LAYER_TYPE_RMSNORM, /**<RMS NORM Layer */
LAYER_UNKNOWN = ML_TRAIN_LAYER_TYPE_UNKNOWN, /**< Unknown */
LAYER_TRANSPOSE = ML_TRAIN_LAYER_TYPE_TRANSPOSE, /**< Transpose Layer type */
};

/**
@@ -311,12 +312,21 @@ Swiglu(const std::vector<std::string> &properties = {},
/**
* @brief Helper function to create RMS normalization layer for GPU
*/
inline std::unique_ptr<Layer> RMSNormCl(
const std::vector<std::string> &properties = {},
const LayerComputeEngine &compute_engine = LayerComputeEngine::GPU) {
inline std::unique_ptr<Layer>
RMSNormCl(const std::vector<std::string> &properties = {},
const LayerComputeEngine &compute_engine = LayerComputeEngine::GPU) {
return createLayer(LayerType::LAYER_RMSNORM, properties, compute_engine);
}

/**
* @brief Helper function to create Transpose layer
*/
inline std::unique_ptr<Layer>
Transpose(const std::vector<std::string> &properties = {},
const LayerComputeEngine &compute_engine = LayerComputeEngine::CPU) {
return createLayer(LayerType::LAYER_TRANSPOSE, properties, compute_engine);
}

/**
* @brief Helper function to create batch normalization layer
*/
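Not part of the diff: a minimal usage sketch of the new Transpose helper, assuming the usual ccapi namespaces (ml::train::layer for the creator helpers). The layer name property is illustrative; passing LayerComputeEngine::GPU should route creation to the OpenCL implementation registered below in cl_context.cpp.

#include <layer.h>
#include <memory>

// Sketch: create the new Transpose layer via the ccapi helper added above.
// "name=tr0" is an illustrative property; GPU selects the OpenCL layer.
std::unique_ptr<ml::train::Layer> make_transpose_layer() {
  return ml::train::layer::Transpose(
    {"name=tr0"}, ml::train::LayerComputeEngine::GPU);
}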
7 changes: 4 additions & 3 deletions api/nntrainer-api-common.h
@@ -62,8 +62,9 @@ typedef enum {
27, /**< Layer Normalization Layer type (Since 7.0) */
ML_TRAIN_LAYER_TYPE_POSITIONAL_ENCODING =
28, /**< Positional Encoding Layer type (Since 7.0) */
ML_TRAIN_LAYER_TYPE_IDENTITY = 29, /**< Identity Layer type (Since 8.0) */
ML_TRAIN_LAYER_TYPE_SWIGLU = 30, /**< Swiglu Layer type */
ML_TRAIN_LAYER_TYPE_IDENTITY = 29, /**< Identity Layer type (Since 8.0) */
ML_TRAIN_LAYER_TYPE_SWIGLU = 30, /**< Swiglu Layer type */
ML_TRAIN_LAYER_TYPE_TRANSPOSE = 31, /**< Transpose Layer type */
ML_TRAIN_LAYER_TYPE_PREPROCESS_FLIP =
300, /**< Preprocess flip Layer (Since 6.5) */
ML_TRAIN_LAYER_TYPE_PREPROCESS_TRANSLATE =
@@ -77,7 +78,7 @@ typedef enum {
ML_TRAIN_LAYER_TYPE_LOSS_CROSS_ENTROPY_SOFTMAX = 502, /**< Cross Entropy with
Softmax Loss Layer type (Since 6.5) */
ML_TRAIN_LAYER_TYPE_RMSNORM = 503, /**< RMS Normalization Layer type */
ML_TRAIN_LAYER_TYPE_UNKNOWN = 999 /**< Unknown Layer */
ML_TRAIN_LAYER_TYPE_UNKNOWN = 999 /**< Unknown Layer */
} ml_train_layer_type_e;

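A hedged sketch of how the new ML_TRAIN_LAYER_TYPE_TRANSPOSE value could be exercised through the C API. Whether the selected context can actually instantiate this type depends on layer registration not shown in this diff, so the status check matters; the property string is illustrative.

#include <nntrainer.h>

// Sketch: create a transpose layer handle via the C API enum added above,
// then release it. Creation may fail at runtime if no matching layer
// implementation is registered for the selected engine.
int create_transpose_example(void) {
  ml_train_layer_h layer = NULL;
  int status = ml_train_layer_create(&layer, ML_TRAIN_LAYER_TYPE_TRANSPOSE);
  if (status != ML_ERROR_NONE)
    return status;
  status = ml_train_layer_set_property(layer, "name=tr0", NULL);
  ml_train_layer_destroy(layer);
  return status;
}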
5 changes: 5 additions & 0 deletions nntrainer/cl_context.cpp
@@ -21,6 +21,7 @@
#include <reshape_cl.h>
#include <rmsnorm_layer_cl.h>
#include <swiglu_cl.h>
#include <transpose_cl.h>

namespace nntrainer {

@@ -49,6 +50,10 @@ static void add_default_object(ClContext &cc) {

cc.registerFactory(nntrainer::createLayer<ConcatLayerCl>, ConcatLayerCl::type,
ml::train::LayerType::LAYER_CONCAT);

cc.registerFactory(nntrainer::createLayer<TransposeLayerCl>,
TransposeLayerCl::type,
ml::train::LayerType::LAYER_TRANSPOSE);
}

static void registerer(ClContext &cc) noexcept {
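The registration above follows the usual factory-registry pattern: the context keeps a map from a layer type key to a creator function and calls that creator when a GPU layer of that type is requested. A standalone sketch of the pattern, independent of ClContext's actual internals (which are not shown in this diff):

#include <functional>
#include <map>
#include <memory>
#include <string>

struct LayerStub { virtual ~LayerStub() = default; };
struct TransposeStub : LayerStub {};

// Minimal registry sketch: maps a type string to a creator, analogous to
// what cc.registerFactory() does for TransposeLayerCl under "transpose".
std::map<std::string, std::function<std::unique_ptr<LayerStub>()>> registry;

void register_transpose() {
  registry["transpose"] = [] { return std::make_unique<TransposeStub>(); };
}

std::unique_ptr<LayerStub> create(const std::string &type) {
  return registry.at(type)(); // throws if the type was never registered
}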
1 change: 1 addition & 0 deletions nntrainer/layers/cl_layers/meson.build
@@ -5,6 +5,7 @@ cl_layer_sources = [
'reshape_cl.cpp',
'rmsnorm_layer_cl.cpp',
'concat_cl.cpp',
'transpose_cl.cpp',
]

foreach s : cl_layer_sources
91 changes: 91 additions & 0 deletions nntrainer/layers/cl_layers/transpose_cl.cpp
@@ -0,0 +1,91 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (C) 2024 Niket Agarwal <[email protected]>
*
* @file transpose_cl.cpp
* @date 31 July 2024
* @brief Implementation of transpose layer
* @see https://github.com/nnstreamer/nntrainer
* @author Niket Agarwal <[email protected]>
* @bug No known bugs except for NYI items
*
*/

#include "transpose_cl.h"
#include <blas_kernel_interface.h>
#include <iostream>
#include <layer_context.h>
#include <nntrainer_error.h>
#include <nntrainer_log.h>
#include <node_exporter.h>

namespace nntrainer {

static constexpr size_t SINGLE_INOUT_IDX = 0;

void TransposeLayerCl::finalize(InitLayerContext &context) {
std::vector<TensorDim> dim = context.getInputDimensions();

for (unsigned int i = 0; i < dim.size(); ++i) {
if (dim[i].getDataLen() == 0) {
throw std::invalid_argument("Input dimension is not set");
} else {
// Naive initial version: the output dimensions simply mirror the input
// dimensions; the axis permutation itself is applied in forwarding().
dim[i].channel(dim[i].channel());
dim[i].height(dim[i].height());
dim[i].width(dim[i].width());
}
}

context.setOutputDimensions(dim);
}

void TransposeLayerCl::forwarding(RunLayerContext &context, bool training) {
Tensor &in = context.getInput(SINGLE_INOUT_IDX);
Tensor &out = context.getOutput(SINGLE_INOUT_IDX);
Transpose_i_cl("1:0:2", in, out, context);
}

void TransposeLayerCl::incremental_forwarding(RunLayerContext &context,
unsigned int from,
unsigned int to, bool training) {
Tensor &in = context.getInput(SINGLE_INOUT_IDX);
Tensor &out = context.getOutput(SINGLE_INOUT_IDX);
if (from) {
NNTR_THROW_IF(to - from != 1, std::invalid_argument)
<< "incremental step size is not 1";
from = 0;
to = 1;
}
Transpose_i_cl("1:0:2", in, out, context);
}

void TransposeLayerCl::calcDerivative(RunLayerContext &context) {
std::throw_with_nested(std::runtime_error("Training is not supported yet."));
}

void TransposeLayerCl::setProperty(const std::vector<std::string> &values) {
auto remain_props = loadProperties(values, transpose_props);
if (!remain_props.empty()) {
std::string msg = "[TransposeLayerCl] Unknown Layer Properties count " +
std::to_string(values.size());
throw exception::not_supported(msg);
}
}

#ifdef PLUGGABLE

Layer *create_transpose_layer_cl() {
auto layer = new TransposeLayerCl();
return layer;
}

void destroy_transpose_layer_cl(Layer *layer) { delete layer; }

extern "C" {
LayerPluggable ml_train_layer_pluggable{create_transpose_layer_cl,
destroy_transpose_layer_cl};
}

#endif

} // namespace nntrainer
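For reference, a plain-CPU sketch of the "1:0:2" permutation the layer requests above, useful when checking kernel output in tests. The axis convention is an assumption based on the usual direction-string format of Tensor::transpose: the string lists input axes (0=channel, 1=height, 2=width) in output order, so "1:0:2" swaps channel and height.

#include <cstddef>
#include <vector>

// Reference sketch (not part of the commit), assuming contiguous NCHW layout:
// out[b][h][c][w] = in[b][c][h][w].
void transpose_102_ref(const std::vector<float> &in, std::vector<float> &out,
                       size_t B, size_t C, size_t H, size_t W) {
  out.resize(B * H * C * W);
  for (size_t b = 0; b < B; ++b)
    for (size_t c = 0; c < C; ++c)
      for (size_t h = 0; h < H; ++h)
        for (size_t w = 0; w < W; ++w)
          out[((b * H + h) * C + c) * W + w] =
            in[((b * C + c) * H + h) * W + w];
}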
105 changes: 105 additions & 0 deletions nntrainer/layers/cl_layers/transpose_cl.h
@@ -0,0 +1,105 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (C) 2024 Niket Agarwal <[email protected]>
*
* @file transpose_cl.h
* @date 31 July 2024
* @brief Implementation of transpose layer
* @see https://github.com/nnstreamer/nntrainer
* @author Niket Agarwal <[email protected]>
* @bug No known bugs except for NYI items
*
*/

#ifndef __TRANSPOSE_LAYER_CL_H__
#define __TRANSPOSE_LAYER_CL_H__

#include <common_properties.h>
#include <layer_devel.h>
#include <opencl_buffer.h>
#include <opencl_kernel.h>

#define CREATE_IF_EMPTY_DIMS(tensor, ...) \
do { \
if (tensor.empty()) \
tensor = Tensor(__VA_ARGS__); \
} while (0);

namespace nntrainer {

/**
* @brief A transpose layer.
*
*/
class TransposeLayerCl final : public Layer {
public:
/**
* @brief Construct a new transpose layer object
*
*/
TransposeLayerCl() : Layer(), transpose_props(props::Print()) {}

/**
* @brief Destroy the transpose layer object
*
*/
~TransposeLayerCl() {}

/**
* @copydoc Layer::finalize(InitLayerContext &context)
*/
void finalize(InitLayerContext &context) override;

/**
* @copydoc Layer::forwarding(RunLayerContext &context, bool training)
*/
void forwarding(RunLayerContext &context, bool training) override;

/**
* @copydoc Layer::incremental_forwarding(RunLayerContext &context, unsigned
* int from, unsigned int to, bool training)
*/
void incremental_forwarding(RunLayerContext &context, unsigned int from,
unsigned int to, bool training) override;

/**
* @copydoc Layer::calcDerivative(RunLayerContext &context)
*/
void calcDerivative(RunLayerContext &context) override;

/**
* @copydoc bool supportBackwarding() const
*/
bool supportBackwarding() const override { return true; };

/**
* @copydoc Layer::exportTo(Exporter &exporter, ExportMethods method)
*/
void exportTo(Exporter &exporter,
const ml::train::ExportMethods &method) const override {};

/**
* @copydoc Layer::getType()
*/
const std::string getType() const override { return TransposeLayerCl::type; };

/**
* @copydoc Layer::setProperty(const std::vector<std::string> &values)
*/
void setProperty(const std::vector<std::string> &values) override;

inline static const std::string type = "transpose";

static opencl::Kernel kernel_transpose_axis0;
static opencl::Kernel kernel_transpose_fp16_axis0;
static opencl::Kernel kernel_transpose_axis1;
static opencl::Kernel kernel_transpose_fp16_axis1;
static opencl::Kernel kernel_transpose_axis2;
static opencl::Kernel kernel_transpose_fp16_axis2;

std::tuple<props::Print> transpose_props; /**< transpose layer properties */
};
} // namespace nntrainer

#endif /* __TRANSPOSE_LAYER_CL_H__ */
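The OpenCL kernel sources referenced by the static kernel members above are not among the files rendered here. Purely as a hypothetical sketch of what a naive per-element transpose kernel (channel/height swap) might look like, embedded as a C++ string constant the way nntrainer keeps its kernel sources; the name, signature, and argument order are assumptions, not the commit's actual kernel.

// Hypothetical example only (not the commit's kernel source).
static const char *transpose_axis_sketch = R"(
__kernel void transpose_sketch(__global const float *in, __global float *out,
                               const int C, const int H, const int W) {
  // One work-item per output element of a single batch slice:
  // out[(h, c, w)] = in[(c, h, w)].
  int w = get_global_id(0);
  int c = get_global_id(1);
  int h = get_global_id(2);
  if (w < W && c < C && h < H)
    out[(h * C + c) * W + w] = in[(c * H + h) * W + w];
}
)";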
12 changes: 12 additions & 0 deletions nntrainer/layers/layer_context.cpp
@@ -735,6 +735,18 @@ std::string RunLayerContext::getKernelName(LayerKernel layerKernel) {
return "concat_cl_axis1";
case LayerKernel::CONCAT_AXIS1_FP16:
return "concat_cl_axis1_fp16";
case LayerKernel::TRANSPOSE_AXIS0:
return "transpose_cl_axis0";
case LayerKernel::TRANSPOSE_FP16_AXIS0:
return "transpose_cl_fp16_axis0";
case LayerKernel::TRANSPOSE_AXIS1:
return "transpose_cl_axis1";
case LayerKernel::TRANSPOSE_FP16_AXIS1:
return "transpose_cl_fp16_axis1";
case LayerKernel::TRANSPOSE_AXIS2:
return "transpose_cl_axis2";
case LayerKernel::TRANSPOSE_FP16_AXIS2:
return "transpose_cl_fp16_axis2";
default:
return "";
}
64 changes: 35 additions & 29 deletions nntrainer/layers/layer_context.h
@@ -828,35 +828,41 @@ class RunLayerContext {
* to the power of 2 (e.g: 1, 2, 4, 8 ...). Should also be resolved in
* getKernelName function.
*/
enum LayerKernel {
SGEMV = 1 << 0, /**< placeholder for kernel name */
SGEMV_FP16 = 1 << 1, /**< placeholder for kernel name */
DOT = 1 << 2, /**< placeholder for kernel name */
DOT_FP16 = 1 << 3, /**< placeholder for kernel name */
SGEMM_NOTRANS = 1 << 4, /**< placeholder for kernel name */
SGEMM_NOTRANS_FP16 = 1 << 5, /**< placeholder for kernel name */
SGEMM_TRANSA = 1 << 6, /**< placeholder for kernel name */
SGEMM_TRANSA_FP16 = 1 << 7, /**< placeholder for kernel name */
SGEMM_TRANSB = 1 << 8, /**< placeholder for kernel name */
SGEMM_TRANSB_FP16 = 1 << 9, /**< placeholder for kernel name */
SGEMM_TRANSAB = 1 << 10, /**< placeholder for kernel name */
SGEMM_TRANSAB_FP16 = 1 << 11, /**< placeholder for kernel name */
ADD = 1 << 12, /**< placeholder for kernel name */
ADD_FP16 = 1 << 13, /**< placeholder for kernel name */
SWIGLU = 1 << 14, /**< placeholder for kernel name */
SWIGLU_FP16 = 1 << 15, /**< placeholder for kernel name */
SSCAL = 1 << 16, /**< placeholder for kernel name */
SSCAL_FP16 = 1 << 17, /**< placeholder for kernel name */
COPY = 1 << 18, /**< placeholder for kernel name */
COPY_FP16 = 1 << 19, /**< placeholder for kernel name */
RMSNORM = 1 << 20, /**< placeholder for kernel name */
RMSNORM_FP16 = 1 << 21, /**< placeholder for kernel name */
CONCAT_AXIS3 = 1 << 22, /**< placeholder for kernel name */
CONCAT_AXIS3_FP16 = 1 << 23, /**< placeholder for kernel name */
CONCAT_AXIS2 = 1 << 24, /**< placeholder for kernel name */
CONCAT_AXIS2_FP16 = 1 << 25, /**< placeholder for kernel name */
CONCAT_AXIS1 = 1 << 26, /**< placeholder for kernel name */
CONCAT_AXIS1_FP16 = 1 << 27, /**< placeholder for kernel name */
enum LayerKernel : unsigned long {
SGEMV = 1ull << 0, /**< placeholder for kernel name */
SGEMV_FP16 = 1ull << 1, /**< placeholder for kernel name */
DOT = 1ull << 2, /**< placeholder for kernel name */
DOT_FP16 = 1ull << 3, /**< placeholder for kernel name */
SGEMM_NOTRANS = 1ull << 4, /**< placeholder for kernel name */
SGEMM_NOTRANS_FP16 = 1ull << 5, /**< placeholder for kernel name */
SGEMM_TRANSA = 1ull << 6, /**< placeholder for kernel name */
SGEMM_TRANSA_FP16 = 1ull << 7, /**< placeholder for kernel name */
SGEMM_TRANSB = 1ull << 8, /**< placeholder for kernel name */
SGEMM_TRANSB_FP16 = 1ull << 9, /**< placeholder for kernel name */
SGEMM_TRANSAB = 1ull << 10, /**< placeholder for kernel name */
SGEMM_TRANSAB_FP16 = 1ull << 11, /**< placeholder for kernel name */
ADD = 1ull << 12, /**< placeholder for kernel name */
ADD_FP16 = 1ull << 13, /**< placeholder for kernel name */
SWIGLU = 1ull << 14, /**< placeholder for kernel name */
SWIGLU_FP16 = 1ull << 15, /**< placeholder for kernel name */
SSCAL = 1ull << 16, /**< placeholder for kernel name */
SSCAL_FP16 = 1ull << 17, /**< placeholder for kernel name */
COPY = 1ull << 18, /**< placeholder for kernel name */
COPY_FP16 = 1ull << 19, /**< placeholder for kernel name */
RMSNORM = 1ull << 20, /**< placeholder for kernel name */
RMSNORM_FP16 = 1ull << 21, /**< placeholder for kernel name */
CONCAT_AXIS3 = 1ull << 22, /**< placeholder for kernel name */
CONCAT_AXIS3_FP16 = 1ull << 23, /**< placeholder for kernel name */
CONCAT_AXIS2 = 1ull << 24, /**< placeholder for kernel name */
CONCAT_AXIS2_FP16 = 1ull << 25, /**< placeholder for kernel name */
CONCAT_AXIS1 = 1ull << 26, /**< placeholder for kernel name */
CONCAT_AXIS1_FP16 = 1ull << 27, /**< placeholder for kernel name */
TRANSPOSE_AXIS0 = 1ull << 28, /**< placeholder for kernel name */
TRANSPOSE_FP16_AXIS0 = 1ull << 29, /**< placeholder for kernel name */
TRANSPOSE_AXIS1 = 1ull << 30, /**< placeholder for kernel name */
TRANSPOSE_FP16_AXIS1 = 1ull << 31, /**< placeholder for kernel name */
TRANSPOSE_AXIS2 = 1ull << 32, /**< placeholder for kernel name */
TRANSPOSE_FP16_AXIS2 = 1ull << 33 /**< placeholder for kernel name */
};

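Widening the enum's underlying type matters once the new transpose flags pass bit 31. A standalone illustration (not part of the commit) of why 64-bit enumerators are needed here; note that unsigned long is only guaranteed 32 bits, so a fixed-width type such as uint64_t would be the fully portable choice.

#include <cstdint>

// Illustration only: with a 32-bit underlying type, an enumerator value of
// 1 << 32 does not fit; a 64-bit type makes bits 32 and 33 representable,
// as required for TRANSPOSE_AXIS2 / TRANSPOSE_FP16_AXIS2.
enum DemoKernelFlags : std::uint64_t {
  DEMO_LAST_32BIT_FLAG = std::uint64_t{1} << 31,
  DEMO_TRANSPOSE_AXIS2 = std::uint64_t{1} << 32,
  DEMO_TRANSPOSE_FP16_AXIS2 = std::uint64_t{1} << 33,
};

static_assert(DEMO_TRANSPOSE_AXIS2 > DEMO_LAST_32BIT_FLAG,
              "bit 32 is representable with a 64-bit underlying type");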
