diff --git a/nntrainer/layers/acti_func.h b/nntrainer/layers/acti_func.h
index 07fc13696a..9e43219ee5 100644
--- a/nntrainer/layers/acti_func.h
+++ b/nntrainer/layers/acti_func.h
@@ -78,9 +78,13 @@ class ActiFunc {
       in_place = false;
       this->setActivation<T>(gelu<T>, geluPrime<T>);
       break;
-    case ActivationType::ACT_QUICK_GELU:
+    case ActivationType::ACT_TANH_GELU:
       in_place = false;
-      this->setActivation<T>(quickGelu<T>, quickGeluPrime<T>);
+      this->setActivation<T>(tanhGelu<T>, tanhGeluPrime<T>);
+      break;
+    case ActivationType::ACT_SIGMOID_GELU:
+      in_place = false;
+      this->setActivation<T>(sigmoidGelu<T>, sigmoidGeluPrime<T>);
       break;
     case ActivationType::ACT_ELU:
       this->setActivation<T>(elu<T>, eluPrime<T>);
@@ -462,30 +466,70 @@ class ActiFunc {
   }
 
   /**
-   * @brief     quick gelu activation function (gelu approximation)
+   * @brief     tanh-based gelu approximation function
    * @param[in] t_in input tensor
    * @param[in] t_out output tensor
    */
   template <typename T = float>
-  static Tensor &quickGelu(Tensor const &t_in, Tensor &t_out) {
+  static Tensor &tanhGelu(Tensor const &t_in, Tensor &t_out) {
     t_in.apply<T>(
-      [&](T x) { return static_cast<T>(x * (sigmoid(static_cast<T>(1.702 * x)))); }, t_out);
+      [&](T x) {
+        return static_cast<T>(
+          0.5 * x *
+          (1 + tanhFloat(
+                 static_cast<T>(sqrt(2 / M_PI) * (x + 0.044715 * pow(x, 3))))));
+      },
+      t_out);
     return t_out;
   }
 
   /**
-   * @brief     derivative quick gelu function
+   * @brief     derivative of tanh-based gelu approximation function
    * @param[in] t_in input tensor
    * @param[in] t_out output tensor
    * @param[in] outgoing_derivative outgoing derivative
    * @param[in] incoming_derivative incoming derivative
    */
   template <typename T = float>
-  static Tensor &quickGeluPrime(Tensor const &t_in, Tensor const &t_out,
-                                Tensor &outgoing_derivative,
-                                Tensor const &incoming_derivative = Tensor()) {
+  static Tensor &tanhGeluPrime(Tensor const &t_in, Tensor const &t_out,
+                               Tensor &outgoing_derivative,
+                               Tensor const &incoming_derivative = Tensor()) {
+    // NYI
+    ml_logw("tanhGeluPrime, which computes the derivative of tanhGelu, is "
+            "not yet implemented");
+    return outgoing_derivative;
+  }
+
+  /**
+   * @brief     sigmoid-based gelu approximation function (quick gelu)
+   * @param[in] t_in input tensor
+   * @param[in] t_out output tensor
+   */
+  template <typename T = float>
+  static Tensor &sigmoidGelu(Tensor const &t_in, Tensor &t_out) {
+    t_in.apply<T>(
+      [&](T x) {
+        return static_cast<T>(x * (sigmoid(static_cast<T>(1.702 * x))));
+      },
+      t_out);
+    return t_out;
+  }
+
+  /**
+   * @brief     derivative of sigmoid-based gelu approximation function
+   * @param[in] t_in input tensor
+   * @param[in] t_out output tensor
+   * @param[in] outgoing_derivative outgoing derivative
+   * @param[in] incoming_derivative incoming derivative
+   */
+  template <typename T = float>
+  static Tensor &
+  sigmoidGeluPrime(Tensor const &t_in, Tensor const &t_out,
+                   Tensor &outgoing_derivative,
+                   Tensor const &incoming_derivative = Tensor()) {
     // NYI
-    ml_logw("quickGeluPrime which is calculate derivate of quickGelu function is not yet implemented");
+    ml_logw("sigmoidGeluPrime, which computes the derivative of sigmoidGelu, "
+            "is not yet implemented");
     return outgoing_derivative;
   }
 
diff --git a/nntrainer/layers/common_properties.h b/nntrainer/layers/common_properties.h
index 12368a38b1..8844783add 100644
--- a/nntrainer/layers/common_properties.h
+++ b/nntrainer/layers/common_properties.h
@@ -30,20 +30,21 @@ namespace nntrainer {
  * accordingly
  */
 enum class ActivationType {
-  ACT_TANH,       /**< tanh */
-  ACT_SIGMOID,    /**< sigmoid */
-  ACT_RELU,       /**< ReLU */
-  ACT_SWISH,      /**< Swish */
-  ACT_GELU,       /**< GELU */
-  ACT_QUICK_GELU, /**< Quick GELU */
-  ACT_SOFTMAX,    /**< softmax */
-  ACT_SOFTPLUS,   /**< softplus */
-  ACT_LEAKY_RELU, /**< Leaky ReLU */
-  ACT_ELU,        /**< ELU */
-  ACT_SELU,       /**< SELU */
-  ACT_MISH,       /**< Mish */
-  ACT_NONE,       /**< no op */
-  ACT_UNKNOWN     /**< unknown */
+  ACT_TANH,         /**< tanh */
+  ACT_SIGMOID,      /**< sigmoid */
+  ACT_RELU,         /**< ReLU */
+  ACT_SWISH,        /**< Swish */
+  ACT_GELU,         /**< GELU */
+  ACT_TANH_GELU,    /**< tanh GELU */
+  ACT_SIGMOID_GELU, /**< sigmoid GELU */
+  ACT_SOFTMAX,      /**< softmax */
+  ACT_SOFTPLUS,     /**< softplus */
+  ACT_LEAKY_RELU,   /**< Leaky ReLU */
+  ACT_ELU,          /**< ELU */
+  ACT_SELU,         /**< SELU */
+  ACT_MISH,         /**< Mish */
+  ACT_NONE,         /**< no op */
+  ACT_UNKNOWN       /**< unknown */
 };
 
 namespace props {
@@ -910,12 +911,12 @@ struct ActivationTypeInfo {
   static constexpr std::initializer_list<Enum> EnumList = {
     Enum::ACT_TANH,    Enum::ACT_SIGMOID,    Enum::ACT_RELU,
     Enum::ACT_SOFTMAX, Enum::ACT_LEAKY_RELU, Enum::ACT_SWISH,
-    Enum::ACT_GELU,    Enum::ACT_QUICK_GELU, Enum::ACT_NONE,
-    Enum::ACT_UNKNOWN};
+    Enum::ACT_GELU,    Enum::ACT_TANH_GELU,  Enum::ACT_SIGMOID_GELU,
+    Enum::ACT_NONE,    Enum::ACT_UNKNOWN};
 
   static constexpr const char *EnumStr[] = {
-    "tanh", "sigmoid", "relu", "softmax", "leaky_relu",
-    "swish", "gelu", "quick_gelu", "none", "unknown"};
+    "tanh", "sigmoid",   "relu", "softmax", "leaky_relu", "swish",
+    "gelu", "tanh_gelu", "sigmoid_gelu", "none", "unknown"};
 };
 
 /**
@@ -1122,7 +1123,7 @@ struct UpsampleModeInfo {
   enum class Interpolation { nearest, bilinear };
 
   using Enum = Interpolation;
-  
+
   static constexpr std::initializer_list<Enum> EnumList = {
     Interpolation::nearest, Interpolation::bilinear};
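
Note (reviewer sketch, not part of the patch): the two new activations implement the standard GELU approximations, tanh-based 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) and sigmoid-based x * sigmoid(1.702 * x), while both *Prime functions are left as NYI. Below is a minimal standalone C++ sketch of the scalar math together with the analytic derivatives, cross-checked against central finite differences; all names in it are illustrative and none of it is nntrainer API.

// Standalone reference sketch, not part of this patch or of nntrainer's API:
// the scalar math applied by tanhGelu / sigmoidGelu, plus the analytic
// derivatives that tanhGeluPrime / sigmoidGeluPrime leave as NYI above.
// All names below are illustrative only.
#include <cmath>
#include <cstdio>

// tanh-based GELU approximation:
// 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
static double tanh_gelu(double x) {
  const double k = std::sqrt(2.0 / M_PI);
  return 0.5 * x * (1.0 + std::tanh(k * (x + 0.044715 * x * x * x)));
}

// Its derivative via the product and chain rules:
// 0.5 * (1 + tanh(u)) + 0.5 * x * (1 - tanh(u)^2) * u',
// with u = k * (x + 0.044715 * x^3) and u' = k * (1 + 3 * 0.044715 * x^2).
static double tanh_gelu_prime(double x) {
  const double k = std::sqrt(2.0 / M_PI);
  const double u = k * (x + 0.044715 * x * x * x);
  const double t = std::tanh(u);
  const double du = k * (1.0 + 3.0 * 0.044715 * x * x);
  return 0.5 * (1.0 + t) + 0.5 * x * (1.0 - t * t) * du;
}

// sigmoid-based (quick) GELU approximation: x * sigmoid(1.702 * x)
static double sigmoid_gelu(double x) {
  const double s = 1.0 / (1.0 + std::exp(-1.702 * x));
  return x * s;
}

// Its derivative: sigmoid(1.702 * x) * (1 + 1.702 * x * (1 - sigmoid(1.702 * x)))
static double sigmoid_gelu_prime(double x) {
  const double s = 1.0 / (1.0 + std::exp(-1.702 * x));
  return s + 1.702 * x * s * (1.0 - s);
}

int main() {
  // Cross-check the analytic derivatives against central finite differences.
  const double h = 1e-6;
  const double xs[] = {-2.0, -0.5, 0.0, 0.5, 2.0};
  for (double x : xs) {
    const double fd_tanh = (tanh_gelu(x + h) - tanh_gelu(x - h)) / (2.0 * h);
    const double fd_sig = (sigmoid_gelu(x + h) - sigmoid_gelu(x - h)) / (2.0 * h);
    std::printf("x=%+.2f  tanh_gelu'=% .6f (fd % .6f)  sigmoid_gelu'=% .6f (fd % .6f)\n",
                x, tanh_gelu_prime(x), fd_tanh, sigmoid_gelu_prime(x), fd_sig);
  }
  return 0;
}

These closed forms could later back tanhGeluPrime / sigmoidGeluPrime once applied elementwise over Tensor, but that is left to the patch authors.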