
Commit

Pico GPT v2 code for #51. Includes changes for #2
hmf committed Dec 18, 2023
1 parent 8a3f633 commit 75857c4
Showing 29 changed files with 1,838 additions and 41 deletions.
79 changes: 79 additions & 0 deletions core/src/main/scala/torch/Tensor.scala
@@ -68,6 +68,8 @@ import org.bytedeco.pytorch.SymIntOptional
import org.bytedeco.pytorch.ScalarTypeOptional
import scala.annotation.implicitNotFound

import torch.nn.functional as F

case class TensorTuple[D <: DType](
values: Tensor[D],
indices: Tensor[Int64]
@@ -439,6 +441,22 @@ sealed abstract class Tensor[D <: DType]( /* private[torch] */ val native: pyto

def `@`[D2 <: DType](u: Tensor[D2]): Tensor[Promoted[D, D2]] = matmul(u)


/** Fills elements of self tensor with value where mask is `true`. The shape of mask must be
* [broadcastable](https://pytorch.org/docs/stable/notes/broadcasting.html#broadcasting-semantics) with the shape
* of the underlying tensor.
*
* @param mask
* the boolean mask
* @param value
* the value to fill in with
* @return
* Tensor with masked elements set to `value`
*/
def maskedFill[S <: ScalaType](mask: Tensor[Bool], value: S): Tensor[Promoted[D, ScalaToDType[S]]] =
fromNative(native.masked_fill(mask.native, toScalar(value)))
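
// A brief usage sketch for the new maskedFill (illustrative only, not part of this commit;
// it assumes the library's torch.Tensor(Seq(...)) creation helper accepts Float and Boolean sequences).
// Positions where the mask is true are overwritten, e.g. with -inf for causal attention masks.
val scores = torch.Tensor(Seq(1.0f, 2.0f, 3.0f))
val mask   = torch.Tensor(Seq(false, true, false))
val out    = scores.maskedFill(mask, Float.NegativeInfinity) // 1.0, -inf, 3.0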


/** Returns the maximum value of all elements of this tensor. */
def max(): Tensor[D] = fromNative(native.max())

@@ -541,6 +559,11 @@ sealed abstract class Tensor[D <: DType]( /* private[torch] */ val native: pyto

def shape: Seq[Int] = size

def softmax[Out <: FloatNN | Derive](
dim: Long,
dtype: Out = derive
): Tensor[DTypeOrDeriveFromTensor[D, Out]] = F.softmax(input = this, dim = dim, dtype = dtype)
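
// A minimal sketch of the new instance-level softmax (not part of the diff; assumes
// torch.randn(Seq(...)) as used elsewhere in the library).
val logits = torch.randn(Seq(2, 4))
val probs  = logits.softmax(dim = 1) // dtype derived from logits; each row sums to 1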

def square = fromNative(native.square())

def squeeze: Tensor[D] = fromNative(native.squeeze())
@@ -554,6 +577,22 @@ sealed abstract class Tensor[D <: DType]( /* private[torch] */ val native: pyto

/** Returns the sum of all elements of this tensor. */
def sum: Tensor[Sum[D]] = fromNative(native.sum())
def sum[D2 <: DType | Derive](
dim: Int | Seq[Int] = Seq.empty,
keepdim: Boolean = false,
dtype: D2 = derive
): Tensor[DTypeOrDeriveFromTensor[D, D2]] =
val derivedDType = dtype match
case _: Derive => this.dtype
case d: DType => d
fromNative(
torchNative.sum(
native,
dim.toArray,
keepdim,
new ScalarTypeOptional(derivedDType.toScalarType)
)
)
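
// A sketch of the new dimensional sum overload (illustrative only; assumes torch.ones(Seq(...))).
val x       = torch.ones(Seq(2, 3))
val rowSums = x.sum(dim = 1, keepdim = true) // shape (2, 1), each value 3.0
val total   = x.sum                          // 0-dim tensor with value 6.0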

/** Expects `input` to be \<= 2-D tensor and transposes dimensions 0 and 1.
*
@@ -562,6 +601,46 @@ sealed abstract class Tensor[D <: DType]( /* private[torch] */ val native: pyto
*/
def t: Tensor[D] = fromNative(native.t())

/** Returns a tensor that is a transposed version of `input` (this Tensor). The given dimensions
* `dim0` and `dim1` are swapped.
*
* If `input` is a strided tensor then the resulting `out` tensor shares its underlying storage with
* the `input` tensor, so changing the content of one would change the content of the other.
*
* If `input` is a [[https://pytorch.org/docs/stable/sparse.html#sparse-docs sparse tensor]] then the
* resulting `out` tensor does not share the underlying storage with the input tensor.
*
* If input is a [[https://pytorch.org/docs/stable/sparse.html#sparse-docs sparse tensor]] with
* compressed layout (SparseCSR, SparseBSR, SparseCSC or SparseBSC) the arguments `dim0` and `dim1`
* must be both batch dimensions, or must both be sparse dimensions. The batch dimensions of a sparse
* tensor are the dimensions preceding the sparse dimensions.
*
* @note Transpositions which interchange the sparse dimensions of a *SparseCSR* or *SparseCSC*
* layout tensor will result in the layout changing between the two options. Transposition of the
* sparse dimensions of a `SparseBSR` or `SparseBSC` layout tensor will likewise generate a result
* with the opposite layout.
*
* @example
* {{{
* val x = torch.randn(Seq(2, 3))
* println(x)
* val y = x.transpose(0, 1)
* println(y)
* }}}
*
* @param dim0
* the first dimension to be transposed
* @param dim1
* the second dimension to be transposed
* @return
* the transposed tensor
*
* @see [[Tensor.mT]]
*
*/
def transpose(dim0: Int, dim1: Int): Tensor[D] = fromNative(native.transpose(dim0, dim1))
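
// A short usage sketch for the new transpose method (not part of the commit; assumes torch.zeros(Seq(...))).
val a = torch.zeros(Seq(2, 3))
val b = a.transpose(0, 1) // shape (3, 2); for strided tensors b shares storage with a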

/** Calculates the variance of all elements of this tensor. */
def variance = fromNative(native.`var`())

27 changes: 19 additions & 8 deletions core/src/main/scala/torch/nn/functional/Activations.scala
@@ -18,6 +18,7 @@ package torch
package nn
package functional

import Derive.derive
import org.bytedeco.pytorch
import org.bytedeco.pytorch.global.torch as torchNative
import org.bytedeco.javacpp.LongPointer
@@ -36,11 +37,16 @@ private[torch] trait Activations {
*
* @group nn_activation
*/
- def logSoftmax[In <: DType, Out <: DType](input: Tensor[In], dim: Long)(
- dtype: Out = input.dtype
- ): Tensor[Out] =
+ def logSoftmax[In <: DType, Out <: FloatNN | Derive](
+ input: Tensor[In],
+ dim: Long,
+ dtype: Out = derive
+ ): Tensor[DTypeOrDeriveFromTensor[In, Out]] =
+ val derivedDType = dtype match
+ case _: Derive => input.dtype
+ case d: DType => d
val nativeDType =
- if dtype == input.dtype then ScalarTypeOptional() else ScalarTypeOptional(dtype.toScalarType)
+ if dtype == input.dtype then ScalarTypeOptional() else ScalarTypeOptional(derivedDType.toScalarType)
fromNative(torchNative.log_softmax(input.native, dim, nativeDType))
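
// A sketch of the reworked logSoftmax with a derived vs. explicit dtype (illustrative only;
// assumes torch.randn(Seq(...)) and the torch.float64 dtype value).
import torch.nn.functional as F
val logits     = torch.randn(Seq(2, 5))
val logProbs   = F.logSoftmax(logits, dim = 1)                        // dtype derived from logits (float32)
val logProbs64 = F.logSoftmax(logits, dim = 1, dtype = torch.float64) // computed in float64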

/** Applies the rectified linear unit function element-wise.
@@ -72,10 +78,15 @@ private[torch] trait Activations {
*
* @group nn_activation
*/
- def softmax[In <: DType, Out <: DType](input: Tensor[In], dim: Long)(
- dtype: Out = input.dtype
- ): Tensor[Out] =
+ def softmax[In <: DType, Out <: FloatNN | Derive](
+ input: Tensor[In],
+ dim: Long,
+ dtype: Out = derive
+ ): Tensor[DTypeOrDeriveFromTensor[In, Out]] =
+ val derivedDType = dtype match
+ case _: Derive => input.dtype
+ case d: DType => d
val nativeDType =
- if dtype == input.dtype then ScalarTypeOptional() else ScalarTypeOptional(dtype.toScalarType)
+ if dtype == input.dtype then ScalarTypeOptional() else ScalarTypeOptional(derivedDType.toScalarType)
fromNative(torchNative.softmax(input.native, dim, nativeDType))
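
// A sketch combining the new functional softmax with maskedFill, the pattern used for causal
// attention in the Pico GPT code (illustrative only; assumes nested-Seq tensor creation).
import torch.nn.functional as F
val scores = torch.randn(Seq(2, 2))
val mask   = torch.Tensor(Seq(Seq(false, true), Seq(false, false))) // true = position to hide
val att    = F.softmax(scores.maskedFill(mask, Float.NegativeInfinity), dim = 1)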
}
88 changes: 88 additions & 0 deletions core/src/main/scala/torch/nn/functional/Loss.scala
@@ -47,4 +47,92 @@ private[torch] trait Loss {
BCEWithLogitsLossOptions()
)
)


// http://bytedeco.org/javacpp-presets/pytorch/apidocs/
/** This criterion computes the cross entropy loss between input logits and target.
* See [[torch.nn.loss.CrossEntropyLoss]] for details.
*
* **Shape:**
*
* * Input: Shape $(C)$, $(N,C)$ or $(N,C,d_1,d_2,...,d_K)$ with $K≥1$ in the case of K-dimensional
* loss.
* * Target: If containing class indices, shape $()$, $(N)$ or $(N,d_1,d_2,...,d_K)$ with $K≥1$
* in the case of K-dimensional loss, where each value should be in $[0,C)$. If containing class
* probabilities, same shape as the input with each value in $[0,1]$.
*
* where:
* * C = number of classes
* * N = batch size
*
*
* @example
* {{{
* // Example of target with class indices
* val input = torch.randn(Seq(3, 5), requiresGrad = true)
* val target = torch.Tensor(Seq(1L, 0L, 4L)) // class indices
* val loss = F.crossEntropy(input, target)
* loss.backward()
*
* // Example of target with class probabilities
* val input = torch.randn(Seq(3, 5), requiresGrad = true)
* val target = torch.randn(Seq(3, 5)).softmax(dim = 1)
* val loss = F.crossEntropy(input, target)
* loss.backward()
* }}}
*
*
* @param input
* Predicted unnormalized logits; see Shape section above for supported shapes.
* @param target
Ground truth class indices or class probabilities; see the Shape section above for supported shapes.
* @param weight
* a manual rescaling weight given to each class. If given, has to be a Tensor of size C
* @param size_average
* Deprecated (see reduction). By default, the losses are averaged over each loss element in the
batch. Note that for some losses, there are multiple elements per sample. If the field `size_average`
* is set to `false`, the losses are instead summed for each mini-batch. Ignored when reduce is
* `false`. Default: `true`
* @param ignore_index
* Specifies a target value that is ignored and does not contribute to the input gradient. When
* `size_average` is `true`, the loss is averaged over non-ignored targets. Note that
* `ignore_index` is only applicable when the target contains class indices. Default: `-100`
* @param reduce
* Deprecated (see reduction). By default, the losses are averaged or summed over observations for
* each mini-batch depending on `size_average`. When reduce is `false`, returns a loss per batch
* element instead and ignores size_average. Default: `true`
* @param reduction
* Specifies the reduction to apply to the output: 'none' | 'mean' | 'sum'. 'none': no reduction will
* be applied, 'mean': the sum of the output will be divided by the number of elements in the output,
* 'sum': the output will be summed. Note: `size_average` and `reduce` are in the process of being
* deprecated, and in the meantime, specifying either of those two args will override reduction.
* Default: 'mean'
* @param label_smoothing
* A float in [0.0, 1.0]. Specifies the amount of smoothing when computing the loss, where 0.0
* means no smoothing. The targets become a mixture of the original ground truth and a uniform
* distribution as described in [[https://arxiv.org/abs/1512.00567 Rethinking the Inception Architecture
* for Computer Vision]]. Default: 0.0
*
* @return [[torch.Tensor]]
*
*
* @see See [[https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html torch.nn.functional.cross_entropy]]
* @see See [[https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html for equivalent torch.nn.CrossEntropyLoss class]]
* @see See [[https://pytorch.org/cppdocs/ PyTorch C++ documentation]]
* @see See [[http://bytedeco.org/javacpp-presets/pytorch/apidocs/ ByteDeco PyTorch preset]]
*/
def crossEntropy[
I <: BFloat16 | Float32 | Float64,
O <: NumericRealNN
](
input: Tensor[I],
target: Tensor[O]
): Tensor[I] =
fromNative(
torchNative.cross_entropy(
input.native,
target.native
)
)
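
// A sketch of crossEntropy on logits and class-index targets (not part of the diff; assumes
// torch.randn(Seq(...)) and torch.Tensor(Seq(...)) creation). Conceptually this is equivalent to
// taking logSoftmax over the class dimension and averaging the negative log-likelihood at the targets.
import torch.nn.functional as F
val logits  = torch.randn(Seq(4, 10))            // N = 4, C = 10
val targets = torch.Tensor(Seq(1L, 0L, 3L, 9L))  // class indices as int64
val loss    = F.crossEntropy(logits, targets)    // scalar mean loss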

}
