
Commit

Pico GPT v2 code for #51. Includes changes for #2
hmf committed Dec 18, 2023
1 parent 8a3f633 commit 75857c4
Showing 29 changed files with 1,838 additions and 41 deletions.
79 changes: 79 additions & 0 deletions core/src/main/scala/torch/Tensor.scala
@@ -68,6 +68,8 @@ import org.bytedeco.pytorch.SymIntOptional
import org.bytedeco.pytorch.ScalarTypeOptional
import scala.annotation.implicitNotFound

import torch.nn.functional as F

case class TensorTuple[D <: DType](
values: Tensor[D],
indices: Tensor[Int64]
@@ -439,6 +441,22 @@ sealed abstract class Tensor[D <: DType]( /* private[torch] */ val native: pyto

def `@`[D2 <: DType](u: Tensor[D2]): Tensor[Promoted[D, D2]] = matmul(u)


/** Fills elements of self tensor with value where mask is `true`. The shape of mask must be
* [broadcastable](https://pytorch.org/docs/stable/notes/broadcasting.html#broadcasting-semantics) with the shape
* of the underlying tensor.
*
* @param mask
* the boolean mask
* @param value
* the value to fill in with
* @return
* Tensor with masked elements set to `value`
*/
def maskedFill[S <: ScalaType](mask: Tensor[Bool], value: S): Tensor[Promoted[D, ScalaToDType[S]]] =
fromNative(native.masked_fill(mask.native, toScalar(value)))
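
// A brief usage sketch for the new maskedFill (illustrative only, not part of this commit;
// it assumes the library's torch.Tensor(Seq(...)) creation helper accepts Float and Boolean sequences).
// Positions where the mask is true are overwritten, e.g. with -inf for causal attention masks.
val scores = torch.Tensor(Seq(1.0f, 2.0f, 3.0f))
val mask   = torch.Tensor(Seq(false, true, false))
val out    = scores.maskedFill(mask, Float.NegativeInfinity) // 1.0, -inf, 3.0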


/** Returns the maximum value of all elements of this tensor. */
def max(): Tensor[D] = fromNative(native.max())

@@ -541,6 +559,11 @@ sealed abstract class Tensor[D <: DType]( /* private[torch] */ val native: pyto

def shape: Seq[Int] = size

def softmax[Out <: FloatNN | Derive](
dim: Long,
dtype: Out = derive
): Tensor[DTypeOrDeriveFromTensor[D, Out]] = F.softmax(input = this, dim = dim, dtype = dtype)
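
// A minimal sketch of the new instance-level softmax (not part of the diff; assumes
// torch.randn(Seq(...)) as used elsewhere in the library).
val logits = torch.randn(Seq(2, 4))
val probs  = logits.softmax(dim = 1) // dtype derived from logits; each row sums to 1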

def square = fromNative(native.square())

def squeeze: Tensor[D] = fromNative(native.squeeze())
@@ -554,6 +577,22 @@ sealed abstract class Tensor[D <: DType]( /* private[torch] */ val native: pyto

/** Returns the sum of all elements of this tensor. */
def sum: Tensor[Sum[D]] = fromNative(native.sum())
def sum[D2 <: DType | Derive](
dim: Int | Seq[Int] = Seq.empty,
keepdim: Boolean = false,
dtype: D2 = derive
): Tensor[DTypeOrDeriveFromTensor[D, D2]] =
val derivedDType = dtype match
case _: Derive => this.dtype
case d: DType => d
fromNative(
torchNative.sum(
native,
dim.toArray,
keepdim,
new ScalarTypeOptional(derivedDType.toScalarType)
)
)
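
// A sketch of the new dimensional sum overload (illustrative only; assumes torch.ones(Seq(...))).
val x       = torch.ones(Seq(2, 3))
val rowSums = x.sum(dim = 1, keepdim = true) // shape (2, 1), each value 3.0
val total   = x.sum                          // 0-dim tensor with value 6.0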

/** Expects `input` to be \<= 2-D tensor and transposes dimensions 0 and 1.
*
@@ -562,6 +601,46 @@ sealed abstract class Tensor[D <: DType]( /* private[torch] */ val native: pyto
*/
def t: Tensor[D] = fromNative(native.t())

/** Returns a tensor that is a transposed version of `input` (this Tensor). The given dimensions
* `dim0` and `dim1` are swapped.
*
* If `input` is a strided tensor then the resulting `out` tensor shares its underlying storage with
* the `input` tensor, so changing the content of one would change the content of the other.
*
* If `input` is a [[https://pytorch.org/docs/stable/sparse.html#sparse-docs sparse tensor]] then the
* resulting `out` tensor does not share the underlying storage with the input tensor.
*
* If input is a [[https://pytorch.org/docs/stable/sparse.html#sparse-docs sparse tensor]] with
* compressed layout (SparseCSR, SparseBSR, SparseCSC or SparseBSC) the arguments `dim0` and `dim1`
* must be both batch dimensions, or must both be sparse dimensions. The batch dimensions of a sparse
* tensor are the dimensions preceding the sparse dimensions.
*
* @note Transpositions which interchange the sparse dimensions of a *SparseCSR* or *SparseCSC*
* layout tensor will result in the layout changing between the two options. Transposition of the
* sparse dimensions of a `SparseBSR` or `SparseBSC` layout tensor will likewise generate a result
* with the opposite layout.
*
* @example
* {{{
* val x = torch.randn(Seq(2, 3))
* println(x)
* val y = x.transpose(0, 1)
* println(y)
* }}}
*
* @param dim0
* the first dimension to be transposed
* @param dim1
* the second dimension to be transposed
* @return
* the transposed tensor
*
* @see [[Tensor.mT]]
*
*/
def transpose(dim0: Int, dim1: Int): Tensor[D] = fromNative(native.transpose(dim0, dim1))
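
// A short usage sketch for the new transpose method (not part of the commit; assumes torch.zeros(Seq(...))).
val a = torch.zeros(Seq(2, 3))
val b = a.transpose(0, 1) // shape (3, 2); for strided tensors b shares storage with a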

/** Calculates the variance of all elements of this tensor. */
def variance = fromNative(native.`var`())

27 changes: 19 additions & 8 deletions core/src/main/scala/torch/nn/functional/Activations.scala
@@ -18,6 +18,7 @@ package torch
package nn
package functional

import Derive.derive
import org.bytedeco.pytorch
import org.bytedeco.pytorch.global.torch as torchNative
import org.bytedeco.javacpp.LongPointer
@@ -36,11 +37,16 @@ private[torch] trait Activations {
*
* @group nn_activation
*/
- def logSoftmax[In <: DType, Out <: DType](input: Tensor[In], dim: Long)(
- dtype: Out = input.dtype
- ): Tensor[Out] =
+ def logSoftmax[In <: DType, Out <: FloatNN | Derive](
+ input: Tensor[In],
+ dim: Long,
+ dtype: Out = derive
+ ): Tensor[DTypeOrDeriveFromTensor[In, Out]] =
+ val derivedDType = dtype match
+ case _: Derive => input.dtype
+ case d: DType => d
val nativeDType =
- if dtype == input.dtype then ScalarTypeOptional() else ScalarTypeOptional(dtype.toScalarType)
+ if dtype == input.dtype then ScalarTypeOptional() else ScalarTypeOptional(derivedDType.toScalarType)
fromNative(torchNative.log_softmax(input.native, dim, nativeDType))
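
// A sketch of the reworked logSoftmax with a derived vs. explicit dtype (illustrative only;
// assumes torch.randn(Seq(...)) and the torch.float64 dtype value).
import torch.nn.functional as F
val logits     = torch.randn(Seq(2, 5))
val logProbs   = F.logSoftmax(logits, dim = 1)                        // dtype derived from logits (float32)
val logProbs64 = F.logSoftmax(logits, dim = 1, dtype = torch.float64) // computed in float64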

/** Applies the rectified linear unit function element-wise.
@@ -72,10 +78,15 @@ private[torch] trait Activations {
*
* @group nn_activation
*/
- def softmax[In <: DType, Out <: DType](input: Tensor[In], dim: Long)(
- dtype: Out = input.dtype
- ): Tensor[Out] =
+ def softmax[In <: DType, Out <: FloatNN | Derive](
+ input: Tensor[In],
+ dim: Long,
+ dtype: Out = derive
+ ): Tensor[DTypeOrDeriveFromTensor[In, Out]] =
+ val derivedDType = dtype match
+ case _: Derive => input.dtype
+ case d: DType => d
val nativeDType =
- if dtype == input.dtype then ScalarTypeOptional() else ScalarTypeOptional(dtype.toScalarType)
+ if dtype == input.dtype then ScalarTypeOptional() else ScalarTypeOptional(derivedDType.toScalarType)
fromNative(torchNative.softmax(input.native, dim, nativeDType))
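
// A sketch combining the new functional softmax with maskedFill, the pattern used for causal
// attention in the Pico GPT code (illustrative only; assumes nested-Seq tensor creation).
import torch.nn.functional as F
val scores = torch.randn(Seq(2, 2))
val mask   = torch.Tensor(Seq(Seq(false, true), Seq(false, false))) // true = position to hide
val att    = F.softmax(scores.maskedFill(mask, Float.NegativeInfinity), dim = 1)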
}
88 changes: 88 additions & 0 deletions core/src/main/scala/torch/nn/functional/Loss.scala
@@ -47,4 +47,92 @@ private[torch] trait Loss {
BCEWithLogitsLossOptions()
)
)


// http://bytedeco.org/javacpp-presets/pytorch/apidocs/
/** This criterion computes the cross entropy loss between input logits and target.
* See [[torch.nn.loss.CrossEntropyLoss]] for details.
*
* **Shape:**
*
* * Input: Shape $(C)$, $(N,C)$ or $(N,C,d_1,d_2,...,d_K)$ with $K≥1$ in the case of K-dimensional
* loss.
* * Target: If containing class indices, shape $()$, $(N)$ or $(N,d_1,d_2,...,d_K)$ with $K≥1$
* in the case of K-dimensional loss, where each value should be in $[0,C)$. If containing class
* probabilities, same shape as the input with each value in $[0,1]$.
*
* where:
* * C = number of classes
* * N = batch size
*
*
* @example
* {{{
* // Example of target with class indices
* val input = torch.randn(Seq(3, 5), requiresGrad = true)
* val target = torch.Tensor(Seq(1L, 0L, 4L)) // class indices
* val loss = F.crossEntropy(input, target)
* loss.backward()
*
* // Example of target with class probabilities
* val input = torch.randn(Seq(3, 5), requiresGrad = true)
* val target = torch.randn(Seq(3, 5)).softmax(dim = 1)
* val loss = F.crossEntropy(input, target)
* loss.backward()
* }}}
*
*
* @param input
* Predicted unnormalized logits; see Shape section above for supported shapes.
* @param target
Ground truth class indices or class probabilities; see the Shape section above for supported shapes.
* @param weight
* a manual rescaling weight given to each class. If given, has to be a Tensor of size C
* @param size_average
* Deprecated (see reduction). By default, the losses are averaged over each loss element in the
batch. Note that for some losses, there are multiple elements per sample. If the field `size_average`
* is set to `false`, the losses are instead summed for each mini-batch. Ignored when reduce is
* `false`. Default: `true`
* @param ignore_index
* Specifies a target value that is ignored and does not contribute to the input gradient. When
* `size_average` is `true`, the loss is averaged over non-ignored targets. Note that
* `ignore_index` is only applicable when the target contains class indices. Default: `-100`
* @param reduce
* Deprecated (see reduction). By default, the losses are averaged or summed over observations for
* each mini-batch depending on `size_average`. When reduce is `false`, returns a loss per batch
* element instead and ignores size_average. Default: `true`
* @param reduction
* Specifies the reduction to apply to the output: 'none' | 'mean' | 'sum'. 'none': no reduction will
* be applied, 'mean': the sum of the output will be divided by the number of elements in the output,
* 'sum': the output will be summed. Note: `size_average` and `reduce` are in the process of being
* deprecated, and in the meantime, specifying either of those two args will override reduction.
* Default: 'mean'
* @param label_smoothing
* A float in [0.0, 1.0]. Specifies the amount of smoothing when computing the loss, where 0.0
* means no smoothing. The targets become a mixture of the original ground truth and a uniform
* distribution as described in [[https://arxiv.org/abs/1512.00567 Rethinking the Inception Architecture
* for Computer Vision]]. Default: 0.0
*
* @return [[torch.Tensor]]
*
*
* @see See [[https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html torch.nn.functional.cross_entropy]]
* @see See [[https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html for equivalent torch.nn.CrossEntropyLoss class]]
* @see See [[https://pytorch.org/cppdocs/ PyTorch C++ documentation]]
* @see See [[http://bytedeco.org/javacpp-presets/pytorch/apidocs/ ByteDeco PyTorch preset]]
*/
def crossEntropy[
I <: BFloat16 | Float32 | Float64,
O <: NumericRealNN
](
input: Tensor[I],
target: Tensor[O]
): Tensor[I] =
fromNative(
torchNative.cross_entropy(
input.native,
target.native
)
)
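
// A sketch of crossEntropy on logits and class-index targets (not part of the diff; assumes
// torch.randn(Seq(...)) and torch.Tensor(Seq(...)) creation). Conceptually this is equivalent to
// taking logSoftmax over the class dimension and averaging the negative log-likelihood at the targets.
import torch.nn.functional as F
val logits  = torch.randn(Seq(4, 10))            // N = 4, C = 10
val targets = torch.Tensor(Seq(1L, 0L, 3L, 9L))  // class indices as int64
val loss    = F.crossEntropy(logits, targets)    // scalar mean loss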

}
