discojs-core/models: add gpt

Closes: #641 Closes: #619 Closes: #600
epfml · Mar 18, 2024 · cb27570 · cb27570
1 parent 9ba11ff
commit cb27570
Show file tree

Hide file tree

Showing 12 changed files with 1,140 additions and 6 deletions.
diff --git a/discojs/discojs-core/src/informant/training_informant/base.ts b/discojs/discojs-core/src/informant/training_informant/base.ts
@@ -11,6 +11,8 @@ export abstract class Base {
   protected readonly trainingGraphInformant = new GraphInformant()
   protected readonly validationGraphInformant = new GraphInformant()
 
+  private _losses = List<number>()
+
   // statistics
   protected currentRound = 0
   protected currentNumberOfParticipants = 0
@@ -71,6 +73,20 @@ export abstract class Base {
     return this.validationGraphInformant.accuracy()
   }
 
+  /**
+   * Records a new loss by appending it to the stored list of losses.
+   * @throws Error when the given loss is `undefined`
+   */
+  set loss (loss: number | undefined) {
+    if (loss !== undefined) {
+      this._losses = this._losses.push(loss)
+      return
+    }
+
+    throw new Error('loss is undefined')
+  }
+
+  /** Last entry of the stored losses; typed as possibly `undefined`. */
+  get loss (): number | undefined {
+    const latest = this._losses.last()
+    return latest
+  }
+
+  /**
+   * Returns every loss stored through the `loss` setter
+   * (one per round, assuming the setter is called once per round).
+   */
+  get losses (): List<number> {
+    return this._losses
+  }
+
   /** Returns the data exposed by the training graph informant. */
   trainingAccuracyData (): List<number> {
     return this.trainingGraphInformant.data()
   }

diff --git a/discojs/discojs-core/src/models/gpt/LICENSE.md b/discojs/discojs-core/src/models/gpt/LICENSE.md
@@ -0,0 +1,23 @@
+MIT License
+
+Copyright (c) 2023 Nathan Maire
+Copyright (c) 2023 lukemovement
+Copyright (c) 2023 Anton Zemlyansky
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/discojs/discojs-core/src/models/gpt/config.ts b/discojs/discojs-core/src/models/gpt/config.ts
@@ -0,0 +1,77 @@
+type ModelType = // names of the size presets accepted by getModelSizes
+    | 'gpt2'
+    | 'gpt2-medium'
+    | 'gpt2-large'
+    | 'gpt2-xl'
+    | 'gpt-mini'
+    | 'gpt-micro'
+    | 'gpt-nano'
+
+interface ModelSize { // dimensions of a preset; getModelSizes returns it with every field set
+  nLayer?: number // presumably the number of transformer blocks — confirm in model.ts
+  nHead?: number // presumably the number of attention heads — confirm in model.ts
+  nEmbd?: number // presumably the embedding width — confirm in model.ts
+}
+
+export interface GPTConfig { // options of the GPT model; DEFAULT_CONFIG holds a value for every field
+  lr: number // required; presumably the learning rate — confirm in the training code
+  batchSize: number // required
+  blockSize: number // required; presumably the sequence length in tokens — confirm
+  vocabSize: number // required
+  evaluate?: boolean
+  maxEvalBatches?: number
+  evaluateEvery?: number
+  epochs?: number
+  maxIter?: number
+  weightDecay?: number
+  verbose?: 0 | 1
+  bias?: boolean
+  debug?: boolean
+  dropout?: number
+  residDrop?: number
+  embdDrop?: number
+  tokEmb?: boolean
+  lmHead?: boolean
+  modelType: ModelType // required; one of the presets accepted by getModelSizes
+}
+
+export const DEFAULT_CONFIG: Required<GPTConfig> = { // a value for every GPTConfig field, optional ones included
+  lr: 0.001,
+  weightDecay: 0,
+  batchSize: 2,
+  epochs: 9999,
+  maxIter: 10_000,
+  verbose: 0,
+  modelType: 'gpt-nano', // preset with the smallest dimensions in getModelSizes
+  evaluate: true,
+  maxEvalBatches: 12,
+  evaluateEvery: 100,
+  blockSize: 128,
+  vocabSize: 50258,
+  bias: true,
+  debug: false,
+  dropout: 0.2,
+  residDrop: 0.2,
+  embdDrop: 0.2,
+  tokEmb: true,
+  lmHead: true
+}
+
+/**
+ * Looks up the dimensions associated with a model preset.
+ *
+ * @param modelType name of the preset
+ * @returns a freshly created object holding the preset's nLayer, nHead and nEmbd
+ */
+export function getModelSizes (modelType: ModelType): Required<ModelSize> {
+  // built on every call so that each caller receives its own object
+  const presets: Record<ModelType, Required<ModelSize>> = {
+    gpt2: { nLayer: 12, nHead: 12, nEmbd: 768 },
+    'gpt2-medium': { nLayer: 24, nHead: 16, nEmbd: 1024 },
+    'gpt2-large': { nLayer: 36, nHead: 20, nEmbd: 1280 },
+    'gpt2-xl': { nLayer: 48, nHead: 25, nEmbd: 1600 },
+    'gpt-mini': { nLayer: 6, nHead: 6, nEmbd: 192 },
+    'gpt-micro': { nLayer: 4, nHead: 4, nEmbd: 128 },
+    'gpt-nano': { nLayer: 3, nHead: 3, nEmbd: 48 }
+  }
+
+  return presets[modelType]
+}
diff --git a/discojs/discojs-core/src/models/gpt/evaluate.ts b/discojs/discojs-core/src/models/gpt/evaluate.ts
@@ -0,0 +1,54 @@
+import tf from '@tensorflow/tfjs'
+
+/**
+ * Runs `model` over every element of `dataset` and aggregates loss and accuracy.
+ *
+ * The loss is the mean of the per-element softmax cross-entropy, where an element
+ * is whatever the dataset yields (one `{ xs, ys }` pair). Accuracy is the number of
+ * correct predictions divided by the number of predictions, summed over all elements.
+ *
+ * The `xs` and `ys` tensors of each element are disposed once consumed.
+ *
+ * @param model model whose `apply` output is treated as logits
+ * @param dataset elements pairing inputs `xs` with targets `ys`
+ * @returns `val_loss`, `val_perplexity` (exp of `val_loss`) and the accuracy,
+ * exposed under both `acc` and `val_acc`; all values are NaN for an empty dataset
+ * @throws Error when the model outputs several tensors or a symbolic tensor,
+ * or when the loss or the accuracy sum is not a scalar
+ */
+export default async function evaluate (
+  model: tf.LayersModel,
+  dataset: tf.data.Dataset<{ xs: tf.Tensor, ys: tf.Tensor }>
+): Promise<Record<'acc' | 'val_acc' | 'val_loss' | 'val_perplexity', number>> {
+  let datasetSize = 0
+  let totalLoss = 0
+  // [number of correct predictions, number of predictions]
+  const acc: [number, number] = [0, 0]
+
+  await dataset.map(({ xs, ys }) => {
+    const logits = model.apply(xs)
+    if (Array.isArray(logits)) {
+      throw new Error('model outputed many tensor')
+    }
+    if (logits instanceof tf.SymbolicTensor) {
+      throw new Error('model outputed symbolic tensor')
+    }
+    xs.dispose()
+
+    return { logits, ys }
+  }).mapAsync(async ({ logits, ys }) => {
+    // async callbacks cannot run inside tf.tidy, so every tensor created here
+    // has to be disposed by hand, the loss tensor included
+    const lossTensor = tf.losses.softmaxCrossEntropy(ys, logits)
+    const loss = await lossTensor.array()
+    lossTensor.dispose()
+    if (typeof loss !== 'number') {
+      throw new Error('got multiple loss')
+    }
+
+    const accTensor = tf.metrics.categoricalAccuracy(ys, logits)
+    // number of predictions in this element
+    const accSize = accTensor.shape.reduce((l, r) => l * r, 1)
+    const accSum = accTensor.sum()
+    const accSummed = await accSum.array()
+    if (typeof accSummed !== 'number') {
+      throw new Error('got multiple accuracy sum')
+    }
+
+    tf.dispose([ys, logits, accTensor, accSum])
+
+    return { loss, accSummed, accSize }
+  }).forEachAsync(({ loss, accSummed, accSize }) => {
+    datasetSize += 1
+    totalLoss += loss
+    acc[0] += accSummed
+    acc[1] += accSize
+  })
+
+  const loss = totalLoss / datasetSize
+
+  return {
+    val_loss: loss,
+    val_perplexity: Math.exp(loss),
+    acc: acc[0] / acc[1],
+    val_acc: acc[0] / acc[1]
+  }
+}
diff --git a/discojs/discojs-core/src/models/gpt/index.ts b/discojs/discojs-core/src/models/gpt/index.ts
@@ -0,0 +1,144 @@
+/**
+ * this code is taken from gpt-tfjs with modifications from @peacefulotter and @lukemovement
+ **/
+
+import tf from '@tensorflow/tfjs'
+
+import { WeightsContainer } from '../..'
+import type { Dataset } from '../../dataset'
+import { Sink } from '../../utils/event_emitter'
+
+import type { EpochLogs, Prediction, Sample } from '../model'
+import { Model } from '../model'
+
+import { GPTLMHeadModel } from './model'
+
+// Options handed to GPTLMHeadModel by the GPT class. TODO too big config
+interface Config {
+  modelType: 'gpt-nano' // only the 'gpt-nano' literal is accepted here
+  epochs: number // TODO mv to Task
+  maxIter: number
+  batchSize: number
+  blockSize: number
+  lr: number
+  vocabSize: number
+  maxEvalBatches: number
+}
+
+/**
+ * Model backed by a GPTLMHeadModel built with a fixed 'gpt-nano' configuration.
+ *
+ * Datasets given to `train` are expected to hold one byte per row, two consecutive
+ * bytes (low byte first) encoding one token ID; see `convertCharDataset`.
+ */
+export class GPT extends Model {
+  private readonly model: GPTLMHeadModel
+
+  // sizes shared by the model configuration and the dataset conversion
+  private static readonly batchSize = 4
+  private static readonly blockSize = 128
+  private static readonly vocabSize = 50258
+
+  constructor () {
+    super()
+
+    // TODO sensible defaults?
+    const config: Config = {
+      modelType: 'gpt-nano',
+      epochs: 1,
+      maxIter: 2,
+      batchSize: GPT.batchSize,
+      blockSize: GPT.blockSize,
+      lr: 0.001,
+      vocabSize: GPT.vocabSize,
+      maxEvalBatches: 1
+    }
+
+    this.model = new GPTLMHeadModel(config)
+  }
+
+  /** Current weights of the underlying model, wrapped in a WeightsContainer. */
+  override get weights (): WeightsContainer {
+    return new WeightsContainer(this.model.weights.map((w) => w.read()))
+  }
+
+  /** Overwrites the weights of the underlying model with the given ones. */
+  override set weights (ws: WeightsContainer) {
+    this.model.setWeights(ws.weights)
+  }
+
+  /**
+   * Converts a dataset of bytes into batches of token IDs and one-hot targets.
+   *
+   * Each token ID is read from two consecutive bytes, low byte first. A sample is
+   * `blockSize + 1` tokens long: `xs` holds its first `blockSize` tokens and `ys`
+   * the one-hot encoding of the token following each of them. Trailing data that
+   * does not fill a whole batch is dropped.
+   *
+   * @param dataset elements expected to batch into a tensor of shape [n, 1]
+   * @returns batches with `xs` of shape [batchSize, blockSize] and
+   * `ys` of shape [batchSize, blockSize, vocabSize]
+   * @throws Error when a batched chunk is not a Tensor or not of shape [n, 1]
+   */
+  private convertCharDataset (dataset: Dataset): tf.data.Dataset<{ xs: tf.Tensor2D, ys: tf.Tensor3D }> {
+    // same value as the one given to the model, so both cannot drift apart
+    const batchSize = GPT.batchSize
+    const sampleSize = GPT.blockSize + 1
+    // two bytes per token
+    const chunkSize = sampleSize * batchSize * 2
+
+    // combines two bytes into one unsigned 16-bit value
+    function toUInt16 (low: number, high: number): number {
+      low &= 0xff
+      high &= 0xff
+      return (high << 8) | low
+    }
+
+    // TODO add support for small last batch
+    return dataset.batch(chunkSize, false).mapAsync(async (chunk) => {
+      if (!(chunk instanceof tf.Tensor)) {
+        throw new Error('chunk is not a Tensor')
+      }
+      if (chunk.shape.length !== 2 || chunk.shape[1] !== 1) {
+        throw new Error('dataset is not a only char')
+      }
+
+      const buffer = await chunk.buffer()
+
+      const xs = tf.buffer<tf.Rank.R2, 'int32'>([batchSize, GPT.blockSize], 'int32')
+      const ys = tf.buffer<tf.Rank.R3, 'int32'>([batchSize, GPT.blockSize, GPT.vocabSize], 'int32')
+
+      for (let i = 0; i < batchSize; i++) {
+        for (let j = 0; j < sampleSize; j++) {
+          // NOTE(review): relies on `get` treating a single index as a flat
+          // offset into the [chunkSize, 1] buffer — confirm
+          const idx = (i * sampleSize + j) * 2
+          const low = buffer.get(idx)
+          const high = buffer.get(idx + 1)
+          const token = toUInt16(low, high)
+          // every token but the last is an input
+          if (j < sampleSize - 1) xs.set(token, i, j)
+          // every token but the first is the target of the previous position
+          if (j > 0) ys.set(1, i, j - 1, token)
+        }
+      }
+
+      return { xs: xs.toTensor(), ys: ys.toTensor() }
+    })
+  }
+
+  /**
+   * Trains the underlying model one epoch at a time.
+   *
+   * @param trainingData dataset to fit on, in the format read by `convertCharDataset`
+   * @param validationData optional dataset in the same format, forwarded as validation data
+   * @param epochs number of epochs to run
+   * @param tracker receives a 'batchBegin' and a 'batchEnd' event around every batch
+   * @yields after each epoch, the logs reported by the `onEpochEnd` callback
+   */
+  override async * train (
+    trainingData: Dataset,
+    validationData?: Dataset,
+    epochs = 1,
+    tracker = new Sink()
+  ): AsyncGenerator<EpochLogs, void> {
+    for (let i = 0; i < epochs; i++) {
+      // stays undefined if `onEpochEnd` is never called
+      let logs: tf.Logs | undefined
+
+      await this.model.fitDataset(
+        this.convertCharDataset(trainingData), {
+          epochs: 1,
+          validationData: validationData !== undefined ? this.convertCharDataset(validationData) : validationData,
+          callbacks: {
+            onEpochEnd: (_, cur) => { logs = cur },
+            onBatchBegin: () => { tracker.emit('batchBegin', undefined) },
+            onBatchEnd: () => { tracker.emit('batchEnd', undefined) }
+          }
+        })
+
+      yield logs
+    }
+  }
+
+  /**
+   * Runs the underlying model on the given input.
+   *
+   * @throws Error when the model returns several tensors
+   */
+  override async predict (input: Sample): Promise<Prediction> {
+    const ret = this.model.predict(input)
+    if (Array.isArray(ret)) {
+      throw new Error('prediction yield many Tensors but should have only returned one')
+    }
+
+    return ret
+  }
+
+  /** Creates a new GPT and loads the given weights into it. */
+  static deserialize (weights: WeightsContainer): Model {
+    const model = new GPT()
+    model.weights = weights
+    return model
+  }
+
+  /** Returns the current weights, which is all that `deserialize` needs. */
+  serialize (): WeightsContainer {
+    return this.weights
+  }
+}