Add tokenization and prompting API to GPT models #651

Merged Apr 3, 2024 · 39 commits

Commits
b18d187
Basic tokenization with single hard-coded tokenizer
JulienVig Mar 18, 2024
59c1b7e
Mv batch size from gpt-tfjs config to Task and mv all preprocessing i…
JulienVig Mar 19, 2024
57e425a
Move epochs parameter from gpt-tfjs config to Task
JulienVig Mar 19, 2024
612d441
No memory leak with iterator while true
JulienVig Mar 19, 2024
9bb4120
Clean training loop and fix evaluate memory leak
JulienVig Mar 19, 2024
d6c12c4
*: use latest LTS node
tharvik Feb 16, 2024
737339a
github: upgrade actions
tharvik Mar 21, 2024
f5624ce
isomorphic-wrtc/node: use node-datachannel
tharvik Mar 21, 2024
6159198
web-client: rm wrtc unneeded specifics
tharvik Mar 21, 2024
2a584c4
discojs-core: inline Weights
tharvik Mar 15, 2024
12ab70d
discojs-core: remove test dep on discojs-node
tharvik Mar 18, 2024
df4c7de
discojs-core/trainer_builder: drop aggregator
tharvik Mar 19, 2024
18e90b4
web-client: readd cypress
tharvik Mar 19, 2024
276ba1f
discojs-core/task: ensure type guards completeness
tharvik Mar 20, 2024
a256cd2
web-client/locale: simplify
tharvik Mar 20, 2024
2c75b04
*: upgrade to ES2022 modules
tharvik Mar 15, 2024
babb84d
*: upgrade eslint
tharvik Mar 18, 2024
b9db50f
web-client: bump deps
tharvik Mar 22, 2024
309b6c1
Merge with ES2022 branch
JulienVig Mar 25, 2024
e2c2057
Add transformers.js
JulienVig Mar 25, 2024
b4e46c3
Integrate Transformers.js tokenizers
JulienVig Mar 26, 2024
270ca10
Load tokenizer only once during pre-processing
JulienVig Mar 26, 2024
7306fe8
Change tokenizer trainingInformation name
JulienVig Mar 26, 2024
b561f2a
Implement text dataset shuffling and left padding preprocessing
JulienVig Mar 27, 2024
07175e0
Update wikitext example
JulienVig Mar 27, 2024
30ad5c9
Merge with develop
JulienVig Mar 27, 2024
482df2d
Fix merge
JulienVig Mar 27, 2024
8c3e245
Fix merge
JulienVig Mar 27, 2024
decb341
Fix linting errors
JulienVig Mar 27, 2024
45b7c98
Fixup package-lock.json
JulienVig Mar 27, 2024
ff90cb7
Fix lint error
JulienVig Mar 27, 2024
05ad23a
Fix default wikitext max iter
JulienVig Mar 27, 2024
971e744
Change wikitext default hp for server test
JulienVig Mar 28, 2024
4a18a55
Update discojs/discojs-core/src/models/gpt/evaluate.ts
JulienVig Mar 28, 2024
00cff66
Only specify transformers.js major version in package.json
JulienVig Mar 28, 2024
ec94c08
Address PR' comments
JulienVig Mar 28, 2024
53fd0cb
Use async array rather than arraySync
JulienVig Mar 28, 2024
8d86f54
Improve getTaskTokenizer doc
JulienVig Mar 28, 2024
1ba5f60
Add text preprocessing type checks
JulienVig Apr 3, 2024
Files changed
1 change: 1 addition & 0 deletions discojs/discojs-core/package.json
@@ -22,6 +22,7 @@
   "dependencies": {
     "@tensorflow/tfjs": "4",
     "@types/msgpack-lite": "0.1",
+    "@xenova/transformers": "2",
     "axios": "1",
     "gpt3-tokenizer": "1",
     "immutable": "4",
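The new @xenova/transformers dependency provides the Hugging Face tokenizers used by the preprocessing changes below. A rough illustration of what it offers (not code from this PR; the sample string and logged output are only for demonstration):

import { AutoTokenizer } from '@xenova/transformers'

// Download (and cache) the GPT-2 tokenizer, then encode a string
// into token ids, truncating to a maximum length.
const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2')
const { input_ids } = tokenizer('Hello world', {
  truncation: true,
  max_length: 128,
  return_tensor: false,
})
console.log(input_ids) // [15496, 995] — ids under the GPT-2 vocabulary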
31 changes: 14 additions & 17 deletions discojs/discojs-core/src/dataset/data/data.ts
@@ -64,40 +64,37 @@ export abstract class Data {
   * functions in a series. The preprocessing functions are chained according to their defined
   * priority.
   */
-  get preprocessing (): (entry: tf.TensorContainer) => tf.TensorContainer {
+  get preprocessing (): (entry: tf.TensorContainer) => Promise<tf.TensorContainer> {
     const params = this.task.trainingInformation
     const taskPreprocessing = params.preprocessingFunctions

     if (
       taskPreprocessing === undefined ||
       taskPreprocessing.length === 0 ||
       this.availablePreprocessing === undefined ||
       this.availablePreprocessing.size === 0
     ) {
-      return (x) => x
+      return x => Promise.resolve(x)
     }
     const applyPreprocessing = this.availablePreprocessing
       .filter((e) => e.type in taskPreprocessing)
       .map((e) => e.apply)
     if (applyPreprocessing.size === 0) {
-      return (x) => x
+      return x => Promise.resolve(x)
     }
     const preprocessingChain = applyPreprocessing.reduce((acc, fn) =>
-      (x: tf.TensorContainer) => fn(acc(x), this.task),
-      (x: tf.TensorContainer) => x,
-    )
-
-    return (x: tf.TensorContainer) => preprocessingChain(x)
+      x => fn(acc(x), this.task), (x: Promise<tf.TensorContainer>) => x)
+
+    return x => preprocessingChain(Promise.resolve(x))
   }

   /**
    * The TF.js dataset preprocessing according to the set of preprocessing functions and the task's
    * parameters.
    */
   get preprocessedDataset (): Dataset {
-    return this.dataset.map(this.preprocessing)
+    return this.dataset.mapAsync(this.preprocessing)
   }
 }
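The getter now composes the selected preprocessing functions into a single Promise chain, so each step can await the previous one. A minimal standalone sketch of the same composition pattern, with simplified types (illustrative only):

type Step = (x: Promise<number>) => Promise<number>

// Each step awaits its input and transforms it; reduce threads the
// steps together, starting from the identity function.
const steps: Step[] = [
  async (x) => (await x) + 1,
  async (x) => (await x) * 2,
]
const chain = steps.reduce<Step>(
  (acc, fn) => (x) => fn(acc(x)),
  (x) => x,
)

void chain(Promise.resolve(3)).then(console.log) // prints 8: (3 + 1) * 2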
@@ -15,5 +15,5 @@ export type Preprocessing = ImagePreprocessing | TextPreprocessing | TabularPreprocessing
  */
 export interface PreprocessingFunction {
   type: Preprocessing
-  apply: (x: tf.TensorContainer, task: Task) => tf.TensorContainer
+  apply: (x: Promise<tf.TensorContainer>, task: Task) => Promise<tf.TensorContainer>
 }
@@ -19,8 +19,8 @@ interface ImageEntry extends tf.TensorContainerObject {

 const resize: PreprocessingFunction = {
   type: ImagePreprocessing.Resize,
-  apply: (entry: tf.TensorContainer, task: Task): tf.TensorContainer => {
-    const { xs, ys } = entry as ImageEntry
+  apply: async (entry: Promise<tf.TensorContainer>, task: Task): Promise<tf.TensorContainer> => {
+    const { xs, ys } = await entry as ImageEntry
     const params = task.trainingInformation
     return {
       xs: params.IMAGE_W !== undefined && params.IMAGE_H !== undefined
@@ -33,8 +33,8 @@ const resize: PreprocessingFunction = {

 const normalize: PreprocessingFunction = {
   type: ImagePreprocessing.Normalize,
-  apply: (entry: tf.TensorContainer): tf.TensorContainer => {
-    const { xs, ys } = entry as ImageEntry
+  apply: async (entry: Promise<tf.TensorContainer>): Promise<tf.TensorContainer> => {
+    const { xs, ys } = await entry as ImageEntry
     return {
       xs: xs.div(tf.scalar(255)),
       ys
@@ -18,13 +18,13 @@ interface TabularEntry extends tf.TensorContainerObject {

 const sanitize: PreprocessingFunction = {
   type: TabularPreprocessing.Sanitize,
-  apply: (entry: tf.TensorContainer): tf.TensorContainer => {
+  apply: async (entry: Promise<tf.TensorContainer>) => {
     // if preprocessing a dataset without labels, then the entry is an array of numbers
     if (Array.isArray(entry)) {
-      return entry.map(i => i ?? 0)
+      return entry.map((i: number) => i ?? 0)
     // otherwise it is an object with feature and labels
     } else {
-      const { xs, ys } = entry as TabularEntry
+      const { xs, ys } = await entry as TabularEntry
       return {
         xs: xs.map(i => i ?? 0),
         ys
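One caveat for readers: after this change `entry` is a Promise, so `Array.isArray(entry)` tests the promise object itself, and the unlabelled-array branch can only trigger if a caller bypasses the declared type. An await-first variant would keep that branch reachable; a sketch (illustrative, not part of this PR):

type Row = number[] | { xs: number[], ys: number }

// Resolve the promise first, then branch on the resolved value.
async function sanitizeEntry (entry: Promise<Row>): Promise<Row> {
  const resolved = await entry
  if (Array.isArray(resolved)) {
    return resolved.map((i) => i ?? 0)
  }
  const { xs, ys } = resolved
  return { xs: xs.map((i) => i ?? 0), ys }
}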
@@ -1,57 +1,74 @@
 import { List } from 'immutable'
 import * as tf from '@tensorflow/tfjs'

 import type { Task } from '../../../index.js'
 import type { PreprocessingFunction } from './base.js'
+import { models } from '../../../index.js'

 /**
  * Available text preprocessing types.
  */
 export enum TextPreprocessing {
   Tokenize,
-  Padding
-}
-
-interface TextEntry extends tf.TensorContainerObject {
-  xs: string[]
-  ys: number[]
+  LeftPadding
 }

 interface TokenizedEntry extends tf.TensorContainerObject {
   xs: tf.Tensor1D
   ys: tf.Tensor1D
 }

-// TODO that'll fail everytime
-const gpt3Tokenizer = null as unknown as { encode: (_: string) => { bpe: number[]; text: string[] } }
-
-const padding: PreprocessingFunction = {
-  type: TextPreprocessing.Padding,
-  apply: (x: tf.TensorContainer) => {
-    const { xs, ys } = x as TokenizedEntry
-    // TODO: add to task definition
-    const maxLength = 64
-    if (maxLength === undefined) {
-      return { xs, ys }
-    }
-    return {
-      xs: xs
-        .pad([[0, Math.max(0, maxLength - xs.size)]])
-        .slice([0], [maxLength]),
-      ys
-    }
-  }
-}
+/**
+ * We are currently only implementing left padding for text generation
+ * https://huggingface.co/docs/transformers/en/llm_tutorial#wrong-padding-side
+ * The function can easily be extended to support right padding once the need arises
+ */
+const leftPadding: PreprocessingFunction = {
+  type: TextPreprocessing.LeftPadding,
+  apply: async (x: Promise<tf.TensorContainer>, task: Task): Promise<tf.TensorContainer> => {
+    let { xs } = await x as TokenizedEntry
+    const tokenizer = await models.getTaskTokenizer(task)
+    const maxLength = task.trainingInformation.maxSequenceLength ?? tokenizer.model_max_length as number
+    // Should never happen because tokenization truncates inputs
+    if (xs.size > maxLength) {
+      xs = xs.slice([0], [maxLength])
+    } else if (xs.size < maxLength) {
+      const paddingToken = tokenizer.pad_token_id
+      xs = xs.pad([[Math.max(0, maxLength - xs.size), 0]], paddingToken)
+    }
+    // if xs.size == maxLength we can leave it as it is
+    return {
+      xs,
+      ys: tf.oneHot(xs, tokenizer.model.vocab.length + 1) // gpt-tfjs expects a one-hot encoded token label
+    }
+  }
+}

+interface TokenizerOutput {
+  input_ids: number[]
+}
+/**
+ * Tokenizes and truncates input strings
+ */
 const tokenize: PreprocessingFunction = {
   type: TextPreprocessing.Tokenize,
-  apply: (x: tf.TensorContainer) => {
-    const { xs, ys } = x as TextEntry
-
-    const tokenized = gpt3Tokenizer.encode(xs[0]).bpe
-    return {
-      xs: tf.tensor(tokenized),
-      ys: tf.tensor(ys)
-    }
+  apply: async (x: Promise<tf.TensorContainer>, task: Task): Promise<tf.TensorContainer> => {
+    const xs = await x as string // tf.TextLineDataset yields strings
+    const tokenizer = await models.getTaskTokenizer(task)
+    const maxLength = task.trainingInformation.maxSequenceLength ?? tokenizer.model_max_length as number
+
+    const { input_ids: tokens } = tokenizer(xs, {
+      // Transformers.js currently only supports right padding while we need left for text generation
+      // Right padding should be supported in the future; once it is, we can directly pad while tokenizing
+      // https://github.com/xenova/transformers.js/blob/8804c36591d11d8456788d1bb4b16489121b3be2/src/tokenizers.js#L2517
+      padding: false,
+      truncation: true,
+      return_tensor: false,
+      max_length: maxLength,
+    }) as TokenizerOutput
+    return {
+      xs: tf.tensor(tokens, undefined, 'int32') // cast tokens from float to int for gpt-tfjs
+    }
   }
 }
@@ -61,5 +78,5 @@ const tokenize: PreprocessingFunction = {
  */
 export const AVAILABLE_PREPROCESSING = List.of(
   tokenize,
-  padding
+  leftPadding
 ).sortBy((e) => e.type)
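Together, the two steps turn one raw line of text into a fixed-length window of int32 token ids. A rough end-to-end sketch of the same flow outside the preprocessing framework (assuming, as the diff does, that the tokenizer exposes pad_token_id; the sample string is illustrative):

import * as tf from '@tensorflow/tfjs'
import { AutoTokenizer } from '@xenova/transformers'

const maxLength = 128
const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2')

// Tokenize and truncate, as in the `tokenize` step.
const { input_ids } = tokenizer('some raw line of wikitext', {
  padding: false,
  truncation: true,
  return_tensor: false,
  max_length: maxLength,
}) as { input_ids: number[] }

// Left-pad to maxLength, as in the `leftPadding` step.
let xs = tf.tensor1d(input_ids, 'int32')
if (xs.size < maxLength) {
  xs = xs.pad([[maxLength - xs.size, 0]], tokenizer.pad_token_id)
}
console.log(xs.shape) // [128]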
12 changes: 7 additions & 5 deletions discojs/discojs-core/src/dataset/data_loader/text_loader.ts
@@ -3,7 +3,7 @@ import type { Task } from '../../index.js'
 import type { DataSplit, Dataset } from '../index.js'
 import { TextData } from '../index.js'

-import { DataLoader } from './index.js'
+import { DataLoader, DataConfig } from './index.js'

 /**
  * Text data loader whose instantiable implementation is delegated by the platform-dependent Disco subprojects, namely,
@@ -18,13 +18,15 @@ export abstract class TextLoader<S> extends DataLoader<S> {

   abstract loadDatasetFrom (source: S): Promise<Dataset>

-  async load (source: S): Promise<Dataset> {
-    return await this.loadDatasetFrom(source)
+  async load (source: S, config?: DataConfig): Promise<Dataset> {
+    const dataset = await this.loadDatasetFrom(source)
+    // 1st arg: Stream shuffling buffer size
+    return (config?.shuffle === undefined || config?.shuffle) ? dataset.shuffle(1000, undefined, true) : dataset
   }

-  async loadAll (sources: S[]): Promise<DataSplit> {
+  async loadAll (sources: S[], config?: DataConfig): Promise<DataSplit> {
     const concatenated =
-      (await Promise.all(sources.map(async (src) => await this.load(src))))
+      (await Promise.all(sources.map(async (src) => await this.load(src, config))))
       .reduce((acc, dataset) => acc.concatenate(dataset))

     return {
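Shuffling is now on by default and streams through a fixed-size buffer rather than materializing the whole dataset, which matters for large text corpora. The semantics of the call, shown on a toy tf.data pipeline (illustrative only):

import * as tf from '@tensorflow/tfjs'

// Same arguments as in `load`: a 1000-element streaming buffer, no fixed
// seed, and reshuffling on each iteration over the dataset.
const dataset = tf.data.array([1, 2, 3, 4, 5]).shuffle(1000, undefined, true)
await dataset.forEachAsync((x) => console.log(x)) // the five values, in random order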
13 changes: 7 additions & 6 deletions discojs/discojs-core/src/default_tasks/wikitext.ts
@@ -1,5 +1,5 @@
 import type { Model, Task, TaskProvider } from '../index.js'
-import { models } from '../index.js'
+import { data, models } from '../index.js'

 export const wikitext: TaskProvider = {
   getTask (): Task {
@@ -19,17 +19,18 @@ export const wikitext: TaskProvider = {
       trainingInformation: {
         dataType: 'text',
         modelID: 'wikitext-103-raw-model',
+        preprocessingFunctions: [data.TextPreprocessing.Tokenize, data.TextPreprocessing.LeftPadding],
         validationSplit: 0.2, // TODO: is this used somewhere? because train, eval and test are already split in dataset
-        epochs: 10,
-        // constructing a batch is taken care automatically in the dataset to make things faster
-        // so we fake a batch size of 1
-        batchSize: 1,
+        epochs: 5,
         scheme: 'federated',
         noiseScale: undefined,
         decentralizedSecure: true,
         minimumReadyPeers: 3,
         maxShareValue: 100,
-        roundDuration: 10
+        roundDuration: 10,
+        batchSize: 16,
+        tokenizer: 'Xenova/gpt2',
+        maxSequenceLength: 128
       }
     }
   },
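The new `tokenizer` and `maxSequenceLength` fields are what the preprocessing functions read through `models.getTaskTokenizer`. Its implementation is not shown in this diff; a plausible sketch, assuming it resolves the task field to a cached Transformers.js tokenizer (in line with the "Load tokenizer only once during pre-processing" commit):

import { AutoTokenizer, PreTrainedTokenizer } from '@xenova/transformers'
import type { Task } from '../index.js'

const tokenizers = new Map<string, PreTrainedTokenizer>()

// Resolve a task's `tokenizer` field to a tokenizer instance,
// loading it at most once per tokenizer name.
async function getTaskTokenizerSketch (task: Task): Promise<PreTrainedTokenizer> {
  const name = task.trainingInformation.tokenizer
  if (typeof name !== 'string') {
    throw new Error('the task has no tokenizer configured')
  }
  let tokenizer = tokenizers.get(name)
  if (tokenizer === undefined) {
    tokenizer = await AutoTokenizer.from_pretrained(name)
    tokenizers.set(name, tokenizer)
  }
  return tokenizer
}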
9 changes: 7 additions & 2 deletions discojs/discojs-core/src/logging/trainer_logger.ts
@@ -41,15 +41,20 @@ export class TrainerLogger extends ConsoleLogger {
     }

     // console output
-    const msg = `Epoch: ${epoch}\nTrain: ${logs?.acc ?? 'undefined'}\nValidation:${logs?.val_acc ?? 'undefined'}\nLoss:${logs?.loss ?? 'undefined'}`
+    let msg = `Epoch: ${epoch}\n`
+    if (logs !== undefined) {
+      for (const [key, value] of Object.entries(logs)) {
+        msg += `${key}: ${value}\n`
+      }
+    }
     this.success(`On epoch end:\n${msg}\n`)
   }

   /**
    * Display ram usage
    */
   ramUsage (): void {
-    this.success(`Training RAM usage is = ${tf.memory().numBytes * 0.000001} MB`)
+    this.success(`Training RAM usage is = ${tf.memory().numBytes / 1024 / 1024} MB`)
+    this.success(`Number of allocated tensors = ${tf.memory().numTensors}`)
   }
 }
4 changes: 0 additions & 4 deletions discojs/discojs-core/src/models/gpt/config.ts
@@ -15,13 +15,11 @@ export interface ModelSize {

 export interface GPTConfig {
   lr: number
-  batchSize: number
   blockSize: number
   vocabSize: number
   evaluate?: boolean
   maxEvalBatches?: number
   evaluateEvery?: number
-  epochs?: number
   maxIter?: number
   weightDecay?: number
   verbose?: 0 | 1
@@ -38,8 +36,6 @@ export interface GPTConfig {
 export const DEFAULT_CONFIG: Required<GPTConfig> = {
   lr: 0.001,
   weightDecay: 0,
-  batchSize: 2,
-  epochs: 9999,
   maxIter: 10_000,
   verbose: 0,
   modelType: 'gpt-nano',
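With `batchSize` and `epochs` gone from `GPTConfig`, model-level knobs stay on the GPT config while training-loop knobs live on the task (see the wikitext diff above). A sketch of the resulting split, with illustrative values:

// Model-level hyperparameters remain in GPTConfig...
const gptConfig: Partial<GPTConfig> = {
  lr: 0.001,
  modelType: 'gpt-nano',
  maxIter: 10_000,
}

// ...while loop-level hyperparameters come from the task definition.
const trainingInformation = {
  batchSize: 16,
  epochs: 5,
  tokenizer: 'Xenova/gpt2',
  maxSequenceLength: 128,
}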