Add tokenization and prompting API to GPT models #651

Merged Apr 3, 2024 · 39 commits

Commits
b18d187
Basic tokenization with single hard-coded tokenizer
JulienVig Mar 18, 2024
59c1b7e
Mv batch size from gpt-tfjs config to Task and mv all preprocessing i…
JulienVig Mar 19, 2024
57e425a
Move epochs parameter from gpt-tfjs config to Task
JulienVig Mar 19, 2024
612d441
No memory leak with iterator while true
JulienVig Mar 19, 2024
9bb4120
Clean training loop and fix evaluate memory leak
JulienVig Mar 19, 2024
d6c12c4
*: use latest LTS node
tharvik Feb 16, 2024
737339a
github: upgrade actions
tharvik Mar 21, 2024
f5624ce
isomorphic-wrtc/node: use node-datachannel
tharvik Mar 21, 2024
6159198
web-client: rm wrtc unneeded specifics
tharvik Mar 21, 2024
2a584c4
discojs-core: inline Weights
tharvik Mar 15, 2024
12ab70d
discojs-core: remove test dep on discojs-node
tharvik Mar 18, 2024
df4c7de
discojs-core/trainer_builder: drop aggregator
tharvik Mar 19, 2024
18e90b4
web-client: readd cypress
tharvik Mar 19, 2024
276ba1f
discojs-core/task: ensure type guards completeness
tharvik Mar 20, 2024
a256cd2
web-client/locale: simplify
tharvik Mar 20, 2024
2c75b04
*: upgrade to ES2022 modules
tharvik Mar 15, 2024
babb84d
*: upgrade eslint
tharvik Mar 18, 2024
b9db50f
web-client: bump deps
tharvik Mar 22, 2024
309b6c1
Merge with ES2022 branch
JulienVig Mar 25, 2024
e2c2057
Add transformers.js
JulienVig Mar 25, 2024
b4e46c3
Integrate Transformers.js tokenizers
JulienVig Mar 26, 2024
270ca10
Load tokenizer only once during pre-processing
JulienVig Mar 26, 2024
7306fe8
Change tokenizer trainingInformation name
JulienVig Mar 26, 2024
b561f2a
Implement text dataset shuffling and left padding preprocessing
JulienVig Mar 27, 2024
07175e0
Update wikitext example
JulienVig Mar 27, 2024
30ad5c9
Merge with develop
JulienVig Mar 27, 2024
482df2d
Fix merge
JulienVig Mar 27, 2024
8c3e245
Fix merge
JulienVig Mar 27, 2024
decb341
Fix linting errors
JulienVig Mar 27, 2024
45b7c98
Fixup package-lock.json
JulienVig Mar 27, 2024
ff90cb7
Fix lint error
JulienVig Mar 27, 2024
05ad23a
Fix default wikitext max iter
JulienVig Mar 27, 2024
971e744
Change wikitext default hp for server test
JulienVig Mar 28, 2024
4a18a55
Update discojs/discojs-core/src/models/gpt/evaluate.ts
JulienVig Mar 28, 2024
00cff66
Only specify transformers.js major version in package.json
JulienVig Mar 28, 2024
ec94c08
Address PR' comments
JulienVig Mar 28, 2024
53fd0cb
Use async array rather than arraySync
JulienVig Mar 28, 2024
8d86f54
Improve getTaskTokenizer doc
JulienVig Mar 28, 2024
1ba5f60
Add text preprocessing type checks
JulienVig Apr 3, 2024
Files changed
1 change: 1 addition & 0 deletions discojs/discojs-core/package.json
@@ -22,6 +22,7 @@
   "dependencies": {
     "@tensorflow/tfjs": "4",
     "@types/msgpack-lite": "0.1",
+    "@xenova/transformers": "2",
     "axios": "1",
     "gpt3-tokenizer": "1",
     "immutable": "4",
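The new @xenova/transformers dependency provides the Hugging Face tokenizers used by the preprocessing changes below. A rough illustration of what it offers (not code from this PR; the sample string and logged output are only for demonstration):

import { AutoTokenizer } from '@xenova/transformers'

// Download (and cache) the GPT-2 tokenizer, then encode a string
// into token ids, truncating to a maximum length.
const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2')
const { input_ids } = tokenizer('Hello world', {
  truncation: true,
  max_length: 128,
  return_tensor: false,
})
console.log(input_ids) // [15496, 995] — ids under the GPT-2 vocabulary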
31 changes: 14 additions & 17 deletions discojs/discojs-core/src/dataset/data/data.ts
@@ -64,40 +64,37 @@ export abstract class Data {
   * functions in a series. The preprocessing functions are chained according to their defined
   * priority.
   */
-  get preprocessing (): (entry: tf.TensorContainer) => tf.TensorContainer {
+  get preprocessing (): (entry: tf.TensorContainer) => Promise<tf.TensorContainer> {
     const params = this.task.trainingInformation
     const taskPreprocessing = params.preprocessingFunctions

     if (
       taskPreprocessing === undefined ||
       taskPreprocessing.length === 0 ||
       this.availablePreprocessing === undefined ||
       this.availablePreprocessing.size === 0
     ) {
-      return (x) => x
+      return x => Promise.resolve(x)
     }
     const applyPreprocessing = this.availablePreprocessing
       .filter((e) => e.type in taskPreprocessing)
       .map((e) => e.apply)
     if (applyPreprocessing.size === 0) {
-      return (x) => x
+      return x => Promise.resolve(x)
     }
     const preprocessingChain = applyPreprocessing.reduce((acc, fn) =>
-      (x: tf.TensorContainer) => fn(acc(x), this.task),
-      (x: tf.TensorContainer) => x,
-    )
-
-    return (x: tf.TensorContainer) => preprocessingChain(x)
+      x => fn(acc(x), this.task), (x: Promise<tf.TensorContainer>) => x)
+
+    return x => preprocessingChain(Promise.resolve(x))
   }

   /**
    * The TF.js dataset preprocessing according to the set of preprocessing functions and the task's
    * parameters.
    */
   get preprocessedDataset (): Dataset {
-    return this.dataset.map(this.preprocessing)
+    return this.dataset.mapAsync(this.preprocessing)
   }
 }
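The getter now composes the selected preprocessing functions into a single Promise chain, so each step can await the previous one. A minimal standalone sketch of the same composition pattern, with simplified types (illustrative only):

type Step = (x: Promise<number>) => Promise<number>

// Each step awaits its input and transforms it; reduce threads the
// steps together, starting from the identity function.
const steps: Step[] = [
  async (x) => (await x) + 1,
  async (x) => (await x) * 2,
]
const chain = steps.reduce<Step>(
  (acc, fn) => (x) => fn(acc(x)),
  (x) => x,
)

void chain(Promise.resolve(3)).then(console.log) // prints 8: (3 + 1) * 2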
@@ -15,5 +15,5 @@ export type Preprocessing = ImagePreprocessing | TextPreprocessing | TabularPreprocessing
  */
 export interface PreprocessingFunction {
   type: Preprocessing
-  apply: (x: tf.TensorContainer, task: Task) => tf.TensorContainer
+  apply: (x: Promise<tf.TensorContainer>, task: Task) => Promise<tf.TensorContainer>
 }
@@ -19,8 +19,8 @@ interface ImageEntry extends tf.TensorContainerObject {

 const resize: PreprocessingFunction = {
   type: ImagePreprocessing.Resize,
-  apply: (entry: tf.TensorContainer, task: Task): tf.TensorContainer => {
-    const { xs, ys } = entry as ImageEntry
+  apply: async (entry: Promise<tf.TensorContainer>, task: Task): Promise<tf.TensorContainer> => {
+    const { xs, ys } = await entry as ImageEntry
     const params = task.trainingInformation
     return {
       xs: params.IMAGE_W !== undefined && params.IMAGE_H !== undefined
@@ -33,8 +33,8 @@ const resize: PreprocessingFunction = {

 const normalize: PreprocessingFunction = {
   type: ImagePreprocessing.Normalize,
-  apply: (entry: tf.TensorContainer): tf.TensorContainer => {
-    const { xs, ys } = entry as ImageEntry
+  apply: async (entry: Promise<tf.TensorContainer>): Promise<tf.TensorContainer> => {
+    const { xs, ys } = await entry as ImageEntry
     return {
       xs: xs.div(tf.scalar(255)),
       ys
@@ -18,13 +18,13 @@ interface TabularEntry extends tf.TensorContainerObject {

 const sanitize: PreprocessingFunction = {
   type: TabularPreprocessing.Sanitize,
-  apply: (entry: tf.TensorContainer): tf.TensorContainer => {
+  apply: async (entry: Promise<tf.TensorContainer>) => {
     // if preprocessing a dataset without labels, then the entry is an array of numbers
     if (Array.isArray(entry)) {
-      return entry.map(i => i ?? 0)
+      return entry.map((i: number) => i ?? 0)
     // otherwise it is an object with feature and labels
     } else {
-      const { xs, ys } = entry as TabularEntry
+      const { xs, ys } = await entry as TabularEntry
       return {
         xs: xs.map(i => i ?? 0),
         ys
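One caveat for readers: after this change `entry` is a Promise, so `Array.isArray(entry)` tests the promise object itself, and the unlabelled-array branch can only trigger if a caller bypasses the declared type. An await-first variant would keep that branch reachable; a sketch (illustrative, not part of this PR):

type Row = number[] | { xs: number[], ys: number }

// Resolve the promise first, then branch on the resolved value.
async function sanitizeEntry (entry: Promise<Row>): Promise<Row> {
  const resolved = await entry
  if (Array.isArray(resolved)) {
    return resolved.map((i) => i ?? 0)
  }
  const { xs, ys } = resolved
  return { xs: xs.map((i) => i ?? 0), ys }
}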
@@ -1,57 +1,74 @@
 import { List } from 'immutable'
 import * as tf from '@tensorflow/tfjs'

 import type { Task } from '../../../index.js'
 import type { PreprocessingFunction } from './base.js'
+import { models } from '../../../index.js'

 /**
  * Available text preprocessing types.
  */
 export enum TextPreprocessing {
   Tokenize,
-  Padding
-}
-
-interface TextEntry extends tf.TensorContainerObject {
-  xs: string[]
-  ys: number[]
+  LeftPadding
 }

 interface TokenizedEntry extends tf.TensorContainerObject {
   xs: tf.Tensor1D
   ys: tf.Tensor1D
 }

-// TODO that'll fail everytime
-const gpt3Tokenizer = null as unknown as { encode: (_: string) => { bpe: number[]; text: string[] } }
-
-const padding: PreprocessingFunction = {
-  type: TextPreprocessing.Padding,
-  apply: (x: tf.TensorContainer) => {
-    const { xs, ys } = x as TokenizedEntry
-    // TODO: add to task definition
-    const maxLength = 64
-    if (maxLength === undefined) {
-      return { xs, ys }
-    }
-    return {
-      xs: xs
-        .pad([[0, Math.max(0, maxLength - xs.size)]])
-        .slice([0], [maxLength]),
-      ys
-    }
-  }
-}
+/**
+ * We are currently only implementing left padding for text generation
+ * https://huggingface.co/docs/transformers/en/llm_tutorial#wrong-padding-side
+ * The function can easily be extended to support right padding once the need arises
+ */
+const leftPadding: PreprocessingFunction = {
+  type: TextPreprocessing.LeftPadding,
+  apply: async (x: Promise<tf.TensorContainer>, task: Task): Promise<tf.TensorContainer> => {
+    let { xs } = await x as TokenizedEntry
+    const tokenizer = await models.getTaskTokenizer(task)
+    const maxLength = task.trainingInformation.maxSequenceLength ?? tokenizer.model_max_length as number
+    // Should never happen because tokenization truncates inputs
+    if (xs.size > maxLength) {
+      xs = xs.slice([0], [maxLength])
+    } else if (xs.size < maxLength) {
+      const paddingToken = tokenizer.pad_token_id
+      xs = xs.pad([[Math.max(0, maxLength - xs.size), 0]], paddingToken)
+    }
+    // if xs.size == maxLength we can leave it as it is
+    return {
+      xs,
+      ys: tf.oneHot(xs, tokenizer.model.vocab.length + 1) // gpt-tfjs expects a one-hot encoded token label
+    }
+  }
+}

+interface TokenizerOutput {
+  input_ids: number[]
+}
+/**
+ * Tokenizes and truncates input strings
+ */
 const tokenize: PreprocessingFunction = {
   type: TextPreprocessing.Tokenize,
-  apply: (x: tf.TensorContainer) => {
-    const { xs, ys } = x as TextEntry
-
-    const tokenized = gpt3Tokenizer.encode(xs[0]).bpe
-    return {
-      xs: tf.tensor(tokenized),
-      ys: tf.tensor(ys)
-    }
+  apply: async (x: Promise<tf.TensorContainer>, task: Task): Promise<tf.TensorContainer> => {
+    const xs = await x as string // tf.TextLineDataset yields strings
+    const tokenizer = await models.getTaskTokenizer(task)
+    const maxLength = task.trainingInformation.maxSequenceLength ?? tokenizer.model_max_length as number
+
+    const { input_ids: tokens } = tokenizer(xs, {
+      // Transformers.js currently only supports right padding while we need left for text generation
+      // Right padding should be supported in the future; once it is, we can directly pad while tokenizing
+      // https://github.com/xenova/transformers.js/blob/8804c36591d11d8456788d1bb4b16489121b3be2/src/tokenizers.js#L2517
+      padding: false,
+      truncation: true,
+      return_tensor: false,
+      max_length: maxLength,
+    }) as TokenizerOutput
+    return {
+      xs: tf.tensor(tokens, undefined, 'int32') // cast tokens from float to int for gpt-tfjs
+    }
   }
 }
@@ -61,5 +78,5 @@ const tokenize: PreprocessingFunction = {
  */
 export const AVAILABLE_PREPROCESSING = List.of(
   tokenize,
-  padding
+  leftPadding
 ).sortBy((e) => e.type)
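Together, the two steps turn one raw line of text into a fixed-length window of int32 token ids. A rough end-to-end sketch of the same flow outside the preprocessing framework (assuming, as the diff does, that the tokenizer exposes pad_token_id; the sample string is illustrative):

import * as tf from '@tensorflow/tfjs'
import { AutoTokenizer } from '@xenova/transformers'

const maxLength = 128
const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2')

// Tokenize and truncate, as in the `tokenize` step.
const { input_ids } = tokenizer('some raw line of wikitext', {
  padding: false,
  truncation: true,
  return_tensor: false,
  max_length: maxLength,
}) as { input_ids: number[] }

// Left-pad to maxLength, as in the `leftPadding` step.
let xs = tf.tensor1d(input_ids, 'int32')
if (xs.size < maxLength) {
  xs = xs.pad([[maxLength - xs.size, 0]], tokenizer.pad_token_id)
}
console.log(xs.shape) // [128]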
12 changes: 7 additions & 5 deletions discojs/discojs-core/src/dataset/data_loader/text_loader.ts
@@ -3,7 +3,7 @@ import type { Task } from '../../index.js'
 import type { DataSplit, Dataset } from '../index.js'
 import { TextData } from '../index.js'

-import { DataLoader } from './index.js'
+import { DataLoader, DataConfig } from './index.js'

 /**
  * Text data loader whose instantiable implementation is delegated by the platform-dependent Disco subprojects, namely,
@@ -18,13 +18,15 @@ export abstract class TextLoader<S> extends DataLoader<S> {

   abstract loadDatasetFrom (source: S): Promise<Dataset>

-  async load (source: S): Promise<Dataset> {
-    return await this.loadDatasetFrom(source)
+  async load (source: S, config?: DataConfig): Promise<Dataset> {
+    const dataset = await this.loadDatasetFrom(source)
+    // 1st arg: Stream shuffling buffer size
+    return (config?.shuffle === undefined || config?.shuffle) ? dataset.shuffle(1000, undefined, true) : dataset
   }

-  async loadAll (sources: S[]): Promise<DataSplit> {
+  async loadAll (sources: S[], config?: DataConfig): Promise<DataSplit> {
     const concatenated =
-      (await Promise.all(sources.map(async (src) => await this.load(src))))
+      (await Promise.all(sources.map(async (src) => await this.load(src, config))))
       .reduce((acc, dataset) => acc.concatenate(dataset))

     return {
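Shuffling is now on by default and streams through a fixed-size buffer rather than materializing the whole dataset, which matters for large text corpora. The semantics of the call, shown on a toy tf.data pipeline (illustrative only):

import * as tf from '@tensorflow/tfjs'

// Same arguments as in `load`: a 1000-element streaming buffer, no fixed
// seed, and reshuffling on each iteration over the dataset.
const dataset = tf.data.array([1, 2, 3, 4, 5]).shuffle(1000, undefined, true)
await dataset.forEachAsync((x) => console.log(x)) // the five values, in random order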
13 changes: 7 additions & 6 deletions discojs/discojs-core/src/default_tasks/wikitext.ts
@@ -1,5 +1,5 @@
 import type { Model, Task, TaskProvider } from '../index.js'
-import { models } from '../index.js'
+import { data, models } from '../index.js'

 export const wikitext: TaskProvider = {
   getTask (): Task {
@@ -19,17 +19,18 @@ export const wikitext: TaskProvider = {
       trainingInformation: {
         dataType: 'text',
         modelID: 'wikitext-103-raw-model',
+        preprocessingFunctions: [data.TextPreprocessing.Tokenize, data.TextPreprocessing.LeftPadding],
         validationSplit: 0.2, // TODO: is this used somewhere? because train, eval and test are already split in dataset
-        epochs: 10,
-        // constructing a batch is taken care automatically in the dataset to make things faster
-        // so we fake a batch size of 1
-        batchSize: 1,
+        epochs: 5,
         scheme: 'federated',
         noiseScale: undefined,
         decentralizedSecure: true,
         minimumReadyPeers: 3,
         maxShareValue: 100,
-        roundDuration: 10
+        roundDuration: 10,
+        batchSize: 16,
+        tokenizer: 'Xenova/gpt2',
+        maxSequenceLength: 128
       }
     }
   },
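The new `tokenizer` and `maxSequenceLength` fields are what the preprocessing functions read through `models.getTaskTokenizer`. Its implementation is not shown in this diff; a plausible sketch, assuming it resolves the task field to a cached Transformers.js tokenizer (in line with the "Load tokenizer only once during pre-processing" commit):

import { AutoTokenizer, PreTrainedTokenizer } from '@xenova/transformers'
import type { Task } from '../index.js'

const tokenizers = new Map<string, PreTrainedTokenizer>()

// Resolve a task's `tokenizer` field to a tokenizer instance,
// loading it at most once per tokenizer name.
async function getTaskTokenizerSketch (task: Task): Promise<PreTrainedTokenizer> {
  const name = task.trainingInformation.tokenizer
  if (typeof name !== 'string') {
    throw new Error('the task has no tokenizer configured')
  }
  let tokenizer = tokenizers.get(name)
  if (tokenizer === undefined) {
    tokenizer = await AutoTokenizer.from_pretrained(name)
    tokenizers.set(name, tokenizer)
  }
  return tokenizer
}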
9 changes: 7 additions & 2 deletions discojs/discojs-core/src/logging/trainer_logger.ts
@@ -41,15 +41,20 @@ export class TrainerLogger extends ConsoleLogger {
     }

     // console output
-    const msg = `Epoch: ${epoch}\nTrain: ${logs?.acc ?? 'undefined'}\nValidation:${logs?.val_acc ?? 'undefined'}\nLoss:${logs?.loss ?? 'undefined'}`
+    let msg = `Epoch: ${epoch}\n`
+    if (logs !== undefined) {
+      for (const [key, value] of Object.entries(logs)) {
+        msg += `${key}: ${value}\n`
+      }
+    }
     this.success(`On epoch end:\n${msg}\n`)
   }

   /**
    * Display ram usage
    */
   ramUsage (): void {
-    this.success(`Training RAM usage is = ${tf.memory().numBytes * 0.000001} MB`)
+    this.success(`Training RAM usage is = ${tf.memory().numBytes / 1024 / 1024} MB`)
+    this.success(`Number of allocated tensors = ${tf.memory().numTensors}`)
   }
 }
4 changes: 0 additions & 4 deletions discojs/discojs-core/src/models/gpt/config.ts
@@ -15,13 +15,11 @@ export interface ModelSize {

 export interface GPTConfig {
   lr: number
-  batchSize: number
   blockSize: number
   vocabSize: number
   evaluate?: boolean
   maxEvalBatches?: number
   evaluateEvery?: number
-  epochs?: number
   maxIter?: number
   weightDecay?: number
   verbose?: 0 | 1
@@ -38,8 +36,6 @@ export interface GPTConfig {
 export const DEFAULT_CONFIG: Required<GPTConfig> = {
   lr: 0.001,
   weightDecay: 0,
-  batchSize: 2,
-  epochs: 9999,
   maxIter: 10_000,
   verbose: 0,
   modelType: 'gpt-nano',
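With `batchSize` and `epochs` gone from `GPTConfig`, model-level knobs stay on the GPT config while training-loop knobs live on the task (see the wikitext diff above). A sketch of the resulting split, with illustrative values:

// Model-level hyperparameters remain in GPTConfig...
const gptConfig: Partial<GPTConfig> = {
  lr: 0.001,
  modelType: 'gpt-nano',
  maxIter: 10_000,
}

// ...while loop-level hyperparameters come from the task definition.
const trainingInformation = {
  batchSize: 16,
  epochs: 5,
  tokenizer: 'Xenova/gpt2',
  maxSequenceLength: 128,
}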