From 72f162917d42cf0465f4b8f153e013b163b777d4 Mon Sep 17 00:00:00 2001 From: s314cy Date: Fri, 1 Mar 2024 17:38:07 +0100 Subject: [PATCH] discojs-core/data_loader: add text --- .../src/dataset/data_loader/data_loader.ts | 2 +- .../src/dataset/data_loader/index.ts | 4 +- .../src/dataset/data_loader/text_loader.ts | 34 ++++++++++++++--- discojs/discojs-node/src/data/index.ts | 1 + discojs/discojs-node/src/data/text_loader.ts | 38 ++++++------------- discojs/discojs-web/src/data/text_loader.ts | 8 +--- 6 files changed, 46 insertions(+), 41 deletions(-) diff --git a/discojs/discojs-core/src/dataset/data_loader/data_loader.ts b/discojs/discojs-core/src/dataset/data_loader/data_loader.ts index 7dbc22b01..a4377d1e8 100644 --- a/discojs/discojs-core/src/dataset/data_loader/data_loader.ts +++ b/discojs/discojs-core/src/dataset/data_loader/data_loader.ts @@ -1,4 +1,4 @@ -import type { Dataset, DataSplit } from '..' +import type { DataSplit, Dataset } from '..' export interface DataConfig { features?: string[], labels?: string[], shuffle?: boolean, validationSplit?: number, inference?: boolean } diff --git a/discojs/discojs-core/src/dataset/data_loader/index.ts b/discojs/discojs-core/src/dataset/data_loader/index.ts index c5a886e44..39c0aa34d 100644 --- a/discojs/discojs-core/src/dataset/data_loader/index.ts +++ b/discojs/discojs-core/src/dataset/data_loader/index.ts @@ -1,4 +1,6 @@ -export { type DataConfig, DataLoader } from './data_loader' +export type { DataConfig } from './data_loader' +export { DataLoader } from './data_loader' + export { ImageLoader } from './image_loader' export { TabularLoader } from './tabular_loader' export { TextLoader } from './text_loader' diff --git a/discojs/discojs-core/src/dataset/data_loader/text_loader.ts b/discojs/discojs-core/src/dataset/data_loader/text_loader.ts index e87227f4c..d1bbd633e 100644 --- a/discojs/discojs-core/src/dataset/data_loader/text_loader.ts +++ b/discojs/discojs-core/src/dataset/data_loader/text_loader.ts @@ -1,12 +1,34 @@ -import type { Dataset } from '..' +import type { Task } from '../..' -import { TabularLoader } from './tabular_loader' +import type { DataSplit, Dataset } from '..' +import { TextData } from '..' + +import { DataLoader } from '.' /** * Text data loader whose instantiable implementation is delegated by the platform-dependent Disco subprojects, namely, - * @epfml/discojs-web and @epfml/discojs-node. Loads data from files whose entries are line-separated and each consist of - * a sentence-like sample associated to an optional label. + * @epfml/discojs-web and @epfml/discojs-node. */ -export abstract class TextLoader extends TabularLoader { - abstract loadDatasetFrom (source: Source, config: Record): Promise +export abstract class TextLoader extends DataLoader { + constructor ( + private readonly task: Task + ) { + super() + } + + abstract loadDatasetFrom (source: S): Promise + + async load (source: S): Promise { + return await this.loadDatasetFrom(source) + } + + async loadAll (sources: S[]): Promise { + const concatenated = + (await Promise.all(sources.map(async (src) => await this.load(src)))) + .reduce((acc, dataset) => acc.concatenate(dataset)) + + return { + train: await TextData.init(concatenated, this.task) + } + } } diff --git a/discojs/discojs-node/src/data/index.ts b/discojs/discojs-node/src/data/index.ts index 612a1c891..685b18afb 100644 --- a/discojs/discojs-node/src/data/index.ts +++ b/discojs/discojs-node/src/data/index.ts @@ -1,2 +1,3 @@ export { ImageLoader as NodeImageLoader } from './image_loader' export { TabularLoader as NodeTabularLoader } from './tabular_loader' +export { TextLoader as NodeTextLoader } from './text_loader' diff --git a/discojs/discojs-node/src/data/text_loader.ts b/discojs/discojs-node/src/data/text_loader.ts index 0044e07fa..9698df611 100644 --- a/discojs/discojs-node/src/data/text_loader.ts +++ b/discojs/discojs-node/src/data/text_loader.ts @@ -1,30 +1,14 @@ -// import fs from 'node:fs' +import fs from 'node:fs/promises' +import { data as tfData } from '@tensorflow/tfjs-node' -// import split2 from 'split2' +import { data } from '@epfml/discojs-core' -// import { tf } from '../..' -// import { TextLoader } from '../../core/dataset/data_loader/text_loader' -// import { Dataset } from '../../core/dataset' -// import { DataConfig } from '../../core/dataset/data_loader' +export class TextLoader extends data.TextLoader { + async loadDatasetFrom (source: string): Promise { + // TODO sure, good idea to load the whole dataset in memory #irony + const content = await fs.readFile(source) + const file = new tfData.FileDataSource(content) -// export class NodeTextLoader extends TextLoader { -// async loadDatasetFrom (source: string, config?: DataConfig): Promise { -// const prefix = 'file://' -// if (source.slice(0, 7) !== prefix) { -// source = prefix + source -// } -// // create stream being read by generator -// const stream = fs.createReadStream(source, { encoding: 'utf-8' }) -// // eslint-disable-next-line @typescript-eslint/no-this-alias -// const self = this - -// async function * dataGenerator (): AsyncGenerator { -// // TODO @s314cy -// const withLabels = config?.labels !== undefined -// stream.pipe(split2()) -// stream.on('data', (data) => yield self.tokenize(data)) -// } - -// return tf.data.generator(dataGenerator) -// } -// } + return new tfData.TextLineDataset(file) + } +} diff --git a/discojs/discojs-web/src/data/text_loader.ts b/discojs/discojs-web/src/data/text_loader.ts index 702368ca3..4fdd3c5fc 100644 --- a/discojs/discojs-web/src/data/text_loader.ts +++ b/discojs/discojs-web/src/data/text_loader.ts @@ -3,12 +3,8 @@ import tf from '@tensorflow/tfjs' import { data } from '@epfml/discojs-core' export class TextLoader extends data.TextLoader { - async loadDatasetFrom (source: File, config?: Record): Promise { + async loadDatasetFrom (source: File): Promise { const file = new tf.data.FileDataSource(source) - if (config !== undefined) { - return new tf.data.CSVDataset(file, config) - } else { - return new tf.data.TextLineDataset(file) - } + return new tf.data.TextLineDataset(file) } }