From effa9a9e175f504285db48e54dbe3d9474e705b6 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Wed, 11 Dec 2024 15:47:50 +0200
Subject: [PATCH] Refactor per-model unit testing (#1083)

* Set up per-model unit tests
* Rename tests
* Do not modify original object when updating model file name
* Distribute unit tests across separate files
* Update comments
* Update tokenization test file names
* Refactor: use asset cache
* Destructuring for code deduplication
* Remove empty file
* Rename deberta-v2 -> deberta_v2
* Rename
* Support casting between number and bigint types
* Use fp32 tiny models
* Move image processing tests to separate folders + auto-detection
---
 src/models.js | 80 +-
 src/utils/tensor.js | 15 +-
 tests/asset_cache.js | 43 +
 tests/init.js | 58 +
 tests/models.test.js | 177 +-
 ...ization.js => test_tokenization_albert.js} | 0
 tests/models/all_modeling_tests.js | 33 +
 tests/models/all_tokenization_tests.js | 44 +-
 .../models/beit/test_image_processing_beit.js | 31 +
 tests/models/bert/test_modeling_bert.js | 221 ++
 ...enization.js => test_tokenization_bert.js} | 0
 tests/models/bit/test_image_processing_bit.js | 31 +
 ... => test_tokenization_blenderbot_small.js} | 0
 tests/models/bloom/test_modeling_bloom.js | 50 +
 ...nization.js => test_tokenization_bloom.js} | 0
 .../models/clip/test_image_processing_clip.js | 33 +
 tests/models/clip/test_modeling_clip.js | 58 +
 ...enization.js => test_tokenization_clip.js} | 0
 tests/models/codegen/test_modeling_codegen.js | 51 +
 tests/models/cohere/test_modeling_cohere.js | 90 +
 .../test_image_processing_convnext.js | 32 +
 .../test_tokenization_deberta_v2.js} | 0
 .../models/deit/test_image_processing_deit.js | 31 +
 .../models/detr/test_image_processing_detr.js | 34 +
 ...ion.js => test_tokenization_distilbert.js} | 0
 .../donut/test_image_processing_donut.js | 34 +
 tests/models/dpt/test_image_processing_dpt.js | 77 +
 .../test_image_processing_efficientnet.js | 46 +
 ...kenization.js => test_tokenization_esm.js} | 0
 ...ization.js => test_tokenization_falcon.js} | 0
 .../florence2/test_modeling_florence2.js | 83 +
 tests/models/gemma/test_modeling_gemma.js | 51 +
 ...nization.js => test_tokenization_gemma.js} | 0
 tests/models/gemma2/test_modeling_gemma2.js | 51 +
 .../models/glpn/test_image_processing_glpn.js | 48 +
 tests/models/gpt2/test_modeling_gpt2.js | 51 +
 ...enization.js => test_tokenization_gpt2.js} | 0
 .../gpt_bigcode/test_modeling_gpt_bigcode.js | 51 +
 tests/models/gpt_neo/test_modeling_gpt_neo.js | 51 +
 .../models/gpt_neox/test_modeling_gpt_neox.js | 51 +
 tests/models/gptj/test_modeling_gptj.js | 51 +
 tests/models/granite/test_modeling_granite.js | 50 +
 .../test_image_processing_idefics3.js | 107 +
 .../models/idefics3/test_modeling_idefics3.js | 142 +
 tests/models/jais/test_modeling_jais.js | 51 +
 .../test_image_processing_jina_clip.js | 33 +
 tests/models/llama/test_modeling_llama.js | 85 +
 ...nization.js => test_tokenization_llama.js} | 0
 tests/models/llava/test_modeling_llava.js | 78 +
 ...zation.js => test_tokenization_m2m_100.js} | 0
 tests/models/marian/test_modeling_marian.js | 51 +
 tests/models/mistral/test_modeling_mistral.js | 50 +
 .../test_image_processing_mobilevit.js | 90 +
 ...nization.js => test_tokenization_mpnet.js} | 0
 tests/models/mpt/test_modeling_mpt.js | 51 +
 .../models/musicgen/test_modeling_musicgen.js | 61 +
 ...enization.js => test_tokenization_nllb.js} | 0
 .../nougat/test_image_processing_nougat.js | 33 +
 tests/models/olmo/test_modeling_olmo.js | 51 +
 tests/models/olmo2/test_modeling_olmo2.js | 51 +
 tests/models/opt/test_modeling_opt.js | 51 +
 .../owlvit/test_image_processing_owlvit.js | 31 +
 .../paligemma/test_modeling_paligemma.js | 52 +
 .../test_modeling_patchtsmixer.js | 65 +
 .../models/patchtst/test_modeling_patchtst.js | 65 +
 .../models/pyannote/test_modeling_pyannote.js | 56 +
 ...nization.js => test_tokenization_qwen2.js} | 0
 .../test_image_processing_qwen2_vl.js | 34 +
 .../models/qwen2_vl/test_modeling_qwen2_vl.js | 93 +
 ...zation.js => test_tokenization_roberta.js} | 0
 tests/models/sam/test_image_processing_sam.js | 95 +
 .../swin2sr/test_image_processing_swin2sr.js | 41 +
 tests/models/t5/test_modeling_t5.js | 96 +
 ...okenization.js => test_tokenization_t5.js} | 0
 .../test_modeling_vision_encoder_decoder.js | 52 +
 tests/models/vit/test_image_processing_vit.js | 31 +
 .../test_image_processing_vitmatte.js | 68 +
 ...enization.js => test_tokenization_vits.js} | 0
 ...ation.js => test_tokenization_wav2vec2.js} | 0
 tests/models/whisper/test_modeling_whisper.js | 148 +
 ...zation.js => test_tokenization_whisper.js} | 0
 .../test_tokenization_xlm_roberta.js} | 0
 .../yolos/test_image_processing_yolos.js | 31 +
 tests/processors.test.js | 953 +------
 tests/tiny_random.test.js | 2517 +----------------
 tests/utils/generation.test.js | 164 +-
 tests/utils/tensor.test.js | 43 +
 87 files changed, 3930 insertions(+), 3497 deletions(-)
 create mode 100644 tests/asset_cache.js
 rename tests/models/albert/{tokenization.js => test_tokenization_albert.js} (100%)
 create mode 100644 tests/models/all_modeling_tests.js
 create mode 100644 tests/models/beit/test_image_processing_beit.js
 create mode 100644 tests/models/bert/test_modeling_bert.js
 rename tests/models/bert/{tokenization.js => test_tokenization_bert.js} (100%)
 create mode 100644 tests/models/bit/test_image_processing_bit.js
 rename tests/models/blenderbot_small/{tokenization.js => test_tokenization_blenderbot_small.js} (100%)
 create mode 100644 tests/models/bloom/test_modeling_bloom.js
 rename tests/models/bloom/{tokenization.js => test_tokenization_bloom.js} (100%)
 create mode 100644 tests/models/clip/test_image_processing_clip.js
 create mode 100644 tests/models/clip/test_modeling_clip.js
 rename tests/models/clip/{tokenization.js => test_tokenization_clip.js} (100%)
 create mode 100644 tests/models/codegen/test_modeling_codegen.js
 create mode 100644 tests/models/cohere/test_modeling_cohere.js
 create mode 100644 tests/models/convnext/test_image_processing_convnext.js
 rename tests/models/{deberta-v2/tokenization.js => deberta_v2/test_tokenization_deberta_v2.js} (100%)
 create mode 100644 tests/models/deit/test_image_processing_deit.js
 create mode 100644 tests/models/detr/test_image_processing_detr.js
 rename tests/models/distilbert/{tokenization.js => test_tokenization_distilbert.js} (100%)
 create mode 100644 tests/models/donut/test_image_processing_donut.js
 create mode 100644 tests/models/dpt/test_image_processing_dpt.js
 create mode 100644 tests/models/efficientnet/test_image_processing_efficientnet.js
 rename tests/models/esm/{tokenization.js => test_tokenization_esm.js} (100%)
 rename tests/models/falcon/{tokenization.js => test_tokenization_falcon.js} (100%)
 create mode 100644 tests/models/florence2/test_modeling_florence2.js
 create mode 100644 tests/models/gemma/test_modeling_gemma.js
 rename tests/models/gemma/{tokenization.js => test_tokenization_gemma.js} (100%)
 create mode 100644 tests/models/gemma2/test_modeling_gemma2.js
 create mode 100644 tests/models/glpn/test_image_processing_glpn.js
 create mode 100644 tests/models/gpt2/test_modeling_gpt2.js
 rename tests/models/gpt2/{tokenization.js => test_tokenization_gpt2.js} (100%)
 create mode 100644 tests/models/gpt_bigcode/test_modeling_gpt_bigcode.js
 create mode 100644 tests/models/gpt_neo/test_modeling_gpt_neo.js
 create mode 100644 tests/models/gpt_neox/test_modeling_gpt_neox.js
 create mode 100644 tests/models/gptj/test_modeling_gptj.js
 create mode 100644 tests/models/granite/test_modeling_granite.js
 create mode 100644 tests/models/idefics3/test_image_processing_idefics3.js
 create mode 100644 tests/models/idefics3/test_modeling_idefics3.js
 create mode 100644 tests/models/jais/test_modeling_jais.js
 create mode 100644 tests/models/jina_clip/test_image_processing_jina_clip.js
 create mode 100644 tests/models/llama/test_modeling_llama.js
 rename tests/models/llama/{tokenization.js => test_tokenization_llama.js} (100%)
 create mode 100644 tests/models/llava/test_modeling_llava.js
 rename tests/models/m2m_100/{tokenization.js => test_tokenization_m2m_100.js} (100%)
 create mode 100644 tests/models/marian/test_modeling_marian.js
 create mode 100644 tests/models/mistral/test_modeling_mistral.js
 create mode 100644 tests/models/mobilevit/test_image_processing_mobilevit.js
 rename tests/models/mpnet/{tokenization.js => test_tokenization_mpnet.js} (100%)
 create mode 100644 tests/models/mpt/test_modeling_mpt.js
 create mode 100644 tests/models/musicgen/test_modeling_musicgen.js
 rename tests/models/nllb/{tokenization.js => test_tokenization_nllb.js} (100%)
 create mode 100644 tests/models/nougat/test_image_processing_nougat.js
 create mode 100644 tests/models/olmo/test_modeling_olmo.js
 create mode 100644 tests/models/olmo2/test_modeling_olmo2.js
 create mode 100644 tests/models/opt/test_modeling_opt.js
 create mode 100644 tests/models/owlvit/test_image_processing_owlvit.js
 create mode 100644 tests/models/paligemma/test_modeling_paligemma.js
 create mode 100644 tests/models/patchtsmixer/test_modeling_patchtsmixer.js
 create mode 100644 tests/models/patchtst/test_modeling_patchtst.js
 create mode 100644 tests/models/pyannote/test_modeling_pyannote.js
 rename tests/models/qwen2/{tokenization.js => test_tokenization_qwen2.js} (100%)
 create mode 100644 tests/models/qwen2_vl/test_image_processing_qwen2_vl.js
 create mode 100644 tests/models/qwen2_vl/test_modeling_qwen2_vl.js
 rename tests/models/roberta/{tokenization.js => test_tokenization_roberta.js} (100%)
 create mode 100644 tests/models/sam/test_image_processing_sam.js
 create mode 100644 tests/models/swin2sr/test_image_processing_swin2sr.js
 create mode 100644 tests/models/t5/test_modeling_t5.js
 rename tests/models/t5/{tokenization.js => test_tokenization_t5.js} (100%)
 create mode 100644 tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.js
 create mode 100644 tests/models/vit/test_image_processing_vit.js
 create mode 100644 tests/models/vitmatte/test_image_processing_vitmatte.js
 rename tests/models/vits/{tokenization.js => test_tokenization_vits.js} (100%)
 rename tests/models/wav2vec2/{tokenization.js => test_tokenization_wav2vec2.js} (100%)
 create mode 100644 tests/models/whisper/test_modeling_whisper.js
 rename tests/models/whisper/{tokenization.js => test_tokenization_whisper.js} (100%)
 rename tests/models/{xlm-roberta/tokenization.js => xlm_roberta/test_tokenization_xlm_roberta.js} (100%)
 create mode 100644 tests/models/yolos/test_image_processing_yolos.js

diff --git a/src/models.js b/src/models.js
index f8242b5f0..93d92e8c6 100644
--- a/src/models.js
+++ b/src/models.js
@@ -3666,9 +3666,11 @@ export class CLIPModel extends CLIPPreTrainedModel {
} export class CLIPTextModel extends CLIPPreTrainedModel { /** @type {typeof PreTrainedModel.from_pretrained} */ static async from_pretrained(pretrained_model_name_or_path, options = {}) { - // Update default model file name if not provided - options.model_file_name ??= 'text_model'; - return super.from_pretrained(pretrained_model_name_or_path, options); + return super.from_pretrained(pretrained_model_name_or_path, { + // Update default model file name if not provided + model_file_name: 'text_model', + ...options, + }); } } @@ -3701,9 +3703,11 @@ export class CLIPTextModel extends CLIPPreTrainedModel { export class CLIPTextModelWithProjection extends CLIPPreTrainedModel { /** @type {typeof PreTrainedModel.from_pretrained} */ static async from_pretrained(pretrained_model_name_or_path, options = {}) { - // Update default model file name if not provided - options.model_file_name ??= 'text_model'; - return super.from_pretrained(pretrained_model_name_or_path, options); + return super.from_pretrained(pretrained_model_name_or_path, { + // Update default model file name if not provided + model_file_name: 'text_model', + ...options, + }); } } @@ -3713,9 +3717,11 @@ export class CLIPTextModelWithProjection extends CLIPPreTrainedModel { export class CLIPVisionModel extends CLIPPreTrainedModel { /** @type {typeof PreTrainedModel.from_pretrained} */ static async from_pretrained(pretrained_model_name_or_path, options = {}) { - // Update default model file name if not provided - options.model_file_name ??= 'vision_model'; - return super.from_pretrained(pretrained_model_name_or_path, options); + return super.from_pretrained(pretrained_model_name_or_path, { + // Update default model file name if not provided + model_file_name: 'vision_model', + ...options, + }); } } @@ -3748,9 +3754,11 @@ export class CLIPVisionModel extends CLIPPreTrainedModel { export class CLIPVisionModelWithProjection extends CLIPPreTrainedModel { /** @type {typeof PreTrainedModel.from_pretrained} */ static async from_pretrained(pretrained_model_name_or_path, options = {}) { - // Update default model file name if not provided - options.model_file_name ??= 'vision_model'; - return super.from_pretrained(pretrained_model_name_or_path, options); + return super.from_pretrained(pretrained_model_name_or_path, { + // Update default model file name if not provided + model_file_name: 'vision_model', + ...options, + }); } } ////////////////////////////////////////////////// @@ -3834,9 +3842,11 @@ export class SiglipModel extends SiglipPreTrainedModel { } export class SiglipTextModel extends SiglipPreTrainedModel { /** @type {typeof PreTrainedModel.from_pretrained} */ static async from_pretrained(pretrained_model_name_or_path, options = {}) { - // Update default model file name if not provided - options.model_file_name ??= 'text_model'; - return super.from_pretrained(pretrained_model_name_or_path, options); + return super.from_pretrained(pretrained_model_name_or_path, { + // Update default model file name if not provided + model_file_name: 'text_model', + ...options, + }); } } @@ -3869,9 +3879,11 @@ export class SiglipTextModel extends SiglipPreTrainedModel { export class SiglipVisionModel extends CLIPPreTrainedModel { /** @type {typeof PreTrainedModel.from_pretrained} */ static async from_pretrained(pretrained_model_name_or_path, options = {}) { - // Update default model file name if not provided - options.model_file_name ??= 'vision_model'; - return super.from_pretrained(pretrained_model_name_or_path, options); + return 
super.from_pretrained(pretrained_model_name_or_path, { + // Update default model file name if not provided + model_file_name: 'vision_model', + ...options, + }); } } ////////////////////////////////////////////////// @@ -3926,18 +3938,22 @@ export class JinaCLIPModel extends JinaCLIPPreTrainedModel { export class JinaCLIPTextModel extends JinaCLIPPreTrainedModel { /** @type {typeof PreTrainedModel.from_pretrained} */ static async from_pretrained(pretrained_model_name_or_path, options = {}) { - // Update default model file name if not provided - options.model_file_name ??= 'text_model'; - return super.from_pretrained(pretrained_model_name_or_path, options); + return super.from_pretrained(pretrained_model_name_or_path, { + // Update default model file name if not provided + model_file_name: 'text_model', + ...options, + }); } } export class JinaCLIPVisionModel extends JinaCLIPPreTrainedModel { /** @type {typeof PreTrainedModel.from_pretrained} */ static async from_pretrained(pretrained_model_name_or_path, options = {}) { - // Update default model file name if not provided - options.model_file_name ??= 'vision_model'; - return super.from_pretrained(pretrained_model_name_or_path, options); + return super.from_pretrained(pretrained_model_name_or_path, { + // Update default model file name if not provided + model_file_name: 'vision_model', + ...options, + }); } } ////////////////////////////////////////////////// @@ -6159,9 +6175,11 @@ export class ClapModel extends ClapPreTrainedModel { } export class ClapTextModelWithProjection extends ClapPreTrainedModel { /** @type {typeof PreTrainedModel.from_pretrained} */ static async from_pretrained(pretrained_model_name_or_path, options = {}) { - // Update default model file name if not provided - options.model_file_name ??= 'text_model'; - return super.from_pretrained(pretrained_model_name_or_path, options); + return super.from_pretrained(pretrained_model_name_or_path, { + // Update default model file name if not provided + model_file_name: 'text_model', + ...options, + }); } } @@ -6194,9 +6212,11 @@ export class ClapTextModelWithProjection extends ClapPreTrainedModel { export class ClapAudioModelWithProjection extends ClapPreTrainedModel { /** @type {typeof PreTrainedModel.from_pretrained} */ static async from_pretrained(pretrained_model_name_or_path, options = {}) { - // Update default model file name if not provided - options.model_file_name ??= 'audio_model'; - return super.from_pretrained(pretrained_model_name_or_path, options); + return super.from_pretrained(pretrained_model_name_or_path, { + // Update default model file name if not provided + model_file_name: 'audio_model', + ...options, + }); } } ////////////////////////////////////////////////// diff --git a/src/utils/tensor.js b/src/utils/tensor.js index 8b8133770..6bdfd20a3 100644 --- a/src/utils/tensor.js +++ b/src/utils/tensor.js @@ -772,8 +772,21 @@ export class Tensor { if (!DataTypeMap.hasOwnProperty(type)) { throw new Error(`Unsupported type: ${type}`); } + + // Handle special cases where a mapping function is needed (e.g., where one type is a bigint and the other is a number) + let map_fn; + const is_source_bigint = ['int64', 'uint64'].includes(this.type); + const is_dest_bigint = ['int64', 'uint64'].includes(type); + if (is_source_bigint && !is_dest_bigint) { + // TypeError: Cannot convert a BigInt value to a number + map_fn = Number; + } else if (!is_source_bigint && is_dest_bigint) { + // TypeError: Cannot convert [x] to a BigInt + map_fn = BigInt; + } + // @ts-ignore - return new 
Tensor(type, DataTypeMap[type].from(this.data), this.dims); + return new Tensor(type, DataTypeMap[type].from(this.data, map_fn), this.dims); } } diff --git a/tests/asset_cache.js b/tests/asset_cache.js new file mode 100644 index 000000000..8d62fb6bf --- /dev/null +++ b/tests/asset_cache.js @@ -0,0 +1,43 @@ +import { RawImage } from "../src/transformers.js"; + +const BASE_URL = "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/"; +const TEST_IMAGES = Object.freeze({ + white_image: BASE_URL + "white-image.png", + pattern_3x3: BASE_URL + "pattern_3x3.png", + pattern_3x5: BASE_URL + "pattern_3x5.png", + checkerboard_8x8: BASE_URL + "checkerboard_8x8.png", + checkerboard_64x32: BASE_URL + "checkerboard_64x32.png", + gradient_1280x640: BASE_URL + "gradient_1280x640.png", + receipt: BASE_URL + "receipt.png", + tiger: BASE_URL + "tiger.jpg", + paper: BASE_URL + "nougat_paper.png", + cats: BASE_URL + "cats.jpg", + + // grayscale image + skateboard: "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/ml-web-games/skateboard.png", + + vitmatte_image: BASE_URL + "vitmatte_image.png", + vitmatte_trimap: BASE_URL + "vitmatte_trimap.png", + + beetle: BASE_URL + "beetle.png", + book_cover: BASE_URL + "book-cover.png", +}); + +/** @type {Map} */ +const IMAGE_CACHE = new Map(); +const load_image = async (url) => { + const cached = IMAGE_CACHE.get(url); + if (cached) { + return cached; + } + const image = await RawImage.fromURL(url); + IMAGE_CACHE.set(url, image); + return image; +}; + +/** + * Load a cached image. + * @param {keyof typeof TEST_IMAGES} name The name of the image to load. + * @returns {Promise} The loaded image. + */ +export const load_cached_image = (name) => load_image(TEST_IMAGES[name]); diff --git a/tests/init.js b/tests/init.js index a52fe2cf2..29097eb47 100644 --- a/tests/init.js +++ b/tests/init.js @@ -57,8 +57,66 @@ export function init() { registerBackend("test", onnxruntimeBackend, Number.POSITIVE_INFINITY); } +export const MAX_PROCESSOR_LOAD_TIME = 10_000; // 10 seconds export const MAX_MODEL_LOAD_TIME = 15_000; // 15 seconds export const MAX_TEST_EXECUTION_TIME = 60_000; // 60 seconds export const MAX_MODEL_DISPOSE_TIME = 1_000; // 1 second export const MAX_TEST_TIME = MAX_MODEL_LOAD_TIME + MAX_TEST_EXECUTION_TIME + MAX_MODEL_DISPOSE_TIME; + +export const DEFAULT_MODEL_OPTIONS = { + dtype: "fp32", +}; + +expect.extend({ + toBeCloseToNested(received, expected, numDigits = 2) { + const compare = (received, expected, path = "") => { + if (typeof received === "number" && typeof expected === "number" && !Number.isInteger(received) && !Number.isInteger(expected)) { + const pass = Math.abs(received - expected) < Math.pow(10, -numDigits); + return { + pass, + message: () => (pass ? `✓ At path '${path}': expected ${received} not to be close to ${expected} with tolerance of ${numDigits} decimal places` : `✗ At path '${path}': expected ${received} to be close to ${expected} with tolerance of ${numDigits} decimal places`), + }; + } else if (Array.isArray(received) && Array.isArray(expected)) { + if (received.length !== expected.length) { + return { + pass: false, + message: () => `✗ At path '${path}': array lengths differ. 
Received length ${received.length}, expected length ${expected.length}`, + }; + } + for (let i = 0; i < received.length; i++) { + const result = compare(received[i], expected[i], `${path}[${i}]`); + if (!result.pass) return result; + } + } else if (typeof received === "object" && typeof expected === "object" && received !== null && expected !== null) { + const receivedKeys = Object.keys(received); + const expectedKeys = Object.keys(expected); + if (receivedKeys.length !== expectedKeys.length) { + return { + pass: false, + message: () => `✗ At path '${path}': object keys length differ. Received keys: ${JSON.stringify(receivedKeys)}, expected keys: ${JSON.stringify(expectedKeys)}`, + }; + } + for (const key of receivedKeys) { + if (!expected.hasOwnProperty(key)) { + return { + pass: false, + message: () => `✗ At path '${path}': key '${key}' found in received but not in expected`, + }; + } + const result = compare(received[key], expected[key], `${path}.${key}`); + if (!result.pass) return result; + } + } else { + const pass = received === expected; + return { + pass, + message: () => (pass ? `✓ At path '${path}': expected ${JSON.stringify(received)} not to equal ${JSON.stringify(expected)}` : `✗ At path '${path}': expected ${JSON.stringify(received)} to equal ${JSON.stringify(expected)}`), + }; + } + return { pass: true }; + }; + + return compare(received, expected); + }, +}); diff --git a/tests/models.test.js b/tests/models.test.js index f1bc7961c..a668baee4 100644 --- a/tests/models.test.js +++ b/tests/models.test.js @@ -2,7 +2,9 @@ * Test that models loaded outside of the `pipeline` function work correctly (e.g., `AutoModel.from_pretrained(...)`); */ -import { AutoTokenizer, AutoModel, AutoProcessor, BertModel, GPT2Model, T5ForConditionalGeneration, CLIPTextModelWithProjection, CLIPVisionModelWithProjection, BertTokenizer, GPT2Tokenizer, T5Tokenizer, RawImage } from "../src/transformers.js"; +import * as MODEL_TESTS from "./models/all_modeling_tests.js"; + +import { AutoTokenizer, AutoModel, BertModel, GPT2Model, T5ForConditionalGeneration, BertTokenizer, GPT2Tokenizer, T5Tokenizer } from "../src/transformers.js"; import { init, MAX_TEST_EXECUTION_TIME } from "./init.js"; @@ -11,120 +13,69 @@ import { compare } from "./test_utils.js"; // Initialise the testing environment init(); -describe("Models", () => { - describe("Loading different architecture types", () => { - // List all models which will be tested - const models_to_test = [ - // [name, modelClass, tokenizerClass] - ["hf-internal-testing/tiny-random-BertForMaskedLM", BertModel, BertTokenizer], // Encoder-only - ["hf-internal-testing/tiny-random-GPT2LMHeadModel", GPT2Model, GPT2Tokenizer], // Decoder-only - ["hf-internal-testing/tiny-random-T5ForConditionalGeneration", T5ForConditionalGeneration, T5Tokenizer], // Encoder-decoder - ]; - - const texts = ["Once upon a time", "I like to eat apples"]; - - for (const [model_id, modelClass, tokenizerClass] of models_to_test) { - // Test that both the auto model and the specific model work - const tokenizers = [AutoTokenizer, tokenizerClass]; - const models = [AutoModel, modelClass]; - - for (let i = 0; i < tokenizers.length; ++i) { - const tokenizerClassToTest = tokenizers[i]; - const modelClassToTest = models[i]; - - it( - `${model_id} (${modelClassToTest.name})`, - async () => { - // Load model and tokenizer - const tokenizer = await tokenizerClassToTest.from_pretrained(model_id); - const model = await modelClassToTest.from_pretrained(model_id); - - const tests = [ - texts[0], // single 
- texts, // batched - ]; - for (const test of tests) { - const inputs = await tokenizer(test, { truncation: true, padding: true }); - if (model.config.is_encoder_decoder) { - inputs.decoder_input_ids = inputs.input_ids; - } - const output = await model(inputs); - - if (output.logits) { - // Ensure correct shapes - const expected_shape = [...inputs.input_ids.dims, model.config.vocab_size]; - const actual_shape = output.logits.dims; - compare(expected_shape, actual_shape); - } else if (output.last_hidden_state) { - const expected_shape = [...inputs.input_ids.dims, model.config.d_model]; - const actual_shape = output.last_hidden_state.dims; - compare(expected_shape, actual_shape); - } else { - console.warn("Unexpected output", output); - throw new Error("Unexpected output"); - } +describe("Loading different architecture types", () => { + // List all models which will be tested + const models_to_test = [ + // [name, modelClass, tokenizerClass] + ["hf-internal-testing/tiny-random-BertForMaskedLM", BertModel, BertTokenizer], // Encoder-only + ["hf-internal-testing/tiny-random-GPT2LMHeadModel", GPT2Model, GPT2Tokenizer], // Decoder-only + ["hf-internal-testing/tiny-random-T5ForConditionalGeneration", T5ForConditionalGeneration, T5Tokenizer], // Encoder-decoder + ]; + + const texts = ["Once upon a time", "I like to eat apples"]; + + for (const [model_id, modelClass, tokenizerClass] of models_to_test) { + // Test that both the auto model and the specific model work + const tokenizers = [AutoTokenizer, tokenizerClass]; + const models = [AutoModel, modelClass]; + + for (let i = 0; i < tokenizers.length; ++i) { + const tokenizerClassToTest = tokenizers[i]; + const modelClassToTest = models[i]; + + it( + `${model_id} (${modelClassToTest.name})`, + async () => { + // Load model and tokenizer + const tokenizer = await tokenizerClassToTest.from_pretrained(model_id); + const model = await modelClassToTest.from_pretrained(model_id, { dtype: "fp32" }); + + const tests = [ + texts[0], // single + texts, // batched + ]; + for (const test of tests) { + const inputs = await tokenizer(test, { truncation: true, padding: true }); + if (model.config.is_encoder_decoder) { + inputs.decoder_input_ids = inputs.input_ids; + } + const output = await model(inputs); + + if (output.logits) { + // Ensure correct shapes + const expected_shape = [...inputs.input_ids.dims, model.config.vocab_size]; + const actual_shape = output.logits.dims; + compare(expected_shape, actual_shape); + } else if (output.last_hidden_state) { + const expected_shape = [...inputs.input_ids.dims, model.config.d_model]; + const actual_shape = output.last_hidden_state.dims; + compare(expected_shape, actual_shape); + } else { + console.warn("Unexpected output", output); + throw new Error("Unexpected output"); } + } - await model.dispose(); - }, - MAX_TEST_EXECUTION_TIME, - ); - } + await model.dispose(); + }, + MAX_TEST_EXECUTION_TIME, + ); } - }); - - describe("Running specific models", () => { - const models_to_test = ["hf-internal-testing/tiny-random-CLIPModel"]; - it( - `CLIP (text)`, - async () => { - const model_id = models_to_test[0]; - - // Load tokenizer and text model - const tokenizer = await AutoTokenizer.from_pretrained(model_id); - const text_model = await CLIPTextModelWithProjection.from_pretrained(model_id, { revision: "refs/pr/5" }); - - // Run tokenization - const texts = ["a photo of a car", "a photo of a football match"]; - const text_inputs = tokenizer(texts, { padding: true, truncation: true }); - - // Compute embeddings - const { 
text_embeds } = await text_model(text_inputs); - - // Ensure correct shapes - const expected_shape = [texts.length, text_model.config.projection_dim]; - const actual_shape = text_embeds.dims; - compare(expected_shape, actual_shape); - - await text_model.dispose(); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - `CLIP (vision)`, - async () => { - const model_id = models_to_test[0]; - - // Load processor and vision model - const processor = await AutoProcessor.from_pretrained(model_id); - const vision_model = await CLIPVisionModelWithProjection.from_pretrained(model_id, { revision: "refs/pr/5" }); - - // Read image and run processor - const image = await RawImage.read("https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg"); - const image_inputs = await processor(image); - - // Compute embeddings - const { image_embeds } = await vision_model(image_inputs); - - // Ensure correct shapes - const expected_shape = [1, vision_model.config.projection_dim]; - const actual_shape = image_embeds.dims; - compare(expected_shape, actual_shape); + } +}); - await vision_model.dispose(); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); +describe("Model-specific tests", () => { + for (const [modelName, modelTest] of Object.entries(MODEL_TESTS)) { + describe(modelName, modelTest); + } }); diff --git a/tests/models/albert/tokenization.js b/tests/models/albert/test_tokenization_albert.js similarity index 100% rename from tests/models/albert/tokenization.js rename to tests/models/albert/test_tokenization_albert.js diff --git a/tests/models/all_modeling_tests.js b/tests/models/all_modeling_tests.js new file mode 100644 index 000000000..0f64ec581 --- /dev/null +++ b/tests/models/all_modeling_tests.js @@ -0,0 +1,33 @@ +export { default as bert } from "./bert/test_modeling_bert.js"; +export { default as bloom } from "./bloom/test_modeling_bloom.js"; +export { default as clip } from "./clip/test_modeling_clip.js"; +export { default as codegen } from "./codegen/test_modeling_codegen.js"; +export { default as cohere } from "./cohere/test_modeling_cohere.js"; +export { default as florence2 } from "./florence2/test_modeling_florence2.js"; +export { default as gemma } from "./gemma/test_modeling_gemma.js"; +export { default as gemma2 } from "./gemma2/test_modeling_gemma2.js"; +export { default as gpt2 } from "./gpt2/test_modeling_gpt2.js"; +export { default as gpt_bigcode } from "./gpt_bigcode/test_modeling_gpt_bigcode.js"; +export { default as gpt_neo } from "./gpt_neo/test_modeling_gpt_neo.js"; +export { default as gpt_neox } from "./gpt_neox/test_modeling_gpt_neox.js"; +export { default as gptj } from "./gptj/test_modeling_gptj.js"; +export { default as granite } from "./granite/test_modeling_granite.js"; +export { default as idefics3 } from "./idefics3/test_modeling_idefics3.js"; +export { default as jais } from "./jais/test_modeling_jais.js"; +export { default as llama } from "./llama/test_modeling_llama.js"; +export { default as llava } from "./llava/test_modeling_llava.js"; +export { default as marian } from "./marian/test_modeling_marian.js"; +export { default as mistral } from "./mistral/test_modeling_mistral.js"; +export { default as mpt } from "./mpt/test_modeling_mpt.js"; +export { default as musicgen } from "./musicgen/test_modeling_musicgen.js"; +export { default as olmo } from "./olmo/test_modeling_olmo.js"; +export { default as olmo2 } from "./olmo2/test_modeling_olmo2.js"; +export { default as opt } from "./opt/test_modeling_opt.js"; +export { default as paligemma } from 
"./paligemma/test_modeling_paligemma.js"; +export { default as patchtsmixer } from "./patchtsmixer/test_modeling_patchtsmixer.js"; +export { default as patchtst } from "./patchtst/test_modeling_patchtst.js"; +export { default as pyannote } from "./pyannote/test_modeling_pyannote.js"; +export { default as qwen2_vl } from "./qwen2_vl/test_modeling_qwen2_vl.js"; +export { default as t5 } from "./t5/test_modeling_t5.js"; +export { default as vision_encoder_decoder } from "./vision_encoder_decoder/test_modeling_vision_encoder_decoder.js"; +export { default as whisper } from "./whisper/test_modeling_whisper.js"; diff --git a/tests/models/all_tokenization_tests.js b/tests/models/all_tokenization_tests.js index 00ec6d639..b9bac9d1f 100644 --- a/tests/models/all_tokenization_tests.js +++ b/tests/models/all_tokenization_tests.js @@ -1,22 +1,22 @@ -export * as AlbertTokenizer from "./albert/tokenization.js"; -export * as BertTokenizer from "./bert/tokenization.js"; -export * as BlenderbotSmallTokenizer from "./blenderbot_small/tokenization.js"; -export * as BloomTokenizer from "./bloom/tokenization.js"; -export * as CLIPTokenizer from "./clip/tokenization.js"; -export * as DebertaV2Tokenizer from "./deberta-v2/tokenization.js"; -export * as DistilBertTokenizer from "./distilbert/tokenization.js"; -export * as EsmTokenizer from "./esm/tokenization.js"; -export * as FalconTokenizer from "./falcon/tokenization.js"; -export * as GPT2Tokenizer from "./gpt2/tokenization.js"; -export * as GemmaTokenizer from "./gemma/tokenization.js"; -export * as LlamaTokenizer from "./llama/tokenization.js"; -export * as M2M100Tokenizer from "./m2m_100/tokenization.js"; -export * as MPNetTokenizer from "./mpnet/tokenization.js"; -export * as NllbTokenizer from "./nllb/tokenization.js"; -export * as Qwen2Tokenizer from "./qwen2/tokenization.js"; -export * as RobertaTokenizer from "./roberta/tokenization.js"; -export * as T5Tokenizer from "./t5/tokenization.js"; -export * as VitsTokenizer from "./vits/tokenization.js"; -export * as Wav2Vec2CTCTokenizer from "./wav2vec2/tokenization.js"; -export * as WhisperTokenizer from "./whisper/tokenization.js"; -export * as XLMRobertaTokenizer from "./xlm-roberta/tokenization.js"; +export * as AlbertTokenizer from "./albert/test_tokenization_albert.js"; +export * as BertTokenizer from "./bert/test_tokenization_bert.js"; +export * as BlenderbotSmallTokenizer from "./blenderbot_small/test_tokenization_blenderbot_small.js"; +export * as BloomTokenizer from "./bloom/test_tokenization_bloom.js"; +export * as CLIPTokenizer from "./clip/test_tokenization_clip.js"; +export * as DebertaV2Tokenizer from "./deberta_v2/test_tokenization_deberta_v2.js"; +export * as DistilBertTokenizer from "./distilbert/test_tokenization_distilbert.js"; +export * as EsmTokenizer from "./esm/test_tokenization_esm.js"; +export * as FalconTokenizer from "./falcon/test_tokenization_falcon.js"; +export * as GPT2Tokenizer from "./gpt2/test_tokenization_gpt2.js"; +export * as GemmaTokenizer from "./gemma/test_tokenization_gemma.js"; +export * as LlamaTokenizer from "./llama/test_tokenization_llama.js"; +export * as M2M100Tokenizer from "./m2m_100/test_tokenization_m2m_100.js"; +export * as MPNetTokenizer from "./mpnet/test_tokenization_mpnet.js"; +export * as NllbTokenizer from "./nllb/test_tokenization_nllb.js"; +export * as Qwen2Tokenizer from "./qwen2/test_tokenization_qwen2.js"; +export * as RobertaTokenizer from "./roberta/test_tokenization_roberta.js"; +export * as T5Tokenizer from "./t5/test_tokenization_t5.js"; 
+export * as VitsTokenizer from "./vits/test_tokenization_vits.js"; +export * as Wav2Vec2CTCTokenizer from "./wav2vec2/test_tokenization_wav2vec2.js"; +export * as WhisperTokenizer from "./whisper/test_tokenization_whisper.js"; +export * as XLMRobertaTokenizer from "./xlm_roberta/test_tokenization_xlm_roberta.js"; diff --git a/tests/models/beit/test_image_processing_beit.js b/tests/models/beit/test_image_processing_beit.js new file mode 100644 index 000000000..065f6cbae --- /dev/null +++ b/tests/models/beit/test_image_processing_beit.js @@ -0,0 +1,31 @@ +import { AutoImageProcessor, BeitFeatureExtractor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + describe("BeitFeatureExtractor", () => { + const model_id = "Xenova/beit-base-patch16-224-pt22k-ft22k"; + + /** @type {BeitFeatureExtractor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "default", + async () => { + const image = await load_cached_image("tiger"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 3, 224, 224]); + expect(pixel_values.mean().item()).toBeCloseTo(-0.22706867939852762, 6); + + expect(original_sizes).toEqual([[408, 612]]); + expect(reshaped_input_sizes).toEqual([[224, 224]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/bert/test_modeling_bert.js b/tests/models/bert/test_modeling_bert.js new file mode 100644 index 000000000..46a281d31 --- /dev/null +++ b/tests/models/bert/test_modeling_bert.js @@ -0,0 +1,221 @@ +import { BertTokenizer, BertModel, BertForMaskedLM, BertForSequenceClassification, BertForTokenClassification, BertForQuestionAnswering } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("BertModel", () => { + const model_id = "hf-internal-testing/tiny-random-BertModel"; + + /** @type {BertModel} */ + let model; + /** @type {BertTokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await BertModel.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await BertTokenizer.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const { last_hidden_state } = await model(inputs); + expect(last_hidden_state.dims).toEqual([1, 7, 32]); + expect(last_hidden_state.mean().item()).toBeCloseTo(0.0, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const { last_hidden_state } = await model(inputs); + expect(last_hidden_state.dims).toEqual([2, 12, 32]); + expect(last_hidden_state.mean().item()).toBeCloseTo(1.4901161193847656e-8, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + + describe("BertForMaskedLM", () => { + const model_id = "hf-internal-testing/tiny-random-BertForMaskedLM"; + + const texts = ["The goal of life is [MASK].", "Paris is the [MASK] of France."]; + + /** @type {BertForMaskedLM} */ + let model; + /** @type {BertTokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await 
BertForMaskedLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await BertTokenizer.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer(texts[0]); + const { logits } = await model(inputs); + expect(logits.dims).toEqual([1, 19, 1124]); + expect(logits.mean().item()).toBeCloseTo(0.0016587056452408433, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(texts, { padding: true }); + const { logits } = await model(inputs); + expect(logits.dims).toEqual([2, 22, 1124]); + expect(logits.mean().item()).toBeCloseTo(0.0017160633578896523, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + + describe("BertForSequenceClassification", () => { + const model_id = "hf-internal-testing/tiny-random-BertForSequenceClassification"; + + /** @type {BertForSequenceClassification} */ + let model; + /** @type {BertTokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await BertForSequenceClassification.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await BertTokenizer.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const { logits } = await model(inputs); + const target = [[0.00043986947275698185, -0.030218850821256638]].flat(); + expect(logits.dims).toEqual([1, 2]); + logits + .tolist() + .flat() + .forEach((item, i) => { + expect(item).toBeCloseTo(target[i], 5); + }); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const { logits } = await model(inputs); + const target = [ + [0.00043986947275698185, -0.030218850821256638], + [0.0003853091038763523, -0.03022204339504242], + ].flat(); + expect(logits.dims).toEqual([2, 2]); + logits + .tolist() + .flat() + .forEach((item, i) => { + expect(item).toBeCloseTo(target[i], 5); + }); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + + describe("BertForTokenClassification", () => { + const model_id = "hf-internal-testing/tiny-random-BertForTokenClassification"; + + /** @type {BertForTokenClassification} */ + let model; + /** @type {BertTokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await BertForTokenClassification.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await BertTokenizer.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const { logits } = await model(inputs); + expect(logits.dims).toEqual([1, 7, 2]); + expect(logits.mean().item()).toBeCloseTo(0.07089076191186905, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const { logits } = await model(inputs); + expect(logits.dims).toEqual([2, 12, 2]); + expect(logits.mean().item()).toBeCloseTo(0.04702216014266014, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + + describe("BertForQuestionAnswering", () => { + const model_id = "hf-internal-testing/tiny-random-BertForQuestionAnswering"; + + /** @type {BertForQuestionAnswering} */ + let model; + /** @type {BertTokenizer} */ + let tokenizer; + 
beforeAll(async () => { + model = await BertForQuestionAnswering.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await BertTokenizer.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const { start_logits, end_logits } = await model(inputs); + expect(start_logits.dims).toEqual([1, 7]); + expect(start_logits.mean().item()).toBeCloseTo(0.12772157788276672, 5); + expect(end_logits.dims).toEqual([1, 7]); + expect(end_logits.mean().item()).toBeCloseTo(0.11811424791812897, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const { start_logits, end_logits } = await model(inputs); + expect(start_logits.dims).toEqual([2, 12]); + expect(start_logits.mean().item()).toBeCloseTo(0.12843115627765656, 5); + expect(end_logits.dims).toEqual([2, 12]); + expect(end_logits.mean().item()).toBeCloseTo(0.11745202541351318, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/bert/tokenization.js b/tests/models/bert/test_tokenization_bert.js similarity index 100% rename from tests/models/bert/tokenization.js rename to tests/models/bert/test_tokenization_bert.js diff --git a/tests/models/bit/test_image_processing_bit.js b/tests/models/bit/test_image_processing_bit.js new file mode 100644 index 000000000..1c4fdccd0 --- /dev/null +++ b/tests/models/bit/test_image_processing_bit.js @@ -0,0 +1,31 @@ +import { AutoImageProcessor, BitImageProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + describe("BitImageProcessor", () => { + const model_id = "Xenova/dinov2-small-imagenet1k-1-layer"; + + /** @type {BitImageProcessor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "default", + async () => { + const image = await load_cached_image("tiger"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 3, 224, 224]); + expect(pixel_values.mean().item()).toBeCloseTo(0.06262318789958954, 3); + + expect(original_sizes).toEqual([[408, 612]]); + expect(reshaped_input_sizes).toEqual([[224, 224]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/blenderbot_small/tokenization.js b/tests/models/blenderbot_small/test_tokenization_blenderbot_small.js similarity index 100% rename from tests/models/blenderbot_small/tokenization.js rename to tests/models/blenderbot_small/test_tokenization_blenderbot_small.js diff --git a/tests/models/bloom/test_modeling_bloom.js b/tests/models/bloom/test_modeling_bloom.js new file mode 100644 index 000000000..5eec189ca --- /dev/null +++ b/tests/models/bloom/test_modeling_bloom.js @@ -0,0 +1,50 @@ +import { BloomTokenizer, BloomForCausalLM } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("BloomForCausalLM", () => { + const model_id = "hf-internal-testing/tiny-random-BloomForCausalLM"; + /** @type {BloomForCausalLM} */ + let model; + /** @type {BloomTokenizer} */ + let tokenizer; 
+ beforeAll(async () => { + model = await BloomForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await BloomTokenizer.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([[198n, 803n, 82n, 82n, 82n, 82n, 82n, 82n, 82n, 82n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([ + [3n, 3n, 198n, 803n, 82n, 82n, 82n, 82n, 82n, 82n], + [198n, 803n, 82n, 209n, 753n, 753n, 753n, 753n, 753n, 753n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/bloom/tokenization.js b/tests/models/bloom/test_tokenization_bloom.js similarity index 100% rename from tests/models/bloom/tokenization.js rename to tests/models/bloom/test_tokenization_bloom.js diff --git a/tests/models/clip/test_image_processing_clip.js b/tests/models/clip/test_image_processing_clip.js new file mode 100644 index 000000000..cafa71623 --- /dev/null +++ b/tests/models/clip/test_image_processing_clip.js @@ -0,0 +1,33 @@ +import { AutoImageProcessor, CLIPFeatureExtractor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // CLIPFeatureExtractor + // - tests center crop (do_center_crop=true, crop_size=224) + describe("CLIPFeatureExtractor", () => { + const model_id = "Xenova/clip-vit-base-patch16"; + + /** @type {CLIPFeatureExtractor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "default", + async () => { + const image = await load_cached_image("tiger"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 3, 224, 224]); + expect(pixel_values.mean().item()).toBeCloseTo(-0.06678297738282096, 6); + + expect(original_sizes).toEqual([[408, 612]]); + expect(reshaped_input_sizes).toEqual([[224, 224]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/clip/test_modeling_clip.js b/tests/models/clip/test_modeling_clip.js new file mode 100644 index 000000000..433517f7d --- /dev/null +++ b/tests/models/clip/test_modeling_clip.js @@ -0,0 +1,58 @@ +import { AutoTokenizer, AutoProcessor, load_image, CLIPVisionModelWithProjection, CLIPTextModelWithProjection } from "../../../src/transformers.js"; + +import { MAX_TEST_EXECUTION_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + const models_to_test = ["hf-internal-testing/tiny-random-CLIPModel"]; + it( + `CLIPTextModelWithProjection`, + async () => { + const model_id = models_to_test[0]; + + // Load tokenizer and text model + const tokenizer = await AutoTokenizer.from_pretrained(model_id); + const text_model = await CLIPTextModelWithProjection.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + + // Run tokenization + const texts = ["a photo of a car", "a photo of a football match"]; + const text_inputs = tokenizer(texts, { padding: true, truncation: true }); + + // Compute 
embeddings + const { text_embeds } = await text_model(text_inputs); + + // Ensure correct shapes + const expected_shape = [texts.length, text_model.config.projection_dim]; + const actual_shape = text_embeds.dims; + expect(expected_shape).toEqual(actual_shape); + + await text_model.dispose(); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + `CLIPVisionModelWithProjection`, + async () => { + const model_id = models_to_test[0]; + + // Load processor and vision model + const processor = await AutoProcessor.from_pretrained(model_id); + const vision_model = await CLIPVisionModelWithProjection.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + + // Read image and run processor + const image = await load_image("https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg"); + const image_inputs = await processor(image); + + // Compute embeddings + const { image_embeds } = await vision_model(image_inputs); + + // Ensure correct shapes + const expected_shape = [1, vision_model.config.projection_dim]; + const actual_shape = image_embeds.dims; + expect(expected_shape).toEqual(actual_shape); + + await vision_model.dispose(); + }, + MAX_TEST_EXECUTION_TIME, + ); +}; diff --git a/tests/models/clip/tokenization.js b/tests/models/clip/test_tokenization_clip.js similarity index 100% rename from tests/models/clip/tokenization.js rename to tests/models/clip/test_tokenization_clip.js diff --git a/tests/models/codegen/test_modeling_codegen.js b/tests/models/codegen/test_modeling_codegen.js new file mode 100644 index 000000000..d8044e556 --- /dev/null +++ b/tests/models/codegen/test_modeling_codegen.js @@ -0,0 +1,51 @@ +import { CodeGenTokenizer, CodeGenForCausalLM } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("CodeGenForCausalLM", () => { + const model_id = "hf-internal-testing/tiny-random-CodeGenForCausalLM"; + /** @type {CodeGenForCausalLM} */ + let model; + /** @type {CodeGenTokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await CodeGenForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await CodeGenTokenizer.from_pretrained(model_id); + tokenizer.padding_side = "left"; + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([[258n, 863n, 79n, 437n, 334n, 450n, 294n, 621n, 375n, 385n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([ + [0n, 0n, 258n, 863n, 79n, 437n, 334n, 450n, 294n, 621n], + [258n, 863n, 79n, 269n, 813n, 759n, 113n, 295n, 574n, 987n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/cohere/test_modeling_cohere.js b/tests/models/cohere/test_modeling_cohere.js new file mode 100644 index 000000000..b3ff8631b --- /dev/null +++ b/tests/models/cohere/test_modeling_cohere.js @@ -0,0 +1,90 @@ +import { CohereTokenizer, CohereModel, CohereForCausalLM } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, 
DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("CohereModel", () => { + const model_id = "hf-internal-testing/tiny-random-CohereModel"; + /** @type {CohereModel} */ + let model; + /** @type {CohereTokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await CohereModel.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await CohereTokenizer.from_pretrained(model_id); + tokenizer.padding_side = "left"; + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const { last_hidden_state } = await model(inputs); + expect(last_hidden_state.dims).toEqual([1, 4, 32]); + expect(last_hidden_state.mean().item()).toBeCloseTo(0.0, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const { last_hidden_state } = await model(inputs); + expect(last_hidden_state.dims).toEqual([2, 6, 32]); + expect(last_hidden_state.mean().item()).toBeCloseTo(9.934107758624577e-9, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + + describe("CohereForCausalLM", () => { + const model_id = "hf-internal-testing/tiny-random-CohereForCausalLM"; + /** @type {CohereForCausalLM} */ + let model; + /** @type {CohereTokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await CohereForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await CohereTokenizer.from_pretrained(model_id); + tokenizer.padding_side = "left"; + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([[5n, 203n, 790n, 87n, 87n, 87n, 87n, 87n, 87n, 87n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([ + [0n, 0n, 5n, 203n, 790n, 87n, 87n, 87n, 87n, 87n], + [5n, 203n, 790n, 87n, 214n, 741n, 741n, 741n, 741n, 741n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/convnext/test_image_processing_convnext.js b/tests/models/convnext/test_image_processing_convnext.js new file mode 100644 index 000000000..2ff92aa82 --- /dev/null +++ b/tests/models/convnext/test_image_processing_convnext.js @@ -0,0 +1,32 @@ +import { AutoImageProcessor, ConvNextFeatureExtractor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // ConvNextFeatureExtractor + describe("ConvNextFeatureExtractor", () => { + const model_id = "Xenova/resnet-50"; + + /** @type {ConvNextFeatureExtractor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "default", + async () => { + const image = await load_cached_image("tiger"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 3, 224, 224]); + expect(pixel_values.mean().item()).toBeCloseTo(0.06262318789958954, 
3); + + expect(original_sizes).toEqual([[408, 612]]); + expect(reshaped_input_sizes).toEqual([[224, 224]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/deberta-v2/tokenization.js b/tests/models/deberta_v2/test_tokenization_deberta_v2.js similarity index 100% rename from tests/models/deberta-v2/tokenization.js rename to tests/models/deberta_v2/test_tokenization_deberta_v2.js diff --git a/tests/models/deit/test_image_processing_deit.js b/tests/models/deit/test_image_processing_deit.js new file mode 100644 index 000000000..3f1c591a0 --- /dev/null +++ b/tests/models/deit/test_image_processing_deit.js @@ -0,0 +1,31 @@ +import { AutoImageProcessor, DeiTFeatureExtractor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + describe("DeiTFeatureExtractor", () => { + const model_id = "Xenova/deit-tiny-distilled-patch16-224"; + + /** @type {DeiTFeatureExtractor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "default", + async () => { + const image = await load_cached_image("tiger"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 3, 224, 224]); + expect(pixel_values.mean().item()).toBeCloseTo(-0.2760336682859463, 6); + + expect(original_sizes).toEqual([[408, 612]]); + expect(reshaped_input_sizes).toEqual([[224, 224]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/detr/test_image_processing_detr.js b/tests/models/detr/test_image_processing_detr.js new file mode 100644 index 000000000..23c58d51e --- /dev/null +++ b/tests/models/detr/test_image_processing_detr.js @@ -0,0 +1,34 @@ +import { AutoImageProcessor, DetrFeatureExtractor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + describe("DetrFeatureExtractor", () => { + const model_id = "Xenova/detr-resnet-50"; + + /** @type {DetrFeatureExtractor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "default", + async () => { + const image = await load_cached_image("tiger"); + const { pixel_values, original_sizes, reshaped_input_sizes, pixel_mask } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 3, 888, 1333]); + expect(pixel_values.mean().item()).toBeCloseTo(-0.27840224131001773, 6); + + expect(original_sizes).toEqual([[408, 612]]); + expect(reshaped_input_sizes).toEqual([[888, 1333]]); + + expect(pixel_mask.dims).toEqual([1, 64, 64]); + expect(pixel_mask.to("float32").mean().item()).toBeCloseTo(1.0, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/distilbert/tokenization.js b/tests/models/distilbert/test_tokenization_distilbert.js similarity index 100% rename from tests/models/distilbert/tokenization.js rename to tests/models/distilbert/test_tokenization_distilbert.js diff --git a/tests/models/donut/test_image_processing_donut.js b/tests/models/donut/test_image_processing_donut.js new file mode 100644 index 000000000..682b3589b --- /dev/null +++ b/tests/models/donut/test_image_processing_donut.js @@ -0,0 +1,34 @@ +import { AutoImageProcessor, 
DonutFeatureExtractor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // DonutFeatureExtractor + // - tests thumbnail resizing (do_thumbnail=true, size=[960, 1280]) + // - tests padding after normalization (image_mean=image_std=0.5) + describe("DonutFeatureExtractor", () => { + const model_id = "Xenova/donut-base-finetuned-cord-v2"; + + /** @type {DonutFeatureExtractor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "default", + async () => { + const image = await load_cached_image("receipt"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 3, 1280, 960]); + expect(pixel_values.mean().item()).toBeCloseTo(0.1229388610053704, 6); + + expect(original_sizes).toEqual([[864, 576]]); + expect(reshaped_input_sizes).toEqual([[1280, 853]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/dpt/test_image_processing_dpt.js b/tests/models/dpt/test_image_processing_dpt.js new file mode 100644 index 000000000..cbe54ded6 --- /dev/null +++ b/tests/models/dpt/test_image_processing_dpt.js @@ -0,0 +1,77 @@ +import { AutoImageProcessor, DPTFeatureExtractor, DPTImageProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // DPTFeatureExtractor + describe("DPTFeatureExtractor", () => { + const model_id = "Xenova/dpt-hybrid-midas"; + + /** @type {DPTFeatureExtractor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "grayscale images", + async () => { + const image = await load_cached_image("cats"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 3, 384, 384]); + expect(pixel_values.mean().item()).toBeCloseTo(0.0372855559389454, 6); + + expect(original_sizes).toEqual([[480, 640]]); + expect(reshaped_input_sizes).toEqual([[384, 384]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + // DPTImageProcessor + // - tests ensure_multiple_of + // - tests keep_aspect_ratio + // - tests bankers rounding + describe("DPTImageProcessor", () => { + const model_id = "Xenova/depth-anything-small-hf"; + + /** @type {DPTImageProcessor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "ensure_multiple_of w/ normal rounding", + async () => { + const image = await load_cached_image("cats"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 3, 518, 686]); + expect(pixel_values.mean().item()).toBeCloseTo(0.30337387323379517, 3); + + expect(original_sizes).toEqual([[480, 640]]); + expect(reshaped_input_sizes).toEqual([[518, 686]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "ensure_multiple_of w/ bankers rounding", + async () => { + const image = await load_cached_image("checkerboard_64x32"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + + // NOTE: without bankers rounding, this would be [1, 3, 
266, 518] + expect(pixel_values.dims).toEqual([1, 3, 252, 518]); + expect(pixel_values.mean().item()).toBeCloseTo(0.2267402559518814, 1); + + expect(original_sizes).toEqual([[32, 64]]); + expect(reshaped_input_sizes).toEqual([[252, 518]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/efficientnet/test_image_processing_efficientnet.js b/tests/models/efficientnet/test_image_processing_efficientnet.js new file mode 100644 index 000000000..fefa7d9aa --- /dev/null +++ b/tests/models/efficientnet/test_image_processing_efficientnet.js @@ -0,0 +1,46 @@ +import { EfficientNetImageProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // EfficientNetImageProcessor + // - tests include_top + describe("EfficientNetImageProcessor", () => { + /** @type {EfficientNetImageProcessor} */ + const processor = new EfficientNetImageProcessor({ + crop_size: { + height: 289, + width: 289, + }, + do_center_crop: false, + do_normalize: true, + do_rescale: true, + do_resize: true, + image_mean: [0.485, 0.456, 0.406], + image_processor_type: "EfficientNetImageProcessor", + image_std: [0.47853944, 0.4732864, 0.47434163], + include_top: true, + resample: 0, + rescale_factor: 0.00392156862745098, + rescale_offset: false, + size: { + height: 224, + width: 224, + }, + }); + + it( + "default", + async () => { + const image = await load_cached_image("cats"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + expect(pixel_values.dims).toEqual([1, 3, 224, 224]); + expect(pixel_values.mean().item()).toBeCloseTo(0.3015307230282871, 6); + expect(original_sizes).toEqual([[480, 640]]); + expect(reshaped_input_sizes).toEqual([[224, 224]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/esm/tokenization.js b/tests/models/esm/test_tokenization_esm.js similarity index 100% rename from tests/models/esm/tokenization.js rename to tests/models/esm/test_tokenization_esm.js diff --git a/tests/models/falcon/tokenization.js b/tests/models/falcon/test_tokenization_falcon.js similarity index 100% rename from tests/models/falcon/tokenization.js rename to tests/models/falcon/test_tokenization_falcon.js diff --git a/tests/models/florence2/test_modeling_florence2.js b/tests/models/florence2/test_modeling_florence2.js new file mode 100644 index 000000000..9d21cb4be --- /dev/null +++ b/tests/models/florence2/test_modeling_florence2.js @@ -0,0 +1,83 @@ +import { Florence2Processor, Florence2ForConditionalGeneration, RawImage, full } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + const texts = ["Describe with a paragraph what is shown in the image.", "Locate the objects with category name in the image."]; + + // Empty white image + const dims = [224, 224, 3]; + const image = new RawImage(new Uint8ClampedArray(dims[0] * dims[1] * dims[2]).fill(255), ...dims); + + describe("Florence2ForConditionalGeneration", () => { + const model_id = "Xenova/tiny-random-Florence2ForConditionalGeneration"; + + /** @type {Florence2ForConditionalGeneration} */ + let model; + /** @type {Florence2Processor} */ + let processor; + beforeAll(async () => { + model = await Florence2ForConditionalGeneration.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + processor = await 
Florence2Processor.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "forward", + async () => { + const inputs = await processor(image, texts[0]); + + const { logits } = await model({ + ...inputs, + decoder_input_ids: full([1, 1], 2n), + }); + expect(logits.dims).toEqual([1, 1, 51289]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size=1", + async () => { + { + const text_inputs = processor.tokenizer(texts[0]); + const generate_ids = await model.generate({ ...text_inputs, max_new_tokens: 10 }); + expect(generate_ids.tolist()).toEqual([[2n, 0n, 0n, 0n, 1n, 0n, 0n, 2n]]); + } + { + const inputs = await processor(image, texts[0]); + const generate_ids = await model.generate({ ...inputs, max_new_tokens: 10 }); + expect(generate_ids.tolist()).toEqual([[2n, 0n, 48n, 48n, 48n, 48n, 48n, 48n, 48n, 48n, 2n]]); + } + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + { + const text_inputs = processor.tokenizer(texts, { padding: true }); + const generate_ids = await model.generate({ ...text_inputs, max_new_tokens: 10 }); + expect(generate_ids.tolist()).toEqual([ + [2n, 0n, 0n, 0n, 1n, 0n, 0n, 2n], + [2n, 0n, 0n, 0n, 1n, 0n, 0n, 2n], + ]); + } + { + const inputs = await processor([image, image], texts, { padding: true }); + + const generate_ids = await model.generate({ ...inputs, max_new_tokens: 10 }); + expect(generate_ids.tolist()).toEqual([ + [2n, 0n, 48n, 48n, 48n, 48n, 48n, 48n, 48n, 48n, 2n], + [2n, 0n, 48n, 48n, 48n, 48n, 48n, 48n, 48n, 48n, 2n], + ]); + } + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/gemma/test_modeling_gemma.js b/tests/models/gemma/test_modeling_gemma.js new file mode 100644 index 000000000..47501f230 --- /dev/null +++ b/tests/models/gemma/test_modeling_gemma.js @@ -0,0 +1,51 @@ +import { GemmaTokenizer, GemmaForCausalLM } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("GemmaForCausalLM", () => { + const model_id = "Xenova/tiny-random-GemmaForCausalLM"; + /** @type {GemmaForCausalLM} */ + let model; + /** @type {GemmaTokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await GemmaForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await GemmaTokenizer.from_pretrained(model_id); + tokenizer.padding_side = "left"; + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([[2n, 17534n, 254059n, 254059n, 254059n, 254059n, 254059n, 254059n, 254059n, 254059n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([ + [0n, 2n, 17534n, 254059n, 254059n, 254059n, 254059n, 254059n, 254059n, 254059n], + [2n, 17534n, 2134n, 71055n, 71055n, 71055n, 71055n, 71055n, 71055n, 71055n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/gemma/tokenization.js b/tests/models/gemma/test_tokenization_gemma.js similarity index 100% rename from 
tests/models/gemma/tokenization.js rename to tests/models/gemma/test_tokenization_gemma.js diff --git a/tests/models/gemma2/test_modeling_gemma2.js b/tests/models/gemma2/test_modeling_gemma2.js new file mode 100644 index 000000000..768c1161b --- /dev/null +++ b/tests/models/gemma2/test_modeling_gemma2.js @@ -0,0 +1,51 @@ +import { GemmaTokenizer, Gemma2ForCausalLM } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("Gemma2ForCausalLM", () => { + const model_id = "hf-internal-testing/tiny-random-Gemma2ForCausalLM"; + /** @type {Gemma2ForCausalLM} */ + let model; + /** @type {GemmaTokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await Gemma2ForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await GemmaTokenizer.from_pretrained(model_id); + tokenizer.padding_side = "left"; + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([[2n, 17534n, 127534n, 160055n, 160055n, 160055n, 160055n, 160055n, 160055n, 160055n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([ + [0n, 2n, 17534n, 127534n, 127534n, 215341n, 215341n, 215341n, 215341n, 215341n], + [2n, 17534n, 2134n, 107508n, 160055n, 160055n, 160055n, 160055n, 160055n, 160055n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/glpn/test_image_processing_glpn.js b/tests/models/glpn/test_image_processing_glpn.js new file mode 100644 index 000000000..93487166f --- /dev/null +++ b/tests/models/glpn/test_image_processing_glpn.js @@ -0,0 +1,48 @@ +import { AutoImageProcessor, GLPNFeatureExtractor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // GLPNFeatureExtractor + // - tests `size_divisor` and no size (size_divisor=32) + describe("GLPNFeatureExtractor", () => { + const model_id = "Xenova/glpn-kitti"; + + /** @type {GLPNFeatureExtractor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "multiple of size_divisor", + async () => { + const image = await load_cached_image("cats"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + expect(pixel_values.dims).toEqual([1, 3, 480, 640]); + expect(pixel_values.mean().item()).toBeCloseTo(0.5186172404123327, 6); + + expect(original_sizes).toEqual([[480, 640]]); + expect(reshaped_input_sizes).toEqual([[480, 640]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "non-multiple of size_divisor", + async () => { + // Tests input which is not a multiple of 32 ([408, 612] -> [384, 608]) + const image = await load_cached_image("tiger"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 3, 384, 608]); + 
expect(pixel_values.mean().item()).toBeCloseTo(0.38628831535989555, 6); + + expect(original_sizes).toEqual([[408, 612]]); + expect(reshaped_input_sizes).toEqual([[384, 608]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/gpt2/test_modeling_gpt2.js b/tests/models/gpt2/test_modeling_gpt2.js new file mode 100644 index 000000000..a2f93744e --- /dev/null +++ b/tests/models/gpt2/test_modeling_gpt2.js @@ -0,0 +1,51 @@ +import { GPT2Tokenizer, GPT2LMHeadModel } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("GPT2LMHeadModel", () => { + const model_id = "hf-internal-testing/tiny-random-GPT2LMHeadModel"; + /** @type {GPT2LMHeadModel} */ + let model; + /** @type {GPT2Tokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await GPT2LMHeadModel.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await GPT2Tokenizer.from_pretrained(model_id); + tokenizer.padding_side = "left"; + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([[258n, 863n, 79n, 79n, 79n, 79n, 79n, 79n, 79n, 243n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([ + [0n, 0n, 258n, 863n, 79n, 79n, 79n, 79n, 79n, 79n], + [258n, 863n, 79n, 269n, 813n, 813n, 813n, 813n, 813n, 813n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/gpt2/tokenization.js b/tests/models/gpt2/test_tokenization_gpt2.js similarity index 100% rename from tests/models/gpt2/tokenization.js rename to tests/models/gpt2/test_tokenization_gpt2.js diff --git a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.js b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.js new file mode 100644 index 000000000..3cc5c4ffe --- /dev/null +++ b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.js @@ -0,0 +1,51 @@ +import { GPT2Tokenizer, GPTBigCodeForCausalLM } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("GPTBigCodeForCausalLM", () => { + const model_id = "hf-internal-testing/tiny-random-GPTBigCodeForCausalLM"; + /** @type {GPTBigCodeForCausalLM} */ + let model; + /** @type {GPT2Tokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await GPTBigCodeForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await GPT2Tokenizer.from_pretrained(model_id); + tokenizer.padding_side = "left"; + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([[258n, 863n, 79n, 79n, 79n, 79n, 79n, 79n, 79n, 79n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + 
expect(outputs.tolist()).toEqual([ + [0n, 0n, 258n, 863n, 79n, 79n, 79n, 79n, 79n, 79n], + [258n, 863n, 79n, 269n, 813n, 832n, 93n, 93n, 93n, 93n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/gpt_neo/test_modeling_gpt_neo.js b/tests/models/gpt_neo/test_modeling_gpt_neo.js new file mode 100644 index 000000000..3557b6a82 --- /dev/null +++ b/tests/models/gpt_neo/test_modeling_gpt_neo.js @@ -0,0 +1,51 @@ +import { GPT2Tokenizer, GPTNeoForCausalLM } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("GPTNeoForCausalLM", () => { + const model_id = "hf-internal-testing/tiny-random-GPTNeoForCausalLM"; + /** @type {GPTNeoForCausalLM} */ + let model; + /** @type {GPT2Tokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await GPTNeoForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await GPT2Tokenizer.from_pretrained(model_id); + tokenizer.padding_side = "left"; + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([[258n, 863n, 79n, 79n, 79n, 949n, 949n, 949n, 949n, 949n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([ + [0n, 0n, 258n, 863n, 79n, 79n, 79n, 949n, 949n, 949n], + [258n, 863n, 79n, 269n, 813n, 849n, 849n, 849n, 849n, 849n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/gpt_neox/test_modeling_gpt_neox.js b/tests/models/gpt_neox/test_modeling_gpt_neox.js new file mode 100644 index 000000000..9d340f93d --- /dev/null +++ b/tests/models/gpt_neox/test_modeling_gpt_neox.js @@ -0,0 +1,51 @@ +import { GPTNeoXTokenizer, GPTNeoXForCausalLM } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("GPTNeoXForCausalLM", () => { + const model_id = "hf-internal-testing/tiny-random-GPTNeoXForCausalLM"; + /** @type {GPTNeoXForCausalLM} */ + let model; + /** @type {GPTNeoXTokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await GPTNeoXForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await GPTNeoXTokenizer.from_pretrained(model_id); + tokenizer.padding_side = "left"; + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([[259n, 864n, 80n, 881n, 502n, 895n, 938n, 668n, 502n, 895n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([ + [0n, 0n, 259n, 864n, 80n, 881n, 502n, 895n, 938n, 668n], + [259n, 864n, 80n, 270n, 814n, 522n, 112n, 268n, 503n, 
468n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/gptj/test_modeling_gptj.js b/tests/models/gptj/test_modeling_gptj.js new file mode 100644 index 000000000..ee5a74275 --- /dev/null +++ b/tests/models/gptj/test_modeling_gptj.js @@ -0,0 +1,51 @@ +import { GPTNeoXTokenizer, GPTJForCausalLM } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("GPTJForCausalLM", () => { + const model_id = "hf-internal-testing/tiny-random-GPTJForCausalLM"; + /** @type {GPTJForCausalLM} */ + let model; + /** @type {GPTNeoXTokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await GPTJForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await GPTNeoXTokenizer.from_pretrained(model_id); + tokenizer.padding_side = "left"; + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([[258n, 863n, 79n, 102n, 401n, 773n, 889n, 159n, 957n, 869n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([ + [0n, 0n, 258n, 863n, 79n, 102n, 401n, 773n, 889n, 159n], + [258n, 863n, 79n, 269n, 813n, 879n, 175n, 39n, 141n, 1000n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/granite/test_modeling_granite.js b/tests/models/granite/test_modeling_granite.js new file mode 100644 index 000000000..82e6b8c6d --- /dev/null +++ b/tests/models/granite/test_modeling_granite.js @@ -0,0 +1,50 @@ +import { GPT2Tokenizer, GraniteForCausalLM } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("GraniteForCausalLM", () => { + const model_id = "hf-internal-testing/tiny-random-GraniteForCausalLM"; + /** @type {GraniteForCausalLM} */ + let model; + /** @type {GPT2Tokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await GraniteForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await GPT2Tokenizer.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([[7656n, 39727n, 33077n, 9643n, 30539n, 47869n, 48739n, 15085n, 9203n, 14020n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([ + [0n, 7656n, 39727n, 33077n, 9643n, 30539n, 47869n, 48739n, 15085n, 9203n], + [7656n, 5788n, 17835n, 13234n, 7592n, 21471n, 30537n, 23023n, 43450n, 4824n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git 
a/tests/models/idefics3/test_image_processing_idefics3.js b/tests/models/idefics3/test_image_processing_idefics3.js new file mode 100644 index 000000000..a9a15092b --- /dev/null +++ b/tests/models/idefics3/test_image_processing_idefics3.js @@ -0,0 +1,107 @@ +import { AutoImageProcessor, Idefics3ImageProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // Idefics3ImageProcessor + // - custom image processing (patching) + describe("Idefics3ImageProcessor", () => { + const model_id = "hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration"; + + /** @type {Record} */ + const images = {}; + /** @type {Idefics3ImageProcessor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + + // Load images + const image = await load_cached_image("gradient_1280x640"); + const white_image = await load_cached_image("white_image"); + + images.image = image; + images.image_1 = await image.resize(1600, 1067); + images.image_2 = await image.resize(224, 224); + images.white_image = white_image; + images.white_image_1 = await white_image.resize(1600, 1067); + images.white_image_2 = await white_image.resize(224, 224); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "no image splitting", + async () => { + const { pixel_values, rows, cols } = await processor(images.image, { do_image_splitting: false, return_row_col_info: true }); + expect(pixel_values.dims).toEqual([1, 1, 3, 364, 364]); + expect(pixel_values.mean().item()).toBeCloseTo(-0.001035306602716446, 2); + expect(rows).toEqual([[0]]); + expect(cols).toEqual([[0]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batched no image splitting", + async () => { + const { pixel_values, pixel_attention_mask, rows, cols } = await processor([[images.white_image_1], [images.white_image_2], [images.white_image_1, images.white_image_2]], { do_image_splitting: false, return_row_col_info: true }); + expect(pixel_values.dims).toEqual([3, 2, 3, 364, 364]); + expect(pixel_values.mean().item()).toBeCloseTo(2 / 3, 2); + expect(pixel_attention_mask.dims).toEqual([3, 2, 364, 364]); + expect(pixel_attention_mask.to("float32").mean().item()).toBeCloseTo(2 / 3, 3); + expect(rows).toEqual([[0], [0], [0, 0]]); + expect(cols).toEqual([[0], [0], [0, 0]]); + + // Test that the order of the pixel attention mask matches the python implementation + expect(pixel_attention_mask.data.reduce((a, b, i) => a + i * b, 0)).toEqual(228217205216); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "correct patching", + async () => { + const { pixel_values, rows, cols } = await processor(images.image, { return_row_col_info: true }); + expect(pixel_values.dims).toEqual([1, 9, 3, 364, 364]); + expect(pixel_values.flatten(2).mean(2).tolist()).toBeCloseToNested([[-0.7012196183204651, -0.30104631185531616, 0.09912905097007751, 0.49929487705230713, -0.5011996626853943, -0.10103467106819153, 0.2991456389427185, 0.6993265151977539, -0.0010353063698858023]], 1); + expect(rows).toEqual([[2]]); + expect(cols).toEqual([[4]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "unbatched, single image", + async () => { + const { pixel_values, rows, cols } = await processor(images.image_1, { return_row_col_info: true }); + expect(pixel_values.dims).toEqual([1, 13, 3, 364, 364]); + + expect(rows).toEqual([[3]]); + expect(cols).toEqual([[4]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + 
"unbatched, multiple images", + async () => { + const { pixel_values, rows, cols } = await processor([images.image_1, images.image_2], { return_row_col_info: true }); + expect(pixel_values.dims).toEqual([1, 30, 3, 364, 364]); + + expect(rows).toEqual([[3, 4]]); + expect(cols).toEqual([[4, 4]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batched, multiple images", + async () => { + const { pixel_values, rows, cols } = await processor([[images.image_1], [images.image_1, images.image_2]], { return_row_col_info: true }); + expect(pixel_values.dims).toEqual([2, 30, 3, 364, 364]); + expect(rows).toEqual([[3], [3, 4]]); + expect(cols).toEqual([[4], [4, 4]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/idefics3/test_modeling_idefics3.js b/tests/models/idefics3/test_modeling_idefics3.js new file mode 100644 index 000000000..92b163e11 --- /dev/null +++ b/tests/models/idefics3/test_modeling_idefics3.js @@ -0,0 +1,142 @@ +import { Idefics3Processor, Idefics3ForConditionalGeneration, RawImage } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + const conversation = [ + { + role: "user", + content: [{ type: "image" }, { type: "text", text: "Can you describe this image?" }], + }, + ]; + + // Empty white and black images + const white_image_dims = [224, 224, 3]; + const white_image = new RawImage(new Uint8ClampedArray(white_image_dims[0] * white_image_dims[1] * white_image_dims[2]).fill(255), ...white_image_dims); + const black_image_dims = [720, 360, 3]; + const black_image = new RawImage(new Uint8ClampedArray(black_image_dims[0] * black_image_dims[1] * black_image_dims[2]).fill(0), ...black_image_dims); + + describe("Idefics3ForConditionalGeneration", () => { + const model_id = "hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration"; + + /** @type {Idefics3ForConditionalGeneration} */ + let model; + /** @type {Idefics3Processor} */ + let processor; + /** @type {string} */ + let text; + beforeAll(async () => { + model = await Idefics3ForConditionalGeneration.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + processor = await Idefics3Processor.from_pretrained(model_id); + + text = processor.apply_chat_template(conversation, { + add_generation_prompt: true, + }); + }, MAX_MODEL_LOAD_TIME); + + it( + "forward w/ image splitting (default)", + async () => { + const inputs = await processor(text, white_image, { + do_image_splitting: true, + }); + + const { logits } = await model(inputs); + expect(logits.dims).toEqual([1, 3041, 128259]); + expect(logits.mean().item()).toBeCloseTo(-0.0002692154666874558, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "forward w/o image splitting", + async () => { + const inputs = await processor(text, white_image, { + do_image_splitting: false, + }); + + const { logits } = await model(inputs); + expect(logits.dims).toEqual([1, 189, 128259]); + expect(logits.mean().item()).toBeCloseTo(-0.00019743280427064747, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size=1 w/ image splitting", + async () => { + const inputs = await processor(text, white_image, { + do_image_splitting: true, + }); + const generate_ids = await model.generate({ + ...inputs, + max_new_tokens: 10, + + // To obtain unique output tokens, deterministically + repetition_penalty: 2.0, + }); + expect(generate_ids.dims).toEqual([1, 3051]); + + const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), 
null]); + expect(new_tokens.tolist()).toEqual([[64531n, 121777n, 70370n, 105334n, 12720n, 113356n, 47739n, 59240n, 102001n, 60344n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size=1 w/o image splitting", + async () => { + const inputs = await processor(text, white_image, { + do_image_splitting: false, + }); + const generate_ids = await model.generate({ + ...inputs, + max_new_tokens: 10, + + // To obtain unique output tokens, deterministically + repetition_penalty: 2.0, + }); + expect(generate_ids.dims).toEqual([1, 199]); + + const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]); + expect(new_tokens.tolist()).toEqual([[64531n, 121777n, 70370n, 105334n, 12720n, 113356n, 47739n, 59240n, 59697n, 65246n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size=1 multi-image w/o image splitting", + async () => { + const multi_image_conversation = [ + { + role: "user", + content: [{ type: "image" }, { type: "image" }, { type: "text", text: "Can you describe these images?" }], + }, + ]; + + const multi_image_text = processor.apply_chat_template(multi_image_conversation, { + add_generation_prompt: true, + }); + const inputs = await processor(multi_image_text, [white_image, black_image], { + do_image_splitting: false, + }); + const generate_ids = await model.generate({ + ...inputs, + max_new_tokens: 10, + + // To obtain unique output tokens, deterministically + repetition_penalty: 2.0, + }); + expect(generate_ids.dims).toEqual([1, 374]); + + const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]); + expect(new_tokens.tolist()).toEqual([[73189n, 99346n, 113252n, 51743n, 33499n, 66430n, 78739n, 89539n, 121023n, 14474n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/jais/test_modeling_jais.js b/tests/models/jais/test_modeling_jais.js new file mode 100644 index 000000000..77a3a4435 --- /dev/null +++ b/tests/models/jais/test_modeling_jais.js @@ -0,0 +1,51 @@ +import { PreTrainedTokenizer, JAISLMHeadModel } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("JAISLMHeadModel", () => { + const model_id = "onnx-community/tiny-random-jais"; + /** @type {JAISLMHeadModel} */ + let model; + /** @type {PreTrainedTokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await JAISLMHeadModel.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await PreTrainedTokenizer.from_pretrained(model_id); + tokenizer.padding_side = "left"; + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([[55422n, 55422n, 55422n, 55422n, 55422n, 55422n, 55422n, 55422n, 55422n, 55422n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([ + [0n, 55422n, 55422n, 55422n, 55422n, 55422n, 55422n, 55422n, 55422n, 55422n], + [55422n, 2838n, 2838n, 2838n, 2838n, 2838n, 2838n, 2838n, 2838n, 2838n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); 
+ }); +}; diff --git a/tests/models/jina_clip/test_image_processing_jina_clip.js b/tests/models/jina_clip/test_image_processing_jina_clip.js new file mode 100644 index 000000000..6d59dd6ca --- /dev/null +++ b/tests/models/jina_clip/test_image_processing_jina_clip.js @@ -0,0 +1,33 @@ +import { AutoImageProcessor, JinaCLIPImageProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // JinaCLIPImageProcessor + // - custom config overrides + describe("JinaCLIPImageProcessor", () => { + const model_id = "jinaai/jina-clip-v2"; + + /** @type {JinaCLIPImageProcessor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "default", + async () => { + const image = await load_cached_image("tiger"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 3, 512, 512]); + expect(pixel_values.mean().item()).toBeCloseTo(-0.06637834757566452, 3); + + expect(original_sizes).toEqual([[408, 612]]); + expect(reshaped_input_sizes).toEqual([[512, 512]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/llama/test_modeling_llama.js b/tests/models/llama/test_modeling_llama.js new file mode 100644 index 000000000..e13f66b2e --- /dev/null +++ b/tests/models/llama/test_modeling_llama.js @@ -0,0 +1,85 @@ +import { LlamaTokenizer, LlamaForCausalLM } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, MAX_TEST_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("LlamaForCausalLM", () => { + const model_id = "hf-internal-testing/tiny-random-LlamaForCausalLM"; + /** @type {LlamaForCausalLM} */ + let model; + /** @type {LlamaTokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await LlamaForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await LlamaTokenizer.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([[1n, 22172n, 18547n, 8143n, 22202n, 9456n, 17213n, 15330n, 26591n, 15721n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([ + [0n, 1n, 22172n, 18547n, 8143n, 22202n, 9456n, 17213n, 15330n, 26591n], + [1n, 22172n, 3186n, 24786n, 19169n, 20222n, 29993n, 27146n, 27426n, 24562n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + + describe("LlamaForCausalLM (onnxruntime-genai)", () => { + const model_id = "onnx-community/tiny-random-LlamaForCausalLM-ONNX"; + /** @type {LlamaTokenizer} */ + let tokenizer; + let inputs; + beforeAll(async () => { + tokenizer = await LlamaTokenizer.from_pretrained(model_id); + inputs = tokenizer("hello"); + }, MAX_MODEL_LOAD_TIME); + + const dtypes = ["fp32", "fp16", "q4", "q4f16"]; + + for (const dtype of dtypes) { + it( + `dtype=${dtype}`, + async () => { + /** @type {LlamaForCausalLM} */ + const model 
= await LlamaForCausalLM.from_pretrained(model_id, { + ...DEFAULT_MODEL_OPTIONS, + dtype, + }); + + const outputs = await model.generate({ + ...inputs, + max_length: 5, + }); + expect(outputs.tolist()).toEqual([[128000n, 15339n, 15339n, 15339n, 15339n]]); + + await model?.dispose(); + }, + MAX_TEST_TIME, + ); + } + }); +}; diff --git a/tests/models/llama/tokenization.js b/tests/models/llama/test_tokenization_llama.js similarity index 100% rename from tests/models/llama/tokenization.js rename to tests/models/llama/test_tokenization_llama.js diff --git a/tests/models/llava/test_modeling_llava.js b/tests/models/llava/test_modeling_llava.js new file mode 100644 index 000000000..e70cefa2c --- /dev/null +++ b/tests/models/llava/test_modeling_llava.js @@ -0,0 +1,78 @@ +import { LlamaTokenizer, CLIPImageProcessor, LlavaForConditionalGeneration, RawImage } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + const prompts = [ + // Example adapted from https://huggingface.co/docs/transformers/model_doc/llava#transformers.LlavaForConditionalGeneration.forward.example + "\nUSER: What's the content of the image?\nASSISTANT:", + "Hi", + ]; + + // Empty white image + const dims = [224, 224, 3]; + const image = new RawImage(new Uint8ClampedArray(dims[0] * dims[1] * dims[2]).fill(255), ...dims); + + describe("LlavaForConditionalGeneration", () => { + const model_id = "Xenova/tiny-random-LlavaForConditionalGeneration"; + + /** @type {LlavaForConditionalGeneration} */ + let model; + /** @type {LlamaTokenizer} */ + let tokenizer; + /** @type {CLIPImageProcessor} */ + let processor; + beforeAll(async () => { + model = await LlavaForConditionalGeneration.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await LlamaTokenizer.from_pretrained(model_id); + processor = await CLIPImageProcessor.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "forward", + async () => { + const text_inputs = tokenizer(prompts[0]); + const vision_inputs = await processor(image); + const inputs = { ...text_inputs, ...vision_inputs }; + + const { logits } = await model(inputs); + expect(logits.dims).toEqual([1, 244, 32002]); + expect(logits.mean().item()).toBeCloseTo(-0.0005755752790719271, 8); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size=1", + async () => { + const text_inputs = tokenizer(prompts[0]); + const vision_inputs = await processor(image); + const inputs = { ...text_inputs, ...vision_inputs }; + + const generate_ids = await model.generate({ ...inputs, max_new_tokens: 10 }); + expect(generate_ids.tolist()).toEqual([[1n, 32000n, 29871n, 13n, 11889n, 29901n, 1724n, 29915n, 29879n, 278n, 2793n, 310n, 278n, 1967n, 29973n, 13n, 22933n, 9047n, 13566n, 29901n, 21557n, 16781n, 27238n, 8279n, 20454n, 11927n, 12462n, 12306n, 2414n, 7561n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const text_inputs = tokenizer(prompts, { padding: true }); + const vision_inputs = await processor([image, image]); + const inputs = { ...text_inputs, ...vision_inputs }; + + const generate_ids = await model.generate({ ...inputs, max_new_tokens: 10 }); + expect(generate_ids.tolist()).toEqual([ + [1n, 32000n, 29871n, 13n, 11889n, 29901n, 1724n, 29915n, 29879n, 278n, 2793n, 310n, 278n, 1967n, 29973n, 13n, 22933n, 9047n, 13566n, 29901n, 21557n, 16781n, 27238n, 8279n, 20454n, 11927n, 12462n, 12306n, 2414n, 7561n], + [0n, 0n, 0n, 0n, 0n, 0n, 0n, 
0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 1n, 32000n, 6324n, 1217n, 22958n, 22913n, 10381n, 148n, 31410n, 31736n, 7358n, 9150n, 28635n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/m2m_100/tokenization.js b/tests/models/m2m_100/test_tokenization_m2m_100.js similarity index 100% rename from tests/models/m2m_100/tokenization.js rename to tests/models/m2m_100/test_tokenization_m2m_100.js diff --git a/tests/models/marian/test_modeling_marian.js b/tests/models/marian/test_modeling_marian.js new file mode 100644 index 000000000..186951e06 --- /dev/null +++ b/tests/models/marian/test_modeling_marian.js @@ -0,0 +1,51 @@ +import { MarianTokenizer, MarianMTModel } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("MarianMTModel", () => { + const model_id = "onnx-community/tiny-random-MarianMTModel"; + + /** @type {MarianMTModel} */ + let model; + /** @type {MarianTokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await MarianMTModel.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await MarianTokenizer.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([[3n, 40672n, 8358n, 32810n, 32810n, 32810n, 32810n, 35687n, 33073n, 6870n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([ + [3n, 40672n, 8358n, 32810n, 32810n, 32810n, 32810n, 35687n, 33073n, 6870n], + [3n, 40672n, 8358n, 32810n, 32810n, 32810n, 32810n, 35687n, 33073n, 6870n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/mistral/test_modeling_mistral.js b/tests/models/mistral/test_modeling_mistral.js new file mode 100644 index 000000000..294e904b9 --- /dev/null +++ b/tests/models/mistral/test_modeling_mistral.js @@ -0,0 +1,50 @@ +import { LlamaTokenizer, MistralForCausalLM } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("MistralForCausalLM", () => { + const model_id = "hf-internal-testing/tiny-random-MistralForCausalLM"; + /** @type {MistralForCausalLM} */ + let model; + /** @type {LlamaTokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await MistralForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await LlamaTokenizer.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([[1n, 6312n, 28709n, 24704n, 8732n, 1310n, 9808n, 13771n, 27309n, 4779n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + 
max_length: 10, + }); + expect(outputs.tolist()).toEqual([ + [2n, 1n, 6312n, 28709n, 24704n, 8732n, 1310n, 9808n, 13771n, 27309n], + [1n, 6312n, 28709n, 1526n, 8687n, 5690n, 1770n, 30811n, 12501n, 3325n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/mobilevit/test_image_processing_mobilevit.js b/tests/models/mobilevit/test_image_processing_mobilevit.js new file mode 100644 index 000000000..a737cc8c9 --- /dev/null +++ b/tests/models/mobilevit/test_image_processing_mobilevit.js @@ -0,0 +1,90 @@ +import { AutoImageProcessor, MobileViTFeatureExtractor, MobileViTImageProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // MobileViTFeatureExtractor + describe("MobileViTFeatureExtractor (default)", () => { + const model_id = "Xenova/mobilevit-small"; + + /** @type {MobileViTFeatureExtractor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "default", + async () => { + const image = await load_cached_image("tiger"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 3, 256, 256]); + expect(pixel_values.mean().item()).toBeCloseTo(0.4599160496887033, 6); + + expect(original_sizes).toEqual([[408, 612]]); + expect(reshaped_input_sizes).toEqual([[256, 256]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + // MobileViTFeatureExtractor + // - tests not converting to rgb (do_convert_rgb=false) + describe("MobileViTFeatureExtractor (do_convert_rgb=false)", () => { + const model_id = "Xenova/quickdraw-mobilevit-small"; + + /** @type {MobileViTFeatureExtractor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "grayscale image", + async () => { + const image = await load_cached_image("skateboard"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 1, 28, 28]); + expect(pixel_values.mean().item()).toBeCloseTo(0.08558923671585128, 6); + + expect(original_sizes).toEqual([[28, 28]]); + expect(reshaped_input_sizes).toEqual([[28, 28]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + // MobileViTImageProcessor + // - tests converting RGB to BGR (do_flip_channel_order=true) + describe("MobileViTImageProcessor (do_flip_channel_order=true)", () => { + const model_id = "Xenova/mobilevitv2-1.0-imagenet1k-256"; + + /** @type {MobileViTImageProcessor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "RGB to BGR", + async () => { + const image = await load_cached_image("cats"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + const { data, dims } = pixel_values; + + expect(dims).toEqual([1, 3, 256, 256]); + expect(pixel_values.mean().item()).toBeCloseTo(0.5215385556221008, 2); + + expect(original_sizes).toEqual([[480, 640]]); + expect(reshaped_input_sizes).toEqual([[256, 256]]); + + // Ensure RGB to BGR conversion + expect(data.slice(0, 3)).toBeCloseToNested([0.24313725531101227, 0.250980406999588, 0.364705890417099], 4); + }, + 
MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/mpnet/tokenization.js b/tests/models/mpnet/test_tokenization_mpnet.js similarity index 100% rename from tests/models/mpnet/tokenization.js rename to tests/models/mpnet/test_tokenization_mpnet.js diff --git a/tests/models/mpt/test_modeling_mpt.js b/tests/models/mpt/test_modeling_mpt.js new file mode 100644 index 000000000..7f6e914b4 --- /dev/null +++ b/tests/models/mpt/test_modeling_mpt.js @@ -0,0 +1,51 @@ +import { GPTNeoXTokenizer, MptForCausalLM } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("MptForCausalLM", () => { + const model_id = "hf-internal-testing/tiny-random-MptForCausalLM"; + /** @type {MptForCausalLM} */ + let model; + /** @type {GPTNeoXTokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await MptForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await GPTNeoXTokenizer.from_pretrained(model_id); + tokenizer.padding_side = "left"; + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([[259n, 864n, 80n, 80n, 80n, 80n, 80n, 80n, 80n, 80n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([ + [0n, 0n, 259n, 864n, 80n, 80n, 80n, 80n, 80n, 80n], + [259n, 864n, 80n, 270n, 814n, 293n, 293n, 293n, 293n, 293n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/musicgen/test_modeling_musicgen.js b/tests/models/musicgen/test_modeling_musicgen.js new file mode 100644 index 000000000..7ebf808ed --- /dev/null +++ b/tests/models/musicgen/test_modeling_musicgen.js @@ -0,0 +1,61 @@ +import { T5Tokenizer, MusicgenForConditionalGeneration, full } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("MusicgenForConditionalGeneration", () => { + const model_id = "hf-internal-testing/tiny-random-MusicgenForConditionalGeneration"; + + // Example adapted from https://huggingface.co/docs/transformers/model_doc/musicgen#text-conditional-generation + const texts = ["80s pop track with bassy drums and synth", "90s rock song with loud guitars and heavy drums"]; + + /** @type {MusicgenForConditionalGeneration} */ + let model; + /** @type {T5Tokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await MusicgenForConditionalGeneration.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await T5Tokenizer.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "forward", + async () => { + // Example from https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenForConditionalGeneration.forward.example + const inputs = tokenizer(texts, { padding: true }); + const pad_token_id = BigInt(model.generation_config.pad_token_id); + const decoder_input_ids = full([inputs.input_ids.dims[0] * model.config.decoder.num_codebooks, 1], pad_token_id); + const { logits } = await 
model({ ...inputs, decoder_input_ids }); + expect(logits.dims).toEqual([8, 1, 99]); + expect(logits.mean().item()).toBeCloseTo(-0.0018370470497757196, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer(texts[0]); + const audio_values = await model.generate({ ...inputs, max_length: 10 }); + expect(audio_values.dims).toEqual([1, 1, 1920]); + expect(audio_values.mean().item()).toBeCloseTo(0.16644205152988434, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(texts, { padding: true }); + const audio_values = await model.generate({ ...inputs, max_length: 10 }); + expect(audio_values.dims).toEqual([2, 1, 1920]); + expect(audio_values.mean().item()).toBeCloseTo(0.16644206643104553, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/nllb/tokenization.js b/tests/models/nllb/test_tokenization_nllb.js similarity index 100% rename from tests/models/nllb/tokenization.js rename to tests/models/nllb/test_tokenization_nllb.js diff --git a/tests/models/nougat/test_image_processing_nougat.js b/tests/models/nougat/test_image_processing_nougat.js new file mode 100644 index 000000000..1cbdff6e4 --- /dev/null +++ b/tests/models/nougat/test_image_processing_nougat.js @@ -0,0 +1,33 @@ +import { AutoImageProcessor, NougatImageProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // NougatImageProcessor + // - tests padding after normalization (image_mean != 0.5, image_std != 0.5) + describe("NougatImageProcessor", () => { + const model_id = "Xenova/nougat-small"; + + /** @type {NougatImageProcessor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "padding after normalization", + async () => { + const image = await load_cached_image("paper"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 3, 896, 672]); + expect(pixel_values.mean().item()).toBeCloseTo(1.8447155005897355, 6); + + expect(original_sizes).toEqual([[850, 685]]); + expect(reshaped_input_sizes).toEqual([[833, 672]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/olmo/test_modeling_olmo.js b/tests/models/olmo/test_modeling_olmo.js new file mode 100644 index 000000000..14f70f914 --- /dev/null +++ b/tests/models/olmo/test_modeling_olmo.js @@ -0,0 +1,51 @@ +import { GPTNeoXTokenizer, OlmoForCausalLM } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("OlmoForCausalLM", () => { + const model_id = "onnx-community/tiny-random-olmo-hf"; + /** @type {OlmoForCausalLM} */ + let model; + /** @type {GPTNeoXTokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await OlmoForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await GPTNeoXTokenizer.from_pretrained(model_id); + tokenizer.padding_side = "left"; + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); 
+ expect(outputs.tolist()).toEqual([[25521n, 10886n, 44936n, 38777n, 33038n, 18557n, 1810n, 33853n, 9517n, 28892n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([ + [1n, 25521n, 10886n, 44936n, 38777n, 33038n, 18557n, 1810n, 33853n, 9517n], + [25521n, 1533n, 37199n, 27362n, 30594n, 39261n, 8824n, 19175n, 8545n, 29335n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/olmo2/test_modeling_olmo2.js b/tests/models/olmo2/test_modeling_olmo2.js new file mode 100644 index 000000000..1c6f2ff50 --- /dev/null +++ b/tests/models/olmo2/test_modeling_olmo2.js @@ -0,0 +1,51 @@ +import { GPT2Tokenizer, Olmo2ForCausalLM } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("Olmo2ForCausalLM", () => { + const model_id = "hf-internal-testing/tiny-random-Olmo2ForCausalLM"; + /** @type {Olmo2ForCausalLM} */ + let model; + /** @type {GPT2Tokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await Olmo2ForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await GPT2Tokenizer.from_pretrained(model_id); + tokenizer.padding_side = "left"; + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([[15339n, 50957n, 43410n, 77030n, 91444n, 99516n, 80720n, 4608n, 90428n, 22806n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([ + [100277n, 15339n, 50957n, 43410n, 77030n, 91444n, 99516n, 80720n, 4608n, 90428n], + [15339n, 1917n, 12095n, 21350n, 61586n, 19306n, 39486n, 91527n, 59768n, 31934n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/opt/test_modeling_opt.js b/tests/models/opt/test_modeling_opt.js new file mode 100644 index 000000000..9a505bcae --- /dev/null +++ b/tests/models/opt/test_modeling_opt.js @@ -0,0 +1,51 @@ +import { GPT2Tokenizer, OPTForCausalLM } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("OPTForCausalLM", () => { + const model_id = "hf-internal-testing/tiny-random-OPTForCausalLM"; + /** @type {OPTForCausalLM} */ + let model; + /** @type {GPT2Tokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await OPTForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await GPT2Tokenizer.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer.padding_side = "left"; + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([[2n, 42891n, 39144n, 39144n, 39144n, 
39144n, 39144n, 39144n, 39144n, 39144n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([ + [1n, 2n, 42891n, 39144n, 39144n, 39144n, 39144n, 39144n, 39144n, 39144n], + [2n, 42891n, 232n, 24680n, 24680n, 24680n, 24680n, 24680n, 24680n, 24680n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/owlvit/test_image_processing_owlvit.js b/tests/models/owlvit/test_image_processing_owlvit.js new file mode 100644 index 000000000..77320c077 --- /dev/null +++ b/tests/models/owlvit/test_image_processing_owlvit.js @@ -0,0 +1,31 @@ +import { AutoImageProcessor, OwlViTFeatureExtractor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + describe("OwlViTFeatureExtractor", () => { + const model_id = "Xenova/owlvit-base-patch32"; + + /** @type {OwlViTFeatureExtractor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "default", + async () => { + const image = await load_cached_image("cats"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 3, 768, 768]); + expect(pixel_values.mean().item()).toBeCloseTo(0.250620447910435, 6); + + expect(original_sizes).toEqual([[480, 640]]); + expect(reshaped_input_sizes).toEqual([[768, 768]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/paligemma/test_modeling_paligemma.js b/tests/models/paligemma/test_modeling_paligemma.js new file mode 100644 index 000000000..e2cfa3fa5 --- /dev/null +++ b/tests/models/paligemma/test_modeling_paligemma.js @@ -0,0 +1,52 @@ +import { PaliGemmaProcessor, PaliGemmaForConditionalGeneration, RawImage } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + const text = "What is on the flower?"; + + // Empty white image + const dims = [224, 224, 3]; + const image = new RawImage(new Uint8ClampedArray(dims[0] * dims[1] * dims[2]).fill(255), ...dims); + + describe("PaliGemmaForConditionalGeneration", () => { + const model_id = "hf-internal-testing/tiny-random-PaliGemmaForConditionalGeneration"; + + /** @type {PaliGemmaForConditionalGeneration} */ + let model; + /** @type {PaliGemmaProcessor} */ + let processor; + beforeAll(async () => { + model = await PaliGemmaForConditionalGeneration.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + processor = await PaliGemmaProcessor.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "forward", + async () => { + const inputs = await processor(image, text); + + const { logits } = await model(inputs); + expect(logits.dims).toEqual([1, 264, 257216]); + expect(logits.mean().item()).toBeCloseTo(-0.0023024685215204954, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size=1", + async () => { + const inputs = await processor(image, text); + const generate_ids = await model.generate({ ...inputs, max_new_tokens: 10 }); + + const new_tokens = generate_ids.slice(null, 
[inputs.input_ids.dims.at(-1), null]); + expect(new_tokens.tolist()).toEqual([[91711n, 24904n, 144054n, 124983n, 83862n, 124983n, 124983n, 124983n, 141236n, 124983n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/patchtsmixer/test_modeling_patchtsmixer.js b/tests/models/patchtsmixer/test_modeling_patchtsmixer.js new file mode 100644 index 000000000..6eee6fbea --- /dev/null +++ b/tests/models/patchtsmixer/test_modeling_patchtsmixer.js @@ -0,0 +1,65 @@ +import { PatchTSMixerModel, PatchTSMixerForPrediction, Tensor } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + const dims = [64, 512, 7]; + const prod = dims.reduce((a, b) => a * b, 1); + const past_values = new Tensor( + "float32", + Float32Array.from({ length: prod }, (_, i) => i / prod), + dims, + ); + + describe("PatchTSMixerModel", () => { + const model_id = "hf-internal-testing/tiny-random-PatchTSMixerModel"; + + /** @type {PatchTSMixerModel} */ + let model; + beforeAll(async () => { + model = await PatchTSMixerModel.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it( + "default", + async () => { + const { last_hidden_state } = await model({ past_values }); + + const { num_input_channels, num_patches, d_model } = model.config; + expect(last_hidden_state.dims).toEqual([dims[0], num_input_channels, num_patches, d_model]); + expect(last_hidden_state.mean().item()).toBeCloseTo(0.03344963490962982, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + + describe("PatchTSMixerForPrediction", () => { + const model_id = "onnx-community/granite-timeseries-patchtsmixer"; + + /** @type {PatchTSMixerForPrediction} */ + let model; + beforeAll(async () => { + model = await PatchTSMixerForPrediction.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it( + "default", + async () => { + const { prediction_outputs } = await model({ past_values }); + + const { prediction_length, num_input_channels } = model.config; + expect(prediction_outputs.dims).toEqual([dims[0], prediction_length, num_input_channels]); + expect(prediction_outputs.mean().item()).toBeCloseTo(0.5064773559570312, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/patchtst/test_modeling_patchtst.js b/tests/models/patchtst/test_modeling_patchtst.js new file mode 100644 index 000000000..398538ea1 --- /dev/null +++ b/tests/models/patchtst/test_modeling_patchtst.js @@ -0,0 +1,65 @@ +import { PatchTSTModel, PatchTSTForPrediction, Tensor } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + const dims = [64, 512, 7]; + const prod = dims.reduce((a, b) => a * b, 1); + const past_values = new Tensor( + "float32", + Float32Array.from({ length: prod }, (_, i) => i / prod), + dims, + ); + + describe("PatchTSTModel", () => { + const model_id = "hf-internal-testing/tiny-random-PatchTSTModel"; + + /** @type {PatchTSTModel} */ + let model; + beforeAll(async () => { + model = await PatchTSTModel.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); 
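+ // Note: unlike the PatchTSMixer test above, which reads `num_patches` from the model config, the expected dims below hard-code 43 patches for the 512-step input, presumably because the exported PatchTST config does not expose the patch count directly.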
+ + it( + "default", + async () => { + const { last_hidden_state } = await model({ past_values }); + + const { num_input_channels, d_model } = model.config; + expect(last_hidden_state.dims).toEqual([dims[0], num_input_channels, 43, d_model]); + expect(last_hidden_state.mean().item()).toBeCloseTo(0.016672514379024506, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + + describe("PatchTSTForPrediction", () => { + const model_id = "onnx-community/granite-timeseries-patchtst"; + + /** @type {PatchTSTForPrediction} */ + let model; + beforeAll(async () => { + model = await PatchTSTForPrediction.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + }, MAX_MODEL_LOAD_TIME); + + it( + "default", + async () => { + const { prediction_outputs } = await model({ past_values }); + + const { prediction_length, num_input_channels } = model.config; + expect(prediction_outputs.dims).toEqual([dims[0], prediction_length, num_input_channels]); + expect(prediction_outputs.mean().item()).toBeCloseTo(0.506528377532959, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/pyannote/test_modeling_pyannote.js b/tests/models/pyannote/test_modeling_pyannote.js new file mode 100644 index 000000000..8e73c677c --- /dev/null +++ b/tests/models/pyannote/test_modeling_pyannote.js @@ -0,0 +1,56 @@ +import { AutoProcessor, AutoModelForAudioFrameClassification } from "../../../src/transformers.js"; + +import { MAX_TEST_EXECUTION_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; +import { compare } from "../../test_utils.js"; + +export default () => { + const models_to_test = ["onnx-community/pyannote-segmentation-3.0"]; + + let audio; + beforeAll(async () => { + const url = "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/mlk.npy"; + const buffer = await (await fetch(url)).arrayBuffer(); + audio = Float32Array.from(new Float64Array(buffer)); + }); + + it( + `PyAnnoteForAudioFrameClassification`, + async () => { + const model_id = models_to_test[0]; + + // Load model and processor + const model = await AutoModelForAudioFrameClassification.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + const processor = await AutoProcessor.from_pretrained(model_id); + + // Check processor config + expect(processor.sampling_rate).toEqual(16000); + + // Preprocess audio + const inputs = await processor(audio); + + // Run model with inputs + const { logits } = await model(inputs); + compare(logits.dims, [1, 767, 7]); + compare(logits.mean().item(), -4.822614669799805, 6); + + const result = processor.post_process_speaker_diarization(logits, audio.length); + const target = [ + [ + { id: 0, start: 0, end: 1.0512535626298245, confidence: 0.7898106738171984 }, + { id: 2, start: 1.0512535626298245, end: 2.373798367228636, confidence: 0.8923380609065887 }, + { id: 0, start: 2.373798367228636, end: 3.5776532534660155, confidence: 0.6920057005438546 }, + { id: 2, start: 3.5776532534660155, end: 4.578039708226655, confidence: 0.8169249580865657 }, + { id: 3, start: 4.578039708226655, end: 6.2396985652867, confidence: 0.6921662061495533 }, + { id: 2, start: 6.2396985652867, end: 8.664364040384521, confidence: 0.705263573835628 }, + { id: 0, start: 8.664364040384521, end: 10.071687358098641, confidence: 0.6650650397924295 }, + { id: 2, start: 10.071687358098641, end: 12.598087048934833, confidence: 0.8999033333468749 }, + { id: 0, start: 
12.598087048934833, end: 13.005023911888312, confidence: 0.37838892004965197 }, + ], + ]; + compare(result, target); + + await model.dispose(); + }, + MAX_TEST_EXECUTION_TIME, + ); +}; diff --git a/tests/models/qwen2/tokenization.js b/tests/models/qwen2/test_tokenization_qwen2.js similarity index 100% rename from tests/models/qwen2/tokenization.js rename to tests/models/qwen2/test_tokenization_qwen2.js diff --git a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.js b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.js new file mode 100644 index 000000000..a348ec427 --- /dev/null +++ b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.js @@ -0,0 +1,34 @@ +import { AutoImageProcessor, Qwen2VLImageProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // Qwen2VLImageProcessor + // - custom image processing (min_pixels, max_pixels) + describe("Qwen2VLImageProcessor", () => { + const model_id = "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration"; + + /** @type {Qwen2VLImageProcessor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "custom image processing", + async () => { + const image = await load_cached_image("white_image"); + const { pixel_values, image_grid_thw, original_sizes, reshaped_input_sizes } = await processor(image); + + expect(pixel_values.dims).toEqual([256, 1176]); + expect(pixel_values.mean().item()).toBeCloseTo(2.050372362136841, 6); + expect(image_grid_thw.tolist()).toEqual([[1n, 16n, 16n]]); + + expect(original_sizes).toEqual([[224, 224]]); + expect(reshaped_input_sizes).toEqual([[224, 224]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.js b/tests/models/qwen2_vl/test_modeling_qwen2_vl.js new file mode 100644 index 000000000..887a8e092 --- /dev/null +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.js @@ -0,0 +1,93 @@ +import { Qwen2VLProcessor, Qwen2VLForConditionalGeneration, RawImage } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + const CONVERSATION = [ + { + role: "user", + content: [{ type: "text", text: "Hello" }], + }, + ]; + + // Example adapted from https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct + const CONVERSATION_WITH_IMAGE = [ + { + role: "user", + content: [{ type: "image" }, { type: "text", text: "Describe this image." 
}], + }, + ]; + // Empty white image + const dims = [224, 224, 3]; + const image = new RawImage(new Uint8ClampedArray(dims[0] * dims[1] * dims[2]).fill(255), ...dims); + + describe("Qwen2VLForConditionalGeneration", () => { + const model_id = "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration"; + + /** @type {Qwen2VLForConditionalGeneration} */ + let model; + /** @type {Qwen2VLProcessor} */ + let processor; + beforeAll(async () => { + model = await Qwen2VLForConditionalGeneration.from_pretrained(model_id, { + // TODO move to config + ...DEFAULT_MODEL_OPTIONS, + }); + processor = await Qwen2VLProcessor.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "forward", + async () => { + const text = processor.apply_chat_template(CONVERSATION_WITH_IMAGE, { + add_generation_prompt: true, + }); + const inputs = await processor(text, image); + const { logits } = await model(inputs); + expect(logits.dims).toEqual([1, 89, 152064]); + expect(logits.mean().item()).toBeCloseTo(-0.0011299321195110679, 5); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "text-only (batch_size=1)", + async () => { + const text = processor.apply_chat_template(CONVERSATION, { + add_generation_prompt: true, + }); + const inputs = await processor(text); + const generate_ids = await model.generate({ + ...inputs, + max_new_tokens: 10, + }); + + const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]); + expect(new_tokens.tolist()).toEqual([[24284n, 63986n, 108860n, 84530n, 8889n, 23262n, 128276n, 64948n, 136757n, 138348n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "text + image (batch_size=1)", + async () => { + const text = processor.apply_chat_template(CONVERSATION_WITH_IMAGE, { + add_generation_prompt: true, + }); + const inputs = await processor(text, image); + const generate_ids = await model.generate({ + ...inputs, + max_new_tokens: 10, + }); + + const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]); + expect(new_tokens.tolist()).toEqual([[24284n, 35302n, 60575n, 38679n, 113390n, 115118n, 137596n, 38241n, 96726n, 142301n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/roberta/tokenization.js b/tests/models/roberta/test_tokenization_roberta.js similarity index 100% rename from tests/models/roberta/tokenization.js rename to tests/models/roberta/test_tokenization_roberta.js diff --git a/tests/models/sam/test_image_processing_sam.js b/tests/models/sam/test_image_processing_sam.js new file mode 100644 index 000000000..31fa53682 --- /dev/null +++ b/tests/models/sam/test_image_processing_sam.js @@ -0,0 +1,95 @@ +import { AutoImageProcessor, SamImageProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // SamImageProcessor + // - tests normal padding (do_pad=true, pad_size={"height":1024,"width":1024}) + // - In addition to the image, pass in a list of points + describe("SamImageProcessor", () => { + const model_id = "Xenova/sam-vit-base"; + + /** @type {SamImageProcessor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "without input points", + async () => { + const image = await load_cached_image("pattern_3x3"); + const { pixel_values, original_sizes, 
reshaped_input_sizes } = await processor(image); + expect(pixel_values.dims).toEqual([1, 3, 1024, 1024]); + expect(pixel_values.mean().item()).toBeCloseTo(-0.4505715670146813, 6); + + expect(original_sizes).toEqual([[3, 3]]); + expect(reshaped_input_sizes).toEqual([[1024, 1024]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "with input points", + async () => { + const image = await load_cached_image("pattern_3x3"); + const { original_sizes, reshaped_input_sizes, input_points } = await processor(image, { + input_points: [[[1, 2]]], + }); + + expect(original_sizes).toEqual([[3, 3]]); + expect(reshaped_input_sizes).toEqual([[1024, 1024]]); + expect(input_points.tolist()).toBeCloseToNested([[[[341.3333, 682.6667]]]], 4); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "multiple points with labels", + async () => { + const image = await load_cached_image("pattern_3x3"); + const { original_sizes, reshaped_input_sizes, input_points, input_labels } = await processor(image, { + input_points: [ + [ + [1, 2], + [2, 1], + ], + ], + input_labels: [[1, 0]], + }); + + expect(original_sizes).toEqual([[3, 3]]); + expect(reshaped_input_sizes).toEqual([[1024, 1024]]); + expect(input_points.tolist()).toBeCloseToNested( + [ + [ + [ + [341.3333, 682.6667], + [682.6667, 341.3333], + ], + ], + ], + 4, + ); + expect(input_labels.tolist()).toEqual([[[1n, 0n]]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "with input boxes", + async () => { + const image = await load_cached_image("pattern_3x3"); + const { original_sizes, reshaped_input_sizes, input_boxes } = await processor(image, { + input_boxes: [[[0, 1, 2, 2]]], + }); + + expect(original_sizes).toEqual([[3, 3]]); + expect(reshaped_input_sizes).toEqual([[1024, 1024]]); + expect(input_boxes.tolist()).toBeCloseToNested([[[0, 341.3333, 682.6667, 682.6667]]], 4); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/swin2sr/test_image_processing_swin2sr.js b/tests/models/swin2sr/test_image_processing_swin2sr.js new file mode 100644 index 000000000..db4579883 --- /dev/null +++ b/tests/models/swin2sr/test_image_processing_swin2sr.js @@ -0,0 +1,41 @@ +import { AutoImageProcessor, Swin2SRImageProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // Swin2SRImageProcessor + // - tests when padding is a number (do_pad=true, pad_size=8) + describe("Swin2SRImageProcessor", () => { + const model_id = "Xenova/swin2SR-classical-sr-x2-64"; + + /** @type {Swin2SRImageProcessor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "Pad to multiple of 8 (3x3 -> 8x8)", + async () => { + const image = await load_cached_image("pattern_3x3"); + const { pixel_values } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 3, 8, 8]); + expect(pixel_values.mean().item()).toBeCloseTo(0.5458333368102709, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Do not pad if already a multiple of 8 (8x8 -> 8x8)", + async () => { + const image = await load_cached_image("checkerboard_8x8"); + const { pixel_values } = await processor(image); + expect(pixel_values.dims).toEqual([1, 3, 8, 8]); + expect(pixel_values.mean().item()).toBeCloseTo(0.5, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/t5/test_modeling_t5.js b/tests/models/t5/test_modeling_t5.js new file mode 100644 
index 000000000..c48150d1f --- /dev/null +++ b/tests/models/t5/test_modeling_t5.js @@ -0,0 +1,96 @@ +import { T5Tokenizer, T5Model, T5ForConditionalGeneration } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("T5Model", () => { + const model_id = "hf-internal-testing/tiny-random-T5Model"; + + /** @type {T5Model} */ + let model; + /** @type {T5Tokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await T5Model.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await T5Tokenizer.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "forward", + async () => { + // Example adapted from https://huggingface.co/google-t5/t5-small#how-to-get-started-with-the-model + const inputs = tokenizer("Studies have been shown that owning a dog is good for you"); + const { input_ids: decoder_input_ids } = tokenizer("Studies show that"); + + const { last_hidden_state } = await model({ ...inputs, decoder_input_ids }); + expect(last_hidden_state.dims).toEqual([1, 4, 32]); + expect(last_hidden_state.mean().item()).toBeCloseTo(7.492632721550763e-5, 8); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + describe("T5ForConditionalGeneration", () => { + const model_id = "hf-internal-testing/tiny-random-T5ForConditionalGeneration"; + + /** @type {T5ForConditionalGeneration} */ + let model; + /** @type {T5Tokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await T5ForConditionalGeneration.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await T5Tokenizer.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "forward", + async () => { + // Example adapted from https://huggingface.co/google-t5/t5-small#how-to-get-started-with-the-model + const inputs = tokenizer("Studies have been shown that owning a dog is good for you"); + const { input_ids: decoder_input_ids } = tokenizer("Studies show that"); + + const model = await T5ForConditionalGeneration.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + const outputs = await model({ ...inputs, decoder_input_ids }); + expect(outputs.logits.dims).toEqual([1, 4, 32100]); + expect(outputs.logits.mean().item()).toBeCloseTo(8.867568901393952e-9, 12); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("hello"); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([[0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "batch_size>1", + async () => { + const inputs = tokenizer(["hello", "hello world"], { padding: true }); + const outputs = await model.generate({ + ...inputs, + max_length: 10, + }); + expect(outputs.tolist()).toEqual([ + [0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n], + [0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/t5/tokenization.js b/tests/models/t5/test_tokenization_t5.js similarity index 100% rename from tests/models/t5/tokenization.js rename to tests/models/t5/test_tokenization_t5.js diff --git a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.js b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.js new file mode 
100644 index 000000000..a13c4bb5e --- /dev/null +++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.js @@ -0,0 +1,52 @@ +import { GPT2Tokenizer, VisionEncoderDecoderModel, RawImage, full } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("VisionEncoderDecoderModel", () => { + const model_id = "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2"; + + /** @type {VisionEncoderDecoderModel} */ + let model; + /** @type {GPT2Tokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await VisionEncoderDecoderModel.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await GPT2Tokenizer.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const outputs = await model.generate({ + pixel_values: full([1, 3, 30, 30], -1.0), + max_length: 5, + }); + expect(outputs.tolist()).toEqual([[0n, 400n, 400n, 400n, 400n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + // TODO: Add back + // it('batch_size>1', async () => { + // const outputs = await model.generate({ + // pixel_values: cat([ + // full([1, 3, 30, 30], -1.0), + // full([1, 3, 30, 30], 0.0), + // ]), + // max_length: 5, + // }); + // expect(outputs.tolist()).toEqual([ + // // Generation continues + // [0n, 400n, 400n, 400n, 400n], + + // // Finishes early. 1023 is the padding token + // [0n, 0n, 1023n, 1023n, 1023n], + // ]); + // }, MAX_TEST_EXECUTION_TIME); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/vit/test_image_processing_vit.js b/tests/models/vit/test_image_processing_vit.js new file mode 100644 index 000000000..a69cbdc37 --- /dev/null +++ b/tests/models/vit/test_image_processing_vit.js @@ -0,0 +1,31 @@ +import { AutoImageProcessor, ViTFeatureExtractor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + describe("ViTFeatureExtractor", () => { + const model_id = "Xenova/vit-base-patch16-224"; + + /** @type {ViTFeatureExtractor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "default", + async () => { + const image = await load_cached_image("tiger"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 3, 224, 224]); + expect(pixel_values.mean().item()).toBeCloseTo(-0.22706867939852762, 6); + + expect(original_sizes).toEqual([[408, 612]]); + expect(reshaped_input_sizes).toEqual([[224, 224]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/vitmatte/test_image_processing_vitmatte.js b/tests/models/vitmatte/test_image_processing_vitmatte.js new file mode 100644 index 000000000..56e9ec4bf --- /dev/null +++ b/tests/models/vitmatte/test_image_processing_vitmatte.js @@ -0,0 +1,68 @@ +import { AutoImageProcessor, VitMatteImageProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // VitMatteImageProcessor + // - tests custom overrides + // - tests multiple inputs + // - tests `size_divisibility` and no 
size (size_divisibility=32) + // - tests do_pad and `size_divisibility` + describe("VitMatteImageProcessor", () => { + const model_id = "Xenova/vitmatte-small-distinctions-646"; + + /** @type {VitMatteImageProcessor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "w/o resize", + async () => { + const image = await load_cached_image("vitmatte_image"); + const image2 = await load_cached_image("vitmatte_trimap"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image, image2); + const { data, dims } = pixel_values; + + expect(dims).toEqual([1, 4, 640, 960]); + expect(pixel_values.mean().item()).toBeCloseTo(-0.4028555154800415); + expect(data[0]).toBeCloseTo(-0.9921568632125854); + expect(data[1]).toBeCloseTo(-0.9921568632125854); + expect(data[5]).toBeCloseTo(-1.0); + expect(data[640]).toBeCloseTo(-0.6784313917160034); + expect(data[641]).toBeCloseTo(-0.6705882549285889); + expect(data[640 * 960]).toBeCloseTo(-1.0); + expect(data[640 * 960 + 1]).toBeCloseTo(-1.0); + expect(data.at(-1)).toBeCloseTo(0.0); + + expect(original_sizes).toEqual([[640, 960]]); + expect(reshaped_input_sizes).toEqual([[640, 960]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "w/ resize", + async () => { + const image = await load_cached_image("pattern_3x5"); + const image2 = await load_cached_image("pattern_3x5"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image, image2); + const { data, dims } = pixel_values; + expect(dims).toEqual([1, 4, 32, 32]); + expect(pixel_values.mean().item()).toBeCloseTo(-0.00867417361587286); + expect(data[0]).toBeCloseTo(-0.9921568632125854); + expect(data[1]).toBeCloseTo(-0.9686274528503418); + expect(data[5]).toBeCloseTo(0.0); + expect(data[32]).toBeCloseTo(-0.9215686321258545); + expect(data[33]).toBeCloseTo(-0.8980392217636108); + expect(data.at(-1)).toBeCloseTo(0.0); + + expect(original_sizes).toEqual([[5, 3]]); + expect(reshaped_input_sizes).toEqual([[5, 3]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; diff --git a/tests/models/vits/tokenization.js b/tests/models/vits/test_tokenization_vits.js similarity index 100% rename from tests/models/vits/tokenization.js rename to tests/models/vits/test_tokenization_vits.js diff --git a/tests/models/wav2vec2/tokenization.js b/tests/models/wav2vec2/test_tokenization_wav2vec2.js similarity index 100% rename from tests/models/wav2vec2/tokenization.js rename to tests/models/wav2vec2/test_tokenization_wav2vec2.js diff --git a/tests/models/whisper/test_modeling_whisper.js b/tests/models/whisper/test_modeling_whisper.js new file mode 100644 index 000000000..fff3cc1f7 --- /dev/null +++ b/tests/models/whisper/test_modeling_whisper.js @@ -0,0 +1,148 @@ +import { WhisperTokenizer, WhisperForConditionalGeneration, full } from "../../../src/transformers.js"; + +import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js"; + +export default () => { + describe("WhisperForConditionalGeneration", () => { + const model_id = "Xenova/tiny-random-WhisperForConditionalGeneration"; + + /** @type {WhisperForConditionalGeneration} */ + let model; + /** @type {WhisperTokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await WhisperForConditionalGeneration.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await WhisperTokenizer.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + describe("prefix 
tokens", () => { + const input_features = full([1, 80, 3000], 0.0); + + describe("English-only", () => { + it( + "default", + async () => { + const outputs = await model.generate({ + input_features, + is_multilingual: false, + max_new_tokens: 1, + }); + + expect(outputs.tolist()).toEqual([[/* Prefix */ 50258n, 50363n, /* Generated */ 45084n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "return_timestamps=true", + async () => { + const outputs = await model.generate({ + input_features, + is_multilingual: false, + max_new_tokens: 1, + return_timestamps: true, + }); + + expect(outputs.tolist()).toEqual([[/* Prefix */ 50258n, /* Generated */ 50366n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + describe("multilingual", () => { + it( + "language unset; task unset", + async () => { + // language defaults to 'en' + // task defaults to 'transcribe' + + const outputs = await model.generate({ + input_features, + max_new_tokens: 1, + }); + + expect(outputs.tolist()).toEqual([[/* Prefix */ 50258n, 50259n, 50359n, 50363n, /* Generated */ 45084n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "language set; task unset", + async () => { + // task defaults to 'transcribe' + const outputs = await model.generate({ + input_features, + max_new_tokens: 1, + language: "af", + }); + + expect(outputs.tolist()).toEqual([[/* Prefix */ 50258n, 50327n, 50359n, 50363n, /* Generated */ 45084n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "language set; task set", + async () => { + const outputs = await model.generate({ + input_features, + max_new_tokens: 1, + language: "zh", + task: "translate", + }); + + expect(outputs.tolist()).toEqual([[/* Prefix */ 50258n, 50260n, 50358n, 50363n, /* Generated */ 45084n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "return_timestamps=true", + async () => { + const outputs = await model.generate({ + input_features, + max_new_tokens: 1, + language: "en", + task: "transcribe", + return_timestamps: true, + }); + + expect(outputs.tolist()).toEqual([[/* Prefix */ 50258n, 50259n, 50359n, /* Generated */ 50400n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + }); + + describe("decoder_start_ids", () => { + const input_features = full([1, 80, 3000], 0.0); + + it( + "broadcast inputs", + async () => { + const { decoder_start_token_id, lang_to_id, task_to_id, no_timestamps_token_id } = model.generation_config; + + const outputs = await model.generate({ + input_features, // batch size 1 + max_new_tokens: 1, + decoder_input_ids: [ + // batch size 2 + // <|startoftranscript|> <|lang_id|> <|task|> [<|notimestamps|>] + [decoder_start_token_id, lang_to_id["<|en|>"], task_to_id["translate"], no_timestamps_token_id], + [decoder_start_token_id, lang_to_id["<|fr|>"], task_to_id["transcribe"], no_timestamps_token_id], + ], + }); + expect(outputs.tolist()).toEqual([ + [/* Prefix */ 50258n, 50259n, 50358n, 50363n, /* Generated */ 45084n], + [/* Prefix */ 50258n, 50265n, 50359n, 50363n, /* Generated */ 45084n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}; diff --git a/tests/models/whisper/tokenization.js b/tests/models/whisper/test_tokenization_whisper.js similarity index 100% rename from tests/models/whisper/tokenization.js rename to tests/models/whisper/test_tokenization_whisper.js diff --git a/tests/models/xlm-roberta/tokenization.js b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.js similarity index 100% rename from tests/models/xlm-roberta/tokenization.js rename to 
tests/models/xlm_roberta/test_tokenization_xlm_roberta.js
diff --git a/tests/models/yolos/test_image_processing_yolos.js b/tests/models/yolos/test_image_processing_yolos.js new file mode 100644 index 000000000..3eb9c1d44 --- /dev/null +++ b/tests/models/yolos/test_image_processing_yolos.js @@ -0,0 +1,31 @@ +import { AutoImageProcessor, YolosFeatureExtractor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + describe("YolosFeatureExtractor", () => { + const model_id = "Xenova/yolos-small-300"; + + /** @type {YolosFeatureExtractor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "default", + async () => { + const image = await load_cached_image("tiger"); + const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); + + expect(pixel_values.dims).toEqual([1, 3, 888, 1333]); + expect(pixel_values.mean().item()).toBeCloseTo(-0.27840224131001773, 6); + + expect(original_sizes).toEqual([[408, 612]]); + expect(reshaped_input_sizes).toEqual([[888, 1333]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +};
diff --git a/tests/processors.test.js b/tests/processors.test.js index 8e3133563..e35e555d2 100644 --- a/tests/processors.test.js +++ b/tests/processors.test.js @@ -1,785 +1,67 @@ -import { env, AutoProcessor, AutoImageProcessor, RawImage } from "../src/transformers.js"; +import fs from "fs"; +import path from "path"; + +import { AutoProcessor } from "../src/transformers.js"; +import { load_cached_image } from "./asset_cache.js"; import { init, MAX_TEST_TIME } from "./init.js"; -import { compare } from "./test_utils.js"; +import { fileURLToPath } from "url"; // Initialise the testing environment init(); -env.allowLocalModels = false; -env.useFSCache = false; -const sum = (array) => Number(array.reduce((a, b) => a + b, array instanceof BigInt64Array ? 0n : 0)); -const avg = (array) => sum(array) / array.length; +// Collect all unit tests, which can be found in files of the form: +// `tests/models/<model_type>/test_image_processing_<model_type>.js` +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); +const models_dir = path.join(__dirname, "models"); +const model_types = fs.readdirSync(models_dir); +for (const model_type of model_types) { + const dir = path.join(models_dir, model_type); + + if (!fs.existsSync(dir) || !fs.statSync(dir).isDirectory()) { + continue; + } -/** @type {Map} */ -const IMAGE_CACHE = new Map(); -const load_image = async (url) => { - const cached = IMAGE_CACHE.get(url); - if (cached) { - return cached; + const file = path.join(dir, `test_image_processing_${model_type}.js`); + if (!fs.existsSync(file)) { + continue; } - const image = await RawImage.fromURL(url); - IMAGE_CACHE.set(url, image); - return image; -}; + + const { default: tests } = await import(file); + describe(model_type, tests); +} + +const sum = (array) => Number(array.reduce((a, b) => a + b, array instanceof BigInt64Array ?
0n : 0)); +const avg = (array) => sum(array) / array.length; const MODELS = { - swin2sr: "Xenova/swin2SR-classical-sr-x2-64", - sam: "Xenova/sam-vit-base", - "donut-swin": "Xenova/donut-base-finetuned-cord-v2", - resnet: "Xenova/resnet-50", - vit: "Xenova/vit-base-patch16-224", - mobilevit: "Xenova/mobilevit-small", - mobilevit_2: "Xenova/quickdraw-mobilevit-small", - mobilevit_3: "Xenova/mobilevitv2-1.0-imagenet1k-256", - deit: "Xenova/deit-tiny-distilled-patch16-224", - beit: "Xenova/beit-base-patch16-224-pt22k-ft22k", - detr: "Xenova/detr-resnet-50", - yolos: "Xenova/yolos-small-300", - dpt: "Xenova/dpt-hybrid-midas", - dpt_2: "Xenova/depth-anything-small-hf", - glpn: "Xenova/glpn-kitti", - nougat: "Xenova/nougat-small", - owlvit: "Xenova/owlvit-base-patch32", - clip: "Xenova/clip-vit-base-patch16", - jina_clip: "jinaai/jina-clip-v2", - vitmatte: "Xenova/vitmatte-small-distinctions-646", - dinov2: "Xenova/dinov2-small-imagenet1k-1-layer", - // efficientnet: 'Xenova/efficientnet-b0', florence2: "Xenova/tiny-random-Florence2ForConditionalGeneration", qwen2_vl: "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration", idefics3: "hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration", paligemma: "hf-internal-testing/tiny-random-PaliGemmaForConditionalGeneration", }; -const BASE_URL = "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/"; -const TEST_IMAGES = { - white_image: BASE_URL + "white-image.png", - pattern_3x3: BASE_URL + "pattern_3x3.png", - pattern_3x5: BASE_URL + "pattern_3x5.png", - checkerboard_8x8: BASE_URL + "checkerboard_8x8.png", - checkerboard_64x32: BASE_URL + "checkerboard_64x32.png", - gradient_1280x640: BASE_URL + "gradient_1280x640.png", - receipt: BASE_URL + "receipt.png", - tiger: BASE_URL + "tiger.jpg", - paper: BASE_URL + "nougat_paper.png", - cats: BASE_URL + "cats.jpg", - - // grayscale image - skateboard: "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/ml-web-games/skateboard.png", - - vitmatte_image: BASE_URL + "vitmatte_image.png", - vitmatte_trimap: BASE_URL + "vitmatte_trimap.png", - - beetle: BASE_URL + "beetle.png", - book_cover: BASE_URL + "book-cover.png", -}; - describe("Processors", () => { - describe("Image processors", () => { - // Swin2SRImageProcessor - // - tests when padding is a number (do_pad=true, pad_size=8) - it( - MODELS.swin2sr, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.swin2sr); - - { - // Pad to multiple of 8 (3x3 -> 8x8) - const image = await load_image(TEST_IMAGES.pattern_3x3); - const { pixel_values } = await processor(image); - - compare(pixel_values.dims, [1, 3, 8, 8]); - compare(avg(pixel_values.data), 0.5458333368102709); - } - - { - // Do not pad if already a multiple of 8 (8x8 -> 8x8) - const image = await load_image(TEST_IMAGES.checkerboard_8x8); - const { pixel_values } = await processor(image); - compare(pixel_values.dims, [1, 3, 8, 8]); - compare(avg(pixel_values.data), 0.5); - } - }, - MAX_TEST_TIME, - ); - - // SamProcessor/SamImageProcessor - // - tests normal padding (do_pad=true, pad_size={"height":1024,"width":1024}) - // - In addition to the image, pass in a list of points - it( - MODELS.sam, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.sam); - - { - // without input points - const image = await load_image(TEST_IMAGES.pattern_3x3); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - compare(pixel_values.dims, [1, 3, 1024, 
1024]); - compare(avg(pixel_values.data), -0.4505715670146813); - - compare(original_sizes, [[3, 3]]); - compare(reshaped_input_sizes, [[1024, 1024]]); - } - - { - // with input points - const image = await load_image(TEST_IMAGES.pattern_3x3); - const { original_sizes, reshaped_input_sizes, input_points } = await processor(image, { - input_points: [[[1, 2]]], - }); - - compare(original_sizes, [[3, 3]]); - compare(reshaped_input_sizes, [[1024, 1024]]); - compare(input_points.tolist(), [[[[341.3333, 682.6667]]]]); - } - - { - // multiple points with labels - const image = await load_image(TEST_IMAGES.pattern_3x3); - const { original_sizes, reshaped_input_sizes, input_points, input_labels } = await processor(image, { - input_points: [ - [ - [1, 2], - [2, 1], - ], - ], - input_labels: [[1, 0]], - }); - - compare(original_sizes, [[3, 3]]); - compare(reshaped_input_sizes, [[1024, 1024]]); - compare(input_points.tolist(), [ - [ - [ - [341.3333, 682.6667], - [682.6667, 341.3333], - ], - ], - ]); - compare(input_labels.tolist(), [[[1n, 0n]]]); - } - - { - // with input boxes - const image = await load_image(TEST_IMAGES.pattern_3x3); - const { original_sizes, reshaped_input_sizes, input_boxes } = await processor(image, { - input_boxes: [[[0, 1, 2, 2]]], - }); - - compare(original_sizes, [[3, 3]]); - compare(reshaped_input_sizes, [[1024, 1024]]); - compare(input_boxes.tolist(), [[[0, 341.3333, 682.6667, 682.6667]]]); - } - }, - MAX_TEST_TIME, - ); - - // DonutProcessor/DonutFeatureExtractor - // - tests thumbnail resizing (do_thumbnail=true, size=[960, 1280]) - // - tests padding after normalization (image_mean=image_std=0.5) - it( - MODELS["donut-swin"], - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS["donut-swin"]); - - { - const image = await load_image(TEST_IMAGES.receipt); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - - compare(pixel_values.dims, [1, 3, 1280, 960]); - compare(avg(pixel_values.data), 0.1229388610053704); - - compare(original_sizes, [[864, 576]]); - compare(reshaped_input_sizes, [[1280, 853]]); - } - }, - MAX_TEST_TIME, - ); - - // ConvNextFeatureExtractor - it( - MODELS.resnet, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.resnet); - - { - const image = await load_image(TEST_IMAGES.tiger); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - - compare(pixel_values.dims, [1, 3, 224, 224]); - compare(avg(pixel_values.data), 0.06262318789958954); - - compare(original_sizes, [[408, 612]]); - compare(reshaped_input_sizes, [[224, 224]]); - } - }, - MAX_TEST_TIME, - ); - - // ViTFeatureExtractor - it( - MODELS.vit, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.vit); - - { - const image = await load_image(TEST_IMAGES.tiger); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - - compare(pixel_values.dims, [1, 3, 224, 224]); - compare(avg(pixel_values.data), -0.22706867939852762); - - compare(original_sizes, [[408, 612]]); - compare(reshaped_input_sizes, [[224, 224]]); - } - }, - MAX_TEST_TIME, - ); - - // MobileViTFeatureExtractor - it( - MODELS.mobilevit, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.mobilevit); - - { - const image = await load_image(TEST_IMAGES.tiger); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - - compare(pixel_values.dims, [1, 3, 256, 256]); - 
compare(avg(pixel_values.data), 0.4599160496887033); - - compare(original_sizes, [[408, 612]]); - compare(reshaped_input_sizes, [[256, 256]]); - } - }, - MAX_TEST_TIME, - ); - - // MobileViTFeatureExtractor - // - tests not converting to rgb (do_convert_rgb=false) - it( - MODELS.mobilevit_2, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.mobilevit_2); - - { - // Tests grayscale image - const image = await load_image(TEST_IMAGES.skateboard); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - - compare(pixel_values.dims, [1, 1, 28, 28]); - compare(avg(pixel_values.data), 0.08558923671585128); - - compare(original_sizes, [[28, 28]]); - compare(reshaped_input_sizes, [[28, 28]]); - } - }, - MAX_TEST_TIME, - ); - - // MobileViTImageProcessor - // - tests converting RGB to BGR (do_flip_channel_order=true) - it( - MODELS.mobilevit_3, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.mobilevit_3); - - { - const image = await load_image(TEST_IMAGES.cats); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - - compare(pixel_values.dims, [1, 3, 256, 256]); - compare(avg(pixel_values.data), 0.5215385556221008); - - compare(original_sizes, [[480, 640]]); - compare(reshaped_input_sizes, [[256, 256]]); - - // Ensure RGB to BGR conversion - compare(pixel_values.data.slice(0, 3), [0.24313725531101227, 0.250980406999588, 0.364705890417099]); - } - }, - MAX_TEST_TIME, - ); - - // DeiTFeatureExtractor - it( - MODELS.deit, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.deit); - - { - const image = await load_image(TEST_IMAGES.tiger); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - - compare(pixel_values.dims, [1, 3, 224, 224]); - compare(avg(pixel_values.data), -0.2760336682859463); - - compare(original_sizes, [[408, 612]]); - compare(reshaped_input_sizes, [[224, 224]]); - } - }, - MAX_TEST_TIME, - ); - - // BeitFeatureExtractor - it( - MODELS.beit, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.beit); - - { - const image = await load_image(TEST_IMAGES.tiger); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - - compare(pixel_values.dims, [1, 3, 224, 224]); - compare(avg(pixel_values.data), -0.22706867939852762); - - compare(original_sizes, [[408, 612]]); - compare(reshaped_input_sizes, [[224, 224]]); - } - }, - MAX_TEST_TIME, - ); - - // DetrFeatureExtractor - it( - MODELS.detr, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.detr); - - { - const image = await load_image(TEST_IMAGES.tiger); - const { pixel_values, original_sizes, reshaped_input_sizes, pixel_mask } = await processor(image); - - compare(pixel_values.dims, [1, 3, 888, 1333]); - compare(avg(pixel_values.data), -0.27840224131001773); - - compare(original_sizes, [[408, 612]]); - compare(reshaped_input_sizes, [[888, 1333]]); - - compare(pixel_mask.dims, [1, 64, 64]); - compare(avg(pixel_mask.data), 1); - } - }, - MAX_TEST_TIME, - ); - - // YolosFeatureExtractor - it( - MODELS.yolos, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.yolos); - - { - const image = await load_image(TEST_IMAGES.tiger); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - - compare(pixel_values.dims, [1, 3, 888, 1333]); - compare(avg(pixel_values.data), 
-0.27840224131001773); - - compare(original_sizes, [[408, 612]]); - compare(reshaped_input_sizes, [[888, 1333]]); - } - }, - MAX_TEST_TIME, - ); - - // DPTFeatureExtractor - it( - MODELS.dpt, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.dpt); - - { - // Tests grayscale image - const image = await load_image(TEST_IMAGES.cats); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - - compare(pixel_values.dims, [1, 3, 384, 384]); - compare(avg(pixel_values.data), 0.0372855559389454); - - compare(original_sizes, [[480, 640]]); - compare(reshaped_input_sizes, [[384, 384]]); - } - }, - MAX_TEST_TIME, - ); - - // GLPNForDepthEstimation - // - tests `size_divisor` and no size (size_divisor=32) - it( - MODELS.glpn, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.glpn); - - { - const image = await load_image(TEST_IMAGES.cats); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - compare(pixel_values.dims, [1, 3, 480, 640]); - compare(avg(pixel_values.data), 0.5186172404123327); - - compare(original_sizes, [[480, 640]]); - compare(reshaped_input_sizes, [[480, 640]]); - } - - { - // Tests input which is not a multiple of 32 ([408, 612] -> [384, 608]) - const image = await load_image(TEST_IMAGES.tiger); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - - compare(pixel_values.dims, [1, 3, 384, 608]); - compare(avg(pixel_values.data), 0.38628831535989555); - - compare(original_sizes, [[408, 612]]); - compare(reshaped_input_sizes, [[384, 608]]); - } - }, - MAX_TEST_TIME, - ); - - // NougatImageProcessor - // - tests padding after normalization (image_mean != 0.5, image_std != 0.5) - it( - MODELS.nougat, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.nougat); - - { - const image = await load_image(TEST_IMAGES.paper); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - - compare(pixel_values.dims, [1, 3, 896, 672]); - compare(avg(pixel_values.data), 1.8447155005897355); - - compare(original_sizes, [[850, 685]]); - compare(reshaped_input_sizes, [[833, 672]]); - } - }, - MAX_TEST_TIME, - ); - - // OwlViTFeatureExtractor - it(MODELS.owlvit, async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.owlvit); - { - const image = await load_image(TEST_IMAGES.cats); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - - compare(pixel_values.dims, [1, 3, 768, 768]); - compare(avg(pixel_values.data), 0.250620447910435); - - compare(original_sizes, [[480, 640]]); - compare(reshaped_input_sizes, [[768, 768]]); - } - }); - - // CLIPFeatureExtractor - // - tests center crop (do_center_crop=true, crop_size=224) - it( - MODELS.clip, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.clip); - - { - const image = await load_image(TEST_IMAGES.tiger); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - - compare(pixel_values.dims, [1, 3, 224, 224]); - compare(avg(pixel_values.data), -0.06678297738282096); - - compare(original_sizes, [[408, 612]]); - compare(reshaped_input_sizes, [[224, 224]]); - } - }, - MAX_TEST_TIME, - ); - - // JinaCLIPImageProcessor - // - custom config overrides - it( - MODELS.jina_clip, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.jina_clip); - - { - const image = await 
load_image(TEST_IMAGES.tiger); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - - compare(pixel_values.dims, [1, 3, 512, 512]); - compare(avg(pixel_values.data), -0.06637834757566452); - - compare(original_sizes, [[408, 612]]); - compare(reshaped_input_sizes, [[512, 512]]); - } - }, - MAX_TEST_TIME, - ); - - // VitMatteImageProcessor - // - tests custom overrides - // - tests multiple inputs - // - tests `size_divisibility` and no size (size_divisibility=32) - // - tests do_pad and `size_divisibility` - it( - MODELS.vitmatte, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.vitmatte); - - { - const image = await load_image(TEST_IMAGES.vitmatte_image); - const image2 = await load_image(TEST_IMAGES.vitmatte_trimap); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image, image2); - - compare(pixel_values.dims, [1, 4, 640, 960]); - expect(avg(pixel_values.data)).toBeCloseTo(-0.4028555154800415); - expect(pixel_values.data[0]).toBeCloseTo(-0.9921568632125854); - expect(pixel_values.data[1]).toBeCloseTo(-0.9921568632125854); - expect(pixel_values.data[5]).toBeCloseTo(-1.0); - expect(pixel_values.data[640]).toBeCloseTo(-0.6784313917160034); - expect(pixel_values.data[641]).toBeCloseTo(-0.6705882549285889); - expect(pixel_values.data[640 * 960]).toBeCloseTo(-1.0); - expect(pixel_values.data[640 * 960 + 1]).toBeCloseTo(-1.0); - expect(pixel_values.data.at(-1)).toBeCloseTo(0.0); - - compare(original_sizes, [[640, 960]]); - compare(reshaped_input_sizes, [[640, 960]]); - } - - { - const image = await load_image(TEST_IMAGES.pattern_3x5); - const image2 = await load_image(TEST_IMAGES.pattern_3x5); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image, image2); - - compare(pixel_values.dims, [1, 4, 32, 32]); - expect(avg(pixel_values.data)).toBeCloseTo(-0.00867417361587286); - expect(pixel_values.data[0]).toBeCloseTo(-0.9921568632125854); - expect(pixel_values.data[1]).toBeCloseTo(-0.9686274528503418); - expect(pixel_values.data[5]).toBeCloseTo(0.0); - expect(pixel_values.data[32]).toBeCloseTo(-0.9215686321258545); - expect(pixel_values.data[33]).toBeCloseTo(-0.8980392217636108); - expect(pixel_values.data.at(-1)).toBeCloseTo(0.0); - - compare(original_sizes, [[5, 3]]); - compare(reshaped_input_sizes, [[5, 3]]); - } - }, - MAX_TEST_TIME, - ); - - // BitImageProcessor - it( - MODELS.dinov2, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.dinov2); - - { - const image = await load_image(TEST_IMAGES.tiger); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - - compare(pixel_values.dims, [1, 3, 224, 224]); - compare(avg(pixel_values.data), 0.06262318789958954); - - compare(original_sizes, [[408, 612]]); - compare(reshaped_input_sizes, [[224, 224]]); - } - }, - MAX_TEST_TIME, - ); - - // DPTImageProcessor - // - tests ensure_multiple_of - // - tests keep_aspect_ratio - // - tests bankers rounding - it( - MODELS.dpt_2, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.dpt_2); - - { - const image = await load_image(TEST_IMAGES.cats); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - - compare(pixel_values.dims, [1, 3, 518, 686]); - compare(avg(pixel_values.data), 0.30337387323379517); - - compare(original_sizes, [[480, 640]]); - compare(reshaped_input_sizes, [[518, 686]]); - } - - { - const image = await 
load_image(TEST_IMAGES.checkerboard_64x32); - const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - - // NOTE: without bankers rounding, this would be [1, 3, 266, 518] - compare(pixel_values.dims, [1, 3, 252, 518]); - compare(avg(pixel_values.data), 0.2267402559518814); - - compare(original_sizes, [[32, 64]]); - compare(reshaped_input_sizes, [[252, 518]]); - } - }, - MAX_TEST_TIME, - ); - - // TODO: Add back - // // EfficientNetImageProcessor - // // - tests include_top - // it(MODELS.efficientnet, async () => { - // const processor = await AutoImageProcessor.from_pretrained(MODELS.efficientnet) - - // { - // const image = await load_image(TEST_IMAGES.cats); - // const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image); - - // compare(pixel_values.dims, [1, 3, 224, 224]); - // compare(avg(pixel_values.data), 0.3015307230282871); - - // compare(original_sizes, [[480, 640]]); - // compare(reshaped_input_sizes, [[224, 224]]); - // } - // }, MAX_TEST_TIME); - - // Qwen2VLImageProcessor - // - custom image processing (min_pixels, max_pixels) - it( - MODELS.qwen2_vl, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.qwen2_vl); - - { - const image = await load_image(TEST_IMAGES.white_image); - const { pixel_values, image_grid_thw, original_sizes, reshaped_input_sizes } = await processor(image); - - compare(pixel_values.dims, [256, 1176]); - compare(avg(pixel_values.data), 2.050372362136841); - compare(image_grid_thw.tolist(), [[1n, 16n, 16n]]); - - compare(original_sizes, [[224, 224]]); - compare(reshaped_input_sizes, [[224, 224]]); - } - }, - MAX_TEST_TIME, - ); - - // Idefics3ImageProcessor - // - custom image processing (patching) - it( - MODELS.idefics3, - async () => { - const processor = await AutoImageProcessor.from_pretrained(MODELS.idefics3); - - const image = await load_image(TEST_IMAGES.gradient_1280x640); - const image_1 = await image.resize(1600, 1067); - const image_2 = await image.resize(224, 224); - - const white_image = await load_image(TEST_IMAGES.white_image); - const white_image_1 = await white_image.resize(1600, 1067); - const white_image_2 = await white_image.resize(224, 224); - - { - // test no image splitting - const { pixel_values, rows, cols } = await processor(image, { do_image_splitting: false, return_row_col_info: true }); - compare(pixel_values.dims, [1, 1, 3, 364, 364]); - compare( - pixel_values.mean().item(), - -0.001035306602716446, - 0.1, // threshold - ); - compare(rows, [[0]]); - compare(cols, [[0]]); - } - - { - // test batched no image splitting - const { pixel_values, pixel_attention_mask, rows, cols } = await processor([[white_image_1], [white_image_2], [white_image_1, white_image_2]], { do_image_splitting: false, return_row_col_info: true }); - compare(pixel_values.dims, [3, 2, 3, 364, 364]); - compare( - pixel_values.mean().item(), - 2 / 3, - 0.01, // threshold - ); - compare(pixel_attention_mask.dims, [3, 2, 364, 364]); - compare( - pixel_attention_mask.to("float32").mean().item(), - 2 / 3, - 0.001, // threshold - ); - compare(rows, [[0], [0], [0, 0]]); - compare(cols, [[0], [0], [0, 0]]); - - // Test that the order of the pixel attention mask matches the python implementation - compare( - pixel_attention_mask.data.reduce((a, b, i) => a + i * b, 0), - 228217205216, - ); - } - - { - // test correct patching - const { pixel_values, rows, cols } = await processor(image, { return_row_col_info: true }); - compare(pixel_values.dims, [1, 9, 3, 364, 364]); - 
compare( - pixel_values.flatten(2).mean(2).tolist(), - [[-0.7012196183204651, -0.30104631185531616, 0.09912905097007751, 0.49929487705230713, -0.5011996626853943, -0.10103467106819153, 0.2991456389427185, 0.6993265151977539, -0.0010353063698858023]], - 0.1, // threshold - ); - compare(rows, [[2]]); - compare(cols, [[4]]); - } - - { - // unbatched, single image - const { pixel_values, rows, cols } = await processor(image_1, { return_row_col_info: true }); - compare(pixel_values.dims, [1, 13, 3, 364, 364]); - - compare(rows, [[3]]); - compare(cols, [[4]]); - } - - { - // unbatched, multiple images - const { pixel_values, rows, cols } = await processor([image_1, image_2], { return_row_col_info: true }); - compare(pixel_values.dims, [1, 30, 3, 364, 364]); - - compare(rows, [[3, 4]]); - compare(cols, [[4, 4]]); - } - - { - // batched, multiple images - const { pixel_values, rows, cols } = await processor([[image_1], [image_1, image_2]], { return_row_col_info: true }); - compare(pixel_values.dims, [2, 30, 3, 364, 364]); - compare(rows, [[3], [3, 4]]); - compare(cols, [[4], [4, 4]]); - } - }, - // NOTE: We set a higher timeout for this test - 2 * MAX_TEST_TIME, - ); - }); - describe("Audio processors", () => { - const audioPromise = new Promise(async (resolve) => { + let audio; + beforeAll(async () => { const url = "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/mlk.npy"; const buffer = await (await fetch(url)).arrayBuffer(); - const audio = Float32Array.from(new Float64Array(buffer)); - resolve(audio); + audio = Float32Array.from(new Float64Array(buffer)); }); it( "WhisperFeatureExtractor", async () => { - const audio = await audioPromise; const processor = await AutoProcessor.from_pretrained("Xenova/whisper-tiny.en"); const { input_features } = await processor(audio); - compare(input_features.dims, [1, 80, 3000]); - expect(avg(input_features.data)).toBeCloseTo(-0.2813588131551941); - expect(input_features.data[0]).toBeCloseTo(0.33168578147888184); - expect(input_features.data[1]).toBeCloseTo(0.30986475944519043); - expect(input_features.data[81]).toBeCloseTo(0.10727232694625854); - expect(input_features.data[3001]).toBeCloseTo(0.2555035352706909); + const { dims, data } = input_features; + expect(dims).toEqual([1, 80, 3000]); + expect(avg(data)).toBeCloseTo(-0.2813588131551941); + expect(data[0]).toBeCloseTo(0.33168578147888184); + expect(data[1]).toBeCloseTo(0.30986475944519043); + expect(data[81]).toBeCloseTo(0.10727232694625854); + expect(data[3001]).toBeCloseTo(0.2555035352706909); }, MAX_TEST_TIME, ); @@ -787,12 +69,11 @@ describe("Processors", () => { it( "ASTFeatureExtractor", async () => { - const audio = await audioPromise; const processor = await AutoProcessor.from_pretrained("Xenova/ast-finetuned-audioset-10-10-0.4593"); { // truncation const { input_values } = await processor(audio); - compare(input_values.dims, [1, 1024, 128]); + expect(input_values.dims).toEqual([1, 1024, 128]); expect(avg(input_values.data)).toBeCloseTo(-0.04054912979309085); expect(input_values.data[0]).toBeCloseTo(-0.5662586092948914); @@ -803,7 +84,7 @@ describe("Processors", () => { { // padding const { input_values } = await processor(audio.slice(0, 1000)); - compare(input_values.dims, [1, 1024, 128]); // [1, 4, 128] -> (padded to) -> [1, 1024, 128] + expect(input_values.dims).toEqual([1, 1024, 128]); // [1, 4, 128] -> (padded to) -> [1, 1024, 128] expect(avg(input_values.data)).toBeCloseTo(0.4647964835166931); expect(input_values.data[0]).toBeCloseTo(-0.5662586092948914); @@ 
-822,37 +103,38 @@ describe("Processors", () => { it( "SeamlessM4TFeatureExtractor", async () => { - const audio = await audioPromise; const processor = await AutoProcessor.from_pretrained("Xenova/wav2vec2-bert-CV16-en"); { // normal const { input_features, attention_mask } = await processor(audio); - compare(input_features.dims, [1, 649, 160]); - compare(attention_mask.dims, [1, 649]); + const { dims, data } = input_features; + expect(dims).toEqual([1, 649, 160]); + expect(attention_mask.dims).toEqual([1, 649]); - expect(avg(input_features.data)).toBeCloseTo(-2.938903875815413e-8); - expect(input_features.data[0]).toBeCloseTo(1.1939343214035034); - expect(input_features.data[1]).toBeCloseTo(0.7874255180358887); - expect(input_features.data[160]).toBeCloseTo(-0.712975025177002); - expect(input_features.data[161]).toBeCloseTo(0.045802414417266846); - expect(input_features.data.at(-1)).toBeCloseTo(-1.3328346014022827); + expect(avg(data)).toBeCloseTo(-2.938903875815413e-8); + expect(data[0]).toBeCloseTo(1.1939343214035034); + expect(data[1]).toBeCloseTo(0.7874255180358887); + expect(data[160]).toBeCloseTo(-0.712975025177002); + expect(data[161]).toBeCloseTo(0.045802414417266846); + expect(data.at(-1)).toBeCloseTo(-1.3328346014022827); expect(sum(attention_mask.data)).toEqual(649); } { // padding (pad_to_multiple_of=2) const { input_features, attention_mask } = await processor(audio.slice(0, 10000)); + const { dims, data } = input_features; // [1, 61, 80] -> [1, 62, 80] -> [1, 31, 160] - compare(input_features.dims, [1, 31, 160]); - compare(attention_mask.dims, [1, 31]); + expect(dims).toEqual([1, 31, 160]); + expect(attention_mask.dims).toEqual([1, 31]); - expect(avg(input_features.data)).toBeCloseTo(0.01612919569015503); - expect(input_features.data[0]).toBeCloseTo(0.9657132029533386); - expect(input_features.data[1]).toBeCloseTo(0.12912897765636444); - expect(input_features.data[160]).toBeCloseTo(-1.2364212274551392); - expect(input_features.data[161]).toBeCloseTo(-0.9703778028488159); - expect(input_features.data.at(-1)).toBeCloseTo(1); // padding value + expect(avg(data)).toBeCloseTo(0.01612919569015503); + expect(data[0]).toBeCloseTo(0.9657132029533386); + expect(data[1]).toBeCloseTo(0.12912897765636444); + expect(data[160]).toBeCloseTo(-1.2364212274551392); + expect(data[161]).toBeCloseTo(-0.9703778028488159); + expect(data.at(-1)).toBeCloseTo(1); // padding value expect(sum(attention_mask.data)).toEqual(30); } @@ -863,7 +145,6 @@ describe("Processors", () => { it( "ClapFeatureExtractor", async () => { - const audio = await audioPromise; const processor = await AutoProcessor.from_pretrained("Xenova/clap-htsat-unfused"); { // truncation @@ -877,17 +158,18 @@ describe("Processors", () => { long_audio.set(audio, long_audio.length - audio.length); const { input_features } = await processor(long_audio); - compare(input_features.dims, [1, 1, 1001, 64]); - - expect(avg(input_features.data)).toBeCloseTo(-37.94569396972656); - expect(input_features.data[0]).toBeCloseTo(-53.32647705078125); - expect(input_features.data[1]).toBeCloseTo(-47.76755142211914); - expect(input_features.data[65]).toBeCloseTo(-36.32261276245117); - expect(input_features.data[1002]).toBeCloseTo(-28.0314884185791); - expect(input_features.data[10000]).toBeCloseTo(-21.905902862548828); - expect(input_features.data[60000]).toBeCloseTo(-14.877863883972168); - expect(input_features.data[64062]).toBeCloseTo(-37.9784049987793); - expect(input_features.data[64063]).toBeCloseTo(-37.73963928222656); + const { dims, data } = 
input_features; + expect(dims).toEqual([1, 1, 1001, 64]); + + expect(avg(data)).toBeCloseTo(-37.94569396972656); + expect(data[0]).toBeCloseTo(-53.32647705078125); + expect(data[1]).toBeCloseTo(-47.76755142211914); + expect(data[65]).toBeCloseTo(-36.32261276245117); + expect(data[1002]).toBeCloseTo(-28.0314884185791); + expect(data[10000]).toBeCloseTo(-21.905902862548828); + expect(data[60000]).toBeCloseTo(-14.877863883972168); + expect(data[64062]).toBeCloseTo(-37.9784049987793); + expect(data[64063]).toBeCloseTo(-37.73963928222656); // Reset Math.random Math.random = originalRandom; @@ -895,19 +177,20 @@ describe("Processors", () => { { // padding const { input_features } = await processor(audio); - compare(input_features.dims, [1, 1, 1001, 64]); + const { data, dims } = input_features; + expect(dims).toEqual([1, 1, 1001, 64]); - expect(avg(input_features.data)).toBeCloseTo(-34.99049377441406); - expect(input_features.data[0]).toBeCloseTo(-21.32573890686035); - expect(input_features.data[1]).toBeCloseTo(-26.168411254882812); - expect(input_features.data[65]).toBeCloseTo(-29.716018676757812); - expect(input_features.data[1002]).toBeCloseTo(-32.16273498535156); - expect(input_features.data[10000]).toBeCloseTo(-19.9283390045166); + expect(avg(data)).toBeCloseTo(-34.99049377441406); + expect(data[0]).toBeCloseTo(-21.32573890686035); + expect(data[1]).toBeCloseTo(-26.168411254882812); + expect(data[65]).toBeCloseTo(-29.716018676757812); + expect(data[1002]).toBeCloseTo(-32.16273498535156); + expect(data[10000]).toBeCloseTo(-19.9283390045166); // padded values - expect(input_features.data[60000]).toBeCloseTo(-100.0); - expect(input_features.data[64062]).toBeCloseTo(-100.0); - expect(input_features.data[64063]).toBeCloseTo(-100.0); + expect(data[60000]).toBeCloseTo(-100.0); + expect(data[64062]).toBeCloseTo(-100.0); + expect(data[64063]).toBeCloseTo(-100.0); } }, MAX_TEST_TIME, @@ -921,32 +204,34 @@ describe("Processors", () => { // default const audio = new Float32Array(16000).map((_, i) => Math.sin(i / 100)); const { input_features } = await processor(audio); - compare(input_features.dims, [1, 98, 80]); + const { dims, data } = input_features; + expect(dims).toEqual([1, 98, 80]); - expect(avg(input_features.data)).toBeCloseTo(5.461731689138105e-8); - expect(input_features.data[0]).toBeCloseTo(-0.19300270080566406); - expect(input_features.data[1]).toBeCloseTo(-0.05825042724609375); - expect(input_features.data[78]).toBeCloseTo(0.2683420181274414); - expect(input_features.data[79]).toBeCloseTo(0.26250171661376953); - expect(input_features.data[80]).toBeCloseTo(0.19062232971191406); - expect(input_features.data.at(-2)).toBeCloseTo(-0.43694400787353516); - expect(input_features.data.at(-1)).toBeCloseTo(-0.4266204833984375); + expect(avg(data)).toBeCloseTo(5.461731689138105e-8); + expect(data[0]).toBeCloseTo(-0.19300270080566406); + expect(data[1]).toBeCloseTo(-0.05825042724609375); + expect(data[78]).toBeCloseTo(0.2683420181274414); + expect(data[79]).toBeCloseTo(0.26250171661376953); + expect(data[80]).toBeCloseTo(0.19062232971191406); + expect(data.at(-2)).toBeCloseTo(-0.43694400787353516); + expect(data.at(-1)).toBeCloseTo(-0.4266204833984375); } { // pad to `min_num_frames` const audio = new Float32Array(3).map((_, i) => Math.sin(i / 100)); const { input_features } = await processor(audio); - compare(input_features.dims, [1, 9, 80]); - - expect(avg(input_features.data)).toBeCloseTo(-0.0000010093053181966146); - expect(input_features.data[0]).toBeCloseTo(20.761859893798828); - 
expect(input_features.data[1]).toBeCloseTo(21.02924346923828); - expect(input_features.data[78]).toBeCloseTo(19.083993911743164); - expect(input_features.data[79]).toBeCloseTo(18.003454208374023); - expect(input_features.data[80]).toBeCloseTo(-2.595233917236328); - expect(input_features.data.at(-2)).toBeCloseTo(-2.385499954223633); - expect(input_features.data.at(-1)).toBeCloseTo(-2.2504329681396484); + const { dims, data } = input_features; + expect(dims).toEqual([1, 9, 80]); + + expect(avg(data)).toBeCloseTo(-0.0000010093053181966146); + expect(data[0]).toBeCloseTo(20.761859893798828); + expect(data[1]).toBeCloseTo(21.02924346923828); + expect(data[78]).toBeCloseTo(19.083993911743164); + expect(data[79]).toBeCloseTo(18.003454208374023); + expect(data[80]).toBeCloseTo(-2.595233917236328); + expect(data.at(-2)).toBeCloseTo(-2.385499954223633); + expect(data.at(-1)).toBeCloseTo(-2.2504329681396484); } }, MAX_TEST_TIME, @@ -964,8 +249,8 @@ describe("Processors", () => { beforeAll(async () => { processor = await AutoProcessor.from_pretrained(MODELS.florence2); images = { - beetle: await load_image(TEST_IMAGES.beetle), - book_cover: await load_image(TEST_IMAGES.book_cover), + beetle: await load_cached_image("beetle"), + book_cover: await load_cached_image("book_cover"), }; }); @@ -974,14 +259,14 @@ describe("Processors", () => { const text = ""; const prompts = processor.construct_prompts(text); const target = ["Locate the objects with category name in the image."]; - compare(prompts, target); + expect(prompts).toEqual(target); }); it("Construct prompts", async () => { const texts = ["", "Locate the objects with category name in the image.", "cat"]; const prompts = processor.construct_prompts(texts); const target = ["Describe with a paragraph what is shown in the image.", "Locate the objects with category name in the image.", "Locate cat in the image."]; - compare(prompts, target); + expect(prompts).toEqual(target); }); }); @@ -1154,7 +439,7 @@ describe("Processors", () => { for (const { task, generated_text, target, image } of TESTS) { it(task, () => { const result = processor.post_process_generation(generated_text, task, images[image].size); - compare(result, target); + expect(result).toBeCloseToNested(target, 4); }); } }); @@ -1172,7 +457,7 @@ describe("Processors", () => { beforeAll(async () => { processor = await AutoProcessor.from_pretrained(MODELS.qwen2_vl); images = { - white_image: await load_image(TEST_IMAGES.white_image), + white_image: await load_cached_image("white_image"), }; }); @@ -1189,10 +474,10 @@ describe("Processors", () => { }); const { input_ids, attention_mask, pixel_values, image_grid_thw } = await processor(text, images.white_image); - compare(input_ids.dims, [1, 89]); - compare(attention_mask.dims, [1, 89]); - compare(pixel_values.dims, [256, 1176]); - compare(image_grid_thw.dims, [1, 3]); + expect(input_ids.dims).toEqual([1, 89]); + expect(attention_mask.dims).toEqual([1, 89]); + expect(pixel_values.dims).toEqual([256, 1176]); + expect(image_grid_thw.dims).toEqual([1, 3]); }); }, MAX_TEST_TIME, @@ -1208,26 +493,26 @@ describe("Processors", () => { beforeAll(async () => { processor = await AutoProcessor.from_pretrained(MODELS.paligemma); images = { - white_image: await load_image(TEST_IMAGES.white_image), + white_image: await load_cached_image("white_image"), }; }); it("Image-only (default text)", async () => { const { input_ids, pixel_values } = await processor(images.white_image); - compare(input_ids.dims, [1, 258]); - compare(pixel_values.dims, [1, 3, 224, 224]); 
+ expect(input_ids.dims).toEqual([1, 258]); + expect(pixel_values.dims).toEqual([1, 3, 224, 224]); }); it("Single image & text", async () => { const { input_ids, pixel_values } = await processor(images.white_image, "What is on the flower?"); - compare(input_ids.dims, [1, 264]); - compare(pixel_values.dims, [1, 3, 224, 224]); + expect(input_ids.dims).toEqual([1, 264]); + expect(pixel_values.dims).toEqual([1, 3, 224, 224]); }); it("Multiple images & text", async () => { const { input_ids, pixel_values } = await processor([images.white_image, images.white_image], "Describe the images."); - compare(input_ids.dims, [1, 518]); - compare(pixel_values.dims, [2, 3, 224, 224]); + expect(input_ids.dims).toEqual([1, 518]); + expect(pixel_values.dims).toEqual([2, 3, 224, 224]); }); }, MAX_TEST_TIME, diff --git a/tests/tiny_random.test.js b/tests/tiny_random.test.js index 9bffbfc62..d80699a11 100644 --- a/tests/tiny_random.test.js +++ b/tests/tiny_random.test.js @@ -1,2315 +1,24 @@ import { - // Tokenizers - CodeGenTokenizer, - LlamaTokenizer, - CohereTokenizer, - GemmaTokenizer, - GPT2Tokenizer, - GPTNeoXTokenizer, - BloomTokenizer, - BertTokenizer, - T5Tokenizer, - WhisperTokenizer, - MarianTokenizer, - PreTrainedTokenizer, - AutoTokenizer, - - // Processors - CLIPImageProcessor, - AutoProcessor, - Processor, - Florence2Processor, - Idefics3Processor, - PaliGemmaProcessor, - - // Models - LlamaForCausalLM, - OlmoForCausalLM, - Olmo2ForCausalLM, - GraniteForCausalLM, - CohereModel, - CohereForCausalLM, - GemmaForCausalLM, - Gemma2ForCausalLM, - OPTForCausalLM, - GPTNeoXForCausalLM, - GPTJForCausalLM, - BloomForCausalLM, - GPTBigCodeForCausalLM, - GPT2LMHeadModel, - JAISLMHeadModel, - MptForCausalLM, - CodeGenForCausalLM, - MistralForCausalLM, - GPTNeoForCausalLM, - BertForMaskedLM, - BertForSequenceClassification, - T5ForConditionalGeneration, - T5Model, - BertModel, - BertForTokenClassification, - BertForQuestionAnswering, - MusicgenForConditionalGeneration, - LlavaForConditionalGeneration, - Idefics3ForConditionalGeneration, - WhisperForConditionalGeneration, - VisionEncoderDecoderModel, - Florence2ForConditionalGeneration, - Qwen2VLForConditionalGeneration, - PaliGemmaForConditionalGeneration, - MarianMTModel, - PatchTSTModel, - PatchTSTForPrediction, - PatchTSMixerModel, - PatchTSMixerForPrediction, - - // Pipelines - pipeline, - FillMaskPipeline, - TextClassificationPipeline, - TextGenerationPipeline, - TranslationPipeline, - ImageClassificationPipeline, - ZeroShotImageClassificationPipeline, - TokenClassificationPipeline, - QuestionAnsweringPipeline, - DocumentQuestionAnsweringPipeline, - - // Other - full, - RawImage, - Tensor, -} from "../src/transformers.js"; - -import { init, MAX_TEST_TIME, MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME } from "./init.js"; -import { compare } from "./test_utils.js"; - -init(); - -const DEFAULT_MODEL_OPTIONS = { - dtype: "fp32", -}; -describe("Tiny random models", () => { - describe("bert", () => { - describe("BertModel", () => { - const model_id = "hf-internal-testing/tiny-random-BertModel"; - - /** @type {BertModel} */ - let model; - /** @type {BertTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await BertModel.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await BertTokenizer.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const { last_hidden_state } = await model(inputs); - 
expect(last_hidden_state.dims).toEqual([1, 7, 32]); - expect(last_hidden_state.mean().item()).toBeCloseTo(0.0, 5); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const { last_hidden_state } = await model(inputs); - expect(last_hidden_state.dims).toEqual([2, 12, 32]); - expect(last_hidden_state.mean().item()).toBeCloseTo(1.4901161193847656e-8, 5); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("BertForMaskedLM", () => { - const model_id = "hf-internal-testing/tiny-random-BertForMaskedLM"; - - const texts = ["The goal of life is [MASK].", "Paris is the [MASK] of France."]; - - /** @type {BertForMaskedLM} */ - let model; - /** @type {BertTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await BertForMaskedLM.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await BertTokenizer.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer(texts[0]); - const { logits } = await model(inputs); - expect(logits.dims).toEqual([1, 19, 1124]); - expect(logits.mean().item()).toBeCloseTo(0.0016587056452408433, 5); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(texts, { padding: true }); - const { logits } = await model(inputs); - expect(logits.dims).toEqual([2, 22, 1124]); - expect(logits.mean().item()).toBeCloseTo(0.0017160633578896523, 5); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("BertForSequenceClassification", () => { - const model_id = "hf-internal-testing/tiny-random-BertForSequenceClassification"; - - /** @type {BertForSequenceClassification} */ - let model; - /** @type {BertTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await BertForSequenceClassification.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await BertTokenizer.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const { logits } = await model(inputs); - const target = [[0.00043986947275698185, -0.030218850821256638]].flat(); - expect(logits.dims).toEqual([1, 2]); - logits - .tolist() - .flat() - .forEach((item, i) => { - expect(item).toBeCloseTo(target[i], 5); - }); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const { logits } = await model(inputs); - const target = [ - [0.00043986947275698185, -0.030218850821256638], - [0.0003853091038763523, -0.03022204339504242], - ].flat(); - expect(logits.dims).toEqual([2, 2]); - logits - .tolist() - .flat() - .forEach((item, i) => { - expect(item).toBeCloseTo(target[i], 5); - }); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("BertForTokenClassification", () => { - const model_id = "hf-internal-testing/tiny-random-BertForTokenClassification"; - - /** @type {BertForTokenClassification} */ - let model; - /** @type {BertTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await BertForTokenClassification.from_pretrained(model_id, { - // TODO move to config - 
...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await BertTokenizer.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const { logits } = await model(inputs); - expect(logits.dims).toEqual([1, 7, 2]); - expect(logits.mean().item()).toBeCloseTo(0.07089076191186905, 5); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const { logits } = await model(inputs); - expect(logits.dims).toEqual([2, 12, 2]); - expect(logits.mean().item()).toBeCloseTo(0.04702216014266014, 5); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("BertForQuestionAnswering", () => { - const model_id = "hf-internal-testing/tiny-random-BertForQuestionAnswering"; - - /** @type {BertForQuestionAnswering} */ - let model; - /** @type {BertTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await BertForQuestionAnswering.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await BertTokenizer.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const { start_logits, end_logits } = await model(inputs); - expect(start_logits.dims).toEqual([1, 7]); - expect(start_logits.mean().item()).toBeCloseTo(0.12772157788276672, 5); - expect(end_logits.dims).toEqual([1, 7]); - expect(end_logits.mean().item()).toBeCloseTo(0.11811424791812897, 5); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const { start_logits, end_logits } = await model(inputs); - expect(start_logits.dims).toEqual([2, 12]); - expect(start_logits.mean().item()).toBeCloseTo(0.12843115627765656, 5); - expect(end_logits.dims).toEqual([2, 12]); - expect(end_logits.mean().item()).toBeCloseTo(0.11745202541351318, 5); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("t5", () => { - describe("T5Model", () => { - const model_id = "hf-internal-testing/tiny-random-T5Model"; - - /** @type {T5Model} */ - let model; - /** @type {T5Tokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await T5Model.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await T5Tokenizer.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "forward", - async () => { - // Example adapted from https://huggingface.co/google-t5/t5-small#how-to-get-started-with-the-model - const inputs = tokenizer("Studies have been shown that owning a dog is good for you"); - const { input_ids: decoder_input_ids } = tokenizer("Studies show that"); - - const { last_hidden_state } = await model({ ...inputs, decoder_input_ids }); - expect(last_hidden_state.dims).toEqual([1, 4, 32]); - expect(last_hidden_state.mean().item()).toBeCloseTo(7.492632721550763e-5, 8); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - describe("T5ForConditionalGeneration", () => { - const model_id = "hf-internal-testing/tiny-random-T5ForConditionalGeneration"; - - /** @type {T5ForConditionalGeneration} */ - let model; - /** @type {T5Tokenizer} */ - let tokenizer; - beforeAll(async () => { - model = 
await T5ForConditionalGeneration.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await T5Tokenizer.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "forward", - async () => { - // Example adapted from https://huggingface.co/google-t5/t5-small#how-to-get-started-with-the-model - const inputs = tokenizer("Studies have been shown that owning a dog is good for you"); - const { input_ids: decoder_input_ids } = tokenizer("Studies show that"); - - const model = await T5ForConditionalGeneration.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); - const outputs = await model({ ...inputs, decoder_input_ids }); - expect(outputs.logits.dims).toEqual([1, 4, 32100]); - expect(outputs.logits.mean().item()).toBeCloseTo(8.867568901393952e-9, 12); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([[0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n], - [0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("marian", () => { - describe("MarianMTModel", () => { - const model_id = "onnx-community/tiny-random-MarianMTModel"; - - /** @type {MarianMTModel} */ - let model; - /** @type {MarianTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await MarianMTModel.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await MarianTokenizer.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([[3n, 40672n, 8358n, 32810n, 32810n, 32810n, 32810n, 35687n, 33073n, 6870n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [3n, 40672n, 8358n, 32810n, 32810n, 32810n, 32810n, 35687n, 33073n, 6870n], - [3n, 40672n, 8358n, 32810n, 32810n, 32810n, 32810n, 35687n, 33073n, 6870n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("musicgen", () => { - describe("MusicgenForConditionalGeneration", () => { - const model_id = "hf-internal-testing/tiny-random-MusicgenForConditionalGeneration"; - - // Example adapted from https://huggingface.co/docs/transformers/model_doc/musicgen#text-conditional-generation - const texts = ["80s pop track with bassy drums and synth", "90s rock song with loud guitars and heavy drums"]; - - /** @type {MusicgenForConditionalGeneration} */ - let model; - /** @type {T5Tokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await MusicgenForConditionalGeneration.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer 
= await T5Tokenizer.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "forward", - async () => { - // Example from https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenForConditionalGeneration.forward.example - const inputs = tokenizer(texts, { padding: true }); - const pad_token_id = BigInt(model.generation_config.pad_token_id); - const decoder_input_ids = full([inputs.input_ids.dims[0] * model.config.decoder.num_codebooks, 1], pad_token_id); - const { logits } = await model({ ...inputs, decoder_input_ids }); - expect(logits.dims).toEqual([8, 1, 99]); - expect(logits.mean().item()).toBeCloseTo(-0.0018370470497757196, 5); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer(texts[0]); - const audio_values = await model.generate({ ...inputs, max_length: 10 }); - expect(audio_values.dims).toEqual([1, 1, 1920]); - expect(audio_values.mean().item()).toBeCloseTo(0.16644205152988434, 5); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(texts, { padding: true }); - const audio_values = await model.generate({ ...inputs, max_length: 10 }); - expect(audio_values.dims).toEqual([2, 1, 1920]); - expect(audio_values.mean().item()).toBeCloseTo(0.16644206643104553, 5); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("whisper", () => { - describe("WhisperForConditionalGeneration", () => { - const model_id = "Xenova/tiny-random-WhisperForConditionalGeneration"; - - /** @type {WhisperForConditionalGeneration} */ - let model; - /** @type {WhisperTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await WhisperForConditionalGeneration.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await WhisperTokenizer.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - describe("prefix tokens", () => { - const input_features = full([1, 80, 3000], 0.0); - - describe("English-only", () => { - it( - "default", - async () => { - const outputs = await model.generate({ - input_features, - is_multilingual: false, - max_new_tokens: 1, - }); - - expect(outputs.tolist()).toEqual([[/* Prefix */ 50258n, 50363n, /* Generated */ 45084n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "return_timestamps=true", - async () => { - const outputs = await model.generate({ - input_features, - is_multilingual: false, - max_new_tokens: 1, - return_timestamps: true, - }); - - expect(outputs.tolist()).toEqual([[/* Prefix */ 50258n, /* Generated */ 50366n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - describe("multilingual", () => { - it( - "language unset; task unset", - async () => { - // language defaults to 'en' - // task defaults to 'transcribe' - - const outputs = await model.generate({ - input_features, - max_new_tokens: 1, - }); - - expect(outputs.tolist()).toEqual([[/* Prefix */ 50258n, 50259n, 50359n, 50363n, /* Generated */ 45084n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "language set; task unset", - async () => { - // task defaults to 'transcribe' - const outputs = await model.generate({ - input_features, - max_new_tokens: 1, - language: "af", - }); - - expect(outputs.tolist()).toEqual([[/* Prefix */ 50258n, 50327n, 50359n, 50363n, /* Generated */ 45084n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "language set; task set", - async () => { - const outputs = await model.generate({ - input_features, - 
max_new_tokens: 1, - language: "zh", - task: "translate", - }); - - expect(outputs.tolist()).toEqual([[/* Prefix */ 50258n, 50260n, 50358n, 50363n, /* Generated */ 45084n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "return_timestamps=true", - async () => { - const outputs = await model.generate({ - input_features, - max_new_tokens: 1, - language: "en", - task: "transcribe", - return_timestamps: true, - }); - - expect(outputs.tolist()).toEqual([[/* Prefix */ 50258n, 50259n, 50359n, /* Generated */ 50400n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - }); - - describe("decoder_start_ids", () => { - const input_features = full([1, 80, 3000], 0.0); - - it( - "broadcast inputs", - async () => { - const { decoder_start_token_id, lang_to_id, task_to_id, no_timestamps_token_id } = model.generation_config; - - const outputs = await model.generate({ - input_features, // batch size 1 - max_new_tokens: 1, - decoder_input_ids: [ - // batch size 2 - // <|startoftranscript|> <|lang_id|> <|task|> [<|notimestamps|>] - [decoder_start_token_id, lang_to_id["<|en|>"], task_to_id["translate"], no_timestamps_token_id], - [decoder_start_token_id, lang_to_id["<|fr|>"], task_to_id["transcribe"], no_timestamps_token_id], - ], - }); - expect(outputs.tolist()).toEqual([ - [/* Prefix */ 50258n, 50259n, 50358n, 50363n, /* Generated */ 45084n], - [/* Prefix */ 50258n, 50265n, 50359n, 50363n, /* Generated */ 45084n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - }); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("llava", () => { - const prompts = [ - // Example adapted from https://huggingface.co/docs/transformers/model_doc/llava#transformers.LlavaForConditionalGeneration.forward.example - "\nUSER: What's the content of the image?\nASSISTANT:", - "Hi", - ]; - - // Empty white image - const dims = [224, 224, 3]; - const image = new RawImage(new Uint8ClampedArray(dims[0] * dims[1] * dims[2]).fill(255), ...dims); - - describe("LlavaForConditionalGeneration", () => { - const model_id = "Xenova/tiny-random-LlavaForConditionalGeneration"; - - /** @type {LlavaForConditionalGeneration} */ - let model; - /** @type {LlamaTokenizer} */ - let tokenizer; - /** @type {CLIPImageProcessor} */ - let processor; - beforeAll(async () => { - model = await LlavaForConditionalGeneration.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await LlamaTokenizer.from_pretrained(model_id); - processor = await AutoProcessor.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "forward", - async () => { - const text_inputs = tokenizer(prompts[0]); - const vision_inputs = await processor(image); - const inputs = { ...text_inputs, ...vision_inputs }; - - const { logits } = await model(inputs); - expect(logits.dims).toEqual([1, 244, 32002]); - expect(logits.mean().item()).toBeCloseTo(-0.0005755752790719271, 8); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size=1", - async () => { - const text_inputs = tokenizer(prompts[0]); - const vision_inputs = await processor(image); - const inputs = { ...text_inputs, ...vision_inputs }; - - const generate_ids = await model.generate({ ...inputs, max_new_tokens: 10 }); - expect(generate_ids.tolist()).toEqual([[1n, 32000n, 29871n, 13n, 11889n, 29901n, 1724n, 29915n, 29879n, 278n, 2793n, 310n, 278n, 1967n, 29973n, 13n, 22933n, 9047n, 13566n, 29901n, 21557n, 16781n, 27238n, 8279n, 20454n, 11927n, 12462n, 12306n, 2414n, 7561n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - 
"batch_size>1", - async () => { - const text_inputs = tokenizer(prompts, { padding: true }); - const vision_inputs = await processor([image, image]); - const inputs = { ...text_inputs, ...vision_inputs }; - - const generate_ids = await model.generate({ ...inputs, max_new_tokens: 10 }); - expect(generate_ids.tolist()).toEqual([ - [1n, 32000n, 29871n, 13n, 11889n, 29901n, 1724n, 29915n, 29879n, 278n, 2793n, 310n, 278n, 1967n, 29973n, 13n, 22933n, 9047n, 13566n, 29901n, 21557n, 16781n, 27238n, 8279n, 20454n, 11927n, 12462n, 12306n, 2414n, 7561n], - [0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 0n, 1n, 32000n, 6324n, 1217n, 22958n, 22913n, 10381n, 148n, 31410n, 31736n, 7358n, 9150n, 28635n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("idefics3", () => { - const conversation = [ - { - role: "user", - content: [{ type: "image" }, { type: "text", text: "Can you describe this image?" }], - }, - ]; - - // Empty white and black images - const white_image_dims = [224, 224, 3]; - const white_image = new RawImage(new Uint8ClampedArray(white_image_dims[0] * white_image_dims[1] * white_image_dims[2]).fill(255), ...white_image_dims); - const black_image_dims = [720, 360, 3]; - const black_image = new RawImage(new Uint8ClampedArray(black_image_dims[0] * black_image_dims[1] * black_image_dims[2]).fill(0), ...black_image_dims); - - describe("Idefics3ForConditionalGeneration", () => { - const model_id = "hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration"; - - /** @type {Idefics3ForConditionalGeneration} */ - let model; - /** @type {Idefics3Processor} */ - let processor; - /** @type {string} */ - let text; - beforeAll(async () => { - model = await Idefics3ForConditionalGeneration.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - processor = await AutoProcessor.from_pretrained(model_id); - - text = processor.apply_chat_template(conversation, { - add_generation_prompt: true, - }); - }, MAX_MODEL_LOAD_TIME); - - it( - "forward w/ image splitting (default)", - async () => { - const inputs = await processor(text, white_image, { - do_image_splitting: true, - }); - - const { logits } = await model(inputs); - expect(logits.dims).toEqual([1, 3041, 128259]); - expect(logits.mean().item()).toBeCloseTo(-0.0002692154666874558, 6); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "forward w/o image splitting", - async () => { - const inputs = await processor(text, white_image, { - do_image_splitting: false, - }); - - const { logits } = await model(inputs); - expect(logits.dims).toEqual([1, 189, 128259]); - expect(logits.mean().item()).toBeCloseTo(-0.00019743280427064747, 6); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size=1 w/ image splitting", - async () => { - const inputs = await processor(text, white_image, { - do_image_splitting: true, - }); - const generate_ids = await model.generate({ - ...inputs, - max_new_tokens: 10, - - // To obtain unique output tokens, deterministically - repetition_penalty: 2.0, - }); - expect(generate_ids.dims).toEqual([1, 3051]); - - const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]); - expect(new_tokens.tolist()).toEqual([[64531n, 121777n, 70370n, 105334n, 12720n, 113356n, 47739n, 59240n, 102001n, 60344n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size=1 w/o image splitting", - async () => { - const inputs = await processor(text, white_image, { - 
do_image_splitting: false, - }); - const generate_ids = await model.generate({ - ...inputs, - max_new_tokens: 10, - - // To obtain unique output tokens, deterministically - repetition_penalty: 2.0, - }); - expect(generate_ids.dims).toEqual([1, 199]); - - const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]); - expect(new_tokens.tolist()).toEqual([[64531n, 121777n, 70370n, 105334n, 12720n, 113356n, 47739n, 59240n, 59697n, 65246n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size=1 multi-image w/o image splitting", - async () => { - const multi_image_conversation = [ - { - role: "user", - content: [{ type: "image" }, { type: "image" }, { type: "text", text: "Can you describe these images?" }], - }, - ]; - - const multi_image_text = processor.apply_chat_template(multi_image_conversation, { - add_generation_prompt: true, - }); - const inputs = await processor(multi_image_text, [white_image, black_image], { - do_image_splitting: false, - }); - const generate_ids = await model.generate({ - ...inputs, - max_new_tokens: 10, - - // To obtain unique output tokens, deterministically - repetition_penalty: 2.0, - }); - expect(generate_ids.dims).toEqual([1, 374]); - - const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]); - expect(new_tokens.tolist()).toEqual([[73189n, 99346n, 113252n, 51743n, 33499n, 66430n, 78739n, 89539n, 121023n, 14474n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("florence2", () => { - const texts = ["Describe with a paragraph what is shown in the image.", "Locate the objects with category name in the image."]; - - // Empty white image - const dims = [224, 224, 3]; - const image = new RawImage(new Uint8ClampedArray(dims[0] * dims[1] * dims[2]).fill(255), ...dims); - - describe("Florence2ForConditionalGeneration", () => { - const model_id = "Xenova/tiny-random-Florence2ForConditionalGeneration"; - - /** @type {Florence2ForConditionalGeneration} */ - let model; - /** @type {Florence2Processor} */ - let processor; - beforeAll(async () => { - model = await Florence2ForConditionalGeneration.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - processor = await AutoProcessor.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "forward", - async () => { - const inputs = await processor(image, texts[0]); - - const { logits } = await model({ - ...inputs, - decoder_input_ids: full([1, 1], 2n), - }); - expect(logits.dims).toEqual([1, 1, 51289]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size=1", - async () => { - { - const text_inputs = processor.tokenizer(texts[0]); - const generate_ids = await model.generate({ ...text_inputs, max_new_tokens: 10 }); - expect(generate_ids.tolist()).toEqual([[2n, 0n, 0n, 0n, 1n, 0n, 0n, 2n]]); - } - { - const inputs = await processor(image, texts[0]); - const generate_ids = await model.generate({ ...inputs, max_new_tokens: 10 }); - expect(generate_ids.tolist()).toEqual([[2n, 0n, 48n, 48n, 48n, 48n, 48n, 48n, 48n, 48n, 2n]]); - } - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - { - const text_inputs = processor.tokenizer(texts, { padding: true }); - const generate_ids = await model.generate({ ...text_inputs, max_new_tokens: 10 }); - expect(generate_ids.tolist()).toEqual([ - [2n, 0n, 0n, 0n, 1n, 0n, 0n, 2n], - [2n, 0n, 0n, 0n, 1n, 0n, 0n, 2n], - ]); - } - { - const inputs = await processor([image, 
image], texts, { padding: true }); - - const generate_ids = await model.generate({ ...inputs, max_new_tokens: 10 }); - expect(generate_ids.tolist()).toEqual([ - [2n, 0n, 48n, 48n, 48n, 48n, 48n, 48n, 48n, 48n, 2n], - [2n, 0n, 48n, 48n, 48n, 48n, 48n, 48n, 48n, 48n, 2n], - ]); - } - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("qwen2_vl", () => { - const CONVERSATION = [ - { - role: "user", - content: [{ type: "text", text: "Hello" }], - }, - ]; - - // Example adapted from https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct - const CONVERSATION_WITH_IMAGE = [ - { - role: "user", - content: [{ type: "image" }, { type: "text", text: "Describe this image." }], - }, - ]; - // Empty white image - const dims = [224, 224, 3]; - const image = new RawImage(new Uint8ClampedArray(dims[0] * dims[1] * dims[2]).fill(255), ...dims); - - describe("Qwen2VLForConditionalGeneration", () => { - const model_id = "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration"; - - /** @type {Qwen2VLForConditionalGeneration} */ - let model; - /** @type {Qwen2VLProcessor} */ - let processor; - beforeAll(async () => { - model = await Qwen2VLForConditionalGeneration.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - processor = await AutoProcessor.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "forward", - async () => { - const text = processor.apply_chat_template(CONVERSATION_WITH_IMAGE, { - add_generation_prompt: true, - }); - const inputs = await processor(text, image); - const { logits } = await model(inputs); - expect(logits.dims).toEqual([1, 89, 152064]); - expect(logits.mean().item()).toBeCloseTo(-0.0011299321195110679, 5); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "text-only (batch_size=1)", - async () => { - const text = processor.apply_chat_template(CONVERSATION, { - add_generation_prompt: true, - }); - const inputs = await processor(text); - const generate_ids = await model.generate({ - ...inputs, - max_new_tokens: 10, - }); - - const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]); - expect(new_tokens.tolist()).toEqual([[24284n, 63986n, 108860n, 84530n, 8889n, 23262n, 128276n, 64948n, 136757n, 138348n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "text + image (batch_size=1)", - async () => { - const text = processor.apply_chat_template(CONVERSATION_WITH_IMAGE, { - add_generation_prompt: true, - }); - const inputs = await processor(text, image); - const generate_ids = await model.generate({ - ...inputs, - max_new_tokens: 10, - }); - - const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]); - expect(new_tokens.tolist()).toEqual([[24284n, 35302n, 60575n, 38679n, 113390n, 115118n, 137596n, 38241n, 96726n, 142301n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("paligemma", () => { - const text = "What is on the flower?"; - - // Empty white image - const dims = [224, 224, 3]; - const image = new RawImage(new Uint8ClampedArray(dims[0] * dims[1] * dims[2]).fill(255), ...dims); - - describe("PaliGemmaForConditionalGeneration", () => { - const model_id = "hf-internal-testing/tiny-random-PaliGemmaForConditionalGeneration"; - - /** @type {PaliGemmaForConditionalGeneration} */ - let model; - /** @type {PaliGemmaProcessor} */ - let processor; - beforeAll(async () => { - model = await 
PaliGemmaForConditionalGeneration.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - processor = await AutoProcessor.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "forward", - async () => { - const inputs = await processor(image, text); - - const { logits } = await model(inputs); - expect(logits.dims).toEqual([1, 264, 257216]); - expect(logits.mean().item()).toBeCloseTo(-0.0023024685215204954, 6); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size=1", - async () => { - const inputs = await processor(image, text); - const generate_ids = await model.generate({ ...inputs, max_new_tokens: 10 }); - - const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]); - expect(new_tokens.tolist()).toEqual([[91711n, 24904n, 144054n, 124983n, 83862n, 124983n, 124983n, 124983n, 141236n, 124983n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("vision-encoder-decoder", () => { - describe("VisionEncoderDecoderModel", () => { - const model_id = "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2"; - - /** @type {VisionEncoderDecoderModel} */ - let model; - /** @type {GPT2Tokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await VisionEncoderDecoderModel.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await GPT2Tokenizer.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const outputs = await model.generate({ - pixel_values: full([1, 3, 30, 30], -1.0), - max_length: 5, - }); - expect(outputs.tolist()).toEqual([[0n, 400n, 400n, 400n, 400n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - // TODO: Add back - // it('batch_size>1', async () => { - // const outputs = await model.generate({ - // pixel_values: cat([ - // full([1, 3, 30, 30], -1.0), - // full([1, 3, 30, 30], 0.0), - // ]), - // max_length: 5, - // }); - // expect(outputs.tolist()).toEqual([ - // // Generation continues - // [0n, 400n, 400n, 400n, 400n], - - // // Finishes early. 
1023 is the padding token - // [0n, 0n, 1023n, 1023n, 1023n], - // ]); - // }, MAX_TEST_EXECUTION_TIME); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - describe("opt", () => { - describe("OPTForCausalLM", () => { - const model_id = "hf-internal-testing/tiny-random-OPTForCausalLM"; - /** @type {OPTForCausalLM} */ - let model; - /** @type {GPT2Tokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await OPTForCausalLM.from_pretrained(model_id, { - // TODO move to config - revision: "refs/pr/2", - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await GPT2Tokenizer.from_pretrained(model_id, { - // TODO update this - revision: "refs/pr/3", - }); - tokenizer.padding_side = "left"; - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([[2n, 42891n, 39144n, 39144n, 39144n, 39144n, 39144n, 39144n, 39144n, 39144n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [1n, 2n, 42891n, 39144n, 39144n, 39144n, 39144n, 39144n, 39144n, 39144n], - [2n, 42891n, 232n, 24680n, 24680n, 24680n, 24680n, 24680n, 24680n, 24680n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("llama", () => { - describe("LlamaForCausalLM", () => { - const model_id = "hf-internal-testing/tiny-random-LlamaForCausalLM"; - /** @type {LlamaForCausalLM} */ - let model; - /** @type {LlamaTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await LlamaForCausalLM.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await LlamaTokenizer.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([[1n, 22172n, 18547n, 8143n, 22202n, 9456n, 17213n, 15330n, 26591n, 15721n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [0n, 1n, 22172n, 18547n, 8143n, 22202n, 9456n, 17213n, 15330n, 26591n], - [1n, 22172n, 3186n, 24786n, 19169n, 20222n, 29993n, 27146n, 27426n, 24562n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("LlamaForCausalLM (onnxruntime-genai)", () => { - const model_id = "onnx-community/tiny-random-LlamaForCausalLM-ONNX"; - /** @type {LlamaTokenizer} */ - let tokenizer; - let inputs; - beforeAll(async () => { - tokenizer = await LlamaTokenizer.from_pretrained(model_id); - inputs = tokenizer("hello"); - }, MAX_MODEL_LOAD_TIME); - - const dtypes = ["fp32", "fp16", "q4", "q4f16"]; - - for (const dtype of dtypes) { - it( - `dtype=${dtype}`, - async () => { - /** @type {LlamaForCausalLM} */ - const model = await LlamaForCausalLM.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - dtype, - }); - - const outputs = 
await model.generate({ - ...inputs, - max_length: 5, - }); - expect(outputs.tolist()).toEqual([[128000n, 15339n, 15339n, 15339n, 15339n]]); - - await model?.dispose(); - }, - MAX_TEST_TIME, - ); - } - }); - }); - - describe("olmo", () => { - describe("OlmoForCausalLM", () => { - const model_id = "onnx-community/tiny-random-olmo-hf"; - /** @type {OlmoForCausalLM} */ - let model; - /** @type {GPTNeoXTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await OlmoForCausalLM.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await GPTNeoXTokenizer.from_pretrained(model_id); - tokenizer.padding_side = "left"; - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([[25521n, 10886n, 44936n, 38777n, 33038n, 18557n, 1810n, 33853n, 9517n, 28892n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [1n, 25521n, 10886n, 44936n, 38777n, 33038n, 18557n, 1810n, 33853n, 9517n], - [25521n, 1533n, 37199n, 27362n, 30594n, 39261n, 8824n, 19175n, 8545n, 29335n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("olmo2", () => { - describe("Olmo2ForCausalLM", () => { - const model_id = "hf-internal-testing/tiny-random-Olmo2ForCausalLM"; - /** @type {Olmo2ForCausalLM} */ - let model; - /** @type {GPT2Tokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await Olmo2ForCausalLM.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await GPT2Tokenizer.from_pretrained(model_id); - tokenizer.padding_side = "left"; - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([[15339n, 50957n, 43410n, 77030n, 91444n, 99516n, 80720n, 4608n, 90428n, 22806n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [100277n, 15339n, 50957n, 43410n, 77030n, 91444n, 99516n, 80720n, 4608n, 90428n], - [15339n, 1917n, 12095n, 21350n, 61586n, 19306n, 39486n, 91527n, 59768n, 31934n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("granite", () => { - describe("GraniteForCausalLM", () => { - const model_id = "hf-internal-testing/tiny-random-GraniteForCausalLM"; - /** @type {GraniteForCausalLM} */ - let model; - /** @type {GPT2Tokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await GraniteForCausalLM.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await GPT2Tokenizer.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - 
expect(outputs.tolist()).toEqual([[7656n, 39727n, 33077n, 9643n, 30539n, 47869n, 48739n, 15085n, 9203n, 14020n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [0n, 7656n, 39727n, 33077n, 9643n, 30539n, 47869n, 48739n, 15085n, 9203n], - [7656n, 5788n, 17835n, 13234n, 7592n, 21471n, 30537n, 23023n, 43450n, 4824n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("cohere", () => { - describe("CohereModel", () => { - const model_id = "hf-internal-testing/tiny-random-CohereModel"; - /** @type {CohereModel} */ - let model; - /** @type {CohereTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await CohereModel.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await CohereTokenizer.from_pretrained(model_id); - tokenizer.padding_side = "left"; - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const { last_hidden_state } = await model(inputs); - expect(last_hidden_state.dims).toEqual([1, 4, 32]); - expect(last_hidden_state.mean().item()).toBeCloseTo(0.0, 5); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const { last_hidden_state } = await model(inputs); - expect(last_hidden_state.dims).toEqual([2, 6, 32]); - expect(last_hidden_state.mean().item()).toBeCloseTo(9.934107758624577e-9, 5); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("CohereForCausalLM", () => { - const model_id = "hf-internal-testing/tiny-random-CohereForCausalLM"; - /** @type {CohereForCausalLM} */ - let model; - /** @type {CohereTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await CohereForCausalLM.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await CohereTokenizer.from_pretrained(model_id); - tokenizer.padding_side = "left"; - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([[5n, 203n, 790n, 87n, 87n, 87n, 87n, 87n, 87n, 87n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [0n, 0n, 5n, 203n, 790n, 87n, 87n, 87n, 87n, 87n], - [5n, 203n, 790n, 87n, 214n, 741n, 741n, 741n, 741n, 741n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("gemma", () => { - describe("GemmaForCausalLM", () => { - const model_id = "Xenova/tiny-random-GemmaForCausalLM"; - /** @type {GemmaForCausalLM} */ - let model; - /** @type {GemmaTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await GemmaForCausalLM.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await 
GemmaTokenizer.from_pretrained(model_id); - tokenizer.padding_side = "left"; - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([[2n, 17534n, 254059n, 254059n, 254059n, 254059n, 254059n, 254059n, 254059n, 254059n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [0n, 2n, 17534n, 254059n, 254059n, 254059n, 254059n, 254059n, 254059n, 254059n], - [2n, 17534n, 2134n, 71055n, 71055n, 71055n, 71055n, 71055n, 71055n, 71055n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("gemma", () => { - describe("Gemma2ForCausalLM", () => { - const model_id = "hf-internal-testing/tiny-random-Gemma2ForCausalLM"; - /** @type {Gemma2ForCausalLM} */ - let model; - /** @type {GemmaTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await Gemma2ForCausalLM.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await GemmaTokenizer.from_pretrained(model_id); - tokenizer.padding_side = "left"; - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([[2n, 17534n, 127534n, 160055n, 160055n, 160055n, 160055n, 160055n, 160055n, 160055n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [0n, 2n, 17534n, 127534n, 127534n, 215341n, 215341n, 215341n, 215341n, 215341n], - [2n, 17534n, 2134n, 107508n, 160055n, 160055n, 160055n, 160055n, 160055n, 160055n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("gpt_neo", () => { - describe("GPTNeoForCausalLM", () => { - const model_id = "hf-internal-testing/tiny-random-GPTNeoForCausalLM"; - /** @type {GPTNeoForCausalLM} */ - let model; - /** @type {GPT2Tokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await GPTNeoForCausalLM.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await GPT2Tokenizer.from_pretrained(model_id); - tokenizer.padding_side = "left"; - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([[258n, 863n, 79n, 79n, 79n, 949n, 949n, 949n, 949n, 949n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [0n, 0n, 258n, 863n, 79n, 79n, 79n, 949n, 949n, 949n], - [258n, 863n, 79n, 269n, 813n, 849n, 849n, 849n, 849n, 849n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await 
model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("gpt_neox", () => { - describe("GPTNeoXForCausalLM", () => { - const model_id = "hf-internal-testing/tiny-random-GPTNeoXForCausalLM"; - /** @type {GPTNeoXForCausalLM} */ - let model; - /** @type {GPTNeoXTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await GPTNeoXForCausalLM.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await GPTNeoXTokenizer.from_pretrained(model_id); - tokenizer.padding_side = "left"; - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([[259n, 864n, 80n, 881n, 502n, 895n, 938n, 668n, 502n, 895n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [0n, 0n, 259n, 864n, 80n, 881n, 502n, 895n, 938n, 668n], - [259n, 864n, 80n, 270n, 814n, 522n, 112n, 268n, 503n, 468n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("gptj", () => { - describe("GPTJForCausalLM", () => { - const model_id = "hf-internal-testing/tiny-random-GPTJForCausalLM"; - /** @type {GPTJForCausalLM} */ - let model; - /** @type {GPTNeoXTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await GPTJForCausalLM.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await GPTNeoXTokenizer.from_pretrained(model_id); - tokenizer.padding_side = "left"; - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([[258n, 863n, 79n, 102n, 401n, 773n, 889n, 159n, 957n, 869n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [0n, 0n, 258n, 863n, 79n, 102n, 401n, 773n, 889n, 159n], - [258n, 863n, 79n, 269n, 813n, 879n, 175n, 39n, 141n, 1000n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("bloom", () => { - describe("BloomForCausalLM", () => { - const model_id = "hf-internal-testing/tiny-random-BloomForCausalLM"; - /** @type {BloomForCausalLM} */ - let model; - /** @type {BloomTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await BloomForCausalLM.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await BloomTokenizer.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([[198n, 803n, 82n, 82n, 82n, 82n, 82n, 82n, 82n, 82n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await 
model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [3n, 3n, 198n, 803n, 82n, 82n, 82n, 82n, 82n, 82n], - [198n, 803n, 82n, 209n, 753n, 753n, 753n, 753n, 753n, 753n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("gpt_bigcode", () => { - describe("GPTBigCodeForCausalLM", () => { - const model_id = "hf-internal-testing/tiny-random-GPTBigCodeForCausalLM"; - /** @type {GPTBigCodeForCausalLM} */ - let model; - /** @type {GPT2Tokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await GPTBigCodeForCausalLM.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await GPT2Tokenizer.from_pretrained(model_id); - tokenizer.padding_side = "left"; - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([[258n, 863n, 79n, 79n, 79n, 79n, 79n, 79n, 79n, 79n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [0n, 0n, 258n, 863n, 79n, 79n, 79n, 79n, 79n, 79n], - [258n, 863n, 79n, 269n, 813n, 832n, 93n, 93n, 93n, 93n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("gpt2", () => { - describe("GPT2LMHeadModel", () => { - const model_id = "hf-internal-testing/tiny-random-GPT2LMHeadModel"; - /** @type {GPT2LMHeadModel} */ - let model; - /** @type {GPT2Tokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await GPT2LMHeadModel.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await GPT2Tokenizer.from_pretrained(model_id); - tokenizer.padding_side = "left"; - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([[258n, 863n, 79n, 79n, 79n, 79n, 79n, 79n, 79n, 243n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [0n, 0n, 258n, 863n, 79n, 79n, 79n, 79n, 79n, 79n], - [258n, 863n, 79n, 269n, 813n, 813n, 813n, 813n, 813n, 813n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("jais", () => { - describe("JAISLMHeadModel", () => { - const model_id = "onnx-community/tiny-random-jais"; - /** @type {JAISLMHeadModel} */ - let model; - /** @type {PreTrainedTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await JAISLMHeadModel.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await PreTrainedTokenizer.from_pretrained(model_id); - tokenizer.padding_side = "left"; - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - 
max_length: 10, - }); - expect(outputs.tolist()).toEqual([[55422n, 55422n, 55422n, 55422n, 55422n, 55422n, 55422n, 55422n, 55422n, 55422n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [0n, 55422n, 55422n, 55422n, 55422n, 55422n, 55422n, 55422n, 55422n, 55422n], - [55422n, 2838n, 2838n, 2838n, 2838n, 2838n, 2838n, 2838n, 2838n, 2838n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("mpt", () => { - describe("MptForCausalLM", () => { - const model_id = "hf-internal-testing/tiny-random-MptForCausalLM"; - /** @type {MptForCausalLM} */ - let model; - /** @type {GPTNeoXTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await MptForCausalLM.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await GPTNeoXTokenizer.from_pretrained(model_id); - tokenizer.padding_side = "left"; - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([[259n, 864n, 80n, 80n, 80n, 80n, 80n, 80n, 80n, 80n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [0n, 0n, 259n, 864n, 80n, 80n, 80n, 80n, 80n, 80n], - [259n, 864n, 80n, 270n, 814n, 293n, 293n, 293n, 293n, 293n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("codegen", () => { - describe("CodeGenForCausalLM", () => { - const model_id = "hf-internal-testing/tiny-random-CodeGenForCausalLM"; - /** @type {CodeGenForCausalLM} */ - let model; - /** @type {CodeGenTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await CodeGenForCausalLM.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await CodeGenTokenizer.from_pretrained(model_id); - tokenizer.padding_side = "left"; - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([[258n, 863n, 79n, 437n, 334n, 450n, 294n, 621n, 375n, 385n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [0n, 0n, 258n, 863n, 79n, 437n, 334n, 450n, 294n, 621n], - [258n, 863n, 79n, 269n, 813n, 759n, 113n, 295n, 574n, 987n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("mistral", () => { - describe("MistralForCausalLM", () => { - const model_id = "hf-internal-testing/tiny-random-MistralForCausalLM"; - /** @type {MistralForCausalLM} */ - let model; - /** @type {LlamaTokenizer} */ - let tokenizer; - beforeAll(async () => { - model 
= await MistralForCausalLM.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await LlamaTokenizer.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("hello"); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([[1n, 6312n, 28709n, 24704n, 8732n, 1310n, 9808n, 13771n, 27309n, 4779n]]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - it( - "batch_size>1", - async () => { - const inputs = tokenizer(["hello", "hello world"], { padding: true }); - const outputs = await model.generate({ - ...inputs, - max_length: 10, - }); - expect(outputs.tolist()).toEqual([ - [2n, 1n, 6312n, 28709n, 24704n, 8732n, 1310n, 9808n, 13771n, 27309n], - [1n, 6312n, 28709n, 1526n, 8687n, 5690n, 1770n, 30811n, 12501n, 3325n], - ]); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("patchtsmixer", () => { - const dims = [64, 512, 7]; - const prod = dims.reduce((a, b) => a * b, 1); - const past_values = new Tensor( - "float32", - Float32Array.from({ length: prod }, (_, i) => i / prod), - dims, - ); - - describe("PatchTSMixerModel", () => { - const model_id = "hf-internal-testing/tiny-random-PatchTSMixerModel"; - - /** @type {PatchTSMixerModel} */ - let model; - beforeAll(async () => { - model = await PatchTSMixerModel.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - }, MAX_MODEL_LOAD_TIME); - - it( - "default", - async () => { - const { last_hidden_state } = await model({ past_values }); - - const { num_input_channels, num_patches, d_model } = model.config; - expect(last_hidden_state.dims).toEqual([dims[0], num_input_channels, num_patches, d_model]); - expect(last_hidden_state.mean().item()).toBeCloseTo(0.03344963490962982, 5); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("PatchTSMixerForPrediction", () => { - const model_id = "onnx-community/granite-timeseries-patchtsmixer"; - - /** @type {PatchTSMixerForPrediction} */ - let model; - beforeAll(async () => { - model = await PatchTSMixerForPrediction.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - }, MAX_MODEL_LOAD_TIME); - - it( - "default", - async () => { - const { prediction_outputs } = await model({ past_values }); - - const { prediction_length, num_input_channels } = model.config; - expect(prediction_outputs.dims).toEqual([dims[0], prediction_length, num_input_channels]); - expect(prediction_outputs.mean().item()).toBeCloseTo(0.5064773559570312, 5); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); - - describe("patchtst", () => { - const dims = [64, 512, 7]; - const prod = dims.reduce((a, b) => a * b, 1); - const past_values = new Tensor( - "float32", - Float32Array.from({ length: prod }, (_, i) => i / prod), - dims, - ); - - describe("PatchTSTModel", () => { - const model_id = "hf-internal-testing/tiny-random-PatchTSTModel"; - - /** @type {PatchTSTModel} */ - let model; - beforeAll(async () => { - model = await PatchTSTModel.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - }, MAX_MODEL_LOAD_TIME); - - it( - "default", - async () => { - const { last_hidden_state } = await model({ past_values }); - - const 
{ num_input_channels, d_model } = model.config; - expect(last_hidden_state.dims).toEqual([dims[0], num_input_channels, 43, d_model]); - expect(last_hidden_state.mean().item()).toBeCloseTo(0.016672514379024506, 5); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("PatchTSTForPrediction", () => { - const model_id = "onnx-community/granite-timeseries-patchtst"; - - /** @type {PatchTSTForPrediction} */ - let model; - beforeAll(async () => { - model = await PatchTSTForPrediction.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - }, MAX_MODEL_LOAD_TIME); + // Pipelines + pipeline, + FillMaskPipeline, + TextClassificationPipeline, + TextGenerationPipeline, + TranslationPipeline, + ImageClassificationPipeline, + ZeroShotImageClassificationPipeline, + TokenClassificationPipeline, + QuestionAnsweringPipeline, + DocumentQuestionAnsweringPipeline, - it( - "default", - async () => { - const { prediction_outputs } = await model({ past_values }); + // Other + RawImage, +} from "../src/transformers.js"; - const { prediction_length, num_input_channels } = model.config; - expect(prediction_outputs.dims).toEqual([dims[0], prediction_length, num_input_channels]); - expect(prediction_outputs.mean().item()).toBeCloseTo(0.506528377532959, 5); - }, - MAX_TEST_EXECUTION_TIME, - ); +import { init, MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "./init.js"; +import { compare } from "./test_utils.js"; - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - }); -}); +init(); describe("Tiny random pipelines", () => { describe("fill-mask", () => { @@ -2318,10 +27,7 @@ describe("Tiny random pipelines", () => { /** @type {FillMaskPipeline} */ let pipe; beforeAll(async () => { - pipe = await pipeline("fill-mask", model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); + pipe = await pipeline("fill-mask", model_id, DEFAULT_MODEL_OPTIONS); }, MAX_MODEL_LOAD_TIME); describe("batch_size=1", () => { @@ -2410,10 +116,7 @@ describe("Tiny random pipelines", () => { /** @type {TextClassificationPipeline} */ let pipe; beforeAll(async () => { - pipe = await pipeline("text-classification", model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); + pipe = await pipeline("text-classification", model_id, DEFAULT_MODEL_OPTIONS); }, MAX_MODEL_LOAD_TIME); describe("batch_size=1", () => { @@ -2509,10 +212,7 @@ describe("Tiny random pipelines", () => { /** @type {TokenClassificationPipeline} */ let pipe; beforeAll(async () => { - pipe = await pipeline("token-classification", model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); + pipe = await pipeline("token-classification", model_id, DEFAULT_MODEL_OPTIONS); }, MAX_MODEL_LOAD_TIME); describe("batch_size=1", () => { @@ -2659,10 +359,7 @@ describe("Tiny random pipelines", () => { /** @type {QuestionAnsweringPipeline} */ let pipe; beforeAll(async () => { - pipe = await pipeline("question-answering", model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); + pipe = await pipeline("question-answering", model_id, DEFAULT_MODEL_OPTIONS); }, MAX_MODEL_LOAD_TIME); describe("batch_size=1", () => { @@ -2702,10 +399,7 @@ describe("Tiny random pipelines", () => { /** @type {ImageClassificationPipeline} */ let pipe; beforeAll(async () => { - pipe = await pipeline("image-classification", model_id, { - // TODO move to config - 
...DEFAULT_MODEL_OPTIONS, - }); + pipe = await pipeline("image-classification", model_id, DEFAULT_MODEL_OPTIONS); }, MAX_MODEL_LOAD_TIME); describe("batch_size=1", () => { @@ -2778,10 +472,7 @@ describe("Tiny random pipelines", () => { /** @type {ZeroShotImageClassificationPipeline} */ let pipe; beforeAll(async () => { - pipe = await pipeline("zero-shot-image-classification", model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); + pipe = await pipeline("zero-shot-image-classification", model_id, DEFAULT_MODEL_OPTIONS); }, MAX_MODEL_LOAD_TIME); describe("batch_size=1", () => { @@ -2862,10 +553,7 @@ describe("Tiny random pipelines", () => { /** @type {ImageClassificationPipeline} */ let pipe; beforeAll(async () => { - pipe = await pipeline("audio-classification", model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); + pipe = await pipeline("audio-classification", model_id, DEFAULT_MODEL_OPTIONS); }, MAX_MODEL_LOAD_TIME); describe("batch_size=1", () => { @@ -2933,10 +621,7 @@ describe("Tiny random pipelines", () => { /** @type {TextGenerationPipeline} */ let pipe; beforeAll(async () => { - pipe = await pipeline("text-generation", model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); + pipe = await pipeline("text-generation", model_id, DEFAULT_MODEL_OPTIONS); }, MAX_MODEL_LOAD_TIME); describe("batch_size=1", () => { @@ -3034,10 +719,7 @@ describe("Tiny random pipelines", () => { /** @type {TranslationPipeline} */ let pipe; beforeAll(async () => { - pipe = await pipeline("translation", model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); + pipe = await pipeline("translation", model_id, DEFAULT_MODEL_OPTIONS); }, MAX_MODEL_LOAD_TIME); describe("batch_size=1", () => { @@ -3069,10 +751,7 @@ describe("Tiny random pipelines", () => { /** @type {ImageClassificationPipeline} */ let pipe; beforeAll(async () => { - pipe = await pipeline("object-detection", model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); + pipe = await pipeline("object-detection", model_id, DEFAULT_MODEL_OPTIONS); }, MAX_MODEL_LOAD_TIME); describe("batch_size=1", () => { @@ -3136,10 +815,7 @@ describe("Tiny random pipelines", () => { /** @type {DocumentQuestionAnsweringPipeline} */ let pipe; beforeAll(async () => { - pipe = await pipeline("document-question-answering", model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); + pipe = await pipeline("document-question-answering", model_id, DEFAULT_MODEL_OPTIONS); }, MAX_MODEL_LOAD_TIME); describe("batch_size=1", () => { @@ -3163,140 +839,3 @@ describe("Tiny random pipelines", () => { }, MAX_MODEL_DISPOSE_TIME); }); }); - -describe("PKV caching", () => { - describe("LlamaForCausalLM", () => { - const model_id = "hf-internal-testing/tiny-random-LlamaForCausalLM"; - /** @type {LlamaForCausalLM} */ - let model; - /** @type {LlamaTokenizer} */ - let tokenizer; - beforeAll(async () => { - model = await LlamaForCausalLM.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await LlamaTokenizer.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const inputs = tokenizer("1"); - - // Generate first sequence w/o PKV - // NOTE: `return_dict_in_generate=true` is required to get PKV - const { past_key_values, sequences } = await model.generate({ - ...inputs, - max_new_tokens: 5, - do_sample: false, - return_dict_in_generate: true, - }); - - // Update output with new text - const decoded = 
tokenizer.batch_decode(sequences, { - skip_special_tokens: false, - })[0]; - const new_inputs = tokenizer(decoded + "2", { - add_special_tokens: false, - }); - - // Run w/o PKV - const generated_ids = await model.generate({ - ...new_inputs, - max_new_tokens: 3, - do_sample: false, - }); - - // Run w/ PKV - const generated_ids_pkv = await model.generate({ - ...new_inputs, - past_key_values, - max_new_tokens: 3, - do_sample: false, - }); - - const target = [[1n, 259n, 29896n, 24959n, 22063n, 17192n, 12189n, 22468n, 29906n, 3399n, 24823n, 26470n]]; - - expect(generated_ids.tolist()).toEqual(target); - expect(generated_ids_pkv.tolist()).toEqual(target); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); - - describe("LlavaForConditionalGeneration", () => { - const model_id = "Xenova/tiny-random-LlavaForConditionalGeneration"; - /** @type {LlavaForConditionalGeneration} */ - let model; - /** @type {PreTrainedTokenizer} */ - let tokenizer; - /** @type {Processor} */ - let processor; - beforeAll(async () => { - model = await LlavaForConditionalGeneration.from_pretrained(model_id, { - // TODO move to config - ...DEFAULT_MODEL_OPTIONS, - }); - tokenizer = await AutoTokenizer.from_pretrained(model_id); - processor = await AutoProcessor.from_pretrained(model_id); - }, MAX_MODEL_LOAD_TIME); - - it( - "batch_size=1", - async () => { - const text_inputs = tokenizer("hello"); - - // Empty white image - const dims = [224, 224, 3]; - const image = new RawImage(new Uint8ClampedArray(dims[0] * dims[1] * dims[2]).fill(255), ...dims); - const vision_inputs = await processor(image); - - // Generate first sequence w/o PKV - // NOTE: `return_dict_in_generate=true` is required to get PKV - const { past_key_values, sequences } = await model.generate({ - ...text_inputs, - ...vision_inputs, - max_new_tokens: 5, - do_sample: false, - return_dict_in_generate: true, - }); - - // Update output with new text - const decoded = tokenizer.batch_decode(sequences).map((x) => x + "new"); - const new_inputs = tokenizer(decoded, { - add_special_tokens: false, - }); - - // Run w/o PKV - const generated_ids = await model.generate({ - ...new_inputs, - ...vision_inputs, - max_new_tokens: 3, - do_sample: false, - }); - - // Run w/ PKV - const generated_ids_pkv = await model.generate({ - ...new_inputs, - past_key_values, - max_new_tokens: 3, - do_sample: false, - }); - - const target = [[1n, 32000n, 29871n, 23927n, 359n, 1519n, 568n, 5769n, 1330n, 21544n, 11568n, 1482n, 7258n, 1250n, 16117n]]; - expect(generated_ids.tolist()).toEqual(target); - expect(generated_ids_pkv.tolist()).toEqual(target); - }, - MAX_TEST_EXECUTION_TIME, - ); - - afterAll(async () => { - await model?.dispose(); - }, MAX_MODEL_DISPOSE_TIME); - }); -}); diff --git a/tests/utils/generation.test.js b/tests/utils/generation.test.js index dd2229826..207ba4661 100644 --- a/tests/utils/generation.test.js +++ b/tests/utils/generation.test.js @@ -1,7 +1,24 @@ -import { AutoTokenizer } from "../../src/tokenizers.js"; -import { AutoModelForSeq2SeqLM, AutoModelForCausalLM } from "../../src/models.js"; -import { TextStreamer } from "../../src/generation/streamers.js"; -import { init, MAX_TEST_EXECUTION_TIME, MAX_MODEL_LOAD_TIME, MAX_MODEL_DISPOSE_TIME } from "../init.js"; +import { + // Models + AutoModelForSeq2SeqLM, + AutoModelForCausalLM, + LlamaForCausalLM, + LlavaForConditionalGeneration, + + // Tokenizers + AutoTokenizer, + LlamaTokenizer, + + // Processors + AutoProcessor, + Processor, + + // 
Other + TextStreamer, + RawImage, +} from "../../src/transformers.js"; + +import { init, MAX_TEST_EXECUTION_TIME, MAX_MODEL_LOAD_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js"; // Initialise the testing environment init(); @@ -18,7 +35,7 @@ const generate = async (model, tokenizer, text, options) => { describe("Generation parameters", () => { // List all models which will be tested const models = [ - "hf-internal-testing/tiny-random-T5ForConditionalGeneration", // + "hf-internal-testing/tiny-random-T5ForConditionalGeneration", // encoder-decoder "hf-internal-testing/tiny-random-LlamaForCausalLM", // decoder-only ]; const DUMMY_TEXT = "hello"; @@ -29,7 +46,7 @@ describe("Generation parameters", () => { let model; let tokenizer; beforeAll(async () => { - model = await AutoModelForSeq2SeqLM.from_pretrained(model_id); + model = await AutoModelForSeq2SeqLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); tokenizer = await AutoTokenizer.from_pretrained(model_id); }, MAX_MODEL_LOAD_TIME); @@ -98,7 +115,7 @@ describe("Generation parameters", () => { let model; let tokenizer; beforeAll(async () => { - model = await AutoModelForCausalLM.from_pretrained(model_id); + model = await AutoModelForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); tokenizer = await AutoTokenizer.from_pretrained(model_id); }, MAX_MODEL_LOAD_TIME); @@ -171,7 +188,7 @@ describe("Streamers", () => { const model_id = "hf-internal-testing/tiny-random-LlamaForCausalLM"; let model, tokenizer; beforeAll(async () => { - model = await AutoModelForCausalLM.from_pretrained(model_id); + model = await AutoModelForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); tokenizer = await AutoTokenizer.from_pretrained(model_id); }, MAX_MODEL_LOAD_TIME); @@ -202,3 +219,134 @@ describe("Streamers", () => { }, MAX_MODEL_DISPOSE_TIME); }); }); + +describe("PKV caching", () => { + describe("LlamaForCausalLM", () => { + const model_id = "hf-internal-testing/tiny-random-LlamaForCausalLM"; + /** @type {LlamaForCausalLM} */ + let model; + /** @type {LlamaTokenizer} */ + let tokenizer; + beforeAll(async () => { + model = await LlamaForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await LlamaTokenizer.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const inputs = tokenizer("1"); + + // Generate first sequence w/o PKV + // NOTE: `return_dict_in_generate=true` is required to get PKV + const { past_key_values, sequences } = await model.generate({ + ...inputs, + max_new_tokens: 5, + do_sample: false, + return_dict_in_generate: true, + }); + + // Update output with new text + const decoded = tokenizer.batch_decode(sequences, { + skip_special_tokens: false, + })[0]; + const new_inputs = tokenizer(decoded + "2", { + add_special_tokens: false, + }); + + // Run w/o PKV + const generated_ids = await model.generate({ + ...new_inputs, + max_new_tokens: 3, + do_sample: false, + }); + + // Run w/ PKV + const generated_ids_pkv = await model.generate({ + ...new_inputs, + past_key_values, + max_new_tokens: 3, + do_sample: false, + }); + + const target = [[1n, 259n, 29896n, 24959n, 22063n, 17192n, 12189n, 22468n, 29906n, 3399n, 24823n, 26470n]]; + + expect(generated_ids.tolist()).toEqual(target); + expect(generated_ids_pkv.tolist()).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); + + describe("LlavaForConditionalGeneration", () => { + const model_id = 
"Xenova/tiny-random-LlavaForConditionalGeneration"; + /** @type {LlavaForConditionalGeneration} */ + let model; + /** @type {PreTrainedTokenizer} */ + let tokenizer; + /** @type {Processor} */ + let processor; + beforeAll(async () => { + model = await LlavaForConditionalGeneration.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS); + tokenizer = await AutoTokenizer.from_pretrained(model_id); + processor = await AutoProcessor.from_pretrained(model_id); + }, MAX_MODEL_LOAD_TIME); + + it( + "batch_size=1", + async () => { + const text_inputs = tokenizer("hello"); + + // Empty white image + const dims = [224, 224, 3]; + const image = new RawImage(new Uint8ClampedArray(dims[0] * dims[1] * dims[2]).fill(255), ...dims); + const vision_inputs = await processor(image); + + // Generate first sequence w/o PKV + // NOTE: `return_dict_in_generate=true` is required to get PKV + const { past_key_values, sequences } = await model.generate({ + ...text_inputs, + ...vision_inputs, + max_new_tokens: 5, + do_sample: false, + return_dict_in_generate: true, + }); + + // Update output with new text + const decoded = tokenizer.batch_decode(sequences).map((x) => x + "new"); + const new_inputs = tokenizer(decoded, { + add_special_tokens: false, + }); + + // Run w/o PKV + const generated_ids = await model.generate({ + ...new_inputs, + ...vision_inputs, + max_new_tokens: 3, + do_sample: false, + }); + + // Run w/ PKV + const generated_ids_pkv = await model.generate({ + ...new_inputs, + past_key_values, + max_new_tokens: 3, + do_sample: false, + }); + + const target = [[1n, 32000n, 29871n, 23927n, 359n, 1519n, 568n, 5769n, 1330n, 21544n, 11568n, 1482n, 7258n, 1250n, 16117n]]; + expect(generated_ids.tolist()).toEqual(target); + expect(generated_ids_pkv.tolist()).toEqual(target); + }, + MAX_TEST_EXECUTION_TIME, + ); + + afterAll(async () => { + await model?.dispose(); + }, MAX_MODEL_DISPOSE_TIME); + }); +}); diff --git a/tests/utils/tensor.test.js b/tests/utils/tensor.test.js index 622a14281..e2fa808e6 100644 --- a/tests/utils/tensor.test.js +++ b/tests/utils/tensor.test.js @@ -204,4 +204,47 @@ describe("Tensor operations", () => { compare(norm, target, 1e-3); }); }); + + describe("to", () => { + it("float32 to int32 (number to number)", async () => { + const t1 = new Tensor("float32", [1, 2, 3, 4, 5, 6], [2, 3]); + + const target = new Tensor("int32", [1, 2, 3, 4, 5, 6], [2, 3]); + + const t2 = t1.to("int32"); + compare(t2, target); + }); + it("float32 to int64 (number to bigint)", async () => { + const t1 = new Tensor("float32", [1, 2, 3, 4, 5, 6], [2, 3]); + + const target = new Tensor("int64", [1n, 2n, 3n, 4n, 5n, 6n], [2, 3]); + + const t2 = t1.to("int64"); + compare(t2, target); + }); + it("int64 to float32 (bigint to number)", async () => { + const t1 = new Tensor("int64", [1n, 2n, 3n, 4n, 5n, 6n], [2, 3]); + + const target = new Tensor("float32", [1, 2, 3, 4, 5, 6], [2, 3]); + + const t2 = t1.to("float32"); + compare(t2, target); + }); + it("int32 to uint32", async () => { + const t1 = new Tensor("int32", [-1, 2, -3, 4, -5, 6], [2, 3]); + + const target = new Tensor("uint32", [4294967295, 2, 4294967293, 4, 4294967291, 6], [2, 3]); + + const t2 = t1.to("uint32"); + compare(t2, target); + }); + it("int16 to int8 (overflow)", async () => { + const t1 = new Tensor("int16", [0, 1, 128, 256, 257, 512], [2, 3]); + + const target = new Tensor("int8", [0, 1, -128, 0, 1, 0], [2, 3]); + + const t2 = t1.to("int8"); + compare(t2, target); + }); + }); });