From c367f9d68b809bbbf81049c808bf6d219d761d23 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Tue, 19 Sep 2023 13:34:00 +0200 Subject: [PATCH] Add support for `Blenderbot` and `BlenderbotSmall` (#292) * Add support for `Blenderbot` models Closes #37 References #29 * Add support for `BlenderbotTokenizer` * Add blenderbot to supported models * Add support for `BlenderbotSmallTokenizer` * Add custom tests for blenderbot-small * Add support for `BlenderbotSmall` models * Update list of supported models * Improve `addPastKeyValues` function * Allow skipping of adding encoder past key values --- README.md | 2 + docs/snippets/6_supported-models.snippet | 2 + scripts/supported_models.py | 20 +-- src/models.js | 152 +++++++++++++++++------ src/tokenizers.js | 47 +++++++ tests/generate_tests.py | 7 ++ 6 files changed, 181 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index e1d46b1cd..048aa913e 100644 --- a/README.md +++ b/README.md @@ -258,6 +258,8 @@ You can refine your search by selecting the task you're interested in (e.g., [te 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei. 1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. +1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. diff --git a/docs/snippets/6_supported-models.snippet b/docs/snippets/6_supported-models.snippet index 1c5107ec8..d3c448692 100644 --- a/docs/snippets/6_supported-models.snippet +++ b/docs/snippets/6_supported-models.snippet @@ -5,6 +5,8 @@ 1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei. 1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. +1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. 1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/). 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. diff --git a/scripts/supported_models.py b/scripts/supported_models.py index 0b4312d34..c998ef954 100644 --- a/scripts/supported_models.py +++ b/scripts/supported_models.py @@ -97,16 +97,16 @@ 'bert-base-chinese', 'emilyalsentzer/Bio_ClinicalBERT', ], - # 'blenderbot': [ - # # Text2text generation (TODO add conversational) - # 'facebook/blenderbot-400M-distill', - # 'facebook/blenderbot-1B-distill', - # ], - # 'blenderbot-small': [ - # # Text2text generation (TODO add conversational) - # 'facebook/blenderbot-90M', # DEPRECATED - # 'facebook/blenderbot_small-90M', - # ], + 'blenderbot': [ + # Text2text generation (TODO add conversational) + 'facebook/blenderbot-400M-distill', + # 'facebook/blenderbot-1B-distill', + ], + 'blenderbot-small': [ + # Text2text generation (TODO add conversational) + # 'facebook/blenderbot-90M', # DEPRECATED + 'facebook/blenderbot_small-90M', + ], 'bloom': [ # Text generation 'bigscience/bloom-560m', diff --git a/src/models.js b/src/models.js index daf33fddf..c71730990 100644 --- a/src/models.js +++ b/src/models.js @@ -310,7 +310,6 @@ function boolTensor(value) { * @private */ async function seq2seqForward(self, model_inputs) { - const add_decoder_pkv = self.add_decoder_pkv ?? true; let { encoder_outputs, past_key_values } = model_inputs; @@ -327,7 +326,7 @@ async function seq2seqForward(self, model_inputs) { if (self.decoder_merged_session.inputNames.includes('encoder_attention_mask')) { decoderFeeds.encoder_attention_mask = model_inputs.attention_mask } - self.addPastKeyValues(decoderFeeds, past_key_values, add_decoder_pkv); + self.addPastKeyValues(decoderFeeds, past_key_values); const decoderResults = await sessionRun(self.decoder_merged_session, decoderFeeds); let logits = decoderResults.logits; @@ -1199,57 +1198,51 @@ export class PreTrainedModel extends Callable { * * @param {Object} decoderFeeds The decoder feeds object to add past key values to. * @param {Object} pastKeyValues An object containing past key values. - * @param {boolean} [hasDecoder=false] Whether the model has a decoder. */ - addPastKeyValues(decoderFeeds, pastKeyValues, hasDecoder = false) { + addPastKeyValues(decoderFeeds, pastKeyValues) { if (pastKeyValues) { Object.assign(decoderFeeds, pastKeyValues) } else { // TODO support batches (i.e., batch_size > 1) - if (hasDecoder) { + // @ts-ignore + if (this.config.is_encoder_decoder && (this.add_encoder_pkv ?? true)) { // @ts-ignore let encoder_dims = [1, this.num_encoder_heads, 0, this.encoder_dim_kv]; - // @ts-ignore - for (let i = 0; i < this.num_encoder_layers; ++i) { - decoderFeeds[`past_key_values.${i}.encoder.key`] = new Tensor('float32', [], encoder_dims) - decoderFeeds[`past_key_values.${i}.encoder.value`] = new Tensor('float32', [], encoder_dims) - } - // @ts-ignore let decoder_dims = [1, this.num_decoder_heads, 0, this.decoder_dim_kv]; // @ts-ignore for (let i = 0; i < this.num_decoder_layers; ++i) { + decoderFeeds[`past_key_values.${i}.encoder.key`] = new Tensor('float32', [], encoder_dims) + decoderFeeds[`past_key_values.${i}.encoder.value`] = new Tensor('float32', [], encoder_dims) decoderFeeds[`past_key_values.${i}.decoder.key`] = new Tensor('float32', [], decoder_dims) decoderFeeds[`past_key_values.${i}.decoder.value`] = new Tensor('float32', [], decoder_dims) } + } else if (this.config.multi_query) { // e.g., for `gpt_bigcode` + // @ts-ignore + let dims = [1, 0, 2 * this.dim_kv] + // @ts-ignore + for (let i = 0; i < this.num_layers; ++i) { + decoderFeeds[`past_key_values.${i}.key_value`] = new Tensor('float32', [], dims) + } + } else if (this.config.model_type === 'bloom') { + // NOTE: Custom implementation for Bloom - } else { - if (this.config.multi_query) { - // @ts-ignore - let dims = [1, 0, 2 * this.dim_kv] - // @ts-ignore - for (let i = 0; i < this.num_layers; ++i) { - decoderFeeds[`past_key_values.${i}.key_value`] = new Tensor('float32', [], dims) - } - } else if (this.config.model_type === 'bloom') { - // Custom implementation for Bloom - // @ts-ignore - let keyDims = [1 * this.num_heads, this.dim_kv, 0] // [batch_size x num_heads,64,past_sequence_length] - // @ts-ignore - let valueDims = [1 * this.num_heads, 0, this.dim_kv] // [batch_size x num_heads,past_sequence_length,64] - // @ts-ignore - for (let i = 0; i < this.num_layers; ++i) { - decoderFeeds[`past_key_values.${i}.key`] = new Tensor('float32', [], keyDims) - decoderFeeds[`past_key_values.${i}.value`] = new Tensor('float32', [], valueDims) - } - } else { - // @ts-ignore - let dims = [1, this.num_heads, 0, this.dim_kv] - // @ts-ignore - for (let i = 0; i < this.num_layers; ++i) { - decoderFeeds[`past_key_values.${i}.key`] = new Tensor('float32', [], dims) - decoderFeeds[`past_key_values.${i}.value`] = new Tensor('float32', [], dims) - } + // @ts-ignore + let keyDims = [1 * this.num_heads, this.dim_kv, 0] // [batch_size x num_heads,64,past_sequence_length] + // @ts-ignore + let valueDims = [1 * this.num_heads, 0, this.dim_kv] // [batch_size x num_heads,past_sequence_length,64] + // @ts-ignore + for (let i = 0; i < this.num_layers; ++i) { + decoderFeeds[`past_key_values.${i}.key`] = new Tensor('float32', [], keyDims) + decoderFeeds[`past_key_values.${i}.value`] = new Tensor('float32', [], valueDims) + } + } else { // Decoder-only + // @ts-ignore + let dims = [1, this.num_heads, 0, this.dim_kv] + // @ts-ignore + for (let i = 0; i < this.num_layers; ++i) { + decoderFeeds[`past_key_values.${i}.key`] = new Tensor('float32', [], dims) + decoderFeeds[`past_key_values.${i}.value`] = new Tensor('float32', [], dims) } } } @@ -2033,6 +2026,83 @@ export class MBartForSequenceClassification extends MBartPreTrainedModel { ////////////////////////////////////////////////// + +////////////////////////////////////////////////// +// Blenderbot models +export class BlenderbotPreTrainedModel extends PreTrainedModel { }; + +/** + * The bare Blenderbot Model outputting raw hidden-states without any specific head on top. + */ +export class BlenderbotModel extends BlenderbotPreTrainedModel { } + +/** + * The Blenderbot Model with a language modeling head. Can be used for summarization. + */ +export class BlenderbotForConditionalGeneration extends BlenderbotPreTrainedModel { + + /** + * Creates a new instance of the `BlenderbotForConditionalGeneration` class. + * @param {any} config The model configuration. + * @param {any} session The ONNX session containing the encoder weights. + * @param {any} decoder_merged_session The ONNX session containing the merged decoder weights. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor(config, session, decoder_merged_session, generation_config) { + super(config, session); + this.decoder_merged_session = decoder_merged_session; + this.generation_config = generation_config; + + this.num_decoder_layers = this.config.decoder_layers; + this.num_decoder_heads = this.config.decoder_attention_heads; + this.decoder_dim_kv = this.config.d_model / this.num_decoder_heads; + + this.num_encoder_layers = this.config.encoder_layers; + this.num_encoder_heads = this.config.encoder_attention_heads; + this.encoder_dim_kv = this.config.d_model / this.num_encoder_heads; + } +} +////////////////////////////////////////////////// + + +////////////////////////////////////////////////// +// Blenderbot models +export class BlenderbotSmallPreTrainedModel extends PreTrainedModel { }; + +/** + * The bare BlenderbotSmall Model outputting raw hidden-states without any specific head on top. + */ +export class BlenderbotSmallModel extends BlenderbotSmallPreTrainedModel { } + +/** + * The BlenderbotSmall Model with a language modeling head. Can be used for summarization. + */ +export class BlenderbotSmallForConditionalGeneration extends BlenderbotSmallPreTrainedModel { + + /** + * Creates a new instance of the `BlenderbotForConditionalGeneration` class. + * @param {any} config The model configuration. + * @param {any} session The ONNX session containing the encoder weights. + * @param {any} decoder_merged_session The ONNX session containing the merged decoder weights. + * @param {GenerationConfig} generation_config The generation configuration. + */ + constructor(config, session, decoder_merged_session, generation_config) { + super(config, session); + this.decoder_merged_session = decoder_merged_session; + this.generation_config = generation_config; + + this.num_decoder_layers = this.config.decoder_layers; + this.num_decoder_heads = this.config.decoder_attention_heads; + this.decoder_dim_kv = this.config.d_model / this.num_decoder_heads; + + this.num_encoder_layers = this.config.encoder_layers; + this.num_encoder_heads = this.config.encoder_attention_heads; + this.encoder_dim_kv = this.config.d_model / this.num_encoder_heads; + } +} +////////////////////////////////////////////////// + + ////////////////////////////////////////////////// // Roberta models export class RobertaPreTrainedModel extends PreTrainedModel { } @@ -2458,7 +2528,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel { */ export class VisionEncoderDecoderModel extends PreTrainedModel { main_input_name = 'pixel_values'; - add_decoder_pkv = false; + add_encoder_pkv = false; /** * Creates a new instance of the `VisionEncoderDecoderModel` class. @@ -3422,6 +3492,8 @@ const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([ ['marian', ['MarianModel', MarianModel]], ['whisper', ['WhisperModel', WhisperModel]], ['m2m_100', ['M2M100Model', M2M100Model]], + ['blenderbot', ['BlenderbotModel', BlenderbotModel]], + ['blenderbot-small', ['BlenderbotSmallModel', BlenderbotSmallModel]], ]); @@ -3475,6 +3547,8 @@ const MODEL_FOR_SEQ_2_SEQ_MAPPING_NAMES = new Map([ ['whisper', ['WhisperForConditionalGeneration', WhisperForConditionalGeneration]], ['marian', ['MarianMTModel', MarianMTModel]], ['m2m_100', ['M2M100ForConditionalGeneration', M2M100ForConditionalGeneration]], + ['blenderbot', ['BlenderbotForConditionalGeneration', BlenderbotForConditionalGeneration]], + ['blenderbot-small', ['BlenderbotSmallForConditionalGeneration', BlenderbotSmallForConditionalGeneration]], ]); const MODEL_WITH_LM_HEAD_MAPPING_NAMES = new Map([ diff --git a/src/tokenizers.js b/src/tokenizers.js index d0f47923e..9a0d295eb 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -516,6 +516,7 @@ class BPE extends TokenizerModel { * @param {Object} config.vocab A mapping of tokens to ids. * @param {string} config.unk_token The unknown token used for out of vocabulary words. * @param {string} config.end_of_word_suffix The suffix to place at the end of each word. + * @param {string} [config.continuing_subword_suffix] The suffix to insert between words. * @param {Array} config.merges An array of BPE merges as strings. */ constructor(config) { @@ -539,6 +540,9 @@ class BPE extends TokenizerModel { this.end_of_word_suffix = config.end_of_word_suffix; + // NOTE: `continuing_subword_suffix` is custom (to support `BlenderbotSmallTokenizer`) + this.continuing_subword_suffix = config.continuing_subword_suffix ?? null; + this.byte_fallback = this.config.byte_fallback ?? false; if (this.byte_fallback) { @@ -665,6 +669,14 @@ class BPE extends TokenizerModel { result = word; } + // Possibly append suffix + if (this.continuing_subword_suffix) { + // Do not append suffix to the last token + for (let i = 0; i < result.length - 1; ++i) { + result[i] += this.continuing_subword_suffix; + } + } + // Save the result to the cache this.cache.set(token, result); @@ -1116,6 +1128,8 @@ class PreTokenizer extends Callable { return new PunctuationPreTokenizer(config); case 'Digits': return new DigitsPreTokenizer(config); + case 'Replace': + return new ReplacePreTokenizer(config); default: throw new Error(`Unknown PreTokenizer type: ${config.type}`); } @@ -2079,6 +2093,34 @@ class WhitespaceSplit extends PreTokenizer { } } +// NOTE: `ReplacePreTokenizer` is custom (to support `BlenderbotSmallTokenizer`) +class ReplacePreTokenizer extends PreTokenizer { + /** + * @param {Object} config The configuration options for the pre-tokenizer. + * @param {Object} config.pattern The pattern used to split the text. Can be a string or a regex object. + * @param {string} config.content What to replace the pattern with. + */ + constructor(config) { + super(); + this.config = config; + this.pattern = createPattern(this.config.pattern); + this.content = this.config.content; + } + + /** + * Pre-tokenizes the input text by replacing certain characters. + * @param {string} text The text to be pre-tokenized. + * @returns {string[]} An array of tokens produced by replacing certain characters. + */ + pre_tokenize_text(text) { + if (this.pattern === null) { + return [text]; + } + return [text.replaceAll(this.pattern, this.config.content)]; + } +} + + export class PreTrainedTokenizer extends Callable { /** * Create a new PreTrainedTokenizer instance. @@ -3705,6 +3747,9 @@ export class MarianTokenizer extends PreTrainedTokenizer { export class Wav2Vec2CTCTokenizer extends PreTrainedTokenizer { } +export class BlenderbotTokenizer extends PreTrainedTokenizer { } +export class BlenderbotSmallTokenizer extends PreTrainedTokenizer { } + /** * Helper class which is used to instantiate pretrained tokenizers with the `from_pretrained` function. * The chosen tokenizer class is determined by the type specified in the tokenizer config. @@ -3744,6 +3789,8 @@ export class AutoTokenizer { FalconTokenizer, GPTNeoXTokenizer, Wav2Vec2CTCTokenizer, + BlenderbotTokenizer, + BlenderbotSmallTokenizer, // Base case: PreTrainedTokenizer, diff --git a/tests/generate_tests.py b/tests/generate_tests.py index 37fd5e43b..2e816459a 100644 --- a/tests/generate_tests.py +++ b/tests/generate_tests.py @@ -63,6 +63,13 @@ "weird \uFF5E edge \uFF5E case", ], "custom": { + "facebook/blenderbot_small-90M": [ + # Test special tokens + "__start__hello world__end__", + # The original (python) tokenizer simply joins by spaces (regardless of special tokens or not) + "__start__ hey __end__" # --> ... --> "__start__ hey __end__" + "__start__hey __end__" # --> ... --> "__start__ hey __end__" + ], "tiiuae/falcon-7b": [ "12 and 123 and 1234", # Special case for splitting on 3 numbers ],