From 3b2521717ef5ccf953f6a2e6eb3cb80ac23bfd80 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 17 Apr 2024 19:14:07 +0200 Subject: [PATCH] Add `ignore_merges` option to BPE tokenizers --- src/tokenizers.js | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/tokenizers.js b/src/tokenizers.js index ca0c2ab6c..ce60f7d1e 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -630,10 +630,12 @@ class BPE extends TokenizerModel { * Create a BPE instance. * @param {Object} config The configuration object for BPE. * @param {Object} config.vocab A mapping of tokens to ids. + * @param {string[]} config.merges An array of BPE merges as strings. * @param {string} config.unk_token The unknown token used for out of vocabulary words. * @param {string} config.end_of_word_suffix The suffix to place at the end of each word. * @param {string} [config.continuing_subword_suffix] The suffix to insert between words. - * @param {Array} config.merges An array of BPE merges as strings. + * @param {boolean} [config.byte_fallback=false] Whether to use the SentencePiece (spm) byte-fallback trick. + * @param {boolean} [config.ignore_merges=false] Whether or not to match tokens with the vocab before using merges. */ constructor(config) { super(config); @@ -665,6 +667,8 @@ class BPE extends TokenizerModel { this.text_encoder = new TextEncoder(); } + this.ignore_merges = this.config.ignore_merges ?? false; + /** @type {Map} */ this.cache = new Map(); } @@ -826,6 +830,10 @@ class BPE extends TokenizerModel { const outputTokens = []; for (const token of tokens) { + if (this.ignore_merges && this.tokens_to_ids.has(token)) { + outputTokens.push(token); + continue; + } const bpe_token_list = this.bpe(token); for (const t of bpe_token_list) {