From 03f2763a274323d7285307d029f8403b0def4ee2 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Sat, 27 Jan 2024 15:13:20 +0200
Subject: [PATCH] Add whitespace pretokenizer (#542)

* Add `WhitespacePreTokenizer`

* Add unit test for `Whitespace` pretokenizer
---
 src/tokenizers.js       | 26 +++++++++++++++++++++++++-
 tests/generate_tests.py |  4 ++++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/src/tokenizers.js b/src/tokenizers.js
index 8cee6f2dc..a604b20ca 100644
--- a/src/tokenizers.js
+++ b/src/tokenizers.js
@@ -1274,6 +1274,8 @@ class PreTokenizer extends Callable {
             return new BertPreTokenizer(config);
         case 'Sequence':
             return new PreTokenizerSequence(config);
+        case 'Whitespace':
+            return new WhitespacePreTokenizer(config);
         case 'WhitespaceSplit':
             return new WhitespaceSplit(config);
         case 'Metaspace':
@@ -2291,6 +2293,28 @@ class PreTokenizerSequence extends PreTokenizer {
     }
 }
 
+/**
+ * Splits on word boundaries (using the regular expression `\w+|[^\w\s]+`).
+ */
+class WhitespacePreTokenizer extends PreTokenizer {
+    /**
+     * Creates an instance of WhitespacePreTokenizer.
+     * @param {Object} config The configuration object for the pre-tokenizer.
+     */
+    constructor(config) {
+        super();
+    }
+    /**
+     * Pre-tokenizes the input text by splitting it on word boundaries.
+     * @param {string} text The text to be pre-tokenized.
+     * @param {Object} [options] Additional options for the pre-tokenization logic.
+     * @returns {string[]} An array of tokens produced by splitting the input text on word boundaries.
+     */
+    pre_tokenize_text(text, options) {
+        return text.match(/\w+|[^\w\s]+/g) || [];
+    }
+}
+
 /**
  * Splits a string of text by whitespace characters into individual tokens.
  * @extends PreTokenizer
@@ -2298,7 +2322,7 @@ class PreTokenizerSequence extends PreTokenizer {
 class WhitespaceSplit extends PreTokenizer {
     /**
      * Creates an instance of WhitespaceSplit.
-     * @param {Object} config The configuration object for the pre-tokenizer sequence.
+     * @param {Object} config The configuration object for the pre-tokenizer.
      */
     constructor(config) {
         super();
diff --git a/tests/generate_tests.py b/tests/generate_tests.py
index d9c457b9d..f389d5cf6 100644
--- a/tests/generate_tests.py
+++ b/tests/generate_tests.py
@@ -28,6 +28,10 @@
         # TODO: Add back when https://github.com/huggingface/transformers/issues/26318 is fixed
         # 'Xenova/t5-tokenizer-new',
     ],
+    'bert': [
+        # Uses `Whitespace` pretokenizer
+        'Xenova/jina-embeddings-v2-base-zh-tokenizer',
+    ],
 }
 
 MODELS_TO_IGNORE = [
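
Reviewer notes (illustrative, not part of the patch). The regex `\w+|[^\w\s]+` emits runs of word characters and runs of punctuation as separate tokens, which is what distinguishes the new `Whitespace` pre-tokenizer from the existing `WhitespaceSplit`. Below is a minimal sketch of the two behaviors side by side; the helper names are made up for this example, and `whitespaceSplit` only approximates what `WhitespaceSplit` does:

```js
// Illustrative helpers, not the library's API.
// Mirrors WhitespacePreTokenizer: runs of word chars, or runs of punctuation.
const whitespacePreTokenize = (text) => text.match(/\w+|[^\w\s]+/g) || [];
// Approximates WhitespaceSplit: keep maximal runs of non-whitespace intact.
const whitespaceSplit = (text) => text.match(/\S+/g) || [];

console.log(whitespacePreTokenize('Hello, world!'));
// [ 'Hello', ',', 'world', '!' ]  -- punctuation split off
console.log(whitespaceSplit('Hello, world!'));
// [ 'Hello,', 'world!' ]          -- punctuation stays attached

// The `|| []` fallback matters: String.prototype.match returns null
// (not an empty array) when nothing matches, e.g. whitespace-only input.
console.log(whitespacePreTokenize('   ')); // []
```

Two details worth noting: the match-based approach deliberately drops the whitespace between tokens, and JavaScript's `\w` matches only `[A-Za-z0-9_]` (Unicode word characters would need `\p{…}` property escapes with the `u` flag), so non-Latin word characters fall into the `[^\w\s]+` branch. The unit test against a real tokenizer is what guards against divergence from the reference implementation.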
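
For an end-to-end check: `PreTokenizer.fromConfig` dispatches on the `type` field of the `pre_tokenizer` section in a model's `tokenizer.json`, and the tokenizer added to the test list declares a `Whitespace` pre-tokenizer there (per the comment in the test). A quick smoke-test sketch, assuming the `@xenova/transformers` package name in use at the time of this PR:

```js
import { AutoTokenizer } from '@xenova/transformers';

// Loading this tokenizer exercises the new `case 'Whitespace'` branch,
// since its tokenizer.json declares a `Whitespace` pre-tokenizer.
const tokenizer = await AutoTokenizer.from_pretrained(
    'Xenova/jina-embeddings-v2-base-zh-tokenizer',
);
const { input_ids } = await tokenizer('Hello, world!');
console.log(input_ids);
```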