Skip to content

Commit

Permalink
Add whitespace pretokenizer (#542)
Browse files Browse the repository at this point in the history
* Add `WhitespacePreTokenizer`

* Add unit test for `Whitespace` pretokenizer
  • Loading branch information
xenova authored Jan 27, 2024
1 parent 497628f commit 03f2763
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 1 deletion.
26 changes: 25 additions & 1 deletion src/tokenizers.js
Original file line number Diff line number Diff line change
Expand Up @@ -1274,6 +1274,8 @@ class PreTokenizer extends Callable {
return new BertPreTokenizer(config);
case 'Sequence':
return new PreTokenizerSequence(config);
case 'Whitespace':
return new WhitespacePreTokenizer(config);
case 'WhitespaceSplit':
return new WhitespaceSplit(config);
case 'Metaspace':
Expand Down Expand Up @@ -2291,14 +2293,36 @@ class PreTokenizerSequence extends PreTokenizer {
}
}

/**
 * Pre-tokenizer that splits text on word boundaries, using the regular
 * expression `\w+|[^\w\s]+` (runs of word characters, or runs of
 * non-word non-space characters; whitespace itself is discarded).
 */
class WhitespacePreTokenizer extends PreTokenizer {
    /**
     * Creates an instance of WhitespacePreTokenizer.
     * @param {Object} config The configuration object for the pre-tokenizer.
     */
    constructor(config) {
        super();
    }

    /**
     * Pre-tokenizes the input text by splitting it on word boundaries.
     * @param {string} text The text to be pre-tokenized.
     * @param {Object} [options] Additional options for the pre-tokenization logic.
     * @returns {string[]} An array of tokens produced by splitting the input text on whitespace.
     */
    pre_tokenize_text(text, options) {
        // `String.prototype.match` returns `null` when there are no matches,
        // so fall back to an empty token list in that case.
        const tokens = text.match(/\w+|[^\w\s]+/g);
        return tokens ?? [];
    }
}

/**
* Splits a string of text by whitespace characters into individual tokens.
* @extends PreTokenizer
*/
class WhitespaceSplit extends PreTokenizer {
/**
* Creates an instance of WhitespaceSplit.
* @param {Object} config The configuration object for the pre-tokenizer sequence.
* @param {Object} config The configuration object for the pre-tokenizer.
*/
constructor(config) {
super();
Expand Down
4 changes: 4 additions & 0 deletions tests/generate_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@
# TODO: Add back when https://github.com/huggingface/transformers/issues/26318 is fixed
# 'Xenova/t5-tokenizer-new',
],
'bert': [
# Uses `Whitespace` pretokenizer
'Xenova/jina-embeddings-v2-base-zh-tokenizer',
],
}

MODELS_TO_IGNORE = [
Expand Down

0 comments on commit 03f2763

Please sign in to comment.