Skip to content

Commit

Permalink
Add whitespace pretokenizer (#542)
Browse files Browse the repository at this point in the history
* Add `WhitespacePreTokenizer`

* Add unit test for `Whitespace` pretokenizer
  • Loading branch information
xenova authored Jan 27, 2024
1 parent 497628f commit 03f2763
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 1 deletion.
26 changes: 25 additions & 1 deletion src/tokenizers.js
Original file line number Diff line number Diff line change
Expand Up @@ -1274,6 +1274,8 @@ class PreTokenizer extends Callable {
return new BertPreTokenizer(config);
case 'Sequence':
return new PreTokenizerSequence(config);
case 'Whitespace':
return new WhitespacePreTokenizer(config);
case 'WhitespaceSplit':
return new WhitespaceSplit(config);
case 'Metaspace':
Expand Down Expand Up @@ -2291,14 +2293,36 @@ class PreTokenizerSequence extends PreTokenizer {
}
}

/**
 * Pre-tokenizer that splits text on word boundaries, using the regular
 * expression `\w+|[^\w\s]+` (runs of word characters, or runs of
 * non-word non-space characters; whitespace itself is discarded).
 */
class WhitespacePreTokenizer extends PreTokenizer {
    /**
     * Creates an instance of WhitespacePreTokenizer.
     * @param {Object} config The configuration object for the pre-tokenizer.
     */
    constructor(config) {
        super();
    }

    /**
     * Pre-tokenizes the input text by splitting it on word boundaries.
     * @param {string} text The text to be pre-tokenized.
     * @param {Object} [options] Additional options for the pre-tokenization logic.
     * @returns {string[]} An array of tokens produced by splitting the input text on whitespace.
     */
    pre_tokenize_text(text, options) {
        // `String.prototype.match` returns `null` when there are no matches,
        // so fall back to an empty token list in that case.
        const tokens = text.match(/\w+|[^\w\s]+/g);
        return tokens ?? [];
    }
}

/**
* Splits a string of text by whitespace characters into individual tokens.
* @extends PreTokenizer
*/
class WhitespaceSplit extends PreTokenizer {
/**
* Creates an instance of WhitespaceSplit.
* @param {Object} config The configuration object for the pre-tokenizer sequence.
* @param {Object} config The configuration object for the pre-tokenizer.
*/
constructor(config) {
super();
Expand Down
4 changes: 4 additions & 0 deletions tests/generate_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@
# TODO: Add back when https://github.com/huggingface/transformers/issues/26318 is fixed
# 'Xenova/t5-tokenizer-new',
],
'bert': [
# Uses `Whitespace` pretokenizer
'Xenova/jina-embeddings-v2-base-zh-tokenizer',
],
}

MODELS_TO_IGNORE = [
Expand Down

0 comments on commit 03f2763

Please sign in to comment.