Skip to content

Commit

Permalink
Fix ByteLevel pretokenizer
Browse files Browse the repository at this point in the history
Only add prefix space to first word, when option is enabled.
  • Loading branch information
xenova committed Sep 9, 2023
1 parent 32cf573 commit 9245f49
Showing 1 changed file with 9 additions and 10 deletions.
19 changes: 9 additions & 10 deletions src/tokenizers.js
Original file line number Diff line number Diff line change
Expand Up @@ -1229,19 +1229,18 @@ class ByteLevelPreTokenizer extends PreTokenizer {
* @returns {string[]} An array of tokens.
*/
pre_tokenize_text(text) {
// Add a leading space if the option is enabled
if (this.add_prefix_space && !text.startsWith(' ')) {
text = ' ' + text;
}

// Split on whitespace and punctuation
let tokens = this.use_regex ? (text.match(this.pattern) || []) : [text];

return tokens.map(token => {
if (this.add_prefix_space && !token.startsWith(' ')) {
token = ' ' + token;
}

// Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
token = Array.from(this.text_encoder.encode(token), byte => this.byte_encoder[byte]).join('');

return token;
});
// Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
return tokens.map(
token => Array.from(this.text_encoder.encode(token), byte => this.byte_encoder[byte]).join('')
);
}
}

Expand Down

0 comments on commit 9245f49

Please sign in to comment.