Skip to content

Commit

Permalink
Add huggingface without pre-tokenization
Browse files Browse the repository at this point in the history
  • Loading branch information
hendrikvanantwerpen committed Oct 9, 2024
1 parent fee4232 commit f0c61bf
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 8 deletions.
48 changes: 43 additions & 5 deletions crates/bpe/benchmarks/equivalence.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,50 @@
use bpe_benchmarks::*;

#[cfg(test)]
const N: usize = 32;

#[test]
fn test_encoding_equivalence_without_pretokenization() {
    for (_, bpe, _, huggingface) in TOKENIZERS.iter() {
        // Strip the pre-tokenizer so both encoders see the raw byte stream.
        let huggingface = without_pretokenizer(huggingface);
        let corpus = create_test_string(&bpe.bpe, 20000);
        // N random byte slices plus one fixed sample exercising multi-byte UTF-8.
        let samples = (0..N)
            .map(|_| select_test_bytes(corpus.as_bytes(), 100))
            .chain(std::iter::once(
                "You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
            ));
        for sample in samples {
            let sample_str = std::str::from_utf8(sample).unwrap();
            let out = bpe.bpe.encode_via_backtracking(sample);
            let huggingface_out: Vec<_> = huggingface
                .encode_fast(sample_str, false)
                .unwrap()
                .get_ids()
                .to_vec();
            if huggingface_out == out {
                continue;
            }
            // Token sequences disagree: decode both to tell whether the
            // mismatch also changes the round-tripped text.
            let text = bpe.decode(&out).unwrap();
            let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap();
            if huggingface_text != text {
                panic!(
                    "huggingface tokens and text differ: {:?} != {:?}",
                    text, huggingface_text
                );
            } else {
                panic!(
                    "huggingface tokens differ: {:?} != {:?}",
                    out, huggingface_out
                );
            }
        }
    }
}

#[test]
fn test_encoding_equivalence() {
fn test_encoding_equivalence_with_pretokenization() {
for (_, bpe, tiktoken, huggingface) in TOKENIZERS.iter() {
let text = create_test_string(&bpe.bpe, 20000);
let inputs = (0..32)
let inputs = (0..N)
.map(|_| select_test_bytes(text.as_bytes(), 100))
.chain(std::iter::once(
"You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
Expand All @@ -19,9 +59,7 @@ fn test_encoding_equivalence() {
.encode_fast(text, false)
.unwrap()
.get_ids()
.iter()
.copied()
.collect();
.to_vec();
if tiktoken_out2 != huggingface_out {
let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap();
if tiktoken_text != huggingface_text {
Expand Down
13 changes: 11 additions & 2 deletions crates/bpe/benchmarks/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use bpe::byte_pair_encoding::BytePairEncoding;
use bpe_openai::Tokenizer;
use rand::{thread_rng, Rng};
use tiktoken_rs::CoreBPE as TiktokenTokenizer;
use tokenizers::pre_tokenizers::byte_level::ByteLevel as HuggingfaceByteLevel;
use tokenizers::tokenizer::Tokenizer as HuggingfaceTokenizer;

pub static TOKENIZERS: LazyLock<
Expand All @@ -19,13 +20,13 @@ pub static TOKENIZERS: LazyLock<
"cl100k",
bpe_openai::cl100k(),
tiktoken_rs::cl100k_base().unwrap(),
{ HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4", None).unwrap() },
HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4", None).unwrap(),
),
(
"o200k",
bpe_openai::o200k(),
tiktoken_rs::o200k_base().unwrap(),
{ HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4o", None).unwrap() },
HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4o", None).unwrap(),
),
]
});
Expand Down Expand Up @@ -69,3 +70,11 @@ pub fn select_test_bytes(input: &[u8], bytes: usize) -> &[u8] {
}
&input[start..end]
}

/// Returns a clone of `enc` whose pre-tokenizer is replaced with a plain
/// byte-level stage, so input text is not split before BPE encoding.
pub fn without_pretokenizer(enc: &HuggingfaceTokenizer) -> HuggingfaceTokenizer {
    let mut tokenizer = enc.clone();
    // boolean values taken from Xenova's tokenizer config
    tokenizer.with_pre_tokenizer(Some(HuggingfaceByteLevel::new(false, false, false)));
    tokenizer
}
15 changes: 14 additions & 1 deletion crates/bpe/benchmarks/performance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ fn counting_benchmark(c: &mut Criterion) {
}

fn encoding_benchmark(c: &mut Criterion) {
for (name, bpe, _, _) in TOKENIZERS.iter() {
for (name, bpe, _, huggingface) in TOKENIZERS.iter() {
let huggingface = without_pretokenizer(huggingface);

let text = create_test_string(&bpe.bpe, 20000);
let input = text.as_bytes();

Expand Down Expand Up @@ -89,6 +91,17 @@ fn encoding_benchmark(c: &mut Criterion) {
criterion::BatchSize::SmallInput,
)
});
group.bench_with_input(
BenchmarkId::new("huggingface", bytes),
&bytes,
|b, bytes| {
b.iter_batched(
|| std::str::from_utf8(select_test_bytes(input, *bytes)).unwrap(),
|text| huggingface.encode_fast(text, false).unwrap(),
criterion::BatchSize::SmallInput,
)
},
);
}
group.finish();
}
Expand Down

0 comments on commit f0c61bf

Please sign in to comment.