From 3a66cb1c0ff15533f716f2fac32d132fe3bc6299 Mon Sep 17 00:00:00 2001
From: Alexander Neubeck
Date: Fri, 18 Oct 2024 06:45:00 +0000
Subject: [PATCH] fix eof negative look-ahead

---
 crates/bpe-openai/src/lib.rs         | 8 +++++---
 crates/bpe/benchmarks/equivalence.rs | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/crates/bpe-openai/src/lib.rs b/crates/bpe-openai/src/lib.rs
index 2ce2113..5a7292b 100644
--- a/crates/bpe-openai/src/lib.rs
+++ b/crates/bpe-openai/src/lib.rs
@@ -14,7 +14,7 @@ static BPE_R50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
 static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat1 = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+";
+    let pat1 = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+$";
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
     Tokenizer::with_many(bpe, &[pat1, pat2, pat3]).expect("valid regex")
@@ -23,9 +23,10 @@
 static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+";
+    let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
     // Note: Rewrite the negative look-ahead with a positive pseudo look-ahead.
     // The look-ahead character is dropped from the match by the SpecialRegexp iterator.
+    // Note: The negative look-ahead also requires the pattern `\\s+$` to handle the end of file without dropping a character!
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
     Tokenizer::with_many(bpe, &[pat1, pat2, pat3]).expect("valid regex")
@@ -40,6 +41,7 @@ static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
         "\\p{N}{1,3}",
         " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
         "\\s*[\\r\\n]+",
+        "\\s+$",
     ].join("|");
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
@@ -109,7 +111,7 @@ impl Tokenizer {
 /// second pattern is always a look-ahead pattern, and that just a single character needs
 /// to be dropped. With this little hack, we can keep most of the regex patterns as they are,
 /// but achieve a >3x speedup.
-/// 
+///
 /// Alternatively, this could have been implemented with capture groups, but those were ~30%
 /// slower than this approach with multiple patterns.
 struct SpecialRegexp<'a> {
diff --git a/crates/bpe/benchmarks/equivalence.rs b/crates/bpe/benchmarks/equivalence.rs
index 7c71e4e..c9ea5cc 100644
--- a/crates/bpe/benchmarks/equivalence.rs
+++ b/crates/bpe/benchmarks/equivalence.rs
@@ -47,7 +47,7 @@ fn test_encoding_equivalence_with_pretokenization() {
     let inputs = (0..N)
         .map(|_| select_test_bytes(text.as_bytes(), 100))
         .chain(std::iter::once(
-            "You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
+            "You should see the Greek word 'kosme': \"κόσμε\" ".as_bytes(),
         ));
     for input in inputs {
         let text = std::str::from_utf8(input).unwrap();
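
Reviewer note: a minimal sketch of the failure mode this patch fixes. It uses the `regex` crate directly rather than this repo's actual `SpecialRegexp` plumbing; the pattern strings are the real ones from `lib.rs`, everything else (the `main` wrapper, the manual drop of the look-ahead byte) is illustrative only.

    use regex::Regex;

    fn main() {
        // pat2 from lib.rs: a positive pseudo look-ahead. The final `\s` stands
        // in for the negative look-ahead `(?!\S)` of the upstream tokenizer
        // regexes; the caller drops that extra character from the match again.
        let pat2 = Regex::new(r"^\s+\s").unwrap();
        // The alternative added by this patch: a whitespace run that extends to
        // the very end of the input, where no look-ahead character exists.
        let eof = Regex::new(r"^\s+$").unwrap();

        let tail = "  \t "; // trailing whitespace at the end of a document

        // The pseudo look-ahead deliberately consumes one character too many...
        let m = pat2.find(tail).unwrap();
        let token = &tail[..m.end() - 1]; // ...which is dropped again here.
        assert_eq!(token, "  \t");

        // The remainder is a single whitespace character at end of input.
        // `\s+\s` needs at least two characters, so without `\s+$` this byte
        // would silently vanish from the token stream.
        let rest = &tail[token.len()..];
        assert!(pat2.find(rest).is_none());
        assert_eq!(eof.find(rest).unwrap().as_str(), " ");
    }

This is also why the equivalence test now appends a trailing space to the Greek sample string: it pins down exactly the end-of-input whitespace case that the old patterns mishandled.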