From 2b6c846d8f8cc0db73cf82c936e5242f9323faec Mon Sep 17 00:00:00 2001 From: Michal Moskal Date: Tue, 23 Apr 2024 22:20:57 +0000 Subject: [PATCH] allow longer tokens; fixes #98 --- controllers/aici_abi/src/recognizer.rs | 2 +- controllers/aici_abi/src/toktree.rs | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/controllers/aici_abi/src/recognizer.rs b/controllers/aici_abi/src/recognizer.rs index 7045da6e..588059e8 100644 --- a/controllers/aici_abi/src/recognizer.rs +++ b/controllers/aici_abi/src/recognizer.rs @@ -50,7 +50,7 @@ pub struct StackRecognizer> { impl> StackRecognizer { pub fn from(rec: R) -> Self { - let stack = vec![rec.initial(); 130]; + let stack = vec![rec.initial(); 300]; StackRecognizer { rec, stack, diff --git a/controllers/aici_abi/src/toktree.rs b/controllers/aici_abi/src/toktree.rs index 6ec6371a..c58f1246 100644 --- a/controllers/aici_abi/src/toktree.rs +++ b/controllers/aici_abi/src/toktree.rs @@ -122,6 +122,9 @@ impl TrieNode { } } +// max length of token is 1023 bytes +const LEN_BITS: u32 = 10; + impl TokTrie { pub fn from_host() -> Self { let buffer = trie_bytes(); @@ -137,8 +140,9 @@ impl TokTrie { if word.len() > 0 { trie.insert(word, idx as u32); } - assert!(word.len() < 0xff); - let desc = (word.len() as u32) | ((token_data.len() as u32) << 8); + assert!(word.len() < (1 << LEN_BITS)); + assert!(token_data.len() < (1 << (32 - LEN_BITS))); + let desc = (word.len() as u32) | ((token_data.len() as u32) << LEN_BITS); token_offsets.push(desc); token_data.extend_from_slice(word); } @@ -292,8 +296,8 @@ impl TokTrie { pub fn token(&self, idx: u32) -> &[u8] { let off = self.token_offsets[idx as usize]; - let len = off & 0xff; - let off = (off >> 8) as usize; + let len = off & ((1 << LEN_BITS) - 1); + let off = (off >> LEN_BITS) as usize; &self.token_data[off..(off + len as usize)] }