Skip to content

Commit

Permalink
allow longer tokens; fixes #98
Browse files Browse the repository at this point in the history
  • Loading branch information
mmoskal committed Apr 23, 2024
1 parent b082275 commit 2b6c846
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 5 deletions.
2 changes: 1 addition & 1 deletion controllers/aici_abi/src/recognizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ pub struct StackRecognizer<S: Copy, R: FunctionalRecognizer<S>> {

impl<S: Copy, R: FunctionalRecognizer<S>> StackRecognizer<S, R> {
pub fn from(rec: R) -> Self {
-        let stack = vec![rec.initial(); 130];
+        let stack = vec![rec.initial(); 300];
StackRecognizer {
rec,
stack,
Expand Down
12 changes: 8 additions & 4 deletions controllers/aici_abi/src/toktree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,9 @@ impl TrieNode {
}
}

+// max length of token is 1023 bytes
+const LEN_BITS: u32 = 10;

impl TokTrie {
pub fn from_host() -> Self {
let buffer = trie_bytes();
Expand All @@ -137,8 +140,9 @@ impl TokTrie {
if word.len() > 0 {
trie.insert(word, idx as u32);
}
-            assert!(word.len() < 0xff);
-            let desc = (word.len() as u32) | ((token_data.len() as u32) << 8);
+            assert!(word.len() < (1 << LEN_BITS));
+            assert!(token_data.len() < (1 << (32 - LEN_BITS)));
+            let desc = (word.len() as u32) | ((token_data.len() as u32) << LEN_BITS);
token_offsets.push(desc);
token_data.extend_from_slice(word);
}
Expand Down Expand Up @@ -292,8 +296,8 @@ impl TokTrie {

pub fn token(&self, idx: u32) -> &[u8] {
let off = self.token_offsets[idx as usize];
-        let len = off & 0xff;
-        let off = (off >> 8) as usize;
+        let len = off & ((1 << LEN_BITS) - 1);
+        let off = (off >> LEN_BITS) as usize;
&self.token_data[off..(off + len as usize)]
}

Expand Down

0 comments on commit 2b6c846

Please sign in to comment.