add aicirt --logits-size
mmoskal committed Apr 16, 2024
1 parent 6310683 commit 59e90ab
Showing 2 changed files with 21 additions and 3 deletions.
9 changes: 8 additions & 1 deletion aicirt/src/main.rs
@@ -50,6 +50,10 @@ struct Cli {
     #[arg(short, long, default_value = "llama")]
     tokenizer: String,
 
+    /// Use if the tokenizer vocabulary is smaller than the dimension of logits
+    #[arg(long)]
+    logits_size: Option<usize>,
+
     /// Path to .wasm module to install
     #[arg(short, long)]
     module: Option<String>,
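For illustration, the new flag is meant for the case where the model emits more logits per step than the tokenizer has entries; a hypothetical invocation (the sizes and module path are made-up values, only the flags appear in this diff):

    aicirt --tokenizer llama --logits-size 32064 --module controller.wasm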
@@ -1158,7 +1162,10 @@ fn main() -> () {
         ff_tokens: true,
     };
 
-    let tokenizer = find_tokenizer(&cli.tokenizer).unwrap();
+    let mut tokenizer = find_tokenizer(&cli.tokenizer).unwrap();
+    if let Some(logits_size) = cli.logits_size {
+        tokenizer.add_missing_tokens(logits_size);
+    }
     let token_bytes = tokenizer.token_bytes();
     let wasm_ctx = WasmContext::new(inference_caps, limits.clone(), tokenizer).unwrap();
 
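One note on the wiring above: the padding runs before token_bytes() is snapshotted and before the tokenizer is moved into WasmContext::new, so every downstream consumer sees the logits-sized vocabulary. A sketch of that ordering as a standalone helper (the helper and its import path are illustrative assumptions, not part of this commit):

    // Hypothetical helper; assumes ByteTokenizer is importable from this path.
    use aici_native::bintokens::ByteTokenizer;

    fn padded_token_bytes(mut tokenizer: ByteTokenizer, logits_size: Option<usize>) -> Vec<Vec<u8>> {
        if let Some(n) = logits_size {
            tokenizer.add_missing_tokens(n); // grow the vocab to the logits dimension
        }
        tokenizer.token_bytes() // snapshot only after padding
    }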
15 changes: 13 additions & 2 deletions controllers/aici_native/src/bintokens.rs
@@ -317,9 +317,7 @@ impl ByteTokenizer {
 
         Ok(res)
     }
-}
 
-impl ByteTokenizer {
     pub fn tokrx_info(&self) -> TokRxInfo {
         TokRxInfo {
             vocab_size: self.vocab_size,
@@ -329,6 +327,19 @@ impl ByteTokenizer {
     pub fn token_bytes(&self) -> Vec<Vec<u8>> {
         self.token_bytes.clone()
     }
+
+    pub fn add_missing_tokens(&mut self, vocab_size: usize) {
+        assert!(self.vocab_size == self.token_bytes.len() as u32);
+        assert!(vocab_size >= self.token_bytes.len());
+        assert!(vocab_size - self.token_bytes.len() <= 200);
+        while self.token_bytes.len() < vocab_size {
+            let idx = self.token_bytes.len();
+            let name = format!("<AddedToken_{idx}>");
+            self.token_bytes.push(name.as_bytes().to_vec());
+            self.vocab_size += 1;
+            self.special.insert(name, idx as u32);
+        }
+    }
 }
 
 pub struct ByteTokenizerEnv {
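To see the padding in isolation: each missing slot gets a synthetic special token named <AddedToken_{idx}> whose index equals its position in the vocabulary, and the 200-token cap guards against pairing a tokenizer with a wildly mismatched model. A minimal, runnable sketch of the same logic (SimpleTokenizer and its fields are stand-ins for ByteTokenizer, not the real aici_native types):

    use std::collections::HashMap;

    struct SimpleTokenizer {
        vocab_size: u32,
        token_bytes: Vec<Vec<u8>>,
        special: HashMap<String, u32>,
    }

    impl SimpleTokenizer {
        // Pad the vocabulary with placeholder special tokens until it
        // matches the requested logits dimension.
        fn add_missing_tokens(&mut self, vocab_size: usize) {
            assert!(vocab_size >= self.token_bytes.len());
            while self.token_bytes.len() < vocab_size {
                let idx = self.token_bytes.len();
                let name = format!("<AddedToken_{idx}>");
                self.token_bytes.push(name.as_bytes().to_vec());
                self.vocab_size += 1;
                self.special.insert(name, idx as u32);
            }
        }
    }

    fn main() {
        let mut tok = SimpleTokenizer {
            vocab_size: 3,
            token_bytes: vec![b"a".to_vec(), b"b".to_vec(), b"c".to_vec()],
            special: HashMap::new(),
        };
        tok.add_missing_tokens(5);
        assert_eq!(tok.vocab_size, 5);
        assert_eq!(tok.token_bytes[4], b"<AddedToken_4>".to_vec());
    }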
