Skip to content

Commit

Permalink
Add Tiktoken support for TRTLLM (#10306)
Browse files Browse the repository at this point in the history
* Add tiktoken tokenizer

* Add special token

* Remove unused import

* Apply isort and black reformatting

Signed-off-by: meatybobby <[email protected]>

* Remove unused import

* Fix after merge

* Change qnemo loading

* Apply isort and black reformatting

Signed-off-by: meatybobby <[email protected]>

* Clean up

---------

Signed-off-by: meatybobby <[email protected]>
Co-authored-by: meatybobby <[email protected]>
Co-authored-by: Matvei Novikov <[email protected]>
  • Loading branch information
3 people authored Nov 26, 2024
1 parent 080bcd7 commit 598c865
Show file tree
Hide file tree
Showing 4 changed files with 143 additions and 4 deletions.
3 changes: 3 additions & 0 deletions nemo/export/tensorrt_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,10 +480,13 @@ def export(

tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model")
tokenizer_path_nemo2 = os.path.join(nemo_export_dir, "nemo_context")
vocab_path = os.path.join(nemo_export_dir, "vocab.json")
if os.path.exists(tokenizer_path):
shutil.copy(tokenizer_path, self.model_dir)
elif os.path.exists(tokenizer_path_nemo2):
shutil.copytree(tokenizer_path_nemo2, Path(self.model_dir) / "nemo_context")
elif os.path.exists(vocab_path):
shutil.copy(vocab_path, os.path.join(self.model_dir, "vocab.json"))
else:
self.tokenizer.save_pretrained(os.path.join(self.model_dir, 'huggingface_tokenizer'))

Expand Down
123 changes: 123 additions & 0 deletions nemo/export/tiktoken_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import base64
import json
from pathlib import Path
from typing import Dict, Optional

import numpy as np
import tiktoken
import torch

# Regex used as the tiktoken ``pat_str``: splits text into candidate token
# chunks (cased letter runs with combining marks, single digits, punctuation
# runs, and whitespace), mirroring the pattern used by Mistral-style tokenizers.
PATTERN_TIKTOKEN = "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
# Total size of the token ID space (special tokens + regular vocabulary).
DEFAULT_TIKTOKEN_MAX_VOCAB = 2**17  # 131072
# Named special tokens; their list index is their token ID.
SPECIAL_TOKENS = ["<unk>", "<s>", "</s>"]
# Template for naming the remaining reserved special-token slots.
SPECIAL_TOKEN_TEMPLATE = "<SPECIAL_{id}>"


def reload_mergeable_ranks(
    path: str,
    max_vocab: Optional[int] = None,
) -> Dict[bytes, int]:
    """Reload a vocabulary JSON file and convert it to Tiktoken's
    mergeable-ranks format (token byte sequence -> rank).

    Args:
        path: Path to a ``.json`` file containing a list of entries, each a
            dict with exactly the keys ``rank``, ``token_bytes`` (base64
            encoded token bytes) and ``token_str``.
        max_vocab: If given, truncate the vocabulary to the first
            ``max_vocab`` entries before building the ranks.

    Returns:
        Mapping from decoded token bytes to the token's rank.

    Raises:
        ValueError: If the file extension, JSON structure, or entry contents
            are not as expected. (Explicit raises instead of ``assert`` so
            validation survives ``python -O``.)
    """
    if not path.endswith(".json"):
        raise ValueError(f"Expected a .json vocabulary file, got: {path}")

    # reload vocab
    with open(path, "r", encoding='utf-8') as f:
        vocab = json.load(f)
    if not isinstance(vocab, list):
        raise ValueError(f"Expected the vocabulary JSON to be a list, got {type(vocab).__name__}")
    print(f"Vocab size: {len(vocab)}")
    if max_vocab is not None:
        vocab = vocab[:max_vocab]
        print(f"Cutting vocab to first {len(vocab)} tokens.")

    # build ranks
    ranks: Dict[bytes, int] = {}
    for i, x in enumerate(vocab):
        if x.keys() != {"rank", "token_bytes", "token_str"}:
            raise ValueError(f"Unexpected keys in vocab entry {i}: {sorted(x.keys())}")
        if x["rank"] != i:
            raise ValueError(f"Vocab entry {i} has inconsistent rank {x['rank']}")
        merge = base64.b64decode(x["token_bytes"])
        # The first 256 ranks must be exactly the single raw bytes 0..255.
        if i < 256 and merge != bytes([i]):
            raise ValueError(f"Rank {i} does not decode to the single byte {i}")
        ranks[merge] = x["rank"]

    # sanity check: byte sequences are unique and ranks form a contiguous range
    if len(ranks) != len(vocab):
        raise ValueError("Duplicate token byte sequences found in the vocabulary")
    if set(ranks.values()) != set(range(len(ranks))):
        raise ValueError("Vocabulary ranks are not a contiguous range starting at 0")

    return ranks


class TiktokenTokenizer:
    """Tokenizer wrapping a ``tiktoken`` encoding built from a vocab.json file.

    The first ``num_special_tokens`` IDs (1000) are reserved for special
    tokens; IDs produced by the underlying tiktoken encoding are shifted up
    by that amount so the two ranges never collide.
    """

    def __init__(self, vocab_file: str):
        # Number of IDs reserved at the bottom of the ID space for specials.
        self.num_special_tokens = 1000
        special_tokens = SPECIAL_TOKENS.copy()

        # The regular vocabulary fills the remainder of the fixed ID space.
        inner_vocab_size = DEFAULT_TIKTOKEN_MAX_VOCAB - self.num_special_tokens
        token2id = reload_mergeable_ranks(vocab_file, max_vocab=inner_vocab_size)

        self.tokenizer = tiktoken.Encoding(
            name=Path(vocab_file).parent.name,
            pat_str=PATTERN_TIKTOKEN,
            mergeable_ranks=token2id,
            special_tokens={},  # special tokens are handled manually
        )

        # BOS / EOS IDs are their positions in the special-token list.
        self._bos_id = special_tokens.index("<s>")
        self._eos_id = special_tokens.index("</s>")

    def encode(self, text):
        """Encode *text*, shifting every ID past the reserved special range."""
        return [token + self.num_special_tokens for token in self.tokenizer.encode(text)]

    def decode(self, tokens):
        """Decode IDs to text, dropping special tokens and undoing the shift."""
        regular = []
        for token in tokens:
            if token in (self._bos_id, self._eos_id):
                continue
            if token >= self.num_special_tokens:
                regular.append(token - self.num_special_tokens)
        # Decode only if any regular tokens remain; otherwise empty string.
        return self.tokenizer.decode(regular) if regular else ""

    def batch_decode(self, ids):
        """Decode IDs that may arrive as a tensor/ndarray or a nested list.

        NOTE(review): only the first sequence of a nested batch is decoded —
        presumably callers pass a single sequence; confirm against callers.
        """
        if isinstance(ids, np.ndarray) or torch.is_tensor(ids):
            ids = ids.tolist()
        if isinstance(ids[0], list):
            ids = ids[0]
        return self.decode(ids)

    @property
    def pad_id(self):
        # Padding reuses the EOS ID.
        return self._eos_id

    @property
    def bos_token_id(self):
        return self._bos_id

    @property
    def eos_token_id(self):
        return self._eos_id
17 changes: 13 additions & 4 deletions nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer
from nemo.export.tarutils import TarPath, ZarrPathStore
from nemo.export.tiktoken_tokenizer import TiktokenTokenizer

LOGGER = logging.getLogger("NeMo")

Expand Down Expand Up @@ -235,7 +236,7 @@ def load_sharded_metadata(checkpoint_dir: Union[Path, TarPath], torch_tensor=Tru

def update_tokenizer_paths(tokenizer_config: Dict, unpacked_checkpoints_dir):
def _update_config_entry(key, file_pattern):
old_path = tokenizer_config[key]
old_path = tokenizer_config.get(key, None)
if old_path is None:
return
old_path = Path(old_path)
Expand All @@ -262,7 +263,7 @@ def copy_tokenizer_files(config, out_dir):
}

for key in basenames.keys():
if config[key] is None:
if config.get(key, None) is None:
continue

path = config[key]
Expand All @@ -275,13 +276,16 @@ def copy_tokenizer_files(config, out_dir):
continue

dst_path = out_dir / f"{basenames[key]}{path.suffix}"
config[key] = str(dst_path)
LOGGER.debug(f"Copy tokenizer {key}: {path}->{dst_path}")

# Copy 'path' to 'dst_path' without shutil.copy(...) because 'path' may be a TarPath
with path.open('rb') as infile:
with open(dst_path, 'wb') as outfile:
outfile.write(infile.read())

return config


def get_tokenizer(tokenizer_dir_or_path: Union[str, Path]) -> PreTrainedTokenizer:
"""Loads the tokenizer from the decoded NeMo weights dir."""
Expand All @@ -291,6 +295,10 @@ def get_tokenizer(tokenizer_dir_or_path: Union[str, Path]) -> PreTrainedTokenize

tokenizer_spec = io.load_context((tokenizer_dir_or_path / "nemo_context"), subpath="model.tokenizer")
return build_tokenizer(tokenizer_spec)
elif os.path.exists(os.path.join(tokenizer_dir_or_path, "vocab.json")):
vocab_path = tokenizer_dir_or_path / "vocab.json" if tokenizer_dir_or_path.is_dir() else tokenizer_dir_or_path
tokenizer_config = {"library": "tiktoken", "vocab_file": str(vocab_path)}
return build_tokenizer(tokenizer_config)
else:
if (tokenizer_dir_or_path / "huggingface_tokenizer").is_dir():
return AutoTokenizer.from_pretrained(tokenizer_dir_or_path / "huggingface_tokenizer")
Expand All @@ -307,6 +315,8 @@ def build_tokenizer(tokenizer):
tokenizer_config = tokenizer
if tokenizer_config["library"] == "sentencepiece":
return SentencePieceTokenizer(model_path=tokenizer_config["model"])
elif tokenizer_config["library"] == "tiktoken":
return TiktokenTokenizer(vocab_file=tokenizer_config["vocab_file"])
elif "GPT2" in tokenizer_config["type"]:
tokenizer = GPT2Tokenizer(tokenizer_config["vocab_file"], tokenizer_config["merge_file"])
else:
Expand Down Expand Up @@ -373,9 +383,8 @@ def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Pat
)
else:
tokenizer_config = update_tokenizer_paths(nemo_model_config["tokenizer"], unpacked_checkpoint_dir)
copy_tokenizer_files(tokenizer_config, nemo_export_dir)
tokenizer_config = copy_tokenizer_files(tokenizer_config, nemo_export_dir)

tokenizer_config["model"] = os.path.join(nemo_export_dir, "tokenizer.model")
tokenizer = build_tokenizer(tokenizer_config)
elif (nemo_dir / "weights").exists():
dist_ckpt_folder = nemo_dir / "weights"
Expand Down
4 changes: 4 additions & 0 deletions nemo/export/trt_llm/qnemo/tokenizer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from transformers import AutoTokenizer

from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer
from nemo.export.tiktoken_tokenizer import TiktokenTokenizer

# TODO: use get_nmt_tokenizer helper below to instantiate tokenizer once environment / dependencies get stable
# from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
Expand All @@ -45,6 +46,9 @@ def get_nmt_tokenizer(nemo_checkpoint_path: str):
tokenizer = SentencePieceTokenizer(
model_path=os.path.join(nemo_checkpoint_path, tokenizer_cfg.model), legacy=legacy
)
elif library == "tiktoken":
print(f"Getting TiktokenTokenizer with file: {tokenizer_cfg.vocab_file}")
tokenizer = TiktokenTokenizer(vocab_file=os.path.join(nemo_checkpoint_path, tokenizer_cfg.vocab_file))
else:
raise NotImplementedError("Currently we only support 'huggingface' and 'sentencepiece' tokenizer libraries.")

Expand Down

0 comments on commit 598c865

Please sign in to comment.