From 400c4a110121135009fbaea3d19034ab758a1b8b Mon Sep 17 00:00:00 2001
From: Valerie Sarge
Date: Sun, 11 Feb 2024 12:56:56 -0800
Subject: [PATCH] MCore dataset compatibility for tokenizers (#8390)

* Add unique_identifiers for all tokenizers and eod for SentencePieceTokenizer

Signed-off-by: Valerie Sarge

* Add generalized token aliases to TokenizerSpec to conform with
  MegatronTokenizer's interface. Remove now-redundant individual fixes from
  AutoTokenizer and SentencePieceTokenizer.

Signed-off-by: Valerie Sarge

---------

Signed-off-by: Valerie Sarge
Co-authored-by: Pablo Garay
---
 .../tokenizers/huggingface/auto_tokenizer.py |  8 ---
 .../common/tokenizers/tokenizer_spec.py      | 58 +++++++++++++++++++
 2 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py
index 4ed5dc07dbff..25d280f53f06 100644
--- a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py
+++ b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py
@@ -122,9 +122,6 @@ def __init__(
             if token is not None and token not in self.tokenizer.get_vocab():
                 new_tokens_in_vocab.append(token)
 
-        # value is required for megatron-core
-        self.unique_identifiers = OrderedDict()
-
         if len(new_tokens_in_vocab) > 0:
             """
             Special tokens that were not previously included in the tokenizer's vocabulary file will be added to
@@ -231,11 +228,6 @@ def bos_id(self):
     def eos_id(self):
         return self.tokens_to_ids([getattr(self, 'eos_token')])[0]
 
-    @property
-    def eod(self):
-        """Returns EOS token id. Exact copy of the eos_id function. Required for megatron-core."""
-        return self.tokens_to_ids([getattr(self, 'eos_token')])[0]
-
     @property
     def sep_id(self):
         return self.tokens_to_ids([getattr(self, 'sep_token')])[0]
diff --git a/nemo/collections/common/tokenizers/tokenizer_spec.py b/nemo/collections/common/tokenizers/tokenizer_spec.py
index 252571d76ef2..f6e905d75c3b 100644
--- a/nemo/collections/common/tokenizers/tokenizer_spec.py
+++ b/nemo/collections/common/tokenizers/tokenizer_spec.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from abc import ABC, abstractmethod
+from collections import OrderedDict
 from typing import List
 
 __all__ = ['TokenizerSpec']
@@ -53,3 +54,60 @@ def add_special_tokens(self, special_tokens: List[str]):
     @property
     def name(self):
         return type(self).__name__
+
+    @property
+    def unique_identifiers(self):
+        """Property required for use with megatron-core datasets."""
+        return OrderedDict({"class": f"{type(self).__module__}.{type(self).__qualname__}"})
+
+    @property
+    def cls(self):
+        """Property alias to match MegatronTokenizer; returns cls_id if available."""
+        if hasattr(self, 'cls_id'):
+            return self.cls_id
+        raise AttributeError(f"{type(self).__name__} has no attribute 'cls' or 'cls_id'")
+
+    @property
+    def sep(self):
+        """Property alias to match MegatronTokenizer; returns sep_id if available."""
+        if hasattr(self, 'sep_id'):
+            return self.sep_id
+        raise AttributeError(f"{type(self).__name__} has no attribute 'sep' or 'sep_id'")
+
+    @property
+    def pad(self):
+        """Property alias to match MegatronTokenizer; returns pad_id if available."""
+        if hasattr(self, 'pad_id'):
+            return self.pad_id
+        raise AttributeError(f"{type(self).__name__} has no attribute 'pad' or 'pad_id'")
+
+    @property
+    def eod(self):
+        """Property alias to match MegatronTokenizer; returns eod_id if available."""
+        if hasattr(self, 'eod_id'):
+            return self.eod_id
+        if hasattr(self, 'eos_id'):
+            # Default to end-of-sentence id if end-of-document is not defined.
+            return self.eos_id
+        raise AttributeError(f"{type(self).__name__} has no attribute 'eod', 'eod_id', 'eos', or 'eos_id'")
+
+    @property
+    def bos(self):
+        """Property alias to match MegatronTokenizer; returns bos_id if available."""
+        if hasattr(self, 'bos_id'):
+            return self.bos_id
+        raise AttributeError(f"{type(self).__name__} has no attribute 'bos' or 'bos_id'")
+
+    @property
+    def eos(self):
+        """Property alias to match MegatronTokenizer; returns eos_id if available."""
+        if hasattr(self, 'eos_id'):
+            return self.eos_id
+        raise AttributeError(f"{type(self).__name__} has no attribute 'eos' or 'eos_id'")
+
+    @property
+    def mask(self):
+        """Property alias to match MegatronTokenizer; returns mask_id if available."""
+        if hasattr(self, 'mask_id'):
+            return self.mask_id
+        raise AttributeError(f"{type(self).__name__} has no attribute 'mask' or 'mask_id'")
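
Note (not part of the patch): a minimal sketch of how the new TokenizerSpec aliases behave. DemoTokenizer, its hard-coded ids, and the calls below are hypothetical and assume TokenizerSpec's usual abstract methods (text_to_tokens, tokens_to_text, tokens_to_ids, ids_to_tokens, text_to_ids, ids_to_text).

from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec


class DemoTokenizer(TokenizerSpec):
    """Hypothetical tokenizer used only to exercise the new alias properties."""

    eos_id = 2  # defines eos_id, but deliberately no eod_id or mask_id

    # Trivial implementations of the abstract methods so the class can be instantiated.
    def text_to_tokens(self, text):
        return text.split()

    def tokens_to_text(self, tokens):
        return " ".join(tokens)

    def tokens_to_ids(self, tokens):
        return [0 for _ in tokens]

    def ids_to_tokens(self, ids):
        return ["<unk>" for _ in ids]

    def text_to_ids(self, text):
        return self.tokens_to_ids(self.text_to_tokens(text))

    def ids_to_text(self, ids):
        return self.tokens_to_text(self.ids_to_tokens(ids))


tok = DemoTokenizer()
print(tok.unique_identifiers)  # OrderedDict containing {'class': '__main__.DemoTokenizer'}
print(tok.eod)                 # 2: falls back to eos_id because eod_id is not defined
# tok.mask would raise AttributeError, since neither 'mask' nor 'mask_id' is defined.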