MCore dataset compatibility for tokenizers (#8390)
* Add unique_identifiers for all tokenizers and eod for SentencePieceTokenizer

Signed-off-by: Valerie Sarge <[email protected]>

* Add generalized token aliases to TokenizerSpec to conform with MegatronTokenizer's interface. Remove now-redundant individual fixes from AutoTokenizer and SentencePieceTokenizer.

Signed-off-by: Valerie Sarge <[email protected]>

---------

Signed-off-by: Valerie Sarge <[email protected]>
Co-authored-by: Pablo Garay <[email protected]>
vysarge and pablo-garay authored Feb 11, 2024
1 parent 88d7b21 commit 400c4a1
Showing 2 changed files with 58 additions and 8 deletions.
@@ -122,9 +122,6 @@ def __init__(
             if token is not None and token not in self.tokenizer.get_vocab():
                 new_tokens_in_vocab.append(token)

-        # value is required for megatron-core
-        self.unique_identifiers = OrderedDict()
-
         if len(new_tokens_in_vocab) > 0:
             """
             Special tokens that were not previously included in the tokenizer's vocabulary file will be added to
@@ -231,11 +228,6 @@ def bos_id(self):
     def eos_id(self):
         return self.tokens_to_ids([getattr(self, 'eos_token')])[0]

-    @property
-    def eod(self):
-        """Returns EOS token id. Exact copy of the eos_id function. Required for megatron-core."""
-        return self.tokens_to_ids([getattr(self, 'eos_token')])[0]
-
     @property
     def sep_id(self):
         return self.tokens_to_ids([getattr(self, 'sep_token')])[0]
58 changes: 58 additions & 0 deletions nemo/collections/common/tokenizers/tokenizer_spec.py
@@ -13,6 +13,7 @@
 # limitations under the License.

 from abc import ABC, abstractmethod
+from collections import OrderedDict
 from typing import List

 __all__ = ['TokenizerSpec']
@@ -53,3 +54,60 @@ def add_special_tokens(self, special_tokens: List[str]):
     @property
     def name(self):
         return type(self).__name__
+
+    @property
+    def unique_identifiers(self):
+        """Property required for use with megatron-core datasets."""
+        return OrderedDict({"class": f"{type(self).__module__}.{type(self).__qualname__}"})
+
+    @property
+    def cls(self):
+        """Property alias to match MegatronTokenizer; returns cls_id if available."""
+        if hasattr(self, 'cls_id'):
+            return self.cls_id
+        raise AttributeError(f"{type(self).__name__} has no attribute 'cls' or 'cls_id'")
+
+    @property
+    def sep(self):
+        """Property alias to match MegatronTokenizer; returns sep_id if available."""
+        if hasattr(self, 'sep_id'):
+            return self.sep_id
+        raise AttributeError(f"{type(self).__name__} has no attribute 'sep' or 'sep_id'")
+
+    @property
+    def pad(self):
+        """Property alias to match MegatronTokenizer; returns pad_id if available."""
+        if hasattr(self, 'pad_id'):
+            return self.pad_id
+        raise AttributeError(f"{type(self).__name__} has no attribute 'pad' or 'pad_id'")
+
+    @property
+    def eod(self):
+        """Property alias to match MegatronTokenizer; returns eod_id if available."""
+        if hasattr(self, 'eod_id'):
+            return self.eod_id
+        if hasattr(self, 'eos_id'):
+            # Default to end-of-sentence id if end-of-document is not defined.
+            return self.eos_id
+        raise AttributeError(f"{type(self).__name__} has no attribute 'eod', 'eod_id', 'eos', or 'eos_id'")
+
+    @property
+    def bos(self):
+        """Property alias to match MegatronTokenizer; returns bos_id if available."""
+        if hasattr(self, 'bos_id'):
+            return self.bos_id
+        raise AttributeError(f"{type(self).__name__} has no attribute 'bos' or 'bos_id'")
+
+    @property
+    def eos(self):
+        """Property alias to match MegatronTokenizer; returns eos_id if available."""
+        if hasattr(self, 'eos_id'):
+            return self.eos_id
+        raise AttributeError(f"{type(self).__name__} has no attribute 'eos' or 'eos_id'")
+
+    @property
+    def mask(self):
+        """Property alias to match MegatronTokenizer; returns mask_id if available."""
+        if hasattr(self, 'mask_id'):
+            return self.mask_id
+        raise AttributeError(f"{type(self).__name__} has no attribute 'mask' or 'mask_id'")

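Taken together, the new TokenizerSpec aliases mean any NeMo tokenizer that already exposes the *_id properties automatically satisfies the attribute interface that megatron-core datasets expect. The snippet below is a minimal, self-contained sketch of that pattern; AliasedTokenizer, ToyTokenizer, and the id value are hypothetical stand-ins written for illustration, not classes from NeMo or megatron-core.

    from collections import OrderedDict


    class AliasedTokenizer:
        """Sketch of the alias pattern this commit adds to TokenizerSpec."""

        @property
        def unique_identifiers(self):
            # megatron-core datasets use this mapping to identify the tokenizer.
            return OrderedDict({"class": f"{type(self).__module__}.{type(self).__qualname__}"})

        @property
        def eod(self):
            if hasattr(self, 'eod_id'):
                return self.eod_id
            if hasattr(self, 'eos_id'):
                # Fall back to the end-of-sentence id when no end-of-document id exists.
                return self.eos_id
            raise AttributeError(f"{type(self).__name__} defines neither 'eod_id' nor 'eos_id'")


    class ToyTokenizer(AliasedTokenizer):
        eos_id = 2  # hypothetical id; real tokenizers derive this from their vocabulary


    tok = ToyTokenizer()
    print(tok.eod)                 # 2, via the eos_id fallback
    print(tok.unique_identifiers)  # OrderedDict mapping 'class' to '__main__.ToyTokenizer'

Because the fallbacks live on the base class, the per-tokenizer eod and unique_identifiers definitions become redundant, which is why the first file in this diff contains only deletions.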