From 49f9972705d6b33bc0a453f73b9d60381bbdcbdf Mon Sep 17 00:00:00 2001
From: Xingyao Wang
Date: Sun, 22 Oct 2023 03:18:48 +0000
Subject: [PATCH 1/4] only add special tokens when --no_new_tokens; default pad to eos;

---
 megatron/tokenizer/tokenizer.py | 41 ++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index d675396..90adc28 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -350,8 +350,8 @@ def _initalize(self, vocab_extra_ids, vocab_extra_ids_list, new_tokens):
             self._vocab[t] = i
 
         def _add_special_token(t):
-            if t not in self.vocab and not new_tokens:
-                return
+            # if t not in self.vocab and not new_tokens:
+            #     return
             if t not in self._vocab:
                 next_id = len(self._vocab)
                 self._vocab[t] = next_id
@@ -359,22 +359,23 @@ def _add_special_token(t):
             self._special_tokens[t] = self._vocab[t]
             self._inv_special_tokens[self._vocab[t]] = t
 
-        _add_special_token('<CLS>')
-        self._cls_id = self._vocab.get('<CLS>')
-        _add_special_token('<SEP>')
-        self._sep_id = self._vocab.get('<SEP>')
-        _add_special_token('<EOD>')
-        self._eod_id = self._vocab.get('<EOD>')
-        _add_special_token('<MASK>')
-        self._mask_id = self._vocab.get('<MASK>')
-
-        pad_id = self._tokenizer.pad_id()
-        try:
-            pad_token = self._tokenizer.id_to_piece(pad_id)
-        except IndexError:
-            pad_token = '<PAD>'
-        _add_special_token(pad_token)
-        self._pad_id = self._vocab.get(pad_token)
+        if new_tokens:
+            _add_special_token('<CLS>')
+            self._cls_id = self._vocab.get('<CLS>')
+            _add_special_token('<SEP>')
+            self._sep_id = self._vocab.get('<SEP>')
+            _add_special_token('<EOD>')
+            self._eod_id = self._vocab.get('<EOD>')
+            _add_special_token('<MASK>')
+            self._mask_id = self._vocab.get('<MASK>')
+
+            pad_id = self._tokenizer.pad_id()
+            try:
+                pad_token = self._tokenizer.id_to_piece(pad_id)
+            except IndexError:
+                pad_token = '<PAD>'
+            _add_special_token(pad_token)
+            self._pad_id = self._vocab.get(pad_token)
 
         bos_id = self._tokenizer.bos_id()
         try:
@@ -392,6 +393,10 @@ def _add_special_token(t):
         _add_special_token(eos_token)
         self._eos_id = self._vocab.get(eos_token)
 
+        if not new_tokens:
+            # default to eos
+            self._pad_id = self._eos_id
+
         for i in range(vocab_extra_ids):
             t = "<extra_id_{}>".format(i)
             _add_special_token(t)

From 2e36fd85bf69c113811149c9f1eb710f2bf76dbb Mon Sep 17 00:00:00 2001
From: Xingyao Wang
Date: Tue, 24 Oct 2023 02:28:55 +0000
Subject: [PATCH 2/4] support not adding CLS, etc special token; fix the additional special token ordering bug due to `set`

---
 weights_conversion/megatron_to_hf.py | 41 ++++++++++++++--------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/weights_conversion/megatron_to_hf.py b/weights_conversion/megatron_to_hf.py
index b0cf596..c3cd056 100644
--- a/weights_conversion/megatron_to_hf.py
+++ b/weights_conversion/megatron_to_hf.py
@@ -371,36 +371,34 @@ def write_tokenizer(args: Namespace):
     # add default args for megatron tokenizer
     args.rank = 0
     args.vocab_extra_ids = 0
-    args.new_tokens = True
     args.make_vocab_size_divisible_by = 128
     args.tensor_model_parallel_size = 1
     mt_tokenizer = build_tokenizer(args)
 
     if args.tokenizer_type == "SentencePieceTokenizer":
-        if mt_tokenizer.cls is not None:
-            hf_tokenizer.add_tokens("<CLS>", special_tokens=True)
-            hf_tokenizer.cls_token_id = mt_tokenizer.cls
-        if mt_tokenizer.sep is not None:
-            hf_tokenizer.add_tokens("<SEP>", special_tokens=True)
-            hf_tokenizer.sep_token_id = mt_tokenizer.sep
-        if mt_tokenizer.eod is not None:
-            hf_tokenizer.add_tokens("<EOD>", special_tokens=True)
-        if mt_tokenizer.mask is not None:
-            hf_tokenizer.add_tokens("<MASK>", special_tokens=True)
-            hf_tokenizer.mask_token_id = mt_tokenizer.mask
-        if mt_tokenizer.pad is not None:
-            hf_tokenizer.add_tokens("<PAD>", special_tokens=True)
-            hf_tokenizer.pad_token_id = mt_tokenizer.pad
+        # Do not add def special tokens if requested
+        if args.new_tokens:
+            if mt_tokenizer.cls is not None:
+                hf_tokenizer.add_tokens("<CLS>", special_tokens=True)
+                hf_tokenizer.cls_token_id = mt_tokenizer.cls
+            if mt_tokenizer.sep is not None:
+                hf_tokenizer.add_tokens("<SEP>", special_tokens=True)
+                hf_tokenizer.sep_token_id = mt_tokenizer.sep
+            if mt_tokenizer.eod is not None:
+                hf_tokenizer.add_tokens("<EOD>", special_tokens=True)
+            if mt_tokenizer.mask is not None:
+                hf_tokenizer.add_tokens("<MASK>", special_tokens=True)
+                hf_tokenizer.mask_token_id = mt_tokenizer.mask
+            if mt_tokenizer.pad is not None:
+                hf_tokenizer.add_tokens("<PAD>", special_tokens=True)
+                hf_tokenizer.pad_token_id = mt_tokenizer.pad
 
         additional_special_tokens = hf_tokenizer.additional_special_tokens
-        special_tokens = {"additional_special_tokens": additional_special_tokens}
         if args.vocab_extra_ids_list:
             additional_special_tokens.extend(args.vocab_extra_ids_list.split(","))
-        hf_tokenizer.add_special_tokens(special_tokens_dict=special_tokens, replace_additional_special_tokens=True)
-
-        additional_special_tokens_ids = [mt_tokenizer.vocab.get(t) for t in additional_special_tokens]
-        hf_tokenizer.additional_special_tokens_ids = additional_special_tokens_ids
+        for special_token in additional_special_tokens:
+            hf_tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})
 
         hf_vocab = hf_tokenizer.get_vocab()
         tokens_to_check = [
@@ -453,6 +451,9 @@ def main():
                              help=("One or more arguments to override special tokens. "
                                    "Syntax set as `key=value`, e.g. `eos=<|im_end|>`. "
                                    "Overrides available only bos, cls, eos, mask, pad, sep, unk."))
+    parser.add_argument("--no_new_tokens", action="store_false", dest="new_tokens",
+                        help=("Do not add special tokens (e.g. CLS, MASK, etc) "
+                              "in the sentenciepiece tokenizer"))
     args = parser.parse_args()
 
     if args.model in {"llama", "llama2", "codellama"}:

From 6cf817ea71a0e4ef2606ed7bd332849868675410 Mon Sep 17 00:00:00 2001
From: Xingyao Wang
Date: Tue, 7 Nov 2023 15:40:32 -0600
Subject: [PATCH 3/4] use force=True to force add extra ids

---
 megatron/tokenizer/tokenizer.py      | 43 ++++++++++++++--------------
 weights_conversion/megatron_to_hf.py | 33 ++++++++++-----------
 2 files changed, 36 insertions(+), 40 deletions(-)

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index 90adc28..12d57e6 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -349,9 +349,9 @@ def _initalize(self, vocab_extra_ids, vocab_extra_ids_list, new_tokens):
             self._inv_vocab[i] = t
             self._vocab[t] = i
 
-        def _add_special_token(t):
-            # if t not in self.vocab and not new_tokens:
-            #     return
+        def _add_special_token(t, force=False):
+            if t not in self.vocab and not new_tokens and not force:
+                return
             if t not in self._vocab:
                 next_id = len(self._vocab)
                 self._vocab[t] = next_id
@@ -359,23 +359,22 @@ def _add_special_token(t):
             self._special_tokens[t] = self._vocab[t]
             self._inv_special_tokens[self._vocab[t]] = t
 
-        if new_tokens:
-            _add_special_token('<CLS>')
-            self._cls_id = self._vocab.get('<CLS>')
-            _add_special_token('<SEP>')
-            self._sep_id = self._vocab.get('<SEP>')
-            _add_special_token('<EOD>')
-            self._eod_id = self._vocab.get('<EOD>')
-            _add_special_token('<MASK>')
-            self._mask_id = self._vocab.get('<MASK>')
-
-            pad_id = self._tokenizer.pad_id()
-            try:
-                pad_token = self._tokenizer.id_to_piece(pad_id)
-            except IndexError:
-                pad_token = '<PAD>'
-            _add_special_token(pad_token)
-            self._pad_id = self._vocab.get(pad_token)
+        _add_special_token('<CLS>')
+        self._cls_id = self._vocab.get('<CLS>')
+        _add_special_token('<SEP>')
+        self._sep_id = self._vocab.get('<SEP>')
+        _add_special_token('<EOD>')
+        self._eod_id = self._vocab.get('<EOD>')
+        _add_special_token('<MASK>')
+        self._mask_id = self._vocab.get('<MASK>')
+
+        pad_id = self._tokenizer.pad_id()
+        try:
+            pad_token = self._tokenizer.id_to_piece(pad_id)
+        except IndexError:
+            pad_token = '<PAD>'
+        _add_special_token(pad_token)
+        self._pad_id = self._vocab.get(pad_token)
 
         bos_id = self._tokenizer.bos_id()
         try:
@@ -399,11 +398,11 @@ def _add_special_token(t):
 
         for i in range(vocab_extra_ids):
             t = "<extra_id_{}>".format(i)
-            _add_special_token(t)
+            _add_special_token(t, force=True)
             self._t5_tokens += [t]
         if vocab_extra_ids_list:
             for t in vocab_extra_ids_list.split(","):
-                _add_special_token(t)
+                _add_special_token(t, force=True)
         print("Special tokens: {}".format(self._special_tokens))
 
     @property
diff --git a/weights_conversion/megatron_to_hf.py b/weights_conversion/megatron_to_hf.py
index c3cd056..05b2f96 100644
--- a/weights_conversion/megatron_to_hf.py
+++ b/weights_conversion/megatron_to_hf.py
@@ -371,27 +371,27 @@ def write_tokenizer(args: Namespace):
     # add default args for megatron tokenizer
     args.rank = 0
     args.vocab_extra_ids = 0
+    args.new_tokens = True
     args.make_vocab_size_divisible_by = 128
     args.tensor_model_parallel_size = 1
     mt_tokenizer = build_tokenizer(args)
 
     if args.tokenizer_type == "SentencePieceTokenizer":
         # Do not add def special tokens if requested
-        if args.new_tokens:
-            if mt_tokenizer.cls is not None:
-                hf_tokenizer.add_tokens("<CLS>", special_tokens=True)
-                hf_tokenizer.cls_token_id = mt_tokenizer.cls
-            if mt_tokenizer.sep is not None:
-                hf_tokenizer.add_tokens("<SEP>", special_tokens=True)
-                hf_tokenizer.sep_token_id = mt_tokenizer.sep
-            if mt_tokenizer.eod is not None:
-                hf_tokenizer.add_tokens("<EOD>", special_tokens=True)
-            if mt_tokenizer.mask is not None:
-                hf_tokenizer.add_tokens("<MASK>", special_tokens=True)
-                hf_tokenizer.mask_token_id = mt_tokenizer.mask
-            if mt_tokenizer.pad is not None:
-                hf_tokenizer.add_tokens("<PAD>", special_tokens=True)
-                hf_tokenizer.pad_token_id = mt_tokenizer.pad
+        if mt_tokenizer.cls is not None:
+            hf_tokenizer.add_tokens("<CLS>", special_tokens=True)
+            hf_tokenizer.cls_token_id = mt_tokenizer.cls
+        if mt_tokenizer.sep is not None:
+            hf_tokenizer.add_tokens("<SEP>", special_tokens=True)
+            hf_tokenizer.sep_token_id = mt_tokenizer.sep
+        if mt_tokenizer.eod is not None:
+            hf_tokenizer.add_tokens("<EOD>", special_tokens=True)
+        if mt_tokenizer.mask is not None:
+            hf_tokenizer.add_tokens("<MASK>", special_tokens=True)
+            hf_tokenizer.mask_token_id = mt_tokenizer.mask
+        if mt_tokenizer.pad is not None:
+            hf_tokenizer.add_tokens("<PAD>", special_tokens=True)
+            hf_tokenizer.pad_token_id = mt_tokenizer.pad
 
         additional_special_tokens = hf_tokenizer.additional_special_tokens
         if args.vocab_extra_ids_list:
@@ -451,9 +451,6 @@ def main():
                              help=("One or more arguments to override special tokens. "
                                    "Syntax set as `key=value`, e.g. `eos=<|im_end|>`. "
                                    "Overrides available only bos, cls, eos, mask, pad, sep, unk."))
-    parser.add_argument("--no_new_tokens", action="store_false", dest="new_tokens",
-                        help=("Do not add special tokens (e.g. CLS, MASK, etc) "
-                              "in the sentenciepiece tokenizer"))
     args = parser.parse_args()
 
     if args.model in {"llama", "llama2", "codellama"}:

From 24a174d9da3e0b8a0247076e97ac4da8fcf42e4a Mon Sep 17 00:00:00 2001
From: Xingyao Wang
Date: Tue, 7 Nov 2023 15:41:10 -0600
Subject: [PATCH 4/4] remove comment

---
 weights_conversion/megatron_to_hf.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/weights_conversion/megatron_to_hf.py b/weights_conversion/megatron_to_hf.py
index 05b2f96..64980f4 100644
--- a/weights_conversion/megatron_to_hf.py
+++ b/weights_conversion/megatron_to_hf.py
@@ -377,7 +377,6 @@ def write_tokenizer(args: Namespace):
     mt_tokenizer = build_tokenizer(args)
 
     if args.tokenizer_type == "SentencePieceTokenizer":
-        # Do not add def special tokens if requested
         if mt_tokenizer.cls is not None:
             hf_tokenizer.add_tokens("<CLS>", special_tokens=True)
             hf_tokenizer.cls_token_id = mt_tokenizer.cls