Commit

revert some changes
lvdongyi committed Oct 17, 2024
1 parent 353fb41 commit d279d8d
Showing 17 changed files with 659 additions and 783 deletions.
@@ -26,7 +26,7 @@ msgid "Tokenization class for FNet model."
msgstr ""

#: of paddlenlp.transformers.fnet.tokenizer.FNetTokenizer:1
msgid "基类::class:`paddlenlp.transformers.albert_english.tokenizer.AlbertEnglishTokenizer`"
msgid "基类::class:`paddlenlp.transformers.albert.tokenizer.AlbertEnglishTokenizer`"
msgstr ""

#: of paddlenlp.transformers.fnet.tokenizer.FNetTokenizer:1
@@ -22,7 +22,7 @@ msgid "tokenizer"
msgstr ""

#: of paddlenlp.transformers.reformer.tokenizer.ReformerTokenizer:1
msgid "基类::class:`paddlenlp.transformers.albert_english.tokenizer.AlbertEnglishTokenizer`"
msgid "基类::class:`paddlenlp.transformers.albert.tokenizer.AlbertEnglishTokenizer`"
msgstr ""

#: of paddlenlp.transformers.reformer.tokenizer.ReformerTokenizer:1
@@ -22,7 +22,7 @@ msgid "tokenizer"
msgstr ""

#: of paddlenlp.transformers.t5.tokenizer.T5Tokenizer:1
msgid "基类::class:`paddlenlp.transformers.albert_english.tokenizer.AlbertEnglishTokenizer`"
msgid "基类::class:`paddlenlp.transformers.albert.tokenizer.AlbertEnglishTokenizer`"
msgstr ""

#: of paddlenlp.transformers.t5.tokenizer.T5Tokenizer:1
3 changes: 0 additions & 3 deletions paddlenlp/transformers/__init__.py
@@ -64,8 +64,6 @@
from .albert.configuration import *
from .albert.modeling import *
from .albert.tokenizer import *
- from .albert_chinese.tokenizer import *
- from .albert_english.tokenizer import *
from .bit.modeling import *
from .bit.configuration import *
from .bit.image_processing import *
@@ -143,7 +141,6 @@
from .mbart.modeling import *
from .mbart.tokenizer import *
from .mbart.configuration import *
- from .mbart50.tokenizer import *
from .megatronbert.modeling import *
from .megatronbert.tokenizer import *
from .megatronbert.configuration import *
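With these re-exports removed from paddlenlp/transformers/__init__.py, the Albert tokenizer classes come from the albert subpackage again, as the next file in this diff shows. A minimal import sketch (the module path is taken from this diff; everything outside the import itself is an assumption):

# Minimal sketch: where the two tokenizers are expected to live after this revert.
# The module path comes from the albert/tokenizer.py changes shown below.
from paddlenlp.transformers.albert.tokenizer import (
    AlbertChineseTokenizer,
    AlbertEnglishTokenizer,
)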
330 changes: 327 additions & 3 deletions paddlenlp/transformers/albert/tokenizer.py
@@ -14,9 +14,13 @@
# limitations under the License.
"""Tokenization class for ALBERT model."""

- from .. import AddedToken, PretrainedTokenizer
- from ..albert_chinese.tokenizer import AlbertChineseTokenizer
- from ..albert_english.tokenizer import AlbertEnglishTokenizer
import os
import unicodedata
from shutil import copyfile

import sentencepiece as spm

from .. import PretrainedTokenizer, BertTokenizer, AddedToken

__all__ = ["AlbertTokenizer"]

@@ -475,3 +479,323 @@ def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):

def save_resources(self, save_directory):
return self.tokenizer.save_resources(save_directory)


class AlbertEnglishTokenizer(PretrainedTokenizer):
resource_files_names = {
"sentencepiece_model_file": "spiece.model",
}

pretrained_resource_files_map = {
"sentencepiece_model_file": {
"albert-base-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-base-v1.spiece.model",
"albert-large-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-large-v1.spiece.model",
"albert-xlarge-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xlarge-v1.spiece.model",
"albert-xxlarge-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xxlarge-v1.spiece.model",
"albert-base-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-base-v2.spiece.model",
"albert-large-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-large-v2.spiece.model",
"albert-xlarge-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xlarge-v2.spiece.model",
"albert-xxlarge-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xxlarge-v2.spiece.model",
},
}

pretrained_init_configuration = {
"albert-base-v1": {
"do_lower_case": True,
"remove_space": True,
"keep_accents": False,
"unk_token": "<unk>",
"pad_token": "<pad>",
},
"albert-large-v1": {
"do_lower_case": True,
"remove_space": True,
"keep_accents": False,
"unk_token": "<unk>",
"pad_token": "<pad>",
},
"albert-xlarge-v1": {
"do_lower_case": True,
"remove_space": True,
"keep_accents": False,
"unk_token": "<unk>",
"pad_token": "<pad>",
},
"albert-xxlarge-v1": {
"do_lower_case": True,
"remove_space": True,
"keep_accents": False,
"unk_token": "<unk>",
"pad_token": "<pad>",
},
"albert-base-v2": {
"do_lower_case": True,
"remove_space": True,
"keep_accents": False,
"unk_token": "<unk>",
"pad_token": "<pad>",
},
"albert-large-v2": {
"do_lower_case": True,
"remove_space": True,
"keep_accents": False,
"unk_token": "<unk>",
"pad_token": "<pad>",
},
"albert-xlarge-v2": {
"do_lower_case": True,
"remove_space": True,
"keep_accents": False,
"unk_token": "<unk>",
"pad_token": "<pad>",
},
"albert-xxlarge-v2": {
"do_lower_case": True,
"remove_space": True,
"keep_accents": False,
"unk_token": "<unk>",
"pad_token": "<pad>",
},
}
max_model_input_sizes = {
"albert-base-v1": 512,
"albert-large-v1": 512,
"albert-xlarge-v1": 512,
"albert-xxlarge-v1": 512,
"albert-base-v2": 512,
"albert-large-v2": 512,
"albert-xlarge-v2": 512,
"albert-xxlarge-v2": 512,
}

def __init__(
self,
sentencepiece_model_file,
do_lower_case=True,
remove_space=True,
keep_accents=False,
bos_token="[CLS]",
eos_token="[SEP]",
unk_token="<unk>",
sep_token="[SEP]",
pad_token="<pad>",
cls_token="[CLS]",
mask_token="[MASK]",
sp_model_kwargs=None,
**kwargs
):

self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.sentencepiece_model_file = sentencepiece_model_file

self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(sentencepiece_model_file)

@property
def vocab_size(self):
return len(self.sp_model)

def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab

def __getstate__(self):
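# Drop the SentencePiece processor before pickling; the C++-backed processor is
# not expected to pickle cleanly. __setstate__ below rebuilds it from
# self.sentencepiece_model_file.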
state = self.__dict__.copy()
state["sp_model"] = None
return state

def __setstate__(self, d):
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}

self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.sentencepiece_model_file)

def preprocess_text(self, inputs):
if self.remove_space:
outputs = " ".join(inputs.strip().split())
else:
outputs = inputs
outputs = outputs.replace("``", '"').replace("''", '"')

if not self.keep_accents:
outputs = unicodedata.normalize("NFKD", outputs)
outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
if self.do_lower_case:
outputs = outputs.lower()

return outputs

def _tokenize(self, text):
"""Tokenize a string."""
text = self.preprocess_text(text)
pieces = self.sp_model.encode(text, out_type=str)
new_pieces = []
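# Post-processing: a piece that ends with "," directly after a digit (e.g. "9,")
# is re-encoded without the trailing comma, any spurious leading underline on the
# re-encoded pieces is stripped, and the comma is appended as its own piece.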
for piece in pieces:
if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
if len(cur_pieces[0]) == 1:
cur_pieces = cur_pieces[1:]
else:
cur_pieces[0] = cur_pieces[0][1:]
cur_pieces.append(piece[-1])
new_pieces.extend(cur_pieces)
else:
new_pieces.append(piece)

return new_pieces

def _convert_token_to_id(self, token):
"""Converts a token (str) to an id using the vocab."""
return self.sp_model.PieceToId(token)

def _convert_id_to_token(self, index):
"""Converts an index (integer) to a token (str) using the vocab."""
return self.sp_model.IdToPiece(index)

def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string

def num_special_tokens_to_add(self, pair=False):
token_ids_0 = []
token_ids_1 = []
return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))

def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return cls + token_ids_0 + sep
return cls + token_ids_0 + sep + token_ids_1 + sep

def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None):
if offset_mapping_1 is None:
return [(0, 0)] + offset_mapping_0 + [(0, 0)]

return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)]

def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):

if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]

def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
sep = [self.sep_token_id]
cls = [self.cls_token_id]

if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

def save_resources(self, save_directory):
for name, file_name in self.resource_files_names.items():
save_path = os.path.join(save_directory, file_name)
if os.path.abspath(self.sentencepiece_model_file) != os.path.abspath(save_path) and os.path.isfile(
self.sentencepiece_model_file
):
copyfile(self.sentencepiece_model_file, save_path)
elif not os.path.isfile(self.sentencepiece_model_file):
with open(save_path, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)


class AlbertChineseTokenizer(BertTokenizer):
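# Thin subclass of BertTokenizer: WordPiece tokenization is inherited unchanged;
# only the pretrained vocab URLs, init configurations, and max input sizes for
# the Chinese ALBERT checkpoints are overridden here.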
resource_files_names = {"vocab_file": "vocab.txt"}
pretrained_resource_files_map = {
"vocab_file": {
"albert-chinese-tiny": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-tiny.vocab.txt",
"albert-chinese-small": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-small.vocab.txt",
"albert-chinese-base": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-base.vocab.txt",
"albert-chinese-large": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-large.vocab.txt",
"albert-chinese-xlarge": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-xlarge.vocab.txt",
"albert-chinese-xxlarge": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-xxlarge.vocab.txt",
}
}
pretrained_init_configuration = {
"albert-chinese-tiny": {
"do_lower_case": False,
"unk_token": "[UNK]",
"pad_token": "[PAD]",
},
"albert-chinese-small": {
"do_lower_case": False,
"unk_token": "[UNK]",
"pad_token": "[PAD]",
},
"albert-chinese-base": {
"do_lower_case": False,
"unk_token": "[UNK]",
"pad_token": "[PAD]",
},
"albert-chinese-large": {
"do_lower_case": False,
"unk_token": "[UNK]",
"pad_token": "[PAD]",
},
"albert-chinese-xlarge": {
"do_lower_case": False,
"unk_token": "[UNK]",
"pad_token": "[PAD]",
},
"albert-chinese-xxlarge": {
"do_lower_case": False,
"unk_token": "[UNK]",
"pad_token": "[PAD]",
},
}
max_model_input_sizes = {
"albert-chinese-tiny": 512,
"albert-chinese-small": 512,
"albert-chinese-base": 512,
"albert-chinese-large": 512,
"albert-chinese-xlarge": 512,
"albert-chinese-xxlarge": 512,
}

def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs
):
super(AlbertChineseTokenizer, self).__init__(
vocab_file,
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
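
For orientation, a hedged usage sketch of the two re-homed classes. It assumes PaddleNLP's usual PretrainedTokenizer interface (from_pretrained, tokenize, __call__) and the checkpoint names registered in the maps above; none of this is part of the diff itself.

# Hedged usage sketch -- assumes the standard PaddleNLP tokenizer workflow and
# the pretrained names listed in pretrained_resource_files_map above.
from paddlenlp.transformers.albert.tokenizer import (
    AlbertChineseTokenizer,
    AlbertEnglishTokenizer,
)

# SentencePiece-based English tokenizer (fetches spiece.model on first use).
en_tok = AlbertEnglishTokenizer.from_pretrained("albert-base-v2")
print(en_tok.tokenize("The revert moves both tokenizers back into albert/tokenizer.py."))

# WordPiece-based Chinese tokenizer, behavior inherited from BertTokenizer.
zh_tok = AlbertChineseTokenizer.from_pretrained("albert-chinese-base")
print(zh_tok("自然语言处理")["input_ids"])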