Skip to content

Commit

Permalink
try fix
Browse files (browse the repository at this point in the history)
  • Loading branch information
lvdongyi committed Oct 18, 2024
1 parent e367332 commit a422932
Show file tree
Hide file tree
Showing 5 changed files with 13 additions and 218 deletions.
2 changes: 1 addition & 1 deletion paddlenlp/transformers/albert/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

from .. import AddedToken, BertTokenizer, PretrainedTokenizer

__all__ = ["AlbertTokenizer"]
__all__ = ["AlbertTokenizer", "AlbertChineseTokenizer", "AlbertEnglishTokenizer"]

SPIECE_UNDERLINE = "▁"

Expand Down
24 changes: 11 additions & 13 deletions paddlenlp/transformers/auto/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
else:
TOKENIZER_MAPPING_NAMES = OrderedDict(
[
("albert", (("AlbertTokenizer", "AlbertChineseTokenizer", "AlbertEnglishTokenizer"),)),
("albert", (("AlbertTokenizer", "AlbertChineseTokenizer", "AlbertEnglishTokenizer"), None)),
("bart", "BartTokenizer"),
("bert", "BertTokenizer"),
("blenderbot", "BlenderbotTokenizer"),
Expand Down Expand Up @@ -72,7 +72,7 @@
),
("luke", "LukeTokenizer"),
("mamba", "MambaTokenizer"),
("mbart", (("MBartTokenizer", "MBart50Tokenizer"),)),
("mbart", (("MBartTokenizer", "MBart50Tokenizer"), None)),
("mobilebert", "MobileBertTokenizer"),
("mpnet", "MPNetTokenizer"),
("nezha", "NeZhaTokenizer"),
Expand Down Expand Up @@ -140,11 +140,7 @@ def tokenizer_class_from_name(class_name: str):
for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items():
all_tokenizers = []
if isinstance(tokenizers, tuple):
if len(tokenizers) == 2:
tokenizer_slow, tokenizer_fast = tokenizers
else:
tokenizer_slow = tokenizers[0]
tokenizer_fast = None
(tokenizer_slow, tokenizer_fast) = tokenizers
if isinstance(tokenizer_slow, tuple):
all_tokenizers.extend(tokenizer_slow)
else:
Expand Down Expand Up @@ -409,18 +405,20 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
if model_type is not None:
tokenizer_class_py = TOKENIZER_MAPPING[type(config)]
if isinstance(tokenizer_class_py, (list, tuple)):
if len(tokenizer_class_py) == 2:
tokenizer_class_fast = tokenizer_class_py[1]
tokenizer_class_py = tokenizer_class_py[0]
else:
tokenizer_class_fast = None
(tokenizer_class_py, tokenizer_class_fast) = tokenizer_class_py
else:
tokenizer_class_fast = None
if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
else:
if tokenizer_class_py is not None:
return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
if isinstance(tokenizer_class_py, str):
return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
else:
# Use the first tokenizer class in the list
return tokenizer_class_py[0].from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
else:
raise ValueError(
"This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
Expand Down
1 change: 1 addition & 0 deletions paddlenlp/transformers/mbart/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from .configuration import *
from .tokenizer import *
13 changes: 0 additions & 13 deletions tests/transformers/mbart50/__init__.py

This file was deleted.

191 changes: 0 additions & 191 deletions tests/transformers/mbart50/test_tokenizer.py

This file was deleted.

0 comments on commit a422932

Please sign in to comment.