diff --git a/README.md b/README.md
index 84b8c5992..9fb031c6e 100644
--- a/README.md
+++ b/README.md
@@ -289,6 +289,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
+1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
 1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme.
 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
diff --git a/docs/snippets/6_supported-models.snippet b/docs/snippets/6_supported-models.snippet
index 608c46042..dadcfa80b 100644
--- a/docs/snippets/6_supported-models.snippet
+++ b/docs/snippets/6_supported-models.snippet
@@ -25,6 +25,7 @@
 1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
 1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
 1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
+1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
 1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme.
 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
diff --git a/scripts/supported_models.py b/scripts/supported_models.py
index 8a114bb5b..c790216a0 100644
--- a/scripts/supported_models.py
+++ b/scripts/supported_models.py
@@ -371,6 +371,14 @@
             'Intel/dpt-large',
         ],
     },
+    'electra': {
+        # Feature extraction
+        'feature-extraction': [
+            # NOTE: requires --task feature-extraction
+            'google/electra-small-discriminator',
+            'google/electra-base-discriminator',
+        ],
+    },
     'falcon': {
         # Text generation
         'text-generation': [
diff --git a/src/models.js b/src/models.js
index ae1f5814e..d9ad0c6cc 100644
--- a/src/models.js
+++ b/src/models.js
@@ -1538,6 +1538,80 @@ export class ConvBertForQuestionAnswering extends ConvBertPreTrainedModel {
 //////////////////////////////////////////////////
 
 
+//////////////////////////////////////////////////
+// Electra models
+export class ElectraPreTrainedModel extends PreTrainedModel { }
+
+/**
+ * The bare Electra Model transformer outputting raw hidden-states without any specific head on top.
+ * Identical to the BERT model except that it uses an additional linear layer between the embedding
+ * layer and the encoder if the hidden size and embedding size are different.
+ */
+export class ElectraModel extends ElectraPreTrainedModel { }
+// TODO add ElectraForPreTraining
+/**
+ * Electra model with a language modeling head on top.
+ */
+export class ElectraForMaskedLM extends ElectraPreTrainedModel {
+    /**
+     * Calls the model on new inputs.
+     *
+     * @param {Object} model_inputs The inputs to the model.
+     * @returns {Promise<MaskedLMOutput>} An object containing the model's output logits for masked language modeling.
+     */
+    async _call(model_inputs) {
+        return new MaskedLMOutput(await super._call(model_inputs));
+    }
+}
+
+/**
+ * ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output).
+ */
+export class ElectraForSequenceClassification extends ElectraPreTrainedModel {
+    /**
+     * Calls the model on new inputs.
+     *
+     * @param {Object} model_inputs The inputs to the model.
+     * @returns {Promise<SequenceClassifierOutput>} An object containing the model's output logits for sequence classification.
+     */
+    async _call(model_inputs) {
+        return new SequenceClassifierOutput(await super._call(model_inputs));
+    }
+}
+
+/**
+ * Electra model with a token classification head on top.
+ */
+export class ElectraForTokenClassification extends ElectraPreTrainedModel {
+    /**
+     * Calls the model on new inputs.
+     *
+     * @param {Object} model_inputs The inputs to the model.
+     * @returns {Promise<TokenClassifierOutput>} An object containing the model's output logits for token classification.
+     */
+    async _call(model_inputs) {
+        return new TokenClassifierOutput(await super._call(model_inputs));
+    }
+}
+
+/**
+ * ELECTRA Model with a span classification head on top for extractive question-answering tasks like SQuAD
+ * (linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+ */
+export class ElectraForQuestionAnswering extends ElectraPreTrainedModel {
+    /**
+     * Calls the model on new inputs.
+     *
+     * @param {Object} model_inputs The inputs to the model.
+     * @returns {Promise<QuestionAnsweringModelOutput>} An object containing the model's output logits for question answering.
+     */
+    async _call(model_inputs) {
+        return new QuestionAnsweringModelOutput(await super._call(model_inputs));
+    }
+}
+//////////////////////////////////////////////////
+
+
 //////////////////////////////////////////////////
 // CamemBERT models
 export class CamembertPreTrainedModel extends PreTrainedModel { }
@@ -4464,6 +4538,7 @@ export class PretrainedMixin {
 const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
     ['bert', ['BertModel', BertModel]],
+    ['electra', ['ElectraModel', ElectraModel]],
     ['convbert', ['ConvBertModel', ConvBertModel]],
     ['camembert', ['CamembertModel', CamembertModel]],
     ['deberta', ['DebertaModel', DebertaModel]],
@@ -4546,6 +4621,7 @@ const MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = new Map([
 const MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = new Map([
     ['bert', ['BertForSequenceClassification', BertForSequenceClassification]],
+    ['electra', ['ElectraForSequenceClassification', ElectraForSequenceClassification]],
     ['convbert', ['ConvBertForSequenceClassification', ConvBertForSequenceClassification]],
     ['camembert', ['CamembertForSequenceClassification', CamembertForSequenceClassification]],
     ['deberta', ['DebertaForSequenceClassification', DebertaForSequenceClassification]],
@@ -4564,6 +4640,7 @@ const MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = new Map([
 const MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = new Map([
     ['bert', ['BertForTokenClassification', BertForTokenClassification]],
+    ['electra', ['ElectraForTokenClassification', ElectraForTokenClassification]],
     ['convbert', ['ConvBertForTokenClassification', ConvBertForTokenClassification]],
     ['camembert', ['CamembertForTokenClassification', CamembertForTokenClassification]],
     ['deberta', ['DebertaForTokenClassification', DebertaForTokenClassification]],
@@ -4607,6 +4684,7 @@ const MODEL_WITH_LM_HEAD_MAPPING_NAMES = new Map([
 const MODEL_FOR_MASKED_LM_MAPPING_NAMES = new Map([
     ['bert', ['BertForMaskedLM', BertForMaskedLM]],
+    ['electra', ['ElectraForMaskedLM', ElectraForMaskedLM]],
     ['convbert', ['ConvBertForMaskedLM', ConvBertForMaskedLM]],
     ['camembert', ['CamembertForMaskedLM', CamembertForMaskedLM]],
     ['deberta', ['DebertaForMaskedLM', DebertaForMaskedLM]],
@@ -4623,6 +4701,7 @@ const MODEL_FOR_MASKED_LM_MAPPING_NAMES = new Map([
 const MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
     ['bert', ['BertForQuestionAnswering', BertForQuestionAnswering]],
+    ['electra', ['ElectraForQuestionAnswering', ElectraForQuestionAnswering]],
     ['convbert', ['ConvBertForQuestionAnswering', ConvBertForQuestionAnswering]],
     ['camembert', ['CamembertForQuestionAnswering', CamembertForQuestionAnswering]],
     ['deberta', ['DebertaForQuestionAnswering', DebertaForQuestionAnswering]],
diff --git a/src/tokenizers.js b/src/tokenizers.js
index 284428bd7..c3efa497e 100644
--- a/src/tokenizers.js
+++ b/src/tokenizers.js
@@ -2755,6 +2755,12 @@ export class XLMTokenizer extends PreTrainedTokenizer {
         return add_token_types(inputs);
     }
 }
+export class ElectraTokenizer extends PreTrainedTokenizer {
+    /** @type {add_token_types} */
+    prepare_model_inputs(inputs) {
+        return add_token_types(inputs);
+    }
+}
 export class T5Tokenizer extends PreTrainedTokenizer { }
 export class GPT2Tokenizer extends PreTrainedTokenizer { }
@@ -3868,6 +3874,7 @@ export class AutoTokenizer {
         HerbertTokenizer,
         ConvBertTokenizer,
         XLMTokenizer,
+        ElectraTokenizer,
         MobileBertTokenizer,
         SqueezeBertTokenizer,
         AlbertTokenizer,
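Taken together, these changes register ELECTRA in the tokenizer and model auto-mappings, so converted checkpoints should be usable through the existing `pipeline` API without further code changes. Below is a minimal usage sketch, not part of the diff: the `Xenova/electra-small-discriminator` model id and the printed dimensions are illustrative assumptions, and the checkpoint is assumed to have been converted to ONNX with `--task feature-extraction`, as noted in `scripts/supported_models.py`.

```js
// Minimal sketch of using an ELECTRA checkpoint via the transformers.js pipeline API.
// The model id below is an assumption for illustration; any ELECTRA checkpoint
// converted to ONNX for feature extraction should work the same way.
import { pipeline } from '@xenova/transformers';

// Feature extraction with an ELECTRA discriminator checkpoint.
const extractor = await pipeline('feature-extraction', 'Xenova/electra-small-discriminator');

// Mean-pool and normalize the token embeddings to obtain a sentence embedding.
const embeddings = await extractor(
    'ELECTRA trains the encoder as a replaced-token discriminator.',
    { pooling: 'mean', normalize: true },
);

console.log(embeddings.dims); // e.g. [1, 256] for electra-small (hidden size 256)
```

Because the mappings above also cover `ElectraForMaskedLM`, `ElectraForSequenceClassification`, `ElectraForTokenClassification` and `ElectraForQuestionAnswering`, the corresponding `fill-mask`, `text-classification`, `token-classification` and `question-answering` pipelines should resolve to the new classes for ELECTRA-based checkpoints, provided suitably fine-tuned and converted models are available.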