From 9a8c664c2c33858ca3c422373edc09349607362f Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Wed, 6 Dec 2023 17:01:36 +0200
Subject: [PATCH] Documentation improvements (#299)

* Add link to optimum docs for supported architectures

Closes #288

* Refactor `SUPPORTED_MODELS` dict to include task

* Update example model id

* Update list of supported models

* Update generate_tests.py

* Remove requirement of `output_attentions` revision

* Add demo site to examples section (closes #233)

* Fix typo

* Include examples in docs index

* Update github issue templates

* Create config.yml

* Order supported models

* Cleanup

* Update 4_feature-request.yml
---
 .github/ISSUE_TEMPLATE/1_bug-report.md       |   40 -
 .github/ISSUE_TEMPLATE/1_bug-report.yml      |   51 +
 .github/ISSUE_TEMPLATE/2_feature-request.md  |   26 -
 .github/ISSUE_TEMPLATE/2_new_model.yml       |   40 +
 .github/ISSUE_TEMPLATE/3_new_pipeline.yml    |   40 +
 .github/ISSUE_TEMPLATE/3_question.md         |   10 -
 .github/ISSUE_TEMPLATE/4_feature-request.yml |   31 +
 .github/ISSUE_TEMPLATE/5_question.yml        |   13 +
 .github/ISSUE_TEMPLATE/config.yml            |    9 +
 README.md                                    |    6 +-
 docs/snippets/1_quick-tour.snippet           |    2 +-
 docs/snippets/3_examples.snippet             |    2 +
 docs/snippets/4_custom-usage.snippet         |    2 +
 docs/source/index.md                         |    8 +
 scripts/supported_models.py                  | 1154 ++++++++++--------
 src/pipelines.js                             |    4 +-
 tests/generate_tests.py                      |   11 +-
 17 files changed, 879 insertions(+), 570 deletions(-)
 delete mode 100644 .github/ISSUE_TEMPLATE/1_bug-report.md
 create mode 100644 .github/ISSUE_TEMPLATE/1_bug-report.yml
 delete mode 100644 .github/ISSUE_TEMPLATE/2_feature-request.md
 create mode 100644 .github/ISSUE_TEMPLATE/2_new_model.yml
 create mode 100644 .github/ISSUE_TEMPLATE/3_new_pipeline.yml
 delete mode 100644 .github/ISSUE_TEMPLATE/3_question.md
 create mode 100644 .github/ISSUE_TEMPLATE/4_feature-request.yml
 create mode 100644 .github/ISSUE_TEMPLATE/5_question.yml
 create mode 100644 .github/ISSUE_TEMPLATE/config.yml

diff --git a/.github/ISSUE_TEMPLATE/1_bug-report.md b/.github/ISSUE_TEMPLATE/1_bug-report.md
deleted file mode 100644
index 62b3d08a2..000000000
--- a/.github/ISSUE_TEMPLATE/1_bug-report.md
+++ /dev/null
@@ -1,40 +0,0 @@
----
-name: Bug report
-about: Create a report to help us improve
-title: "[Bug] Title goes here."
-labels: bug
-assignees: ''
-
----
-
-**Describe the bug**
-*A clear and concise description of what the bug is.*
-
-<!-- ANSWER GOES HERE -->
-
-
-**How to reproduce**
-*Steps or a minimal working example to reproduce the behavior*
-
-<!-- ANSWER GOES HERE -->
-
-
-**Expected behavior**
-*A clear and concise description of what you expected to happen.*
-
-<!-- ANSWER GOES HERE -->
-
-**Logs/screenshots**
-*If applicable, add logs/screenshots to help explain your problem.*
-
-**Environment**
-- Transformers.js version:
-- Browser (if applicable):
-- Operating system (if applicable):
-- Other:
-
-
-**Additional context**
-*Add any other context about the problem here.*
-
-<!-- ANSWER GOES HERE -->
diff --git a/.github/ISSUE_TEMPLATE/1_bug-report.yml b/.github/ISSUE_TEMPLATE/1_bug-report.yml
new file mode 100644
index 000000000..70ceedd96
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/1_bug-report.yml
@@ -0,0 +1,51 @@
+name: "🐛 Bug Report"
+description: Submit a bug report to help us improve transformers.js
+labels: [ "bug" ]
+body:
+  - type: textarea
+    id: system-info
+    attributes:
+      label: System Info
+      description: Please share your system info with us. If you are using other JS libraries/frameworks (e.g., React or Next.js), please include their versions too.
+      placeholder: transformers.js version, browser (if applicable), operating system, Node.js version, bundlers, ...
+    validations:
+      required: true
+
+  - type: checkboxes
+    id: environment
+    attributes:
+      label: Environment
+      description: "The environment I am running in:"
+      options:
+        - label: "Website/web-app"
+        - label: "Browser extension"
+        - label: "Server-side (e.g., Node.js, Deno, Bun)"
+        - label: "Desktop app (e.g., Electron)"
+        - label: "Other (e.g., VSCode extension)"
+
+  - type: textarea
+    id: description
+    validations:
+      required: true
+    attributes:
+      label: Description
+      description: A clear and concise description of the bug, as well as what you expected to happen.
+
+  - type: textarea
+    id: reproduction
+    validations:
+      required: true
+    attributes:
+      label: Reproduction
+      description: |
+        Please provide a code sample that reproduces the problem you ran into.
+        If you have code snippets, error messages, or stack traces, please provide them here as well.
+        Important! Use [code tags](https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting) to correctly format your code.
+
+      placeholder: |
+        Steps to reproduce the behavior:
+          
+          1.
+          2.
+          3.
+          
diff --git a/.github/ISSUE_TEMPLATE/2_feature-request.md b/.github/ISSUE_TEMPLATE/2_feature-request.md
deleted file mode 100644
index 235519115..000000000
--- a/.github/ISSUE_TEMPLATE/2_feature-request.md
+++ /dev/null
@@ -1,26 +0,0 @@
----
-name: Feature request
-about: Suggest a new feature (e.g., model, pipeline, task) for this project
-title: "[Feature request] Title goes here."
-labels: enhancement
-assignees: ''
-
----
-
-**Name of the feature**
-*In general, the feature you want added should be supported by HuggingFace's [transformers](https://github.com/huggingface/transformers) library:*
- - *If requesting a **model**, it must be listed [here](https://huggingface.co/docs/transformers/index#supported-models).*
- - *If requesting a **pipeline**, it must be listed [here](https://huggingface.co/docs/transformers/main_classes/pipelines).*
-- *If requesting a **task**, it must be listed [here](https://huggingface.co/tasks).*
-
-<!-- ANSWER GOES HERE -->
-
-
-**Reason for request**
-*Why is it important that we add this feature? What is your intended use case? Remember, we are more likely to add support for models/pipelines/tasks that are popular (e.g., many downloads), or contain functionality that does not exist (e.g., new input type).*
-
-<!-- ANSWER GOES HERE -->
-
-
-**Additional context**
-*Add any other context or screenshots about the feature request here.*
diff --git a/.github/ISSUE_TEMPLATE/2_new_model.yml b/.github/ISSUE_TEMPLATE/2_new_model.yml
new file mode 100644
index 000000000..5b7fe3014
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/2_new_model.yml
@@ -0,0 +1,40 @@
+name: "🌟 New model addition"
+description: Submit a proposal/request to implement a new model
+labels: [ "New model" ]
+
+body:
+  - type: textarea
+    id: description-request
+    validations:
+      required: true
+    attributes:
+      label: Model description
+      description: |
+        Include important information about the model.
+
+  - type: checkboxes
+    id: information-tasks
+    attributes:
+      label: Prerequisites
+      description: |
+          Please note that Transformers.js relies on the model first being supported in [🤗 Transformers](https://github.com/huggingface/transformers) and [🤗 Optimum](https://github.com/huggingface/optimum). If the model you are requesting is not yet supported by either of them, feel free to open up a model request there too.
+      options:
+        - label: "The model is supported in Transformers (i.e., listed [here](https://huggingface.co/docs/transformers/index#supported-models-and-frameworks))"
+        - label: "The model can be exported to ONNX with Optimum (i.e., listed [here](https://huggingface.co/docs/optimum/main/en/exporters/onnx/overview))"
+
+  - type: textarea
+    id: additional-info
+    attributes:
+      label: Additional information
+      description: |
+        Please provide additional information about the model here.
+        If the model is already supported in Transformers, you can provide example Python code to help ensure the JavaScript implementation (and output) matches the original version.
+
+  - type: textarea
+    id: contribution
+    validations:
+      required: true
+    attributes:
+      label: Your contribution
+      description: |
+        Is there any way that you could help, e.g. by submitting a PR?
diff --git a/.github/ISSUE_TEMPLATE/3_new_pipeline.yml b/.github/ISSUE_TEMPLATE/3_new_pipeline.yml
new file mode 100644
index 000000000..ff7b8bff7
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/3_new_pipeline.yml
@@ -0,0 +1,40 @@
+name: "🔧 New pipeline addition"
+description: Submit a proposal/request to implement a new pipeline
+labels: [ "New pipeline" ]
+
+body:
+  - type: textarea
+    id: description-request
+    validations:
+      required: true
+    attributes:
+      label: Pipeline description
+      description: |
+        Include important information about the pipeline.
+
+  - type: checkboxes
+    id: information-tasks
+    attributes:
+      label: Prerequisites
+      description: |
+          Please note that Transformers.js relies on the pipeline first being supported in [🤗 Transformers](https://github.com/huggingface/transformers). If the pipeline you are requesting is not yet supported by Transformers, feel free to open up a feature request for it there too.
+      options:
+        - label: "The pipeline is supported in Transformers (i.e., listed [here](https://huggingface.co/docs/transformers/main_classes/pipelines))"
+        - label: "The task is listed [here](https://huggingface.co/tasks)"
+
+  - type: textarea
+    id: additional-info
+    attributes:
+      label: Additional information
+      description: |
+        Please provide additional information about the pipeline here.
+        If the pipeline is already supported in Transformers, you can provide example Python code to help ensure the JavaScript implementation (and output) matches the original version.
+
+  - type: textarea
+    id: contribution
+    validations:
+      required: true
+    attributes:
+      label: Your contribution
+      description: |
+        Is there any way that you could help, e.g. by submitting a PR?
diff --git a/.github/ISSUE_TEMPLATE/3_question.md b/.github/ISSUE_TEMPLATE/3_question.md
deleted file mode 100644
index d8beec23e..000000000
--- a/.github/ISSUE_TEMPLATE/3_question.md
+++ /dev/null
@@ -1,10 +0,0 @@
----
-name: Ask a question
-about: Ask a question about the library
-title: "[Question] Title goes here."
-labels: question
-assignees: ''
-
----
-
-<!-- QUESTION GOES HERE -->
diff --git a/.github/ISSUE_TEMPLATE/4_feature-request.yml b/.github/ISSUE_TEMPLATE/4_feature-request.yml
new file mode 100644
index 000000000..0ef12f408
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/4_feature-request.yml
@@ -0,0 +1,31 @@
+name: "🚀 Feature request"
+description: Submit a proposal/request for a new transformers.js feature
+labels: [ "feature" ]
+body:
+  - type: textarea
+    id: feature-request
+    validations:
+      required: true
+    attributes:
+      label: Feature request
+      description: |
+        A clear and concise description of the feature proposal.
+        If the feature is already part of the Python [Transformers](https://github.com/huggingface/transformers) library, please provide relevant links or example usage.
+
+  - type: textarea
+    id: motivation
+    validations:
+      required: true
+    attributes:
+      label: Motivation
+      description: |
+        Please outline the motivation for the proposal. Why is it important that we add this feature? What is your intended use case?
+
+  - type: textarea
+    id: contribution
+    validations:
+      required: true
+    attributes:
+      label: Your contribution
+      description: |
+        Is there any way that you could help, e.g. by submitting a PR?
diff --git a/.github/ISSUE_TEMPLATE/5_question.yml b/.github/ISSUE_TEMPLATE/5_question.yml
new file mode 100644
index 000000000..2af3acbcf
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/5_question.yml
@@ -0,0 +1,13 @@
+name: "🙋 Question"
+description: Ask a question about the library
+labels: [ "question" ]
+
+body:
+  - type: textarea
+    id: question
+    validations:
+      required: true
+    attributes:
+      label: Question
+      description: |
+        Please enter your question here...
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 000000000..d071e5961
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,9 @@
+blank_issues_enabled: true
+version: 2.1
+contact_links:
+  - name: Models on the Hugging Face Hub
+    url: https://huggingface.co/models?library=transformers.js
+    about: Open a pull request or discussion related to a specific model checkpoint directly on the Hugging Face Hub
+  - name: Documentation
+    url: https://huggingface.co/docs/transformers.js
+    about: View the Transformers.js documentation
diff --git a/README.md b/README.md
index 831b3dcf0..2ad41220e 100644
--- a/README.md
+++ b/README.md
@@ -86,7 +86,7 @@ let out = await pipe('I love transformers!');
 You can also use a different model by specifying the model id or path as the second argument to the `pipeline` function. For example:
 ```javascript
 // Use a different model for sentiment-analysis
-let pipe = await pipeline('sentiment-analysis', 'nlptown/bert-base-multilingual-uncased-sentiment');
+let pipe = await pipeline('sentiment-analysis', 'Xenova/bert-base-multilingual-uncased-sentiment');
 ```
 
 
@@ -125,6 +125,8 @@ Want to jump straight in? Get started with one of our sample applications/templa
 | Next.js (client-side) | Sentiment analysis (in-browser inference) | [code](./examples/next-client/), [demo](https://huggingface.co/spaces/Xenova/next-example-app) |
 | Next.js (server-side) | Sentiment analysis (Node.js inference) | [code](./examples/next-server/), [demo](https://huggingface.co/spaces/Xenova/next-server-example-app) |
 | Node.js           | Sentiment analysis API           | [code](./examples/node/)      |
+| Demo site         | A collection of demos | [code](./examples/demo-site/), [demo](https://xenova.github.io/transformers.js/) |
+
 
 
 ## Custom usage
@@ -176,6 +178,8 @@ bert-base-uncased/
     └── model_quantized.onnx
 ```
 
+For the full list of supported architectures, see the [Optimum documentation](https://huggingface.co/docs/optimum/main/en/exporters/onnx/overview).
+
 
 ## Supported tasks/models
 
diff --git a/docs/snippets/1_quick-tour.snippet b/docs/snippets/1_quick-tour.snippet
index 2f2fa58d9..dec6b341f 100644
--- a/docs/snippets/1_quick-tour.snippet
+++ b/docs/snippets/1_quick-tour.snippet
@@ -40,5 +40,5 @@ let out = await pipe('I love transformers!');
 You can also use a different model by specifying the model id or path as the second argument to the `pipeline` function. For example:
 ```javascript
 // Use a different model for sentiment-analysis
-let pipe = await pipeline('sentiment-analysis', 'nlptown/bert-base-multilingual-uncased-sentiment');
+let pipe = await pipeline('sentiment-analysis', 'Xenova/bert-base-multilingual-uncased-sentiment');
 ```
diff --git a/docs/snippets/3_examples.snippet b/docs/snippets/3_examples.snippet
index 6af3da7ec..7b836ce72 100644
--- a/docs/snippets/3_examples.snippet
+++ b/docs/snippets/3_examples.snippet
@@ -15,3 +15,5 @@ Want to jump straight in? Get started with one of our sample applications/templa
 | Next.js (client-side) | Sentiment analysis (in-browser inference) | [code](./examples/next-client/), [demo](https://huggingface.co/spaces/Xenova/next-example-app) |
 | Next.js (server-side) | Sentiment analysis (Node.js inference) | [code](./examples/next-server/), [demo](https://huggingface.co/spaces/Xenova/next-server-example-app) |
 | Node.js           | Sentiment analysis API           | [code](./examples/node/)      |
+| Demo site         | A collection of demos | [code](./examples/demo-site/), [demo](https://xenova.github.io/transformers.js/) |
+
diff --git a/docs/snippets/4_custom-usage.snippet b/docs/snippets/4_custom-usage.snippet
index fbfcdc717..3367b2685 100644
--- a/docs/snippets/4_custom-usage.snippet
+++ b/docs/snippets/4_custom-usage.snippet
@@ -44,3 +44,5 @@ bert-base-uncased/
     ├── model.onnx
     └── model_quantized.onnx
 ```
+
+For the full list of supported architectures, see the [Optimum documentation](https://huggingface.co/docs/optimum/main/en/exporters/onnx/overview).
diff --git a/docs/source/index.md b/docs/source/index.md
index 03e496e60..1b94c115f 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -23,6 +23,14 @@ The documentation is organized into 4 sections:
 3. **DEVELOPER GUIDES** show you how to use the library to achieve a specific goal.
 4. **API REFERENCE** describes all classes and functions, as well as their available parameters and types.
 
+## Examples
+
+<include>
+{
+    "path": "../snippets/3_examples.snippet"
+}
+</include>
+
 ## Supported tasks/models
 
 Here is the list of all tasks and architectures currently supported by Transformers.js.
diff --git a/scripts/supported_models.py b/scripts/supported_models.py
index 7944c39c5..fca98bc9e 100644
--- a/scripts/supported_models.py
+++ b/scripts/supported_models.py
@@ -3,338 +3,435 @@
 
 SUPPORTED_MODELS = {
     # NOTE: keys of `SUPPORTED_MODELS` are subsets of https://github.com/huggingface/optimum/blob/7f8e606689365931300ef5e6d3b20cb88771cb08/optimum/exporters/tasks.py#L281-L965
-    'audio-spectrogram-transformer': [
-        'MIT/ast-finetuned-audioset-10-10-0.4593',
-        'MIT/ast-finetuned-audioset-16-16-0.442',
-        'MIT/ast-finetuned-speech-commands-v2',
-        'mtg-upf/discogs-maest-30s-pw-73e-ts',
-    ],
-    
-    'albert': [
+    'albert': {
         # Masked language modelling
-        'albert-base-v2',
-        'albert-large-v2',
+        'fill-mask': [
+            'albert-base-v2',
+            'albert-large-v2',
+        ],
 
         # Feature extraction
-        'sentence-transformers/paraphrase-albert-small-v2',
-        'sentence-transformers/paraphrase-albert-base-v2',
-    ],
-    'bart': [
+        'feature-extraction': [
+            'sentence-transformers/paraphrase-albert-small-v2',
+            'sentence-transformers/paraphrase-albert-base-v2',
+        ],
+    },
+    'audio-spectrogram-transformer': {
+        # Audio classification
+        'audio-classification': {
+            'MIT/ast-finetuned-audioset-10-10-0.4593',
+            'MIT/ast-finetuned-audioset-16-16-0.442',
+            'MIT/ast-finetuned-speech-commands-v2',
+            'mtg-upf/discogs-maest-30s-pw-73e-ts',
+        }
+    },
+    'bart': {
         # Summarization
-        'sshleifer/distilbart-xsum-12-1',
-        'sshleifer/distilbart-xsum-6-6',
-        'sshleifer/distilbart-xsum-12-3',
-        'sshleifer/distilbart-xsum-9-6',
-        'sshleifer/distilbart-xsum-12-6',
-        'sshleifer/distilbart-cnn-12-3',
-        'sshleifer/distilbart-cnn-12-6',
-        'sshleifer/distilbart-cnn-6-6',
-        'facebook/bart-large-cnn',
-        'facebook/bart-large-xsum',
-
+        'summarization': [
+            'sshleifer/distilbart-xsum-12-1',
+            'sshleifer/distilbart-xsum-6-6',
+            'sshleifer/distilbart-xsum-12-3',
+            'sshleifer/distilbart-xsum-9-6',
+            'sshleifer/distilbart-xsum-12-6',
+            'sshleifer/distilbart-cnn-12-3',
+            'sshleifer/distilbart-cnn-12-6',
+            'sshleifer/distilbart-cnn-6-6',
+            'facebook/bart-large-cnn',
+            'facebook/bart-large-xsum',
+        ],
         # Zero-shot classification
-        'facebook/bart-large-mnli',
-    ],
-    'beit': [
+        'zero-shot-classification': {
+            'facebook/bart-large-mnli',
+        },
+    },
+    'beit': {
         # Image classification
-        'microsoft/beit-base-patch16-224',
-        'microsoft/beit-base-patch16-224-pt22k',
-        'microsoft/beit-base-patch16-384',
-        'microsoft/beit-base-patch16-224-pt22k-ft22k',
-        'microsoft/beit-large-patch16-224',
-        'microsoft/beit-large-patch16-224-pt22k',
-        'microsoft/beit-large-patch16-512',
-        'microsoft/beit-large-patch16-224-pt22k-ft22k',
-        'microsoft/beit-large-patch16-384',
-        'microsoft/dit-base-finetuned-rvlcdip',
-        'microsoft/dit-large-finetuned-rvlcdip',
-    ],
-    'bert': [
+        'image-classification': [
+            'microsoft/beit-base-patch16-224',
+            'microsoft/beit-base-patch16-224-pt22k',
+            'microsoft/beit-base-patch16-384',
+            'microsoft/beit-base-patch16-224-pt22k-ft22k',
+            'microsoft/beit-large-patch16-224',
+            'microsoft/beit-large-patch16-224-pt22k',
+            'microsoft/beit-large-patch16-512',
+            'microsoft/beit-large-patch16-224-pt22k-ft22k',
+            'microsoft/beit-large-patch16-384',
+            'microsoft/dit-base-finetuned-rvlcdip',
+            'microsoft/dit-large-finetuned-rvlcdip',
+        ],
+    },
+    'bert': {
         # Feature extraction
-        'sentence-transformers/all-MiniLM-L6-v2',
-        'sentence-transformers/all-MiniLM-L12-v2',
-        'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
-        'sentence-transformers/paraphrase-MiniLM-L6-v2',
-        'sentence-transformers/paraphrase-MiniLM-L3-v2',
-        'sentence-transformers/bert-base-nli-mean-tokens',
-        'sentence-transformers/multi-qa-MiniLM-L6-cos-v1',
-        'sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens',
-        'sentence-transformers/LaBSE',
-        'deepset/sentence_bert',
-        'intfloat/e5-small',
-        'intfloat/e5-small-v2',
-        'intfloat/e5-base',
-        'intfloat/e5-base-v2',
-        'intfloat/e5-large',
-        'intfloat/e5-large-v2',
-        'intfloat/multilingual-e5-base',
-        'thenlper/gte-small',
-        'thenlper/gte-base',
-        'thenlper/gte-large',
-        'BAAI/bge-small-en',
-        'BAAI/bge-base-en',
-        'BAAI/bge-large-en',
-        'allenai/scibert_scivocab_uncased',
-        'SpanBERT/spanbert-large-cased',
-        'SpanBERT/spanbert-base-cased',
-        'cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
-        'indobenchmark/indobert-base-p1',
-        'GanjinZero/UMLSBert_ENG',
-        'DeepPavlov/rubert-base-cased',
-        'monologg/kobert',
+        'feature-extraction': [
+            'sentence-transformers/all-MiniLM-L6-v2',
+            'sentence-transformers/all-MiniLM-L12-v2',
+            'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
+            'sentence-transformers/paraphrase-MiniLM-L6-v2',
+            'sentence-transformers/paraphrase-MiniLM-L3-v2',
+            'sentence-transformers/bert-base-nli-mean-tokens',
+            'sentence-transformers/multi-qa-MiniLM-L6-cos-v1',
+            'sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens',
+            'sentence-transformers/LaBSE',
+            'deepset/sentence_bert',
+            'intfloat/e5-small',
+            'intfloat/e5-small-v2',
+            'intfloat/e5-base',
+            'intfloat/e5-base-v2',
+            'intfloat/e5-large',
+            'intfloat/e5-large-v2',
+            'intfloat/multilingual-e5-base',
+            'thenlper/gte-small',
+            'thenlper/gte-base',
+            'thenlper/gte-large',
+            'BAAI/bge-small-en',
+            'BAAI/bge-base-en',
+            'BAAI/bge-large-en',
+            'BAAI/bge-large-en-v1.5',
+            'BAAI/bge-base-en-v1.5',
+            'BAAI/bge-small-en-v1.5',
+            'BAAI/bge-large-zh-v1.5',
+            'BAAI/bge-base-zh-v1.5',
+            'BAAI/bge-small-zh-v1.5',
+            'allenai/scibert_scivocab_uncased',
+            'SpanBERT/spanbert-large-cased',
+            'SpanBERT/spanbert-base-cased',
+            'cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
+            'indobenchmark/indobert-base-p1',
+            'GanjinZero/UMLSBert_ENG',
+            'DeepPavlov/rubert-base-cased',
+            'monologg/kobert',
+        ],
 
         # Text classification
-        'nlptown/bert-base-multilingual-uncased-sentiment',
-        'ProsusAI/finbert',
-        'unitary/toxic-bert',
+        'text-classification': [
+            'nlptown/bert-base-multilingual-uncased-sentiment',
+            'ProsusAI/finbert',
+            'unitary/toxic-bert',
+            'BAAI/bge-reranker-large',
+            'BAAI/bge-reranker-base',
+        ],
 
         # Token classification
-        'Davlan/bert-base-multilingual-cased-ner-hrl',
-        'ckiplab/bert-base-chinese-ner',
-        'ckiplab/bert-base-chinese-ws',
-        'ckiplab/bert-base-chinese-pos',
-        'dslim/bert-base-NER',
-        'dslim/bert-base-NER-uncased',
+        'token-classification': [
+            'Davlan/bert-base-multilingual-cased-ner-hrl',
+            'ckiplab/bert-base-chinese-ner',
+            'ckiplab/bert-base-chinese-ws',
+            'ckiplab/bert-base-chinese-pos',
+            'dslim/bert-base-NER',
+            'dslim/bert-base-NER-uncased',
+        ],
 
         # Masked language modelling
-        'bert-base-uncased',
-        'bert-base-cased',
-        'bert-base-multilingual-uncased',
-        'bert-base-multilingual-cased',
-        'bert-base-chinese',
-        'emilyalsentzer/Bio_ClinicalBERT',
-    ],
-    'blenderbot': [
-        # Text2text generation (TODO add conversational)
-        'facebook/blenderbot-400M-distill',
-        # 'facebook/blenderbot-1B-distill',
-    ],
-    'blenderbot-small': [
-        # Text2text generation (TODO add conversational)
-        # 'facebook/blenderbot-90M',  # DEPRECATED
-        'facebook/blenderbot_small-90M',
-    ],
-    'bloom': [
+        'fill-mask': [
+            'bert-base-uncased',
+            'bert-base-cased',
+            'bert-base-multilingual-uncased',
+            'bert-base-multilingual-cased',
+            'bert-base-chinese',
+            'emilyalsentzer/Bio_ClinicalBERT',
+        ],
+    },
+    'blenderbot': {
+        # Text-to-text (TODO add conversational)
+        'text2text-generation': [
+            'facebook/blenderbot-400M-distill',
+            # 'facebook/blenderbot-1B-distill',
+        ],
+    },
+    'blenderbot-small': {
+        # Text-to-text (TODO add conversational)
+        'text2text-generation': [
+            # 'facebook/blenderbot-90M',  # DEPRECATED
+            'facebook/blenderbot_small-90M',
+        ],
+    },
+    'bloom': {
         # Text generation
-        'bigscience/bloom-560m',
-        'bigscience/bloomz-560m',
-    ],
-    'camembert': [
+        'text-generation': [
+            'bigscience/bloom-560m',
+            'bigscience/bloomz-560m',
+        ],
+    },
+
+    'camembert': {
         # Feature extraction
-        'dangvantuan/sentence-camembert-large',
+        'feature-extraction': [
+            'dangvantuan/sentence-camembert-large',
+        ],
 
         # Token classification
-        'Jean-Baptiste/camembert-ner',
-        'Jean-Baptiste/camembert-ner-with-dates',
-        'pythainlp/thainer-corpus-v2-base-model',
-        'gilf/french-camembert-postag-model',
+        'token-classification': [
+            'Jean-Baptiste/camembert-ner',
+            'Jean-Baptiste/camembert-ner-with-dates',
+            'pythainlp/thainer-corpus-v2-base-model',
+            'gilf/french-camembert-postag-model',
+        ],
 
         # Masked language modelling
-        'camembert-base',
-        'airesearch/wangchanberta-base-att-spm-uncased',
-    ],
-    'clap': [
+        'fill-mask': [
+            'camembert-base',
+            'airesearch/wangchanberta-base-att-spm-uncased',
+        ],
+    },
+    'clap': {
         # Zero-shot audio classification and feature extraction
         # (with and without `--split_modalities`)
-        'laion/clap-htsat-unfused',
-        # TODO add 'laion/clap-htsat-fused',
-
-        'Xenova/tiny-random-ClapModel',
-    ],
-    'clip': [
-        # Zero-shot image classification and feature extraction
+        'zero-shot-audio-classification': {
+            'laion/clap-htsat-unfused',
+            # TODO add 'laion/clap-htsat-fused',
+            'laion/larger_clap_general',
+            'laion/larger_clap_music_and_speech',
+            # 'Xenova/tiny-random-ClapModel',
+        }
+    },
+    'clip': {
+        # Zero-shot image classification (and feature extraction)
         # (with and without `--split_modalities`)
-        'openai/clip-vit-base-patch16',
-        'openai/clip-vit-base-patch32',
-        'openai/clip-vit-large-patch14',
-        'openai/clip-vit-large-patch14-336',
-    ],
-    'codegen': [
+        'zero-shot-image-classification': [
+            'openai/clip-vit-base-patch16',
+            'openai/clip-vit-base-patch32',
+            'openai/clip-vit-large-patch14',
+            'openai/clip-vit-large-patch14-336',
+        ],
+    },
+    'codegen': {
         # Text generation
-        'Salesforce/codegen-350M-mono',
-        'Salesforce/codegen-350M-multi',
-        'Salesforce/codegen-350M-nl',
-    ],
-    'convnext':[
+        'text-generation': [
+            'Salesforce/codegen-350M-mono',
+            'Salesforce/codegen-350M-multi',
+            'Salesforce/codegen-350M-nl',
+        ],
+    },
+    'convnext': {
         # Image classification
-        'facebook/convnext-tiny-224',
-        'facebook/convnext-small-224',
-        'facebook/convnext-base-224',
-        'facebook/convnext-base-224-22k',
-        'facebook/convnext-base-224-22k-1k',
-        'facebook/convnext-base-384',
-        'facebook/convnext-base-384-22k-1k',
-        'facebook/convnext-large-224',
-        'facebook/convnext-large-224-22k',
-        'facebook/convnext-large-224-22k-1k',
-        'facebook/convnext-large-384',
-        'facebook/convnext-large-384-22k-1k',
-        'facebook/convnext-xlarge-224-22k',
-        'facebook/convnext-xlarge-224-22k-1k',
-        'facebook/convnext-xlarge-384-22k-1k',
-    ],
-    'convnextv2':[
+        'image-classification': [
+            'facebook/convnext-tiny-224',
+            'facebook/convnext-small-224',
+            'facebook/convnext-base-224',
+            'facebook/convnext-base-224-22k',
+            'facebook/convnext-base-224-22k-1k',
+            'facebook/convnext-base-384',
+            'facebook/convnext-base-384-22k-1k',
+            'facebook/convnext-large-224',
+            'facebook/convnext-large-224-22k',
+            'facebook/convnext-large-224-22k-1k',
+            'facebook/convnext-large-384',
+            'facebook/convnext-large-384-22k-1k',
+            'facebook/convnext-xlarge-224-22k',
+            'facebook/convnext-xlarge-224-22k-1k',
+            'facebook/convnext-xlarge-384-22k-1k',
+        ],
+    },
+    'convnextv2': {
         # Image classification
-        'facebook/convnextv2-atto-1k-224',
-        'facebook/convnextv2-femto-1k-224',
-        'facebook/convnextv2-pico-1k-224',
-        'facebook/convnextv2-tiny-1k-224',
-        'facebook/convnextv2-tiny-22k-384',
-        'facebook/convnextv2-tiny-22k-224',
-        'facebook/convnextv2-nano-1k-224',
-        'facebook/convnextv2-nano-22k-384',
-        'facebook/convnextv2-base-22k-224',
-        'facebook/convnextv2-base-1k-224',
-        'facebook/convnextv2-base-22k-384',
-        'facebook/convnextv2-large-22k-224',
-        'facebook/convnextv2-large-1k-224',
-        'facebook/convnextv2-large-22k-384',
-        # 'facebook/convnextv2-huge-22k-512',
-        # 'facebook/convnextv2-huge-1k-224',
-        # 'facebook/convnextv2-huge-22k-384',
-        # 'facebook/convnextv2-nano-22k-224',
-    ],
-    'deberta': [
+        'image-classification': [
+            'facebook/convnextv2-atto-1k-224',
+            'facebook/convnextv2-femto-1k-224',
+            'facebook/convnextv2-pico-1k-224',
+            'facebook/convnextv2-tiny-1k-224',
+            'facebook/convnextv2-tiny-22k-384',
+            'facebook/convnextv2-tiny-22k-224',
+            'facebook/convnextv2-nano-1k-224',
+            'facebook/convnextv2-nano-22k-384',
+            'facebook/convnextv2-base-22k-224',
+            'facebook/convnextv2-base-1k-224',
+            'facebook/convnextv2-base-22k-384',
+            'facebook/convnextv2-large-22k-224',
+            'facebook/convnextv2-large-1k-224',
+            'facebook/convnextv2-large-22k-384',
+            # 'facebook/convnextv2-huge-22k-512',
+            # 'facebook/convnextv2-huge-1k-224',
+            # 'facebook/convnextv2-huge-22k-384',
+            # 'facebook/convnextv2-nano-22k-224',
+        ],
+    },
+    'deberta': {
         # Zero-shot classification
-        'cross-encoder/nli-deberta-base',
-        'Narsil/deberta-large-mnli-zero-cls',
-    ],
-    'deberta-v2': [
+        'zero-shot-classification': [
+            'cross-encoder/nli-deberta-base',
+            'Narsil/deberta-large-mnli-zero-cls',
+        ],
+    },
+    'deberta-v2': {
         # Zero-shot classification
-        'cross-encoder/nli-deberta-v3-xsmall',
-        'cross-encoder/nli-deberta-v3-small',
-        'cross-encoder/nli-deberta-v3-base',
-        'cross-encoder/nli-deberta-v3-large',
-        'MoritzLaurer/DeBERTa-v3-xsmall-mnli-fever-anli-ling-binary',
-        'MoritzLaurer/DeBERTa-v3-base-mnli',
-        'MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli',
-        'MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli',
-        'MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7',
-        'sileod/deberta-v3-base-tasksource-nli',
-        'sileod/deberta-v3-large-tasksource-nli',
-    ],
-    'deit': [
+        'zero-shot-classification': [
+            'cross-encoder/nli-deberta-v3-xsmall',
+            'cross-encoder/nli-deberta-v3-small',
+            'cross-encoder/nli-deberta-v3-base',
+            'cross-encoder/nli-deberta-v3-large',
+            'MoritzLaurer/DeBERTa-v3-xsmall-mnli-fever-anli-ling-binary',
+            'MoritzLaurer/DeBERTa-v3-base-mnli',
+            'MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli',
+            'MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli',
+            'MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7',
+            'sileod/deberta-v3-base-tasksource-nli',
+            'sileod/deberta-v3-large-tasksource-nli',
+        ],
+    },
+    'deit': {
         # Image classification
-        'facebook/deit-tiny-distilled-patch16-224',
-        'facebook/deit-small-distilled-patch16-224',
-        'facebook/deit-base-distilled-patch16-224',
-        'facebook/deit-base-distilled-patch16-384',
-    ],
-    'detr': [
+        'image-classification': [
+            'facebook/deit-tiny-distilled-patch16-224',
+            'facebook/deit-small-distilled-patch16-224',
+            'facebook/deit-base-distilled-patch16-224',
+            'facebook/deit-base-distilled-patch16-384',
+        ],
+    },
+    'detr': {
         # Object detection
-        'facebook/detr-resnet-50',
-        'facebook/detr-resnet-101',
+        'object-detection': [
+            'facebook/detr-resnet-50',
+            'facebook/detr-resnet-101',
+        ],
 
         # Image segmentation
-        'facebook/detr-resnet-50-panoptic',
-    ],
-    'distilbert': [
+        'image-segmentation': [
+            'facebook/detr-resnet-50-panoptic',
+        ],
+    },
+    'distilbert': {
         # Feature extraction
-        'sentence-transformers/multi-qa-distilbert-cos-v1',
-        'sentence-transformers/distiluse-base-multilingual-cased-v1',
-        'sentence-transformers/distiluse-base-multilingual-cased-v2',
-        'sentence-transformers/distilbert-base-nli-mean-tokens',
-        'sentence-transformers/distilbert-base-nli-stsb-mean-tokens',
-        'sentence-transformers/msmarco-distilbert-base-v4',
+        'feature-extraction': [
+            'sentence-transformers/multi-qa-distilbert-cos-v1',
+            'sentence-transformers/distiluse-base-multilingual-cased-v1',
+            'sentence-transformers/distiluse-base-multilingual-cased-v2',
+            'sentence-transformers/distilbert-base-nli-mean-tokens',
+            'sentence-transformers/distilbert-base-nli-stsb-mean-tokens',
+            'sentence-transformers/msmarco-distilbert-base-v4',
+        ],
 
         # Text classification
-        'distilbert-base-uncased-finetuned-sst-2-english',
+        'text-classification': [
+            'distilbert-base-uncased-finetuned-sst-2-english',
+        ],
 
         # Question answering
-        'distilbert-base-uncased-distilled-squad',
-        'distilbert-base-cased-distilled-squad',
+        'question-answering': [
+            'distilbert-base-uncased-distilled-squad',
+            'distilbert-base-cased-distilled-squad',
+        ],
 
         # Zero-shot classification
-        'typeform/distilbert-base-uncased-mnli',
+        'zero-shot-classification': [
+            'typeform/distilbert-base-uncased-mnli',
+        ],
 
         # Token classification
-        'Davlan/distilbert-base-multilingual-cased-ner-hrl',
+        'token-classification': [
+            'Davlan/distilbert-base-multilingual-cased-ner-hrl',
+        ],
 
         # Masked language modelling
-        'distilbert-base-uncased',
-        'distilbert-base-cased',
-    ],
-    'donut': [  # NOTE: also a `vision-encoder-decoder`
+        'fill-mask': [
+            'distilbert-base-uncased',
+            'distilbert-base-cased',
+        ],
+    },
+    'donut': {  # NOTE: also a `vision-encoder-decoder`
         # Image-to-text
-        'naver-clova-ix/donut-base-finetuned-cord-v2',
-        'naver-clova-ix/donut-base-finetuned-zhtrainticket',
+        'image-to-text': [
+            'naver-clova-ix/donut-base-finetuned-cord-v2',
+            'naver-clova-ix/donut-base-finetuned-zhtrainticket',
+        ],
 
         # Document Question Answering
-        'naver-clova-ix/donut-base-finetuned-docvqa',
-    ],
-    'dpt': [
+        'document-question-answering': [
+            'naver-clova-ix/donut-base-finetuned-docvqa',
+        ],
+    },
+    'dpt': {
         # Depth estimation
-        'Intel/dpt-hybrid-midas',
-        'Intel/dpt-large',
-    ],
-    'falcon': [
+        'depth-estimation': [
+            'Intel/dpt-hybrid-midas',
+            'Intel/dpt-large',
+        ],
+    },
+    'falcon': {
         # Text generation
-        'Rocketknight1/tiny-random-falcon-7b',
-        'fxmarty/really-tiny-falcon-testing',
-    ],
-    'glpn': [
+        'text-generation': [
+            'Rocketknight1/tiny-random-falcon-7b',
+            'fxmarty/really-tiny-falcon-testing',
+        ],
+    },
+    'glpn': {
         # Depth estimation
-        'vinvino02/glpn-kitti',
-        'vinvino02/glpn-nyu',
-    ],
-    'gpt_neo': [
+        'depth-estimation': [
+            'vinvino02/glpn-kitti',
+            'vinvino02/glpn-nyu',
+        ],
+    },
+    'gpt_neo': {
         # Text generation
-        'EleutherAI/gpt-neo-125M',
-        'MBZUAI/LaMini-Neo-125M',
-        # 'MBZUAI/LaMini-Neo-1.3B', # TODO add
-        'iliemihai/gpt-neo-romanian-125m',
-    ],
-    'gpt_neox': [
+        'text-generation': [
+            'EleutherAI/gpt-neo-125M',
+            'MBZUAI/LaMini-Neo-125M',
+            # 'MBZUAI/LaMini-Neo-1.3B', # TODO add
+            'iliemihai/gpt-neo-romanian-125m',
+        ],
+    },
+    'gpt_neox': {
         # Text generation
-        'EleutherAI/pythia-14m',
-        'EleutherAI/pythia-31m',
-        'EleutherAI/pythia-70m',
-        'EleutherAI/pythia-70m-deduped',
-        'EleutherAI/pythia-160m',
-        'EleutherAI/pythia-160m-deduped',
-        'EleutherAI/pythia-410m',
-        'EleutherAI/pythia-410m-deduped',
-    ],
-    'gpt2': [
+        'text-generation': [
+            'EleutherAI/pythia-14m',
+            'EleutherAI/pythia-31m',
+            'EleutherAI/pythia-70m',
+            'EleutherAI/pythia-70m-deduped',
+            'EleutherAI/pythia-160m',
+            'EleutherAI/pythia-160m-deduped',
+            'EleutherAI/pythia-410m',
+            'EleutherAI/pythia-410m-deduped',
+        ],
+    },
+    'gpt2': {
         # Text generation
-        'gpt2',
-        'distilgpt2',
-        'MBZUAI/LaMini-Cerebras-111M',
-        'MBZUAI/LaMini-Cerebras-256M',
-        'MBZUAI/LaMini-Cerebras-590M',
-        # 'MBZUAI/LaMini-Cerebras-1.3B', # TODO add
-        'MBZUAI/LaMini-GPT-124M',
-        'MBZUAI/LaMini-GPT-774M',
-        # 'MBZUAI/LaMini-GPT-1.5B', # TODO add
-        'aisquared/dlite-v2-774m',
-        'Locutusque/gpt2-large-conversational',
-    ],
-    'gpt_bigcode': [
+        'text-generation': [
+            'gpt2',
+            'distilgpt2',
+            'MBZUAI/LaMini-Cerebras-111M',
+            'MBZUAI/LaMini-Cerebras-256M',
+            'MBZUAI/LaMini-Cerebras-590M',
+            # 'MBZUAI/LaMini-Cerebras-1.3B', # TODO add
+            'MBZUAI/LaMini-GPT-124M',
+            'MBZUAI/LaMini-GPT-774M',
+            # 'MBZUAI/LaMini-GPT-1.5B', # TODO add
+            'aisquared/dlite-v2-774m',
+            'Locutusque/gpt2-large-conversational',
+        ],
+    },
+    'gpt_bigcode': {
         # Text generation
-        'bigcode/tiny_starcoder_py',
-        'abacaj/starcoderbase-1b-sft',
-        # 'bigcode/starcoderbase-1b', # NOTE: This model is gated, so we ignore it when testing
-    ],
-    'gptj': [
+        'text-generation': [
+            'bigcode/tiny_starcoder_py',
+            'abacaj/starcoderbase-1b-sft',
+            # 'bigcode/starcoderbase-1b', # NOTE: This model is gated, so we ignore it when testing
+        ],
+    },
+    'gptj': {
         # Text generation
-        'TabbyML/J-350M',
-        'Milos/slovak-gpt-j-405M',
-        'heegyu/kogpt-j-350m',
-    ],
-    'herbert': [
+        'text-generation': [
+            'TabbyML/J-350M',
+            'Milos/slovak-gpt-j-405M',
+            'heegyu/kogpt-j-350m',
+        ],
+    },
+    'herbert': {
         # Feature extraction
-        'allegro/herbert-base-cased',
-        'allegro/herbert-large-cased',
-    ],
-    'llama': [
+        'feature-extraction': [
+            'allegro/herbert-base-cased',
+            'allegro/herbert-large-cased',
+        ],
+    },
+    'llama': {
         # Text generation
-        'Xenova/llama2.c-stories15M',
-        'Xenova/llama2.c-stories42M',
-        'Xenova/llama2.c-stories110M',
-        'RajuKandasamy/tamillama_tiny_30m',
-        'JackFram/llama-68m',
-        'JackFram/llama-160m',
-    ],
+        'text-generation': [
+            'Xenova/llama2.c-stories15M',
+            'Xenova/llama2.c-stories42M',
+            'Xenova/llama2.c-stories110M',
+            'RajuKandasamy/tamillama_tiny_30m',
+            'JackFram/llama-68m',
+            'JackFram/llama-160m',
+        ],
+    },
     'longt5': {
         # Text-to-text
         'text2text-generation': [
@@ -356,265 +453,348 @@
             'voidful/long-t5-encodec-tglobal-base',
         ],
     },
-    'm2m_100': [
+    'm2m_100': {
         # Translation
-        'facebook/nllb-200-distilled-600M',
-        'facebook/m2m100_418M',
-    ],
-    'marian': [
+        'translation': [
+            'facebook/nllb-200-distilled-600M',
+            'facebook/m2m100_418M',
+        ],
+    },
+    'marian': {
         # Translation
-        f'Helsinki-NLP/opus-mt-{x}'
-        for x in SUPPORTED_HELSINKI_NLP_MODELS
-    ],
-    'mbart': [
+        'translation': [
+            f'Helsinki-NLP/opus-mt-{x}'
+            for x in SUPPORTED_HELSINKI_NLP_MODELS
+        ],
+    },
+    'mbart': {
         # Translation
-        'facebook/mbart-large-50-many-to-many-mmt',
-        'facebook/mbart-large-50-many-to-one-mmt',
-        'facebook/mbart-large-50',
-    ],
-    'mistral': [
+        'translation': [
+            'facebook/mbart-large-50-many-to-many-mmt',
+            'facebook/mbart-large-50-many-to-one-mmt',
+            'facebook/mbart-large-50',
+        ],
+    },
+    'mistral': {
         # Text generation
-        'echarlaix/tiny-random-mistral',
-    ],
-    'mobilebert': [
+        'text-generation': [
+            'echarlaix/tiny-random-mistral',
+        ],
+    },
+    'mobilebert': {
         # Zero-shot classification
-        'typeform/mobilebert-uncased-mnli',
+        'zero-shot-classification': [
+            'typeform/mobilebert-uncased-mnli',
 
-        # TODO:
-        # https://github.com/huggingface/optimum/issues/1027
-        # 'google/mobilebert-uncased',
-    ],
-    'mobilevit': [
+            # TODO:
+            # https://github.com/huggingface/optimum/issues/1027
+            # 'google/mobilebert-uncased',
+        ],
+    },
+    'mobilevit': {
         # Image classification
-        'apple/mobilevit-small',
-        'apple/mobilevit-x-small',
-        'apple/mobilevit-xx-small',
+        'image-classification': [
+            'apple/mobilevit-small',
+            'apple/mobilevit-x-small',
+            'apple/mobilevit-xx-small',
+        ],
 
         # TODO: Image segmentation
-        # 'apple/deeplabv3-mobilevit-small',
-        # 'apple/deeplabv3-mobilevit-x-small',
-        # 'apple/deeplabv3-mobilevit-xx-small',
-    ],
-    'mpt': [
+        # 'image-segmentation': [
+        #     'apple/deeplabv3-mobilevit-small',
+        #     'apple/deeplabv3-mobilevit-x-small',
+        #     'apple/deeplabv3-mobilevit-xx-small',
+        # ],
+    },
+    'mpt': {
         # Text generation
-        'efederici/ipt-350m',
-    ],
-    'mpnet': [
+        'text-generation': [
+            'efederici/ipt-350m',
+        ],
+    },
+    'mpnet': {
         # Feature extraction
-        'sentence-transformers/all-mpnet-base-v2',
-        'sentence-transformers/nli-mpnet-base-v2',
-        'sentence-transformers/paraphrase-mpnet-base-v2',
-        'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
-        'sentence-transformers/multi-qa-mpnet-base-cos-v1',
-        'sentence-transformers/multi-qa-mpnet-base-dot-v1',
-    ],
-    'mt5': [
-        'google/mt5-small',
-        'google/mt5-base',
-    ],
-    'nougat': [
+        'feature-extraction': [
+            'sentence-transformers/all-mpnet-base-v2',
+            'sentence-transformers/nli-mpnet-base-v2',
+            'sentence-transformers/paraphrase-mpnet-base-v2',
+            'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
+            'sentence-transformers/multi-qa-mpnet-base-cos-v1',
+            'sentence-transformers/multi-qa-mpnet-base-dot-v1',
+        ],
+    },
+    'mt5': {
+        # Text-to-text
+        'text2text-generation': [
+            'google/mt5-small',
+            'google/mt5-base',
+        ],
+    },
+    'nougat': {
         # Image-to-text
-        'facebook/nougat-small',
-        'facebook/nougat-base',
-    ],
-    'opt': [
+        'image-to-text': [
+            'facebook/nougat-small',
+            'facebook/nougat-base',
+        ],
+    },
+    'opt': {
         # Text generation
-        'facebook/opt-125m',
-        'facebook/opt-350m',
-        # (TODO conversational)
-        'PygmalionAI/pygmalion-350m',
-    ],
-    'owlvit': [
+        'text-generation': [
+            # Text generation
+            'facebook/opt-125m',
+            'facebook/opt-350m',
+            # (TODO conversational)
+            'PygmalionAI/pygmalion-350m',
+        ],
+    },
+    'owlvit': {
         # Object detection (Zero-shot object detection)
         # NOTE: Exported with --batch_size 1
-        'google/owlvit-base-patch32',
-        'google/owlvit-base-patch16',
-        'google/owlvit-large-patch14',
-    ],
-    'resnet': [
+        'zero-shot-object-detection': [
+            'google/owlvit-base-patch32',
+            'google/owlvit-base-patch16',
+            'google/owlvit-large-patch14',
+        ],
+    },
+    'resnet': {
         # Image classification
-        'microsoft/resnet-18',
-        'microsoft/resnet-26',
-        'microsoft/resnet-34',
-        'microsoft/resnet-50',
-        'microsoft/resnet-101',
-        'microsoft/resnet-152',
-    ],
-    'roberta': [
-        # Masked language modelling
-        'roberta-base',
-        'distilroberta-base',
-
+        'image-classification': [
+            'microsoft/resnet-18',
+            'microsoft/resnet-26',
+            'microsoft/resnet-34',
+            'microsoft/resnet-50',
+            'microsoft/resnet-101',
+            'microsoft/resnet-152',
+        ],
+    },
+    'roberta': {
         # Feature extraction
-        'sentence-transformers/all-distilroberta-v1',
-        'sentence-transformers/all-roberta-large-v1',
+        'feature-extraction': [
+            'sentence-transformers/all-distilroberta-v1',
+            'sentence-transformers/all-roberta-large-v1',
+        ],
 
         # Text classification
-        'roberta-large-mnli',
+        'text-classification': [
+            'roberta-large-mnli',
+        ],
 
         # Token classification
-        'julien-c/EsperBERTo-small-pos',
-    ],
+        'token-classification': [
+            'julien-c/EsperBERTo-small-pos',
+        ],
+
+        # Masked language modelling
+        'fill-mask': [
+            'roberta-base',
+            'distilroberta-base',
+        ],
+    },
     # 'sam': [
     #     'facebook/sam-vit-base',
     #     'facebook/sam-vit-large',
     #     'facebook/sam-vit-huge',
     # ],
-    'speecht5': [
-        # Text-to-speech
-        'microsoft/speecht5_tts',
-    ],
-    'squeezebert': [
+
+    'speecht5': {
+        # Text-to-audio/Text-to-speech
+        'text-to-audio': [
+            'microsoft/speecht5_tts',
+        ],
+    },
+    'squeezebert': {
         # Feature extraction
-        'squeezebert/squeezebert-uncased',
-        'squeezebert/squeezebert-mnli',
-    ],
-    'swin': [
+        'feature-extraction': [
+            'squeezebert/squeezebert-uncased',
+            'squeezebert/squeezebert-mnli',
+        ],
+    },
+    'swin': {
         # Image classification
-        'microsoft/swin-tiny-patch4-window7-224',
-        'microsoft/swin-base-patch4-window7-224',
-        'microsoft/swin-large-patch4-window12-384-in22k',
-        'microsoft/swin-base-patch4-window7-224-in22k',
-        'microsoft/swin-base-patch4-window12-384-in22k',
-        'microsoft/swin-base-patch4-window12-384',
-        'microsoft/swin-large-patch4-window7-224',
-        'microsoft/swin-small-patch4-window7-224',
-        'microsoft/swin-large-patch4-window7-224-in22k',
-        'microsoft/swin-large-patch4-window12-384',
-    ],
-    'swin2sr': [
+        'image-classification': [
+            'microsoft/swin-tiny-patch4-window7-224',
+            'microsoft/swin-base-patch4-window7-224',
+            'microsoft/swin-large-patch4-window12-384-in22k',
+            'microsoft/swin-base-patch4-window7-224-in22k',
+            'microsoft/swin-base-patch4-window12-384-in22k',
+            'microsoft/swin-base-patch4-window12-384',
+            'microsoft/swin-large-patch4-window7-224',
+            'microsoft/swin-small-patch4-window7-224',
+            'microsoft/swin-large-patch4-window7-224-in22k',
+            'microsoft/swin-large-patch4-window12-384',
+        ],
+    },
+    'swin2sr': {
         # Image-to-image (Super-resolution)
-        'caidas/swin2SR-classical-sr-x2-64',
-        'caidas/swin2SR-realworld-sr-x4-64-bsrgan-psnr',
-        'caidas/swin2SR-classical-sr-x4-64',
-        'caidas/swin2SR-compressed-sr-x4-48',
-        'caidas/swin2SR-lightweight-x2-64',
+        'image-to-image': [
+            'caidas/swin2SR-classical-sr-x2-64',
+            'caidas/swin2SR-realworld-sr-x4-64-bsrgan-psnr',
+            'caidas/swin2SR-classical-sr-x4-64',
+            'caidas/swin2SR-compressed-sr-x4-48',
+            'caidas/swin2SR-lightweight-x2-64',
+        ],
 
         # Feature extraction
-        'hf-tiny-model-private/tiny-random-Swin2SRModel',
-    ],
-    't5': [
-        # Text-to-text (Translation/Summarization)
-        't5-small',
-        't5-base',
-        'google/t5-v1_1-small',
-        'google/t5-v1_1-base',
-        'google/flan-t5-small',
-        'google/flan-t5-base',
-        'MBZUAI/LaMini-Flan-T5-77M',
-        'MBZUAI/LaMini-Flan-T5-248M',
-        'MBZUAI/LaMini-Flan-T5-783M',
-        'MBZUAI/LaMini-T5-61M',
-        'MBZUAI/LaMini-T5-223M',
-        'MBZUAI/LaMini-T5-738M',
+        'feature-extraction': [
+            'hf-tiny-model-private/tiny-random-Swin2SRModel',
+        ],
+    },
+    't5': {
+        # Translation/Summarization
+        ('translation', 'summarization'): [
+            't5-small',
+            't5-base',
+            'google/t5-v1_1-small',
+            'google/t5-v1_1-base',
+            'google/flan-t5-small',
+            'google/flan-t5-base',
+        ],
+
+        # Text-to-text
+        'text2text-generation': [
+            'MBZUAI/LaMini-Flan-T5-77M',
+            'MBZUAI/LaMini-Flan-T5-248M',
+            'MBZUAI/LaMini-Flan-T5-783M',
+            'MBZUAI/LaMini-T5-61M',
+            'MBZUAI/LaMini-T5-223M',
+            'MBZUAI/LaMini-T5-738M',
+        ],
 
         # Feature extraction
-        'sentence-transformers/sentence-t5-large',
-        'hkunlp/instructor-base',
-        'hkunlp/instructor-large',
-    ],
-    'trocr': [  # NOTE: also a `vision-encoder-decoder`
-        # Text-to-image
-        'microsoft/trocr-small-printed',
-        'microsoft/trocr-base-printed',
-        'microsoft/trocr-small-handwritten',
-        'microsoft/trocr-base-handwritten',
-    ],
-    'vision-encoder-decoder': [
+        'feature-extraction': [
+            'sentence-transformers/sentence-t5-large',
+            'hkunlp/instructor-base',
+            'hkunlp/instructor-large',
+        ],
+    },
+    'trocr': {  # NOTE: also a `vision-encoder-decoder`
-        # Text-to-image
-        'nlpconnect/vit-gpt2-image-captioning',
-    ],
-    'vit': [
+        # Image-to-text (OCR: the model reads an image and emits text)
+        'image-to-text': [
+            'microsoft/trocr-small-printed',
+            'microsoft/trocr-base-printed',
+            'microsoft/trocr-small-handwritten',
+            'microsoft/trocr-base-handwritten',
+        ],
+    },
+    'vision-encoder-decoder': {
+        # Image-to-text
+        'image-to-text': [
+            'nlpconnect/vit-gpt2-image-captioning',
+        ],
+    },
+    'vit': {
         # Feature extraction
-        'google/vit-base-patch16-224-in21k',
-        'facebook/dino-vitb16',
-        'facebook/dino-vits8',
-        'facebook/dino-vitb8',
-        'facebook/dino-vits16',
-
+        'feature-extraction': [
+            'google/vit-base-patch16-224-in21k',
+            'facebook/dino-vitb16',
+            'facebook/dino-vits8',
+            'facebook/dino-vitb8',
+            'facebook/dino-vits16',
+        ],
         # Image classification
-        'google/vit-base-patch16-224',
-    ],
-    'wav2vec2': [
+        'image-classification': [
+            'google/vit-base-patch16-224',
+        ],
+    },
+    'wav2vec2': {
         # Feature extraction # NOTE: requires --task feature-extraction
-        'facebook/mms-300m',
-        'facebook/mms-1b',
+        'feature-extraction': [
+            'facebook/mms-300m',
+            'facebook/mms-1b',
+        ],
 
         # Audio classification
-        'alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech',
-        'superb/wav2vec2-base-superb-ks',
-        'facebook/mms-lid-126',
-        'facebook/mms-lid-256',
-        'facebook/mms-lid-512',
-        'facebook/mms-lid-1024',
-        'facebook/mms-lid-2048',
-        'facebook/mms-lid-4017',
+        'audio-classification': [
+            'alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech',
+            'superb/wav2vec2-base-superb-ks',
+            'facebook/mms-lid-126',
+            'facebook/mms-lid-256',
+            'facebook/mms-lid-512',
+            'facebook/mms-lid-1024',
+            'facebook/mms-lid-2048',
+            'facebook/mms-lid-4017',
+        ],
 
         # Automatic speech recognition
-        'jonatasgrosman/wav2vec2-large-xlsr-53-english',
-        'facebook/wav2vec2-base-960h',
-        'facebook/mms-1b-l1107',
-        'facebook/mms-1b-all',
-        'facebook/mms-1b-fl102',
-    ],
-    'wavlm': [
+        'automatic-speech-recognition': [
+            'jonatasgrosman/wav2vec2-large-xlsr-53-english',
+            'facebook/wav2vec2-base-960h',
+            'facebook/mms-1b-l1107',
+            'facebook/mms-1b-all',
+            'facebook/mms-1b-fl102',
+        ],
+    },
+    'wavlm': {
         # Feature extraction
-        'microsoft/wavlm-base',
-        'microsoft/wavlm-base-plus',
-        'microsoft/wavlm-large',
-    ],
-    'whisper': [
+        'feature-extraction': [
+            'microsoft/wavlm-base',
+            'microsoft/wavlm-base-plus',
+            'microsoft/wavlm-large',
+        ],
+    },
+    'whisper': {
         # Automatic speech recognition
-        'openai/whisper-tiny',
-        'openai/whisper-tiny.en',
-        'openai/whisper-base',
-        'openai/whisper-base.en',
-        'openai/whisper-small',
-        'openai/whisper-small.en',
-        'openai/whisper-medium',
-        'openai/whisper-medium.en',
-        'openai/whisper-large',
-        'openai/whisper-large-v2',
-        'NbAiLab/nb-whisper-tiny-beta',
-        'NbAiLab/nb-whisper-base-beta',
-        'NbAiLab/nb-whisper-small-beta',
-        'NbAiLab/nb-whisper-medium-beta',
-        'NbAiLab/nb-whisper-large-beta',
-    ],
-    'xlm': [
-        'xlm-clm-ende-1024',
-        'xlm-mlm-ende-1024',
-        'xlm-clm-enfr-1024',
-        'xlm-mlm-enfr-1024',
-        'xlm-mlm-17-1280',
-        'xlm-mlm-100-1280',
-        'xlm-mlm-en-2048',
-        'xlm-mlm-enro-1024',
-        'xlm-mlm-tlm-xnli15-1024',
-        'xlm-mlm-xnli15-1024',
-    ],
-    'xlm-roberta': [
+        'automatic-speech-recognition': [
+            'openai/whisper-tiny',
+            'openai/whisper-tiny.en',
+            'openai/whisper-base',
+            'openai/whisper-base.en',
+            'openai/whisper-small',
+            'openai/whisper-small.en',
+            'openai/whisper-medium',
+            'openai/whisper-medium.en',
+            'openai/whisper-large',
+            'openai/whisper-large-v2',
+            'NbAiLab/nb-whisper-tiny-beta',
+            'NbAiLab/nb-whisper-base-beta',
+            'NbAiLab/nb-whisper-small-beta',
+            'NbAiLab/nb-whisper-medium-beta',
+            'NbAiLab/nb-whisper-large-beta',
+        ],
+    },
+    'xlm': {
+        # Masked language modelling
+        'fill-mask': [
+            'xlm-clm-ende-1024',
+            'xlm-mlm-ende-1024',
+            'xlm-clm-enfr-1024',
+            'xlm-mlm-enfr-1024',
+            'xlm-mlm-17-1280',
+            'xlm-mlm-100-1280',
+            'xlm-mlm-en-2048',
+            'xlm-mlm-enro-1024',
+            'xlm-mlm-tlm-xnli15-1024',
+            'xlm-mlm-xnli15-1024',
+        ],
+    },
+    'xlm-roberta': {
         # Masked language modelling
-        'xlm-roberta-base'
-    ],
-    'yolos': [
+        'fill-mask': [
+            'xlm-roberta-base'
+        ],
+    },
+    'yolos': {
         # Object detection
-        'hustvl/yolos-tiny',
-        'hustvl/yolos-small',
-        'hustvl/yolos-base',
-        'hustvl/yolos-small-dwr',
-        'hustvl/yolos-small-300',
-    ]
+        'object-detection': [
+            # Object detection
+            'hustvl/yolos-tiny',
+            'hustvl/yolos-small',
+            'hustvl/yolos-base',
+            'hustvl/yolos-small-dwr',
+            'hustvl/yolos-small-300',
+        ],
+    },
 }
 
 
 def main():
-    for model_type, model_ids in SUPPORTED_MODELS.items():
-        print(f'# {model_type:=^80}')
-        for model_id in model_ids:
-            print(
-                f'python -m scripts.convert --quantize --model_id {model_id}')
-        print()
+    """Print a `scripts.convert` command for every supported model, grouped by model type."""
+    for model_type, tasks in SUPPORTED_MODELS.items():
+        print(f'# {model_type:=^80}')  # one header per model type, not one per task
+        for model_ids in tasks.values():  # task names are not part of the output
+            for model_id in model_ids:
+                print(f'python -m scripts.convert --quantize --model_id {model_id}')
+        print()
 
 
 if __name__ == '__main__':
diff --git a/src/pipelines.js b/src/pipelines.js
index f3d802da5..7705cc190 100644
--- a/src/pipelines.js
+++ b/src/pipelines.js
@@ -1180,9 +1180,7 @@ export class ZeroShotAudioClassificationPipeline extends Pipeline {
  * **Example:** Transcribe English w/ word-level timestamps.
  * ```javascript
  * let url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';
- * let transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en', {
- *     revision: 'output_attentions',
- * });
+ * let transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
  * let output = await transcriber(url, { return_timestamps: 'word' });
  * // {
  * //   "text": " And so my fellow Americans ask not what your country can do for you ask what you can do for your country.",
diff --git a/tests/generate_tests.py b/tests/generate_tests.py
index 6356e805d..8c44258a8 100644
--- a/tests/generate_tests.py
+++ b/tests/generate_tests.py
@@ -111,11 +111,18 @@
 }
 
 
+# Per model type, the model ids of all of its tasks merged into one list (task level dropped).
+FLATTENED_SUPPORTED_MODELS = [
+    (model_type, [model_id for ids in tasks.values() for model_id in ids])
+    for model_type, tasks in SUPPORTED_MODELS.items()
+]
+
+
 def generate_tokenizer_tests():
 
     results = {}
 
-    tokenizers_to_test = list(SUPPORTED_MODELS.items()) + \
+    tokenizers_to_test = FLATTENED_SUPPORTED_MODELS + \
         list(ADDITIONAL_TOKENIZERS_TO_TEST.items())
 
     for model_type, tokenizer_names in tokenizers_to_test:
@@ -180,7 +187,7 @@ def generate_tokenizer_tests():
 
 def generate_config_tests():
     results = {}
-    for model_type, config_names in SUPPORTED_MODELS.items():
+    for model_type, config_names in FLATTENED_SUPPORTED_MODELS:
         print(f'Generating tests for {model_type}')
 
         for config_name in config_names: