diff --git a/README.md b/README.md
index fce147922..755d0c505 100644
--- a/README.md
+++ b/README.md
@@ -101,7 +101,7 @@ npm i @xenova/transformers
Alternatively, you can use it in vanilla JS, without any bundler, by using a CDN or static hosting. For example, using [ES Modules](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Modules), you can import the library with:
```html
<script type="module">
-    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@2.16.0';
+    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@2.16.1';
</script>
```
@@ -134,7 +134,7 @@ Check out the Transformers.js [template](https://huggingface.co/new-space?templa
-By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@xenova/transformers@2.16.0/dist/), which should work out-of-the-box. You can customize this as follows:
+By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@xenova/transformers@2.16.1/dist/), which should work out-of-the-box. You can customize this as follows:
### Settings
@@ -210,6 +210,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
| [Token Classification](https://huggingface.co/tasks/token-classification) | `token-classification` or `ner` | Assigning a label to each token in a text. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.TokenClassificationPipeline) [(models)](https://huggingface.co/models?pipeline_tag=token-classification&library=transformers.js) |
| [Translation](https://huggingface.co/tasks/translation) | `translation` | Converting text from one language to another. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.TranslationPipeline) [(models)](https://huggingface.co/models?pipeline_tag=translation&library=transformers.js) |
| [Zero-Shot Classification](https://huggingface.co/tasks/zero-shot-classification) | `zero-shot-classification` | Classifying text into classes that are unseen during training. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ZeroShotClassificationPipeline) [(models)](https://huggingface.co/models?pipeline_tag=zero-shot-classification&library=transformers.js) |
+| [Feature Extraction](https://huggingface.co/tasks/feature-extraction) | `feature-extraction` | Transforming raw data into numerical features that can be processed while preserving the information in the original dataset. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.FeatureExtractionPipeline) [(models)](https://huggingface.co/models?pipeline_tag=feature-extraction&library=transformers.js) |
#### Vision
@@ -223,6 +224,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
| [Object Detection](https://huggingface.co/tasks/object-detection) | `object-detection` | Identify objects of certain defined classes within an image. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ObjectDetectionPipeline) [(models)](https://huggingface.co/models?pipeline_tag=object-detection&library=transformers.js) |
| [Video Classification](https://huggingface.co/tasks/video-classification) | n/a | Assigning a label or class to an entire video. | ❌ |
| [Unconditional Image Generation](https://huggingface.co/tasks/unconditional-image-generation) | n/a | Generating images with no condition in any context (like a prompt text or another image). | ❌ |
+| [Image Feature Extraction](https://huggingface.co/tasks/image-feature-extraction) | `image-feature-extraction` | Transforming raw data into numerical features that can be processed while preserving the information in the original image. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageFeatureExtractionPipeline) [(models)](https://huggingface.co/models?pipeline_tag=image-feature-extraction&library=transformers.js) |
#### Audio
@@ -247,7 +249,6 @@ You can refine your search by selecting the task you're interested in (e.g., [te
| Task | ID | Description | Supported? |
|--------------------------|----|-------------|------------|
| [Document Question Answering](https://huggingface.co/tasks/document-question-answering) | `document-question-answering` | Answering questions on document images. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.DocumentQuestionAnsweringPipeline) [(models)](https://huggingface.co/models?pipeline_tag=document-question-answering&library=transformers.js) |
-| [Feature Extraction](https://huggingface.co/tasks/feature-extraction) | `feature-extraction` | Transforming raw data into numerical features that can be processed while preserving the information in the original dataset. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.FeatureExtractionPipeline) [(models)](https://huggingface.co/models?pipeline_tag=feature-extraction&library=transformers.js) |
| [Image-to-Text](https://huggingface.co/tasks/image-to-text) | `image-to-text` | Output text from a given image. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageToTextPipeline) [(models)](https://huggingface.co/models?pipeline_tag=image-to-text&library=transformers.js) |
| [Text-to-Image](https://huggingface.co/tasks/text-to-image) | `text-to-image` | Generates images from input text. | ❌ |
| [Visual Question Answering](https://huggingface.co/tasks/visual-question-answering) | `visual-question-answering` | Answering open-ended questions based on an image. | ❌ |
@@ -294,6 +295,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
+1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme.
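The Feature Extraction row added above (and removed from the Multimodal table further down) now lives in the NLP table. For reference, a minimal sketch of that pipeline, using a hosted sentence-embedding checkpoint as in the library's documentation:

```js
import { pipeline } from '@xenova/transformers';

// Compute sentence embeddings (mean-pooled and L2-normalized).
const extractor = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
const embeddings = await extractor('This framework runs Transformers in the browser.', {
    pooling: 'mean',
    normalize: true,
});
// Tensor of dims [1, 384] for this checkpoint
```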
diff --git a/docs/snippets/2_installation.snippet b/docs/snippets/2_installation.snippet
index c090b0072..4c6205144 100644
--- a/docs/snippets/2_installation.snippet
+++ b/docs/snippets/2_installation.snippet
@@ -7,6 +7,6 @@ npm i @xenova/transformers
Alternatively, you can use it in vanilla JS, without any bundler, by using a CDN or static hosting. For example, using [ES Modules](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Modules), you can import the library with:
```html
<script type="module">
-    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@2.16.0';
+    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@2.16.1';
</script>
```
diff --git a/docs/snippets/4_custom-usage.snippet b/docs/snippets/4_custom-usage.snippet
index 8ee04a23f..6a3fff9ec 100644
--- a/docs/snippets/4_custom-usage.snippet
+++ b/docs/snippets/4_custom-usage.snippet
@@ -1,6 +1,6 @@
-By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@xenova/transformers@2.16.0/dist/), which should work out-of-the-box. You can customize this as follows:
+By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@xenova/transformers@2.16.1/dist/), which should work out-of-the-box. You can customize this as follows:
### Settings
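The `### Settings` heading above introduces the customization options that the version bump points at (the jsDelivr URL for the WASM binaries). A minimal sketch of the kind of overrides that section describes; the paths below are placeholders, not values from this diff:

```js
import { env } from '@xenova/transformers';

// Serve models from your own server instead of the Hugging Face Hub.
env.allowRemoteModels = false;
env.localModelPath = '/path/to/models/';              // placeholder

// Host the precompiled ONNX Runtime WASM binaries yourself
// rather than loading them from the jsDelivr CDN.
env.backends.onnx.wasm.wasmPaths = '/path/to/dist/';  // placeholder
```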
diff --git a/docs/snippets/5_supported-tasks.snippet b/docs/snippets/5_supported-tasks.snippet
index 838026092..ac71ee528 100644
--- a/docs/snippets/5_supported-tasks.snippet
+++ b/docs/snippets/5_supported-tasks.snippet
@@ -17,6 +17,7 @@
| [Token Classification](https://huggingface.co/tasks/token-classification) | `token-classification` or `ner` | Assigning a label to each token in a text. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.TokenClassificationPipeline) [(models)](https://huggingface.co/models?pipeline_tag=token-classification&library=transformers.js) |
| [Translation](https://huggingface.co/tasks/translation) | `translation` | Converting text from one language to another. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.TranslationPipeline) [(models)](https://huggingface.co/models?pipeline_tag=translation&library=transformers.js) |
| [Zero-Shot Classification](https://huggingface.co/tasks/zero-shot-classification) | `zero-shot-classification` | Classifying text into classes that are unseen during training. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ZeroShotClassificationPipeline) [(models)](https://huggingface.co/models?pipeline_tag=zero-shot-classification&library=transformers.js) |
+| [Feature Extraction](https://huggingface.co/tasks/feature-extraction) | `feature-extraction` | Transforming raw data into numerical features that can be processed while preserving the information in the original dataset. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.FeatureExtractionPipeline) [(models)](https://huggingface.co/models?pipeline_tag=feature-extraction&library=transformers.js) |
#### Vision
@@ -30,6 +31,7 @@
| [Object Detection](https://huggingface.co/tasks/object-detection) | `object-detection` | Identify objects of certain defined classes within an image. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ObjectDetectionPipeline) [(models)](https://huggingface.co/models?pipeline_tag=object-detection&library=transformers.js) |
| [Video Classification](https://huggingface.co/tasks/video-classification) | n/a | Assigning a label or class to an entire video. | ❌ |
| [Unconditional Image Generation](https://huggingface.co/tasks/unconditional-image-generation) | n/a | Generating images with no condition in any context (like a prompt text or another image). | ❌ |
+| [Image Feature Extraction](https://huggingface.co/tasks/image-feature-extraction) | `image-feature-extraction` | Transforming raw data into numerical features that can be processed while preserving the information in the original image. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageFeatureExtractionPipeline) [(models)](https://huggingface.co/models?pipeline_tag=image-feature-extraction&library=transformers.js) |
#### Audio
@@ -54,7 +56,6 @@
| Task | ID | Description | Supported? |
|--------------------------|----|-------------|------------|
| [Document Question Answering](https://huggingface.co/tasks/document-question-answering) | `document-question-answering` | Answering questions on document images. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.DocumentQuestionAnsweringPipeline) [(models)](https://huggingface.co/models?pipeline_tag=document-question-answering&library=transformers.js) |
-| [Feature Extraction](https://huggingface.co/tasks/feature-extraction) | `feature-extraction` | Transforming raw data into numerical features that can be processed while preserving the information in the original dataset. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.FeatureExtractionPipeline) [(models)](https://huggingface.co/models?pipeline_tag=feature-extraction&library=transformers.js) |
| [Image-to-Text](https://huggingface.co/tasks/image-to-text) | `image-to-text` | Output text from a given image. | ✅ [(docs)](https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ImageToTextPipeline) [(models)](https://huggingface.co/models?pipeline_tag=image-to-text&library=transformers.js) |
| [Text-to-Image](https://huggingface.co/tasks/text-to-image) | `text-to-image` | Generates images from input text. | ❌ |
| [Visual Question Answering](https://huggingface.co/tasks/visual-question-answering) | `visual-question-answering` | Answering open-ended questions based on an image. | ❌ |
diff --git a/docs/snippets/6_supported-models.snippet b/docs/snippets/6_supported-models.snippet
index 78948cf47..ac09bdbdb 100644
--- a/docs/snippets/6_supported-models.snippet
+++ b/docs/snippets/6_supported-models.snippet
@@ -29,6 +29,7 @@
1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
+1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme.
diff --git a/examples/tokenizer-playground/src/App.jsx b/examples/tokenizer-playground/src/App.jsx
index 9105e0f20..98173f8fb 100644
--- a/examples/tokenizer-playground/src/App.jsx
+++ b/examples/tokenizer-playground/src/App.jsx
@@ -4,12 +4,16 @@ import { Token } from './components/Token'
function App() {
+ // Allow user to set tokenizer and text via URL query parameters
+ const urlParams = new URLSearchParams(window.location.search);
+ const tokenizerParam = urlParams.get('tokenizer');
+ const textParam = urlParams.get('text');
const [tokenIds, setTokenIds] = useState([])
const [decodedTokens, setDecodedTokens] = useState([])
const [margins, setMargins] = useState([])
const [outputOption, setOutputOption] = useState('text');
- const [tokenizer, setTokenizer] = useState('Xenova/gpt-4');
+ const [tokenizer, setTokenizer] = useState(tokenizerParam ?? 'Xenova/gpt-4');
const textareaRef = useRef(null);
const outputRef = useRef(null);
@@ -51,6 +55,12 @@ function App() {
worker.current.postMessage({ model_id, text });
}, [tokenizer]);
+ useEffect(() => {
+ if (textParam) {
+ onInputChange({ target: { value: textParam } });
+ }
+ }, [onInputChange, textParam]);
+
const onTokenizerChange = useCallback((e) => {
const model_id = e.target.value;
setTokenizer(model_id);
@@ -70,10 +80,12 @@ function App() {
-
+
+
+
@@ -86,6 +98,7 @@ function App() {
rows="8"
className="font-mono text-lg block w-full p-2.5 text-gray-900 bg-gray-50 rounded-lg border border-gray-200"
placeholder="Enter some text"
+ defaultValue={textParam ?? textareaRef.current?.value ?? ''}
>
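A short usage note on the App.jsx change above: the playground can now be deep-linked via the two query parameters it reads on load. The origin below is a placeholder; only the query string matters:

```js
// Prefill the tokenizer and input text via the URL,
// e.g. https://example.com/?tokenizer=Xenova/gpt-4&text=Hello%20world
const params = new URLSearchParams({ tokenizer: 'Xenova/gpt-4', text: 'Hello world' });
const link = `https://example.com/?${params}`; // placeholder origin
```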
diff --git a/examples/tokenizer-playground/src/worker.js b/examples/tokenizer-playground/src/worker.js
index e3739e572..4db09bdc0 100644
--- a/examples/tokenizer-playground/src/worker.js
+++ b/examples/tokenizer-playground/src/worker.js
@@ -22,6 +22,7 @@ self.addEventListener('message', async (event) => {
// NOTE: We just remove the StripDecoder from the llama tokenizer
switch (tokenizer.constructor.name) {
case 'LlamaTokenizer':
+ case 'Grok1Tokenizer':
// tokenizer.decoder.decoders.at(-1).constructor.name === 'StripDecoder'
tokenizer.decoder.decoders.pop();
break;
diff --git a/package-lock.json b/package-lock.json
index e3a171abe..dc479579c 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,15 +1,15 @@
{
"name": "@xenova/transformers",
- "version": "2.16.0",
+ "version": "2.16.1",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@xenova/transformers",
- "version": "2.16.0",
+ "version": "2.16.1",
"license": "Apache-2.0",
"dependencies": {
- "@huggingface/jinja": "^0.2.1",
+ "@huggingface/jinja": "^0.2.2",
"onnxruntime-web": "1.14.0",
"sharp": "^0.32.0"
},
@@ -745,9 +745,9 @@
}
},
"node_modules/@huggingface/jinja": {
- "version": "0.2.1",
- "resolved": "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.2.1.tgz",
- "integrity": "sha512-HxjVCll8oGfgUQmN91NYWCjfuaQ5mYZkc/BB1gjfp28q3s48yiB5jUEV7BvaRdIAb/+14cNdX8TIdalFykwywA==",
+ "version": "0.2.2",
+ "resolved": "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.2.2.tgz",
+ "integrity": "sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA==",
"engines": {
"node": ">=18"
}
@@ -3587,9 +3587,9 @@
"integrity": "sha512-c7CZADjRcl6j0PlvFy0ZqXQ67qSEZfrVPynmnL+2zPc+NtMvrF8Y0QceMo7QqnSPc7+uWjUIAbvCQ5WIKlMVdQ=="
},
"node_modules/follow-redirects": {
- "version": "1.15.4",
- "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz",
- "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==",
+ "version": "1.15.6",
+ "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz",
+ "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==",
"dev": true,
"funding": [
{
diff --git a/package.json b/package.json
index 446fd5f16..ed20d1d61 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "@xenova/transformers",
- "version": "2.16.0",
+ "version": "2.16.1",
"description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
"main": "./src/transformers.js",
"types": "./types/transformers.d.ts",
@@ -40,7 +40,7 @@
"dependencies": {
"onnxruntime-web": "1.14.0",
"sharp": "^0.32.0",
- "@huggingface/jinja": "^0.2.1"
+ "@huggingface/jinja": "^0.2.2"
},
"optionalDependencies": {
"onnxruntime-node": "1.14.0"
diff --git a/src/env.js b/src/env.js
index 52f3f73e9..109a065e4 100644
--- a/src/env.js
+++ b/src/env.js
@@ -29,7 +29,7 @@ import url from 'url';
import { ONNX } from './backends/onnx.js';
const { env: onnx_env } = ONNX;
-const VERSION = '2.16.0';
+const VERSION = '2.16.1';
// Check if various APIs are available (depends on environment)
const WEB_CACHE_AVAILABLE = typeof self !== 'undefined' && 'caches' in self;
diff --git a/src/models.js b/src/models.js
index 15b656f44..65f378b5a 100644
--- a/src/models.js
+++ b/src/models.js
@@ -3779,11 +3779,7 @@ export class VitMattePreTrainedModel extends PreTrainedModel { }
* import { Tensor, cat } from '@xenova/transformers';
*
* // Visualize predicted alpha matte
- * const imageTensor = new Tensor(
- * 'uint8',
- * new Uint8Array(image.data),
- * [image.height, image.width, image.channels]
- * ).transpose(2, 0, 1);
+ * const imageTensor = image.toTensor();
*
* // Convert float (0-1) alpha matte to uint8 (0-255)
* const alphaChannel = alphas
@@ -5411,6 +5407,29 @@ export class StableLmModel extends StableLmPreTrainedModel { }
export class StableLmForCausalLM extends StableLmPreTrainedModel { }
//////////////////////////////////////////////////
+
+//////////////////////////////////////////////////
+export class EfficientNetPreTrainedModel extends PreTrainedModel { }
+
+/**
+ * The bare EfficientNet model outputting raw features without any specific head on top.
+ */
+export class EfficientNetModel extends EfficientNetPreTrainedModel { }
+
+/**
+ * EfficientNet Model with an image classification head on top (a linear layer on top of the pooled features).
+ */
+export class EfficientNetForImageClassification extends EfficientNetPreTrainedModel {
+ /**
+ * @param {any} model_inputs
+ */
+ async _call(model_inputs) {
+ return new SequenceClassifierOutput(await super._call(model_inputs));
+ }
+}
+//////////////////////////////////////////////////
+
+
//////////////////////////////////////////////////
// AutoModels, used to simplify construction of PreTrainedModels
// (uses config to instantiate correct class)
@@ -5532,6 +5551,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
['glpn', ['GLPNModel', GLPNModel]],
['hifigan', ['SpeechT5HifiGan', SpeechT5HifiGan]],
+ ['efficientnet', ['EfficientNetModel', EfficientNetModel]],
]);
@@ -5706,6 +5726,7 @@ const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
['resnet', ['ResNetForImageClassification', ResNetForImageClassification]],
['swin', ['SwinForImageClassification', SwinForImageClassification]],
['segformer', ['SegformerForImageClassification', SegformerForImageClassification]],
+ ['efficientnet', ['EfficientNetForImageClassification', EfficientNetForImageClassification]],
]);
const MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = new Map([
@@ -5775,6 +5796,12 @@ const MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = new Map([
['glpn', ['GLPNForDepthEstimation', GLPNForDepthEstimation]],
])
+// NOTE: This is custom to Transformers.js, and is necessary because certain models
+// (e.g., CLIP) are split into vision and text components
+const MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES = new Map([
+ ['clip', ['CLIPVisionModelWithProjection', CLIPVisionModelWithProjection]],
+ ['siglip', ['SiglipVisionModel', SiglipVisionModel]],
+])
const MODEL_CLASS_TYPE_MAPPING = [
[MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES.EncoderOnly],
@@ -5803,6 +5830,9 @@ const MODEL_CLASS_TYPE_MAPPING = [
[MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
[MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
[MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
+
+ // Custom:
+ [MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
];
for (const [mappings, type] of MODEL_CLASS_TYPE_MAPPING) {
@@ -5816,9 +5846,7 @@ for (const [mappings, type] of MODEL_CLASS_TYPE_MAPPING) {
const CUSTOM_MAPPING = [
['CLIPTextModelWithProjection', CLIPTextModelWithProjection, MODEL_TYPES.EncoderOnly],
- ['CLIPVisionModelWithProjection', CLIPVisionModelWithProjection, MODEL_TYPES.EncoderOnly],
['SiglipTextModel', SiglipTextModel, MODEL_TYPES.EncoderOnly],
- ['SiglipVisionModel', SiglipVisionModel, MODEL_TYPES.EncoderOnly],
['ClapTextModelWithProjection', ClapTextModelWithProjection, MODEL_TYPES.EncoderOnly],
['ClapAudioModelWithProjection', ClapAudioModelWithProjection, MODEL_TYPES.EncoderOnly],
]
@@ -6045,6 +6073,10 @@ export class AutoModelForDepthEstimation extends PretrainedMixin {
static MODEL_CLASS_MAPPINGS = [MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES];
}
+export class AutoModelForImageFeatureExtraction extends PretrainedMixin {
+ static MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES];
+}
+
//////////////////////////////////////////////////
//////////////////////////////////////////////////
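A hedged sketch of exercising the new `MODEL_FOR_IMAGE_FEATURE_EXTRACTION` mapping directly, assuming a hosted CLIP conversion whose vision tower is exported as `CLIPVisionModelWithProjection` (the output name follows that export):

```js
import { AutoProcessor, AutoModelForImageFeatureExtraction, RawImage } from '@xenova/transformers';

// Loads only the vision tower of the CLIP checkpoint via the new auto class.
const processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch32');
const vision_model = await AutoModelForImageFeatureExtraction.from_pretrained('Xenova/clip-vit-base-patch32');

const image = await RawImage.read('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png');
const { pixel_values } = await processor(image);
const { image_embeds } = await vision_model({ pixel_values }); // Tensor of dims [1, 512]
```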
diff --git a/src/pipelines.js b/src/pipelines.js
index 4101bf1cb..13528aba4 100644
--- a/src/pipelines.js
+++ b/src/pipelines.js
@@ -39,6 +39,7 @@ import {
AutoModelForDocumentQuestionAnswering,
AutoModelForImageToImage,
AutoModelForDepthEstimation,
+ AutoModelForImageFeatureExtraction,
PreTrainedModel,
} from './models.js';
import {
@@ -1206,6 +1207,82 @@ export class FeatureExtractionPipeline extends (/** @type {new (options: TextPip
}
}
+
+/**
+ * @typedef {Object} ImageFeatureExtractionPipelineOptions Parameters specific to image feature extraction pipelines.
+ * @property {boolean} [pool=null] Whether or not to return the pooled output. If set to `false`, the model will return the raw hidden states.
+ *
+ * @callback ImageFeatureExtractionPipelineCallback Extract the features of the input(s).
+ * @param {ImagePipelineInputs} images One or several images (or one list of images) to get the features of.
+ * @param {ImageFeatureExtractionPipelineOptions} [options] The options to use for image feature extraction.
+ * @returns {Promise<Tensor>} The image features computed by the model.
+ *
+ * @typedef {ImagePipelineConstructorArgs & ImageFeatureExtractionPipelineCallback & Disposable} ImageFeatureExtractionPipelineType
+ */
+
+/**
+ * Image feature extraction pipeline using no model head. This pipeline extracts the hidden
+ * states from the base transformer, which can be used as features in downstream tasks.
+ *
+ * **Example:** Perform image feature extraction with `Xenova/vit-base-patch16-224-in21k`.
+ * ```javascript
+ * const image_feature_extractor = await pipeline('image-feature-extraction', 'Xenova/vit-base-patch16-224-in21k');
+ * const url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png';
+ * const features = await image_feature_extractor(url);
+ * // Tensor {
+ * // dims: [ 1, 197, 768 ],
+ * // type: 'float32',
+ * // data: Float32Array(151296) [ ... ],
+ * // size: 151296
+ * // }
+ * ```
+ *
+ * **Example:** Compute image embeddings with `Xenova/clip-vit-base-patch32`.
+ * ```javascript
+ * const image_feature_extractor = await pipeline('image-feature-extraction', 'Xenova/clip-vit-base-patch32');
+ * const url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png';
+ * const features = await image_feature_extractor(url);
+ * // Tensor {
+ * // dims: [ 1, 512 ],
+ * // type: 'float32',
+ * // data: Float32Array(512) [ ... ],
+ * // size: 512
+ * // }
+ * ```
+ */
+export class ImageFeatureExtractionPipeline extends (/** @type {new (options: ImagePipelineConstructorArgs) => ImageFeatureExtractionPipelineType} */ (Pipeline)) {
+ /**
+ * Create a new ImageFeatureExtractionPipeline.
+ * @param {ImagePipelineConstructorArgs} options An object used to instantiate the pipeline.
+ */
+ constructor(options) {
+ super(options);
+ }
+
+ /** @type {ImageFeatureExtractionPipelineCallback} */
+ async _call(images, {
+ pool = null,
+ } = {}) {
+
+ const preparedImages = await prepareImages(images);
+ const { pixel_values } = await this.processor(preparedImages);
+ const outputs = await this.model({ pixel_values });
+
+ /** @type {Tensor} */
+ let result;
+ if (pool) {
+ if (!('pooler_output' in outputs)) {
+ throw Error(`No pooled output was returned. Make sure the model has a 'pooler' layer when using the 'pool' option.`);
+ }
+ result = outputs.pooler_output;
+
+ } else {
+ result = outputs.last_hidden_state ?? outputs.logits ?? outputs.image_embeds;
+ }
+ return result;
+ }
+}
+
// TODO
// export class SentenceSimilarityPipeline extends Pipeline {
// }
@@ -2953,6 +3030,17 @@ const SUPPORTED_TASKS = Object.freeze({
},
"type": "text",
},
+ "image-feature-extraction": {
+ "processor": AutoProcessor,
+ "pipeline": ImageFeatureExtractionPipeline,
+ "model": [AutoModelForImageFeatureExtraction, AutoModel],
+ "default": {
+ // TODO: replace with original
+ // "model": "google/vit-base-patch16-224",
+ "model": "Xenova/vit-base-patch16-224-in21k",
+ },
+ "type": "image",
+ },
})
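A sketch of the `pool` option handled in `ImageFeatureExtractionPipeline._call` above, using the task's default checkpoint; it assumes the exported model actually provides a `pooler_output`, which is exactly what the error message guards against:

```js
import { pipeline } from '@xenova/transformers';

const image_feature_extractor = await pipeline('image-feature-extraction', 'Xenova/vit-base-patch16-224-in21k');
const url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png';

// Without `pool`, the raw hidden states are returned ([1, 197, 768] for this checkpoint).
// With `pool: true`, the pipeline returns `pooler_output` instead, or throws
// if the exported model has no pooler layer.
const pooled = await image_feature_extractor(url, { pool: true });
```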
diff --git a/src/processors.js b/src/processors.js
index a8b82e913..4713a6ae2 100644
--- a/src/processors.js
+++ b/src/processors.js
@@ -33,10 +33,11 @@ import {
min,
max,
softmax,
+ bankers_round,
} from './utils/maths.js';
-import { Tensor, transpose, cat, interpolate, stack } from './utils/tensor.js';
+import { Tensor, permute, cat, interpolate, stack } from './utils/tensor.js';
import { RawImage } from './utils/image.js';
import {
@@ -174,14 +175,15 @@ function validate_audio_inputs(audio, feature_extractor) {
* @private
*/
function constraint_to_multiple_of(val, multiple, minVal = 0, maxVal = null) {
- let x = Math.round(val / multiple) * multiple;
+ const a = val / multiple;
+ let x = bankers_round(a) * multiple;
if (maxVal !== null && x > maxVal) {
- x = Math.floor(val / multiple) * multiple;
+ x = Math.floor(a) * multiple;
}
if (x < minVal) {
- x = Math.ceil(val / multiple) * multiple;
+ x = Math.ceil(a) * multiple;
}
return x;
@@ -195,8 +197,8 @@ function constraint_to_multiple_of(val, multiple, minVal = 0, maxVal = null) {
*/
function enforce_size_divisibility([width, height], divisor) {
return [
- Math.floor(width / divisor) * divisor,
- Math.floor(height / divisor) * divisor
+ Math.max(Math.floor(width / divisor), 1) * divisor,
+ Math.max(Math.floor(height / divisor), 1) * divisor
];
}
@@ -348,7 +350,7 @@ export class ImageFeatureExtractor extends FeatureExtractor {
/**
* Pad the image by a certain amount.
* @param {Float32Array} pixelData The pixel data to pad.
- * @param {number[]} imgDims The dimensions of the image.
+ * @param {number[]} imgDims The dimensions of the image (height, width, channels).
* @param {{width:number; height:number}|number} padSize The dimensions of the padded image.
* @param {Object} options The options for padding.
* @param {'constant'|'symmetric'} [options.mode='constant'] The type of padding to add.
@@ -361,7 +363,7 @@ export class ImageFeatureExtractor extends FeatureExtractor {
center = false,
constant_values = 0,
} = {}) {
- const [imageWidth, imageHeight, imageChannels] = imgDims;
+ const [imageHeight, imageWidth, imageChannels] = imgDims;
let paddedImageWidth, paddedImageHeight;
if (typeof padSize === 'number') {
@@ -513,8 +515,8 @@ export class ImageFeatureExtractor extends FeatureExtractor {
if (this.config.keep_aspect_ratio && this.config.ensure_multiple_of) {
// determine new height and width
- let scale_height = size.height / srcHeight;
- let scale_width = size.width / srcWidth;
+ let scale_height = newHeight / srcHeight;
+ let scale_width = newWidth / srcWidth;
// scale as little as possible
if (Math.abs(1 - scale_width) < Math.abs(1 - scale_height)) {
@@ -616,6 +618,9 @@ export class ImageFeatureExtractor extends FeatureExtractor {
/** @type {HeightWidth} */
const reshaped_input_size = [image.height, image.width];
+ // NOTE: All pixel-level manipulation (i.e., modifying `pixelData`)
+ // occurs with data in the hwc format (height, width, channels),
+ // to emulate the behavior of the original Python code (w/ numpy).
let pixelData = Float32Array.from(image.data);
let imgDims = [image.height, image.width, image.channels];
@@ -640,27 +645,29 @@ export class ImageFeatureExtractor extends FeatureExtractor {
for (let i = 0; i < pixelData.length; i += image.channels) {
for (let j = 0; j < image.channels; ++j) {
- pixelData[i + j] = (pixelData[i + j] - this.image_mean[j]) / this.image_std[j];
+ pixelData[i + j] = (pixelData[i + j] - image_mean[j]) / image_std[j];
}
}
}
// do padding after rescaling/normalizing
- if (do_pad ?? (this.do_pad && this.pad_size)) {
- const padded = this.pad_image(pixelData, [image.width, image.height, image.channels], this.pad_size);
- [pixelData, imgDims] = padded; // Update pixel data and image dimensions
+ if (do_pad ?? this.do_pad) {
+ if (this.pad_size) {
+ const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
+ [pixelData, imgDims] = padded; // Update pixel data and image dimensions
+ } else if (this.size_divisibility) {
+ const [paddedWidth, paddedHeight] = enforce_size_divisibility([imgDims[1], imgDims[0]], this.size_divisibility);
+ [pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
+ }
}
- // Create HWC tensor
- const img = new Tensor('float32', pixelData, imgDims);
-
- // convert to channel dimension format:
- const transposed = transpose(img, [2, 0, 1]); // hwc -> chw
+ const pixel_values = new Tensor('float32', pixelData, imgDims)
+ .permute(2, 0, 1); // convert to channel dimension format (hwc -> chw)
return {
original_size: [srcHeight, srcWidth],
reshaped_input_size: reshaped_input_size,
- pixel_values: transposed,
+ pixel_values: pixel_values,
}
}
@@ -760,9 +767,9 @@ export class SegformerFeatureExtractor extends ImageFeatureExtractor {
return toReturn;
}
}
-export class DPTImageProcessor extends ImageFeatureExtractor { }
-export class BitImageProcessor extends ImageFeatureExtractor { }
export class DPTFeatureExtractor extends ImageFeatureExtractor { }
+export class DPTImageProcessor extends DPTFeatureExtractor { } // NOTE: extends DPTFeatureExtractor
+export class BitImageProcessor extends ImageFeatureExtractor { }
export class GLPNFeatureExtractor extends ImageFeatureExtractor { }
export class CLIPFeatureExtractor extends ImageFeatureExtractor { }
export class ChineseCLIPFeatureExtractor extends ImageFeatureExtractor { }
@@ -811,6 +818,17 @@ export class ConvNextImageProcessor extends ConvNextFeatureExtractor { } // NOT
export class ViTFeatureExtractor extends ImageFeatureExtractor { }
export class ViTImageProcessor extends ImageFeatureExtractor { }
+export class EfficientNetImageProcessor extends ImageFeatureExtractor {
+ constructor(config) {
+ super(config);
+ this.include_top = this.config.include_top ?? true;
+ if (this.include_top) {
+ this.image_std = this.image_std.map(x => x * x);
+ }
+ }
+}
+
+
export class MobileViTFeatureExtractor extends ImageFeatureExtractor { }
export class OwlViTFeatureExtractor extends ImageFeatureExtractor {
/** @type {post_process_object_detection} */
@@ -824,7 +842,7 @@ export class DeiTFeatureExtractor extends ImageFeatureExtractor { }
export class BeitFeatureExtractor extends ImageFeatureExtractor { }
export class DonutFeatureExtractor extends ImageFeatureExtractor {
pad_image(pixelData, imgDims, padSize, options = {}) {
- const [imageWidth, imageHeight, imageChannels] = imgDims;
+ const [imageHeight, imageWidth, imageChannels] = imgDims;
let image_mean = this.image_mean;
if (!Array.isArray(this.image_mean)) {
@@ -836,7 +854,7 @@ export class DonutFeatureExtractor extends ImageFeatureExtractor {
image_std = new Array(imageChannels).fill(image_mean);
}
- const constant_values = image_mean.map((x, i) => - x / this.image_std[i]);
+ const constant_values = image_mean.map((x, i) => - x / image_std[i]);
return super.pad_image(pixelData, imgDims, padSize, {
center: true,
@@ -1371,7 +1389,7 @@ export class Swin2SRImageProcessor extends ImageFeatureExtractor {
pad_image(pixelData, imgDims, padSize, options = {}) {
// NOTE: In this case, `padSize` represents the size of the sliding window for the local attention.
// In other words, the image is padded so that its width and height are multiples of `padSize`.
- const [imageWidth, imageHeight, imageChannels] = imgDims;
+ const [imageHeight, imageWidth, imageChannels] = imgDims;
return super.pad_image(pixelData, imgDims, {
// NOTE: For Swin2SR models, the original python implementation adds padding even when the image's width/height is already
@@ -2132,6 +2150,7 @@ export class AutoProcessor {
YolosFeatureExtractor,
DonutFeatureExtractor,
NougatImageProcessor,
+ EfficientNetImageProcessor,
ViTImageProcessor,
VitMatteImageProcessor,
diff --git a/src/tokenizers.js b/src/tokenizers.js
index 9692cf3b0..5b58e37c0 100644
--- a/src/tokenizers.js
+++ b/src/tokenizers.js
@@ -2519,6 +2519,18 @@ export class PreTrainedTokenizer extends Callable {
this.legacy = false;
this.chat_template = tokenizerConfig.chat_template ?? null;
+ if (Array.isArray(this.chat_template)) {
+ // Chat templates are stored as lists of dicts with fixed key names,
+ // we reconstruct that into a single dict while loading them.
+ const chat_template = Object.create(null);
+ for (const { name, template } of this.chat_template) {
+ if (typeof name !== 'string' || typeof template !== 'string') {
+ throw new Error('Chat template must be a list of objects with "name" and "template" properties');
+ }
+ chat_template[name] = template;
+ }
+ this.chat_template = chat_template;
+ }
this._compiled_template_cache = new Map();
}
@@ -2995,6 +3007,7 @@ export class PreTrainedTokenizer extends Callable {
* @param {number} [options.max_length=null] Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is false.
* If not specified, the tokenizer's `max_length` attribute will be used as a default.
* @param {boolean} [options.return_tensor=true] Whether to return the output as a Tensor or an Array. Has no effect if tokenize is false.
+ * @param {Object} [options.tokenizer_kwargs={}] Additional options to pass to the tokenizer.
* @returns {string | Tensor | number[] | number[][]} The tokenized output.
*/
apply_chat_template(conversation, {
@@ -3005,9 +3018,37 @@ export class PreTrainedTokenizer extends Callable {
truncation = false,
max_length = null,
return_tensor = true,
+ tokenizer_kwargs = {},
+ ...kwargs
} = {}) {
- chat_template ??= this.chat_template ?? this.default_chat_template;
+ // First, handle the cases when the model has a dict of multiple templates
+ if (
+ (this.chat_template && typeof this.chat_template === 'object') ||
+ (this.chat_template === null && this.default_chat_template && typeof this.default_chat_template === 'object')
+ ) {
+ const template_dict = this.chat_template ?? this.default_chat_template; // Guaranteed to be a non-null object
+
+ if (chat_template !== null && Object.hasOwn(template_dict, chat_template)) {
+ // The user can pass the name of a template to the chat template argument instead of an entire template
+ chat_template = template_dict[chat_template];
+ } else if (chat_template === null && 'default' in template_dict) {
+ chat_template = template_dict['default'];
+ } else if (chat_template === null) {
+ throw Error(
+ `This model has multiple chat templates with no default specified! Please either pass a chat ` +
+ `template or the name of the template you wish to use to the 'chat_template' argument. Available ` +
+ `template names are ${Object.keys(template_dict).sort()}.`
+ )
+ }
+ } else {
+ // These are the cases when the model has a single template
+ // priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template`
+ chat_template ??= this.chat_template ?? this.default_chat_template;
+ }
+ if (typeof chat_template !== 'string') {
+ throw Error(`chat_template must be a string, but got ${typeof chat_template}`);
+ }
// Compilation function uses a cache to avoid recompiling the same template
let compiledTemplate = this._compiled_template_cache.get(chat_template);
@@ -3029,6 +3070,7 @@ export class PreTrainedTokenizer extends Callable {
add_generation_prompt: add_generation_prompt,
...special_tokens_map,
+ ...kwargs,
});
if (tokenize) {
@@ -3038,6 +3080,7 @@ export class PreTrainedTokenizer extends Callable {
truncation,
max_length,
return_tensor,
+ ...tokenizer_kwargs,
}).input_ids;
}
@@ -3208,6 +3251,8 @@ export class GemmaTokenizer extends PreTrainedTokenizer {
_default_chat_template = "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"
}
+export class Grok1Tokenizer extends PreTrainedTokenizer { }
+
/**
* Helper function to build translation inputs for an `NllbTokenizer` or `M2M100Tokenizer`.
* @param {PreTrainedTokenizer} self The tokenizer instance.
@@ -4263,6 +4308,9 @@ export class VitsTokenizer extends PreTrainedTokenizer {
this.decoder = new VitsDecoder({});
}
}
+
+export class CohereTokenizer extends PreTrainedTokenizer { }
+
/**
* Helper class which is used to instantiate pretrained tokenizers with the `from_pretrained` function.
* The chosen tokenizer class is determined by the type specified in the tokenizer config.
@@ -4314,6 +4362,8 @@ export class AutoTokenizer {
VitsTokenizer,
Qwen2Tokenizer,
GemmaTokenizer,
+ Grok1Tokenizer,
+ CohereTokenizer,
// Base case:
PreTrainedTokenizer,
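For context on the array-to-object conversion and template lookup added above, a sketch of the two shapes involved; the field contents are abbreviated, and the `rag` name matches the Cohere test further down:

```js
// tokenizer_config.json may ship several named templates:
//   "chat_template": [
//     { "name": "default", "template": "{{ bos_token }}{% for message in messages %}..." },
//     { "name": "rag",     "template": "..." }
//   ]
// After loading, they are keyed by name, and one is selected by passing its name:
const prompt = tokenizer.apply_chat_template(conversation, {
    chat_template: 'rag', // the template *name*, not a template string
    tokenize: false,
    add_generation_prompt: true,
});
```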
diff --git a/src/utils/image.js b/src/utils/image.js
index 2d12cb876..1ee77d900 100644
--- a/src/utils/image.js
+++ b/src/utils/image.js
@@ -10,6 +10,7 @@
import { getFile } from './hub.js';
import { env } from '../env.js';
+import { Tensor } from './tensor.js';
// Will be empty (or not used) if running in browser or web-worker
import sharp from 'sharp';
@@ -166,7 +167,7 @@ export class RawImage {
/**
* Helper method to create a new Image from a tensor
- * @param {import('./tensor.js').Tensor} tensor
+ * @param {Tensor} tensor
*/
static fromTensor(tensor, channel_format = 'CHW') {
if (tensor.dims.length !== 3) {
@@ -586,6 +587,23 @@ export class RawImage {
return await canvas.convertToBlob({ type, quality });
}
+ toTensor(channel_format = 'CHW') {
+ let tensor = new Tensor(
+ 'uint8',
+ new Uint8Array(this.data),
+ [this.height, this.width, this.channels]
+ );
+
+ if (channel_format === 'HWC') {
+ // Do nothing
+ } else if (channel_format === 'CHW') { // hwc -> chw
+ tensor = tensor.permute(2, 0, 1);
+ } else {
+ throw new Error(`Unsupported channel format: ${channel_format}`);
+ }
+ return tensor;
+ }
+
toCanvas() {
if (!BROWSER_ENV) {
throw new Error('toCanvas() is only supported in browser environments.')
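The `toTensor()` helper added above is what the VitMatte example in `src/models.js` now calls; the two forms below are equivalent, with the manual construction being the pattern it replaces:

```js
import { Tensor, RawImage } from '@xenova/transformers';

const image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/tiger.jpg');

// Before: build the tensor by hand, then convert hwc -> chw.
const manual = new Tensor(
    'uint8',
    new Uint8Array(image.data),
    [image.height, image.width, image.channels],
).permute(2, 0, 1);

// After: a single call; CHW by default, pass 'HWC' to keep the raw layout.
const viaHelper = image.toTensor();
```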
diff --git a/src/utils/maths.js b/src/utils/maths.js
index 216def07e..264b69fc7 100644
--- a/src/utils/maths.js
+++ b/src/utils/maths.js
@@ -88,15 +88,15 @@ export function interpolate_data(input, [in_channels, in_height, in_width], [out
/**
- * Helper method to transpose a `AnyTypedArray` directly
+ * Helper method to permute a `AnyTypedArray` directly
* @template {AnyTypedArray} T
* @param {T} array
* @param {number[]} dims
* @param {number[]} axes
- * @returns {[T, number[]]} The transposed array and the new shape.
+ * @returns {[T, number[]]} The permuted array and the new shape.
*/
-export function transpose_data(array, dims, axes) {
- // Calculate the new shape of the transposed array
+export function permute_data(array, dims, axes) {
+ // Calculate the new shape of the permuted array
// and the stride of the original array
const shape = new Array(axes.length);
const stride = new Array(axes.length);
@@ -110,21 +110,21 @@ export function transpose_data(array, dims, axes) {
// Precompute inverse mapping of stride
const invStride = axes.map((_, i) => stride[axes.indexOf(i)]);
- // Create the transposed array with the new shape
+ // Create the permuted array with the new shape
// @ts-ignore
- const transposedData = new array.constructor(array.length);
+ const permutedData = new array.constructor(array.length);
- // Transpose the original array to the new array
+ // Permute the original array to the new array
for (let i = 0; i < array.length; ++i) {
let newIndex = 0;
for (let j = dims.length - 1, k = i; j >= 0; --j) {
newIndex += (k % dims[j]) * invStride[j];
k = Math.floor(k / dims[j]);
}
- transposedData[newIndex] = array[i];
+ permutedData[newIndex] = array[i];
}
- return [transposedData, shape];
+ return [permutedData, shape];
}
@@ -952,3 +952,17 @@ export function round(num, decimals) {
const pow = Math.pow(10, decimals);
return Math.round(num * pow) / pow;
}
+
+/**
+ * Helper function to round a number to the nearest integer, with ties rounded to the nearest even number.
+ * Also known as "bankers' rounding". This is the default rounding mode in python. For example:
+ * 1.5 rounds to 2 and 2.5 rounds to 2.
+ *
+ * @param {number} x The number to round
+ * @returns {number} The rounded number
+ */
+export function bankers_round(x) {
+ const r = Math.round(x);
+ const br = Math.abs(x) % 1 === 0.5 ? (r % 2 === 0 ? r : r - 1) : r;
+ return br;
+}
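To make the change concrete: the DPT test added below resizes a 64×32 checkerboard, and (assuming that processor's config uses `size: 518` with `ensure_multiple_of: 14`, which is what the reported shapes imply) the scaled height works out to 259, so `constraint_to_multiple_of` has to round 259 / 14 = 18.5:

```js
// bankers_round as defined above; Python's round() behaves the same way.
bankers_round(18.5) * 14; // 252 -> pixel_values dims [1, 3, 252, 518]
Math.round(18.5) * 14;    // 266 -> the previous, incorrect [1, 3, 266, 518]
```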
diff --git a/src/utils/tensor.js b/src/utils/tensor.js
index 819c2dbb6..ccdf781be 100644
--- a/src/utils/tensor.js
+++ b/src/utils/tensor.js
@@ -11,7 +11,7 @@ import { ONNX } from '../backends/onnx.js';
import {
interpolate_data,
- transpose_data
+ permute_data
} from './maths.js';
@@ -309,16 +309,18 @@ export class Tensor {
}
/**
- * Return a transposed version of this Tensor, according to the provided dimensions.
- * @param {...number} dims Dimensions to transpose.
- * @returns {Tensor} The transposed tensor.
+ * Return a permuted version of this Tensor, according to the provided dimensions.
+ * @param {...number} dims Dimensions to permute.
+ * @returns {Tensor} The permuted tensor.
*/
- transpose(...dims) {
- return transpose(this, dims);
+ permute(...dims) {
+ return permute(this, dims);
}
- // TODO: rename transpose to permute
- // TODO: implement transpose
+ // TODO: implement transpose. For now (backwards compatibility), it's just an alias for permute()
+ transpose(...dims) {
+ return this.permute(...dims);
+ }
// TODO add .max() and .min() methods
@@ -680,14 +682,14 @@ function reshape(data, dimensions) {
}
/**
- * Transposes a tensor according to the provided axes.
- * @param {any} tensor The input tensor to transpose.
- * @param {Array} axes The axes to transpose the tensor along.
- * @returns {Tensor} The transposed tensor.
+ * Permutes a tensor according to the provided axes.
+ * @param {any} tensor The input tensor to permute.
+ * @param {Array} axes The axes to permute the tensor along.
+ * @returns {Tensor} The permuted tensor.
*/
-export function transpose(tensor, axes) {
- const [transposedData, shape] = transpose_data(tensor.data, tensor.dims, axes);
- return new Tensor(tensor.type, transposedData, shape);
+export function permute(tensor, axes) {
+ const [permutedData, shape] = permute_data(tensor.data, tensor.dims, axes);
+ return new Tensor(tensor.type, permutedData, shape);
}
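A small compatibility note on the rename above: `transpose()` is kept as an alias of `permute()`, so existing callers are unaffected. A minimal sketch:

```js
import { Tensor } from '@xenova/transformers';

const t = new Tensor('float32', [1, 2, 3, 4, 5, 6], [2, 3]);
const a = t.permute(1, 0);   // new name
const b = t.transpose(1, 0); // retained alias; identical dims ([3, 2]) and data
```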
diff --git a/tests/maths.test.js b/tests/maths.test.js
index 9a7d3dc3c..788ae5b02 100644
--- a/tests/maths.test.js
+++ b/tests/maths.test.js
@@ -2,7 +2,7 @@
import { compare } from './test_utils.js';
import { getFile } from '../src/utils/hub.js';
-import { FFT, medianFilter } from '../src/utils/maths.js';
+import { FFT, medianFilter, bankers_round } from '../src/utils/maths.js';
const fft = (arr, complex = false) => {
@@ -27,6 +27,19 @@ const fftTestsData = await (await getFile('./tests/data/fft_tests.json')).json()
describe('Mathematical operations', () => {
+ describe('bankers rounding', () => {
+ it('should round up to nearest even', () => {
+ expect(bankers_round(-0.5)).toBeCloseTo(0);
+ expect(bankers_round(1.5)).toBeCloseTo(2);
+ expect(bankers_round(19.5)).toBeCloseTo(20);
+ });
+ it('should round down to nearest even', () => {
+ expect(bankers_round(-1.5)).toBeCloseTo(-2);
+ expect(bankers_round(2.5)).toBeCloseTo(2);
+ expect(bankers_round(18.5)).toBeCloseTo(18);
+ });
+ });
+
describe('median filtering', () => {
diff --git a/tests/processors.test.js b/tests/processors.test.js
index 38f47bb17..c9ab33982 100644
--- a/tests/processors.test.js
+++ b/tests/processors.test.js
@@ -45,11 +45,14 @@ describe('Processors', () => {
clip: 'openai/clip-vit-base-patch16',
vitmatte: 'hustvl/vitmatte-small-distinctions-646',
dinov2: 'facebook/dinov2-small-imagenet1k-1-layer',
+ efficientnet: 'google/efficientnet-b0',
}
const TEST_IMAGES = {
pattern_3x3: 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/pattern_3x3.png',
+ pattern_3x5: 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/pattern_3x5.png',
checkerboard_8x8: 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/checkerboard_8x8.png',
+ checkerboard_64x32: 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/checkerboard_64x32.png',
receipt: 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/receipt.png',
tiger: 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/tiger.jpg',
paper: 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/nougat_paper.png',
@@ -368,6 +371,7 @@ describe('Processors', () => {
// - tests custom overrides
// - tests multiple inputs
// - tests `size_divisibility` and no size (size_divisibility=32)
+ // - tests do_pad and `size_divisibility`
it(MODELS.vitmatte, async () => {
const processor = await AutoProcessor.from_pretrained(m(MODELS.vitmatte))
@@ -390,6 +394,25 @@ describe('Processors', () => {
compare(original_sizes, [[640, 960]]);
compare(reshaped_input_sizes, [[640, 960]]);
}
+
+
+ {
+ const image = await load_image(TEST_IMAGES.pattern_3x5);
+ const image2 = await load_image(TEST_IMAGES.pattern_3x5);
+ const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image, image2);
+
+ compare(pixel_values.dims, [1, 4, 32, 32]);
+ expect(avg(pixel_values.data)).toBeCloseTo(-0.00867417361587286);
+ expect(pixel_values.data[0]).toBeCloseTo(-0.9921568632125854);
+ expect(pixel_values.data[1]).toBeCloseTo(-0.9686274528503418);
+ expect(pixel_values.data[5]).toBeCloseTo(0.0);
+ expect(pixel_values.data[32]).toBeCloseTo(-0.9215686321258545);
+ expect(pixel_values.data[33]).toBeCloseTo(-0.8980392217636108);
+ expect(pixel_values.data.at(-1)).toBeCloseTo(0.0);
+
+ compare(original_sizes, [[5, 3]]);
+ compare(reshaped_input_sizes, [[5, 3]]);
+ }
}, MAX_TEST_EXECUTION_TIME);
// BitImageProcessor
@@ -411,6 +434,7 @@ describe('Processors', () => {
// DPTImageProcessor
// - tests ensure_multiple_of
// - tests keep_aspect_ratio
+ // - tests bankers rounding
it(MODELS.dpt_2, async () => {
const processor = await AutoProcessor.from_pretrained(m(MODELS.dpt_2))
@@ -424,8 +448,36 @@ describe('Processors', () => {
compare(original_sizes, [[480, 640]]);
compare(reshaped_input_sizes, [[518, 686]]);
}
+
+ {
+ const image = await load_image(TEST_IMAGES.checkerboard_64x32);
+ const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
+
+ // NOTE: without bankers rounding, this would be [1, 3, 266, 518]
+ compare(pixel_values.dims, [1, 3, 252, 518]);
+ compare(avg(pixel_values.data), 0.2267402559518814);
+
+ compare(original_sizes, [[32, 64]]);
+ compare(reshaped_input_sizes, [[252, 518]]);
+ }
}, MAX_TEST_EXECUTION_TIME);
+ // EfficientNetImageProcessor
+ // - tests include_top
+ it(MODELS.efficientnet, async () => {
+ const processor = await AutoProcessor.from_pretrained(MODELS.efficientnet)
+
+ {
+ const image = await load_image(TEST_IMAGES.cats);
+ const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
+
+ compare(pixel_values.dims, [1, 3, 224, 224]);
+ compare(avg(pixel_values.data), 0.3015307230282871);
+
+ compare(original_sizes, [[480, 640]]);
+ compare(reshaped_input_sizes, [[224, 224]]);
+ }
+ }, MAX_TEST_EXECUTION_TIME);
});
describe('Audio processors', () => {
diff --git a/tests/tensor.test.js b/tests/tensor.test.js
index de9ffac30..bc056b9c8 100644
--- a/tests/tensor.test.js
+++ b/tests/tensor.test.js
@@ -103,6 +103,65 @@ describe('Tensor operations', () => {
});
});
+ describe('permute', () => {
+ it('should permute', async () => {
+ const x = new Tensor(
+ 'float32',
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
+ [2, 3, 4],
+ );
+ // Permute axes to (0, 1, 2) - No change
+ const permuted_1 = x.permute(0, 1, 2);
+ const target_1 = x;
+ compare(permuted_1, target_1, 1e-3);
+
+ // Permute axes to (0, 2, 1)
+ const permuted_2 = x.permute(0, 2, 1);
+ const target_2 = new Tensor(
+ 'float32',
+ [0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11, 12, 16, 20, 13, 17, 21, 14, 18, 22, 15, 19, 23],
+ [2, 4, 3],
+ );
+ compare(permuted_2, target_2, 1e-3);
+
+ // Permute axes to (1, 0, 2)
+ const permuted_3 = x.permute(1, 0, 2);
+ const target_3 = new Tensor(
+ 'float32',
+ [0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 16, 17, 18, 19, 8, 9, 10, 11, 20, 21, 22, 23],
+ [3, 2, 4],
+ );
+ compare(permuted_3, target_3, 1e-3);
+
+ // Permute axes to (1, 2, 0)
+ const permuted_4 = x.permute(1, 2, 0);
+ const target_4 = new Tensor(
+ 'float32',
+ [0, 12, 1, 13, 2, 14, 3, 15, 4, 16, 5, 17, 6, 18, 7, 19, 8, 20, 9, 21, 10, 22, 11, 23],
+ [3, 4, 2],
+ );
+ compare(permuted_4, target_4, 1e-3);
+
+ // Permute axes to (2, 0, 1)
+ const permuted_5 = x.permute(2, 0, 1);
+ const target_5 = new Tensor(
+ 'float32',
+ [0, 4, 8, 12, 16, 20, 1, 5, 9, 13, 17, 21, 2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23],
+ [4, 2, 3],
+ );
+ compare(permuted_5, target_5, 1e-3);
+
+ // Permute axes to (2, 1, 0)
+ const permuted_6 = x.permute(2, 1, 0);
+ const target_6 = new Tensor(
+ 'float32',
+ [0, 12, 4, 16, 8, 20, 1, 13, 5, 17, 9, 21, 2, 14, 6, 18, 10, 22, 3, 15, 7, 19, 11, 23],
+ [4, 3, 2],
+ );
+ compare(permuted_6, target_6, 1e-3);
+ });
+ });
+
describe('mean', () => {
it('should calculate mean', async () => {
const t1 = new Tensor('float32', [1, 2, 3, 4, 5, 6], [2, 3, 1]);
diff --git a/tests/tokenizers.test.js b/tests/tokenizers.test.js
index 40fed05d1..8b92c6702 100644
--- a/tests/tokenizers.test.js
+++ b/tests/tokenizers.test.js
@@ -350,6 +350,42 @@ describe('Chat templates', () => {
compare(input_ids, [1, 733, 16289, 28793, 22557, 28725, 910, 460, 368, 28804, 733, 28748, 16289, 28793, 28737, 28742, 28719, 2548, 1598, 28723, 1602, 541, 315, 1316, 368, 3154, 28804, 2, 28705, 733, 16289, 28793, 315, 28742, 28715, 737, 298, 1347, 805, 910, 10706, 5752, 1077, 3791, 28808, 733, 28748, 16289, 28793])
});
+ it('should support multiple chat templates', async () => {
+
+ const tokenizer = await AutoTokenizer.from_pretrained("Xenova/c4ai-command-r-v01-tokenizer")
+
+ // define conversation input:
+ const conversation = [
+ { role: "user", content: "Whats the biggest penguin in the world?" }
+ ]
+ // define documents to ground on:
+ const documents = [
+ { title: "Tall penguins", text: "Emperor penguins are the tallest growing up to 122 cm in height." },
+ { title: "Penguin habitats", text: "Emperor penguins only live in Antarctica." }
+ ]
+
+ // render the RAG prompt as a string:
+ const grounded_generation_prompt = tokenizer.apply_chat_template(
+ conversation,
+ {
+ chat_template: "rag",
+ tokenize: false,
+ add_generation_prompt: true,
+
+ documents,
+ citation_mode: "accurate", // or "fast"
+ }
+ )
+ expect(grounded_generation_prompt).toEqual(
+ "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble\nThe instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.\n\n" +
+ "# System Preamble\n## Basic Rules\nYou are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.\n\n" +
+ "# User Preamble\n## Task and Context\nYou help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.\n\n## Style Guide\nUnless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.<|END_OF_TURN_TOKEN|>" +
+ "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>Whats the biggest penguin in the world?<|END_OF_TURN_TOKEN|>" +
+ "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>\nDocument: 0\ntitle: Tall penguins\ntext: Emperor penguins are the tallest growing up to 122 cm in height.\n\nDocument: 1\ntitle: Penguin habitats\ntext: Emperor penguins only live in Antarctica.\n<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Carefully perform the following instructions, in order, starting each with a new line.\nFirstly, Decide which of the retrieved documents are relevant to the user's last input by writing 'Relevant Documents:' followed by comma-separated list of document numbers. If none are relevant, you should instead write 'None'.\nSecondly, Decide which of the retrieved documents contain facts that should be cited in a good answer to the user's last input by writing 'Cited Documents:' followed a comma-separated list of document numbers. If you dont want to cite any of them, you should instead write 'None'.\nThirdly, Write 'Answer:' followed by a response to the user's last input in high quality natural english. Use the retrieved documents to help you. Do not insert any citations or grounding markup.\nFinally, Write 'Grounded answer:' followed by a response to the user's last input in high quality natural english. Use the symbols and to indicate when a fact comes from a document in the search result, e.g my fact for a fact from document 0.<|END_OF_TURN_TOKEN|>" +
+ "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
+ );
+ });
+
it('should support user-defined chat template', async () => {
const tokenizer = await AutoTokenizer.from_pretrained("Xenova/llama-tokenizer");
@@ -395,7 +431,7 @@ describe('Chat templates', () => {
.replaceAll('USE_DEFAULT_PROMPT', true)
.replaceAll('DEFAULT_SYSTEM_MESSAGE', 'You are a helpful, respectful and honest assistant.');
- const text = await tokenizer.apply_chat_template(chat, { tokenize: false, return_tensor: false, chat_template });
+ const text = tokenizer.apply_chat_template(chat, { tokenize: false, return_tensor: false, chat_template });
expect(text).toEqual("[INST] <>\nYou are a helpful, respectful and honest assistant.\n<>\n\nHello, how are you? [/INST] I'm doing great. How can I help you today? [INST] I'd like to show off how chat templating works! [/INST]");
@@ -412,7 +448,7 @@ describe('Chat templates', () => {
for (let { messages, add_generation_prompt, tokenize, target } of tests) {
- const generated = await tokenizer.apply_chat_template(messages, {
+ const generated = tokenizer.apply_chat_template(messages, {
tokenize,
add_generation_prompt,
return_tensor: false,