From 35d61f5cc94c4ede407d27105579a71a675378af Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Wed, 15 Nov 2023 16:28:29 +0200
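Subject: [PATCH 1/2] Add `CLIPFeatureExtractor` (and tests) (#387)

Add `CLIPFeatureExtractor` as a subclass of `ImageFeatureExtractor` with no
overrides, and list it in `AutoProcessor` alongside the other feature
extractors.

Add a processor test for `openai/clip-vit-base-patch16` that exercises
center cropping and expects 224x224 `pixel_values`.

Also remove the "Tests grayscale image" comments from the deit, beit, detr
and yolos tests; each of those blocks loads the `tiger` test image.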
---
 src/processors.js        |  2 ++
 tests/processors.test.js | 26 ++++++++++++++++++++++----
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/src/processors.js b/src/processors.js
index 7691cb413..dd7130156 100644
--- a/src/processors.js
+++ b/src/processors.js
@@ -509,6 +509,7 @@ export class ImageFeatureExtractor extends FeatureExtractor {
 
 }
 
+export class CLIPFeatureExtractor extends ImageFeatureExtractor { }
 export class ConvNextFeatureExtractor extends ImageFeatureExtractor { }
 export class ViTFeatureExtractor extends ImageFeatureExtractor { }
 export class MobileViTFeatureExtractor extends ImageFeatureExtractor { }
@@ -1538,6 +1539,7 @@ export class AutoProcessor {
         WhisperFeatureExtractor,
         ViTFeatureExtractor,
         MobileViTFeatureExtractor,
+        CLIPFeatureExtractor,
         ConvNextFeatureExtractor,
         BeitFeatureExtractor,
         DeiTFeatureExtractor,
diff --git a/tests/processors.test.js b/tests/processors.test.js
index fe594613e..9e5d09f18 100644
--- a/tests/processors.test.js
+++ b/tests/processors.test.js
@@ -38,6 +38,7 @@ describe('Processors', () => {
             beit: 'microsoft/beit-base-patch16-224-pt22k-ft22k',
             detr: 'facebook/detr-resnet-50',
             yolos: 'hustvl/yolos-small-300',
+            clip: 'openai/clip-vit-base-patch16',
         }
 
         const TEST_IMAGES = {
@@ -171,7 +172,7 @@ describe('Processors', () => {
         it(MODELS.deit, async () => {
             const processor = await AutoProcessor.from_pretrained(m(MODELS.deit))
 
-            { // Tests grayscale image
+            {
                 const image = await load_image(TEST_IMAGES.tiger);
                 const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
 
@@ -187,7 +188,7 @@ describe('Processors', () => {
         it(MODELS.beit, async () => {
             const processor = await AutoProcessor.from_pretrained(m(MODELS.beit))
 
-            { // Tests grayscale image
+            {
                 const image = await load_image(TEST_IMAGES.tiger);
                 const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
 
@@ -204,7 +205,7 @@ describe('Processors', () => {
         it(MODELS.detr, async () => {
             const processor = await AutoProcessor.from_pretrained(m(MODELS.detr))
 
-            { // Tests grayscale image
+            {
                 const image = await load_image(TEST_IMAGES.tiger);
                 const { pixel_values, original_sizes, reshaped_input_sizes, pixel_mask } = await processor(image);
 
@@ -225,7 +226,7 @@ describe('Processors', () => {
         it(MODELS.yolos, async () => {
             const processor = await AutoProcessor.from_pretrained(m(MODELS.yolos))
 
-            { // Tests grayscale image
+            {
                 const image = await load_image(TEST_IMAGES.tiger);
                 const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
 
@@ -236,5 +237,22 @@ describe('Processors', () => {
                 compare(reshaped_input_sizes, [[888, 1333]]);
             }
         }, MAX_TEST_EXECUTION_TIME);
+
+        // CLIPFeatureExtractor
+        //  - tests center crop (do_center_crop=true, crop_size=224)
+        it(MODELS.clip, async () => {
+            const processor = await AutoProcessor.from_pretrained(m(MODELS.clip))
+
+            {
+                const image = await load_image(TEST_IMAGES.tiger);
+                const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
+
+                compare(pixel_values.dims, [1, 3, 224, 224]);
+                compare(avg(pixel_values.data), -0.06678297738282096);
+
+                compare(original_sizes, [[408, 612]]);
+                compare(reshaped_input_sizes, [[224, 224]]);
+            }
+        }, MAX_TEST_EXECUTION_TIME);
     });
 });

From 4e4148cb5ce7f4a9265f58b4eeb660c64bed0386 Mon Sep 17 00:00:00 2001
From: Victor Nogueira <felladrin@gmail.com>
Date: Wed, 15 Nov 2023 17:51:33 +0200
Subject: [PATCH 2/2] Add support for Grouped Query Attention on Llama Model
 (#393)

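With Grouped Query Attention, `config.num_key_value_heads` can differ from
`config.num_attention_heads`. Set `num_heads` from `num_key_value_heads`,
falling back to `num_attention_heads` when the config does not define it,
and compute `dim_kv` as `hidden_size / num_attention_heads` so that it no
longer depends on `num_heads`.

Resolves #388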
---
 src/models.js | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/models.js b/src/models.js
index b0a82cee0..9fabe8bdd 100644
--- a/src/models.js
+++ b/src/models.js
@@ -3035,9 +3035,9 @@ export class LlamaPreTrainedModel extends PreTrainedModel {
         // config doesn't contain pad_token_id, so we assume it is the eos_token_id
         this.config.pad_token_id = this.config.eos_token_id
 
-        this.num_heads = this.config.num_attention_heads
+        this.num_heads = this.config.num_key_value_heads ?? this.config.num_attention_heads
         this.num_layers = this.config.num_hidden_layers
-        this.dim_kv = this.config.hidden_size / this.num_heads;
+        this.dim_kv = this.config.hidden_size / this.config.num_attention_heads
     }
 }
 /**