diff --git a/doc/vector/code-samples/Makefile b/doc/vector/code-samples/Makefile
index 570af26a..62562eb5 100644
--- a/doc/vector/code-samples/Makefile
+++ b/doc/vector/code-samples/Makefile
@@ -32,6 +32,18 @@ LDFLAGS+=-static
 # with different values of VLEN.
 COMMON_SPIKE_FLAGS?=--isa=rv64gcv$(MARCH_EXT_FLAGS)
 
+# List of VLEN values being tested.
+# All code samples support values >=128, most also support VLEN=64.
+#
+# Each value requires a separate Spike invocation. Overriding
+# the list is an easy way to reduce run time if only specific
+# values are of interest.
+#
+# TODO: ideally we'd have logic supporting VLEN=32, which also
+# implies ELEN=32 (i.e., rv32). The code was developed for rv64,
+# hence VLEN=32 runs are not valid.
+TESTED_VLENS?=64 128 256 512
+
 TEST_VECTORS_DIR=test-vectors
 
 CBC_VECTORS=\
@@ -80,6 +92,7 @@ C_OBJECTS=\
   zkb-test.o \
   zvbb-test.o \
   zvbc-test.o \
+  zvkg-test.o \
 
 ASM_OBJECTS=\
   vlen-bits.o \
@@ -92,7 +105,7 @@ ASM_OBJECTS=\
   zvksed.o \
   zvksh.o \
 
-default: aes-cbc-test aes-gcm-test sha-test sm3-test sm4-test zvbb-test zvbc-test
+default: aes-cbc-test aes-gcm-test sha-test sm3-test sm4-test zvbb-test zvbc-test zvkg-test
 
 .PHONY: test-vectors
 test-vectors: $(SUBDIR_CBC_VECTORS) $(SUBDIR_GCM_VECTORS) $(SUBDIR_SHA_VECTORS)
@@ -133,52 +146,69 @@ zvbb-test: zvbb-test.o zvbb.o log.o vlen-bits.o
 zvbc-test: zvbc-test.o zvbc.o log.o vlen-bits.o
 	$(LD) $(LDFLAGS) -o $@ $^
 
-# TODO: add VLEN=32, VLEN=64 runs.
+zvkg-test: zvkg-test.o zvkg.o log.o vlen-bits.o
+	$(LD) $(LDFLAGS) -o $@ $^
+
 .PHONY: run-aes-cbc
 run-aes-cbc: aes-cbc-test
-	for VLEN in 128 256 512; do \
+	for VLEN in $(TESTED_VLENS); do \
	  $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
	done
 
-# TODO: add VLEN=64 runs.
+# TODO: add logic supporting VLEN=64 runs.
 .PHONY: run-aes-gcm
 run-aes-gcm: aes-gcm-test
-	for VLEN in 128 256 512; do \
-	  $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
+	for VLEN in $(TESTED_VLENS); do \
+	  if [ "$${VLEN}" = 64 ]; then \
+	    echo "*** Skipping $< test with VLEN=$${VLEN}"; \
+	  else \
+	    $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
+	  fi \
	done
 
 .PHONY: run-sha
 run-sha: sha-test
-	for VLEN in 64 128 256 512; do \
+	for VLEN in $(TESTED_VLENS); do \
	  $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
	done
 
 .PHONY: run-sm3
 run-sm3: sm3-test
-	for VLEN in 64 128 256 512; do \
+	for VLEN in $(TESTED_VLENS); do \
	  $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
	done
 
+# TODO: add logic supporting VLEN=64 runs.
 .PHONY: run-sm4
 run-sm4: sm4-test
-	for VLEN in 128 256 512; do \
-	  $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
+	for VLEN in $(TESTED_VLENS); do \
+	  if [ "$${VLEN}" = 64 ]; then \
+	    echo "*** Skipping $< test with VLEN=$${VLEN}"; \
+	  else \
+	    $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
+	  fi \
	done
 
 .PHONY: run-zvbb
 run-zvbb: zvbb-test
-	for VLEN in 64 128 256 512; do \
+	for VLEN in $(TESTED_VLENS); do \
	  $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
	done
 
 .PHONY: run-zvbc
 run-zvbc: zvbc-test
-	for VLEN in 64 128 256 512; do \
+	for VLEN in $(TESTED_VLENS); do \
+	  $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
+	done
+
+.PHONY: run-zvkg
+run-zvkg: zvkg-test
+	for VLEN in $(TESTED_VLENS); do \
	  $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
	done
 
 .PHONY: run-tests
-run-tests: run-aes-cbc run-aes-gcm run-sha run-sm3 run-sm4 run-zvbb run-zvbc
+run-tests: run-aes-cbc run-aes-gcm run-sha run-sm3 run-sm4 run-zvbb run-zvbc run-zvkg
 
 .PHONY: clean
 clean:
diff --git a/doc/vector/code-samples/README.md b/doc/vector/code-samples/README.md
index 58fd219e..fe6c89f3 100644
--- a/doc/vector/code-samples/README.md
+++ b/doc/vector/code-samples/README.md
@@ -73,6 +73,8 @@ make run-tests TARGET=riscv64-unknown-linux-gnu \
 - `sm3-test` - Build the SM3 example.
 - `sm4-test` - Build the SM4 example.
 - `zvbb-test` - Build the Zvbb example.
+- `zvbc-test` - Build the Zvbc example.
+- `zvkg-test` - Build the Zvkg example.
 - `run-tests` - Build and run all examples.
 - `run-aes-cbc` - Build and run the AES-CBC example in Spike.
 - `run-aes-gcm` - Build and run the AES-GCM example in Spike.
@@ -80,12 +82,18 @@ make run-tests TARGET=riscv64-unknown-linux-gnu \
 - `run-sm3` - Build and run the SM3 example in Spike.
 - `run-sm4` - Build and run the SM4 example in Spike.
 - `run-zvbb` - Build and run the Zvbb example in Spike.
+- `run-zvbc` - Build and run the Zvbc example in Spike.
+- `run-zvkg` - Build and run the Zvkg example in Spike.
 
 ### Make variables
 
 - `TARGET` - Target triplet to use. By default riscv64-linux-gnu.
 - `PK` - Location of the riscv-pk binary. By default it's
   `~/RISC-V/$(TARGET)/bin/pk`.
+- `TESTED_VLENS` - Space-separated list of VLEN values to test against.
+  All algorithms support VLEN>=128, most also support VLEN=64.
+  Tests that do not support VLEN=64 skip that value when it appears
+  in the list. See the Makefile for more details.
diff --git a/doc/vector/code-samples/aes-gcm-test.c b/doc/vector/code-samples/aes-gcm-test.c
index 00115faa..d1db6c88 100644
--- a/doc/vector/code-samples/aes-gcm-test.c
+++ b/doc/vector/code-samples/aes-gcm-test.c
@@ -426,15 +426,18 @@ run_test_zvb(const struct aes_gcm_test* test, int keylen)
     }
 
     for (int i = 0; i < test->ctlen / 16; i++) {
-        if (!test->encrypt)
+        if (!test->encrypt) {
             ghash(&Y, (uint128 *)(&xordata[16 * i]), &H);
+        }
 
         encrypt_block(&buf[16 * i], &counter_block, &key);
 
-        for (int j = 0; j < 16; j++)
+        for (int j = 0; j < 16; j++) {
             buf[16 * i + j] ^= xordata[16 * i + j];
+        }
 
-        if (test->encrypt)
+        if (test->encrypt) {
             ghash(&Y, (uint128 *)(&buf[16 * i]), &H);
+        }
 
         increment_counter_block(&counter_block);
     }
 
@@ -449,8 +452,9 @@ run_test_zvb(const struct aes_gcm_test* test, int keylen)
 
     // buf shall have enough space to fit the extra bytes.
     encrypt_block(&buf[test->ctlen - rem], &counter_block, &key);
 
-    for (int i = 0; i < rem; i++)
+    for (int i = 0; i < rem; i++) {
         buf[test->ctlen - rem + i] ^= xordata[test->ctlen - rem + i];
+    }
 
     if (test->encrypt) {
         bzero(&temp, sizeof(temp));
diff --git a/doc/vector/code-samples/zvkg-test.c b/doc/vector/code-samples/zvkg-test.c
new file mode 100644
index 00000000..6f9a4c29
--- /dev/null
+++ b/doc/vector/code-samples/zvkg-test.c
@@ -0,0 +1,108 @@
+// Copyright 2023 Rivos Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "log.h"
+#include "vlen-bits.h"
+#include "zvkg.h"
+
+// @brief Return a 32-bit pseudo-random number using rand()
+//
+// @return uint32_t
+//
+uint32_t
+rand32()
+{
+    return rand();
+}
+
+// @brief Tests the vector Galois field multiply instruction 'vgmul'
+// using randomly generated test vectors.
+//
+// 'vghsh.vv' is used to generate the "golden" outputs that we check vgmul
+// against. The correctness of vghsh is established in the test 'aes-gcm-test'.
+//
+int
+test_rand_vgmul()
+{
+#define kNumGroups 113
+#define kNumElements (4 * (kNumGroups))
+#define kRounds 100
+
+    uint32_t y[kNumElements];
+    uint32_t z[kNumElements];
+    uint32_t expected[kNumElements];
+    uint32_t actual[kNumElements];
+
+    LOG("--- Testing vgmul against vghsh");
+
+    for (size_t round = 0; round < kRounds; ++round) {
+        for (size_t i = 0; i < kNumElements; ++i) {
+            actual[i] = expected[i] = rand32();
+            y[i] = rand32();
+            z[i] = 0;
+        }
+
+        // The reference (expected) output is produced by vghsh.
+        zvkg_vghsh_vv(expected, z, y, kNumGroups);
+
+        // The tested (actual) output is produced by vgmul.
+        zvkg_vgmul_vv(actual, y, kNumGroups);
+
+        if (memcmp(actual, expected, sizeof(actual))) {
+            LOG("FAILURE: 'actual' does NOT match 'expected'");
+            for (size_t i = 0; i < kNumElements; ++i) {
+                const uint32_t exp = expected[i];
+                const uint32_t act = actual[i];
+                LOG("expected[%3zu]: 0x%08" PRIx32
+                    ", actual[%3zu]: 0x%08" PRIx32
+                    " %s", i, exp, i, act, (exp == act ? "==" : "!="));
+            }
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+// @brief Calls test functions for our intrinsics
+//
+// @return int
+//
+int
+main()
+{
+    const uint64_t vlen = vlen_bits();
+    LOG("VLEN = %" PRIu64, vlen);
+
+    int res = 0;
+
+    // The correctness of 'vghsh.vv' is established in the test 'aes-gcm-test',
+    // so this test is only there to validate 'vgmul.vv'.
+    res = test_rand_vgmul();
+    if (res != 0) {
+        return res;
+    }
+
+    return 0;
+}
diff --git a/doc/vector/code-samples/zvkg.h b/doc/vector/code-samples/zvkg.h
index 616afdac..58574248 100644
--- a/doc/vector/code-samples/zvkg.h
+++ b/doc/vector/code-samples/zvkg.h
@@ -21,7 +21,7 @@
 // does not support unaligned access.
 //
 // Y <- (Y xor X) o H
-// Where 'o' is the Galois Field Multiplication.
+// where 'o' is the Galois Field Multiplication in GF(2^128).
 extern void
 zvkg_vghsh(
     void* Y,
@@ -29,4 +29,31 @@ zvkg_vghsh(
     const void* H
 );
 
+// Y, X, and H point to arrays of 128-bit values, 32b aligned
+// if the processor does not support unaligned access.
+// 'n' is the number of 128b element groups.
+//
+// Y[i]_out = (Y[i]_in ^ X[i]) o H[i]
+// where 'o' is the Galois Field Multiplication in GF(2^128).
+extern void
+zvkg_vghsh_vv(
+    void* Y,
+    const void* X,
+    const void* H,
+    size_t n
+);
+
+// Y and H point to arrays of 128-bit values, 32b aligned
+// if the processor does not support unaligned access.
+// 'n' is the number of 128b element groups.
+//
+// Y[i]_out = Y[i]_in o H[i]
+// where 'o' is the Galois Field Multiplication in GF(2^128).
+extern void
+zvkg_vgmul_vv(
+    void* Y,
+    const void* H,
+    size_t n
+);
+
 #endif // ZVKG_H_
diff --git a/doc/vector/code-samples/zvkg.s b/doc/vector/code-samples/zvkg.s
index e91eb4b3..0bb81e0d 100644
--- a/doc/vector/code-samples/zvkg.s
+++ b/doc/vector/code-samples/zvkg.s
@@ -51,3 +51,92 @@ zvkg_vghsh:
     vghsh.vv v0, v8, v4
     vse32.v v0, (a0)
     ret
+
+
+# zvkg_vghsh_vv
+#
+# Performs a vector add-multiply over the GHASH Galois field for multiple
+# 128-bit element groups. 'n' is the number of 128b groups.
+# The input arrays should be 32b aligned on processors that do not
+# support unaligned 32b vector loads/stores.
+#
+# Y[i]_out = ((Y[i]_in ^ X[i]) o H[i])
+#
+# void zvkg_vghsh_vv(
+#     uint32_t* Y,   // a0
+#     uint32_t* X,   // a1
+#     uint32_t* H,   // a2
+#     size_t n       // a3
+# );
+#
+.balign 4
+.global zvkg_vghsh_vv
+zvkg_vghsh_vv:
+    beqz a3, 2f   # Early exit in the "0 groups to process" case.
+    # a3 on input is the number of 128b groups in the input arrays. We multiply
+    # it by 4 as the Zvkg instructions expect VSEW=32. a3 becomes the number
+    # of 32b elements to process, which is a multiple of 4.
+    slli a3, a3, 2
+1:
+    # We use LMUL=4 to enable runs with VLEN=32, as a proof of concept.
+    # Once VLEN>=128, we can simply use LMUL=1.
+    vsetvli t0, a3, e32, m4, ta, ma
+
+    vle32.v v0, (a0)
+    vle32.v v4, (a1)
+    vle32.v v8, (a2)
+    vghsh.vv v0, v8, v4   # Y(v0) = (Y(v0) ^ X(v4)) o H(v8)
+    vse32.v v0, (a0)
+
+    sub a3, a3, t0    # Decrement number of remaining 32b elements.
+    slli t0, t0, 2    # t0 (#bytes consumed) <- t0 (#4B elements) * 4
+    add a0, a0, t0
+    add a1, a1, t0
+    add a2, a2, t0
+    bnez a3, 1b       # More elements to process?
+
+2:
+    ret
+
+# zvkg_vgmul_vv
+#
+# Performs a vector multiply over the GHASH Galois field for multiple
+# 128-bit element groups.
+# 'n' is the number of 128b element groups; n multiplications are performed.
+# The input arrays should be 32b aligned on processors that do not
+# support unaligned 32b vector loads/stores.
+#
+# Y[i]_out = (Y[i]_in o H[i])
+#
+# void zvkg_vgmul_vv(
+#     uint32_t* Y,   // a0
+#     uint32_t* H,   // a1
+#     size_t n       // a2
+# );
+#
+.balign 4
+.global zvkg_vgmul_vv
+zvkg_vgmul_vv:
+    beqz a2, 2f   # Early exit in the "0 groups to process" case.
+    # a2 on input is the number of 128b groups in the input arrays. We multiply
+    # it by 4 as the Zvkg instructions expect VSEW=32. a2 becomes the number
+    # of 32b elements to process, which is a multiple of 4.
+    slli a2, a2, 2
+1:
+    # We use LMUL=4 to enable runs with VLEN=32, as a proof of concept.
+    # Once VLEN>=128, we can simply use LMUL=1.
+    vsetvli t0, a2, e32, m4, ta, ma
+
+    vle32.v v0, (a0)
+    vle32.v v4, (a1)
+    vgmul.vv v0, v4
+    vse32.v v0, (a0)
+
+    sub a2, a2, t0    # Decrement number of remaining 32b elements.
+    slli t0, t0, 2    # t0 (#bytes consumed) <- t0 (#4B elements) * 4
+    add a0, a0, t0
+    add a1, a1, t0
+    bnez a2, 1b       # More elements to process?
+
+2:
+    ret
+
diff --git a/doc/vector/code-samples/zvknh.s b/doc/vector/code-samples/zvknh.s
index 7c0f552d..c5b994b8 100644
--- a/doc/vector/code-samples/zvknh.s
+++ b/doc/vector/code-samples/zvknh.s
@@ -33,7 +33,7 @@
 .data
 
 .balign 16  # Only 4 is needed, 16 is just nice.
-// Note that those values are stored in native endianness.
+# Note that those values are stored in native endianness.
 SHA256_ROUND_CONSTANTS:
     .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5  # 0-3
     .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5  # 4-7
@@ -53,7 +53,7 @@ SHA256_ROUND_CONSTANTS:
     .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2  # 60-63
 
 .balign 4  # Only 8 is needed, 32 is just nice.
-// Note that those values are stored in native endianness.
+# Note that those values are stored in native endianness.
 SHA512_ROUND_CONSTANTS:
     .dword 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc  # 0-3
     .dword 0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118  # 4-7
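
A brief usage sketch (not part of the patch itself): the new `TESTED_VLENS` variable can be overridden on the make command line to restrict which VLEN values Spike is invoked with; the `TARGET` value below is the one already used in the README examples.

    # Run only the new Zvkg sample, restricted to VLEN=128 and VLEN=256.
    make run-zvkg TARGET=riscv64-unknown-linux-gnu TESTED_VLENS="128 256"

    # Run the full suite with the default list (64 128 256 512); samples
    # that do not support VLEN=64 skip that value.
    make run-tests TARGET=riscv64-unknown-linux-gnu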