diff --git a/doc/vector/code-samples/Makefile b/doc/vector/code-samples/Makefile
index 570af26a..62562eb5 100644
--- a/doc/vector/code-samples/Makefile
+++ b/doc/vector/code-samples/Makefile
@@ -32,6 +32,18 @@ LDFLAGS+=-static
 # with different values of VLEN.
 COMMON_SPIKE_FLAGS?=--isa=rv64gcv$(MARCH_EXT_FLAGS)
 
+# List of VLEN values being tested.
+# All code samples support values >=128, most also support VLEN=64.
+#
+# Each value requires a separate Spike invocation. Overriding
+# the list is an easy way to reduce run time if only specific
+# values are of interest.
+#
+# TODO: ideally we'd have logic supporting VLEN=32, which also
+# implies ELEN=32 (i.e., rv32). The code was developed for rv64,
+# hence VLEN=32 runs are not valid.
+TESTED_VLENS?=64 128 256 512
+
 TEST_VECTORS_DIR=test-vectors
 
 CBC_VECTORS=\
@@ -80,6 +92,7 @@ C_OBJECTS=\
   zkb-test.o \
   zvbb-test.o \
   zvbc-test.o \
+  zvkg-test.o \
 
 ASM_OBJECTS=\
   vlen-bits.o \
@@ -92,7 +105,7 @@ ASM_OBJECTS=\
   zvksed.o \
   zvksh.o \
 
-default: aes-cbc-test aes-gcm-test sha-test sm3-test sm4-test zvbb-test zvbc-test
+default: aes-cbc-test aes-gcm-test sha-test sm3-test sm4-test zvbb-test zvbc-test zvkg-test
 
 .PHONY: test-vectors
 test-vectors: $(SUBDIR_CBC_VECTORS) $(SUBDIR_GCM_VECTORS) $(SUBDIR_SHA_VECTORS)
@@ -133,52 +146,69 @@ zvbb-test: zvbb-test.o zvbb.o log.o vlen-bits.o
 zvbc-test: zvbc-test.o zvbc.o log.o vlen-bits.o
 	$(LD) $(LDFLAGS) -o $@ $^
 
-# TODO: add VLEN=32, VLEN=64 runs.
+zvkg-test: zvkg-test.o zvkg.o log.o vlen-bits.o
+	$(LD) $(LDFLAGS) -o $@ $^
+
 .PHONY: run-aes-cbc
 run-aes-cbc: aes-cbc-test
-	for VLEN in 128 256 512; do \
+	for VLEN in $(TESTED_VLENS); do \
	  $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
	done
 
-# TODO: add VLEN=64 runs.
+# TODO: add logic supporting VLEN=64 runs.
 .PHONY: run-aes-gcm
 run-aes-gcm: aes-gcm-test
-	for VLEN in 128 256 512; do \
-	  $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
+	for VLEN in $(TESTED_VLENS); do \
+	  if [ "$${VLEN}" = 64 ]; then \
+	    echo "*** Skipping $< test with VLEN=$${VLEN}"; \
+	  else \
+	    $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
+	  fi \
	done
 
 .PHONY: run-sha
 run-sha: sha-test
-	for VLEN in 64 128 256 512; do \
+	for VLEN in $(TESTED_VLENS); do \
	  $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
	done
 
 .PHONY: run-sm3
 run-sm3: sm3-test
-	for VLEN in 64 128 256 512; do \
+	for VLEN in $(TESTED_VLENS); do \
	  $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
	done
 
+# TODO: add logic supporting VLEN=64 runs.
 .PHONY: run-sm4
 run-sm4: sm4-test
-	for VLEN in 128 256 512; do \
-	  $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
+	for VLEN in $(TESTED_VLENS); do \
+	  if [ "$${VLEN}" = 64 ]; then \
+	    echo "*** Skipping $< test with VLEN=$${VLEN}"; \
+	  else \
+	    $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
+	  fi \
	done
 
 .PHONY: run-zvbb
 run-zvbb: zvbb-test
-	for VLEN in 64 128 256 512; do \
+	for VLEN in $(TESTED_VLENS); do \
	  $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
	done
 
 .PHONY: run-zvbc
 run-zvbc: zvbc-test
-	for VLEN in 64 128 256 512; do \
+	for VLEN in $(TESTED_VLENS); do \
+	  $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
+	done
+
+.PHONY: run-zvkg
+run-zvkg: zvkg-test
+	for VLEN in $(TESTED_VLENS); do \
	  $(SPIKE) --varch=vlen:$${VLEN},elen:64 $(COMMON_SPIKE_FLAGS) $(PK) $< || exit 1; \
	done
 
 .PHONY: run-tests
-run-tests: run-aes-cbc run-aes-gcm run-sha run-sm3 run-sm4 run-zvbb run-zvbc
+run-tests: run-aes-cbc run-aes-gcm run-sha run-sm3 run-sm4 run-zvbb run-zvbc run-zvkg
 
 .PHONY: clean
 clean:
diff --git a/doc/vector/code-samples/README.md b/doc/vector/code-samples/README.md
index 58fd219e..fe6c89f3 100644
--- a/doc/vector/code-samples/README.md
+++ b/doc/vector/code-samples/README.md
@@ -73,6 +73,8 @@ make run-tests TARGET=riscv64-unknown-linux-gnu \
 - `sm3-test` - Build the SM3 example.
 - `sm4-test` - Build the SM4 example.
 - `zvbb-test` - Build the Zvbb example.
+- `zvbc-test` - Build the Zvbc example.
+- `zvkg-test` - Build the Zvkg example.
 - `run-tests` - Build and run all examples.
 - `run-aes-cbc` - Build and run the AES-CBC example in Spike.
 - `run-aes-gcm` - Build and run the AES-GCM example in Spike.
@@ -80,12 +82,18 @@ make run-tests TARGET=riscv64-unknown-linux-gnu \
 - `run-sm3` - Build and run the SM3 example in Spike.
 - `run-sm4` - Build and run the SM4 example in Spike.
 - `run-zvbb` - Build and run the Zvbb example in Spike.
+- `run-zvbc` - Build and run the Zvbc example in Spike.
+- `run-zvkg` - Build and run the Zvkg example in Spike.
 
 ### Make variables
 
 - `TARGET` - Target triplet to use. By default riscv64-linux-gnu.
 - `PK` - Location of the riscv-pk binary. By default it's
   `~/RISC-V/$(TARGET)/bin/pk`.
+- `TESTED_VLENS` - Space-separated list of VLEN values to test against.
+  All algorithms support VLEN>=128, most also support VLEN=64.
+  Tests that do not support VLEN=64 skip that value when it appears
+  in the list. See the Makefile for more details.
diff --git a/doc/vector/code-samples/aes-gcm-test.c b/doc/vector/code-samples/aes-gcm-test.c
index 00115faa..d1db6c88 100644
--- a/doc/vector/code-samples/aes-gcm-test.c
+++ b/doc/vector/code-samples/aes-gcm-test.c
@@ -426,15 +426,18 @@ run_test_zvb(const struct aes_gcm_test* test, int keylen)
     }
 
     for (int i = 0; i < test->ctlen / 16; i++) {
-        if (!test->encrypt)
+        if (!test->encrypt) {
             ghash(&Y, (uint128 *)(&xordata[16 * i]), &H);
+        }
 
         encrypt_block(&buf[16 * i], &counter_block, &key);
 
-        for (int j = 0; j < 16; j++)
+        for (int j = 0; j < 16; j++) {
             buf[16 * i + j] ^= xordata[16 * i + j];
+        }
 
-        if (test->encrypt)
+        if (test->encrypt) {
             ghash(&Y, (uint128 *)(&buf[16 * i]), &H);
+        }
 
         increment_counter_block(&counter_block);
     }
 
@@ -449,8 +452,9 @@ run_test_zvb(const struct aes_gcm_test* test, int keylen)
 
     // buf shall have enough space to fit the extra bytes.
     encrypt_block(&buf[test->ctlen - rem], &counter_block, &key);
 
-    for (int i = 0; i < rem; i++)
+    for (int i = 0; i < rem; i++) {
         buf[test->ctlen - rem + i] ^= xordata[test->ctlen - rem + i];
+    }
 
     if (test->encrypt) {
         bzero(&temp, sizeof(temp));
diff --git a/doc/vector/code-samples/zvkg-test.c b/doc/vector/code-samples/zvkg-test.c
new file mode 100644
index 00000000..6f9a4c29
--- /dev/null
+++ b/doc/vector/code-samples/zvkg-test.c
@@ -0,0 +1,108 @@
+// Copyright 2023 Rivos Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "log.h"
+#include "vlen-bits.h"
+#include "zvkg.h"
+
+// @brief Return a 32-bit pseudo-random number using rand()
+//
+// @return uint32_t
+//
+uint32_t
+rand32()
+{
+    return rand();
+}
+
+// @brief Tests the vector Galois field multiply instruction 'vgmul'
+// using randomly generated test vectors.
+//
+// 'vghsh.vv' is used to generate the "golden" outputs that we check vgmul
+// against. The correctness of vghsh is established in the test 'aes-gcm-test'.
+//
+int
+test_rand_vgmul()
+{
+#define kNumGroups 113
+#define kNumElements (4 * (kNumGroups))
+#define kRounds 100
+
+    uint32_t y[kNumElements];
+    uint32_t z[kNumElements];
+    uint32_t expected[kNumElements];
+    uint32_t actual[kNumElements];
+
+    LOG("--- Testing vgmul against vghsh");
+
+    for (size_t round = 0; round < kRounds; ++round) {
+        for (size_t i = 0; i < kNumElements; ++i) {
+            actual[i] = expected[i] = rand32();
+            y[i] = rand32();
+            z[i] = 0;
+        }
+
+        // The reference (expected) output is produced by vghsh.
+        zvkg_vghsh_vv(expected, z, y, kNumGroups);
+
+        // The tested (actual) output is produced by vgmul.
+        zvkg_vgmul_vv(actual, y, kNumGroups);
+
+        if (memcmp(actual, expected, sizeof(actual))) {
+            LOG("FAILURE: 'actual' does NOT match 'expected'");
+            for (size_t i = 0; i < kNumElements; ++i) {
+                const uint32_t exp = expected[i];
+                const uint32_t act = actual[i];
+                LOG("expected[%3zu]: 0x%08" PRIx32
+                    ", actual[%3zu]: 0x%08" PRIx32
+                    " %s", i, exp, i, act, (exp == act ? "==" : "!="));
+            }
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+// @brief Calls test functions for our intrinsics
+//
+// @return int
+//
+int
+main()
+{
+    const uint64_t vlen = vlen_bits();
+    LOG("VLEN = %" PRIu64, vlen);
+
+    int res = 0;
+
+    // The correctness of 'vghsh.vv' is established in the test 'aes-gcm-test',
+    // so this test is only there to validate 'vgmul.vv'.
+    res = test_rand_vgmul();
+    if (res != 0) {
+        return res;
+    }
+
+    return 0;
+}
diff --git a/doc/vector/code-samples/zvkg.h b/doc/vector/code-samples/zvkg.h
index 616afdac..58574248 100644
--- a/doc/vector/code-samples/zvkg.h
+++ b/doc/vector/code-samples/zvkg.h
@@ -21,7 +21,7 @@
 // does not support unaligned access.
 //
 // Y <- (Y xor X) o H
-// Where 'o' is the Galois Field Multiplication.
+// where 'o' is the Galois Field Multiplication in GF(2^128).
 extern void
 zvkg_vghsh(
     void* Y,
@@ -29,4 +29,31 @@ zvkg_vghsh(
     const void* H
 );
 
+// Y, X, and H point to arrays of 128-bit values, 32b aligned
+// if the processor does not support unaligned access.
+// 'n' is the number of 128b element groups.
+//
+// Y[i]_out = (Y[i]_in ^ X[i]) o H[i]
+// where 'o' is the Galois Field Multiplication in GF(2^128).
+extern void
+zvkg_vghsh_vv(
+    void* Y,
+    const void* X,
+    const void* H,
+    size_t n
+);
+
+// Y and H point to arrays of 128-bit values, 32b aligned
+// if the processor does not support unaligned access.
+// 'n' is the number of 128b element groups.
+//
+// Y[i]_out = Y[i]_in o H[i]
+// where 'o' is the Galois Field Multiplication in GF(2^128).
+extern void
+zvkg_vgmul_vv(
+    void* Y,
+    const void* H,
+    size_t n
+);
+
 #endif // ZVKG_H_
diff --git a/doc/vector/code-samples/zvkg.s b/doc/vector/code-samples/zvkg.s
index e91eb4b3..0bb81e0d 100644
--- a/doc/vector/code-samples/zvkg.s
+++ b/doc/vector/code-samples/zvkg.s
@@ -51,3 +51,92 @@ zvkg_vghsh:
     vghsh.vv v0, v8, v4
     vse32.v v0, (a0)
     ret
+
+
+# zvkg_vghsh_vv
+#
+# Performs a vector add-multiply over the GHASH Galois field for multiple
+# 128-bit element groups. 'n' is the number of 128b groups.
+# The input arrays should be 32b aligned on processors that do not
+# support unaligned 32b vector loads/stores.
+#
+# Y[i]_out = ((Y[i]_in ^ X[i]) o H[i])
+#
+# void zvkg_vghsh_vv(
+#     uint32_t* Y,   // a0
+#     uint32_t* X,   // a1
+#     uint32_t* H,   // a2
+#     size_t n       // a3
+# );
+#
+.balign 4
+.global zvkg_vghsh_vv
+zvkg_vghsh_vv:
+    beqz a3, 2f   # Early exit in the "0 groups to process" case.
+    # a3 on input is the number of 128b groups in the input arrays. We multiply
+    # it by 4 as the Zvkg instructions expect VSEW=32. a3 becomes the number
+    # of 32b elements to process, which is a multiple of 4.
+    slli a3, a3, 2
+1:
+    # We use LMUL=4 to enable runs with VLEN=32, as a proof of concept.
+    # Once VLEN>=128, we can simply use LMUL=1.
+    vsetvli t0, a3, e32, m4, ta, ma
+
+    vle32.v v0, (a0)
+    vle32.v v4, (a1)
+    vle32.v v8, (a2)
+    vghsh.vv v0, v8, v4   # Y(v0) = (Y(v0) ^ X(v4)) o H(v8)
+    vse32.v v0, (a0)
+
+    sub a3, a3, t0    # Decrement number of remaining 32b elements.
+    slli t0, t0, 2    # t0 (#bytes consumed) <- t0 (#4B elements) * 4
+    add a0, a0, t0
+    add a1, a1, t0
+    add a2, a2, t0
+    bnez a3, 1b       # More elements to process?
+
+2:
+    ret
+
+# zvkg_vgmul_vv
+#
+# Performs a vector multiply over the GHASH Galois field for multiple
+# 128-bit element groups.
+# 'n' is the number of 128b element groups; n multiplications are performed.
+# The input arrays should be 32b aligned on processors that do not
+# support unaligned 32b vector loads/stores.
+#
+# Y[i]_out = (Y[i]_in o H[i])
+#
+# void zvkg_vgmul_vv(
+#     uint32_t* Y,   // a0
+#     uint32_t* H,   // a1
+#     size_t n       // a2
+# );
+#
+.balign 4
+.global zvkg_vgmul_vv
+zvkg_vgmul_vv:
+    beqz a2, 2f   # Early exit in the "0 groups to process" case.
+    # a2 on input is the number of 128b groups in the input arrays. We multiply
+    # it by 4 as the Zvkg instructions expect VSEW=32. a2 becomes the number
+    # of 32b elements to process, which is a multiple of 4.
+    slli a2, a2, 2
+1:
+    # We use LMUL=4 to enable runs with VLEN=32, as a proof of concept.
+    # Once VLEN>=128, we can simply use LMUL=1.
+    vsetvli t0, a2, e32, m4, ta, ma
+
+    vle32.v v0, (a0)
+    vle32.v v4, (a1)
+    vgmul.vv v0, v4
+    vse32.v v0, (a0)
+
+    sub a2, a2, t0    # Decrement number of remaining 32b elements.
+    slli t0, t0, 2    # t0 (#bytes consumed) <- t0 (#4B elements) * 4
+    add a0, a0, t0
+    add a1, a1, t0
+    bnez a2, 1b       # More elements to process?
+
+2:
+    ret
+
diff --git a/doc/vector/code-samples/zvknh.s b/doc/vector/code-samples/zvknh.s
index 7c0f552d..c5b994b8 100644
--- a/doc/vector/code-samples/zvknh.s
+++ b/doc/vector/code-samples/zvknh.s
@@ -33,7 +33,7 @@
 .data
 
 .balign 16  # Only 4 is needed, 16 is just nice.
-// Note that those values are stored in native endianness.
+# Note that those values are stored in native endianness.
 SHA256_ROUND_CONSTANTS:
     .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5  # 0-3
     .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5  # 4-7
@@ -53,7 +53,7 @@ SHA256_ROUND_CONSTANTS:
     .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2  # 60-63
 
 .balign 4  # Only 8 is needed, 32 is just nice.
-// Note that those values are stored in native endianness.
+# Note that those values are stored in native endianness.
 SHA512_ROUND_CONSTANTS:
     .dword 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc  # 0-3
     .dword 0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118  # 4-7
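
A brief usage sketch (not part of the patch itself): the new `TESTED_VLENS` variable can be overridden on the make command line to restrict which VLEN values Spike is invoked with; the `TARGET` value below is the one already used in the README examples.

    # Run only the new Zvkg sample, restricted to VLEN=128 and VLEN=256.
    make run-zvkg TARGET=riscv64-unknown-linux-gnu TESTED_VLENS="128 256"

    # Run the full suite with the default list (64 128 256 512); samples
    # that do not support VLEN=64 skip that value.
    make run-tests TARGET=riscv64-unknown-linux-gnu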