From 85e509d2026040d0b51835850b7279d745f83f7c Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Sat, 28 Sep 2024 15:18:12 +0000 Subject: [PATCH] feat: update bavard and use better syntax in asm --- ecc/bls12-377/fp/element_ops_amd64.s | 2 +- ecc/bls12-377/fp/vector_test.go | 4 +- ecc/bls12-377/fr/element_ops_amd64.go | 10 +- ecc/bls12-377/fr/element_ops_amd64.s | 4 +- ecc/bls12-377/fr/vector_test.go | 4 +- ecc/bls12-381/fp/element_ops_amd64.s | 2 +- ecc/bls12-381/fp/vector_test.go | 4 +- ecc/bls12-381/fr/element_ops_amd64.go | 10 +- ecc/bls12-381/fr/element_ops_amd64.s | 4 +- ecc/bls12-381/fr/vector_test.go | 4 +- ecc/bls24-315/fp/element_ops_amd64.s | 2 +- ecc/bls24-315/fp/vector_test.go | 4 +- ecc/bls24-315/fr/element_ops_amd64.go | 10 +- ecc/bls24-315/fr/element_ops_amd64.s | 4 +- ecc/bls24-315/fr/vector_test.go | 4 +- ecc/bls24-317/fp/element_ops_amd64.s | 2 +- ecc/bls24-317/fp/vector_test.go | 4 +- ecc/bls24-317/fr/element_ops_amd64.go | 10 +- ecc/bls24-317/fr/element_ops_amd64.s | 4 +- ecc/bls24-317/fr/vector_test.go | 4 +- ecc/bn254/fp/element_ops_amd64.go | 10 +- ecc/bn254/fp/element_ops_amd64.s | 4 +- ecc/bn254/fp/vector_test.go | 4 +- ecc/bn254/fr/element_ops_amd64.go | 10 +- ecc/bn254/fr/element_ops_amd64.s | 4 +- ecc/bn254/fr/vector_test.go | 4 +- ecc/bw6-633/fp/element_ops_amd64.s | 2 +- ecc/bw6-633/fp/vector_test.go | 4 +- ecc/bw6-633/fr/element_ops_amd64.s | 2 +- ecc/bw6-633/fr/vector_test.go | 4 +- ecc/bw6-761/fp/element_ops_amd64.s | 2 +- ecc/bw6-761/fp/vector_test.go | 4 +- ecc/bw6-761/fr/element_ops_amd64.s | 2 +- ecc/bw6-761/fr/vector_test.go | 4 +- ecc/secp256k1/fp/vector_test.go | 4 +- ecc/secp256k1/fr/vector_test.go | 4 +- ecc/stark-curve/fp/element_ops_amd64.go | 10 +- ecc/stark-curve/fp/element_ops_amd64.s | 4 +- ecc/stark-curve/fp/vector_test.go | 4 +- ecc/stark-curve/fr/element_ops_amd64.go | 10 +- ecc/stark-curve/fr/element_ops_amd64.s | 4 +- ecc/stark-curve/fr/vector_test.go | 4 +- field/asm/element_10w_amd64.s | 15 +- 
field/asm/element_12w_amd64.s | 15 +- field/asm/element_4w_amd64.s | 15 +- field/asm/element_5w_amd64.s | 15 +- field/asm/element_6w_amd64.s | 15 +- field/asm/vector_4w_amd64.s | 550 ++++++++---------- field/generator/asm/amd64/build.go | 36 +- field/generator/asm/amd64/element_mul.go | 1 + field/generator/asm/amd64/element_vec.go | 339 ++--------- .../internal/templates/element/ops_asm.go | 10 +- .../templates/element/tests_vector.go | 4 +- field/goldilocks/vector_test.go | 4 +- go.mod | 4 +- go.sum | 4 +- 56 files changed, 442 insertions(+), 785 deletions(-) diff --git a/ecc/bls12-377/fp/element_ops_amd64.s b/ecc/bls12-377/fp/element_ops_amd64.s index ec133ebb5..7d569a34f 100644 --- a/ecc/bls12-377/fp/element_ops_amd64.s +++ b/ecc/bls12-377/fp/element_ops_amd64.s @@ -1,6 +1,6 @@ // +build !purego // Code generated by gnark-crypto/generator. DO NOT EDIT. -// We include the hash to force the Go compiler to recompile: 4744144000832074674 +// We include the hash to force the Go compiler to recompile: 9834095789769468746 #include "../../../field/asm/element_6w_amd64.s" diff --git a/ecc/bls12-377/fp/vector_test.go b/ecc/bls12-377/fp/vector_test.go index 5c4e3a6c3..bdce3116b 100644 --- a/ecc/bls12-377/fp/vector_test.go +++ b/ecc/bls12-377/fp/vector_test.go @@ -97,9 +97,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/ecc/bls12-377/fr/element_ops_amd64.go b/ecc/bls12-377/fr/element_ops_amd64.go index e79066cd4..b0f74525c 100644 --- a/ecc/bls12-377/fr/element_ops_amd64.go +++ b/ecc/bls12-377/fr/element_ops_amd64.go @@ -101,9 +101,6 @@ func (vector *Vector) Sum() (res Element) { //go:noescape func sumVec(res *Element, a *Element, n uint64) 
-//go:noescape -func innerProdVec(res *uint64, a, b *Element, n uint64) - // InnerProduct computes the inner product of two vectors. // It panics if the vectors don't have the same length. func (vector *Vector) InnerProduct(other Vector) (res Element) { @@ -114,10 +111,10 @@ func (vector *Vector) InnerProduct(other Vector) (res Element) { if n != uint64(len(other)) { panic("vector.InnerProduct: vectors don't have the same length") } - const minN = 0 // AVX512 slower than generic for small n const maxN = (1 << 32) - 1 - if !supportAvx512 || n <= minN || n >= maxN { + if !supportAvx512 || n >= maxN { // call innerProductVecGeneric + // note; we could split the vector into smaller chunks and call innerProductVec innerProductVecGeneric(&res, *vector, other) return } @@ -126,6 +123,9 @@ func (vector *Vector) InnerProduct(other Vector) (res Element) { return } +//go:noescape +func innerProdVec(res *uint64, a, b *Element, n uint64) + // Mul z = x * y (mod q) // // x and y must be less than q diff --git a/ecc/bls12-377/fr/element_ops_amd64.s b/ecc/bls12-377/fr/element_ops_amd64.s index 3ef8676bb..f547f8246 100644 --- a/ecc/bls12-377/fr/element_ops_amd64.s +++ b/ecc/bls12-377/fr/element_ops_amd64.s @@ -1,10 +1,10 @@ // +build !purego // Code generated by gnark-crypto/generator. DO NOT EDIT. -// We include the hash to force the Go compiler to recompile: 444291736743577851 +// We include the hash to force the Go compiler to recompile: 13361353926491767027 #include "../../../field/asm/element_4w_amd64.s" // Code generated by gnark-crypto/generator. DO NOT EDIT. 
-// We include the hash to force the Go compiler to recompile: 7023651991118674373 +// We include the hash to force the Go compiler to recompile: 5868826605799376126 #include "../../../field/asm/vector_4w_amd64.s" diff --git a/ecc/bls12-377/fr/vector_test.go b/ecc/bls12-377/fr/vector_test.go index f230cab99..6d032b9bd 100644 --- a/ecc/bls12-377/fr/vector_test.go +++ b/ecc/bls12-377/fr/vector_test.go @@ -97,9 +97,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/ecc/bls12-381/fp/element_ops_amd64.s b/ecc/bls12-381/fp/element_ops_amd64.s index ec133ebb5..7d569a34f 100644 --- a/ecc/bls12-381/fp/element_ops_amd64.s +++ b/ecc/bls12-381/fp/element_ops_amd64.s @@ -1,6 +1,6 @@ // +build !purego // Code generated by gnark-crypto/generator. DO NOT EDIT. 
-// We include the hash to force the Go compiler to recompile: 4744144000832074674 +// We include the hash to force the Go compiler to recompile: 9834095789769468746 #include "../../../field/asm/element_6w_amd64.s" diff --git a/ecc/bls12-381/fp/vector_test.go b/ecc/bls12-381/fp/vector_test.go index 5c4e3a6c3..bdce3116b 100644 --- a/ecc/bls12-381/fp/vector_test.go +++ b/ecc/bls12-381/fp/vector_test.go @@ -97,9 +97,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/ecc/bls12-381/fr/element_ops_amd64.go b/ecc/bls12-381/fr/element_ops_amd64.go index e79066cd4..b0f74525c 100644 --- a/ecc/bls12-381/fr/element_ops_amd64.go +++ b/ecc/bls12-381/fr/element_ops_amd64.go @@ -101,9 +101,6 @@ func (vector *Vector) Sum() (res Element) { //go:noescape func sumVec(res *Element, a *Element, n uint64) -//go:noescape -func innerProdVec(res *uint64, a, b *Element, n uint64) - // InnerProduct computes the inner product of two vectors. // It panics if the vectors don't have the same length. 
func (vector *Vector) InnerProduct(other Vector) (res Element) { @@ -114,10 +111,10 @@ func (vector *Vector) InnerProduct(other Vector) (res Element) { if n != uint64(len(other)) { panic("vector.InnerProduct: vectors don't have the same length") } - const minN = 0 // AVX512 slower than generic for small n const maxN = (1 << 32) - 1 - if !supportAvx512 || n <= minN || n >= maxN { + if !supportAvx512 || n >= maxN { // call innerProductVecGeneric + // note; we could split the vector into smaller chunks and call innerProductVec innerProductVecGeneric(&res, *vector, other) return } @@ -126,6 +123,9 @@ func (vector *Vector) InnerProduct(other Vector) (res Element) { return } +//go:noescape +func innerProdVec(res *uint64, a, b *Element, n uint64) + // Mul z = x * y (mod q) // // x and y must be less than q diff --git a/ecc/bls12-381/fr/element_ops_amd64.s b/ecc/bls12-381/fr/element_ops_amd64.s index 3ef8676bb..f547f8246 100644 --- a/ecc/bls12-381/fr/element_ops_amd64.s +++ b/ecc/bls12-381/fr/element_ops_amd64.s @@ -1,10 +1,10 @@ // +build !purego // Code generated by gnark-crypto/generator. DO NOT EDIT. -// We include the hash to force the Go compiler to recompile: 444291736743577851 +// We include the hash to force the Go compiler to recompile: 13361353926491767027 #include "../../../field/asm/element_4w_amd64.s" // Code generated by gnark-crypto/generator. DO NOT EDIT. 
-// We include the hash to force the Go compiler to recompile: 7023651991118674373 +// We include the hash to force the Go compiler to recompile: 5868826605799376126 #include "../../../field/asm/vector_4w_amd64.s" diff --git a/ecc/bls12-381/fr/vector_test.go b/ecc/bls12-381/fr/vector_test.go index f230cab99..6d032b9bd 100644 --- a/ecc/bls12-381/fr/vector_test.go +++ b/ecc/bls12-381/fr/vector_test.go @@ -97,9 +97,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/ecc/bls24-315/fp/element_ops_amd64.s b/ecc/bls24-315/fp/element_ops_amd64.s index 605f1ee09..5cda5d1b3 100644 --- a/ecc/bls24-315/fp/element_ops_amd64.s +++ b/ecc/bls24-315/fp/element_ops_amd64.s @@ -1,6 +1,6 @@ // +build !purego // Code generated by gnark-crypto/generator. DO NOT EDIT. 
-// We include the hash to force the Go compiler to recompile: 7085443902529260581 +// We include the hash to force the Go compiler to recompile: 14698325647798323165 #include "../../../field/asm/element_5w_amd64.s" diff --git a/ecc/bls24-315/fp/vector_test.go b/ecc/bls24-315/fp/vector_test.go index 6236eafee..5707a6271 100644 --- a/ecc/bls24-315/fp/vector_test.go +++ b/ecc/bls24-315/fp/vector_test.go @@ -97,9 +97,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/ecc/bls24-315/fr/element_ops_amd64.go b/ecc/bls24-315/fr/element_ops_amd64.go index e79066cd4..b0f74525c 100644 --- a/ecc/bls24-315/fr/element_ops_amd64.go +++ b/ecc/bls24-315/fr/element_ops_amd64.go @@ -101,9 +101,6 @@ func (vector *Vector) Sum() (res Element) { //go:noescape func sumVec(res *Element, a *Element, n uint64) -//go:noescape -func innerProdVec(res *uint64, a, b *Element, n uint64) - // InnerProduct computes the inner product of two vectors. // It panics if the vectors don't have the same length. 
func (vector *Vector) InnerProduct(other Vector) (res Element) { @@ -114,10 +111,10 @@ func (vector *Vector) InnerProduct(other Vector) (res Element) { if n != uint64(len(other)) { panic("vector.InnerProduct: vectors don't have the same length") } - const minN = 0 // AVX512 slower than generic for small n const maxN = (1 << 32) - 1 - if !supportAvx512 || n <= minN || n >= maxN { + if !supportAvx512 || n >= maxN { // call innerProductVecGeneric + // note; we could split the vector into smaller chunks and call innerProductVec innerProductVecGeneric(&res, *vector, other) return } @@ -126,6 +123,9 @@ func (vector *Vector) InnerProduct(other Vector) (res Element) { return } +//go:noescape +func innerProdVec(res *uint64, a, b *Element, n uint64) + // Mul z = x * y (mod q) // // x and y must be less than q diff --git a/ecc/bls24-315/fr/element_ops_amd64.s b/ecc/bls24-315/fr/element_ops_amd64.s index 3ef8676bb..f547f8246 100644 --- a/ecc/bls24-315/fr/element_ops_amd64.s +++ b/ecc/bls24-315/fr/element_ops_amd64.s @@ -1,10 +1,10 @@ // +build !purego // Code generated by gnark-crypto/generator. DO NOT EDIT. -// We include the hash to force the Go compiler to recompile: 444291736743577851 +// We include the hash to force the Go compiler to recompile: 13361353926491767027 #include "../../../field/asm/element_4w_amd64.s" // Code generated by gnark-crypto/generator. DO NOT EDIT. 
-// We include the hash to force the Go compiler to recompile: 7023651991118674373 +// We include the hash to force the Go compiler to recompile: 5868826605799376126 #include "../../../field/asm/vector_4w_amd64.s" diff --git a/ecc/bls24-315/fr/vector_test.go b/ecc/bls24-315/fr/vector_test.go index f230cab99..6d032b9bd 100644 --- a/ecc/bls24-315/fr/vector_test.go +++ b/ecc/bls24-315/fr/vector_test.go @@ -97,9 +97,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/ecc/bls24-317/fp/element_ops_amd64.s b/ecc/bls24-317/fp/element_ops_amd64.s index 605f1ee09..5cda5d1b3 100644 --- a/ecc/bls24-317/fp/element_ops_amd64.s +++ b/ecc/bls24-317/fp/element_ops_amd64.s @@ -1,6 +1,6 @@ // +build !purego // Code generated by gnark-crypto/generator. DO NOT EDIT. 
-// We include the hash to force the Go compiler to recompile: 7085443902529260581 +// We include the hash to force the Go compiler to recompile: 14698325647798323165 #include "../../../field/asm/element_5w_amd64.s" diff --git a/ecc/bls24-317/fp/vector_test.go b/ecc/bls24-317/fp/vector_test.go index 6236eafee..5707a6271 100644 --- a/ecc/bls24-317/fp/vector_test.go +++ b/ecc/bls24-317/fp/vector_test.go @@ -97,9 +97,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/ecc/bls24-317/fr/element_ops_amd64.go b/ecc/bls24-317/fr/element_ops_amd64.go index e79066cd4..b0f74525c 100644 --- a/ecc/bls24-317/fr/element_ops_amd64.go +++ b/ecc/bls24-317/fr/element_ops_amd64.go @@ -101,9 +101,6 @@ func (vector *Vector) Sum() (res Element) { //go:noescape func sumVec(res *Element, a *Element, n uint64) -//go:noescape -func innerProdVec(res *uint64, a, b *Element, n uint64) - // InnerProduct computes the inner product of two vectors. // It panics if the vectors don't have the same length. 
func (vector *Vector) InnerProduct(other Vector) (res Element) { @@ -114,10 +111,10 @@ func (vector *Vector) InnerProduct(other Vector) (res Element) { if n != uint64(len(other)) { panic("vector.InnerProduct: vectors don't have the same length") } - const minN = 0 // AVX512 slower than generic for small n const maxN = (1 << 32) - 1 - if !supportAvx512 || n <= minN || n >= maxN { + if !supportAvx512 || n >= maxN { // call innerProductVecGeneric + // note; we could split the vector into smaller chunks and call innerProductVec innerProductVecGeneric(&res, *vector, other) return } @@ -126,6 +123,9 @@ func (vector *Vector) InnerProduct(other Vector) (res Element) { return } +//go:noescape +func innerProdVec(res *uint64, a, b *Element, n uint64) + // Mul z = x * y (mod q) // // x and y must be less than q diff --git a/ecc/bls24-317/fr/element_ops_amd64.s b/ecc/bls24-317/fr/element_ops_amd64.s index 3ef8676bb..f547f8246 100644 --- a/ecc/bls24-317/fr/element_ops_amd64.s +++ b/ecc/bls24-317/fr/element_ops_amd64.s @@ -1,10 +1,10 @@ // +build !purego // Code generated by gnark-crypto/generator. DO NOT EDIT. -// We include the hash to force the Go compiler to recompile: 444291736743577851 +// We include the hash to force the Go compiler to recompile: 13361353926491767027 #include "../../../field/asm/element_4w_amd64.s" // Code generated by gnark-crypto/generator. DO NOT EDIT. 
-// We include the hash to force the Go compiler to recompile: 7023651991118674373 +// We include the hash to force the Go compiler to recompile: 5868826605799376126 #include "../../../field/asm/vector_4w_amd64.s" diff --git a/ecc/bls24-317/fr/vector_test.go b/ecc/bls24-317/fr/vector_test.go index f230cab99..6d032b9bd 100644 --- a/ecc/bls24-317/fr/vector_test.go +++ b/ecc/bls24-317/fr/vector_test.go @@ -97,9 +97,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/ecc/bn254/fp/element_ops_amd64.go b/ecc/bn254/fp/element_ops_amd64.go index 8f909cd0f..3b78cf464 100644 --- a/ecc/bn254/fp/element_ops_amd64.go +++ b/ecc/bn254/fp/element_ops_amd64.go @@ -101,9 +101,6 @@ func (vector *Vector) Sum() (res Element) { //go:noescape func sumVec(res *Element, a *Element, n uint64) -//go:noescape -func innerProdVec(res *uint64, a, b *Element, n uint64) - // InnerProduct computes the inner product of two vectors. // It panics if the vectors don't have the same length. 
func (vector *Vector) InnerProduct(other Vector) (res Element) { @@ -114,10 +111,10 @@ func (vector *Vector) InnerProduct(other Vector) (res Element) { if n != uint64(len(other)) { panic("vector.InnerProduct: vectors don't have the same length") } - const minN = 0 // AVX512 slower than generic for small n const maxN = (1 << 32) - 1 - if !supportAvx512 || n <= minN || n >= maxN { + if !supportAvx512 || n >= maxN { // call innerProductVecGeneric + // note; we could split the vector into smaller chunks and call innerProductVec innerProductVecGeneric(&res, *vector, other) return } @@ -126,6 +123,9 @@ func (vector *Vector) InnerProduct(other Vector) (res Element) { return } +//go:noescape +func innerProdVec(res *uint64, a, b *Element, n uint64) + // Mul z = x * y (mod q) // // x and y must be less than q diff --git a/ecc/bn254/fp/element_ops_amd64.s b/ecc/bn254/fp/element_ops_amd64.s index 3ef8676bb..f547f8246 100644 --- a/ecc/bn254/fp/element_ops_amd64.s +++ b/ecc/bn254/fp/element_ops_amd64.s @@ -1,10 +1,10 @@ // +build !purego // Code generated by gnark-crypto/generator. DO NOT EDIT. -// We include the hash to force the Go compiler to recompile: 444291736743577851 +// We include the hash to force the Go compiler to recompile: 13361353926491767027 #include "../../../field/asm/element_4w_amd64.s" // Code generated by gnark-crypto/generator. DO NOT EDIT. 
-// We include the hash to force the Go compiler to recompile: 7023651991118674373 +// We include the hash to force the Go compiler to recompile: 5868826605799376126 #include "../../../field/asm/vector_4w_amd64.s" diff --git a/ecc/bn254/fp/vector_test.go b/ecc/bn254/fp/vector_test.go index 655f88692..e4996e522 100644 --- a/ecc/bn254/fp/vector_test.go +++ b/ecc/bn254/fp/vector_test.go @@ -97,9 +97,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/ecc/bn254/fr/element_ops_amd64.go b/ecc/bn254/fr/element_ops_amd64.go index e79066cd4..b0f74525c 100644 --- a/ecc/bn254/fr/element_ops_amd64.go +++ b/ecc/bn254/fr/element_ops_amd64.go @@ -101,9 +101,6 @@ func (vector *Vector) Sum() (res Element) { //go:noescape func sumVec(res *Element, a *Element, n uint64) -//go:noescape -func innerProdVec(res *uint64, a, b *Element, n uint64) - // InnerProduct computes the inner product of two vectors. // It panics if the vectors don't have the same length. 
func (vector *Vector) InnerProduct(other Vector) (res Element) { @@ -114,10 +111,10 @@ func (vector *Vector) InnerProduct(other Vector) (res Element) { if n != uint64(len(other)) { panic("vector.InnerProduct: vectors don't have the same length") } - const minN = 0 // AVX512 slower than generic for small n const maxN = (1 << 32) - 1 - if !supportAvx512 || n <= minN || n >= maxN { + if !supportAvx512 || n >= maxN { // call innerProductVecGeneric + // note; we could split the vector into smaller chunks and call innerProductVec innerProductVecGeneric(&res, *vector, other) return } @@ -126,6 +123,9 @@ func (vector *Vector) InnerProduct(other Vector) (res Element) { return } +//go:noescape +func innerProdVec(res *uint64, a, b *Element, n uint64) + // Mul z = x * y (mod q) // // x and y must be less than q diff --git a/ecc/bn254/fr/element_ops_amd64.s b/ecc/bn254/fr/element_ops_amd64.s index 3ef8676bb..f547f8246 100644 --- a/ecc/bn254/fr/element_ops_amd64.s +++ b/ecc/bn254/fr/element_ops_amd64.s @@ -1,10 +1,10 @@ // +build !purego // Code generated by gnark-crypto/generator. DO NOT EDIT. -// We include the hash to force the Go compiler to recompile: 444291736743577851 +// We include the hash to force the Go compiler to recompile: 13361353926491767027 #include "../../../field/asm/element_4w_amd64.s" // Code generated by gnark-crypto/generator. DO NOT EDIT. 
-// We include the hash to force the Go compiler to recompile: 7023651991118674373 +// We include the hash to force the Go compiler to recompile: 5868826605799376126 #include "../../../field/asm/vector_4w_amd64.s" diff --git a/ecc/bn254/fr/vector_test.go b/ecc/bn254/fr/vector_test.go index f230cab99..6d032b9bd 100644 --- a/ecc/bn254/fr/vector_test.go +++ b/ecc/bn254/fr/vector_test.go @@ -97,9 +97,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/ecc/bw6-633/fp/element_ops_amd64.s b/ecc/bw6-633/fp/element_ops_amd64.s index 8fecb7e10..3ba7db746 100644 --- a/ecc/bw6-633/fp/element_ops_amd64.s +++ b/ecc/bw6-633/fp/element_ops_amd64.s @@ -1,6 +1,6 @@ // +build !purego // Code generated by gnark-crypto/generator. DO NOT EDIT. 
-// We include the hash to force the Go compiler to recompile: 740214979564192701 +// We include the hash to force the Go compiler to recompile: 2301942780660398757 #include "../../../field/asm/element_10w_amd64.s" diff --git a/ecc/bw6-633/fp/vector_test.go b/ecc/bw6-633/fp/vector_test.go index c6dd6cfd1..6a86fd362 100644 --- a/ecc/bw6-633/fp/vector_test.go +++ b/ecc/bw6-633/fp/vector_test.go @@ -97,9 +97,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/ecc/bw6-633/fr/element_ops_amd64.s b/ecc/bw6-633/fr/element_ops_amd64.s index 605f1ee09..5cda5d1b3 100644 --- a/ecc/bw6-633/fr/element_ops_amd64.s +++ b/ecc/bw6-633/fr/element_ops_amd64.s @@ -1,6 +1,6 @@ // +build !purego // Code generated by gnark-crypto/generator. DO NOT EDIT. 
-// We include the hash to force the Go compiler to recompile: 7085443902529260581 +// We include the hash to force the Go compiler to recompile: 14698325647798323165 #include "../../../field/asm/element_5w_amd64.s" diff --git a/ecc/bw6-633/fr/vector_test.go b/ecc/bw6-633/fr/vector_test.go index 415a8b490..fbea4a859 100644 --- a/ecc/bw6-633/fr/vector_test.go +++ b/ecc/bw6-633/fr/vector_test.go @@ -97,9 +97,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/ecc/bw6-761/fp/element_ops_amd64.s b/ecc/bw6-761/fp/element_ops_amd64.s index 520b1fb87..53ef94eec 100644 --- a/ecc/bw6-761/fp/element_ops_amd64.s +++ b/ecc/bw6-761/fp/element_ops_amd64.s @@ -1,6 +1,6 @@ // +build !purego // Code generated by gnark-crypto/generator. DO NOT EDIT. 
-// We include the hash to force the Go compiler to recompile: 11568055404266003096 +// We include the hash to force the Go compiler to recompile: 15402800788770244368 #include "../../../field/asm/element_12w_amd64.s" diff --git a/ecc/bw6-761/fp/vector_test.go b/ecc/bw6-761/fp/vector_test.go index 1daa1dca4..fc4837597 100644 --- a/ecc/bw6-761/fp/vector_test.go +++ b/ecc/bw6-761/fp/vector_test.go @@ -97,9 +97,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/ecc/bw6-761/fr/element_ops_amd64.s b/ecc/bw6-761/fr/element_ops_amd64.s index ec133ebb5..7d569a34f 100644 --- a/ecc/bw6-761/fr/element_ops_amd64.s +++ b/ecc/bw6-761/fr/element_ops_amd64.s @@ -1,6 +1,6 @@ // +build !purego // Code generated by gnark-crypto/generator. DO NOT EDIT. 
-// We include the hash to force the Go compiler to recompile: 4744144000832074674 +// We include the hash to force the Go compiler to recompile: 9834095789769468746 #include "../../../field/asm/element_6w_amd64.s" diff --git a/ecc/bw6-761/fr/vector_test.go b/ecc/bw6-761/fr/vector_test.go index 0cf7bb0b9..02e50c43e 100644 --- a/ecc/bw6-761/fr/vector_test.go +++ b/ecc/bw6-761/fr/vector_test.go @@ -97,9 +97,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/ecc/secp256k1/fp/vector_test.go b/ecc/secp256k1/fp/vector_test.go index 655f88692..e4996e522 100644 --- a/ecc/secp256k1/fp/vector_test.go +++ b/ecc/secp256k1/fp/vector_test.go @@ -97,9 +97,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/ecc/secp256k1/fr/vector_test.go b/ecc/secp256k1/fr/vector_test.go index f230cab99..6d032b9bd 100644 --- a/ecc/secp256k1/fr/vector_test.go +++ b/ecc/secp256k1/fr/vector_test.go @@ -97,9 +97,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/ecc/stark-curve/fp/element_ops_amd64.go 
b/ecc/stark-curve/fp/element_ops_amd64.go index 8f909cd0f..3b78cf464 100644 --- a/ecc/stark-curve/fp/element_ops_amd64.go +++ b/ecc/stark-curve/fp/element_ops_amd64.go @@ -101,9 +101,6 @@ func (vector *Vector) Sum() (res Element) { //go:noescape func sumVec(res *Element, a *Element, n uint64) -//go:noescape -func innerProdVec(res *uint64, a, b *Element, n uint64) - // InnerProduct computes the inner product of two vectors. // It panics if the vectors don't have the same length. func (vector *Vector) InnerProduct(other Vector) (res Element) { @@ -114,10 +111,10 @@ func (vector *Vector) InnerProduct(other Vector) (res Element) { if n != uint64(len(other)) { panic("vector.InnerProduct: vectors don't have the same length") } - const minN = 0 // AVX512 slower than generic for small n const maxN = (1 << 32) - 1 - if !supportAvx512 || n <= minN || n >= maxN { + if !supportAvx512 || n >= maxN { // call innerProductVecGeneric + // note; we could split the vector into smaller chunks and call innerProductVec innerProductVecGeneric(&res, *vector, other) return } @@ -126,6 +123,9 @@ func (vector *Vector) InnerProduct(other Vector) (res Element) { return } +//go:noescape +func innerProdVec(res *uint64, a, b *Element, n uint64) + // Mul z = x * y (mod q) // // x and y must be less than q diff --git a/ecc/stark-curve/fp/element_ops_amd64.s b/ecc/stark-curve/fp/element_ops_amd64.s index 3ef8676bb..f547f8246 100644 --- a/ecc/stark-curve/fp/element_ops_amd64.s +++ b/ecc/stark-curve/fp/element_ops_amd64.s @@ -1,10 +1,10 @@ // +build !purego // Code generated by gnark-crypto/generator. DO NOT EDIT. -// We include the hash to force the Go compiler to recompile: 444291736743577851 +// We include the hash to force the Go compiler to recompile: 13361353926491767027 #include "../../../field/asm/element_4w_amd64.s" // Code generated by gnark-crypto/generator. DO NOT EDIT. 
-// We include the hash to force the Go compiler to recompile: 7023651991118674373 +// We include the hash to force the Go compiler to recompile: 5868826605799376126 #include "../../../field/asm/vector_4w_amd64.s" diff --git a/ecc/stark-curve/fp/vector_test.go b/ecc/stark-curve/fp/vector_test.go index 655f88692..e4996e522 100644 --- a/ecc/stark-curve/fp/vector_test.go +++ b/ecc/stark-curve/fp/vector_test.go @@ -97,9 +97,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/ecc/stark-curve/fr/element_ops_amd64.go b/ecc/stark-curve/fr/element_ops_amd64.go index e79066cd4..b0f74525c 100644 --- a/ecc/stark-curve/fr/element_ops_amd64.go +++ b/ecc/stark-curve/fr/element_ops_amd64.go @@ -101,9 +101,6 @@ func (vector *Vector) Sum() (res Element) { //go:noescape func sumVec(res *Element, a *Element, n uint64) -//go:noescape -func innerProdVec(res *uint64, a, b *Element, n uint64) - // InnerProduct computes the inner product of two vectors. // It panics if the vectors don't have the same length. 
func (vector *Vector) InnerProduct(other Vector) (res Element) { @@ -114,10 +111,10 @@ func (vector *Vector) InnerProduct(other Vector) (res Element) { if n != uint64(len(other)) { panic("vector.InnerProduct: vectors don't have the same length") } - const minN = 0 // AVX512 slower than generic for small n const maxN = (1 << 32) - 1 - if !supportAvx512 || n <= minN || n >= maxN { + if !supportAvx512 || n >= maxN { // call innerProductVecGeneric + // note; we could split the vector into smaller chunks and call innerProductVec innerProductVecGeneric(&res, *vector, other) return } @@ -126,6 +123,9 @@ func (vector *Vector) InnerProduct(other Vector) (res Element) { return } +//go:noescape +func innerProdVec(res *uint64, a, b *Element, n uint64) + // Mul z = x * y (mod q) // // x and y must be less than q diff --git a/ecc/stark-curve/fr/element_ops_amd64.s b/ecc/stark-curve/fr/element_ops_amd64.s index 3ef8676bb..f547f8246 100644 --- a/ecc/stark-curve/fr/element_ops_amd64.s +++ b/ecc/stark-curve/fr/element_ops_amd64.s @@ -1,10 +1,10 @@ // +build !purego // Code generated by gnark-crypto/generator. DO NOT EDIT. -// We include the hash to force the Go compiler to recompile: 444291736743577851 +// We include the hash to force the Go compiler to recompile: 13361353926491767027 #include "../../../field/asm/element_4w_amd64.s" // Code generated by gnark-crypto/generator. DO NOT EDIT. 
-// We include the hash to force the Go compiler to recompile: 7023651991118674373 +// We include the hash to force the Go compiler to recompile: 5868826605799376126 #include "../../../field/asm/vector_4w_amd64.s" diff --git a/ecc/stark-curve/fr/vector_test.go b/ecc/stark-curve/fr/vector_test.go index f230cab99..6d032b9bd 100644 --- a/ecc/stark-curve/fr/vector_test.go +++ b/ecc/stark-curve/fr/vector_test.go @@ -97,9 +97,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/field/asm/element_10w_amd64.s b/field/asm/element_10w_amd64.s index 9c1f5fe9b..f12cc2ccd 100644 --- a/field/asm/element_10w_amd64.s +++ b/field/asm/element_10w_amd64.s @@ -1,17 +1,4 @@ -// Copyright 2020 ConsenSys Software Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - +// Code generated by gnark-crypto/generator. DO NOT EDIT. #include "textflag.h" #include "funcdata.h" #include "go_asm.h" diff --git a/field/asm/element_12w_amd64.s b/field/asm/element_12w_amd64.s index 1a0e759cc..649f94f4b 100644 --- a/field/asm/element_12w_amd64.s +++ b/field/asm/element_12w_amd64.s @@ -1,17 +1,4 @@ -// Copyright 2020 ConsenSys Software Inc. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - +// Code generated by gnark-crypto/generator. DO NOT EDIT. #include "textflag.h" #include "funcdata.h" #include "go_asm.h" diff --git a/field/asm/element_4w_amd64.s b/field/asm/element_4w_amd64.s index c2b0fb6cd..cdd59505b 100644 --- a/field/asm/element_4w_amd64.s +++ b/field/asm/element_4w_amd64.s @@ -1,17 +1,4 @@ -// Copyright 2020 ConsenSys Software Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - +// Code generated by gnark-crypto/generator. DO NOT EDIT. #include "textflag.h" #include "funcdata.h" #include "go_asm.h" diff --git a/field/asm/element_5w_amd64.s b/field/asm/element_5w_amd64.s index 00065313d..efe7dd0b2 100644 --- a/field/asm/element_5w_amd64.s +++ b/field/asm/element_5w_amd64.s @@ -1,17 +1,4 @@ -// Copyright 2020 ConsenSys Software Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - +// Code generated by gnark-crypto/generator. DO NOT EDIT. #include "textflag.h" #include "funcdata.h" #include "go_asm.h" diff --git a/field/asm/element_6w_amd64.s b/field/asm/element_6w_amd64.s index abbbd289b..9cc921008 100644 --- a/field/asm/element_6w_amd64.s +++ b/field/asm/element_6w_amd64.s @@ -1,17 +1,4 @@ -// Copyright 2020 ConsenSys Software Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - +// Code generated by gnark-crypto/generator. DO NOT EDIT. #include "textflag.h" #include "funcdata.h" #include "go_asm.h" diff --git a/field/asm/vector_4w_amd64.s b/field/asm/vector_4w_amd64.s index b97a1cf55..16c877490 100644 --- a/field/asm/vector_4w_amd64.s +++ b/field/asm/vector_4w_amd64.s @@ -1,3 +1,6 @@ +// Code generated by gnark-crypto/generator. DO NOT EDIT. 
+// Functions are derived from Dag Arne Osvik's work in github.com/a16z/vectorized-fields + // addVec(res, a, b *Element, n uint64) res[0...n] = a[0...n] + b[0...n] TEXT ·addVec(SB), NOSPLIT, $0-32 MOVQ res+0(FP), CX @@ -517,11 +520,11 @@ accumulate_11: // lo(hi(w1)) -> CX // lo(hi(w2)) -> R15 // lo(hi(w3)) -> R14 -#define SPLIT_LO_HI(lo, hi) \ - MOVQ hi, lo; \ - ANDQ $0xffffffff, lo; \ - SHLQ $32, lo; \ - SHRQ $32, hi; \ +#define SPLIT_LO_HI(in0, in1) \ + MOVQ in1, in0 \ + ANDQ $0xffffffff, in0 \ + SHLQ $32, in0 \ + SHRQ $32, in1 \ SPLIT_LO_HI(R13, SI) SPLIT_LO_HI(CX, R8) @@ -607,9 +610,11 @@ done_9: // innerProdVec(res, a,b *Element, n uint64) res = sum(a[0...n] * b[0...n]) TEXT ·innerProdVec(SB), NOSPLIT, $0-32 - MOVQ a+8(FP), R14 - MOVQ b+16(FP), R15 - MOVQ n+24(FP), CX + MOVQ a+8(FP), R14 + MOVQ b+16(FP), R15 + MOVQ n+24(FP), CX + + // Create mask for low dword in each qword VPCMPEQB Y0, Y0, Y0 VPMOVZXDQ Y0, Z5 VPXORQ Z16, Z16, Z16 @@ -638,327 +643,238 @@ loop_13: ADDQ $32, R15 // we multiply and accumulate partial products of 4 bytes * 32 bytes - VPMULUDQ.BCST 0*4(R14), Z4, Z2 - VPSRLQ $32, Z2, Z3 - VPANDQ Z5, Z2, Z2 - VPADDQ Z2, Z16, Z16 - VPADDQ Z3, Z24, Z24 - VPMULUDQ.BCST 1*4(R14), Z4, Z2 - VPSRLQ $32, Z2, Z3 - VPANDQ Z5, Z2, Z2 - VPADDQ Z2, Z17, Z17 - VPADDQ Z3, Z25, Z25 - VPMULUDQ.BCST 2*4(R14), Z4, Z2 - VPSRLQ $32, Z2, Z3 - VPANDQ Z5, Z2, Z2 - VPADDQ Z2, Z18, Z18 - VPADDQ Z3, Z26, Z26 - VPMULUDQ.BCST 3*4(R14), Z4, Z2 - VPSRLQ $32, Z2, Z3 - VPANDQ Z5, Z2, Z2 - VPADDQ Z2, Z19, Z19 - VPADDQ Z3, Z27, Z27 - VPMULUDQ.BCST 4*4(R14), Z4, Z2 - VPSRLQ $32, Z2, Z3 - VPANDQ Z5, Z2, Z2 - VPADDQ Z2, Z20, Z20 - VPADDQ Z3, Z28, Z28 - VPMULUDQ.BCST 5*4(R14), Z4, Z2 - VPSRLQ $32, Z2, Z3 - VPANDQ Z5, Z2, Z2 - VPADDQ Z2, Z21, Z21 - VPADDQ Z3, Z29, Z29 - VPMULUDQ.BCST 6*4(R14), Z4, Z2 - VPSRLQ $32, Z2, Z3 - VPANDQ Z5, Z2, Z2 - VPADDQ Z2, Z22, Z22 - VPADDQ Z3, Z30, Z30 - VPMULUDQ.BCST 7*4(R14), Z4, Z2 - VPSRLQ $32, Z2, Z3 - VPANDQ Z5, Z2, Z2 - VPADDQ Z2, Z23, Z23 - VPADDQ 
Z3, Z31, Z31 - ADDQ $32, R14 - DECQ CX // decrement n - JMP loop_13 +#define MAC(in0, in1, in2) \ + VPMULUDQ.BCST in0, Z4, Z2 \ + VPSRLQ $32, Z2, Z3 \ + VPANDQ Z5, Z2, Z2 \ + VPADDQ Z2, in1, in1 \ + VPADDQ Z3, in2, in2 \ + + MAC(0*4(R14), Z16, Z24) + MAC(1*4(R14), Z17, Z25) + MAC(2*4(R14), Z18, Z26) + MAC(3*4(R14), Z19, Z27) + MAC(4*4(R14), Z20, Z28) + MAC(5*4(R14), Z21, Z29) + MAC(6*4(R14), Z22, Z30) + MAC(7*4(R14), Z23, Z31) + ADDQ $32, R14 + DECQ CX // decrement n + JMP loop_13 accumulate_15: + // we accumulate the partial products into 544bits in Z1:Z0 MOVQ $0x0000000000001555, AX KMOVD AX, K1 MOVQ $1, AX KMOVD AX, K2 // store the least significant 32 bits of ACC (starts with A0L) in Z0 - VALIGND.Z $16, Z16, Z16, K2, Z0 - KSHIFTLW $1, K2, K2 - VPSRLQ $32, Z16, Z2 - VALIGND.Z $2, Z16, Z16, K1, Z16 - VPADDQ Z2, Z16, Z16 - VPANDQ Z5, Z24, Z2 - VPADDQ Z2, Z16, Z16 - VPANDQ Z5, Z17, Z2 - VPADDQ Z2, Z16, Z16 - VALIGND $15, Z16, Z16, K2, Z0 - KSHIFTLW $1, K2, K2 - VPSRLQ $32, Z16, Z2 - VALIGND.Z $2, Z16, Z16, K1, Z16 - VPADDQ Z2, Z16, Z16 - VPSRLQ $32, Z24, Z24 - VPADDQ Z24, Z16, Z16 - VPSRLQ $32, Z17, Z17 - VPADDQ Z17, Z16, Z16 - VPANDQ Z5, Z25, Z2 - VPADDQ Z2, Z16, Z16 - VPANDQ Z5, Z18, Z2 - VPADDQ Z2, Z16, Z16 - VALIGND $16-2, Z16, Z16, K2, Z0 - KADDW K2, K2, K2 - VPSRLQ $32, Z16, Z2 - VALIGND.Z $2, Z16, Z16, K1, Z16 - VPADDQ Z2, Z16, Z16 - VPSRLQ $32, Z25, Z25 - VPADDQ Z25, Z16, Z16 - VPSRLQ $32, Z18, Z18 - VPADDQ Z18, Z16, Z16 - VPANDQ Z5, Z26, Z2 - VPADDQ Z2, Z16, Z16 - VPANDQ Z5, Z19, Z2 - VPADDQ Z2, Z16, Z16 - VALIGND $16-3, Z16, Z16, K2, Z0 - KADDW K2, K2, K2 - VPSRLQ $32, Z16, Z2 - VALIGND.Z $2, Z16, Z16, K1, Z16 - VPADDQ Z2, Z16, Z16 - VPSRLQ $32, Z26, Z26 - VPADDQ Z26, Z16, Z16 - VPSRLQ $32, Z19, Z19 - VPADDQ Z19, Z16, Z16 - VPANDQ Z5, Z27, Z2 - VPADDQ Z2, Z16, Z16 - VPANDQ Z5, Z20, Z2 - VPADDQ Z2, Z16, Z16 - VALIGND $16-4, Z16, Z16, K2, Z0 - KADDW K2, K2, K2 - VPSRLQ $32, Z16, Z2 - VALIGND.Z $2, Z16, Z16, K1, Z16 - VPADDQ Z2, Z16, Z16 - VPSRLQ $32, Z27, 
Z27 - VPADDQ Z27, Z16, Z16 - VPSRLQ $32, Z20, Z20 - VPADDQ Z20, Z16, Z16 - VPANDQ Z5, Z28, Z2 - VPADDQ Z2, Z16, Z16 - VPANDQ Z5, Z21, Z2 - VPADDQ Z2, Z16, Z16 - VALIGND $16-5, Z16, Z16, K2, Z0 - KADDW K2, K2, K2 - VPSRLQ $32, Z16, Z2 - VALIGND.Z $2, Z16, Z16, K1, Z16 - VPADDQ Z2, Z16, Z16 - VPSRLQ $32, Z28, Z28 - VPADDQ Z28, Z16, Z16 - VPSRLQ $32, Z21, Z21 - VPADDQ Z21, Z16, Z16 - VPANDQ Z5, Z29, Z2 - VPADDQ Z2, Z16, Z16 - VPANDQ Z5, Z22, Z2 - VPADDQ Z2, Z16, Z16 - VALIGND $16-6, Z16, Z16, K2, Z0 - KADDW K2, K2, K2 - VPSRLQ $32, Z16, Z2 - VALIGND.Z $2, Z16, Z16, K1, Z16 - VPADDQ Z2, Z16, Z16 - VPSRLQ $32, Z29, Z29 - VPADDQ Z29, Z16, Z16 - VPSRLQ $32, Z22, Z22 - VPADDQ Z22, Z16, Z16 - VPANDQ Z5, Z30, Z2 - VPADDQ Z2, Z16, Z16 - VPANDQ Z5, Z23, Z2 - VPADDQ Z2, Z16, Z16 - VALIGND $16-7, Z16, Z16, K2, Z0 - KADDW K2, K2, K2 - VPSRLQ $32, Z16, Z2 - VALIGND.Z $2, Z16, Z16, K1, Z16 - VPADDQ Z2, Z16, Z16 - VPSRLQ $32, Z30, Z30 - VPADDQ Z30, Z16, Z16 - VPSRLQ $32, Z23, Z23 - VPADDQ Z23, Z16, Z16 - VPANDQ Z5, Z31, Z2 - VPADDQ Z2, Z16, Z16 - VALIGND $16-8, Z16, Z16, K2, Z0 - KSHIFTLW $1, K2, K2 - VPSRLQ $32, Z16, Z2 - VALIGND.Z $2, Z16, Z16, K1, Z16 - VPADDQ Z2, Z16, Z16 - VPSRLQ $32, Z31, Z31 - VPADDQ Z31, Z16, Z16 - VALIGND $16-9, Z16, Z16, K2, Z0 - KSHIFTLW $1, K2, K2 - VPSRLQ $32, Z16, Z2 - VALIGND.Z $2, Z16, Z16, K1, Z16 - VPADDQ Z2, Z16, Z16 - VALIGND $16-10, Z16, Z16, K2, Z0 - KSHIFTLW $1, K2, K2 - VPSRLQ $32, Z16, Z2 - VALIGND.Z $2, Z16, Z16, K1, Z16 - VPADDQ Z2, Z16, Z16 - VALIGND $16-11, Z16, Z16, K2, Z0 - KSHIFTLW $1, K2, K2 - VPSRLQ $32, Z16, Z2 - VALIGND.Z $2, Z16, Z16, K1, Z16 - VPADDQ Z2, Z16, Z16 - VALIGND $16-12, Z16, Z16, K2, Z0 - KSHIFTLW $1, K2, K2 - VPSRLQ $32, Z16, Z2 - VALIGND.Z $2, Z16, Z16, K1, Z16 - VPADDQ Z2, Z16, Z16 - VALIGND $16-13, Z16, Z16, K2, Z0 - KSHIFTLW $1, K2, K2 - VPSRLQ $32, Z16, Z2 - VALIGND.Z $2, Z16, Z16, K1, Z16 - VPADDQ Z2, Z16, Z16 - VALIGND $16-14, Z16, Z16, K2, Z0 - KSHIFTLW $1, K2, K2 - VPSRLQ $32, Z16, Z2 - VALIGND.Z $2, Z16, 
Z16, K1, Z16 - VPADDQ Z2, Z16, Z16 - VALIGND $16-15, Z16, Z16, K2, Z0 - KSHIFTLW $1, K2, K2 + VALIGND.Z $16, Z16, Z16, K2, Z0 + KSHIFTLW $1, K2, K2 + VPSRLQ $32, Z16, Z2 + VALIGND.Z $2, Z16, Z16, K1, Z16 + VPADDQ Z2, Z16, Z16 + VPANDQ Z5, Z24, Z2 + VPADDQ Z2, Z16, Z16 + VPANDQ Z5, Z17, Z2 + VPADDQ Z2, Z16, Z16 + VALIGND $15, Z16, Z16, K2, Z0 + KSHIFTLW $1, K2, K2 + + // macro to add partial products and store the result in Z0 +#define ADDPP(in0, in1, in2, in3, in4) \ + VPSRLQ $32, Z16, Z2 \ + VALIGND.Z $2, Z16, Z16, K1, Z16 \ + VPADDQ Z2, Z16, Z16 \ + VPSRLQ $32, in0, in0 \ + VPADDQ in0, Z16, Z16 \ + VPSRLQ $32, in1, in1 \ + VPADDQ in1, Z16, Z16 \ + VPANDQ Z5, in2, Z2 \ + VPADDQ Z2, Z16, Z16 \ + VPANDQ Z5, in3, Z2 \ + VPADDQ Z2, Z16, Z16 \ + VALIGND $16-in4, Z16, Z16, K2, Z0 \ + KADDW K2, K2, K2 \ + + ADDPP(Z24, Z17, Z25, Z18, 2) + ADDPP(Z25, Z18, Z26, Z19, 3) + ADDPP(Z26, Z19, Z27, Z20, 4) + ADDPP(Z27, Z20, Z28, Z21, 5) + ADDPP(Z28, Z21, Z29, Z22, 6) + ADDPP(Z29, Z22, Z30, Z23, 7) + VPSRLQ $32, Z16, Z2 + VALIGND.Z $2, Z16, Z16, K1, Z16 + VPADDQ Z2, Z16, Z16 + VPSRLQ $32, Z30, Z30 + VPADDQ Z30, Z16, Z16 + VPSRLQ $32, Z23, Z23 + VPADDQ Z23, Z16, Z16 + VPANDQ Z5, Z31, Z2 + VPADDQ Z2, Z16, Z16 + VALIGND $16-8, Z16, Z16, K2, Z0 + KSHIFTLW $1, K2, K2 + VPSRLQ $32, Z16, Z2 + VALIGND.Z $2, Z16, Z16, K1, Z16 + VPADDQ Z2, Z16, Z16 + VPSRLQ $32, Z31, Z31 + VPADDQ Z31, Z16, Z16 + VALIGND $16-9, Z16, Z16, K2, Z0 + KSHIFTLW $1, K2, K2 + +#define ADDPP2(in0) \ + VPSRLQ $32, Z16, Z2 \ + VALIGND.Z $2, Z16, Z16, K1, Z16 \ + VPADDQ Z2, Z16, Z16 \ + VALIGND $16-in0, Z16, Z16, K2, Z0 \ + KSHIFTLW $1, K2, K2 \ + + ADDPP2(10) + ADDPP2(11) + ADDPP2(12) + ADDPP2(13) + ADDPP2(14) + ADDPP2(15) VPSRLQ $32, Z16, Z2 VALIGND.Z $2, Z16, Z16, K1, Z16 VPADDQ Z2, Z16, Z16 VMOVDQA64.Z Z16, K1, Z1 - VMOVQ X0, SI - VALIGNQ $1, Z0, Z1, Z0 - VMOVQ X0, DI - VALIGNQ $1, Z0, Z0, Z0 - VMOVQ X0, R8 - VALIGNQ $1, Z0, Z0, Z0 - VMOVQ X0, R9 - VALIGNQ $1, Z0, Z0, Z0 - XORQ BX, BX - MOVQ $const_qInvNeg, DX - 
MULXQ SI, DX, R10 - MULXQ ·qElement+0(SB), AX, R10 - ADDQ AX, SI - ADCQ R10, DI - MULXQ ·qElement+16(SB), AX, R10 - ADCQ AX, R8 - ADCQ R10, R9 - ADCQ $0, BX - MULXQ ·qElement+8(SB), AX, R10 - ADDQ AX, DI - ADCQ R10, R8 - MULXQ ·qElement+24(SB), AX, R10 - ADCQ AX, R9 - ADCQ R10, BX - ADCQ $0, SI - MOVQ $const_qInvNeg, DX - MULXQ DI, DX, R10 - MULXQ ·qElement+0(SB), AX, R10 - ADDQ AX, DI - ADCQ R10, R8 - MULXQ ·qElement+16(SB), AX, R10 - ADCQ AX, R9 - ADCQ R10, BX - ADCQ $0, SI - MULXQ ·qElement+8(SB), AX, R10 - ADDQ AX, R8 - ADCQ R10, R9 - MULXQ ·qElement+24(SB), AX, R10 - ADCQ AX, BX - ADCQ R10, SI - ADCQ $0, DI - MOVQ $const_qInvNeg, DX - MULXQ R8, DX, R10 - MULXQ ·qElement+0(SB), AX, R10 - ADDQ AX, R8 - ADCQ R10, R9 - MULXQ ·qElement+16(SB), AX, R10 - ADCQ AX, BX - ADCQ R10, SI - ADCQ $0, DI - MULXQ ·qElement+8(SB), AX, R10 - ADDQ AX, R9 - ADCQ R10, BX - MULXQ ·qElement+24(SB), AX, R10 - ADCQ AX, SI - ADCQ R10, DI - ADCQ $0, R8 - MOVQ $const_qInvNeg, DX - MULXQ R9, DX, R10 - MULXQ ·qElement+0(SB), AX, R10 - ADDQ AX, R9 - ADCQ R10, BX - MULXQ ·qElement+16(SB), AX, R10 - ADCQ AX, SI - ADCQ R10, DI - ADCQ $0, R8 - MULXQ ·qElement+8(SB), AX, R10 - ADDQ AX, BX - ADCQ R10, SI - MULXQ ·qElement+24(SB), AX, R10 - ADCQ AX, DI - ADCQ R10, R8 - ADCQ $0, R9 - VMOVQ X0, AX - ADDQ AX, BX - VALIGNQ $1, Z0, Z0, Z0 - VMOVQ X0, AX - ADCQ AX, SI - VALIGNQ $1, Z0, Z0, Z0 - VMOVQ X0, AX - ADCQ AX, DI - VALIGNQ $1, Z0, Z0, Z0 - VMOVQ X0, AX - ADCQ AX, R8 - VALIGNQ $1, Z0, Z0, Z0 - VMOVQ X0, AX - ADCQ AX, R9 - MOVQ R8, AX - SHRQ $32, R9, AX - MOVQ $const_mu, DX - MULQ DX - MULXQ ·qElement+0(SB), AX, R10 - SUBQ AX, BX - SBBQ R10, SI - MULXQ ·qElement+16(SB), AX, R10 - SBBQ AX, DI - SBBQ R10, R8 - SBBQ $0, R9 - MULXQ ·qElement+8(SB), AX, R10 - SUBQ AX, SI - SBBQ R10, DI - MULXQ ·qElement+24(SB), AX, R10 - SBBQ AX, R8 - SBBQ R10, R9 - MOVQ res+0(FP), R11 - MOVQ BX, 0(R11) - MOVQ SI, 8(R11) - MOVQ DI, 16(R11) - MOVQ R8, 24(R11) - SUBQ ·qElement+0(SB), BX - SBBQ ·qElement+8(SB), SI - SBBQ 
·qElement+16(SB), DI - SBBQ ·qElement+24(SB), R8 - SBBQ $0, R9 - JCS done_14 - MOVQ BX, 0(R11) - MOVQ SI, 8(R11) - MOVQ DI, 16(R11) - MOVQ R8, 24(R11) - SUBQ ·qElement+0(SB), BX - SBBQ ·qElement+8(SB), SI - SBBQ ·qElement+16(SB), DI - SBBQ ·qElement+24(SB), R8 - SBBQ $0, R9 - JCS done_14 - MOVQ BX, 0(R11) - MOVQ SI, 8(R11) - MOVQ DI, 16(R11) - MOVQ R8, 24(R11) + + // Extract the 4 least significant qwords of Z0 + VMOVQ X0, SI + VALIGNQ $1, Z0, Z1, Z0 + VMOVQ X0, DI + VALIGNQ $1, Z0, Z0, Z0 + VMOVQ X0, R8 + VALIGNQ $1, Z0, Z0, Z0 + VMOVQ X0, R9 + VALIGNQ $1, Z0, Z0, Z0 + XORQ BX, BX + MOVQ $const_qInvNeg, DX + MULXQ SI, DX, R10 + MULXQ ·qElement+0(SB), AX, R10 + ADDQ AX, SI + ADCQ R10, DI + MULXQ ·qElement+16(SB), AX, R10 + ADCQ AX, R8 + ADCQ R10, R9 + ADCQ $0, BX + MULXQ ·qElement+8(SB), AX, R10 + ADDQ AX, DI + ADCQ R10, R8 + MULXQ ·qElement+24(SB), AX, R10 + ADCQ AX, R9 + ADCQ R10, BX + ADCQ $0, SI + MOVQ $const_qInvNeg, DX + MULXQ DI, DX, R10 + MULXQ ·qElement+0(SB), AX, R10 + ADDQ AX, DI + ADCQ R10, R8 + MULXQ ·qElement+16(SB), AX, R10 + ADCQ AX, R9 + ADCQ R10, BX + ADCQ $0, SI + MULXQ ·qElement+8(SB), AX, R10 + ADDQ AX, R8 + ADCQ R10, R9 + MULXQ ·qElement+24(SB), AX, R10 + ADCQ AX, BX + ADCQ R10, SI + ADCQ $0, DI + MOVQ $const_qInvNeg, DX + MULXQ R8, DX, R10 + MULXQ ·qElement+0(SB), AX, R10 + ADDQ AX, R8 + ADCQ R10, R9 + MULXQ ·qElement+16(SB), AX, R10 + ADCQ AX, BX + ADCQ R10, SI + ADCQ $0, DI + MULXQ ·qElement+8(SB), AX, R10 + ADDQ AX, R9 + ADCQ R10, BX + MULXQ ·qElement+24(SB), AX, R10 + ADCQ AX, SI + ADCQ R10, DI + ADCQ $0, R8 + MOVQ $const_qInvNeg, DX + MULXQ R9, DX, R10 + MULXQ ·qElement+0(SB), AX, R10 + ADDQ AX, R9 + ADCQ R10, BX + MULXQ ·qElement+16(SB), AX, R10 + ADCQ AX, SI + ADCQ R10, DI + ADCQ $0, R8 + MULXQ ·qElement+8(SB), AX, R10 + ADDQ AX, BX + ADCQ R10, SI + MULXQ ·qElement+24(SB), AX, R10 + ADCQ AX, DI + ADCQ R10, R8 + ADCQ $0, R9 + VMOVQ X0, AX + ADDQ AX, BX + VALIGNQ $1, Z0, Z0, Z0 + VMOVQ X0, AX + ADCQ AX, SI + VALIGNQ $1, Z0, Z0, Z0 + 
VMOVQ X0, AX + ADCQ AX, DI + VALIGNQ $1, Z0, Z0, Z0 + VMOVQ X0, AX + ADCQ AX, R8 + VALIGNQ $1, Z0, Z0, Z0 + VMOVQ X0, AX + ADCQ AX, R9 + + // Barrett reduction; see Handbook of Applied Cryptography, Algorithm 14.42. + MOVQ R8, AX + SHRQ $32, R9, AX + MOVQ $const_mu, DX + MULQ DX + MULXQ ·qElement+0(SB), AX, R10 + SUBQ AX, BX + SBBQ R10, SI + MULXQ ·qElement+16(SB), AX, R10 + SBBQ AX, DI + SBBQ R10, R8 + SBBQ $0, R9 + MULXQ ·qElement+8(SB), AX, R10 + SUBQ AX, SI + SBBQ R10, DI + MULXQ ·qElement+24(SB), AX, R10 + SBBQ AX, R8 + SBBQ R10, R9 + + // we need up to 2 conditional substractions to be < q + MOVQ res+0(FP), R11 + MOVQ BX, 0(R11) + MOVQ SI, 8(R11) + MOVQ DI, 16(R11) + MOVQ R8, 24(R11) + SUBQ ·qElement+0(SB), BX + SBBQ ·qElement+8(SB), SI + SBBQ ·qElement+16(SB), DI + SBBQ ·qElement+24(SB), R8 + SBBQ $0, R9 + JCS done_14 + MOVQ BX, 0(R11) + MOVQ SI, 8(R11) + MOVQ DI, 16(R11) + MOVQ R8, 24(R11) + SUBQ ·qElement+0(SB), BX + SBBQ ·qElement+8(SB), SI + SBBQ ·qElement+16(SB), DI + SBBQ ·qElement+24(SB), R8 + SBBQ $0, R9 + JCS done_14 + MOVQ BX, 0(R11) + MOVQ SI, 8(R11) + MOVQ DI, 16(R11) + MOVQ R8, 24(R11) done_14: RET diff --git a/field/generator/asm/amd64/build.go b/field/generator/asm/amd64/build.go index 0625f6216..2ed95e40d 100644 --- a/field/generator/asm/amd64/build.go +++ b/field/generator/asm/amd64/build.go @@ -23,8 +23,6 @@ import ( "path/filepath" "strings" - "github.com/consensys/bavard" - "github.com/consensys/bavard/amd64" "github.com/consensys/gnark-crypto/field/generator/config" ) @@ -78,6 +76,35 @@ func (f *FFAmd64) StackSize(maxNbRegistersNeeded, nbRegistersReserved, minStackS return max(r, minStackSize) } +func (f *FFAmd64) Define(name string, nbInputs int, fn func(args ...amd64.Register)) func(args ...amd64.Register) { + + inputs := make([]string, nbInputs) + for i := 0; i < nbInputs; i++ { + inputs[i] = fmt.Sprintf("in%d", i) + } + name = strings.ToUpper(name) + f.StartDefine() + f.WriteLn("#define " + name + "(" + strings.Join(inputs, ", ") + 
")") + inputsRegisters := make([]amd64.Register, nbInputs) + for i := 0; i < nbInputs; i++ { + inputsRegisters[i] = amd64.Register(inputs[i]) + } + fn(inputsRegisters...) + f.EndDefine() + f.WriteLn("") + + return func(args ...amd64.Register) { + if len(args) != nbInputs { + panic("invalid number of arguments") + } + inputsStr := make([]string, len(args)) + for i := 0; i < len(args); i++ { + inputsStr[i] = string(args[i]) + } + f.WriteLn(name + "(" + strings.Join(inputsStr, ", ") + ")") + } +} + func max(a, b int) int { if a > b { return a @@ -214,7 +241,7 @@ func GenerateFieldWrapper(w io.Writer, F *config.FieldConfig, asmDirBuildPath, a // see internal/templates/ops* func GenerateCommonASM(w io.Writer, nbWords int) error { f := NewFFAmd64(w, nbWords) - f.WriteLn(bavard.Apache2Header("ConsenSys Software Inc.", 2020)) + f.Comment("Code generated by gnark-crypto/generator. DO NOT EDIT.") f.WriteLn("#include \"textflag.h\"") f.WriteLn("#include \"funcdata.h\"") @@ -246,6 +273,9 @@ func GenerateCommonASM(w io.Writer, nbWords int) error { func GenerateVectorASM(w io.Writer, nbWords int) error { f := NewFFAmd64(w, nbWords) f.WriteLn("") + f.Comment("Code generated by gnark-crypto/generator. 
DO NOT EDIT.") + f.Comment("Functions are derived from Dag Arne Osvik's work in github.com/a16z/vectorized-fields") + f.WriteLn("") f.generateAddVec() f.generateSubVec() diff --git a/field/generator/asm/amd64/element_mul.go b/field/generator/asm/amd64/element_mul.go index b5791bc30..c7c17af53 100644 --- a/field/generator/asm/amd64/element_mul.go +++ b/field/generator/asm/amd64/element_mul.go @@ -110,6 +110,7 @@ func (f *FFAmd64) MulADX(registers *amd64.Registers, x, y func(int) string, t [] if !hasFreeRegister { f.POPQ(A) } + // for j=1 to N-1 // (C,t[j-1]) := t[j] + m*q[j] + C for j := 1; j < f.NbWords; j++ { diff --git a/field/generator/asm/amd64/element_vec.go b/field/generator/asm/amd64/element_vec.go index 415208b8b..1a5350b23 100644 --- a/field/generator/asm/amd64/element_vec.go +++ b/field/generator/asm/amd64/element_vec.go @@ -16,7 +16,6 @@ package amd64 import ( "fmt" - "strconv" "github.com/consensys/bavard/amd64" ) @@ -427,15 +426,17 @@ func (f *FFAmd64) generateSumVec() { hi, lo amd64.Register } - f.WriteLn(`#define SPLIT_LO_HI(lo, hi) \ - MOVQ hi, lo; \ - ANDQ $0xffffffff, lo; \ - SHLQ $32, lo; \ - SHRQ $32, hi; \ - `) + splitLoHi := f.Define("SPLIT_LO_HI", 2, func(args ...amd64.Register) { + lo := args[0] + hi := args[1] + f.MOVQ(hi, lo) + f.ANDQ("$0xffffffff", lo) + f.SHLQ("$32", lo) + f.SHRQ("$32", hi) + }) for _, v := range []hilo{{w0h, low0h}, {w1h, low1h}, {w2h, low2h}, {w3h, low3h}} { - f.WriteLn(`SPLIT_LO_HI(` + string(v.lo) + `, ` + string(v.hi) + `)`) + splitLoHi(v.lo, v.hi) } f.WriteLn(` @@ -579,15 +580,12 @@ func (f *FFAmd64) generateInnerProduct() { A6H := amd64.Register("Z30") A7H := amd64.Register("Z31") - // X0 := amd64.Register("X0") - // load arguments f.MOVQ("a+8(FP)", PX) f.MOVQ("b+16(FP)", PY) f.MOVQ("n+24(FP)", LEN) - // Create mask for low dword in each qword - // vpmovzxdq %ymm0, LSW + f.Comment("Create mask for low dword in each qword") f.VPCMPEQB("Y0", "Y0", "Y0") f.VPMOVZXDQ("Y0", LSW) @@ -623,53 +621,26 @@ func (f *FFAmd64) 
generateInnerProduct() { f.Comment("we multiply and accumulate partial products of 4 bytes * 32 bytes") - f.VPMULUDQ_BCST("0*4("+PX+")", Y, PPL) - f.VPSRLQ("$32", PPL, PPH) - f.VPANDQ(LSW, PPL, PPL) - f.VPADDQ(PPL, A0L, A0L) - f.VPADDQ(PPH, A0H, A0H) - - f.VPMULUDQ_BCST("1*4("+PX+")", Y, PPL) - f.VPSRLQ("$32", PPL, PPH) - f.VPANDQ(LSW, PPL, PPL) - f.VPADDQ(PPL, A1L, A1L) - f.VPADDQ(PPH, A1H, A1H) - - f.VPMULUDQ_BCST("2*4("+PX+")", Y, PPL) - f.VPSRLQ("$32", PPL, PPH) - f.VPANDQ(LSW, PPL, PPL) - f.VPADDQ(PPL, A2L, A2L) - f.VPADDQ(PPH, A2H, A2H) - - f.VPMULUDQ_BCST("3*4("+PX+")", Y, PPL) - f.VPSRLQ("$32", PPL, PPH) - f.VPANDQ(LSW, PPL, PPL) - f.VPADDQ(PPL, A3L, A3L) - f.VPADDQ(PPH, A3H, A3H) - - f.VPMULUDQ_BCST("4*4("+PX+")", Y, PPL) - f.VPSRLQ("$32", PPL, PPH) - f.VPANDQ(LSW, PPL, PPL) - f.VPADDQ(PPL, A4L, A4L) - f.VPADDQ(PPH, A4H, A4H) - - f.VPMULUDQ_BCST("5*4("+PX+")", Y, PPL) - f.VPSRLQ("$32", PPL, PPH) - f.VPANDQ(LSW, PPL, PPL) - f.VPADDQ(PPL, A5L, A5L) - f.VPADDQ(PPH, A5H, A5H) - - f.VPMULUDQ_BCST("6*4("+PX+")", Y, PPL) - f.VPSRLQ("$32", PPL, PPH) - f.VPANDQ(LSW, PPL, PPL) - f.VPADDQ(PPL, A6L, A6L) - f.VPADDQ(PPH, A6H, A6H) - - f.VPMULUDQ_BCST("7*4("+PX+")", Y, PPL) - f.VPSRLQ("$32", PPL, PPH) - f.VPANDQ(LSW, PPL, PPL) - f.VPADDQ(PPL, A7L, A7L) - f.VPADDQ(PPH, A7H, A7H) + mac := f.Define("MAC", 3, func(inputs ...amd64.Register) { + opLeft := inputs[0] + lo := inputs[1] + hi := inputs[2] + + f.VPMULUDQ_BCST(opLeft, Y, PPL) + f.VPSRLQ("$32", PPL, PPH) + f.VPANDQ(LSW, PPL, PPL) + f.VPADDQ(PPL, lo, lo) + f.VPADDQ(PPH, hi, hi) + }) + + mac("0*4("+PX+")", A0L, A0H) + mac("1*4("+PX+")", A1L, A1H) + mac("2*4("+PX+")", A2L, A2H) + mac("3*4("+PX+")", A3L, A3H) + mac("4*4("+PX+")", A4L, A4H) + mac("5*4("+PX+")", A5L, A5H) + mac("6*4("+PX+")", A6L, A6H) + mac("7*4("+PX+")", A7L, A7H) f.ADDQ("$32", PX) @@ -679,6 +650,7 @@ func (f *FFAmd64) generateInnerProduct() { f.Push(®isters, LEN, PX, PY) f.LABEL(AddPP) + f.Comment("we accumulate the partial products into 544bits in 
Z1:Z0") f.MOVQ(uint64(0x1555), amd64.AX) f.KMOVD(amd64.AX, "K1") @@ -706,7 +678,13 @@ func (f *FFAmd64) generateInnerProduct() { f.VALIGND("$15", ACC, ACC, "K2", "Z0") f.KSHIFTLW("$1", "K2", "K2") - ADDPP := func(AxH, AyL, AyH, AzL, I amd64.Register) { + f.Comment("macro to add partial products and store the result in Z0") + addPP := f.Define("ADDPP", 5, func(inputs ...amd64.Register) { + AxH := inputs[0] + AyL := inputs[1] + AyH := inputs[2] + AzL := inputs[3] + I := inputs[4] f.VPSRLQ("$32", ACC, PPL) f.VALIGND_Z("$2", ACC, ACC, "K1", ACC) f.VPADDQ(PPL, ACC, ACC) @@ -720,14 +698,14 @@ func (f *FFAmd64) generateInnerProduct() { f.VPADDQ(PPL, ACC, ACC) f.VALIGND("$16-"+I, ACC, ACC, "K2", "Z0") f.KADDW("K2", "K2", "K2") - } + }) - ADDPP(A0H, A1L, A1H, A2L, "2") - ADDPP(A1H, A2L, A2H, A3L, "3") - ADDPP(A2H, A3L, A3H, A4L, "4") - ADDPP(A3H, A4L, A4H, A5L, "5") - ADDPP(A4H, A5L, A5H, A6L, "6") - ADDPP(A5H, A6L, A6H, A7L, "7") + addPP(A0H, A1L, A1H, A2L, "2") + addPP(A1H, A2L, A2H, A3L, "3") + addPP(A2H, A3L, A3H, A4L, "4") + addPP(A3H, A4L, A4H, A5L, "5") + addPP(A4H, A5L, A5H, A6L, "6") + addPP(A5H, A6L, A6H, A7L, "7") f.VPSRLQ("$32", ACC, PPL) f.VALIGND_Z("$2", ACC, ACC, "K1", ACC) @@ -749,20 +727,20 @@ func (f *FFAmd64) generateInnerProduct() { f.VALIGND("$16-9", ACC, ACC, "K2", "Z0") f.KSHIFTLW("$1", "K2", "K2") - ADDPP_2 := func(I int) { + addPP2 := f.Define("ADDPP2", 1, func(args ...amd64.Register) { f.VPSRLQ("$32", ACC, PPL) f.VALIGND_Z("$2", ACC, ACC, "K1", ACC) f.VPADDQ(PPL, ACC, ACC) - f.VALIGND("$16-"+strconv.Itoa(I), ACC, ACC, "K2", "Z0") + f.VALIGND("$16-"+args[0], ACC, ACC, "K2", "Z0") f.KSHIFTLW("$1", "K2", "K2") - } + }) - ADDPP_2(10) - ADDPP_2(11) - ADDPP_2(12) - ADDPP_2(13) - ADDPP_2(14) - ADDPP_2(15) + addPP2("10") + addPP2("11") + addPP2("12") + addPP2("13") + addPP2("14") + addPP2("15") f.VPSRLQ("$32", ACC, PPL) f.VALIGND_Z("$2", ACC, ACC, "K1", ACC) @@ -775,14 +753,7 @@ func (f *FFAmd64) generateInnerProduct() { T3 := f.Pop(®isters) T4 := 
f.Pop(®isters) - // Extract the 4 least significant qwords of %zmm0 - - // vmovq %xmm0, T1; valignq $1, %zmm0, %zmm1, %zmm0 // Shift in low word from zmm1 - // vmovq %xmm0, T2; valignq $1, %zmm0, %zmm0, %zmm0 - // vmovq %xmm0, T3; valignq $1, %zmm0, %zmm0, %zmm0 - // vmovq %xmm0, T4; valignq $1, %zmm0, %zmm0, %zmm0 - // xorq T0, T0 - + f.Comment("Extract the 4 least significant qwords of Z0") f.VMOVQ("X0", T1) f.VALIGNQ("$1", "Z0", "Z1", "Z0") f.VMOVQ("X0", T2) @@ -793,14 +764,6 @@ func (f *FFAmd64) generateInnerProduct() { f.VALIGNQ("$1", "Z0", "Z0", "Z0") f.XORQ(T0, T0) - // movq INV, %rdx // Load negative inverse mod 2^64 - - // mulx T1, %rdx, PH - - // mulx 0*8(PM), PL, PH; add PL, T1; adc PH, T2 - // mulx 2*8(PM), PL, PH; adc PL, T3; adc PH, T4; adc $0, T0 - // mulx 1*8(PM), PL, PH; add PL, T2; adc PH, T3 - // mulx 3*8(PM), PL, PH; adc PL, T4; adc PH, T0; adc $0, T1 PH := f.Pop(®isters) PL := amd64.AX f.MOVQ(f.qInv0(), amd64.DX) @@ -820,15 +783,6 @@ func (f *FFAmd64) generateInnerProduct() { f.ADCQ(PH, T0) f.ADCQ("$0", T1) - // // movq INV, %rdx - - // // mulx T2, %rdx, PH - - // // mulx 0*8(PM), PL, PH; add PL, T2; adc PH, T3 - // // mulx 2*8(PM), PL, PH; adc PL, T4; adc PH, T0; adc $0, T1 - // // mulx 1*8(PM), PL, PH; add PL, T3; adc PH, T4 - // // mulx 3*8(PM), PL, PH; adc PL, T0; adc PH, T1; adc $0, T2 - f.MOVQ(f.qInv0(), amd64.DX) f.MULXQ(T2, amd64.DX, PH) @@ -847,15 +801,6 @@ func (f *FFAmd64) generateInnerProduct() { f.ADCQ(PH, T1) f.ADCQ("$0", T2) - // // movq INV, %rdx - - // // mulx T3, %rdx, PH - - // // mulx 0*8(PM), PL, PH; add PL, T3; adc PH, T4 - // // mulx 2*8(PM), PL, PH; adc PL, T0; adc PH, T1; adc $0, T2 - // // mulx 1*8(PM), PL, PH; add PL, T4; adc PH, T0 - // // mulx 3*8(PM), PL, PH; adc PL, T1; adc PH, T2; adc $0, T3 - f.MOVQ(f.qInv0(), amd64.DX) f.MULXQ(T3, amd64.DX, PH) @@ -875,15 +820,6 @@ func (f *FFAmd64) generateInnerProduct() { f.ADCQ(PH, T2) f.ADCQ("$0", T3) - // // movq INV, %rdx - - // // mulx T4, %rdx, PH - - // // mulx 
0*8(PM), PL, PH; add PL, T4; adc PH, T0 - // // mulx 2*8(PM), PL, PH; adc PL, T1; adc PH, T2; adc $0, T3 - // // mulx 1*8(PM), PL, PH; add PL, T0; adc PH, T1 - // // mulx 3*8(PM), PL, PH; adc PL, T2; adc PH, T3; adc $0, T4 - f.MOVQ(f.qInv0(), amd64.DX) f.MULXQ(T4, amd64.DX, PH) @@ -905,12 +841,6 @@ func (f *FFAmd64) generateInnerProduct() { // Add the remaining 5 qwords (9 dwords) from zmm0 - // vmovq %xmm0, PL; add PL, T0; valignq $1, %zmm0, %zmm0, %zmm0 - // vmovq %xmm0, PL; adc PL, T1; valignq $1, %zmm0, %zmm0, %zmm0 - // vmovq %xmm0, PL; adc PL, T2; valignq $1, %zmm0, %zmm0, %zmm0 - // vmovq %xmm0, PL; adc PL, T3; valignq $1, %zmm0, %zmm0, %zmm0 - // vmovq %xmm0, PL; adc PL, T4 // T4 < 2^32 - f.VMOVQ("X0", PL) f.ADDQ(PL, T0) f.VALIGNQ("$1", "Z0", "Z0", "Z0") @@ -926,31 +856,12 @@ func (f *FFAmd64) generateInnerProduct() { f.VMOVQ("X0", PL) f.ADCQ(PL, T4) - ////////////////////////////////////////////////// - // Barrett reduction - ////////////////////////////////////////////////// - - // // For explanation of mu, q1, q2, q3, r1, r2, see Handbook of - // // Applied Cryptography, Algorithm 14.42. - - // // q1 is low 32 bits of T4 and high 32 bits of T3 - - // movq T3, %rax - // shrd $32, T4, %rax // q1 - // mulq MU // Multiply by mu. 
q2 in rdx:rax, q3 in rdx - + f.Comment("Barrett reduction; see Handbook of Applied Cryptography, Algorithm 14.42.") f.MOVQ(T3, amd64.AX) f.SHRQw("$32", T4, amd64.AX) f.MOVQ(f.mu(), amd64.DX) f.MULQ(amd64.DX) - // // Subtract r2 from r1 - - // mulx 0*8(PM), PL, PH; sub PL, T0; sbb PH, T1; - // mulx 2*8(PM), PL, PH; sbb PL, T2; sbb PH, T3; sbb $0, T4 - // mulx 1*8(PM), PL, PH; sub PL, T1; sbb PH, T2; - // mulx 3*8(PM), PL, PH; sbb PL, T3; sbb PH, T4 - f.MULXQ(f.qAt(0), PL, PH) f.SUBQ(PL, T0) f.SBBQ(PH, T1) @@ -965,6 +876,8 @@ func (f *FFAmd64) generateInnerProduct() { f.SBBQ(PL, T3) f.SBBQ(PH, T4) + f.Comment("we need up to 2 conditional substractions to be < q") + PZ := f.Pop(®isters) f.MOVQ("res+0(FP)", PZ) t := []amd64.Register{T0, T1, T2, T3} @@ -996,140 +909,4 @@ func (f *FFAmd64) generateInnerProduct() { f.RET() - // f.Push(®isters, lo, hi) - - // // We have 544-bit (72-byte) result in Z1:Z0. - // // Only the modular reduction remains to be computed. - - // // for i=0 to s-1 - // // C := 0 - // // m := t[i]*n'[0] mod W - // // for j=0 to s-1 - // // (C,S) := t[i+j] + m*n[j] + C - // // t[i+j] := S - // // ADD (t[i+s],C) - - // f.XORQ(amd64.AX, amd64.AX) - // m := amd64.DX - // tr := f.Pop(®isters) - // zero := f.Pop(®isters) - - // f.MOVQ(f.qInv0(), m) - // for i := 0; i < 4; i++ { - // f.XORQ(zero, zero) - // f.IMULQ(r[i], m) - - // f.MULXQ(f.qAt(i), amd64.AX, tr) - - // // shift in the loop. 
- - // for j := 0; j < 4; j++ { - // f.MULXQ(f.qAt(j), amd64.AX, tr) - // f.ADCXQ(amd64.AX, r[i+j]) - // f.ADOXQ(tr, r[i+j+1]) - // } - - // for j := i + 1; j < 8; j++ { - // f.ADCXQ(zero, r[j]) - // f.ADOXQ(zero, r[j+1]) - // } - - // f.ADCXQ(zero, r[8]) - // } - - // // // j == 0 - // // // C,_ := r[i] + m*q[0] - // // f.MULXQ(f.qAt(0), amd64.AX, tr) - // // f.ADCXQ(amd64.AX, r[i]) - // // f.ADOXQ(tr, r[i+1]) - - // // for j := 1; j < 4; j++ { - // // f.MULXQ(f.qAt(j), amd64.AX, tr) // m * n[j] - // // f.ADCXQ(amd64.AX, r[i+j]) - // // f.ADOXQ(tr, r[i+j+1]) - // // } - - // // for k := i + 4; k < 9; k++ { - // // f.ADCXQ(zero, r[k]) - // // f.ADOXQ(zero, r[k]) - // // } - - // f.Push(&registers, zero, tr, m) - - // // now our result should be in r[4] to r[8] - // T0 := r[4] - // T1 := r[5] - // T2 := r[6] - // T3 := r[7] - // T4 := r[8] - - // PL := f.Pop(&registers) - // PH := f.Pop(&registers) - - // // ////////////////////////////////////////////////// - // // // Barrett reduction - // // ////////////////////////////////////////////////// - - // // // // For explanation of mu, q1, q2, q3, r1, r2, see Handbook of - // // // // Applied Cryptography, Algorithm 14.42. - - // // // // q1 is low 32 bits of T4 and high 32 bits of T3 - - // // // movq T3, %rax - // // // shrd $32, T4, %rax // q1 - // // // mulq MU // Multiply by mu. 
q2 in rdx:rax, q3 in rdx - - // f.MOVQ(T3, amd64.AX) - // f.SHRQw("$32", T4, amd64.AX) - // f.MOVQ(f.mu(), amd64.DX) - // f.MULQ(amd64.DX) - - // // // // Subtract r2 from r1 - - // // // mulx 0*8(PM), PL, PH; sub PL, T0; sbb PH, T1; - // // // mulx 2*8(PM), PL, PH; sbb PL, T2; sbb PH, T3; sbb $0, T4 - // // // mulx 1*8(PM), PL, PH; sub PL, T1; sbb PH, T2; - // // // mulx 3*8(PM), PL, PH; sbb PL, T3; sbb PH, T4 - - // f.MULXQ(f.qAt(0), PL, PH) - // f.SUBQ(PL, T0) - // f.SBBQ(PH, T1) - // f.MULXQ(f.qAt(2), PL, PH) - // f.SBBQ(PL, T2) - // f.SBBQ(PH, T3) - // f.SBBQ("$0", T4) - // f.MULXQ(f.qAt(1), PL, PH) - // f.SUBQ(PL, T1) - // f.SBBQ(PH, T2) - // f.MULXQ(f.qAt(3), PL, PH) - // f.SBBQ(PL, T3) - // f.SBBQ(PH, T4) - - // PZ := f.Pop(&registers) - // f.MOVQ("res+0(FP)", PZ) - // t := []amd64.Register{T0, T1, T2, T3} - // f.Mov(t, PZ) - - // // sub q - // f.SUBQ(f.qAt(0), T0) - // f.SBBQ(f.qAt(1), T1) - // f.SBBQ(f.qAt(2), T2) - // f.SBBQ(f.qAt(3), T3) - // f.SBBQ("$0", T4) - - // // if borrow, we go to done - // f.JCS(done) - - // f.Mov(t, PZ) - - // f.SUBQ(f.qAt(0), T0) - // f.SBBQ(f.qAt(1), T1) - // f.SBBQ(f.qAt(2), T2) - // f.SBBQ(f.qAt(3), T3) - // f.SBBQ("$0", T4) - - // f.JCS(done) - - // f.Mov(t, PZ) - } diff --git a/field/generator/internal/templates/element/ops_asm.go b/field/generator/internal/templates/element/ops_asm.go index 042337d20..df1b10671 100644 --- a/field/generator/internal/templates/element/ops_asm.go +++ b/field/generator/internal/templates/element/ops_asm.go @@ -86,9 +86,6 @@ func (vector *Vector) Sum() (res {{.ElementName}}) { //go:noescape func sumVec(res *{{.ElementName}}, a *{{.ElementName}}, n uint64) -//go:noescape -func innerProdVec(res *uint64, a,b *{{.ElementName}}, n uint64) - // InnerProduct computes the inner product of two vectors. // It panics if the vectors don't have the same length. 
func (vector *Vector) InnerProduct(other Vector) (res {{.ElementName}}) { @@ -99,10 +96,10 @@ func (vector *Vector) InnerProduct(other Vector) (res {{.ElementName}}) { if n != uint64(len(other)) { panic("vector.InnerProduct: vectors don't have the same length") } - const minN = 0 // AVX512 slower than generic for small n const maxN = (1 << 32) - 1 - if !supportAvx512 || n <= minN || n >= maxN { + if !supportAvx512 || n >= maxN { // call innerProductVecGeneric + // note; we could split the vector into smaller chunks and call innerProductVec innerProductVecGeneric(&res, *vector, other) return } @@ -111,6 +108,9 @@ func (vector *Vector) InnerProduct(other Vector) (res {{.ElementName}}) { return } +//go:noescape +func innerProdVec(res *uint64, a,b *{{.ElementName}}, n uint64) + {{- end}} // Mul z = x * y (mod q) diff --git a/field/generator/internal/templates/element/tests_vector.go b/field/generator/internal/templates/element/tests_vector.go index 2eb753c99..382217122 100644 --- a/field/generator/internal/templates/element/tests_vector.go +++ b/field/generator/internal/templates/element/tests_vector.go @@ -86,9 +86,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/field/goldilocks/vector_test.go b/field/goldilocks/vector_test.go index 3f6711ccb..20f4f5130 100644 --- a/field/goldilocks/vector_test.go +++ b/field/goldilocks/vector_test.go @@ -97,9 +97,9 @@ func (vector *Vector) unmarshalBinaryAsync(data []byte) error { func TestVectorOps(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 5 + parameters.MinSuccessfulTests = 2 } else { - parameters.MinSuccessfulTests = 100 + 
parameters.MinSuccessfulTests = 10 } properties := gopter.NewProperties(parameters) diff --git a/go.mod b/go.mod index f7ed17e22..5b86c36ac 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.22 require ( github.com/bits-and-blooms/bitset v1.14.2 - github.com/consensys/bavard v0.0.0 + github.com/consensys/bavard v0.1.18 github.com/leanovate/gopter v0.2.11 github.com/mmcloughlin/addchain v0.4.0 github.com/spf13/cobra v1.8.1 @@ -15,8 +15,6 @@ require ( gopkg.in/yaml.v2 v2.4.0 ) -replace github.com/consensys/bavard => ../bavard - require ( github.com/davecgh/go-spew v1.1.1 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect diff --git a/go.sum b/go.sum index 0f20a3a93..2e655e3e4 100644 --- a/go.sum +++ b/go.sum @@ -55,8 +55,8 @@ github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDk github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/cncf/udpa/go v0.0.0-20200629203442-efcf912fb354/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= -github.com/consensys/bavard v0.1.17 h1:53CdY/g35YSH9oRoa/b29tZinaiOEJYBmf9vydozPpE= -github.com/consensys/bavard v0.1.17/go.mod h1:9ItSMtA/dXMAiL7BG6bqW2m3NdSEObYWoH223nGHukI= +github.com/consensys/bavard v0.1.18 h1:eYXgFO6LHm8X8BKjgKPdvOl/QXag9n54b213rsv3448= +github.com/consensys/bavard v0.1.18/go.mod h1:k/zVjHHC4B+PQy1Pg7fgvG3ALicQw540Crag8qx+dZs= github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=